mirror of
https://github.com/isledecomp/isle.git
synced 2025-10-24 08:54:15 +00:00
493 lines
16 KiB
Python
493 lines
16 KiB
Python
"""For all addresses matched by code annotations or recomp pdb,
|
|
report how "far off" the recomp symbol is from its proper place
|
|
in the original binary."""
|
|
|
|
import argparse
import bisect
import csv
import logging
import os
import statistics
from collections import namedtuple
from typing import Iterator, List, Optional, Tuple

from isledecomp import Bin as IsleBin
from isledecomp.bin import InvalidVirtualAddressError
from isledecomp.cvdump import Cvdump
from isledecomp.compare import Compare as IsleCompare
from isledecomp.types import SymbolType
|
|
|
|
# Ignore all compare-db messages: with a NullHandler attached, records from
# this logger always find a handler, so logging's fallback stderr handler
# is never used for them.
_compare_logger = logging.getLogger("isledecomp.compare")
_compare_logger.addHandler(logging.NullHandler())
|
|
|
|
|
|
def or_blank(value) -> str:
    """Render a value for text output, mapping None to the empty string.

    Any other value (including falsy ones such as 0) is passed through str().
    """
    if value is None:
        return ""

    return str(value)
|
|
|
|
|
|
class ModuleMap:
    """Load a subset of sections from the pdb to allow you to look up the
    module based on the recomp address.

    Attributes:
        module_lookup: module id -> (lib, obj) pair for that module.
        library_lookup: obj name -> lib name.
        section_contrib: list of (absolute start address, size, module id),
            one entry per section contribution in a valid section.
        contrib_starts: just the start addresses of section_contrib, kept
            as a separate list so it can be searched with bisect.
    """

    def __init__(self, pdb, binfile) -> None:
        """Run cvdump on `pdb` and build the lookup tables.

        `binfile` is used to convert each contribution's section:offset
        into an absolute address and to skip contributions whose section
        it does not consider valid.
        """
        cvdump = Cvdump(pdb).section_contributions().modules().run()
        self.module_lookup = {m.id: (m.lib, m.obj) for m in cvdump.modules}
        self.library_lookup = {m.obj: m.lib for m in cvdump.modules}
        self.section_contrib = [
            (
                binfile.get_abs_addr(sizeref.section, sizeref.offset),
                sizeref.size,
                sizeref.module,
            )
            for sizeref in cvdump.sizerefs
            if binfile.is_valid_section(sizeref.section)
        ]

        # For bisect performance enhancement.
        # NOTE(review): bisect needs this list in ascending order; this
        # assumes cvdump reports contributions that way - TODO confirm.
        self.contrib_starts = [start for (start, _, __) in self.section_contrib]

    def get_lib_for_module(self, module: str) -> Optional[str]:
        """Return the library recorded for the given .obj name, or None if
        the module is unknown."""
        return self.library_lookup.get(module)

    def get_all_cmake_modules(self) -> List[str]:
        """Return the .obj name of every module that begins with
        "CMakeFiles"."""
        return [
            obj
            for (_, obj) in self.module_lookup.values()
            if obj.startswith("CMakeFiles")
        ]

    def get_module(self, addr: int) -> Optional[Tuple[Optional[str], str]]:
        """Return the (lib, obj) pair of the module whose section
        contribution contains `addr`, or None if no contribution does."""
        i = bisect.bisect_left(self.contrib_starts, addr)

        # bisect gives the insertion point. If the addr matches a section
        # contribution start exactly, that is the entry we want. Otherwise
        # the candidate is the contribution that begins before addr, which
        # is one position back. The insertion point can equal the length of
        # the list (addr is past every start), so check that before indexing.
        if i == len(self.contrib_starts) or self.contrib_starts[i] != addr:
            i -= 1

        # addr is below the first contribution, or there are none at all.
        if i < 0:
            return None

        (start, size, module_id) = self.section_contrib[i]
        if start <= addr < start + size:
            return self.module_lookup.get(module_id)

        return None
|
|
|
|
|
|
def print_sections(sections):
    """Print a table with the name, start address and sizes of each section.

    Each item of `sections` must provide `name`, `virtual_address`,
    `virtual_size` and `size_of_raw_data`. The three numeric columns are
    printed in hexadecimal. A blank line is printed after the table.
    """
    # Pad the column titles to the same width as the data columns so the
    # header lines up with the divider and with the values underneath it.
    titles = ("name", "start", "v.size", "raw size")
    print(" | ".join(f"{title:>8}" for title in titles))
    print("---------|----------|----------|----------")
    for sect in sections:
        name = sect.name
        print(
            f"{name:>8} | {sect.virtual_address:8x} | {sect.virtual_size:8x} | {sect.size_of_raw_data:8x}"
        )
    print()
|
|
|
|
|
|
# Three-letter symbol type abbreviations that DeltaCollector accepts as a
# match type. Rows carry the same form in their sym_type column (see
# match_type_abbreviation).
ALLOWED_TYPE_ABBREVIATIONS = [
    "fun",
    "dat",
    "poi",
    "str",
    "vta",
    "flo",
]
|
|
|
|
|
|
def match_type_abbreviation(mtype: Optional[SymbolType]) -> str:
    """Return the first three characters of the lowercased SymbolType name,
    or an empty string if no type is given."""
    if mtype is not None:
        lowered = mtype.name.lower()
        return lowered[:3]

    return ""
|
|
|
|
|
|
def get_cmakefiles_prefix(module: str) -> str:
    """For the given .obj, get the "CMakeFiles/something.dir/" prefix.
    For lack of a better option, this is the library for this module.

    A module that does not begin with "CMakeFiles" is returned unchanged."""
    if not module.startswith("CMakeFiles"):
        return module

    # Keep the first two '/'-separated tokens and restore the trailing slash.
    leading_tokens = module.split("/")[:2]
    return "/".join(leading_tokens) + "/"
|
|
|
|
|
|
def truncate_module_name(prefix: str, module: str) -> str:
    """Remove the CMakeFiles prefix and the .obj suffix for the given module.
    Input: CMakeFiles/lego1.dir/, CMakeFiles/lego1.dir/LEGO1/define.cpp.obj
    Output: LEGO1/define.cpp

    Either part is left alone if the module does not actually have it."""
    skip = len(prefix) if module.startswith(prefix) else 0
    trimmed = module[skip:]

    return trimmed[:-4] if trimmed.endswith(".obj") else trimmed
|
|
|
|
|
|
def avg_remove_outliers(entries: List[int]) -> int:
    """Compute the average from this list of entries (addresses)
    after removing outlier values.

    An outlier is any entry more than two population standard deviations
    away from the mean of all entries. The result is truncated to an int.
    A single-entry list returns that entry as is."""
    if len(entries) == 1:
        return entries[0]

    center = statistics.mean(entries)
    tolerance = 2 * statistics.pstdev(entries)

    kept = []
    for entry in entries:
        if abs(entry - center) <= tolerance:
            kept.append(entry)

    return int(statistics.mean(kept))
|
|
|
|
|
|
# One line of the report: where a matched symbol lives in the original and
# in the recompiled binary.
RoadmapRow = namedtuple(
    "RoadmapRow",
    (
        "orig_sect_ofs",  # "section:offset" text for the original address
        "recomp_sect_ofs",  # "section:offset" text for the recomp address
        "orig_addr",  # address in the original binary, if known
        "recomp_addr",  # address in the recompiled binary, if known
        "displacement",  # recomp offset minus orig offset (same section only)
        "sym_type",  # three-letter symbol type abbreviation
        "size",
        "name",
        "module",  # .obj that contributed the recomp address, if known
    ),
)
|
|
|
|
|
|
class DeltaCollector:
    """Reads each row of the results and aggregates information about the
    placement of each module.

    Only rows that name a module and whose symbol type equals the
    configured match type are counted."""

    def __init__(self, match_type: str = "fun") -> None:
        """Set up empty aggregates.

        `match_type` is reduced to its first three characters, lowercased
        and stripped. If the result is not an allowed abbreviation the
        collector falls back to "fun"."""
        # The displacement for each symbol from each module
        self.disp_map = {}

        # Each address for each module
        self.addresses = {}

        # The earliest address for each module
        self.earliest = {}

        # String abbreviation for which symbol type we are checking
        self.match_type = "fun"

        match_type = str(match_type).strip().lower()[:3]
        if match_type in ALLOWED_TYPE_ABBREVIATIONS:
            self.match_type = match_type

    def read_row(self, row: RoadmapRow):
        """Fold one row into the per-module aggregates.

        Rows without a module, or of a different symbol type, are skipped.
        The original address and the displacement are recorded
        independently; either may be None."""
        if row.module is None or row.sym_type != self.match_type:
            return

        if row.orig_addr is not None:
            self.addresses.setdefault(row.module, []).append(row.orig_addr)

            # Compare against the stored value rather than a fixed sentinel
            # so that the first address is always recorded, however large.
            previous = self.earliest.get(row.module)
            if previous is None or row.orig_addr < previous:
                self.earliest[row.module] = row.orig_addr

        if row.displacement is not None:
            self.disp_map.setdefault(row.module, []).append(row.displacement)

    def iter_sorted(self) -> Iterator[Tuple[int, str]]:
        """Compute the average address for each module, then generate
        (average address, module) pairs in ascending order of the average."""
        avg_address = {
            mod: avg_remove_outliers(values) for mod, values in self.addresses.items()
        }
        for mod, avg in sorted(avg_address.items(), key=lambda x: x[1]):
            yield (avg, mod)
|
|
|
|
|
|
def suggest_order(results: List[RoadmapRow], module_map: ModuleMap, match_type: str):
    """Suggest the order of modules for CMakeLists.txt

    Prints, for each "CMakeFiles/<target>.dir/" prefix, its modules sorted
    by the average original address of their symbols of type `match_type`,
    followed by the modules of that prefix for which no suggestion can be
    made. Finally prints every library ordered by the lowest average
    address among its modules."""

    dc = DeltaCollector(match_type)
    for row in results:
        dc.read_row(row)

    # First, show the order of .obj files for the "CMake Modules"
    # Meaning: the modules where the .obj file begins with "CMakeFiles".
    # These are the libraries where we directly control the order.
    # The library name (from cvdump) doesn't make it obvious that these are
    # our libraries so we derive the name based on the CMakeFiles prefix.
    leftover_modules = set(module_map.get_all_cmake_modules())

    # A little convoluted, but we want to take the first two tokens
    # of the string with '/' as the delimiter.
    # i.e. CMakeFiles/isle.dir/
    # The idea is to print exactly what appears in CMakeLists.txt.
    cmake_prefixes = sorted({get_cmakefiles_prefix(mod) for mod in leftover_modules})

    # Save this off because we'll use it again later.
    computed_order = list(dc.iter_sorted())

    for prefix in cmake_prefixes:
        print(prefix)

        last_earliest = 0
        # Show modules ordered by the computed average of addresses
        for _, module in computed_order:
            if not module.startswith(prefix):
                continue

            leftover_modules.discard(module)

            # A module can have addresses but no displacement (none of its
            # symbols landed in the same section in both binaries). Show a
            # blank column in that case; formatting None with a width
            # raises TypeError.
            displacements = dc.disp_map.get(module)
            avg_displacement = (
                int(statistics.mean(displacements)) if displacements else ""
            )

            # Call attention to any modules where ordering by earliest
            # address is different from the computed order we display.
            earliest = dc.earliest.get(module)
            ooo_mark = "*" if earliest < last_earliest else " "
            last_earliest = earliest

            code_file = truncate_module_name(prefix, module)
            print(f"0x{earliest:08x}{ooo_mark} {avg_displacement:>10} {code_file}")

        # These modules are included in the final binary (in some form) but
        # don't contribute any symbols of the type we are checking.
        # n.b. There could still be other modules that are part of
        # CMakeLists.txt but are not included in the pdb for whatever reason.
        # In other words: don't take the list we provide as the final word on
        # what should or should not be included.
        # This is merely a suggestion of the order.
        # Sorted so the output does not depend on set iteration order.
        for module in sorted(leftover_modules):
            if not module.startswith(prefix):
                continue

            # Aligned with the previous print: the label fills the 22
            # columns taken there by the address (10), the out-of-order
            # mark (1), a space (1) and the displacement (10).
            code_file = truncate_module_name(prefix, module)
            print(f"{'no suggestion':>22} {code_file}")

        print()

    # Now display the order of all libraries in the final file.
    library_order = {}

    # computed_order is ascending, so the first average address seen for a
    # library is also the lowest one.
    for start, module in computed_order:
        lib = module_map.get_lib_for_module(module)
        if lib is None:
            lib = get_cmakefiles_prefix(module)

        library_order.setdefault(lib, start)

    print("Library order (average address shown):")
    for lib, start in sorted(library_order.items(), key=lambda x: x[1]):
        # Strip off any OS path for brevity
        if not lib.startswith("CMakeFiles"):
            lib = os.path.basename(lib)

        print(f"{lib:40} {start:08x}")
|
|
|
|
|
|
def print_text_report(results: List[RoadmapRow]):
    """Print the result with original and recomp addresses.

    One line per row, with None values shown as blanks."""
    for row in results:
        columns = (
            f"{or_blank(row.orig_sect_ofs):14}",
            f"{or_blank(row.recomp_sect_ofs):14}",
            f"{or_blank(row.displacement):>8}",
            f"{row.sym_type:3}",
            f"{or_blank(row.size):6}",
            or_blank(row.name),
        )
        print(" ".join(columns))
|
|
|
|
|
|
def print_diff_report(results: List[RoadmapRow]):
    """Print only entries where we have the recomp address.
    This is intended for generating a file to diff against.
    The recomp addresses are always changing so we hide those.

    Rows lacking either the original or the recomp address are skipped."""
    for row in results:
        has_both_addrs = row.orig_addr is not None and row.recomp_addr is not None
        if not has_both_addrs:
            continue

        columns = (
            f"{or_blank(row.orig_sect_ofs):14}",
            f"{or_blank(row.displacement):>8}",
            f"{row.sym_type:3}",
            f"{or_blank(row.size):6}",
            or_blank(row.name),
        )
        print(" ".join(columns))
|
|
|
|
|
|
def export_to_csv(csv_file: str, results: List[RoadmapRow]):
    """Write every row to `csv_file` as UTF-8 CSV, preceded by a header line.

    The file is created or overwritten. None values are written as empty
    fields. Fields that contain a comma, a quote or a line break (C++
    template names and repr'd strings commonly do) are quoted by the csv
    module so that each row always parses back into nine columns."""
    header = [
        "orig_sect_ofs",
        "recomp_sect_ofs",
        "orig_addr",
        "recomp_addr",
        "displacement",
        "row_type",
        "size",
        "name",
        "module",
    ]

    # The file is opened with default newline handling and the writer emits
    # a bare "\n", so line endings come out the same as when the lines were
    # written by hand.
    with open(csv_file, "w", encoding="utf-8") as f:
        writer = csv.writer(f, lineterminator="\n")
        writer.writerow(header)
        writer.writerows(results)
|
|
|
|
|
|
def parse_args() -> argparse.Namespace:
    """Parse the command line and check that the given paths exist.

    Unrecognized arguments are ignored. If one of the three input files or
    the source directory is missing, the parser reports the error and
    exits."""
    parser = argparse.ArgumentParser(
        description="Show all addresses from original and recomp."
    )

    # Required positional arguments: (dest, metavar, help)
    positionals = (
        ("original", "original-binary", "The original binary"),
        ("recompiled", "recompiled-binary", "The recompiled binary"),
        ("pdb", "recompiled-pdb", "The PDB of the recompiled binary"),
        ("decomp_dir", "decomp-dir", "The decompiled source tree"),
    )
    for dest, metavar, help_text in positionals:
        parser.add_argument(dest, metavar=metavar, help=help_text)

    parser.add_argument("--csv", metavar="<file>", help="If set, export to CSV")
    parser.add_argument(
        "--verbose", "-v", action="store_true", help="Show recomp addresses in output"
    )
    # Given without a value, --order uses "fun". Not given at all, it is None.
    parser.add_argument(
        "--order",
        const="fun",
        nargs="?",
        type=str,
        help="Show suggested order of modules (using the specified symbol type)",
    )

    args, _unknown = parser.parse_known_args()

    # Checked in this order; parser.error exits on the first failure.
    path_checks = (
        (
            os.path.isfile,
            args.original,
            f"Original binary {args.original} does not exist",
        ),
        (
            os.path.isfile,
            args.recompiled,
            f"Recompiled binary {args.recompiled} does not exist",
        ),
        (os.path.isfile, args.pdb, f"Symbols PDB {args.pdb} does not exist"),
        (
            os.path.isdir,
            args.decomp_dir,
            f"Source directory {args.decomp_dir} does not exist",
        ),
    )
    for exists, path, message in path_checks:
        if not exists(path):
            parser.error(message)

    return args
|
|
|
|
|
|
def main():
    """Compare the original and recompiled binaries and report the results.

    Builds one RoadmapRow per entry returned by the compare engine, then
    does exactly one of the following:
    - with --order: print the suggested module order and stop;
    - with --csv: write the rows to the CSV file (nothing is printed);
    - otherwise: print the text report (--verbose) or the diff report.
    """
    args = parse_args()

    with IsleBin(args.original, find_str=True) as orig_bin, IsleBin(
        args.recompiled
    ) as recomp_bin:
        engine = IsleCompare(orig_bin, recomp_bin, args.pdb, args.decomp_dir)

        module_map = ModuleMap(args.pdb, recomp_bin)

        def is_same_section(orig: int, recomp: int) -> bool:
            """Compare the section name instead of the index.
            LEGO1.dll adds extra sections for some reason. (Smacker library?)"""

            # The section numbers are used as 1-based indices into each
            # binary's section list. An out-of-range number counts as
            # "not the same section".
            try:
                orig_name = orig_bin.sections[orig - 1].name
                recomp_name = recomp_bin.sections[recomp - 1].name
                return orig_name == recomp_name
            except IndexError:
                return False

        def to_roadmap_row(match):
            """Convert one compare-db match into a RoadmapRow.

            Fields that cannot be determined stay None. The address
            lookups are not guarded here; the caller handles
            InvalidVirtualAddressError."""
            orig_sect = None
            orig_ofs = None
            orig_sect_ofs = None
            recomp_sect = None
            recomp_ofs = None
            recomp_sect_ofs = None
            orig_addr = None
            recomp_addr = None
            displacement = None
            module_name = None

            # The module is found from the recomp address, so a match
            # without a recomp address never gets a module name.
            if match.recomp_addr is not None:
                if (module_ref := module_map.get_module(match.recomp_addr)) is not None:
                    (_, module_name) = module_ref

            row_type = match_type_abbreviation(match.compare_type)
            # Strings are shown via repr(); all other names are used as-is.
            name = (
                repr(match.name)
                if match.compare_type == SymbolType.STRING
                else match.name
            )

            if match.orig_addr is not None:
                orig_addr = match.orig_addr
                (orig_sect, orig_ofs) = orig_bin.get_relative_addr(match.orig_addr)
                orig_sect_ofs = f"{orig_sect:04}:{orig_ofs:08x}"

            if match.recomp_addr is not None:
                recomp_addr = match.recomp_addr
                (recomp_sect, recomp_ofs) = recomp_bin.get_relative_addr(
                    match.recomp_addr
                )
                recomp_sect_ofs = f"{recomp_sect:04}:{recomp_ofs:08x}"

            # The displacement is the difference between the two section
            # offsets, computed only when both addresses are known and
            # their sections have the same name.
            if (
                orig_sect is not None
                and recomp_sect is not None
                and is_same_section(orig_sect, recomp_sect)
            ):
                displacement = recomp_ofs - orig_ofs

            return RoadmapRow(
                orig_sect_ofs,
                recomp_sect_ofs,
                orig_addr,
                recomp_addr,
                displacement,
                row_type,
                match.size,
                name,
                module_name,
            )

        def roadmap_row_generator(matches):
            """Yield a RoadmapRow for each match, silently skipping any
            match whose conversion raises InvalidVirtualAddressError."""
            for match in matches:
                try:
                    yield to_roadmap_row(match)
                except InvalidVirtualAddressError:
                    # This is here to work around the fact that we have RVA
                    # values (i.e. not real virtual addrs) in our compare db.
                    pass

        results = list(roadmap_row_generator(engine.get_all()))

        # --order replaces every other kind of output.
        if args.order is not None:
            suggest_order(results, module_map, args.order)
            return

        # Reports are printed only when no CSV export was requested.
        if args.csv is None:
            if args.verbose:
                print("ORIG sections:")
                print_sections(orig_bin.sections)

                print("RECOMP sections:")
                print_sections(recomp_bin.sections)

                print_text_report(results)
            else:
                print_diff_report(results)

        if args.csv is not None:
            export_to_csv(args.csv, results)


if __name__ == "__main__":
    main()
|