Reccmp: Use symbol names in asm output (#433)

* Name substitution for reccmp asm output
* Decomp marker corrections
* Fix a few annotations
* Fix IslePathActor dtor
* Fix audio presenter
* Fix LegoEntity::Create
* Fix Pizza and related
* Fix path part
* Add missing annotations
* Add missing annotations
* Add more missing annotations
* Fix MxNotificationParam
* More fixes
* More fixes
* Add missing annotations
* Fixes
* More annotations
* More annotations
* More annotations
* More annotations
* Fixes and annotations
* Find imports and thunk functions
* Fix more bugs
* Add some markers for LEGO1 imports, fix SIZE comment
* Add more annotations
* Rename annotation
* Fix bugs and annotations
* Fix bug
* Order
* Update legoanimpresenter.h
* Re-enable print-rec-addr option

---------

Co-authored-by: Christian Semmler <mail@csemmler.com>
2025-10-23 16:34:06 +00:00 · 2024-01-14 16:28:46 -05:00
parent 7f7e6e37dd
commit 7e9d3bde65
73 changed files with 1357 additions and 427 deletions
--- a/tools/reccmp/reccmp.py
+++ b/tools/reccmp/reccmp.py
@@ -2,167 +2,20 @@

 import argparse
 import base64
-import difflib
 import json
 import logging
 import os
-import re

 from isledecomp import (
    Bin,
    get_file_in_script_dir,
-    OffsetPlaceholderGenerator,
    print_diff,
 )
 from isledecomp.compare import Compare as IsleCompare
-
-from capstone import Cs, CS_ARCH_X86, CS_MODE_32
-import colorama
 from pystache import Renderer
+import colorama

-
-REGISTER_LIST = set(
-    [
-        "ax",
-        "bp",
-        "bx",
-        "cx",
-        "di",
-        "dx",
-        "eax",
-        "ebp",
-        "ebx",
-        "ecx",
-        "edi",
-        "edx",
-        "esi",
-        "esp",
-        "si",
-        "sp",
-    ]
-)
-WORDS = re.compile(r"\w+")
-
-
-def sanitize(file, placeholder_generator, mnemonic, op_str):
-    op_str_is_number = False
-    try:
-        int(op_str, 16)
-        op_str_is_number = True
-    except ValueError:
-        pass
-
-    if (mnemonic in ["call", "jmp"]) and op_str_is_number:
-        # Filter out "calls" because the offsets we're not currently trying to
-        # match offsets. As long as there's a call in the right place, it's
-        # probably accurate.
-        op_str = placeholder_generator.get(int(op_str, 16))
-    else:
-
-        def filter_out_ptr(ptype, op_str):
-            try:
-                ptrstr = ptype + " ptr ["
-                start = op_str.index(ptrstr) + len(ptrstr)
-                end = op_str.index("]", start)
-
-                # This will throw ValueError if not hex
-                inttest = int(op_str[start:end], 16)
-
-                return (
-                    op_str[0:start] + placeholder_generator.get(inttest) + op_str[end:]
-                )
-            except ValueError:
-                return op_str
-
-        # Filter out dword ptrs where the pointer is to an offset
-        op_str = filter_out_ptr("dword", op_str)
-        op_str = filter_out_ptr("word", op_str)
-        op_str = filter_out_ptr("byte", op_str)
-
-        # Use heuristics to filter out any args that look like offsets
-        words = op_str.split(" ")
-        for i, word in enumerate(words):
-            try:
-                inttest = int(word, 16)
-                if file.is_relocated_addr(inttest):
-                    words[i] = placeholder_generator.get(inttest)
-            except ValueError:
-                pass
-        op_str = " ".join(words)
-
-    return mnemonic, op_str
-
-
-def parse_asm(disassembler, file, asm_addr, size):
-    asm = []
-    data = file.read(asm_addr, size)
-    placeholder_generator = OffsetPlaceholderGenerator()
-    for i in disassembler.disasm(data, 0):
-        # Use heuristics to disregard some differences that aren't representative
-        # of the accuracy of a function (e.g. global offsets)
-        mnemonic, op_str = sanitize(file, placeholder_generator, i.mnemonic, i.op_str)
-        if op_str is None:
-            asm.append(mnemonic)
-        else:
-            asm.append(f"{mnemonic} {op_str}")
-    return asm
-
-
-def get_registers(line: str):
-    to_replace = []
-    # use words regex to find all matching positions:
-    for match in WORDS.finditer(line):
-        reg = match.group(0)
-        if reg in REGISTER_LIST:
-            to_replace.append((reg, match.start()))
-    return to_replace
-
-
-def replace_register(
-    lines: list[str], start_line: int, reg: str, replacement: str
-) -> list[str]:
-    return [
-        line.replace(reg, replacement) if i >= start_line else line
-        for i, line in enumerate(lines)
-    ]
-
-
-# Is it possible to make new_asm the same as original_asm by swapping registers?
-def can_resolve_register_differences(original_asm, new_asm):
-    # Split the ASM on spaces to get more granularity, and so
-    # that we don't modify the original arrays passed in.
-    original_asm = [part for line in original_asm for part in line.split()]
-    new_asm = [part for line in new_asm for part in line.split()]
-
-    # Swapping ain't gonna help if the lengths are different
-    if len(original_asm) != len(new_asm):
-        return False
-
-    # Look for the mismatching lines
-    for i, original_line in enumerate(original_asm):
-        new_line = new_asm[i]
-        if new_line != original_line:
-            # Find all the registers to replace
-            to_replace = get_registers(original_line)
-
-            for replace in to_replace:
-                (reg, reg_index) = replace
-                replacing_reg = new_line[reg_index : reg_index + len(reg)]
-                if replacing_reg in REGISTER_LIST:
-                    if replacing_reg != reg:
-                        # Do a three-way swap replacing in all the subsequent lines
-                        temp_reg = "&" * len(reg)
-                        new_asm = replace_register(new_asm, i, replacing_reg, temp_reg)
-                        new_asm = replace_register(new_asm, i, reg, replacing_reg)
-                        new_asm = replace_register(new_asm, i, temp_reg, reg)
-                else:
-                    # No replacement to do, different code, bail out
-                    return False
-    # Check if the lines are now the same
-    for i, original_line in enumerate(original_asm):
-        if new_asm[i] != original_line:
-            return False
-    return True
+colorama.init()


 def gen_html(html_file, data):
@@ -197,9 +50,88 @@ def gen_svg(svg_file, name_svg, icon, svg_implemented_funcs, total_funcs, raw_ac
        svgfile.write(output_data)


-# Do the actual work
-def main():
-    # pylint: disable=too-many-locals, too-many-nested-blocks, too-many-branches, too-many-statements
+def get_percent_color(value: float) -> str:
+    """Return colorama ANSI escape character for the given decimal value."""
+    if value == 1.0:
+        return colorama.Fore.GREEN
+    if value > 0.8:
+        return colorama.Fore.YELLOW
+
+    return colorama.Fore.RED
+
+
+def percent_string(
+    ratio: float, is_effective: bool = False, is_plain: bool = False
+) -> str:
+    """Helper to construct a percentage string from the given ratio.
+    If is_effective (i.e. effective match), indicate that with the asterisk.
+    If is_plain, don't use colorama ANSI codes."""
+
+    percenttext = f"{(ratio * 100):.2f}%"
+    effective_star = "*" if is_effective else ""
+
+    if is_plain:
+        return percenttext + effective_star
+
+    return "".join(
+        [
+            get_percent_color(ratio),
+            percenttext,
+            colorama.Fore.RED if is_effective else "",
+            effective_star,
+            colorama.Style.RESET_ALL,
+        ]
+    )
+
+
+def print_match_verbose(match, show_both_addrs: bool = False, is_plain: bool = False):
+    percenttext = percent_string(
+        match.effective_ratio, match.is_effective_match, is_plain
+    )
+
+    if show_both_addrs:
+        addrs = f"0x{match.orig_addr:x} / 0x{match.recomp_addr:x}"
+    else:
+        addrs = hex(match.orig_addr)
+
+    if match.effective_ratio == 1.0:
+        ok_text = (
+            "OK!"
+            if is_plain
+            else (colorama.Fore.GREEN + "✨ OK! ✨" + colorama.Style.RESET_ALL)
+        )
+        if match.ratio == 1.0:
+            print(f"{addrs}: {match.name} 100% match.\n\n{ok_text}\n\n")
+        else:
+            print(
+                f"{addrs}: {match.name} Effective 100%% match. (Differs in register allocation only)\n\n{ok_text} (still differs in register allocation)\n\n"
+            )
+    else:
+        print_diff(match.udiff, is_plain)
+
+        print(
+            f"\n{match.name} is only {percenttext} similar to the original, diff above"
+        )
+
+
+def print_match_oneline(match, show_both_addrs: bool = False, is_plain: bool = False):
+    percenttext = percent_string(
+        match.effective_ratio, match.is_effective_match, is_plain
+    )
+
+    if show_both_addrs:
+        addrs = f"0x{match.orig_addr:x} / 0x{match.recomp_addr:x}"
+    else:
+        addrs = hex(match.orig_addr)
+
+    print(f"  {match.name} ({addrs}) is {percenttext} similar to the original")
+
+
+def parse_args() -> argparse.Namespace:
+    def virtual_address(value) -> int:
+        """Helper method for argparse, verbose parameter"""
+        return int(value, 16)
+
    parser = argparse.ArgumentParser(
        allow_abbrev=False,
        description="Recompilation Compare: compare an original EXE with a recompiled EXE + PDB.",
@@ -226,6 +158,7 @@ def main():
        "--verbose",
        "-v",
        metavar="<offset>",
+        type=virtual_address,
        help="Print assembly diff for specific function (original file's offset)",
    )
    parser.add_argument(
@@ -258,198 +191,103 @@ def main():

    args = parser.parse_args()

+    if not os.path.isfile(args.original):
+        parser.error(f"Original binary {args.original} does not exist")
+
+    if not os.path.isfile(args.recompiled):
+        parser.error(f"Recompiled binary {args.recompiled} does not exist")
+
+    if not os.path.isfile(args.pdb):
+        parser.error(f"Symbols PDB {args.pdb} does not exist")
+
+    if not os.path.isdir(args.decomp_dir):
+        parser.error(f"Source directory {args.decomp_dir} does not exist")
+
+    return args
+
+
+def main():
+    args = parse_args()
    logging.basicConfig(level=args.loglevel, format="[%(levelname)s] %(message)s")

-    colorama.init()
-
-    verbose = None
-    found_verbose_target = False
-    if args.verbose:
-        try:
-            verbose = int(args.verbose, 16)
-        except ValueError:
-            parser.error("invalid verbose argument")
-    html_path = args.html
-
-    plain = args.no_color
-
-    original = args.original
-    if not os.path.isfile(original):
-        parser.error(f"Original binary {original} does not exist")
-
-    recomp = args.recompiled
-    if not os.path.isfile(recomp):
-        parser.error(f"Recompiled binary {recomp} does not exist")
-
-    syms = args.pdb
-    if not os.path.isfile(syms):
-        parser.error(f"Symbols PDB {syms} does not exist")
-
-    source = args.decomp_dir
-    if not os.path.isdir(source):
-        parser.error(f"Source directory {source} does not exist")
-
-    svg = args.svg
-
-    with Bin(original, find_str=True) as origfile, Bin(recomp) as recompfile:
-        if verbose is not None:
+    with Bin(args.original, find_str=True) as origfile, Bin(
+        args.recompiled
+    ) as recompfile:
+        if args.verbose is not None:
            # Mute logger events from compare engine
            logging.getLogger("isledecomp.compare.db").setLevel(logging.CRITICAL)
            logging.getLogger("isledecomp.compare.lines").setLevel(logging.CRITICAL)

-        isle_compare = IsleCompare(origfile, recompfile, syms, source)
+        isle_compare = IsleCompare(origfile, recompfile, args.pdb, args.decomp_dir)

        print()

-        capstone_disassembler = Cs(CS_ARCH_X86, CS_MODE_32)
+        ### Compare one or none.
+
+        if args.verbose is not None:
+            match = isle_compare.compare_function(args.verbose)
+            if match is None:
+                print(f"Failed to find the function with address 0x{args.verbose:x}")
+                return
+
+            print_match_verbose(
+                match, show_both_addrs=args.print_rec_addr, is_plain=args.no_color
+            )
+            return
+
+        ### Compare everything.

        function_count = 0
        total_accuracy = 0
        total_effective_accuracy = 0
        htmlinsert = []

-        matches = []
-        if verbose is not None:
-            match = isle_compare.get_one_function(verbose)
-            if match is not None:
-                found_verbose_target = True
-                matches = [match]
-        else:
-            matches = isle_compare.get_functions()
-
-        for match in matches:
-            # The effective_ratio is the ratio when ignoring differing register
-            # allocation vs the ratio is the true ratio.
-            ratio = 0.0
-            effective_ratio = 0.0
-            if match.size:
-                origasm = parse_asm(
-                    capstone_disassembler,
-                    origfile,
-                    match.orig_addr,
-                    match.size,
-                )
-                recompasm = parse_asm(
-                    capstone_disassembler,
-                    recompfile,
-                    match.recomp_addr,
-                    match.size,
-                )
-
-                diff = difflib.SequenceMatcher(None, origasm, recompasm)
-                ratio = diff.ratio()
-                effective_ratio = ratio
-
-                if ratio != 1.0:
-                    # Check whether we can resolve register swaps which are actually
-                    # perfect matches modulo compiler entropy.
-                    if can_resolve_register_differences(origasm, recompasm):
-                        effective_ratio = 1.0
-            else:
-                ratio = 0
-
-            percenttext = f"{(effective_ratio * 100):.2f}%"
-            if not plain:
-                if effective_ratio == 1.0:
-                    percenttext = (
-                        colorama.Fore.GREEN + percenttext + colorama.Style.RESET_ALL
-                    )
-                elif effective_ratio > 0.8:
-                    percenttext = (
-                        colorama.Fore.YELLOW + percenttext + colorama.Style.RESET_ALL
-                    )
-                else:
-                    percenttext = (
-                        colorama.Fore.RED + percenttext + colorama.Style.RESET_ALL
-                    )
-
-            if effective_ratio == 1.0 and ratio != 1.0:
-                if plain:
-                    percenttext += "*"
-                else:
-                    percenttext += colorama.Fore.RED + "*" + colorama.Style.RESET_ALL
-
-            if args.print_rec_addr:
-                addrs = f"0x{match.orig_addr:x} / 0x{match.recomp_addr:x}"
-            else:
-                addrs = hex(match.orig_addr)
-
-            if not verbose:
-                print(
-                    f"  {match.name} ({addrs}) is {percenttext} similar to the original"
-                )
+        for match in isle_compare.compare_functions():
+            print_match_oneline(
+                match, show_both_addrs=args.print_rec_addr, is_plain=args.no_color
+            )

            function_count += 1
-            total_accuracy += ratio
-            total_effective_accuracy += effective_ratio
+            total_accuracy += match.ratio
+            total_effective_accuracy += match.effective_ratio

-            if match.size:
-                udiff = difflib.unified_diff(origasm, recompasm, n=10)
-
-                # If verbose, print the diff for that function to the output
-                if verbose:
-                    if effective_ratio == 1.0:
-                        ok_text = (
-                            "OK!"
-                            if plain
-                            else (
-                                colorama.Fore.GREEN
-                                + "✨ OK! ✨"
-                                + colorama.Style.RESET_ALL
-                            )
-                        )
-                        if ratio == 1.0:
-                            print(f"{addrs}: {match.name} 100% match.\n\n{ok_text}\n\n")
-                        else:
-                            print(
-                                f"{addrs}: {match.name} Effective 100%% match. (Differs in register allocation only)\n\n{ok_text} (still differs in register allocation)\n\n"
-                            )
-                    else:
-                        print_diff(udiff, plain)
-
-                        print(
-                            f"\n{match.name} is only {percenttext} similar to the original, diff above"
-                        )
-
-                # If html, record the diffs to an HTML file
-                if html_path:
-                    htmlinsert.append(
-                        {
-                            "address": f"0x{match.orig_addr:x}",
-                            "name": match.name,
-                            "matching": effective_ratio,
-                            "diff": "\n".join(udiff),
-                        }
-                    )
-
-        if html_path:
-            gen_html(html_path, json.dumps(htmlinsert))
-
-        if verbose:
-            if not found_verbose_target:
-                print(f"Failed to find the function with address 0x{verbose:x}")
-        else:
-            implemented_funcs = function_count
-
-            if args.total:
-                function_count = int(args.total)
-
-            if function_count > 0:
-                effective_accuracy = total_effective_accuracy / function_count * 100
-                actual_accuracy = total_accuracy / function_count * 100
-                print(
-                    f"\nTotal effective accuracy {effective_accuracy:.2f}% across {function_count} functions ({actual_accuracy:.2f}% actual accuracy)"
+            # If html, record the diffs to an HTML file
+            if args.html is not None:
+                htmlinsert.append(
+                    {
+                        "address": f"0x{match.orig_addr:x}",
+                        "name": match.name,
+                        "matching": match.effective_ratio,
+                        "diff": "\n".join(match.udiff),
+                    }
                )

-                if svg:
-                    gen_svg(
-                        svg,
-                        os.path.basename(original),
-                        args.svg_icon,
-                        implemented_funcs,
-                        function_count,
-                        total_effective_accuracy,
-                    )
+        ## Generate files and show summary.
+
+        if args.html is not None:
+            gen_html(args.html, json.dumps(htmlinsert))
+
+        implemented_funcs = function_count
+
+        if args.total:
+            function_count = int(args.total)
+
+        if function_count > 0:
+            effective_accuracy = total_effective_accuracy / function_count * 100
+            actual_accuracy = total_accuracy / function_count * 100
+            print(
+                f"\nTotal effective accuracy {effective_accuracy:.2f}% across {function_count} functions ({actual_accuracy:.2f}% actual accuracy)"
+            )
+
+            if args.svg is not None:
+                gen_svg(
+                    args.svg,
+                    os.path.basename(args.original),
+                    args.svg_icon,
+                    implemented_funcs,
+                    function_count,
+                    total_effective_accuracy,
+                )


 if __name__ == "__main__":