reccmp: HTML refactor and diff address display (#581)

* reccmp: HTML refactor and diff address display * Restore the @@ range indicator
2025-10-23 00:14:22 +00:00 · 2024-02-20 02:56:33 -05:00
parent ba8f2b1c0f
commit 9c71209fb9
8 changed files with 878 additions and 240 deletions
--- a/tools/isledecomp/isledecomp/compare/asm/parse.py
+++ b/tools/isledecomp/isledecomp/compare/asm/parse.py
@@ -192,11 +192,13 @@ class ParseAsm:
    def parse_asm(self, data: bytes, start_addr: Optional[int] = 0) -> List[str]:
        asm = []

-        for inst in disassembler.disasm_lite(data, start_addr):
+        for raw_inst in disassembler.disasm_lite(data, start_addr):
            # Use heuristics to disregard some differences that aren't representative
            # of the accuracy of a function (e.g. global offsets)
-            result = self.sanitize(DisasmLiteInst(*inst))
+            inst = DisasmLiteInst(*raw_inst)
+            result = self.sanitize(inst)
+
            # mnemonic + " " + op_str
-            asm.append(" ".join(result))
+            asm.append((hex(inst.address), " ".join(result)))

        return asm
--- a/tools/isledecomp/isledecomp/compare/core.py
+++ b/tools/isledecomp/isledecomp/compare/core.py
@@ -12,6 +12,7 @@ from isledecomp.dir import walk_source_dir
 from isledecomp.types import SymbolType
 from isledecomp.compare.asm import ParseAsm, can_resolve_register_differences
 from .db import CompareDb, MatchInfo
+from .diff import combined_diff
 from .lines import LinesDb


@@ -307,8 +308,12 @@ class Compare:
            float_lookup=recomp_float,
        )

-        orig_asm = orig_parse.parse_asm(orig_raw, match.orig_addr)
-        recomp_asm = recomp_parse.parse_asm(recomp_raw, match.recomp_addr)
+        orig_combined = orig_parse.parse_asm(orig_raw, match.orig_addr)
+        recomp_combined = recomp_parse.parse_asm(recomp_raw, match.recomp_addr)
+
+        # Detach addresses from asm lines for the text diff.
+        orig_asm = [x[1] for x in orig_combined]
+        recomp_asm = [x[1] for x in recomp_combined]

        diff = difflib.SequenceMatcher(None, orig_asm, recomp_asm)
        ratio = diff.ratio()
@@ -317,7 +322,9 @@ class Compare:
            # Check whether we can resolve register swaps which are actually
            # perfect matches modulo compiler entropy.
            is_effective_match = can_resolve_register_differences(orig_asm, recomp_asm)
-            unified_diff = difflib.unified_diff(orig_asm, recomp_asm, n=10)
+            unified_diff = combined_diff(
+                diff, orig_combined, recomp_combined, context_size=10
+            )
        else:
            is_effective_match = False
            unified_diff = []
@@ -352,9 +359,7 @@ class Compare:
            [t for (t,) in struct.iter_unpack("<L", recomp_table)],
        )

-        def match_text(
-            i: int, m: Optional[MatchInfo], raw_addr: Optional[int] = None
-        ) -> str:
+        def match_text(m: Optional[MatchInfo], raw_addr: Optional[int] = None) -> str:
            """Format the function reference at this vtable index as text.
            If we have not identified this function, we have the option to
            display the raw address. This is only worth doing for the original addr
@@ -363,19 +368,18 @@ class Compare:
            should override the given function from the superclass, but we have not
            implemented this yet.
            """
-            index = f"vtable0x{i*4:02x}"

            if m is not None:
                orig = hex(m.orig_addr) if m.orig_addr is not None else "no orig"
                recomp = (
                    hex(m.recomp_addr) if m.recomp_addr is not None else "no recomp"
                )
-                return f"{index:>12}  :  ({orig:10} / {recomp:10})  :  {m.name}"
+                return f"({orig} / {recomp})  :  {m.name}"

            if raw_addr is not None:
-                return f"{index:>12}  :  0x{raw_addr:x} from orig not annotated."
+                return f"0x{raw_addr:x} from orig not annotated."

-            return f"{index:>12}  :  (no match)"
+            return "(no match)"

        orig_text = []
        recomp_text = []
@@ -395,14 +399,22 @@ class Compare:
                ratio += 1

            n_entries += 1
-            orig_text.append(match_text(i, orig, raw_orig))
-            recomp_text.append(match_text(i, recomp))
+            index = f"vtable0x{i*4:02x}"
+            orig_text.append((index, match_text(orig, raw_orig)))
+            recomp_text.append((index, match_text(recomp)))

        ratio = ratio / float(n_entries) if n_entries > 0 else 0

        # n=100: Show the entire table if there is a diff to display.
        # Otherwise it would be confusing if the table got cut off.
-        unified_diff = difflib.unified_diff(orig_text, recomp_text, n=100)
+
+        sm = difflib.SequenceMatcher(
+            None,
+            [x[1] for x in orig_text],
+            [x[1] for x in recomp_text],
+        )
+
+        unified_diff = combined_diff(sm, orig_text, recomp_text, context_size=100)

        return DiffReport(
            match_type=SymbolType.VTABLE,
--- a/tools/isledecomp/isledecomp/compare/diff.py
+++ b/tools/isledecomp/isledecomp/compare/diff.py
@@ -0,0 +1,81 @@
+from difflib import SequenceMatcher
+from typing import Dict, List, Tuple
+
+# Each input line is an (address, text) pair: the address is the label shown
+# next to the line and the text is the value that was actually diffed.
+CombinedDiffInput = List[Tuple[str, str]]
+
+# One entry per diff group: (range header, subgroups). A subgroup is either
+# {"both": [(orig_addr, text, recomp_addr), ...]} for matching lines or
+# {"orig": [(addr, text), ...], "recomp": [(addr, text), ...]} for a change.
+CombinedDiffOutput = List[Tuple[str, List[Dict[str, List[Tuple[str, ...]]]]]]
+
+
+def _addr_range(addrs: List[str]) -> str:
+    """Format "first,last" for one side of the "@@" range header.
+
+    The addresses are expected in the order they appear in the diff group.
+    They are strings, so sorting them would compare text rather than value
+    ("0x1000" sorts before "0xffe"); list order is used instead.
+    A side with no addresses yields "," instead of raising IndexError.
+    """
+    if not addrs:
+        return ","
+
+    return f"{addrs[0]},{addrs[-1]}"
+
+
+def combined_diff(
+    diff: SequenceMatcher,
+    orig_combined: CombinedDiffInput,
+    recomp_combined: CombinedDiffInput,
+    context_size: int = 3,
+) -> CombinedDiffOutput:
+    """Build a diff that keeps the address attached to each line.
+
+    We want to diff the original and recomp assembly. The "combined" input has
+    two components per line: the address of the instruction and the assembly
+    text. The text alone has already been diffed; that is the SequenceMatcher
+    object. Its opcodes refer to list indices of the diffed sequences, so the
+    same indices are used to slice the combined lists and recover the address
+    for each line. This is almost the same procedure as difflib.unified_diff,
+    but it reuses the already generated SequenceMatcher object.
+
+    Args:
+        diff: SequenceMatcher built from the text component of both inputs.
+        orig_combined: (address, text) pairs for the original.
+        recomp_combined: (address, text) pairs for the recompilation.
+        context_size: Number of matching lines kept around each change.
+
+    Returns:
+        A list with one (range header, subgroups) tuple per diff group.
+        The list is empty when the two sequences are identical.
+    """
+
+    unified_diff = []
+
+    for group in diff.get_grouped_opcodes(context_size):
+        subgroups = []
+
+        # Addresses seen in this group, in order, used for the "@@" range
+        # header. Collect them instead of indexing the ends of the slices
+        # because a line may have no address (None); if the context began or
+        # ended on such a line the range would be incomplete.
+        orig_addrs = []
+        recomp_addrs = []
+
+        for code, i1, i2, j1, j2 in group:
+            orig_lines = orig_combined[i1:i2]
+            recomp_lines = recomp_combined[j1:j2]
+
+            orig_addrs.extend(addr for addr, _ in orig_lines if addr is not None)
+            recomp_addrs.extend(addr for addr, _ in recomp_lines if addr is not None)
+
+            if code == "equal":
+                # The sections are equal, so both slices have the same length.
+                # Only one copy of the text is needed, but the address from
+                # each side is kept.
+                both = [
+                    (orig_addr, text, recomp_addr)
+                    for (orig_addr, text), (recomp_addr, _) in zip(
+                        orig_lines, recomp_lines
+                    )
+                ]
+                subgroups.append({"both": both})
+            else:
+                subgroups.append({"orig": orig_lines, "recomp": recomp_lines})
+
+        diff_slug = f"@@ -{_addr_range(orig_addrs)} +{_addr_range(recomp_addrs)} @@"
+
+        unified_diff.append((diff_slug, subgroups))
+
+    return unified_diff
--- a/tools/isledecomp/isledecomp/utils.py
+++ b/tools/isledecomp/isledecomp/utils.py
@@ -5,7 +5,70 @@ import logging
 import colorama


+def print_combined_diff(udiff, plain: bool = False, show_both: bool = False):
+    """Print a combined diff (a list of (slug, subgroups) tuples) to stdout.
+
+    Each subgroup is a dict. If it has a "both" entry, that is a list of
+    (orig_addr, line, recomp_addr) triples printed without a +/- marker.
+    Otherwise its "orig" lines are printed with "-" and its "recomp" lines
+    with "+", each as (addr, line) pairs.
+
+    Args:
+        udiff: The diff to print. Nothing is printed if this is None.
+        plain: If True, print without colorama color codes.
+        show_both: If True, prefix each line with "orig / recomp" addresses,
+            leaving the side that does not apply blank. If False, only the
+            address from the side the line belongs to is shown (the orig
+            address for unchanged lines).
+    """
+    if udiff is None:
+        return
+
+    def addr_text(addr) -> str:
+        # A line may come without an address (None). Print it as blank
+        # instead of failing on len(None).
+        return "" if addr is None else addr
+
+    # We don't know how long the address string will be ahead of time.
+    # Track the widest one seen so far and use it to pad the blank side
+    # of a changed line when showing both addresses.
+    padding_size = 0
+
+    for slug, subgroups in udiff:
+        if plain:
+            print("---")
+            print("+++")
+            print(slug)
+        else:
+            print(f"{colorama.Fore.RED}---")
+            print(f"{colorama.Fore.GREEN}+++")
+            print(f"{colorama.Fore.BLUE}{slug}")
+            print(colorama.Style.RESET_ALL, end="")
+
+        for subgroup in subgroups:
+            if subgroup.get("both") is not None:
+                for orig_addr, line, recomp_addr in subgroup["both"]:
+                    orig_text = addr_text(orig_addr)
+                    padding_size = max(padding_size, len(orig_text))
+                    if show_both:
+                        print(f"{orig_text} / {addr_text(recomp_addr)} : {line}")
+                    else:
+                        print(f"{orig_text} : {line}")
+                continue
+
+            for orig_addr, line in subgroup["orig"]:
+                orig_text = addr_text(orig_addr)
+                padding_size = max(padding_size, len(orig_text))
+                # Explicit spaces rather than a dynamic format width, which
+                # is not safe for a width of zero on every Python version.
+                blank = " " * padding_size
+                addr_prefix = f"{orig_text} / {blank}" if show_both else orig_text
+
+                if plain:
+                    print(f"{addr_prefix} : -{line}")
+                else:
+                    print(
+                        f"{addr_prefix} : {colorama.Fore.RED}-{line}{colorama.Style.RESET_ALL}"
+                    )
+
+            for recomp_addr, line in subgroup["recomp"]:
+                recomp_text = addr_text(recomp_addr)
+                padding_size = max(padding_size, len(recomp_text))
+                blank = " " * padding_size
+                addr_prefix = f"{blank} / {recomp_text}" if show_both else recomp_text
+
+                if plain:
+                    print(f"{addr_prefix} : +{line}")
+                else:
+                    print(
+                        f"{addr_prefix} : {colorama.Fore.GREEN}+{line}{colorama.Style.RESET_ALL}"
+                    )
+
+        # Blank line after each diff group.
+        print()
+
+
 def print_diff(udiff, plain):
+    """Print diff in difflib.unified_diff format."""
    if udiff is None:
        return False