reccmp: HTML refactor and diff address display (#581)

* reccmp: HTML refactor and diff address display

* Restore the @@ range indicator
This commit is contained in:
MS
2024-02-20 02:56:33 -05:00
committed by GitHub
parent ba8f2b1c0f
commit 9c71209fb9
8 changed files with 878 additions and 240 deletions

View File

@@ -192,11 +192,13 @@ class ParseAsm:
def parse_asm(self, data: bytes, start_addr: Optional[int] = 0) -> List[str]:
asm = []
for inst in disassembler.disasm_lite(data, start_addr):
for raw_inst in disassembler.disasm_lite(data, start_addr):
# Use heuristics to disregard some differences that aren't representative
# of the accuracy of a function (e.g. global offsets)
result = self.sanitize(DisasmLiteInst(*inst))
inst = DisasmLiteInst(*raw_inst)
result = self.sanitize(inst)
# mnemonic + " " + op_str
asm.append(" ".join(result))
asm.append((hex(inst.address), " ".join(result)))
return asm

View File

@@ -12,6 +12,7 @@ from isledecomp.dir import walk_source_dir
from isledecomp.types import SymbolType
from isledecomp.compare.asm import ParseAsm, can_resolve_register_differences
from .db import CompareDb, MatchInfo
from .diff import combined_diff
from .lines import LinesDb
@@ -307,8 +308,12 @@ class Compare:
float_lookup=recomp_float,
)
orig_asm = orig_parse.parse_asm(orig_raw, match.orig_addr)
recomp_asm = recomp_parse.parse_asm(recomp_raw, match.recomp_addr)
orig_combined = orig_parse.parse_asm(orig_raw, match.orig_addr)
recomp_combined = recomp_parse.parse_asm(recomp_raw, match.recomp_addr)
# Detach addresses from asm lines for the text diff.
orig_asm = [x[1] for x in orig_combined]
recomp_asm = [x[1] for x in recomp_combined]
diff = difflib.SequenceMatcher(None, orig_asm, recomp_asm)
ratio = diff.ratio()
@@ -317,7 +322,9 @@ class Compare:
# Check whether we can resolve register swaps which are actually
# perfect matches modulo compiler entropy.
is_effective_match = can_resolve_register_differences(orig_asm, recomp_asm)
unified_diff = difflib.unified_diff(orig_asm, recomp_asm, n=10)
unified_diff = combined_diff(
diff, orig_combined, recomp_combined, context_size=10
)
else:
is_effective_match = False
unified_diff = []
@@ -352,9 +359,7 @@ class Compare:
[t for (t,) in struct.iter_unpack("<L", recomp_table)],
)
def match_text(
i: int, m: Optional[MatchInfo], raw_addr: Optional[int] = None
) -> str:
def match_text(m: Optional[MatchInfo], raw_addr: Optional[int] = None) -> str:
"""Format the function reference at this vtable index as text.
If we have not identified this function, we have the option to
display the raw address. This is only worth doing for the original addr
@@ -363,19 +368,18 @@ class Compare:
should override the given function from the superclass, but we have not
implemented this yet.
"""
index = f"vtable0x{i*4:02x}"
if m is not None:
orig = hex(m.orig_addr) if m.orig_addr is not None else "no orig"
recomp = (
hex(m.recomp_addr) if m.recomp_addr is not None else "no recomp"
)
return f"{index:>12} : ({orig:10} / {recomp:10}) : {m.name}"
return f"({orig} / {recomp}) : {m.name}"
if raw_addr is not None:
return f"{index:>12} : 0x{raw_addr:x} from orig not annotated."
return f"0x{raw_addr:x} from orig not annotated."
return f"{index:>12} : (no match)"
return "(no match)"
orig_text = []
recomp_text = []
@@ -395,14 +399,22 @@ class Compare:
ratio += 1
n_entries += 1
orig_text.append(match_text(i, orig, raw_orig))
recomp_text.append(match_text(i, recomp))
index = f"vtable0x{i*4:02x}"
orig_text.append((index, match_text(orig, raw_orig)))
recomp_text.append((index, match_text(recomp)))
ratio = ratio / float(n_entries) if n_entries > 0 else 0
# n=100: Show the entire table if there is a diff to display.
# Otherwise it would be confusing if the table got cut off.
unified_diff = difflib.unified_diff(orig_text, recomp_text, n=100)
sm = difflib.SequenceMatcher(
None,
[x[1] for x in orig_text],
[x[1] for x in recomp_text],
)
unified_diff = combined_diff(sm, orig_text, recomp_text, context_size=100)
return DiffReport(
match_type=SymbolType.VTABLE,

View File

@@ -0,0 +1,81 @@
from difflib import SequenceMatcher
from typing import Dict, List, Tuple
# Each input line is an (address, text) pair. The address slot is checked
# against None below, so a line without an address is tolerated.
CombinedDiffInput = List[Tuple[str, str]]
# Each output entry is (range slug, list of subgroups). A subgroup is either
# {"both": [(orig_addr, text, recomp_addr), ...]} for matching lines or
# {"orig": [(addr, text), ...], "recomp": [(addr, text), ...]} for differences.
CombinedDiffOutput = List[Tuple[str, List[Dict[str, Tuple[str, str]]]]]


def _first_and_last(addrs):
    """Return the first and last entries of addrs, or two empty strings."""
    if not addrs:
        return "", ""
    return addrs[0], addrs[-1]


def combined_diff(
    diff: SequenceMatcher,
    orig_combined: CombinedDiffInput,
    recomp_combined: CombinedDiffInput,
    context_size: int = 3,
) -> CombinedDiffOutput:
    """Build a unified-style diff that keeps an address next to every line.

    The "combined" inputs have two components per line: the address of the
    instruction and the assembly text. The text alone has already been
    diffed; that is the SequenceMatcher object passed in. Its opcodes refer
    to list indices of the sequences it compared, so the same indices are
    used here to slice the combined lists and recover the address of each
    line. This is almost the same procedure as difflib.unified_diff, but it
    reuses the already generated SequenceMatcher.

    Args:
        diff: Matcher built from the text component of the two inputs.
        orig_combined: (address, text) pairs for the original.
        recomp_combined: (address, text) pairs for the recompilation.
        context_size: Number of unchanged lines kept around each change.

    Returns:
        One (slug, subgroups) tuple per diff group. The slug is the "@@"
        range line holding the first and last address seen on each side.
        Identical inputs produce an empty list.
    """
    unified_diff = []

    for group in diff.get_grouped_opcodes(context_size):
        subgroups = []

        # Addresses seen in this group, kept in encounter order. Not every
        # line has an address, so the context may begin or end on a line
        # without one; collecting only the known addresses still yields a
        # complete range. The addresses are strings, so sorting them would
        # misorder values of different widths ("0x100" sorts before "0xfe").
        orig_addrs = []
        recomp_addrs = []

        for code, i1, i2, j1, j2 in group:
            orig_slice = orig_combined[i1:i2]
            recomp_slice = recomp_combined[j1:j2]

            orig_addrs.extend(addr for addr, _ in orig_slice if addr is not None)
            recomp_addrs.extend(
                addr for addr, _ in recomp_slice if addr is not None
            )

            if code == "equal":
                # Equal sections have slices of the same length. The text is
                # needed from one side only, the addresses from both.
                both = [
                    (orig_addr, text, recomp_addr)
                    for (orig_addr, text), (recomp_addr, _) in zip(
                        orig_slice, recomp_slice
                    )
                ]
                subgroups.append({"both": both})
            else:
                subgroups.append({"orig": orig_slice, "recomp": recomp_slice})

        # A side with no addresses at all gets empty strings instead of
        # raising IndexError.
        orig_first, orig_last = _first_and_last(orig_addrs)
        recomp_first, recomp_last = _first_and_last(recomp_addrs)

        diff_slug = (
            f"@@ -{orig_first},{orig_last} +{recomp_first},{recomp_last} @@"
        )
        unified_diff.append((diff_slug, subgroups))

    return unified_diff

View File

@@ -5,7 +5,70 @@ import logging
import colorama
def print_combined_diff(udiff, plain: bool = False, show_both: bool = False):
    """Print a combined diff with an address prefix on every line.

    Args:
        udiff: List of (slug, subgroups) tuples. Each subgroup is a dict
            with either a "both" key (matching lines, as
            (orig_addr, text, recomp_addr) tuples) or "orig" and "recomp"
            keys (differing lines, as (addr, text) tuples). None prints
            nothing.
        plain: When true, print without colorama colour codes.
        show_both: When true, show the original and recomp address columns
            side by side instead of a single address.

    An address of None is printed as an empty string, so lines without an
    address do not raise.
    """
    if udiff is None:
        return

    def addr_text(addr):
        # A missing address would otherwise break len() and print "None".
        return "" if addr is None else addr

    # We don't know how long the address string will be ahead of time.
    # Grow this value as addresses are seen to try to line things up.
    padding_size = 0

    for slug, subgroups in udiff:
        if plain:
            print("---")
            print("+++")
            print(slug)
        else:
            print(f"{colorama.Fore.RED}---")
            print(f"{colorama.Fore.GREEN}+++")
            print(f"{colorama.Fore.BLUE}{slug}")
            print(colorama.Style.RESET_ALL, end="")

        for subgroup in subgroups:
            if subgroup.get("both") is not None:
                for orig_addr, line, recomp_addr in subgroup["both"]:
                    orig_addr = addr_text(orig_addr)
                    recomp_addr = addr_text(recomp_addr)
                    padding_size = max(padding_size, len(orig_addr))
                    if show_both:
                        print(f"{orig_addr} / {recomp_addr} : {line}")
                    else:
                        print(f"{orig_addr} : {line}")
            else:
                for orig_addr, line in subgroup["orig"]:
                    orig_addr = addr_text(orig_addr)
                    padding_size = max(padding_size, len(orig_addr))
                    # Blank out the recomp column for a removed line.
                    addr_prefix = (
                        f"{orig_addr} / {'':{padding_size}}"
                        if show_both
                        else orig_addr
                    )

                    if plain:
                        print(f"{addr_prefix} : -{line}")
                    else:
                        print(
                            f"{addr_prefix} : {colorama.Fore.RED}-{line}{colorama.Style.RESET_ALL}"
                        )

                for recomp_addr, line in subgroup["recomp"]:
                    recomp_addr = addr_text(recomp_addr)
                    padding_size = max(padding_size, len(recomp_addr))
                    # Blank out the original column for an added line.
                    addr_prefix = (
                        f"{'':{padding_size}} / {recomp_addr}"
                        if show_both
                        else recomp_addr
                    )

                    if plain:
                        print(f"{addr_prefix} : +{line}")
                    else:
                        print(
                            f"{addr_prefix} : {colorama.Fore.GREEN}+{line}{colorama.Style.RESET_ALL}"
                        )

            # Newline between each diff subgroup.
            print()
def print_diff(udiff, plain):
"""Print diff in difflib.unified_diff format."""
if udiff is None:
return False