mirror of
https://github.com/isledecomp/isle.git
synced 2025-10-23 00:14:22 +00:00
reccmp: HTML refactor and diff address display (#581)
* reccmp: HTML refactor and diff address display * Restore the @@ range indicator
This commit is contained in:
@@ -192,11 +192,13 @@ class ParseAsm:
|
||||
def parse_asm(self, data: bytes, start_addr: Optional[int] = 0) -> List[str]:
|
||||
asm = []
|
||||
|
||||
for inst in disassembler.disasm_lite(data, start_addr):
|
||||
for raw_inst in disassembler.disasm_lite(data, start_addr):
|
||||
# Use heuristics to disregard some differences that aren't representative
|
||||
# of the accuracy of a function (e.g. global offsets)
|
||||
result = self.sanitize(DisasmLiteInst(*inst))
|
||||
inst = DisasmLiteInst(*raw_inst)
|
||||
result = self.sanitize(inst)
|
||||
|
||||
# mnemonic + " " + op_str
|
||||
asm.append(" ".join(result))
|
||||
asm.append((hex(inst.address), " ".join(result)))
|
||||
|
||||
return asm
|
||||
|
@@ -12,6 +12,7 @@ from isledecomp.dir import walk_source_dir
|
||||
from isledecomp.types import SymbolType
|
||||
from isledecomp.compare.asm import ParseAsm, can_resolve_register_differences
|
||||
from .db import CompareDb, MatchInfo
|
||||
from .diff import combined_diff
|
||||
from .lines import LinesDb
|
||||
|
||||
|
||||
@@ -307,8 +308,12 @@ class Compare:
|
||||
float_lookup=recomp_float,
|
||||
)
|
||||
|
||||
orig_asm = orig_parse.parse_asm(orig_raw, match.orig_addr)
|
||||
recomp_asm = recomp_parse.parse_asm(recomp_raw, match.recomp_addr)
|
||||
orig_combined = orig_parse.parse_asm(orig_raw, match.orig_addr)
|
||||
recomp_combined = recomp_parse.parse_asm(recomp_raw, match.recomp_addr)
|
||||
|
||||
# Detach addresses from asm lines for the text diff.
|
||||
orig_asm = [x[1] for x in orig_combined]
|
||||
recomp_asm = [x[1] for x in recomp_combined]
|
||||
|
||||
diff = difflib.SequenceMatcher(None, orig_asm, recomp_asm)
|
||||
ratio = diff.ratio()
|
||||
@@ -317,7 +322,9 @@ class Compare:
|
||||
# Check whether we can resolve register swaps which are actually
|
||||
# perfect matches modulo compiler entropy.
|
||||
is_effective_match = can_resolve_register_differences(orig_asm, recomp_asm)
|
||||
unified_diff = difflib.unified_diff(orig_asm, recomp_asm, n=10)
|
||||
unified_diff = combined_diff(
|
||||
diff, orig_combined, recomp_combined, context_size=10
|
||||
)
|
||||
else:
|
||||
is_effective_match = False
|
||||
unified_diff = []
|
||||
@@ -352,9 +359,7 @@ class Compare:
|
||||
[t for (t,) in struct.iter_unpack("<L", recomp_table)],
|
||||
)
|
||||
|
||||
def match_text(
|
||||
i: int, m: Optional[MatchInfo], raw_addr: Optional[int] = None
|
||||
) -> str:
|
||||
def match_text(m: Optional[MatchInfo], raw_addr: Optional[int] = None) -> str:
|
||||
"""Format the function reference at this vtable index as text.
|
||||
If we have not identified this function, we have the option to
|
||||
display the raw address. This is only worth doing for the original addr
|
||||
@@ -363,19 +368,18 @@ class Compare:
|
||||
should override the given function from the superclass, but we have not
|
||||
implemented this yet.
|
||||
"""
|
||||
index = f"vtable0x{i*4:02x}"
|
||||
|
||||
if m is not None:
|
||||
orig = hex(m.orig_addr) if m.orig_addr is not None else "no orig"
|
||||
recomp = (
|
||||
hex(m.recomp_addr) if m.recomp_addr is not None else "no recomp"
|
||||
)
|
||||
return f"{index:>12} : ({orig:10} / {recomp:10}) : {m.name}"
|
||||
return f"({orig} / {recomp}) : {m.name}"
|
||||
|
||||
if raw_addr is not None:
|
||||
return f"{index:>12} : 0x{raw_addr:x} from orig not annotated."
|
||||
return f"0x{raw_addr:x} from orig not annotated."
|
||||
|
||||
return f"{index:>12} : (no match)"
|
||||
return "(no match)"
|
||||
|
||||
orig_text = []
|
||||
recomp_text = []
|
||||
@@ -395,14 +399,22 @@ class Compare:
|
||||
ratio += 1
|
||||
|
||||
n_entries += 1
|
||||
orig_text.append(match_text(i, orig, raw_orig))
|
||||
recomp_text.append(match_text(i, recomp))
|
||||
index = f"vtable0x{i*4:02x}"
|
||||
orig_text.append((index, match_text(orig, raw_orig)))
|
||||
recomp_text.append((index, match_text(recomp)))
|
||||
|
||||
ratio = ratio / float(n_entries) if n_entries > 0 else 0
|
||||
|
||||
# n=100: Show the entire table if there is a diff to display.
|
||||
# Otherwise it would be confusing if the table got cut off.
|
||||
unified_diff = difflib.unified_diff(orig_text, recomp_text, n=100)
|
||||
|
||||
sm = difflib.SequenceMatcher(
|
||||
None,
|
||||
[x[1] for x in orig_text],
|
||||
[x[1] for x in recomp_text],
|
||||
)
|
||||
|
||||
unified_diff = combined_diff(sm, orig_text, recomp_text, context_size=100)
|
||||
|
||||
return DiffReport(
|
||||
match_type=SymbolType.VTABLE,
|
||||
|
81
tools/isledecomp/isledecomp/compare/diff.py
Normal file
81
tools/isledecomp/isledecomp/compare/diff.py
Normal file
@@ -0,0 +1,81 @@
|
||||
from difflib import SequenceMatcher
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
# One entry per line being diffed: (address text, diffed text).
# The address of an entry may be None; such entries are left out of the
# "@@" range line.
CombinedDiffInput = List[Tuple[str, str]]

# One entry per diff group: (range line, subgroups). Each subgroup is a dict
# holding either {"both": [(orig_addr, text, recomp_addr), ...]} for matching
# lines or {"orig": [(addr, text), ...], "recomp": [(addr, text), ...]} for
# lines that differ.
CombinedDiffOutput = List[Tuple[str, List[Dict[str, List[Tuple[str, ...]]]]]]


def _addr_range(addrs: List[str]) -> str:
    """Format the first and last address seen on one side of a diff group.

    Returns an empty string when that side contributed no addresses, so the
    caller can still build a range line instead of failing.
    """
    if not addrs:
        return ""

    return f"{addrs[0]},{addrs[-1]}"


def combined_diff(
    diff: SequenceMatcher,
    orig_combined: CombinedDiffInput,
    recomp_combined: CombinedDiffInput,
    context_size: int = 3,
) -> CombinedDiffOutput:
    """Build a unified-style diff that keeps the address of every line.

    The "combined" inputs are lists of (address, text) pairs. The text has
    already been diffed on its own; `diff` is the resulting SequenceMatcher.
    Its opcodes refer to list indices, which are the same for the combined
    lists, so they are used here to slice the combined lists and carry the
    addresses along. This is almost the same procedure as
    difflib.unified_diff, but it reuses the existing SequenceMatcher.

    Args:
        diff: SequenceMatcher built from the text component of both inputs.
        orig_combined: (address, text) pairs for the original.
        recomp_combined: (address, text) pairs for the recompilation.
        context_size: Number of matching lines kept around each change.

    Returns:
        A list with one (range_line, subgroups) tuple per diff group.
        range_line is "@@ -<first>,<last> +<first>,<last> @@" using the first
        and last address encountered on each side; a side with no addresses
        is left blank. Nothing is returned for inputs without differences
        because the matcher produces no groups for them.
    """
    unified_diff = []

    for group in diff.get_grouped_opcodes(context_size):
        subgroups = []

        # Addresses in encounter order, used for the "@@" range line.
        # Entries without an address are skipped so that a group which begins
        # or ends on such an entry still gets a complete range. Encounter
        # order is used rather than sorting because the addresses are text:
        # "0x10000" sorts before "0xffff".
        orig_addrs = []
        recomp_addrs = []

        for code, i1, i2, j1, j2 in group:
            orig_slice = orig_combined[i1:i2]
            recomp_slice = recomp_combined[j1:j2]

            orig_addrs.extend(addr for addr, _ in orig_slice if addr is not None)
            recomp_addrs.extend(
                addr for addr, _ in recomp_slice if addr is not None
            )

            if code == "equal":
                # Equal sections have the same length on both sides. The text
                # is identical so it is taken from the original only, but the
                # address is needed from both.
                both = [
                    (orig_addr, text, recomp_addr)
                    for (orig_addr, text), (recomp_addr, _) in zip(
                        orig_slice, recomp_slice
                    )
                ]
                subgroups.append({"both": both})
            else:
                subgroups.append({"orig": orig_slice, "recomp": recomp_slice})

        diff_slug = (
            f"@@ -{_addr_range(orig_addrs)} +{_addr_range(recomp_addrs)} @@"
        )

        unified_diff.append((diff_slug, subgroups))

    return unified_diff
|
@@ -5,7 +5,70 @@ import logging
|
||||
import colorama
|
||||
|
||||
|
||||
def _addr_text(addr) -> str:
    """Return the printable form of an address; a missing one prints as blank."""
    return "" if addr is None else addr


def print_combined_diff(udiff, plain: bool = False, show_both: bool = False):
    """Print a combined diff: a list of (range line, subgroups) tuples.

    Each subgroup is a dict with either a "both" list of
    (orig_addr, line, recomp_addr) tuples for matching lines, or "orig" and
    "recomp" lists of (addr, line) tuples for lines that differ.

    Args:
        udiff: The diff to print. Nothing is printed when this is None.
        plain: When true, print without colorama colour codes.
        show_both: When true, prefix each line with both the original and the
            recomp address column. Otherwise matching lines show the original
            address and differing lines show the address of their own side.
    """
    if udiff is None:
        return

    # With empty colour strings the plain output is byte-for-byte what the
    # uncoloured print statements would produce, so one code path serves both.
    if plain:
        red = green = blue = reset = ""
    else:
        red = colorama.Fore.RED
        green = colorama.Fore.GREEN
        blue = colorama.Fore.BLUE
        reset = colorama.Style.RESET_ALL

    # We don't know how long the address string will be ahead of time.
    # Track the widest one seen so far and use it to fill the blank address
    # column of differing lines, to try to line things up.
    padding_size = 0

    for slug, subgroups in udiff:
        print(f"{red}---")
        print(f"{green}+++")
        print(f"{blue}{slug}")
        print(reset, end="")

        for subgroup in subgroups:
            if subgroup.get("both") is not None:
                for orig_addr, line, recomp_addr in subgroup["both"]:
                    orig_addr = _addr_text(orig_addr)
                    recomp_addr = _addr_text(recomp_addr)
                    padding_size = max(padding_size, len(orig_addr))

                    if show_both:
                        print(f"{orig_addr} / {recomp_addr} : {line}")
                    else:
                        print(f"{orig_addr} : {line}")

                continue

            for orig_addr, line in subgroup["orig"]:
                orig_addr = _addr_text(orig_addr)
                padding_size = max(padding_size, len(orig_addr))

                if show_both:
                    addr_prefix = f"{orig_addr} / {'':{padding_size}}"
                else:
                    addr_prefix = orig_addr

                print(f"{addr_prefix} : {red}-{line}{reset}")

            for recomp_addr, line in subgroup["recomp"]:
                recomp_addr = _addr_text(recomp_addr)
                padding_size = max(padding_size, len(recomp_addr))

                if show_both:
                    addr_prefix = f"{'':{padding_size}} / {recomp_addr}"
                else:
                    addr_prefix = recomp_addr

                print(f"{addr_prefix} : {green}+{line}{reset}")

        # Blank line after each diff group.
        # NOTE(review): the nesting of this statement could not be recovered
        # from the source; it is placed once per "@@" group - confirm.
        print()
|
||||
|
||||
|
||||
def print_diff(udiff, plain):
|
||||
"""Print diff in difflib.unified_diff format."""
|
||||
if udiff is None:
|
||||
return False
|
||||
|
||||
|
Reference in New Issue
Block a user