Swap cmp operands for effective match (#783)

2025-10-23 08:24:16 +00:00 · 2024-04-07 16:57:41 -04:00
parent 1bfe47357b
commit 70912d16c6
2 changed files with 110 additions and 1 deletions
--- a/tools/isledecomp/isledecomp/compare/asm/fixes.py
+++ b/tools/isledecomp/isledecomp/compare/asm/fixes.py
@@ -0,0 +1,106 @@
 from difflib import SequenceMatcher
 from typing import List
 # Pairs of conditional-jump mnemonics that jump_swap_ok() accepts as a match
 # when the operands of the preceding cmp appear in the opposite order.
 # Each ordered jump is paired with its mirror image; je and jne are paired
 # with themselves.
 ALLOWED_JUMP_SWAPS = (
    ("ja", "jb"),
    ("jae", "jbe"),
    ("jb", "ja"),
    ("jbe", "jae"),
    ("jg", "jl"),
    ("jge", "jle"),
    ("jl", "jg"),
    ("jle", "jge"),
    ("je", "je"),
    ("jne", "jne"),
 )
 def jump_swap_ok(a: str, b: str) -> bool:
    """Report whether jump instructions a and b agree with each other
    once the operands of the preceding cmp have been exchanged.

    Only the mnemonic (the text before the first space) of each
    instruction is looked at; the resulting pair has to be listed in
    ALLOWED_JUMP_SWAPS."""
    mnemonic_pair = (a.split(" ", 1)[0], b.split(" ", 1)[0])
    return mnemonic_pair in ALLOWED_JUMP_SWAPS
 def is_operand_swap(a: str, b: str) -> bool:
    """Return True if instruction b is instruction a with its two operands
    exchanged, i.e. a is `mnemonic X, Y` and b is `mnemonic Y, X`.

    The operands are not parsed. Breaking on the first ', ' is not enough
    because templates or string literals inside an operand can contain
    that separator too. Instead every ', ' in a is tried as the operand
    boundary and b must be exactly the two halves in reverse order.

    This is stricter than comparing the characters used by both strings,
    which would wrongly accept a pair like:
        cmp eax, dword ptr [ecx + 0x1234]
        cmp ecx, dword ptr [eax + 0x1234]

    Identical instructions are not considered a swap.
    """
    if a == b:
        return False
    (mnemonic_a, _, operands_a) = a.partition(" ")
    (mnemonic_b, _, operands_b) = b.partition(" ")
    # A swap keeps the mnemonic and the total operand length unchanged.
    if mnemonic_a != mnemonic_b or len(operands_a) != len(operands_b):
        return False
    separator = ", "
    pos = operands_a.find(separator)
    while pos != -1:
        first = operands_a[:pos]
        second = operands_a[pos + len(separator) :]
        if operands_b == second + separator + first:
            return True
        # This ', ' was not the operand boundary (or b is no swap); try the next.
        pos = operands_a.find(separator, pos + 1)
    return False
 def can_cmp_swap(orig: List[str], recomp: List[str]) -> bool:
    """Decide whether a two-instruction window from orig and one from
    recomp differ only by a cmp whose operands were exchanged, followed
    by a jump that is compatible with that exchange.

    orig and recomp must each hold exactly two instructions: a cmp and
    then a jump. Anything else yields False.
    """
    # Make sure we have 1 cmp and 1 jmp for both
    if len(orig) != 2 or len(recomp) != 2:
        return False
    (orig_cmp, orig_jmp) = orig
    (recomp_cmp, recomp_jmp) = recomp
    # Compare the whole mnemonic: a prefix test would also let through
    # cmpxchg, cmps* and the SSE cmp* family, where exchanging operands
    # does not merely mirror the resulting flags.
    if orig_cmp.partition(" ")[0] != "cmp" or recomp_cmp.partition(" ")[0] != "cmp":
        return False
    # Cheap pre-filter; jump_swap_ok does the exact mnemonic check.
    if not orig_jmp.startswith("j") or not recomp_jmp.startswith("j"):
        return False
    # Checking two things:
    # Are the cmp operands flipped?
    # Is the jump instruction compatible with a flip?
    return is_operand_swap(orig_cmp, recomp_cmp) and jump_swap_ok(
        orig_jmp, recomp_jmp
    )
 def patch_jump(a: str, b: str) -> str:
    """Build a jump instruction from the mnemonic of a and the operand
    of b, separated by a single space.

    Returning a unchanged would be wrong: the two jumps may carry
    different displacement offsets or labels, and copying a wholesale
    would hide that difference from the comparison."""
    mnemonic = a.partition(" ")[0]
    target = b.partition(" ")[2]
    return " ".join((mnemonic, target))
 def patch_cmp_swaps(
    sm: SequenceMatcher, orig_asm: List[str], recomp_asm: List[str]
 ) -> bool:
    """Check whether every difference between orig_asm and recomp_asm
    disappears once cmp instructions with exchanged operands are patched.

    Examples of pairs that get patched:
        cmp eax, ebx            cmp ebx, eax
        je .label               je .label
        cmp eax, ebx            cmp ebx, eax
        ja .label               jb .label

    The index ranges from sm.get_opcodes() are used to index orig_asm and
    recomp_asm. The inputs themselves are not modified; the patching is
    done on a copy of recomp_asm. Returns True if that patched copy equals
    orig_asm.
    """
    # Work on a copy so the caller's list stays untouched.
    # TODO: If we change our strategy to allow multiple rounds of patching,
    # we should modify the recomp array directly.
    patched = recomp_asm[:]
    for tag, orig_lo, orig_hi, recomp_lo, recomp_hi in sm.get_opcodes():
        # Only "replace" regions can hold a swapped cmp; relying on the diff
        # saves us from searching for candidate cmp pairs ourselves.
        if tag != "replace":
            continue
        # Walk both ranges in lockstep, stopping at the end of the shorter.
        overlap = min(orig_hi - orig_lo, recomp_hi - recomp_lo)
        for offset in range(overlap):
            i = orig_lo + offset
            j = recomp_lo + offset
            if not can_cmp_swap(orig_asm[i : i + 2], recomp_asm[j : j + 2]):
                continue
            # Take the cmp from orig as-is.
            patched[j] = orig_asm[i]
            # Take only the jump mnemonic from orig; keep recomp's operand.
            patched[j + 1] = patch_jump(orig_asm[i + 1], recomp_asm[j + 1])
    return orig_asm == patched
--- a/tools/isledecomp/isledecomp/compare/core.py
+++ b/tools/isledecomp/isledecomp/compare/core.py
@@ -11,6 +11,7 @@ from isledecomp.parser import DecompCodebase
 from isledecomp.dir import walk_source_dir
 from isledecomp.types import SymbolType
 from isledecomp.compare.asm import ParseAsm, can_resolve_register_differences
 from isledecomp.compare.asm.fixes import patch_cmp_swaps
 from .db import CompareDb, MatchInfo
 from .diff import combined_diff
 from .lines import LinesDb
@@ -470,7 +471,9 @@ class Compare:
        if ratio != 1.0:
            # Check whether we can resolve register swaps which are actually
            # perfect matches modulo compiler entropy.
-            is_effective_match = can_resolve_register_differences(orig_asm, recomp_asm)
+            is_effective_match = patch_cmp_swaps(
                diff, orig_asm, recomp_asm
            ) or can_resolve_register_differences(orig_asm, recomp_asm)
            unified_diff = combined_diff(
                diff, orig_combined, recomp_combined, context_size=10
            )