mirror of https://github.com/isledecomp/isle.git, synced 2025-10-23 00:14:22 +00:00
Checkorder tool to keep functions in original binary order (#228)
* First commit of order tool
* More flexible match on module name. Bugfix on blank_or_comment
* Report inexact offset comments in verbose mode. Bugfix for exact regex
* Refactor checkorder into reusable isledecomp module
* Find bad comments in one pass, add awareness of TEMPLATE
* Refactor of state machine to prepare for reccmp integration
* Use isledecomp lib in reccmp
* Build isledecomp in GH actions, fix mypy complaint
* Ensure unit test cpp files will be ignored by reccmp
* Allow multiple offset markers, pep8 cleanup
* Remove unused variable
* Code style, remove unneeded module and TODO
* Final renaming and type hints
* Fix checkorder issues, add GH action and enforce (#2)
  * Fix checkorder issues
  * Add GH action
  * Test error case
  * Works
  * Fixes

---------

Co-authored-by: Christian Semmler <mail@csemmler.com>
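For orientation: the new tool is run against a file or directory, roughly `python tools/checkorder/checkorder.py [--verbose] [--enforce] <target>` (see `parse_args` in the file below). Here is a minimal sketch of the core check it performs, built on the isledecomp helpers added in this commit; the `LEGO1` directory name is only an example:

```python
# Sketch only: walk a source tree, parse the OFFSET-annotated code blocks in
# each file, and flag any file whose offsets are not already in ascending order.
from isledecomp.dir import walk_source_dir
from isledecomp.parser import find_code_blocks

for path in walk_source_dir("LEGO1"):  # example directory name
    with open(path, "r") as f:
        offsets = [block.offset for block in find_code_blocks(f)]
    if offsets != sorted(offsets):
        print(f"{path} is out of order")
```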
120 tools/checkorder/checkorder.py Normal file
@@ -0,0 +1,120 @@
import os
import sys
import argparse
from isledecomp.dir import (
    walk_source_dir,
    is_file_cpp
)
from isledecomp.parser import find_code_blocks
from isledecomp.parser.util import (
    is_exact_offset_comment
)


def sig_truncate(sig: str) -> str:
    """Helper to truncate function names to 50 chars and append ellipsis
    if needed. Goal is to stay under 80 columns for tool output."""
    return f"{sig[:47]}{'...' if len(sig) >= 50 else ''}"


def check_file(filename: str, verbose: bool = False) -> bool:
    """Open and read the given file, then check whether the code blocks
    are in order. If verbose, print each block."""

    with open(filename, 'r') as f:
        code_blocks = find_code_blocks(f)

    bad_comments = [(block.start_line, block.offset_comment)
                    for block in code_blocks
                    if not is_exact_offset_comment(block.offset_comment)]

    just_offsets = [block.offset for block in code_blocks]
    sorted_offsets = sorted(just_offsets)
    file_out_of_order = just_offsets != sorted_offsets

    # If we detect inexact comments, don't print anything unless we are
    # in verbose mode. If the file is out of order, we always print the
    # file name.
    should_report = ((len(bad_comments) > 0 and verbose)
                     or file_out_of_order)

    if not should_report and not file_out_of_order:
        return False

    # Else: we are alerting to some problem in this file
    print(filename)
    if verbose:
        if file_out_of_order:
            order_lookup = {k: i for i, k in enumerate(sorted_offsets)}
            prev_offset = 0

            for block in code_blocks:
                msg = ' '.join([
                    ' ' if block.offset > prev_offset else '!',
                    f'{block.offset:08x}',
                    f'{block.end_line - block.start_line:4} lines',
                    f'{order_lookup[block.offset]:3}',
                    ' ',
                    sig_truncate(block.signature),
                ])
                print(msg)
                prev_offset = block.offset

        for (line_no, line) in bad_comments:
            print(f'* line {line_no:3} bad offset comment ({line})')

    print()

    return file_out_of_order


def parse_args(test_args: list | None = None) -> dict:
    p = argparse.ArgumentParser()
    p.add_argument('target', help='The file or directory to check.')
    p.add_argument('--enforce', action=argparse.BooleanOptionalAction,
                   default=False,
                   help='Fail with error code if target is out of order.')
    p.add_argument('--verbose', action=argparse.BooleanOptionalAction,
                   default=False,
                   help=('Display each code block in the file and show '
                         'where each consecutive run of blocks is broken.'))

    if test_args is None:
        args = p.parse_args()
    else:
        args = p.parse_args(test_args)

    return vars(args)


def main():
    args = parse_args()

    if os.path.isdir(args['target']):
        files_to_check = list(walk_source_dir(args['target']))
    elif os.path.isfile(args['target']) and is_file_cpp(args['target']):
        files_to_check = [args['target']]
    else:
        sys.exit('Invalid target')

    files_out_of_order = 0

    for file in files_to_check:
        is_jumbled = check_file(file, args['verbose'])
        if is_jumbled:
            files_out_of_order += 1

    if files_out_of_order > 0:
        error_message = ' '.join([
            str(files_out_of_order),
            'files are' if files_out_of_order > 1 else 'file is',
            'out of order'
        ])
        print(error_message)

    if files_out_of_order > 0 and args['enforce']:
        sys.exit(1)


if __name__ == '__main__':
    main()
1 tools/checkorder/requirements.txt Normal file
@@ -0,0 +1 @@
isledecomp
1 tools/isledecomp/.gitignore vendored Normal file
@@ -0,0 +1 @@
isledecomp.egg-info/
0 tools/isledecomp/isledecomp/__init__.py Normal file
21 tools/isledecomp/isledecomp/dir.py Normal file
@@ -0,0 +1,21 @@
import os
from typing import Iterator


def is_file_cpp(filename: str) -> bool:
    (basefile, ext) = os.path.splitext(filename)
    return ext.lower() in ('.h', '.cpp')


def walk_source_dir(source: str, recursive: bool = True) -> Iterator[str]:
    """Generator to walk the given directory recursively and return
    any C++ files found."""

    source = os.path.abspath(source)
    for subdir, dirs, files in os.walk(source):
        for file in files:
            if is_file_cpp(file):
                yield os.path.join(subdir, file)

        if not recursive:
            break
1 tools/isledecomp/isledecomp/parser/__init__.py Normal file
@@ -0,0 +1 @@
from .parser import find_code_blocks
142 tools/isledecomp/isledecomp/parser/parser.py Normal file
@@ -0,0 +1,142 @@
# C++ file parser

from typing import List, TextIO
from enum import Enum
from .util import (
    CodeBlock,
    OffsetMatch,
    is_blank_or_comment,
    match_offset_comment,
    is_exact_offset_comment,
    get_template_function_name,
    remove_trailing_comment,
    distinct_by_module,
)


class ReaderState(Enum):
    WANT_OFFSET = 0
    WANT_SIG = 1
    IN_FUNC = 2
    IN_TEMPLATE = 3
    WANT_CURLY = 4
    FUNCTION_DONE = 5


def find_code_blocks(stream: TextIO) -> List[CodeBlock]:
    """Read the IO stream (file) line-by-line and give the following report:
    For each code block (function) in the file, what are its starting and
    ending line numbers, and what is the given offset in the original
    binary. We expect the result to be ordered by line number because we
    are reading the file from start to finish."""

    blocks: List[CodeBlock] = []

    offset_matches: List[OffsetMatch] = []

    function_sig = None
    start_line = None
    end_line = None
    state = ReaderState.WANT_OFFSET

    # 1-based to match cvdump and your text editor
    # I know it says 0, but we will increment before each readline()
    line_no = 0
    can_seek = True

    while True:
        # Do this before reading again so that an EOF will not
        # cause us to miss the last function of the file.
        if state == ReaderState.FUNCTION_DONE:
            # Our list of offset marks could have duplicates on
            # module name, so we'll eliminate those now.
            for offset_match in distinct_by_module(offset_matches):
                block = CodeBlock(offset=offset_match.address,
                                  signature=function_sig,
                                  start_line=start_line,
                                  end_line=end_line,
                                  offset_comment=offset_match.comment,
                                  module=offset_match.module,
                                  is_template=offset_match.is_template,
                                  is_stub=offset_match.is_stub)
                blocks.append(block)
            offset_matches = []
            state = ReaderState.WANT_OFFSET

        if can_seek:
            line_no += 1
            line = stream.readline()
            if line == '':
                break

        new_match = match_offset_comment(line)
        if new_match is not None:
            # We will allow multiple offsets if we have just begun
            # the code block, but not after we hit the curly brace.
            if state in (ReaderState.WANT_OFFSET, ReaderState.IN_TEMPLATE,
                         ReaderState.WANT_SIG):
                # If we detected an offset marker unexpectedly,
                # we are handling it here so we can continue seeking.
                can_seek = True

                offset_matches.append(new_match)

                if new_match.is_template:
                    state = ReaderState.IN_TEMPLATE
                else:
                    state = ReaderState.WANT_SIG
            else:
                # We hit another offset unexpectedly.
                # We can recover easily by just ending the function here.
                end_line = line_no - 1
                state = ReaderState.FUNCTION_DONE

                # Pause reading here so we handle the offset marker
                # on the next loop iteration
                can_seek = False

        elif state == ReaderState.IN_TEMPLATE:
            # TEMPLATE functions are a special case. The signature is
            # given on the next line (in a // comment)
            function_sig = get_template_function_name(line)
            start_line = line_no
            end_line = line_no
            state = ReaderState.FUNCTION_DONE

        elif state == ReaderState.WANT_SIG:
            # Skip blank lines or comments that come after the offset
            # marker. There is not a formal procedure for this, so just
            # assume the next "code line" is the function signature
            if not is_blank_or_comment(line):
                # Inline functions may end with a comment. Strip that out
                # to help parsing.
                function_sig = remove_trailing_comment(line.strip())

                # Now check to see if the opening curly bracket is on the
                # same line. clang-format should prevent this (BraceWrapping)
                # but it is easy to detect.
                # If the entire function is on one line, handle that too.
                if function_sig.endswith('{'):
                    start_line = line_no
                    state = ReaderState.IN_FUNC
                elif (function_sig.endswith('}') or
                      function_sig.endswith('};')):
                    start_line = line_no
                    end_line = line_no
                    state = ReaderState.FUNCTION_DONE
                else:
                    state = ReaderState.WANT_CURLY

        elif state == ReaderState.WANT_CURLY:
            if line.strip() == '{':
                start_line = line_no
                state = ReaderState.IN_FUNC

        elif state == ReaderState.IN_FUNC:
            # Naive but reasonable assumption that functions will end with
            # a curly brace on its own line with no prepended spaces.
            if line.startswith('}'):
                end_line = line_no
                state = ReaderState.FUNCTION_DONE

    return blocks
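For a concrete sense of what the state machine above returns, here is a small sketch that feeds it an in-memory sample (modeled on `tests/samples/basic_file.cpp`); `find_code_blocks` only needs `readline()`, so `io.StringIO` works:

```python
import io
from isledecomp.parser import find_code_blocks

source = io.StringIO(
    "// OFFSET: TEST 0x1234\n"
    "void function01()\n"
    "{\n"
    "    // TODO\n"
    "}\n"
)
block = find_code_blocks(source)[0]
# start_line is the opening curly brace, 1-based, so this prints: TEST 0x1234 3 5
print(block.module, hex(block.offset), block.start_line, block.end_line)
```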
97 tools/isledecomp/isledecomp/parser/util.py Normal file
@@ -0,0 +1,97 @@
# C++ Parser utility functions and data structures
from __future__ import annotations  # python <3.10 compatibility
import re
from typing import List
from collections import namedtuple


CodeBlock = namedtuple('CodeBlock',
                       ['offset', 'signature', 'start_line', 'end_line',
                        'offset_comment', 'module', 'is_template', 'is_stub'])

OffsetMatch = namedtuple('OffsetMatch', ['module', 'address', 'is_template',
                                         'is_stub', 'comment'])

# This has not been formally established, but considering that "STUB"
# is a temporary state for a function, we assume it will appear last,
# after any other modifiers (i.e. TEMPLATE)

# To match a reasonable variance of formatting for the offset comment
offsetCommentRegex = re.compile(r'\s*//\s*OFFSET:\s*(\w+)\s+(?:0x)?([a-f0-9]+)(\s+TEMPLATE)?(\s+STUB)?',  # nopep8
                                flags=re.I)

# To match the exact syntax (text upper case, hex lower case, with spaces)
# that is used in most places
offsetCommentExactRegex = re.compile(r'^// OFFSET: [A-Z0-9]+ (0x[a-f0-9]+)( TEMPLATE)?( STUB)?$')  # nopep8


# The goal here is to just read whatever is on the next line, so some
# flexibility in the formatting seems OK
templateCommentRegex = re.compile(r'\s*//\s+(.*)')


# To remove any comment (//) or block comment (/*) and its leading spaces
# from the end of a code line
trailingCommentRegex = re.compile(r'(\s*(?://|/\*).*)$')


def get_template_function_name(line: str) -> str:
    """Parse function signature for special TEMPLATE functions"""
    template_match = templateCommentRegex.match(line)

    # If we don't match, you get whatever is on the line as the signature
    if template_match is not None:
        return template_match.group(1)

    return line


def remove_trailing_comment(line: str) -> str:
    return trailingCommentRegex.sub('', line)


def is_blank_or_comment(line: str) -> bool:
    """Helper to read ahead after the offset comment is matched.
    There could be blank lines or other comments before the
    function signature, and we want to skip those."""
    line_strip = line.strip()
    return (len(line_strip) == 0
            or line_strip.startswith('//')
            or line_strip.startswith('/*')
            or line_strip.endswith('*/'))


def is_exact_offset_comment(line: str) -> bool:
    """If the offset comment does not match our (unofficial) syntax
    we may want to alert the user to fix it for style points."""
    return offsetCommentExactRegex.match(line) is not None


def match_offset_comment(line: str) -> OffsetMatch | None:
    match = offsetCommentRegex.match(line)
    if match is None:
        return None

    return OffsetMatch(module=match.group(1),
                       address=int(match.group(2), 16),
                       is_template=match.group(3) is not None,
                       is_stub=match.group(4) is not None,
                       comment=line.strip())


def distinct_by_module(offsets: List) -> List:
    """Given a list of offset markers, return a list with distinct
    module names. If module names (case-insensitive) are repeated,
    choose the offset that appears first."""

    if len(offsets) < 2:
        return offsets

    # Dict maintains insertion order in python >=3.7
    offsets_dict = {}
    for offset in offsets:
        module_upper = offset.module.upper()
        if module_upper not in offsets_dict:
            offsets_dict[module_upper] = offset

    return list(offsets_dict.values())
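A quick sketch of how the two regexes above divide the work: `match_offset_comment` tolerates sloppy markers and extracts their data, while `is_exact_offset_comment` only accepts the canonical style that checkorder reports on in verbose mode:

```python
from isledecomp.parser.util import match_offset_comment, is_exact_offset_comment

sloppy = '//offset: lego1 0xDEADBEEF stub'
m = match_offset_comment(sloppy)
print(m.module, hex(m.address), m.is_stub)                      # lego1 0xdeadbeef True
print(is_exact_offset_comment(sloppy))                          # False
print(is_exact_offset_comment('// OFFSET: LEGO1 0xdeadbeef'))   # True
```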
9 tools/isledecomp/setup.py Normal file
@@ -0,0 +1,9 @@
from setuptools import setup, find_packages

setup(
    name='isledecomp',
    version='0.1.0',
    description='Python tools for the isledecomp project',
    packages=find_packages(),
    tests_require=['pytest'],
)
0 tools/isledecomp/tests/__init__.py Normal file
29 tools/isledecomp/tests/samples/basic_class.cpp Normal file
@@ -0,0 +1,29 @@
// Sample for python unit tests
// Not part of the decomp

// A very simple class

class TestClass {
public:
    TestClass();
    virtual ~TestClass() override;

    virtual MxResult Tickle() override; // vtable+08

    // OFFSET: TEST 0x12345678
    inline const char* ClassName() const // vtable+0c
    {
        // 0xabcd1234
        return "TestClass";
    }

    // OFFSET: TEST 0xdeadbeef
    inline MxBool IsA(const char* name) const override // vtable+10
    {
        return !strcmp(name, TestClass::ClassName());
    }

private:
    int m_hello;
    int m_hiThere;
};
22 tools/isledecomp/tests/samples/basic_file.cpp Normal file
@@ -0,0 +1,22 @@
// Sample for python unit tests
// Not part of the decomp

// A very simple well-formed code file

// OFFSET: TEST 0x1234
void function01()
{
    // TODO
}

// OFFSET: TEST 0x2345
void function02()
{
    // TODO
}

// OFFSET: TEST 0x3456
void function03()
{
    // TODO
}
8 tools/isledecomp/tests/samples/inline.cpp Normal file
@@ -0,0 +1,8 @@
// Sample for python unit tests
// Not part of the decomp

// OFFSET: TEST 0x10000001
inline const char* OneLineWithComment() const { return "MxDSObject"; }; // hi there

// OFFSET: TEST 0x10000002
inline const char* OneLine() const { return "MxDSObject"; };
16 tools/isledecomp/tests/samples/missing_offset.cpp Normal file
@@ -0,0 +1,16 @@
// Sample for python unit tests
// Not part of the decomp

#include <stdio.h>

int no_offset_comment()
{
    static int dummy = 123;
    return -1;
}

// OFFSET: TEST 0xdeadbeef
void regular_ole_function()
{
    printf("hi there");
}
25 tools/isledecomp/tests/samples/multiple_offsets.cpp Normal file
@@ -0,0 +1,25 @@
// Sample for python unit tests
// Not part of the decomp

// Handling multiple offset markers

// OFFSET: TEST 0x1234
// OFFSET: HELLO 0x5555
void different_modules()
{
    // TODO
}

// OFFSET: TEST 0x2345
// OFFSET: TEST 0x1234
void same_module()
{
    // TODO
}

// OFFSET: TEST 0x2002
// OFFSET: test 0x1001
void same_case_insensitive()
{
    // TODO
}
12 tools/isledecomp/tests/samples/oneline_function.cpp Normal file
@@ -0,0 +1,12 @@
// Sample for python unit tests
// Not part of the decomp

// OFFSET: TEST 0x1234
void short_function() { static char* msg = "oneliner"; }

// OFFSET: TEST 0x5555
void function_after_one_liner()
{
    // This function comes after the previous that is on a single line.
    // Do we report the offset for this one correctly?
}
20 tools/isledecomp/tests/samples/out_of_order.cpp Normal file
@@ -0,0 +1,20 @@
// Sample for python unit tests
// Not part of the decomp

// OFFSET: TEST 0x1001
void function_order01()
{
    // TODO
}

// OFFSET: TEST 0x1003
void function_order03()
{
    // TODO
}

// OFFSET: TEST 0x1002
void function_order02()
{
    // TODO
}
23 tools/isledecomp/tests/samples/poorly_formatted.cpp Normal file
@@ -0,0 +1,23 @@
// Sample for python unit tests
// Not part of the decomp

// While it's reasonable to expect a well-formed file (and clang-format
// will make sure we get one), this will put the parser through its paces.

// OFFSET: TEST 0x1234
void curly_with_spaces()
{
    static char* msg = "hello";
}

// OFFSET: TEST 0x5555
void weird_closing_curly()
{
    int x = 123; }

// OFFSET: HELLO 0x5656
void bad_indenting() {
    if (0)
    {
        int y = 5;
    }}
128 tools/isledecomp/tests/test_parser.py Normal file
@@ -0,0 +1,128 @@
import os
import pytest
from typing import List, TextIO
from isledecomp.parser import find_code_blocks
from isledecomp.parser.util import CodeBlock

SAMPLE_DIR = os.path.join(os.path.dirname(__file__), 'samples')


def sample_file(filename: str) -> TextIO:
    """Wrapper for opening the samples from the directory that does not
    depend on the cwd where we run the test"""
    full_path = os.path.join(SAMPLE_DIR, filename)
    return open(full_path, 'r')


def code_blocks_are_sorted(blocks: List[CodeBlock]) -> bool:
    """Helper to make this more idiomatic"""
    just_offsets = [block.offset for block in blocks]
    return just_offsets == sorted(just_offsets)


# Tests are below #


def test_sanity():
    """Read a very basic file"""
    with sample_file('basic_file.cpp') as f:
        blocks = find_code_blocks(f)

    assert len(blocks) == 3
    assert code_blocks_are_sorted(blocks) is True
    # n.b. The parser returns line numbers as 1-based
    # Function starts when we see the opening curly brace
    assert blocks[0].start_line == 8
    assert blocks[0].end_line == 10


def test_oneline():
    """(Assuming clang-format permits this) This sample has a function
    on a single line. This will test the end-of-function detection"""
    with sample_file('oneline_function.cpp') as f:
        blocks = find_code_blocks(f)

    assert len(blocks) == 2
    assert blocks[0].start_line == 5
    assert blocks[0].end_line == 5


def test_missing_offset():
    """What if the function doesn't have an offset comment?"""
    with sample_file('missing_offset.cpp') as f:
        blocks = find_code_blocks(f)

    # TODO: For now, the function without the offset will just be ignored.
    # Would be the same outcome if the comment was present but mangled and
    # we failed to match it. We should detect these cases in the future.
    assert len(blocks) == 1


def test_jumbled_case():
    """The parser just reports what it sees. It is the responsibility of
    the downstream tools to do something about a jumbled file.
    Just verify that we are reading it correctly."""
    with sample_file('out_of_order.cpp') as f:
        blocks = find_code_blocks(f)

    assert len(blocks) == 3
    assert code_blocks_are_sorted(blocks) is False


def test_bad_file():
    with sample_file('poorly_formatted.cpp') as f:
        blocks = find_code_blocks(f)

    assert len(blocks) == 3


def test_indented():
    """Offsets for functions inside of a class will probably be indented."""
    with sample_file('basic_class.cpp') as f:
        blocks = find_code_blocks(f)

    # TODO: We don't properly detect the end of these functions
    # because the closing brace is indented. However... knowing where each
    # function ends is less important (for now) than capturing
    # all the functions that are there.

    assert len(blocks) == 2
    assert blocks[0].offset == int('0x12345678', 16)
    assert blocks[0].start_line == 15
    # assert blocks[0].end_line == 18

    assert blocks[1].offset == int('0xdeadbeef', 16)
    assert blocks[1].start_line == 22
    # assert blocks[1].end_line == 24


def test_inline():
    with sample_file('inline.cpp') as f:
        blocks = find_code_blocks(f)

    assert len(blocks) == 2
    for block in blocks:
        assert block.start_line is not None
        assert block.start_line == block.end_line


def test_multiple_offsets():
    """If multiple offset marks appear before a code block, take them
    all but ensure module name (case-insensitive) is distinct.
    Use first module occurrence in case of duplicates."""
    with sample_file('multiple_offsets.cpp') as f:
        blocks = find_code_blocks(f)

    assert len(blocks) == 4
    assert blocks[0].module == 'TEST'
    assert blocks[0].start_line == 9

    assert blocks[1].module == 'HELLO'
    assert blocks[1].start_line == 9

    # Duplicate modules are ignored
    assert blocks[2].start_line == 16
    assert blocks[2].offset == 0x2345

    assert blocks[3].module == 'TEST'
    assert blocks[3].offset == 0x2002
113 tools/isledecomp/tests/test_parser_util.py Normal file
@@ -0,0 +1,113 @@
import pytest
from collections import namedtuple
from typing import List
from isledecomp.parser.util import (
    is_blank_or_comment,
    match_offset_comment,
    is_exact_offset_comment,
    distinct_by_module,
)


blank_or_comment_param = [
    (True, ''),
    (True, '\t'),
    (True, ' '),
    (False, '\tint abc=123;'),
    (True, '// OFFSET: LEGO1 0xdeadbeef'),
    (True, ' /* Block comment beginning'),
    (True, 'Block comment ending */ '),

    # TODO: does clang-format have anything to say about these cases?
    (False, 'x++; // Comment follows'),
    (False, 'x++; /* Block comment begins'),
]


@pytest.mark.parametrize('expected, line', blank_or_comment_param)
def test_is_blank_or_comment(line: str, expected: bool):
    assert is_blank_or_comment(line) is expected


offset_comment_samples = [
    # (can_parse: bool, exact_match: bool, line: str)
    # Should match both expected modules with optional STUB marker
    (True, True, '// OFFSET: LEGO1 0xdeadbeef'),
    (True, True, '// OFFSET: LEGO1 0xdeadbeef STUB'),
    (True, True, '// OFFSET: ISLE 0x12345678'),
    (True, True, '// OFFSET: ISLE 0x12345678 STUB'),

    # No trailing spaces allowed
    (True, False, '// OFFSET: LEGO1 0xdeadbeef '),
    (True, False, '// OFFSET: LEGO1 0xdeadbeef STUB '),

    # Must have exactly one space between elements
    (True, False, '//OFFSET: ISLE 0xdeadbeef'),
    (True, False, '// OFFSET:ISLE 0xdeadbeef'),
    (True, False, '//  OFFSET: ISLE 0xdeadbeef'),
    (True, False, '// OFFSET:  ISLE 0xdeadbeef'),
    (True, False, '// OFFSET: ISLE  0xdeadbeef'),
    (True, False, '// OFFSET: ISLE 0xdeadbeef  STUB'),

    # Must have 0x prefix for hex number
    (True, False, '// OFFSET: ISLE deadbeef'),

    # Offset, module name, and STUB must be uppercase
    (True, False, '// offset: ISLE 0xdeadbeef'),
    (True, False, '// offset: isle 0xdeadbeef'),
    (True, False, '// OFFSET: LEGO1 0xdeadbeef stub'),

    # Hex string must be lowercase
    (True, False, '// OFFSET: ISLE 0xDEADBEEF'),

    # TODO: How flexible should we be with matching the module name?
    (True, True, '// OFFSET: OMNI 0x12345678'),
    (True, True, '// OFFSET: LEG01 0x12345678'),
    (True, False, '// OFFSET: hello 0x12345678'),

    # Not close enough to match
    (False, False, '// OFFSET: ISLE0x12345678'),
    (False, False, '// OFFSET: 0x12345678'),
    (False, False, '// LEGO1: 0x12345678'),

    # Hex string shorter than 8 characters
    (True, True, '// OFFSET: LEGO1 0x1234'),

    # TODO: These match but shouldn't.
    # (False, False, '// OFFSET: LEGO1 0'),
    # (False, False, '// OFFSET: LEGO1 0x'),
]


@pytest.mark.parametrize('match, exact, line', offset_comment_samples)
def test_offset_match(line: str, match: bool, exact):
    did_match = match_offset_comment(line) is not None
    assert did_match is match


@pytest.mark.parametrize('match, exact, line', offset_comment_samples)
def test_exact_offset_comment(line: str, exact: bool, match):
    assert is_exact_offset_comment(line) is exact


# Helper for the next test: cut down version of OffsetMatch
MiniOfs = namedtuple('MiniOfs', ['module', 'value'])

distinct_by_module_samples = [
    # empty set
    ([], []),
    # same module name
    ([MiniOfs('TEST', 123), MiniOfs('TEST', 555)],
     [MiniOfs('TEST', 123)]),
    # same module name, case-insensitive
    ([MiniOfs('test', 123), MiniOfs('TEST', 555)],
     [MiniOfs('test', 123)]),
    # duplicates, non-consecutive
    ([MiniOfs('test', 123), MiniOfs('abc', 111), MiniOfs('TEST', 555)],
     [MiniOfs('test', 123), MiniOfs('abc', 111)]),
]


@pytest.mark.parametrize('sample, expected', distinct_by_module_samples)
def test_distinct_by_module(sample: List[MiniOfs], expected: List[MiniOfs]):
    assert distinct_by_module(sample) == expected
@@ -12,6 +12,8 @@ import sys
import colorama
import html
import re
from isledecomp.dir import walk_source_dir
from isledecomp.parser import find_code_blocks
from pystache import Renderer

parser = argparse.ArgumentParser(allow_abbrev=False,
@@ -413,145 +415,120 @@ htmlinsert = []

# Generate basename of original file, used in locating OFFSET lines
basename = os.path.basename(os.path.splitext(original)[0])
pattern = '// OFFSET:'

for subdir, dirs, files in os.walk(source):
for file in files:
srcfilename = os.path.join(os.path.abspath(subdir), file)
with open(srcfilename, 'r') as srcfile:
line_no = 0
for srcfilename in walk_source_dir(source):
with open(srcfilename, 'r') as srcfile:
blocks = find_code_blocks(srcfile)

while True:
try:
line = srcfile.readline()
line_no += 1
for block in blocks:
if block.is_stub:
continue

if not line:
break
if block.module != basename:
continue

line = line.strip()
addr = block.offset
# Verbose flag handling
if verbose:
if addr == verbose:
found_verbose_target = True
else:
continue

if line.startswith(pattern) and not line.endswith('STUB'):
par = line[len(pattern):].strip().split()
module = par[0]
if module != basename:
continue
if block.is_template:
recinfo = syminfo.get_recompiled_address_from_name(block.signature)
if not recinfo:
continue
else:
recinfo = syminfo.get_recompiled_address(srcfilename, block.start_line)
if not recinfo:
continue

addr = int(par[1], 16)
# The effective_ratio is the ratio when ignoring differing register
# allocation vs the ratio is the true ratio.
ratio = 0.0
effective_ratio = 0.0
if recinfo.size:
origasm = parse_asm(origfile, addr + recinfo.start, recinfo.size)
recompasm = parse_asm(recompfile, recinfo.addr + recinfo.start, recinfo.size)

# Verbose flag handling
if verbose:
if addr == verbose:
found_verbose_target = True
else:
continue
diff = difflib.SequenceMatcher(None, origasm, recompasm)
ratio = diff.ratio()
effective_ratio = ratio

if line.endswith('TEMPLATE'):
line = srcfile.readline()
line_no += 1
# Name comes after // comment
name = line.strip()[2:].strip()
if ratio != 1.0:
# Check whether we can resolve register swaps which are actually
# perfect matches modulo compiler entropy.
if can_resolve_register_differences(origasm, recompasm):
effective_ratio = 1.0
else:
ratio = 0

recinfo = syminfo.get_recompiled_address_from_name(name)
if not recinfo:
continue
else:
find_open_bracket = line
while '{' not in find_open_bracket:
find_open_bracket = srcfile.readline()
line_no += 1
percenttext = f'{(effective_ratio * 100):.2f}%'
if not plain:
if effective_ratio == 1.0:
percenttext = colorama.Fore.GREEN + percenttext + colorama.Style.RESET_ALL
elif effective_ratio > 0.8:
percenttext = colorama.Fore.YELLOW + percenttext + colorama.Style.RESET_ALL
else:
percenttext = colorama.Fore.RED + percenttext + colorama.Style.RESET_ALL

recinfo = syminfo.get_recompiled_address(srcfilename, line_no)
if not recinfo:
continue
if effective_ratio == 1.0 and ratio != 1.0:
if plain:
percenttext += '*'
else:
percenttext += colorama.Fore.RED + '*' + colorama.Style.RESET_ALL

# The effective_ratio is the ratio when ignoring differing register
# allocation vs the ratio is the true ratio.
ratio = 0.0
effective_ratio = 0.0
if recinfo.size:
origasm = parse_asm(origfile, addr + recinfo.start, recinfo.size)
recompasm = parse_asm(recompfile, recinfo.addr + recinfo.start, recinfo.size)
if args.print_rec_addr:
addrs = f'0x{addr:x} / 0x{recinfo.addr:x}'
else:
addrs = hex(addr)

diff = difflib.SequenceMatcher(None, origasm, recompasm)
ratio = diff.ratio()
effective_ratio = ratio
if not verbose:
print(f' {recinfo.name} ({addrs}) is {percenttext} similar to the original')

if ratio != 1.0:
# Check whether we can resolve register swaps which are actually
# perfect matches modulo compiler entropy.
if can_resolve_register_differences(origasm, recompasm):
effective_ratio = 1.0
else:
ratio = 0
function_count += 1
total_accuracy += ratio
total_effective_accuracy += effective_ratio

percenttext = f'{(effective_ratio * 100):.2f}%'
if not plain:
if effective_ratio == 1.0:
percenttext = colorama.Fore.GREEN + percenttext + colorama.Style.RESET_ALL
elif effective_ratio > 0.8:
percenttext = colorama.Fore.YELLOW + percenttext + colorama.Style.RESET_ALL
else:
percenttext = colorama.Fore.RED + percenttext + colorama.Style.RESET_ALL
if recinfo.size:
udiff = difflib.unified_diff(origasm, recompasm, n=10)

if effective_ratio == 1.0 and ratio != 1.0:
# If verbose, print the diff for that function to the output
if verbose:
if effective_ratio == 1.0:
ok_text = 'OK!' if plain else (colorama.Fore.GREEN + '✨ OK! ✨' + colorama.Style.RESET_ALL)
if ratio == 1.0:
print(f'{addrs}: {recinfo.name} 100% match.\n\n{ok_text}\n\n')
else:
print(f'{addrs}: {recinfo.name} Effective 100%% match. (Differs in register allocation only)\n\n{ok_text} (still differs in register allocation)\n\n')
else:
for line in udiff:
if line.startswith('++') or line.startswith('@@') or line.startswith('--'):
# Skip unneeded parts of the diff for the brief view
pass
elif line.startswith('+'):
if plain:
percenttext += '*'
print(line)
else:
percenttext += colorama.Fore.RED + '*' + colorama.Style.RESET_ALL

if args.print_rec_addr:
addrs = f'0x{addr:x} / 0x{recinfo.addr:x}'
print(colorama.Fore.GREEN + line)
elif line.startswith('-'):
if plain:
print(line)
else:
print(colorama.Fore.RED + line)
else:
addrs = hex(addr)
print(line)
if not plain:
print(colorama.Style.RESET_ALL, end='')

if not verbose:
print(f' {recinfo.name} ({addrs}) is {percenttext} similar to the original')
print(f'\n{recinfo.name} is only {percenttext} similar to the original, diff above')

function_count += 1
total_accuracy += ratio
total_effective_accuracy += effective_ratio
# If html, record the diffs to an HTML file
if html_path:
escaped = html.escape('\\n'.join(udiff).replace('"', '\\"').replace('\n', '\\n'))
htmlinsert.append(f'{{address: "0x{addr:x}", name: "{html.escape(recinfo.name)}", matching: {effective_ratio}, diff: "{escaped}"}}')

if recinfo.size:
udiff = difflib.unified_diff(origasm, recompasm, n=10)

# If verbose, print the diff for that function to the output
if verbose:
if effective_ratio == 1.0:
ok_text = 'OK!' if plain else (colorama.Fore.GREEN + '✨ OK! ✨' + colorama.Style.RESET_ALL)
if ratio == 1.0:
print(f'{addrs}: {recinfo.name} 100% match.\n\n{ok_text}\n\n')
else:
print(f'{addrs}: {recinfo.name} Effective 100%% match. (Differs in register allocation only)\n\n{ok_text} (still differs in register allocation)\n\n')
else:
for line in udiff:
if line.startswith('++') or line.startswith('@@') or line.startswith('--'):
# Skip unneeded parts of the diff for the brief view
pass
elif line.startswith('+'):
if plain:
print(line)
else:
print(colorama.Fore.GREEN + line)
elif line.startswith('-'):
if plain:
print(line)
else:
print(colorama.Fore.RED + line)
else:
print(line)
if not plain:
print(colorama.Style.RESET_ALL, end='')

print(f'\n{recinfo.name} is only {percenttext} similar to the original, diff above')

# If html, record the diffs to an HTML file
if html_path:
escaped = html.escape('\\n'.join(udiff).replace('"', '\\"').replace('\n', '\\n'))
htmlinsert.append(f'{{address: "0x{addr:x}", name: "{html.escape(recinfo.name)}", matching: {effective_ratio}, diff: "{escaped}"}}')

except UnicodeDecodeError:
break

def gen_html(html_file, data):
output_data = Renderer().render_path(get_file_in_script_dir('template.html'),
@@ -1,3 +1,4 @@
capstone
colorama
pystache
isledecomp
pystache