Checkorder tool to keep functions in original binary order (#228)

* First commit of order tool * More flexible match on module name. Bugfix on blank_or_comment * Report inexact offset comments in verbose mode. Bugfix for exact regex * Refactor checkorder into reusable isledecomp module * Find bad comments in one pass, add awareness of TEMPLATE * Refactor of state machine to prepare for reccmp integration * Use isledecomp lib in reccmp * Build isledecomp in GH actions, fix mypy complaint * Ensure unit test cpp files will be ignored by reccmp * Allow multiple offset markers, pep8 cleanup * Remove unused variable * Code style, remove unneeded module and TODO * Final renaming and type hints * Fix checkorder issues, add GH action and enforce (#2) * Fix checkorder issues * Add GH action * Test error case * Works * Fixes --------- Co-authored-by: Christian Semmler <mail@csemmler.com>
2025-12-11 08:33:13 +00:00 · 2023-11-21 03:44:45 -05:00
parent 714d36b57d
commit 1ae3b07dc2
84 changed files with 4021 additions and 3209 deletions
--- a/tools/isledecomp/isledecomp/__init__.py
+++ b/tools/isledecomp/isledecomp/__init__.py
--- a/tools/isledecomp/isledecomp/dir.py
+++ b/tools/isledecomp/isledecomp/dir.py
@@ -0,0 +1,21 @@
+import os
+from typing import Iterator
+
+
+def is_file_cpp(filename: str) -> bool:
+    """Return True if the filename has a C++ source or header extension.
+
+       Only the '.h' and '.cpp' extensions are recognized, and the
+       comparison ignores case."""
+    # Only the extension matters here; the base name is discarded.
+    _, ext = os.path.splitext(filename)
+    return ext.lower() in ('.h', '.cpp')
+
+
+def walk_source_dir(source: str, recursive: bool = True) -> Iterator[str]:
+    """Generator yielding the path of every C++ file (as decided by
+       is_file_cpp) found under the given directory. Paths are built
+       from the absolute form of `source`. When `recursive` is False,
+       only the top-level directory is examined."""
+
+    root = os.path.abspath(source)
+    for current_dir, _subdirs, filenames in os.walk(root):
+        yield from (os.path.join(current_dir, name)
+                    for name in filenames
+                    if is_file_cpp(name))
+
+        # os.walk visits the top directory first, so stopping after the
+        # first pass restricts the search to that directory alone.
+        if not recursive:
+            return
--- a/tools/isledecomp/isledecomp/parser/__init__.py
+++ b/tools/isledecomp/isledecomp/parser/__init__.py
@@ -0,0 +1 @@
+from .parser import find_code_blocks
--- a/tools/isledecomp/isledecomp/parser/parser.py
+++ b/tools/isledecomp/isledecomp/parser/parser.py
@@ -0,0 +1,142 @@
+# C++ file parser
+
+from typing import List, TextIO
+from enum import Enum
+from .util import (
+    CodeBlock,
+    OffsetMatch,
+    is_blank_or_comment,
+    match_offset_comment,
+    is_exact_offset_comment,
+    get_template_function_name,
+    remove_trailing_comment,
+    distinct_by_module,
+)
+
+
+class ReaderState(Enum):
+    """States of the line-by-line reader used by find_code_blocks."""
+    # Looking for an OFFSET marker comment.
+    WANT_OFFSET = 0
+    # Marker seen; looking for the line holding the function signature.
+    WANT_SIG = 1
+    # Inside a function body; looking for the closing curly brace.
+    IN_FUNC = 2
+    # TEMPLATE marker seen; the next non-marker line gives the signature.
+    IN_TEMPLATE = 3
+    # Signature seen; looking for the opening curly brace on its own line.
+    WANT_CURLY = 4
+    # Function fully read; its blocks are emitted on the next loop pass.
+    FUNCTION_DONE = 5
+
+
+def find_code_blocks(stream: TextIO) -> List[CodeBlock]:
+    """Read the IO stream (file) line-by-line and give the following report:
+       For each code block (function) in the file, what are its starting and
+       ending line numbers, and what is the given offset in the original
+       binary. We expect the result to be ordered by line number because we
+       are reading the file from start to finish.
+
+       A function preceded by several offset markers produces one CodeBlock
+       per distinct module name. A function that is still open when the
+       stream ends (no closing brace found) is not reported."""
+
+    blocks: List[CodeBlock] = []
+
+    # Offset markers collected for the function currently being read
+    offset_matches: List[OffsetMatch] = []
+
+    # NOTE(review): these three are never reset after a block is emitted,
+    # so a function cut short before its opening brace is found will be
+    # reported with the start_line of the previous function (or None).
+    function_sig = None
+    start_line = None
+    end_line = None
+    state = ReaderState.WANT_OFFSET
+
+    # 1-based to match cvdump and your text editor
+    # I know it says 0, but we will increment before each readline()
+    line_no = 0
+    # When False, the next loop pass re-examines the current line
+    # instead of reading a new one.
+    can_seek = True
+
+    while True:
+        # Do this before reading again so that an EOF will not
+        # cause us to miss the last function of the file.
+        if state == ReaderState.FUNCTION_DONE:
+            # Our list of offset marks could have duplicates on
+            # module name, so we'll eliminate those now.
+            for offset_match in distinct_by_module(offset_matches):
+                block = CodeBlock(offset=offset_match.address,
+                                  signature=function_sig,
+                                  start_line=start_line,
+                                  end_line=end_line,
+                                  offset_comment=offset_match.comment,
+                                  module=offset_match.module,
+                                  is_template=offset_match.is_template,
+                                  is_stub=offset_match.is_stub)
+                blocks.append(block)
+            offset_matches = []
+            state = ReaderState.WANT_OFFSET
+
+        if can_seek:
+            line_no += 1
+            line = stream.readline()
+            # readline() gives an empty string only at end of stream
+            if line == '':
+                break
+
+        new_match = match_offset_comment(line)
+        if new_match is not None:
+            # We will allow multiple offsets if we have just begun
+            # the code block, but not after we hit the curly brace.
+            if state in (ReaderState.WANT_OFFSET, ReaderState.IN_TEMPLATE,
+                         ReaderState.WANT_SIG):
+                # If we detected an offset marker unexpectedly,
+                # we are handling it here so we can continue seeking.
+                can_seek = True
+
+                offset_matches.append(new_match)
+
+                # The most recent marker decides how the signature is read
+                if new_match.is_template:
+                    state = ReaderState.IN_TEMPLATE
+                else:
+                    state = ReaderState.WANT_SIG
+            else:
+                # We hit another offset unexpectedly.
+                # We can recover easily by just ending the function here.
+                end_line = line_no - 1
+                state = ReaderState.FUNCTION_DONE
+
+                # Pause reading here so we handle the offset marker
+                # on the next loop iteration
+                can_seek = False
+
+        elif state == ReaderState.IN_TEMPLATE:
+            # TEMPLATE functions are a special case. The signature is
+            # given on the next line (in a // comment)
+            # The block covers just this one line.
+            function_sig = get_template_function_name(line)
+            start_line = line_no
+            end_line = line_no
+            state = ReaderState.FUNCTION_DONE
+
+        elif state == ReaderState.WANT_SIG:
+            # Skip blank lines or comments that come after the offset
+            # marker. There is not a formal procedure for this, so just
+            # assume the next "code line" is the function signature
+            if not is_blank_or_comment(line):
+                # Inline functions may end with a comment. Strip that out
+                # to help parsing.
+                function_sig = remove_trailing_comment(line.strip())
+
+                # Now check to see if the opening curly bracket is on the
+                # same line. clang-format should prevent this (BraceWrapping)
+                # but it is easy to detect.
+                # If the entire function is on one line, handle that too.
+                if function_sig.endswith('{'):
+                    start_line = line_no
+                    state = ReaderState.IN_FUNC
+                elif (function_sig.endswith('}') or
+                        function_sig.endswith('};')):
+                    start_line = line_no
+                    end_line = line_no
+                    state = ReaderState.FUNCTION_DONE
+                else:
+                    state = ReaderState.WANT_CURLY
+
+        elif state == ReaderState.WANT_CURLY:
+            # The start line is the line of the opening brace,
+            # not the line of the signature.
+            if line.strip() == '{':
+                start_line = line_no
+                state = ReaderState.IN_FUNC
+
+        elif state == ReaderState.IN_FUNC:
+            # Naive but reasonable assumption that functions will end with
+            # a curly brace on its own line with no prepended spaces.
+            if line.startswith('}'):
+                end_line = line_no
+                state = ReaderState.FUNCTION_DONE
+
+    return blocks
--- a/tools/isledecomp/isledecomp/parser/util.py
+++ b/tools/isledecomp/isledecomp/parser/util.py
@@ -0,0 +1,97 @@
+# C++ Parser utility functions and data structures
+from __future__ import annotations  # python <3.10 compatibility
+import re
+from typing import List
+from collections import namedtuple
+
+
+# One function found in a source file, as reported by the parser.
+#   offset:         address taken from the offset marker
+#   signature:      text of the function signature line
+#   start_line:     line number where the function starts
+#   end_line:       line number where the function ends
+#   offset_comment: the offset marker comment, stripped of whitespace
+#   module:         module name taken from the offset marker
+#   is_template:    True if the marker had the TEMPLATE modifier
+#   is_stub:        True if the marker had the STUB modifier
+CodeBlock = namedtuple('CodeBlock',
+                       ['offset', 'signature', 'start_line', 'end_line',
+                        'offset_comment', 'module', 'is_template', 'is_stub'])
+
+# The parsed contents of a single offset marker comment. The address is
+# stored as an integer; the comment is the whole line with surrounding
+# whitespace stripped.
+OffsetMatch = namedtuple('OffsetMatch', ['module', 'address', 'is_template',
+                                         'is_stub', 'comment'])
+
+# This has not been formally established, but considering that "STUB"
+# is a temporary state for a function, we assume it will appear last,
+# after any other modifiers (i.e. TEMPLATE)
+
+# To match a reasonable variance of formatting for the offset comment.
+# Matching ignores case. Capture groups:
+#   1: module name
+#   2: hex digits of the address (any "0x" prefix is not captured)
+#   3: the TEMPLATE modifier, if present
+#   4: the STUB modifier, if present
+offsetCommentRegex = re.compile(r'\s*//\s*OFFSET:\s*(\w+)\s+(?:0x)?([a-f0-9]+)(\s+TEMPLATE)?(\s+STUB)?',  # nopep8
+                                flags=re.I)
+
+# To match the exact syntax (text upper case, hex lower case, with spaces)
+# that is used in most places. The pattern is anchored at both ends, so
+# leading whitespace or trailing text makes the match fail.
+offsetCommentExactRegex = re.compile(r'^// OFFSET: [A-Z0-9]+ (0x[a-f0-9]+)( TEMPLATE)?( STUB)?$')  # nopep8
+
+
+# The goal here is to just read whatever is on the next line, so some
+# flexibility in the formatting seems OK.
+# Group 1 is the text following "//" and at least one whitespace character.
+templateCommentRegex = re.compile(r'\s*//\s+(.*)')
+
+
+# To remove any comment (//) or block comment (/*) and its leading spaces
+# from the end of a code line.
+# NOTE(review): the pattern does not check whether the comment marker
+# sits inside a string literal.
+trailingCommentRegex = re.compile(r'(\s*(?://|/\*).*)$')
+
+
+def get_template_function_name(line: str) -> str:
+    """Read the function signature for a TEMPLATE function from the
+       comment line that follows its offset marker. If the line is not
+       a `// ...` comment, the line itself is returned unchanged."""
+    found = templateCommentRegex.match(line)
+
+    # No comment on this line: the caller gets the raw line as signature
+    return line if found is None else found.group(1)
+
+
+def remove_trailing_comment(line: str) -> str:
+    """Return the line with any trailing // or /* comment (and the
+       whitespace in front of it) removed."""
+    without_comment, _replacements = trailingCommentRegex.subn('', line)
+    return without_comment
+
+
+def is_blank_or_comment(line: str) -> bool:
+    """Tell whether a line should be skipped while looking for the
+       function signature after an offset comment: True for a line
+       that is empty (ignoring whitespace), that begins with // or /*,
+       or that ends with */."""
+    text = line.strip()
+    if not text:
+        return True
+
+    return text.startswith(('//', '/*')) or text.endswith('*/')
+
+
+def is_exact_offset_comment(line: str) -> bool:
+    """Check the line against the strict (unofficial) offset comment
+       syntax. A False result for a line that is an offset comment
+       means the user may want to tidy it up for style points."""
+    # A match object is always truthy; no match gives None
+    return bool(offsetCommentExactRegex.match(line))
+
+
+def match_offset_comment(line: str) -> OffsetMatch | None:
+    """Parse an offset marker comment from the line. Returns an
+       OffsetMatch with the module name, the address as an integer,
+       the TEMPLATE and STUB flags and the stripped comment text, or
+       None if the line is not an offset marker."""
+    found = offsetCommentRegex.match(line)
+    if found is None:
+        return None
+
+    # Optional groups that did not take part in the match come back as None
+    module_name, address_hex, template_text, stub_text = found.groups()
+
+    return OffsetMatch(module=module_name,
+                       address=int(address_hex, 16),
+                       is_template=template_text is not None,
+                       is_stub=stub_text is not None,
+                       comment=line.strip())
+
+
+def distinct_by_module(offsets: List) -> List:
+    """Reduce a list of offset markers to one marker per module name.
+       Module names are compared case-insensitively and the marker
+       that appears first in the list wins."""
+
+    # Zero or one marker cannot hold a duplicate: hand back the input
+    if len(offsets) < 2:
+        return offsets
+
+    # Dict maintains insertion order in python >=3.7
+    first_seen = {}
+    for marker in offsets:
+        # setdefault stores the marker only if the key is not yet present
+        first_seen.setdefault(marker.module.upper(), marker)
+
+    return list(first_seen.values())