Merge from parser2 branch

This commit is contained in:
disinvite
2023-12-01 15:10:32 -05:00
parent 47a6ea2de7
commit 75802101ac
20 changed files with 923 additions and 352 deletions

View File

@@ -2,8 +2,7 @@ import os
import sys
import argparse
from isledecomp.dir import walk_source_dir, is_file_cpp
-from isledecomp.parser import find_code_blocks
-from isledecomp.parser.util import is_exact_offset_comment
+from isledecomp.parser import DecompParser


def sig_truncate(sig: str) -> str:
@@ -16,23 +15,22 @@ def check_file(filename: str, verbose: bool = False) -> bool:
"""Open and read the given file, then check whether the code blocks """Open and read the given file, then check whether the code blocks
are in order. If verbose, print each block.""" are in order. If verbose, print each block."""
parser = DecompParser()
with open(filename, "r", encoding="utf-8") as f: with open(filename, "r", encoding="utf-8") as f:
code_blocks = find_code_blocks(f) parser.read_lines(f)
bad_comments = [
(block.start_line, block.offset_comment)
for block in code_blocks
if not is_exact_offset_comment(block.offset_comment)
]
just_offsets = [block.offset for block in code_blocks] just_offsets = [block.offset for block in parser.functions]
sorted_offsets = sorted(just_offsets) sorted_offsets = sorted(just_offsets)
file_out_of_order = just_offsets != sorted_offsets file_out_of_order = just_offsets != sorted_offsets
# TODO: When we add parser error severity, actual errors that obstruct
# parsing should probably be shown here regardless of verbose mode
# If we detect inexact comments, don't print anything unless we are # If we detect inexact comments, don't print anything unless we are
# in verbose mode. If the file is out of order, we always print the # in verbose mode. If the file is out of order, we always print the
# file name. # file name.
should_report = (len(bad_comments) > 0 and verbose) or file_out_of_order should_report = (len(parser.alerts) > 0 and verbose) or file_out_of_order
if not should_report and not file_out_of_order: if not should_report and not file_out_of_order:
return False return False
@@ -44,22 +42,22 @@ def check_file(filename: str, verbose: bool = False) -> bool:
    order_lookup = {k: i for i, k in enumerate(sorted_offsets)}
    prev_offset = 0
-    for block in code_blocks:
+    for fun in parser.functions:
        msg = " ".join(
            [
-                " " if block.offset > prev_offset else "!",
-                f"{block.offset:08x}",
-                f"{block.end_line - block.start_line:4} lines",
-                f"{order_lookup[block.offset]:3}",
+                " " if fun.offset > prev_offset else "!",
+                f"{fun.offset:08x}",
+                f"{fun.end_line - fun.line_number:4} lines",
+                f"{order_lookup[fun.offset]:3}",
                " ",
-                sig_truncate(block.signature),
+                sig_truncate(fun.signature),
            ]
        )
        print(msg)
-        prev_offset = block.offset
+        prev_offset = fun.offset

-    for line_no, line in bad_comments:
-        print(f"* line {line_no:3} bad offset comment ({line})")
+    for alert in parser.alerts:
+        print(f"* line {alert.line_number:4} {alert.code} ({alert.line})")

    print()
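For orientation, here is a minimal sketch (not part of this commit) of the new checking flow, using only the DecompParser attributes exercised above (functions, alerts, offset, line_number); the helper name is made up:

from isledecomp.parser import DecompParser


def file_is_in_order(filename: str, verbose: bool = False) -> bool:
    """Sketch: True if the FUNCTION/STUB offsets appear in ascending order."""
    parser = DecompParser()
    with open(filename, "r", encoding="utf-8") as f:
        parser.read_lines(f)

    offsets = [fun.offset for fun in parser.functions]

    if verbose:
        # Each alert carries the line number, error code, and offending line text
        for alert in parser.alerts:
            print(f"* line {alert.line_number:4} {alert.code} ({alert.line})")

    return offsets == sorted(offsets)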

View File

@@ -1 +1 @@
-from .parser import find_code_blocks
+from .parser import DecompParser

View File

@@ -0,0 +1,33 @@
from enum import Enum
class ParserError(Enum):
# WARN: Stub function exceeds some line number threshold
UNLIKELY_STUB = 100
# WARN: Decomp marker is close enough to be recognized, but does not follow syntax exactly
BAD_DECOMP_MARKER = 101
# WARN: Multiple markers in sequence do not have distinct modules
DUPLICATE_MODULE = 102
# WARN: Detected a duplicate module/offset pair in the current file
DUPLICATE_OFFSET = 103
# WARN: We read a line that matches the decomp marker pattern, but we are not set up
# to handle it
BOGUS_MARKER = 104
# WARN: Under a synthetic marker we expected a comment but found a code line instead
SYNTHETIC_NOT_COMMENT = 110
# WARN: New function marker appeared while we were inside a function
MISSED_END_OF_FUNCTION = 117
# ERROR: We found a marker unexpectedly
UNEXPECTED_MARKER = 200
# ERROR: We found a marker where we expected to find one, but it is incompatible
# with the preceding markers.
# For example, a GLOBAL cannot follow FUNCTION/STUB
INCOMPATIBLE_MARKER = 201
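The numbering appears to reserve the 1xx range for warnings and 2xx for errors, mirroring the WARN/ERROR comments above. A small helper along those lines (an assumption, not part of the commit) might look like:

from isledecomp.parser.error import ParserError


def is_parser_error(code: ParserError) -> bool:
    # Assumption: codes 200 and above abort handling of the current marker
    # group, while 1xx codes are style/consistency warnings.
    return code.value >= 200


assert is_parser_error(ParserError.UNEXPECTED_MARKER)
assert not is_parser_error(ParserError.BAD_DECOMP_MARKER)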

View File

@@ -0,0 +1,41 @@
from dataclasses import dataclass
from enum import Enum
@dataclass
class ParserNode:
line_number: int
@dataclass
class ParserAlert(ParserNode):
code: int
line: str
@dataclass
class ParserSymbol(ParserNode):
module: str
offset: int
@dataclass
class ParserFunction(ParserSymbol):
name: str
is_stub: bool = False
is_synthetic: bool = False
is_template: bool = False
end_line: int = -1
@dataclass
class ParserVariable(ParserSymbol):
name: str
size: int = -1
is_static: bool = False
@dataclass
class ParserVtable(ParserSymbol):
class_name: str
num_entries: int = -1
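To show how the dataclass hierarchy composes, a short sketch (not part of the commit) that builds the nodes directly from the fields declared above:

from isledecomp.parser.node import ParserAlert, ParserFunction

# line_number comes from ParserNode, module/offset from ParserSymbol,
# and name plus the optional flags from ParserFunction itself.
fun = ParserFunction(
    line_number=10,
    module="LEGO1",
    offset=0x10001234,
    name="void MyClass::Tickle()",
    end_line=20,
)
alert = ParserAlert(line_number=3, code=101, line="// function: lego1 0x10001234")

assert fun.is_stub is False  # default
assert alert.code == 101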

View File

@@ -1,145 +1,346 @@
# C++ file parser # C++ file parser
from typing import List, TextIO from typing import List, TextIO, Iterable
from enum import Enum from enum import Enum
from .util import ( from .util import (
CodeBlock, DecompMarker,
OffsetMatch,
is_blank_or_comment, is_blank_or_comment,
match_offset_comment, match_marker,
is_marker_exact,
get_template_function_name, get_template_function_name,
remove_trailing_comment, remove_trailing_comment,
distinct_by_module,
) )
from .node import (
ParserAlert,
ParserNode,
ParserFunction,
ParserVariable,
ParserVtable,
)
from .error import ParserError
class ReaderState(Enum): class ReaderState(Enum):
WANT_OFFSET = 0 SEARCH = 0
WANT_SIG = 1 WANT_SIG = 1
IN_FUNC = 2 IN_FUNC = 2
IN_TEMPLATE = 3 IN_TEMPLATE = 3
WANT_CURLY = 4 WANT_CURLY = 4
FUNCTION_DONE = 5 IN_GLOBAL = 5
IN_FUNC_GLOBAL = 6
IN_VTABLE = 7
def find_code_blocks(stream: TextIO) -> List[CodeBlock]: def marker_is_stub(marker: DecompMarker) -> bool:
"""Read the IO stream (file) line-by-line and give the following report: return marker.type.upper() == "STUB"
Foreach code block (function) in the file, what are its starting and
ending line numbers, and what is the given offset in the original
binary. We expect the result to be ordered by line number because we
are reading the file from start to finish."""
blocks: List[CodeBlock] = []
offset_matches: List[OffsetMatch] = [] def marker_is_variable(marker: DecompMarker) -> bool:
return marker.type.upper() == "GLOBAL"
function_sig = None
start_line = None
end_line = None
state = ReaderState.WANT_OFFSET
# 1-based to match cvdump and your text editor def marker_is_synthetic(marker: DecompMarker) -> bool:
# I know it says 0, but we will increment before each readline() return marker.type.upper() in ("SYNTHETIC", "TEMPLATE")
line_no = 0
can_seek = True
while True:
# Do this before reading again so that an EOF will not def marker_is_function(marker: DecompMarker) -> bool:
# cause us to miss the last function of the file. return marker.type.upper() in ("FUNCTION", "STUB")
if state == ReaderState.FUNCTION_DONE:
# Our list of offset marks could have duplicates on
# module name, so we'll eliminate those now. def marker_is_vtable(marker: DecompMarker) -> bool:
for offset_match in distinct_by_module(offset_matches): return marker.type.upper() == "VTABLE"
block = CodeBlock(
offset=offset_match.address,
signature=function_sig, class MarkerDict:
start_line=start_line, def __init__(self):
self.markers: dict = {}
def insert(self, marker: DecompMarker) -> bool:
module = marker.module.upper()
# Return True if this insert would overwrite
if module in self.markers:
return True
self.markers[module] = (marker.type, marker.offset)
return False
def iter(self):
for module in self.markers:
(marker_type, offset) = self.markers[module]
yield DecompMarker(marker_type, module, offset)
def empty(self):
self.markers = {}
class DecompParser:
def __init__(self):
self.fun_markers = MarkerDict()
self.var_markers = MarkerDict()
self.tbl_markers = MarkerDict()
self.reset()
def reset(self):
# Output values
self.functions = []
self.vtables = []
self.variables = []
self.alerts = []
# Internal state machine stuff
self.line_number: int = 0
self.state: ReaderState = ReaderState.SEARCH
self.last_line: str = ""
self.fun_markers.empty()
self.var_markers.empty()
self.tbl_markers.empty()
self.function_start: int = 0
self.function_sig: str = ""
def _recover(self):
"""We hit a syntax error and need to reset temp structures"""
self.state = ReaderState.SEARCH
self.fun_markers.empty()
self.var_markers.empty()
self.tbl_markers.empty()
def _syntax_warning(self, code):
self.alerts.append(
ParserAlert(
line_number=self.line_number,
code=code,
line=self.last_line.strip(),
)
)
def _syntax_error(self, code):
self._syntax_warning(code)
self._recover()
def _function_starts_here(self):
self.function_start = self.line_number
def _function_marker(self, marker: DecompMarker):
if self.fun_markers.insert(marker):
self._syntax_warning(ParserError.DUPLICATE_MODULE)
self.state = ReaderState.WANT_SIG
def _synthetic_marker(self, marker: DecompMarker):
if self.fun_markers.insert(marker):
self._syntax_warning(ParserError.DUPLICATE_MODULE)
self.state = ReaderState.IN_TEMPLATE
def _function_done(self, unexpected: bool = False):
end_line = self.line_number
if unexpected:
end_line -= 1
for marker in self.fun_markers.iter():
self.functions.append(
ParserFunction(
line_number=self.function_start,
module=marker.module,
offset=marker.offset,
is_stub=marker_is_stub(marker),
is_template=marker_is_synthetic(marker),
name=self.function_sig,
end_line=end_line, end_line=end_line,
offset_comment=offset_match.comment,
module=offset_match.module,
is_template=offset_match.is_template,
is_stub=offset_match.is_stub,
) )
blocks.append(block) )
offset_matches = []
state = ReaderState.WANT_OFFSET
if can_seek: self.fun_markers.empty()
line_no += 1 self.state = ReaderState.SEARCH
line = stream.readline()
if line == "":
break
new_match = match_offset_comment(line) def _vtable_marker(self, marker: DecompMarker):
if new_match is not None: if self.tbl_markers.insert(marker):
# We will allow multiple offsets if we have just begun self._syntax_warning(ParserError.DUPLICATE_MODULE)
# the code block, but not after we hit the curly brace. self.state = ReaderState.IN_VTABLE
if state in (
ReaderState.WANT_OFFSET, def _vtable_done(self):
ReaderState.IN_TEMPLATE, for marker in self.tbl_markers.iter():
self.vtables.append(
ParserVtable(
line_number=self.line_number,
module=marker.module,
offset=marker.offset,
class_name=self.last_line.strip(),
)
)
self.tbl_markers.empty()
self.state = ReaderState.SEARCH
def _variable_marker(self, marker: DecompMarker):
if self.var_markers.insert(marker):
self._syntax_warning(ParserError.DUPLICATE_MODULE)
if self.state in (ReaderState.IN_FUNC, ReaderState.IN_FUNC_GLOBAL):
self.state = ReaderState.IN_FUNC_GLOBAL
else:
self.state = ReaderState.IN_GLOBAL
def _variable_done(self):
for marker in self.var_markers.iter():
self.variables.append(
ParserVariable(
line_number=self.line_number,
module=marker.module,
offset=marker.offset,
name=self.last_line.strip(),
)
)
self.var_markers.empty()
if self.state == ReaderState.IN_FUNC_GLOBAL:
self.state = ReaderState.IN_FUNC
else:
self.state = ReaderState.SEARCH
def _handle_marker(self, marker: DecompMarker):
# Cannot handle any markers between function sig and opening curly brace
if self.state == ReaderState.WANT_CURLY:
self._syntax_error(ParserError.UNEXPECTED_MARKER)
return
# TODO: How uncertain are we of detecting the end of a function
# in a clang-formatted file? For now we assume we have missed the
# end if we detect a non-GLOBAL marker while state is IN_FUNC.
# Maybe these cases should be syntax errors instead
if marker_is_function(marker):
if self.state in (
ReaderState.SEARCH,
ReaderState.WANT_SIG, ReaderState.WANT_SIG,
): ):
# If we detected an offset marker unexpectedly, # We will allow multiple offsets if we have just begun
# we are handling it here so we can continue seeking. # the code block, but not after we hit the curly brace.
can_seek = True self._function_marker(marker)
elif self.state == ReaderState.IN_FUNC:
offset_matches.append(new_match)
if new_match.is_template:
state = ReaderState.IN_TEMPLATE
else:
state = ReaderState.WANT_SIG
else:
# We hit another offset unexpectedly. # We hit another offset unexpectedly.
# We can recover easily by just ending the function here. # We can recover easily by just ending the function here.
end_line = line_no - 1 self._syntax_warning(ParserError.MISSED_END_OF_FUNCTION)
state = ReaderState.FUNCTION_DONE self._function_done()
# Pause reading here so we handle the offset marker # Start the next function right after so we can
# on the next loop iteration # read the next line.
can_seek = False self._function_marker(marker)
else:
self._syntax_error(ParserError.INCOMPATIBLE_MARKER)
elif state == ReaderState.IN_TEMPLATE: elif marker_is_synthetic(marker):
if self.state in (ReaderState.SEARCH, ReaderState.IN_TEMPLATE):
self._synthetic_marker(marker)
elif self.state == ReaderState.IN_FUNC:
self._syntax_warning(ParserError.MISSED_END_OF_FUNCTION)
self._function_done()
self._synthetic_marker(marker)
else:
self._syntax_error(ParserError.INCOMPATIBLE_MARKER)
elif marker_is_variable(marker):
if self.state in (
ReaderState.SEARCH,
ReaderState.IN_GLOBAL,
ReaderState.IN_FUNC,
ReaderState.IN_FUNC_GLOBAL,
):
self._variable_marker(marker)
else:
self._syntax_error(ParserError.INCOMPATIBLE_MARKER)
elif marker_is_vtable(marker):
if self.state in (ReaderState.SEARCH, ReaderState.IN_VTABLE):
self._vtable_marker(marker)
elif self.state == ReaderState.IN_FUNC:
self._syntax_warning(ParserError.MISSED_END_OF_FUNCTION)
self._function_done()
self._vtable_marker(marker)
else:
self._syntax_error(ParserError.INCOMPATIBLE_MARKER)
else:
self._syntax_warning(ParserError.BOGUS_MARKER)
def read_line(self, line: str):
self.last_line = line # TODO: Useful or hack for error reporting?
self.line_number += 1
marker = match_marker(line)
if marker is not None:
# TODO: what's the best place for this?
# Does it belong with reading or marker handling?
if not is_marker_exact(self.last_line):
self._syntax_warning(ParserError.BAD_DECOMP_MARKER)
self._handle_marker(marker)
return
if self.state == ReaderState.IN_TEMPLATE:
# TEMPLATE functions are a special case. The signature is # TEMPLATE functions are a special case. The signature is
# given on the next line (in a // comment) # given on the next line (in a // comment)
function_sig = get_template_function_name(line) self.function_sig = get_template_function_name(line)
start_line = line_no self._function_starts_here()
end_line = line_no self._function_done()
state = ReaderState.FUNCTION_DONE
elif state == ReaderState.WANT_SIG: elif self.state == ReaderState.WANT_SIG:
# Skip blank lines or comments that come after the offset # Skip blank lines or comments that come after the offset
# marker. There is not a formal procedure for this, so just # marker. There is not a formal procedure for this, so just
# assume the next "code line" is the function signature # assume the next "code line" is the function signature
if not is_blank_or_comment(line): if not is_blank_or_comment(line):
# Inline functions may end with a comment. Strip that out # Inline functions may end with a comment. Strip that out
# to help parsing. # to help parsing.
function_sig = remove_trailing_comment(line.strip()) self.function_sig = remove_trailing_comment(line.strip())
# Now check to see if the opening curly bracket is on the # Now check to see if the opening curly bracket is on the
# same line. clang-format should prevent this (BraceWrapping) # same line. clang-format should prevent this (BraceWrapping)
# but it is easy to detect. # but it is easy to detect.
# If the entire function is on one line, handle that too. # If the entire function is on one line, handle that too.
if function_sig.endswith("{"): if self.function_sig.endswith("{"):
start_line = line_no self._function_starts_here()
state = ReaderState.IN_FUNC self.state = ReaderState.IN_FUNC
elif function_sig.endswith("}") or function_sig.endswith("};"): elif self.function_sig.endswith("}") or self.function_sig.endswith(
start_line = line_no "};"
end_line = line_no ):
state = ReaderState.FUNCTION_DONE self._function_starts_here()
self._function_done()
else: else:
state = ReaderState.WANT_CURLY self.state = ReaderState.WANT_CURLY
elif state == ReaderState.WANT_CURLY: elif self.state == ReaderState.WANT_CURLY:
if line.strip() == "{": if line.strip() == "{":
start_line = line_no self._function_starts_here()
state = ReaderState.IN_FUNC self.state = ReaderState.IN_FUNC
elif state == ReaderState.IN_FUNC: elif self.state == ReaderState.IN_FUNC:
# Naive but reasonable assumption that functions will end with # Naive but reasonable assumption that functions will end with
# a curly brace on its own line with no prepended spaces. # a curly brace on its own line with no prepended spaces.
if line.startswith("}"): if line.startswith("}"):
end_line = line_no self._function_done()
state = ReaderState.FUNCTION_DONE
return blocks elif self.state in (ReaderState.IN_GLOBAL, ReaderState.IN_FUNC_GLOBAL):
if not is_blank_or_comment(line):
self._variable_done()
elif self.state == ReaderState.IN_VTABLE:
if not is_blank_or_comment(line):
self._vtable_done()
def read_lines(self, lines: Iterable):
for line in lines:
self.read_line(line)
def find_code_blocks(stream: TextIO) -> List[ParserNode]:
"""Read the IO stream (file) line-by-line and give the following report:
Foreach code block (function) in the file, what are its starting and
ending line numbers, and what is the given offset in the original
binary. We expect the result to be ordered by line number because we
are reading the file from start to finish."""
# TODO: this will be replaced shortly. shim for now to avoid
# making more changes elsewhere
p = DecompParser()
for line in stream:
p.read_line(line)
return p.functions
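A brief usage sketch of the new class (not part of the commit), feeding it the kind of annotated source used by the sample files below:

from isledecomp.parser import DecompParser

source = """\
// VTABLE: TEST 0x1001002
class TestClass {};

// GLOBAL: TEST 0x5000
const char *g_message = "hello";

// FUNCTION: TEST 0x1234
void function01()
{
}
"""

parser = DecompParser()
parser.read_lines(source.splitlines())

assert len(parser.vtables) == 1
assert len(parser.variables) == 1
assert len(parser.functions) == 1
assert parser.functions[0].offset == 0x1234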

View File

@@ -4,41 +4,15 @@ import re
from typing import List
from collections import namedtuple

-CodeBlock = namedtuple(
-    "CodeBlock",
-    [
-        "offset",
-        "signature",
-        "start_line",
-        "end_line",
-        "offset_comment",
-        "module",
-        "is_template",
-        "is_stub",
-    ],
-)
+DecompMarker = namedtuple("DecompMarker", ["type", "module", "offset"])

-OffsetMatch = namedtuple(
-    "OffsetMatch", ["module", "address", "is_template", "is_stub", "comment"]
-)
-
-# This has not been formally established, but considering that "STUB"
-# is a temporary state for a function, we assume it will appear last,
-# after any other modifiers (i.e. TEMPLATE)
-# To match a reasonable variance of formatting for the offset comment
-offsetCommentRegex = re.compile(
-    r"\s*//\s*OFFSET:\s*(\w+)\s+(?:0x)?([a-f0-9]+)(\s+TEMPLATE)?(\s+STUB)?", # nopep8
+markerRegex = re.compile(
+    r"\s*//\s*(\w+):\s*(\w+)\s+((?:0x)?[a-f0-9]+)",
    flags=re.I,
)

-# To match the exact syntax (text upper case, hex lower case, with spaces)
-# that is used in most places
-offsetCommentExactRegex = re.compile(
-    r"^// OFFSET: [A-Z0-9]+ (0x[a-f0-9]+)( TEMPLATE)?( STUB)?$"
-) # nopep8
+markerExactRegex = re.compile(r"// ([A-Z]+): ([A-Z0-9]+) (0x[a-f0-9]+)$")

# The goal here is to just read whatever is on the next line, so some
# flexibility in the formatting seems OK
@@ -78,39 +52,15 @@ def is_blank_or_comment(line: str) -> bool:
    )


-def is_exact_offset_comment(line: str) -> bool:
-    """If the offset comment does not match our (unofficial) syntax
-    we may want to alert the user to fix it for style points."""
-    return offsetCommentExactRegex.match(line) is not None
-
-
-def match_offset_comment(line: str) -> OffsetMatch | None:
-    match = offsetCommentRegex.match(line)
+def match_marker(line: str) -> DecompMarker | None:
+    match = markerRegex.match(line)
    if match is None:
        return None

-    return OffsetMatch(
-        module=match.group(1),
-        address=int(match.group(2), 16),
-        is_template=match.group(3) is not None,
-        is_stub=match.group(4) is not None,
-        comment=line.strip(),
+    return DecompMarker(
+        type=match.group(1), module=match.group(2), offset=int(match.group(3), 16)
    )


-def distinct_by_module(offsets: List) -> List:
-    """Given a list of offset markers, return a list with distinct
-    module names. If module names (case-insensitive) are repeated,
-    choose the offset that appears first."""
-    if len(offsets) < 2:
-        return offsets
-
-    # Dict maintains insertion order in python >=3.7
-    offsets_dict = {}
-    for offset in offsets:
-        module_upper = offset.module.upper()
-        if module_upper not in offsets_dict:
-            offsets_dict[module_upper] = offset
-
-    return list(offsets_dict.values())
+def is_marker_exact(line: str) -> bool:
+    return markerExactRegex.match(line) is not None
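The relaxed and exact patterns above can disagree on the same line; a quick sketch of that difference (using only the two helpers defined here):

from isledecomp.parser.util import match_marker, is_marker_exact

# The case-insensitive pattern still recognizes a sloppy marker...
sloppy = "// function: test 0x1234"
marker = match_marker(sloppy)
assert marker is not None and marker.offset == 0x1234

# ...but the strict pattern does not, which is what triggers the
# BAD_DECOMP_MARKER warning in DecompParser.
assert is_marker_exact(sloppy) is False
assert is_marker_exact("// FUNCTION: TEST 0x1234") is True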

View File

@@ -3,6 +3,7 @@
// A very simple class
+// VTABLE: TEST 0x1001002
class TestClass {
public:
    TestClass();
@@ -10,14 +11,14 @@ public:
    virtual MxResult Tickle() override; // vtable+08

-    // OFFSET: TEST 0x12345678
+    // FUNCTION: TEST 0x12345678
    inline const char* ClassName() const // vtable+0c
    {
        // 0xabcd1234
        return "TestClass";
    }

-    // OFFSET: TEST 0xdeadbeef
+    // FUNCTION: TEST 0xdeadbeef
    inline MxBool IsA(const char* name) const override // vtable+10
    {
        return !strcmp(name, TestClass::ClassName());

View File

@@ -3,19 +3,19 @@
// A very simple well-formed code file

-// OFFSET: TEST 0x1234
+// FUNCTION: TEST 0x1234
void function01()
{
    // TODO
}

-// OFFSET: TEST 0x2345
+// FUNCTION: TEST 0x2345
void function02()
{
    // TODO
}

-// OFFSET: TEST 0x3456
+// FUNCTION: TEST 0x3456
void function03()
{
    // TODO

View File

@@ -0,0 +1,14 @@
// Sample for python unit tests
// Not part of the decomp
// Global variables inside and outside of functions
// GLOBAL: TEST 0x1000
const char *g_message = "test";
// FUNCTION: TEST 0x1234
void function01()
{
// GLOBAL: TEST 0x5555
static int g_hello = 123;
}
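Fed to DecompParser, this sample should produce one function and two variables (the marker inside function01 goes through the IN_FUNC_GLOBAL state); a sketch, assuming the package layout above:

from isledecomp.parser import DecompParser

lines = [
    "// GLOBAL: TEST 0x1000",
    'const char *g_message = "test";',
    "",
    "// FUNCTION: TEST 0x1234",
    "void function01()",
    "{",
    "// GLOBAL: TEST 0x5555",
    "static int g_hello = 123;",
    "}",
]

parser = DecompParser()
parser.read_lines(lines)

assert len(parser.alerts) == 0
assert len(parser.functions) == 1
assert len(parser.variables) == 2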

View File

@@ -1,8 +1,8 @@
// Sample for python unit tests
// Not part of the decomp

-// OFFSET: TEST 0x10000001
+// FUNCTION: TEST 0x10000001
inline const char* OneLineWithComment() const { return "MxDSObject"; }; // hi there

-// OFFSET: TEST 0x10000002
+// FUNCTION: TEST 0x10000002
inline const char* OneLine() const { return "MxDSObject"; };

View File

@@ -9,7 +9,7 @@ int no_offset_comment()
    return -1;
}

-// OFFSET: TEST 0xdeadbeef
+// FUNCTION: TEST 0xdeadbeef
void regular_ole_function()
{
    printf("hi there");

View File

@@ -3,22 +3,22 @@
// Handling multiple offset markers

-// OFFSET: TEST 0x1234
-// OFFSET: HELLO 0x5555
+// FUNCTION: TEST 0x1234
+// FUNCTION: HELLO 0x5555
void different_modules()
{
    // TODO
}

-// OFFSET: TEST 0x2345
-// OFFSET: TEST 0x1234
+// FUNCTION: TEST 0x2345
+// FUNCTION: TEST 0x1234
void same_module()
{
    // TODO
}

-// OFFSET: TEST 0x2002
-// OFFSET: test 0x1001
+// FUNCTION: TEST 0x2002
+// FUNCTION: test 0x1001
void same_case_insensitive()
{
    // TODO

View File

@@ -1,10 +1,10 @@
// Sample for python unit tests
// Not part of the decomp

-// OFFSET: TEST 0x1234
+// FUNCTION: TEST 0x1234
void short_function() { static char* msg = "oneliner"; }

-// OFFSET: TEST 0x5555
+// FUNCTION: TEST 0x5555
void function_after_one_liner()
{
    // This function comes after the previous that is on a single line.

View File

@@ -1,19 +1,19 @@
// Sample for python unit tests
// Not part of the decomp

-// OFFSET: TEST 0x1001
+// FUNCTION: TEST 0x1001
void function_order01()
{
    // TODO
}

-// OFFSET: TEST 0x1003
+// FUNCTION: TEST 0x1003
void function_order03()
{
    // TODO
}

-// OFFSET: TEST 0x1002
+// FUNCTION: TEST 0x1002
void function_order02()
{
    // TODO

View File

@@ -4,18 +4,18 @@
// While it's reasonable to expect a well-formed file (and clang-format
// will make sure we get one), this will put the parser through its paces.

-// OFFSET: TEST 0x1234
+// FUNCTION: TEST 0x1234
void curly_with_spaces()
{
    static char* msg = "hello";
}

-// OFFSET: TEST 0x5555
+// FUNCTION: TEST 0x5555
void weird_closing_curly()
{
    int x = 123; }

-// OFFSET: HELLO 0x5656
+// FUNCTION: HELLO 0x5656
void bad_indenting() {
    if (0)
    {

View File

@@ -1,127 +1,170 @@
import os import pytest
from typing import List, TextIO from isledecomp.parser.parser import (
from isledecomp.parser import find_code_blocks ReaderState,
from isledecomp.parser.util import CodeBlock DecompParser,
)
SAMPLE_DIR = os.path.join(os.path.dirname(__file__), "samples") from isledecomp.parser.util import DecompMarker
from isledecomp.parser.error import ParserError
def sample_file(filename: str) -> TextIO: @pytest.fixture
"""Wrapper for opening the samples from the directory that does not def parser():
depend on the cwd where we run the test""" return DecompParser()
full_path = os.path.join(SAMPLE_DIR, filename)
return open(full_path, "r", encoding="utf-8")
def code_blocks_are_sorted(blocks: List[CodeBlock]) -> bool: @pytest.mark.skip(reason="todo")
"""Helper to make this more idiomatic""" def test_missing_sig(parser):
just_offsets = [block.offset for block in blocks] """Bad syntax: function signature is missing"""
return just_offsets == sorted(just_offsets) parser.read_lines(["// FUNCTION: TEST 0x1234", "{"])
assert parser.state == ReaderState.IN_FUNC
assert len(parser.alerts) == 1
parser.read_line("}")
assert len(parser.functions) == 1
assert parser.functions[0] != "{"
# Tests are below # def test_not_exact_syntax(parser):
"""Alert to inexact syntax right here in the parser instead of kicking it downstream.
Doing this means we don't have to save the actual text."""
parser.read_line("// function: test 1234")
assert len(parser.alerts) == 1
assert parser.alerts[0].code == ParserError.BAD_DECOMP_MARKER
def test_sanity(): def test_invalid_marker(parser):
"""Read a very basic file""" """We matched a decomp marker, but it's not one we care about"""
with sample_file("basic_file.cpp") as f: parser.read_line("// BANANA: TEST 0x1234")
blocks = find_code_blocks(f) assert parser.state == ReaderState.SEARCH
assert len(blocks) == 3 assert len(parser.alerts) == 1
assert code_blocks_are_sorted(blocks) is True assert parser.alerts[0].code == ParserError.BOGUS_MARKER
# n.b. The parser returns line numbers as 1-based
# Function starts when we see the opening curly brace
assert blocks[0].start_line == 8
assert blocks[0].end_line == 10
def test_oneline(): def test_unexpected_marker(parser):
"""(Assuming clang-format permits this) This sample has a function parser.read_lines(
on a single line. This will test the end-of-function detection""" [
with sample_file("oneline_function.cpp") as f: "// FUNCTION: TEST 0x1234",
blocks = find_code_blocks(f) "// GLOBAL: TEST 0x5000",
]
assert len(blocks) == 2 )
assert blocks[0].start_line == 5 assert parser.state == ReaderState.SEARCH
assert blocks[0].end_line == 5 assert len(parser.alerts) == 1
assert parser.alerts[0].code == ParserError.INCOMPATIBLE_MARKER
def test_missing_offset(): def test_variable(parser):
"""What if the function doesn't have an offset comment?""" parser.read_lines(
with sample_file("missing_offset.cpp") as f: [
blocks = find_code_blocks(f) "// GLOBAL: HELLO 0x1234",
"int g_value = 5;",
# TODO: For now, the function without the offset will just be ignored. ]
# Would be the same outcome if the comment was present but mangled and )
# we failed to match it. We should detect these cases in the future. assert len(parser.variables) == 1
assert len(blocks) == 1
def test_jumbled_case(): def test_synthetic_plus_marker(parser):
"""The parser just reports what it sees. It is the responsibility of """Should fail with error and not log the synthetic"""
the downstream tools to do something about a jumbled file. parser.read_lines(
Just verify that we are reading it correctly.""" [
with sample_file("out_of_order.cpp") as f: "// SYNTHETIC: HEY 0x555",
blocks = find_code_blocks(f) "// FUNCTION: HOWDY 0x1234",
]
assert len(blocks) == 3 )
assert code_blocks_are_sorted(blocks) is False assert len(parser.functions) == 0
assert len(parser.alerts) == 1
assert parser.alerts[0].code == ParserError.INCOMPATIBLE_MARKER
def test_bad_file(): def test_different_markers_different_module(parser):
with sample_file("poorly_formatted.cpp") as f: """Does it make any sense for a function to be a stub in one module,
blocks = find_code_blocks(f) but not in another? I don't know. But it's no problem for us."""
parser.read_lines(
[
"// FUNCTION: HOWDY 0x1234",
"// STUB: SUP 0x5555",
"void interesting_function() {",
"}",
]
)
assert len(blocks) == 3 assert len(parser.alerts) == 0
assert len(parser.functions) == 2
def test_indented(): def test_different_markers_same_module(parser):
"""Offsets for functions inside of a class will probably be indented.""" """Now, if something is a regular function but then a stub,
with sample_file("basic_class.cpp") as f: what do we say about that?"""
blocks = find_code_blocks(f) parser.read_lines(
[
"// FUNCTION: HOWDY 0x1234",
"// STUB: HOWDY 0x5555",
"void interesting_function() {",
"}",
]
)
# TODO: We don't properly detect the end of these functions # Use first marker declaration, don't replace
# because the closing brace is indented. However... knowing where each assert len(parser.functions) == 1
# function ends is less important (for now) than capturing assert parser.functions[0].is_stub is False
# all the functions that are there.
assert len(blocks) == 2 # Should alert to this
assert blocks[0].offset == int("0x12345678", 16) assert len(parser.alerts) == 1
assert blocks[0].start_line == 15 assert parser.alerts[0].code == ParserError.DUPLICATE_MODULE
# assert blocks[0].end_line == 18
assert blocks[1].offset == int("0xdeadbeef", 16)
assert blocks[1].start_line == 22
# assert blocks[1].end_line == 24
def test_inline(): def test_unexpected_synthetic(parser):
with sample_file("inline.cpp") as f: """FUNCTION then SYNTHETIC should fail to report either one"""
blocks = find_code_blocks(f) parser.read_lines(
[
"// FUNCTION: HOWDY 0x1234",
"// SYNTHETIC: HOWDY 0x5555",
"void interesting_function() {",
"}",
]
)
assert len(blocks) == 2 assert parser.state == ReaderState.SEARCH
for block in blocks: assert len(parser.functions) == 0
assert block.start_line is not None assert len(parser.alerts) == 1
assert block.start_line == block.end_line assert parser.alerts[0].code == ParserError.INCOMPATIBLE_MARKER
def test_multiple_offsets(): @pytest.mark.skip(reason="not implemented yet")
"""If multiple offset marks appear before for a code block, take them def test_duplicate_offset(parser):
all but ensure module name (case-insensitive) is distinct. """Repeating the same module/offset in the same file is probably a typo"""
Use first module occurrence in case of duplicates.""" parser.read_lines(
with sample_file("multiple_offsets.cpp") as f: [
blocks = find_code_blocks(f) "// GLOBAL: HELLO 0x1234",
"int x = 1;",
"// GLOBAL: HELLO 0x1234",
"int y = 2;",
]
)
assert len(blocks) == 4 assert len(parser.alerts) == 1
assert blocks[0].module == "TEST" assert parser.alerts[0].code == ParserError.DUPLICATE_OFFSET
assert blocks[0].start_line == 9
assert blocks[1].module == "HELLO"
assert blocks[1].start_line == 9
# Duplicate modules are ignored def test_multiple_variables(parser):
assert blocks[2].start_line == 16 """Theoretically the same global variable can appear in multiple modules"""
assert blocks[2].offset == 0x2345 parser.read_lines(
[
"// GLOBAL: HELLO 0x1234",
"// GLOBAL: WUZZUP 0x555",
"const char *g_greeting;",
]
)
assert len(parser.alerts) == 0
assert len(parser.variables) == 2
assert blocks[3].module == "TEST"
assert blocks[3].offset == 0x2002 def test_multiple_vtables(parser):
parser.read_lines(
[
"// VTABLE: HELLO 0x1234",
"// VTABLE: TEST 0x5432",
"class MxString : public MxCore {",
]
)
assert len(parser.alerts) == 0
assert len(parser.vtables) == 2

View File

@@ -0,0 +1,141 @@
import os
import pytest
from typing import List, TextIO
from isledecomp.parser import DecompParser
from isledecomp.parser.node import ParserSymbol
SAMPLE_DIR = os.path.join(os.path.dirname(__file__), "samples")
def sample_file(filename: str) -> TextIO:
"""Wrapper for opening the samples from the directory that does not
depend on the cwd where we run the test"""
full_path = os.path.join(SAMPLE_DIR, filename)
return open(full_path, "r", encoding="utf-8")
def code_blocks_are_sorted(blocks: List[ParserSymbol]) -> bool:
"""Helper to make this more idiomatic"""
just_offsets = [block.offset for block in blocks]
return just_offsets == sorted(just_offsets)
@pytest.fixture
def parser():
return DecompParser()
# Tests are below #
def test_sanity(parser):
"""Read a very basic file"""
with sample_file("basic_file.cpp") as f:
parser.read_lines(f)
assert len(parser.functions) == 3
assert code_blocks_are_sorted(parser.functions) is True
# n.b. The parser returns line numbers as 1-based
# Function starts when we see the opening curly brace
assert parser.functions[0].line_number == 8
assert parser.functions[0].end_line == 10
def test_oneline(parser):
"""(Assuming clang-format permits this) This sample has a function
on a single line. This will test the end-of-function detection"""
with sample_file("oneline_function.cpp") as f:
parser.read_lines(f)
assert len(parser.functions) == 2
assert parser.functions[0].line_number == 5
assert parser.functions[0].end_line == 5
def test_missing_offset(parser):
"""What if the function doesn't have an offset comment?"""
with sample_file("missing_offset.cpp") as f:
parser.read_lines(f)
# TODO: For now, the function without the offset will just be ignored.
# Would be the same outcome if the comment was present but mangled and
# we failed to match it. We should detect these cases in the future.
assert len(parser.functions) == 1
def test_jumbled_case(parser):
"""The parser just reports what it sees. It is the responsibility of
the downstream tools to do something about a jumbled file.
Just verify that we are reading it correctly."""
with sample_file("out_of_order.cpp") as f:
parser.read_lines(f)
assert len(parser.functions) == 3
assert code_blocks_are_sorted(parser.functions) is False
def test_bad_file(parser):
with sample_file("poorly_formatted.cpp") as f:
parser.read_lines(f)
assert len(parser.functions) == 3
def test_indented(parser):
"""Offsets for functions inside of a class will probably be indented."""
with sample_file("basic_class.cpp") as f:
parser.read_lines(f)
# TODO: We don't properly detect the end of these functions
# because the closing brace is indented. However... knowing where each
# function ends is less important (for now) than capturing
# all the functions that are there.
assert len(parser.functions) == 2
assert parser.functions[0].offset == int("0x12345678", 16)
assert parser.functions[0].line_number == 16
# assert parser.functions[0].end_line == 19
assert parser.functions[1].offset == int("0xdeadbeef", 16)
assert parser.functions[1].line_number == 23
# assert parser.functions[1].end_line == 25
def test_inline(parser):
with sample_file("inline.cpp") as f:
parser.read_lines(f)
assert len(parser.functions) == 2
for fun in parser.functions:
assert fun.line_number is not None
assert fun.line_number == fun.end_line
def test_multiple_offsets(parser):
"""If multiple offset marks appear before for a code block, take them
all but ensure module name (case-insensitive) is distinct.
Use first module occurrence in case of duplicates."""
with sample_file("multiple_offsets.cpp") as f:
parser.read_lines(f)
assert len(parser.functions) == 4
assert parser.functions[0].module == "TEST"
assert parser.functions[0].line_number == 9
assert parser.functions[1].module == "HELLO"
assert parser.functions[1].line_number == 9
# Duplicate modules are ignored
assert parser.functions[2].line_number == 16
assert parser.functions[2].offset == 0x2345
assert parser.functions[3].module == "TEST"
assert parser.functions[3].offset == 0x2002
def test_variables(parser):
with sample_file("global_variables.cpp") as f:
parser.read_lines(f)
assert len(parser.functions) == 1
assert len(parser.variables) == 2

View File

@@ -0,0 +1,150 @@
import pytest
from isledecomp.parser.parser import (
ReaderState as _rs,
DecompParser,
)
from isledecomp.parser.util import DecompMarker
from isledecomp.parser.error import ParserError as _pe
# fmt: off
state_change_marker_cases = [
(_rs.SEARCH, "FUNCTION", _rs.WANT_SIG, None),
(_rs.SEARCH, "GLOBAL", _rs.IN_GLOBAL, None),
(_rs.SEARCH, "STUB", _rs.WANT_SIG, None),
(_rs.SEARCH, "SYNTHETIC", _rs.IN_TEMPLATE, None),
(_rs.SEARCH, "TEMPLATE", _rs.IN_TEMPLATE, None),
(_rs.SEARCH, "VTABLE", _rs.IN_VTABLE, None),
(_rs.WANT_SIG, "FUNCTION", _rs.WANT_SIG, None),
(_rs.WANT_SIG, "GLOBAL", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
(_rs.WANT_SIG, "STUB", _rs.WANT_SIG, None),
(_rs.WANT_SIG, "SYNTHETIC", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
(_rs.WANT_SIG, "TEMPLATE", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
(_rs.WANT_SIG, "VTABLE", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
(_rs.IN_FUNC, "FUNCTION", _rs.WANT_SIG, _pe.MISSED_END_OF_FUNCTION),
(_rs.IN_FUNC, "GLOBAL", _rs.IN_FUNC_GLOBAL, None),
(_rs.IN_FUNC, "STUB", _rs.WANT_SIG, _pe.MISSED_END_OF_FUNCTION),
(_rs.IN_FUNC, "SYNTHETIC", _rs.IN_TEMPLATE, _pe.MISSED_END_OF_FUNCTION),
(_rs.IN_FUNC, "TEMPLATE", _rs.IN_TEMPLATE, _pe.MISSED_END_OF_FUNCTION),
(_rs.IN_FUNC, "VTABLE", _rs.IN_VTABLE, _pe.MISSED_END_OF_FUNCTION),
(_rs.IN_TEMPLATE, "FUNCTION", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
(_rs.IN_TEMPLATE, "GLOBAL", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
(_rs.IN_TEMPLATE, "STUB", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
(_rs.IN_TEMPLATE, "SYNTHETIC", _rs.IN_TEMPLATE, None),
(_rs.IN_TEMPLATE, "TEMPLATE", _rs.IN_TEMPLATE, None),
(_rs.IN_TEMPLATE, "VTABLE", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
(_rs.WANT_CURLY, "FUNCTION", _rs.SEARCH, _pe.UNEXPECTED_MARKER),
(_rs.WANT_CURLY, "GLOBAL", _rs.SEARCH, _pe.UNEXPECTED_MARKER),
(_rs.WANT_CURLY, "STUB", _rs.SEARCH, _pe.UNEXPECTED_MARKER),
(_rs.WANT_CURLY, "SYNTHETIC", _rs.SEARCH, _pe.UNEXPECTED_MARKER),
(_rs.WANT_CURLY, "TEMPLATE", _rs.SEARCH, _pe.UNEXPECTED_MARKER),
(_rs.WANT_CURLY, "VTABLE", _rs.SEARCH, _pe.UNEXPECTED_MARKER),
(_rs.IN_GLOBAL, "FUNCTION", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
(_rs.IN_GLOBAL, "GLOBAL", _rs.IN_GLOBAL, None),
(_rs.IN_GLOBAL, "STUB", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
(_rs.IN_GLOBAL, "SYNTHETIC", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
(_rs.IN_GLOBAL, "TEMPLATE", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
(_rs.IN_GLOBAL, "VTABLE", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
(_rs.IN_FUNC_GLOBAL, "FUNCTION", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
(_rs.IN_FUNC_GLOBAL, "GLOBAL", _rs.IN_FUNC_GLOBAL, None),
(_rs.IN_FUNC_GLOBAL, "STUB", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
(_rs.IN_FUNC_GLOBAL, "SYNTHETIC", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
(_rs.IN_FUNC_GLOBAL, "TEMPLATE", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
(_rs.IN_FUNC_GLOBAL, "VTABLE", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
(_rs.IN_VTABLE, "FUNCTION", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
(_rs.IN_VTABLE, "GLOBAL", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
(_rs.IN_VTABLE, "STUB", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
(_rs.IN_VTABLE, "SYNTHETIC", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
(_rs.IN_VTABLE, "TEMPLATE", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER),
(_rs.IN_VTABLE, "VTABLE", _rs.IN_VTABLE, None),
]
# fmt: on
@pytest.mark.parametrize(
"state, marker_type, new_state, expected_error", state_change_marker_cases
)
def test_state_change_by_marker(
state: _rs, marker_type: str, new_state: _rs, expected_error: None | _pe
):
p = DecompParser()
p.state = state
p._handle_marker(DecompMarker(marker_type, "TEST", 0x1234))
assert p.state == new_state
if expected_error is not None:
assert len(p.alerts) > 0
assert p.alerts[0].code == expected_error
# Reading any of these lines should have no effect in ReaderState.SEARCH
search_lines_no_effect = [
"",
"\t",
" ",
"int x = 0;",
"// Comment",
"/*",
"*/",
"/* Block comment */",
"{",
"}",
]
@pytest.mark.parametrize("line", search_lines_no_effect)
def test_state_search_line(line: str):
p = DecompParser()
p.read_line(line)
assert p.state == _rs.SEARCH
assert len(p.alerts) == 0
global_lines = [
("// A comment", _rs.IN_GLOBAL),
("", _rs.IN_GLOBAL),
("\t", _rs.IN_GLOBAL),
(" ", _rs.IN_GLOBAL),
# TODO: no check for "likely" variable declaration so these all count
("void function()", _rs.SEARCH),
("int x = 123;", _rs.SEARCH),
("just some text", _rs.SEARCH),
]
@pytest.mark.parametrize("line, new_state", global_lines)
def test_state_global_line(line: str, new_state: _rs):
p = DecompParser()
p.read_line("// GLOBAL: TEST 0x1234")
assert p.state == _rs.IN_GLOBAL
p.read_line(line)
assert p.state == new_state
# mostly same as above
in_func_global_lines = [
("// A comment", _rs.IN_FUNC_GLOBAL),
("", _rs.IN_FUNC_GLOBAL),
("\t", _rs.IN_FUNC_GLOBAL),
(" ", _rs.IN_FUNC_GLOBAL),
# TODO: no check for "likely" variable declaration so these all count
("void function()", _rs.IN_FUNC),
("int x = 123;", _rs.IN_FUNC),
("just some text", _rs.IN_FUNC),
]
@pytest.mark.parametrize("line, new_state", in_func_global_lines)
def test_state_in_func_global_line(line: str, new_state: _rs):
p = DecompParser()
p.state = _rs.IN_FUNC
p.read_line("// GLOBAL: TEST 0x1234")
assert p.state == _rs.IN_FUNC_GLOBAL
p.read_line(line)
assert p.state == new_state

View File

@@ -1,11 +1,12 @@
from collections import namedtuple
from typing import List
import pytest
+from isledecomp.parser.parser import MarkerDict
from isledecomp.parser.util import (
+    DecompMarker,
    is_blank_or_comment,
-    match_offset_comment,
-    is_exact_offset_comment,
-    distinct_by_module,
+    match_marker,
+    is_marker_exact,
)
@@ -28,76 +29,72 @@ def test_is_blank_or_comment(line: str, expected: bool):
assert is_blank_or_comment(line) is expected assert is_blank_or_comment(line) is expected
offset_comment_samples = [ marker_samples = [
# (can_parse: bool, exact_match: bool, line: str) # (can_parse: bool, exact_match: bool, line: str)
# Should match both expected modules with optional STUB marker (True, True, "// FUNCTION: LEGO1 0xdeadbeef"),
(True, True, "// OFFSET: LEGO1 0xdeadbeef"), (True, True, "// FUNCTION: ISLE 0x12345678"),
(True, True, "// OFFSET: LEGO1 0xdeadbeef STUB"),
(True, True, "// OFFSET: ISLE 0x12345678"),
(True, True, "// OFFSET: ISLE 0x12345678 STUB"),
# No trailing spaces allowed # No trailing spaces allowed
(True, False, "// OFFSET: LEGO1 0xdeadbeef "), (True, False, "// FUNCTION: LEGO1 0xdeadbeef "),
(True, False, "// OFFSET: LEGO1 0xdeadbeef STUB "),
# Must have exactly one space between elements # Must have exactly one space between elements
(True, False, "//OFFSET: ISLE 0xdeadbeef"), (True, False, "//FUNCTION: ISLE 0xdeadbeef"),
(True, False, "// OFFSET:ISLE 0xdeadbeef"), (True, False, "// FUNCTION:ISLE 0xdeadbeef"),
(True, False, "// OFFSET: ISLE 0xdeadbeef"), (True, False, "// FUNCTION: ISLE 0xdeadbeef"),
(True, False, "// OFFSET: ISLE 0xdeadbeef"), (True, False, "// FUNCTION: ISLE 0xdeadbeef"),
(True, False, "// OFFSET: ISLE 0xdeadbeef"), (True, False, "// FUNCTION: ISLE 0xdeadbeef"),
(True, False, "// OFFSET: ISLE 0xdeadbeef STUB"),
# Must have 0x prefix for hex number # Must have 0x prefix for hex number
(True, False, "// OFFSET: ISLE deadbeef"), (True, False, "// FUNCTION: ISLE deadbeef"),
# Offset, module name, and STUB must be uppercase # Offset, module name, and STUB must be uppercase
(True, False, "// offset: ISLE 0xdeadbeef"), (True, False, "// function: ISLE 0xdeadbeef"),
(True, False, "// offset: isle 0xdeadbeef"), (True, False, "// function: isle 0xdeadbeef"),
(True, False, "// OFFSET: LEGO1 0xdeadbeef stub"),
# Hex string must be lowercase # Hex string must be lowercase
(True, False, "// OFFSET: ISLE 0xDEADBEEF"), (True, False, "// FUNCTION: ISLE 0xDEADBEEF"),
# TODO: How flexible should we be with matching the module name? # TODO: How flexible should we be with matching the module name?
(True, True, "// OFFSET: OMNI 0x12345678"), (True, True, "// FUNCTION: OMNI 0x12345678"),
(True, True, "// OFFSET: LEG01 0x12345678"), (True, True, "// FUNCTION: LEG01 0x12345678"),
(True, False, "// OFFSET: hello 0x12345678"), (True, False, "// FUNCTION: hello 0x12345678"),
# Not close enough to match # Not close enough to match
(False, False, "// OFFSET: ISLE0x12345678"), (False, False, "// FUNCTION: ISLE0x12345678"),
(False, False, "// OFFSET: 0x12345678"), (False, False, "// FUNCTION: 0x12345678"),
(False, False, "// LEGO1: 0x12345678"), (False, False, "// LEGO1: 0x12345678"),
# Hex string shorter than 8 characters # Hex string shorter than 8 characters
(True, True, "// OFFSET: LEGO1 0x1234"), (True, True, "// FUNCTION: LEGO1 0x1234"),
# TODO: These match but shouldn't. # TODO: These match but shouldn't.
# (False, False, '// OFFSET: LEGO1 0'), # (False, False, '// FUNCTION: LEGO1 0'),
# (False, False, '// OFFSET: LEGO1 0x'), # (False, False, '// FUNCTION: LEGO1 0x'),
] ]
@pytest.mark.parametrize("match, _, line", offset_comment_samples) @pytest.mark.parametrize("match, _, line", marker_samples)
def test_offset_match(line: str, match: bool, _): def test_marker_match(line: str, match: bool, _):
did_match = match_offset_comment(line) is not None did_match = match_marker(line) is not None
assert did_match is match assert did_match is match
@pytest.mark.parametrize("_, exact, line", offset_comment_samples) @pytest.mark.parametrize("_, exact, line", marker_samples)
def test_exact_offset_comment(line: str, exact: bool, _): def test_marker_exact(line: str, exact: bool, _):
assert is_exact_offset_comment(line) is exact assert is_marker_exact(line) is exact
# Helper for the next test: cut down version of OffsetMatch def test_marker_dict_simple():
MiniOfs = namedtuple("MiniOfs", ["module", "value"]) d = MarkerDict()
d.insert(DecompMarker("FUNCTION", "TEST", 0x1234))
distinct_by_module_samples = [ markers = list(d.iter())
# empty set assert len(markers) == 1
([], []),
# same module name
([MiniOfs("TEST", 123), MiniOfs("TEST", 555)], [MiniOfs("TEST", 123)]),
# same module name, case-insensitive
([MiniOfs("test", 123), MiniOfs("TEST", 555)], [MiniOfs("test", 123)]),
# duplicates, non-consecutive
(
[MiniOfs("test", 123), MiniOfs("abc", 111), MiniOfs("TEST", 555)],
[MiniOfs("test", 123), MiniOfs("abc", 111)],
),
]
@pytest.mark.parametrize("sample, expected", distinct_by_module_samples) def test_marker_dict_ofs_replace():
def test_distinct_by_module(sample: List[MiniOfs], expected: List[MiniOfs]): d = MarkerDict()
assert distinct_by_module(sample) == expected d.insert(DecompMarker("FUNCTION", "TEST", 0x1234))
d.insert(DecompMarker("FUNCTION", "TEST", 0x555))
markers = list(d.iter())
assert len(markers) == 1
assert markers[0].offset == 0x1234
def test_marker_dict_type_replace():
d = MarkerDict()
d.insert(DecompMarker("FUNCTION", "TEST", 0x1234))
d.insert(DecompMarker("STUB", "TEST", 0x1234))
markers = list(d.iter())
assert len(markers) == 1
assert markers[0].type == "FUNCTION"
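The MarkerDict tests above pin down the first-wins behavior; the same contract in a compact sketch (not part of the commit):

from isledecomp.parser.parser import MarkerDict
from isledecomp.parser.util import DecompMarker

d = MarkerDict()
assert d.insert(DecompMarker("FUNCTION", "TEST", 0x1234)) is False
# A second marker for the same module reports a collision and is ignored.
assert d.insert(DecompMarker("STUB", "TEST", 0x5555)) is True

markers = list(d.iter())
assert len(markers) == 1
assert markers[0].type == "FUNCTION" and markers[0].offset == 0x1234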

View File

@@ -10,7 +10,7 @@ import re
from isledecomp import (
    Bin,
-    find_code_blocks,
+    DecompParser,
    get_file_in_script_dir,
    OffsetPlaceholderGenerator,
    print_diff,
@@ -313,18 +313,20 @@ if __name__ == "__main__":
    # Generate basename of original file, used in locating OFFSET lines
    basename = os.path.basename(os.path.splitext(original)[0])

+    parser = DecompParser()
    for srcfilename in walk_source_dir(source):
+        parser.reset()
        with open(srcfilename, "r", encoding="utf-8") as srcfile:
-            blocks = find_code_blocks(srcfile)
+            parser.read_lines(srcfile)

-        for block in blocks:
-            if block.is_stub:
+        for fun in parser.functions:
+            if fun.is_stub:
                continue

-            if block.module != basename:
+            if fun.module != basename:
                continue

-            addr = block.offset
+            addr = fun.offset

            # Verbose flag handling
            if verbose:
                if addr == verbose:
@@ -332,13 +334,13 @@ if __name__ == "__main__":
                else:
                    continue

-            if block.is_template:
-                recinfo = syminfo.get_recompiled_address_from_name(block.signature)
+            if fun.is_template:
+                recinfo = syminfo.get_recompiled_address_from_name(fun.name)
                if not recinfo:
                    continue
            else:
                recinfo = syminfo.get_recompiled_address(
-                    srcfilename, block.start_line
+                    srcfilename, fun.line_number
                )
                if not recinfo:
                    continue
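For context, the lookup this hunk performs, distilled into a standalone sketch; syminfo and its two getters come from the surrounding script, and the wrapper function name is made up:

def find_recompiled_address(syminfo, srcfilename: str, fun):
    """Template functions are looked up by name; everything else by the
    source file and the line number where the decomp marker sits."""
    if fun.is_template:
        return syminfo.get_recompiled_address_from_name(fun.name)

    return syminfo.get_recompiled_address(srcfilename, fun.line_number)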