Merge from parser2 branch

2025-12-16 02:43:40 +00:00 · 2023-12-01 15:10:32 -05:00
parent 47a6ea2de7
commit 75802101ac
20 changed files with 923 additions and 352 deletions
--- a/tools/isledecomp/tests/samples/basic_class.cpp
+++ b/tools/isledecomp/tests/samples/basic_class.cpp
@@ -3,6 +3,7 @@

 // A very simple class

+// VTABLE: TEST 0x1001002
 class TestClass {
 public:
  TestClass();
@@ -10,14 +11,14 @@ public:

  virtual MxResult Tickle() override; // vtable+08

-  // OFFSET: TEST 0x12345678
+  // FUNCTION: TEST 0x12345678
  inline const char* ClassName() const // vtable+0c
  {
    // 0xabcd1234
    return "TestClass";
  }

-  // OFFSET: TEST 0xdeadbeef
+  // FUNCTION: TEST 0xdeadbeef
  inline MxBool IsA(const char* name) const override // vtable+10
  {
    return !strcmp(name, TestClass::ClassName());
--- a/tools/isledecomp/tests/samples/basic_file.cpp
+++ b/tools/isledecomp/tests/samples/basic_file.cpp
@@ -3,19 +3,19 @@

 // A very simple well-formed code file

-// OFFSET: TEST 0x1234
+// FUNCTION: TEST 0x1234
 void function01()
 {
  // TODO
 }

-// OFFSET: TEST 0x2345
+// FUNCTION: TEST 0x2345
 void function02()
 {
  // TODO
 }

-// OFFSET: TEST 0x3456
+// FUNCTION: TEST 0x3456
 void function03()
 {
  // TODO
--- a/tools/isledecomp/tests/samples/global_variables.cpp
+++ b/tools/isledecomp/tests/samples/global_variables.cpp
@@ -0,0 +1,14 @@
+// Sample for python unit tests
+// Not part of the decomp
+
+// Global variables inside and outside of functions
+
+// GLOBAL: TEST 0x1000
+const char *g_message = "test";
+
+// FUNCTION: TEST 0x1234
+void function01()
+{
+  // GLOBAL: TEST 0x5555
+  static int g_hello = 123;
+}
--- a/tools/isledecomp/tests/samples/inline.cpp
+++ b/tools/isledecomp/tests/samples/inline.cpp
@@ -1,8 +1,8 @@
 // Sample for python unit tests
 // Not part of the decomp

-// OFFSET: TEST 0x10000001
+// FUNCTION: TEST 0x10000001
 inline const char* OneLineWithComment() const { return "MxDSObject"; }; // hi there

-// OFFSET: TEST 0x10000002
+// FUNCTION: TEST 0x10000002
 inline const char* OneLine() const { return "MxDSObject"; };
--- a/tools/isledecomp/tests/samples/missing_offset.cpp
+++ b/tools/isledecomp/tests/samples/missing_offset.cpp
@@ -9,7 +9,7 @@ int no_offset_comment()
  return -1;
 }

-// OFFSET: TEST 0xdeadbeef
+// FUNCTION: TEST 0xdeadbeef
 void regular_ole_function()
 {
  printf("hi there");
--- a/tools/isledecomp/tests/samples/multiple_offsets.cpp
+++ b/tools/isledecomp/tests/samples/multiple_offsets.cpp
@@ -3,22 +3,22 @@

 // Handling multiple offset markers

-// OFFSET: TEST 0x1234
-// OFFSET: HELLO 0x5555
+// FUNCTION: TEST 0x1234
+// FUNCTION: HELLO 0x5555
 void different_modules()
 {
  // TODO
 }

-// OFFSET: TEST 0x2345
-// OFFSET: TEST 0x1234
+// FUNCTION: TEST 0x2345
+// FUNCTION: TEST 0x1234
 void same_module()
 {
  // TODO
 }

-// OFFSET: TEST 0x2002
-// OFFSET: test 0x1001
+// FUNCTION: TEST 0x2002
+// FUNCTION: test 0x1001
 void same_case_insensitive()
 {
  // TODO
--- a/tools/isledecomp/tests/samples/oneline_function.cpp
+++ b/tools/isledecomp/tests/samples/oneline_function.cpp
@@ -1,10 +1,10 @@
 // Sample for python unit tests
 // Not part of the decomp

-// OFFSET: TEST 0x1234
+// FUNCTION: TEST 0x1234
 void short_function() { static char* msg = "oneliner"; }

-// OFFSET: TEST 0x5555
+// FUNCTION: TEST 0x5555
 void function_after_one_liner()
 {
  // This function comes after the previous that is on a single line.
--- a/tools/isledecomp/tests/samples/out_of_order.cpp
+++ b/tools/isledecomp/tests/samples/out_of_order.cpp
@@ -1,19 +1,19 @@
 // Sample for python unit tests
 // Not part of the decomp

-// OFFSET: TEST 0x1001
+// FUNCTION: TEST 0x1001
 void function_order01()
 {
    // TODO
 }

-// OFFSET: TEST 0x1003
+// FUNCTION: TEST 0x1003
 void function_order03()
 {
    // TODO
 }

-// OFFSET: TEST 0x1002
+// FUNCTION: TEST 0x1002
 void function_order02()
 {
    // TODO
--- a/tools/isledecomp/tests/samples/poorly_formatted.cpp
+++ b/tools/isledecomp/tests/samples/poorly_formatted.cpp
@@ -4,18 +4,18 @@
 // While it's reasonable to expect a well-formed file (and clang-format
 // will make sure we get one), this will put the parser through its paces.

-// OFFSET: TEST 0x1234
+// FUNCTION: TEST 0x1234
 void curly_with_spaces()
  {
  static char* msg = "hello";
  }

-// OFFSET: TEST 0x5555
+// FUNCTION: TEST 0x5555
 void weird_closing_curly()
 {
  int x = 123; }

-// OFFSET: HELLO 0x5656
+// FUNCTION: HELLO 0x5656
 void bad_indenting() {
  if (0)
 {
--- a/tools/isledecomp/tests/test_parser.py
+++ b/tools/isledecomp/tests/test_parser.py
@@ -1,127 +1,170 @@
-import os
-from typing import List, TextIO
-from isledecomp.parser import find_code_blocks
-from isledecomp.parser.util import CodeBlock
-
-SAMPLE_DIR = os.path.join(os.path.dirname(__file__), "samples")
+import pytest
+from isledecomp.parser.parser import (
+    ReaderState,
+    DecompParser,
+)
+from isledecomp.parser.util import DecompMarker
+from isledecomp.parser.error import ParserError


-def sample_file(filename: str) -> TextIO:
-    """Wrapper for opening the samples from the directory that does not
-    depend on the cwd where we run the test"""
-    full_path = os.path.join(SAMPLE_DIR, filename)
-    return open(full_path, "r", encoding="utf-8")
+@pytest.fixture
+def parser():
+    return DecompParser()


-def code_blocks_are_sorted(blocks: List[CodeBlock]) -> bool:
-    """Helper to make this more idiomatic"""
-    just_offsets = [block.offset for block in blocks]
-    return just_offsets == sorted(just_offsets)
+@pytest.mark.skip(reason="todo")
+def test_missing_sig(parser):
+    """Bad syntax: function signature is missing"""
+    parser.read_lines(["// FUNCTION: TEST 0x1234", "{"])
+    assert parser.state == ReaderState.IN_FUNC
+    assert len(parser.alerts) == 1
+    parser.read_line("}")
+    assert len(parser.functions) == 1
+    assert parser.functions[0] != "{"


-# Tests are below #
+def test_not_exact_syntax(parser):
+    """Alert to inexact syntax right here in the parser instead of kicking it downstream.
+    Doing this means we don't have to save the actual text."""
+    parser.read_line("// function: test 1234")
+    assert len(parser.alerts) == 1
+    assert parser.alerts[0].code == ParserError.BAD_DECOMP_MARKER


-def test_sanity():
-    """Read a very basic file"""
-    with sample_file("basic_file.cpp") as f:
-        blocks = find_code_blocks(f)
+def test_invalid_marker(parser):
+    """We matched a decomp marker, but it's not one we care about"""
+    parser.read_line("// BANANA: TEST 0x1234")
+    assert parser.state == ReaderState.SEARCH

-    assert len(blocks) == 3
-    assert code_blocks_are_sorted(blocks) is True
-    # n.b. The parser returns line numbers as 1-based
-    # Function starts when we see the opening curly brace
-    assert blocks[0].start_line == 8
-    assert blocks[0].end_line == 10
+    assert len(parser.alerts) == 1
+    assert parser.alerts[0].code == ParserError.BOGUS_MARKER


-def test_oneline():
-    """(Assuming clang-format permits this) This sample has a function
-    on a single line. This will test the end-of-function detection"""
-    with sample_file("oneline_function.cpp") as f:
-        blocks = find_code_blocks(f)
-
-    assert len(blocks) == 2
-    assert blocks[0].start_line == 5
-    assert blocks[0].end_line == 5
+def test_unexpected_marker(parser):
+    parser.read_lines(
+        [
+            "// FUNCTION: TEST 0x1234",
+            "// GLOBAL: TEST 0x5000",
+        ]
+    )
+    assert parser.state == ReaderState.SEARCH
+    assert len(parser.alerts) == 1
+    assert parser.alerts[0].code == ParserError.INCOMPATIBLE_MARKER


-def test_missing_offset():
-    """What if the function doesn't have an offset comment?"""
-    with sample_file("missing_offset.cpp") as f:
-        blocks = find_code_blocks(f)
-
-    # TODO: For now, the function without the offset will just be ignored.
-    # Would be the same outcome if the comment was present but mangled and
-    # we failed to match it. We should detect these cases in the future.
-    assert len(blocks) == 1
+def test_variable(parser):
+    parser.read_lines(
+        [
+            "// GLOBAL: HELLO 0x1234",
+            "int g_value = 5;",
+        ]
+    )
+    assert len(parser.variables) == 1


-def test_jumbled_case():
-    """The parser just reports what it sees. It is the responsibility of
-    the downstream tools to do something about a jumbled file.
-    Just verify that we are reading it correctly."""
-    with sample_file("out_of_order.cpp") as f:
-        blocks = find_code_blocks(f)
-
-    assert len(blocks) == 3
-    assert code_blocks_are_sorted(blocks) is False
+def test_synthetic_plus_marker(parser):
+    """Should fail with error and not log the synthetic"""
+    parser.read_lines(
+        [
+            "// SYNTHETIC: HEY 0x555",
+            "// FUNCTION: HOWDY 0x1234",
+        ]
+    )
+    assert len(parser.functions) == 0
+    assert len(parser.alerts) == 1
+    assert parser.alerts[0].code == ParserError.INCOMPATIBLE_MARKER


-def test_bad_file():
-    with sample_file("poorly_formatted.cpp") as f:
-        blocks = find_code_blocks(f)
+def test_different_markers_different_module(parser):
+    """Does it make any sense for a function to be a stub in one module,
+    but not in another? I don't know. But it's no problem for us."""
+    parser.read_lines(
+        [
+            "// FUNCTION: HOWDY 0x1234",
+            "// STUB: SUP 0x5555",
+            "void interesting_function() {",
+            "}",
+        ]
+    )

-    assert len(blocks) == 3
+    assert len(parser.alerts) == 0
+    assert len(parser.functions) == 2


-def test_indented():
-    """Offsets for functions inside of a class will probably be indented."""
-    with sample_file("basic_class.cpp") as f:
-        blocks = find_code_blocks(f)
+def test_different_markers_same_module(parser):
+    """Now, if something is a regular function but then a stub,
+    what do we say about that?"""
+    parser.read_lines(
+        [
+            "// FUNCTION: HOWDY 0x1234",
+            "// STUB: HOWDY 0x5555",
+            "void interesting_function() {",
+            "}",
+        ]
+    )

-    # TODO: We don't properly detect the end of these functions
-    # because the closing brace is indented. However... knowing where each
-    # function ends is less important (for now) than capturing
-    # all the functions that are there.
+    # Use first marker declaration, don't replace
+    assert len(parser.functions) == 1
+    assert parser.functions[0].is_stub is False

-    assert len(blocks) == 2
-    assert blocks[0].offset == int("0x12345678", 16)
-    assert blocks[0].start_line == 15
-    # assert blocks[0].end_line == 18
-
-    assert blocks[1].offset == int("0xdeadbeef", 16)
-    assert blocks[1].start_line == 22
-    # assert blocks[1].end_line == 24
+    # Should alert to this
+    assert len(parser.alerts) == 1
+    assert parser.alerts[0].code == ParserError.DUPLICATE_MODULE


-def test_inline():
-    with sample_file("inline.cpp") as f:
-        blocks = find_code_blocks(f)
+def test_unexpected_synthetic(parser):
+    """FUNCTION then SYNTHETIC should fail to report either one"""
+    parser.read_lines(
+        [
+            "// FUNCTION: HOWDY 0x1234",
+            "// SYNTHETIC: HOWDY 0x5555",
+            "void interesting_function() {",
+            "}",
+        ]
+    )

-    assert len(blocks) == 2
-    for block in blocks:
-        assert block.start_line is not None
-        assert block.start_line == block.end_line
+    assert parser.state == ReaderState.SEARCH
+    assert len(parser.functions) == 0
+    assert len(parser.alerts) == 1
+    assert parser.alerts[0].code == ParserError.INCOMPATIBLE_MARKER


-def test_multiple_offsets():
-    """If multiple offset marks appear before for a code block, take them
-    all but ensure module name (case-insensitive) is distinct.
-    Use first module occurrence in case of duplicates."""
-    with sample_file("multiple_offsets.cpp") as f:
-        blocks = find_code_blocks(f)
+@pytest.mark.skip(reason="not implemented yet")
+def test_duplicate_offset(parser):
+    """Repeating the same module/offset in the same file is probably a typo"""
+    parser.read_lines(
+        [
+            "// GLOBAL: HELLO 0x1234",
+            "int x = 1;",
+            "// GLOBAL: HELLO 0x1234",
+            "int y = 2;",
+        ]
+    )

-    assert len(blocks) == 4
-    assert blocks[0].module == "TEST"
-    assert blocks[0].start_line == 9
+    assert len(parser.alerts) == 1
+    assert parser.alerts[0].code == ParserError.DUPLICATE_OFFSET

-    assert blocks[1].module == "HELLO"
-    assert blocks[1].start_line == 9

-    # Duplicate modules are ignored
-    assert blocks[2].start_line == 16
-    assert blocks[2].offset == 0x2345
+def test_multiple_variables(parser):
+    """Theoretically the same global variable can appear in multiple modules"""
+    parser.read_lines(
+        [
+            "// GLOBAL: HELLO 0x1234",
+            "// GLOBAL: WUZZUP 0x555",
+            "const char *g_greeting;",
+        ]
+    )
+    assert len(parser.alerts) == 0
+    assert len(parser.variables) == 2

-    assert blocks[3].module == "TEST"
-    assert blocks[3].offset == 0x2002
+
+def test_multiple_vtables(parser):
+    parser.read_lines(
+        [
+            "// VTABLE: HELLO 0x1234",
+            "// VTABLE: TEST 0x5432",
+            "class MxString : public MxCore {",
+        ]
+    )
+    assert len(parser.alerts) == 0
+    assert len(parser.vtables) == 2
--- a/tools/isledecomp/tests/test_parser_samples.py
+++ b/tools/isledecomp/tests/test_parser_samples.py
@@ -0,0 +1,141 @@
+import os
+import pytest
+from typing import List, TextIO
+from isledecomp.parser import DecompParser
+from isledecomp.parser.node import ParserSymbol
+
+SAMPLE_DIR = os.path.join(os.path.dirname(__file__), "samples")
+
+
+def sample_file(filename: str) -> TextIO:
+    """Wrapper for opening the samples from the directory that does not
+    depend on the cwd where we run the test"""
+    full_path = os.path.join(SAMPLE_DIR, filename)
+    return open(full_path, "r", encoding="utf-8")
+
+
+def code_blocks_are_sorted(blocks: List[ParserSymbol]) -> bool:
+    """Return True if the blocks' offsets are already in ascending order"""
+    just_offsets = [block.offset for block in blocks]
+    return just_offsets == sorted(just_offsets)
+
+
+@pytest.fixture
+def parser():
+    return DecompParser()
+
+
+# Tests are below #
+
+
+def test_sanity(parser):
+    """Read a very basic file"""
+    with sample_file("basic_file.cpp") as f:
+        parser.read_lines(f)
+
+    assert len(parser.functions) == 3
+    assert code_blocks_are_sorted(parser.functions) is True
+    # n.b. The parser returns line numbers as 1-based
+    # Function starts when we see the opening curly brace
+    assert parser.functions[0].line_number == 8
+    assert parser.functions[0].end_line == 10
+
+
+def test_oneline(parser):
+    """(Assuming clang-format permits this) This sample has a function
+    on a single line. This will test the end-of-function detection"""
+    with sample_file("oneline_function.cpp") as f:
+        parser.read_lines(f)
+
+    assert len(parser.functions) == 2
+    assert parser.functions[0].line_number == 5
+    assert parser.functions[0].end_line == 5
+
+
+def test_missing_offset(parser):
+    """What if the function doesn't have an offset comment?"""
+    with sample_file("missing_offset.cpp") as f:
+        parser.read_lines(f)
+
+    # TODO: For now, the function without the offset will just be ignored.
+    # Would be the same outcome if the comment was present but mangled and
+    # we failed to match it. We should detect these cases in the future.
+    assert len(parser.functions) == 1
+
+
+def test_jumbled_case(parser):
+    """The parser just reports what it sees. It is the responsibility of
+    the downstream tools to do something about a jumbled file.
+    Just verify that we are reading it correctly."""
+    with sample_file("out_of_order.cpp") as f:
+        parser.read_lines(f)
+
+    assert len(parser.functions) == 3
+    assert code_blocks_are_sorted(parser.functions) is False
+
+
+def test_bad_file(parser):
+    with sample_file("poorly_formatted.cpp") as f:
+        parser.read_lines(f)
+
+    assert len(parser.functions) == 3
+
+
+def test_indented(parser):
+    """Offsets for functions inside of a class will probably be indented."""
+    with sample_file("basic_class.cpp") as f:
+        parser.read_lines(f)
+
+    # TODO: We don't properly detect the end of these functions
+    # because the closing brace is indented. However... knowing where each
+    # function ends is less important (for now) than capturing
+    # all the functions that are there.
+
+    assert len(parser.functions) == 2
+    assert parser.functions[0].offset == int("0x12345678", 16)
+    assert parser.functions[0].line_number == 16
+    # assert parser.functions[0].end_line == 19
+
+    assert parser.functions[1].offset == int("0xdeadbeef", 16)
+    assert parser.functions[1].line_number == 23
+    # assert parser.functions[1].end_line == 25
+
+
+def test_inline(parser):
+    with sample_file("inline.cpp") as f:
+        parser.read_lines(f)
+
+    assert len(parser.functions) == 2
+    for fun in parser.functions:
+        assert fun.line_number is not None
+        assert fun.line_number == fun.end_line
+
+
+def test_multiple_offsets(parser):
+    """If multiple offset marks appear before a code block, take them
+    all but ensure module name (case-insensitive) is distinct.
+    Use first module occurrence in case of duplicates."""
+    with sample_file("multiple_offsets.cpp") as f:
+        parser.read_lines(f)
+
+    assert len(parser.functions) == 4
+    assert parser.functions[0].module == "TEST"
+    assert parser.functions[0].line_number == 9
+
+    assert parser.functions[1].module == "HELLO"
+    assert parser.functions[1].line_number == 9
+
+    # Duplicate modules are ignored
+    assert parser.functions[2].line_number == 16
+    assert parser.functions[2].offset == 0x2345
+
+    assert parser.functions[3].module == "TEST"
+    assert parser.functions[3].offset == 0x2002
+
+
+def test_variables(parser):
+    with sample_file("global_variables.cpp") as f:
+        parser.read_lines(f)
+
+    assert len(parser.functions) == 1
+    assert len(parser.variables) == 2
--- a/tools/isledecomp/tests/test_parser_statechange.py
+++ b/tools/isledecomp/tests/test_parser_statechange.py
@@ -0,0 +1,150 @@
+import pytest
+from isledecomp.parser.parser import (
+    ReaderState as _rs,
+    DecompParser,
+)
+from isledecomp.parser.util import DecompMarker
+from isledecomp.parser.error import ParserError as _pe
+
+# fmt: off
+state_change_marker_cases = [
+    (_rs.SEARCH,          "FUNCTION",   _rs.WANT_SIG,        None),
+    (_rs.SEARCH,          "GLOBAL",     _rs.IN_GLOBAL,       None),
+    (_rs.SEARCH,          "STUB",       _rs.WANT_SIG,        None),
+    (_rs.SEARCH,          "SYNTHETIC",  _rs.IN_TEMPLATE,     None),
+    (_rs.SEARCH,          "TEMPLATE",   _rs.IN_TEMPLATE,     None),
+    (_rs.SEARCH,          "VTABLE",     _rs.IN_VTABLE,       None),
+
+    (_rs.WANT_SIG,        "FUNCTION",   _rs.WANT_SIG,        None),
+    (_rs.WANT_SIG,        "GLOBAL",     _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
+    (_rs.WANT_SIG,        "STUB",       _rs.WANT_SIG,        None),
+    (_rs.WANT_SIG,        "SYNTHETIC",  _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
+    (_rs.WANT_SIG,        "TEMPLATE",   _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
+    (_rs.WANT_SIG,        "VTABLE",     _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
+
+    (_rs.IN_FUNC,         "FUNCTION",   _rs.WANT_SIG,        _pe.MISSED_END_OF_FUNCTION),
+    (_rs.IN_FUNC,         "GLOBAL",     _rs.IN_FUNC_GLOBAL,  None),
+    (_rs.IN_FUNC,         "STUB",       _rs.WANT_SIG,        _pe.MISSED_END_OF_FUNCTION),
+    (_rs.IN_FUNC,         "SYNTHETIC",  _rs.IN_TEMPLATE,     _pe.MISSED_END_OF_FUNCTION),
+    (_rs.IN_FUNC,         "TEMPLATE",   _rs.IN_TEMPLATE,     _pe.MISSED_END_OF_FUNCTION),
+    (_rs.IN_FUNC,         "VTABLE",     _rs.IN_VTABLE,       _pe.MISSED_END_OF_FUNCTION),
+
+    (_rs.IN_TEMPLATE,     "FUNCTION",   _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
+    (_rs.IN_TEMPLATE,     "GLOBAL",     _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
+    (_rs.IN_TEMPLATE,     "STUB",       _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
+    (_rs.IN_TEMPLATE,     "SYNTHETIC",  _rs.IN_TEMPLATE,     None),
+    (_rs.IN_TEMPLATE,     "TEMPLATE",   _rs.IN_TEMPLATE,     None),
+    (_rs.IN_TEMPLATE,     "VTABLE",     _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
+    
+    (_rs.WANT_CURLY,      "FUNCTION",   _rs.SEARCH,          _pe.UNEXPECTED_MARKER),
+    (_rs.WANT_CURLY,      "GLOBAL",     _rs.SEARCH,          _pe.UNEXPECTED_MARKER),
+    (_rs.WANT_CURLY,      "STUB",       _rs.SEARCH,          _pe.UNEXPECTED_MARKER),
+    (_rs.WANT_CURLY,      "SYNTHETIC",  _rs.SEARCH,          _pe.UNEXPECTED_MARKER),
+    (_rs.WANT_CURLY,      "TEMPLATE",   _rs.SEARCH,          _pe.UNEXPECTED_MARKER),
+    (_rs.WANT_CURLY,      "VTABLE",     _rs.SEARCH,          _pe.UNEXPECTED_MARKER),
+    
+    (_rs.IN_GLOBAL,       "FUNCTION",   _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
+    (_rs.IN_GLOBAL,       "GLOBAL",     _rs.IN_GLOBAL,       None),
+    (_rs.IN_GLOBAL,       "STUB",       _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
+    (_rs.IN_GLOBAL,       "SYNTHETIC",  _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
+    (_rs.IN_GLOBAL,       "TEMPLATE",   _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
+    (_rs.IN_GLOBAL,       "VTABLE",     _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
+    
+    (_rs.IN_FUNC_GLOBAL,  "FUNCTION",   _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
+    (_rs.IN_FUNC_GLOBAL,  "GLOBAL",     _rs.IN_FUNC_GLOBAL,  None),
+    (_rs.IN_FUNC_GLOBAL,  "STUB",       _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
+    (_rs.IN_FUNC_GLOBAL,  "SYNTHETIC",  _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
+    (_rs.IN_FUNC_GLOBAL,  "TEMPLATE",   _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
+    (_rs.IN_FUNC_GLOBAL,  "VTABLE",     _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
+    
+    (_rs.IN_VTABLE,       "FUNCTION",   _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
+    (_rs.IN_VTABLE,       "GLOBAL",     _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
+    (_rs.IN_VTABLE,       "STUB",       _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
+    (_rs.IN_VTABLE,       "SYNTHETIC",  _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
+    (_rs.IN_VTABLE,       "TEMPLATE",   _rs.SEARCH,          _pe.INCOMPATIBLE_MARKER),
+    (_rs.IN_VTABLE,       "VTABLE",     _rs.IN_VTABLE,       None),
+]
+# fmt: on
+
+
+@pytest.mark.parametrize(
+    "state, marker_type, new_state, expected_error", state_change_marker_cases
+)
+def test_state_change_by_marker(
+    state: _rs, marker_type: str, new_state: _rs, expected_error: None | _pe
+):
+    p = DecompParser()
+    p.state = state
+    p._handle_marker(DecompMarker(marker_type, "TEST", 0x1234))
+    assert p.state == new_state
+
+    if expected_error is not None:
+        assert len(p.alerts) > 0
+        assert p.alerts[0].code == expected_error
+
+
+# Reading any of these lines should have no effect in ReaderState.SEARCH
+search_lines_no_effect = [
+    "",
+    "\t",
+    "    ",
+    "int x = 0;",
+    "// Comment",
+    "/*",
+    "*/",
+    "/* Block comment */",
+    "{",
+    "}",
+]
+
+
+@pytest.mark.parametrize("line", search_lines_no_effect)
+def test_state_search_line(line: str):
+    p = DecompParser()
+    p.read_line(line)
+    assert p.state == _rs.SEARCH
+    assert len(p.alerts) == 0
+
+
+global_lines = [
+    ("// A comment", _rs.IN_GLOBAL),
+    ("", _rs.IN_GLOBAL),
+    ("\t", _rs.IN_GLOBAL),
+    ("    ", _rs.IN_GLOBAL),
+    # TODO: no check for "likely" variable declaration so these all count
+    ("void function()", _rs.SEARCH),
+    ("int x = 123;", _rs.SEARCH),
+    ("just some text", _rs.SEARCH),
+]
+
+
+@pytest.mark.parametrize("line, new_state", global_lines)
+def test_state_global_line(line: str, new_state: _rs):
+    p = DecompParser()
+    p.read_line("// GLOBAL: TEST 0x1234")
+    assert p.state == _rs.IN_GLOBAL
+    p.read_line(line)
+    assert p.state == new_state
+
+
+# Mostly the same as above, except the fallback state is IN_FUNC, not SEARCH
+in_func_global_lines = [
+    ("// A comment", _rs.IN_FUNC_GLOBAL),
+    ("", _rs.IN_FUNC_GLOBAL),
+    ("\t", _rs.IN_FUNC_GLOBAL),
+    ("    ", _rs.IN_FUNC_GLOBAL),
+    # TODO: no check for "likely" variable declaration so these all count
+    ("void function()", _rs.IN_FUNC),
+    ("int x = 123;", _rs.IN_FUNC),
+    ("just some text", _rs.IN_FUNC),
+]
+
+
+@pytest.mark.parametrize("line, new_state", in_func_global_lines)
+def test_state_in_func_global_line(line: str, new_state: _rs):
+    p = DecompParser()
+    p.state = _rs.IN_FUNC
+    p.read_line("// GLOBAL: TEST 0x1234")
+    assert p.state == _rs.IN_FUNC_GLOBAL
+    p.read_line(line)
+    assert p.state == new_state
--- a/tools/isledecomp/tests/test_parser_util.py
+++ b/tools/isledecomp/tests/test_parser_util.py
@@ -1,11 +1,12 @@
 from collections import namedtuple
 from typing import List
 import pytest
+from isledecomp.parser.parser import MarkerDict
 from isledecomp.parser.util import (
+    DecompMarker,
    is_blank_or_comment,
-    match_offset_comment,
-    is_exact_offset_comment,
-    distinct_by_module,
+    match_marker,
+    is_marker_exact,
 )


@@ -28,76 +29,72 @@ def test_is_blank_or_comment(line: str, expected: bool):
    assert is_blank_or_comment(line) is expected


-offset_comment_samples = [
+marker_samples = [
    # (can_parse: bool, exact_match: bool, line: str)
-    # Should match both expected modules with optional STUB marker
-    (True, True, "// OFFSET: LEGO1 0xdeadbeef"),
-    (True, True, "// OFFSET: LEGO1 0xdeadbeef STUB"),
-    (True, True, "// OFFSET: ISLE 0x12345678"),
-    (True, True, "// OFFSET: ISLE 0x12345678 STUB"),
+    (True, True, "// FUNCTION: LEGO1 0xdeadbeef"),
+    (True, True, "// FUNCTION: ISLE 0x12345678"),
    # No trailing spaces allowed
-    (True, False, "// OFFSET: LEGO1 0xdeadbeef  "),
-    (True, False, "// OFFSET: LEGO1 0xdeadbeef STUB "),
+    (True, False, "// FUNCTION: LEGO1 0xdeadbeef  "),
    # Must have exactly one space between elements
-    (True, False, "//OFFSET: ISLE 0xdeadbeef"),
-    (True, False, "// OFFSET:ISLE 0xdeadbeef"),
-    (True, False, "//  OFFSET: ISLE 0xdeadbeef"),
-    (True, False, "// OFFSET:  ISLE 0xdeadbeef"),
-    (True, False, "// OFFSET: ISLE  0xdeadbeef"),
-    (True, False, "// OFFSET: ISLE 0xdeadbeef  STUB"),
+    (True, False, "//FUNCTION: ISLE 0xdeadbeef"),
+    (True, False, "// FUNCTION:ISLE 0xdeadbeef"),
+    (True, False, "//  FUNCTION: ISLE 0xdeadbeef"),
+    (True, False, "// FUNCTION:  ISLE 0xdeadbeef"),
+    (True, False, "// FUNCTION: ISLE  0xdeadbeef"),
    # Must have 0x prefix for hex number
-    (True, False, "// OFFSET: ISLE deadbeef"),
+    (True, False, "// FUNCTION: ISLE deadbeef"),
    # Offset, module name, and STUB must be uppercase
-    (True, False, "// offset: ISLE 0xdeadbeef"),
-    (True, False, "// offset: isle 0xdeadbeef"),
-    (True, False, "// OFFSET: LEGO1 0xdeadbeef stub"),
+    (True, False, "// function: ISLE 0xdeadbeef"),
+    (True, False, "// function: isle 0xdeadbeef"),
    # Hex string must be lowercase
-    (True, False, "// OFFSET: ISLE 0xDEADBEEF"),
+    (True, False, "// FUNCTION: ISLE 0xDEADBEEF"),
    # TODO: How flexible should we be with matching the module name?
-    (True, True, "// OFFSET: OMNI 0x12345678"),
-    (True, True, "// OFFSET: LEG01 0x12345678"),
-    (True, False, "// OFFSET: hello 0x12345678"),
+    (True, True, "// FUNCTION: OMNI 0x12345678"),
+    (True, True, "// FUNCTION: LEG01 0x12345678"),
+    (True, False, "// FUNCTION: hello 0x12345678"),
    # Not close enough to match
-    (False, False, "// OFFSET: ISLE0x12345678"),
-    (False, False, "// OFFSET: 0x12345678"),
+    (False, False, "// FUNCTION: ISLE0x12345678"),
+    (False, False, "// FUNCTION: 0x12345678"),
    (False, False, "// LEGO1: 0x12345678"),
    # Hex string shorter than 8 characters
-    (True, True, "// OFFSET: LEGO1 0x1234"),
+    (True, True, "// FUNCTION: LEGO1 0x1234"),
    # TODO: These match but shouldn't.
-    # (False, False, '// OFFSET: LEGO1 0'),
-    # (False, False, '// OFFSET: LEGO1 0x'),
+    # (False, False, '// FUNCTION: LEGO1 0'),
+    # (False, False, '// FUNCTION: LEGO1 0x'),
 ]


-@pytest.mark.parametrize("match, _, line", offset_comment_samples)
-def test_offset_match(line: str, match: bool, _):
-    did_match = match_offset_comment(line) is not None
+@pytest.mark.parametrize("match, _, line", marker_samples)
+def test_marker_match(line: str, match: bool, _):
+    did_match = match_marker(line) is not None
    assert did_match is match


-@pytest.mark.parametrize("_, exact, line", offset_comment_samples)
-def test_exact_offset_comment(line: str, exact: bool, _):
-    assert is_exact_offset_comment(line) is exact
+@pytest.mark.parametrize("_, exact, line", marker_samples)
+def test_marker_exact(line: str, exact: bool, _):
+    assert is_marker_exact(line) is exact


-# Helper for the next test: cut down version of OffsetMatch
-MiniOfs = namedtuple("MiniOfs", ["module", "value"])
-
-distinct_by_module_samples = [
-    # empty set
-    ([], []),
-    # same module name
-    ([MiniOfs("TEST", 123), MiniOfs("TEST", 555)], [MiniOfs("TEST", 123)]),
-    # same module name, case-insensitive
-    ([MiniOfs("test", 123), MiniOfs("TEST", 555)], [MiniOfs("test", 123)]),
-    # duplicates, non-consecutive
-    (
-        [MiniOfs("test", 123), MiniOfs("abc", 111), MiniOfs("TEST", 555)],
-        [MiniOfs("test", 123), MiniOfs("abc", 111)],
-    ),
-]
+def test_marker_dict_simple():
+    d = MarkerDict()
+    d.insert(DecompMarker("FUNCTION", "TEST", 0x1234))
+    markers = list(d.iter())
+    assert len(markers) == 1


-@pytest.mark.parametrize("sample, expected", distinct_by_module_samples)
-def test_distinct_by_module(sample: List[MiniOfs], expected: List[MiniOfs]):
-    assert distinct_by_module(sample) == expected
+def test_marker_dict_ofs_replace():
+    d = MarkerDict()
+    d.insert(DecompMarker("FUNCTION", "TEST", 0x1234))
+    d.insert(DecompMarker("FUNCTION", "TEST", 0x555))
+    markers = list(d.iter())
+    assert len(markers) == 1
+    assert markers[0].offset == 0x1234
+
+
+def test_marker_dict_type_replace():
+    d = MarkerDict()
+    d.insert(DecompMarker("FUNCTION", "TEST", 0x1234))
+    d.insert(DecompMarker("STUB", "TEST", 0x1234))
+    markers = list(d.iter())
+    assert len(markers) == 1
+    assert markers[0].type == "FUNCTION"