(Proposal) Adjustments to "decomp" language (#308)

* Adjustments to "decomp" language * Fix a comment * Fix accidental clang-formatting * Fix order * Fix order * Remove junk * Fix OFFSET * Adjustments based on new suggestions * Annotate globals * Globals in ISLE * More globals * Merge from parser2 branch * Allow prepending space for exact marker match * To eliminate noise, require the 0x prefix on offset for marker match * fix test from previous * Count tab stops for indented functions to reduce MISSED_END_OF_FUNCTION noise * FUNCTION to SYNTHETIC where needed * Missed marker conversion on SetAtomId * pylint cleanup, remove unused code * Fix unexpected function end, add more unit tests * Be more strict about synthetic name syntax * Revert "Missed marker conversion on SetAtomId" This reverts commit d87d665127. * Revert "FUNCTION to SYNTHETIC where needed" This reverts commit 8c815418d2. * Implicit lookup by name for functions * Fix VTABLE SYNTHETIC and other decomp markers * Get vtable class name * Vtable marker should identify struct * No colon for SIZE comment * Update README.md * Update README.md * Update CONTRIBUTING.md * Update README.md * Update README.md * Update CONTRIBUTING.md * Update README.md * Update CONTRIBUTING.md * Fix destructor/annotation * Update README.md * Update README.md * Update README.md * Update README.md * Update README.md --------- Co-authored-by: disinvite <disinvite@users.noreply.github.com>
2025-12-10 08:03:13 +00:00 · 2023-12-06 07:10:45 -05:00
parent 4f5b70013f
commit 494a556f8e
407 changed files with 3505 additions and 2493 deletions
--- a/tools/isledecomp/tests/test_parser.py
+++ b/tools/isledecomp/tests/test_parser.py
@@ -1,127 +1,360 @@
-import os
-from typing import List, TextIO
-from isledecomp.parser import find_code_blocks
-from isledecomp.parser.util import CodeBlock
-
-SAMPLE_DIR = os.path.join(os.path.dirname(__file__), "samples")
+import pytest
+from isledecomp.parser.parser import (
+    ReaderState,
+    DecompParser,
+)
+from isledecomp.parser.error import ParserError


-def sample_file(filename: str) -> TextIO:
-    """Wrapper for opening the samples from the directory that does not
-    depend on the cwd where we run the test"""
-    full_path = os.path.join(SAMPLE_DIR, filename)
-    return open(full_path, "r", encoding="utf-8")
+@pytest.fixture(name="parser")
+def fixture_parser():
+    return DecompParser()


-def code_blocks_are_sorted(blocks: List[CodeBlock]) -> bool:
-    """Helper to make this more idiomatic"""
-    just_offsets = [block.offset for block in blocks]
-    return just_offsets == sorted(just_offsets)
+def test_missing_sig(parser):
+    """In the hopefully rare scenario that the function signature and marker
+    are swapped, we still have enough to match with reccmp"""
+    parser.read_lines(
+        [
+            "void my_function()",
+            "// FUNCTION: TEST 0x1234",
+            "{",
+            "}",
+        ]
+    )
+    assert parser.state == ReaderState.SEARCH
+    assert len(parser.functions) == 1
+    assert parser.functions[0].line_number == 3
+
+    assert len(parser.alerts) == 1
+    assert parser.alerts[0].code == ParserError.MISSED_START_OF_FUNCTION


-# Tests are below #
+def test_not_exact_syntax(parser):
+    """Alert to inexact syntax right here in the parser instead of kicking it downstream.
+    Doing this means we don't have to save the actual text."""
+    parser.read_line("// function: test 0x1234")
+    assert len(parser.alerts) == 1
+    assert parser.alerts[0].code == ParserError.BAD_DECOMP_MARKER


-def test_sanity():
-    """Read a very basic file"""
-    with sample_file("basic_file.cpp") as f:
-        blocks = find_code_blocks(f)
+def test_invalid_marker(parser):
+    """We matched a decomp marker, but it's not one we care about"""
+    parser.read_line("// BANANA: TEST 0x1234")
+    assert parser.state == ReaderState.SEARCH

-    assert len(blocks) == 3
-    assert code_blocks_are_sorted(blocks) is True
-    # n.b. The parser returns line numbers as 1-based
-    # Function starts when we see the opening curly brace
-    assert blocks[0].start_line == 8
-    assert blocks[0].end_line == 10
+    assert len(parser.alerts) == 1
+    assert parser.alerts[0].code == ParserError.BOGUS_MARKER


-def test_oneline():
-    """(Assuming clang-format permits this) This sample has a function
-    on a single line. This will test the end-of-function detection"""
-    with sample_file("oneline_function.cpp") as f:
-        blocks = find_code_blocks(f)
-
-    assert len(blocks) == 2
-    assert blocks[0].start_line == 5
-    assert blocks[0].end_line == 5
+def test_incompatible_marker(parser):
+    """The marker we just read cannot be handled in the current parser state"""
+    parser.read_lines(
+        [
+            "// FUNCTION: TEST 0x1234",
+            "// GLOBAL: TEST 0x5000",
+        ]
+    )
+    assert parser.state == ReaderState.SEARCH
+    assert len(parser.alerts) == 1
+    assert parser.alerts[0].code == ParserError.INCOMPATIBLE_MARKER


-def test_missing_offset():
-    """What if the function doesn't have an offset comment?"""
-    with sample_file("missing_offset.cpp") as f:
-        blocks = find_code_blocks(f)
-
-    # TODO: For now, the function without the offset will just be ignored.
-    # Would be the same outcome if the comment was present but mangled and
-    # we failed to match it. We should detect these cases in the future.
-    assert len(blocks) == 1
+def test_variable(parser):
+    """Should identify a global variable"""
+    parser.read_lines(
+        [
+            "// GLOBAL: HELLO 0x1234",
+            "int g_value = 5;",
+        ]
+    )
+    assert len(parser.variables) == 1


-def test_jumbled_case():
-    """The parser just reports what it sees. It is the responsibility of
-    the downstream tools to do something about a jumbled file.
-    Just verify that we are reading it correctly."""
-    with sample_file("out_of_order.cpp") as f:
-        blocks = find_code_blocks(f)
-
-    assert len(blocks) == 3
-    assert code_blocks_are_sorted(blocks) is False
+def test_synthetic_plus_marker(parser):
+    """Marker tracking preempts synthetic name detection.
+    Should fail with error and not log the synthetic"""
+    parser.read_lines(
+        [
+            "// SYNTHETIC: HEY 0x555",
+            "// FUNCTION: HOWDY 0x1234",
+        ]
+    )
+    assert len(parser.functions) == 0
+    assert len(parser.alerts) == 1
+    assert parser.alerts[0].code == ParserError.INCOMPATIBLE_MARKER


-def test_bad_file():
-    with sample_file("poorly_formatted.cpp") as f:
-        blocks = find_code_blocks(f)
+def test_different_markers_different_module(parser):
+    """Does it make any sense for a function to be a stub in one module,
+    but not in another? I don't know. But it's no problem for us."""
+    parser.read_lines(
+        [
+            "// FUNCTION: HOWDY 0x1234",
+            "// STUB: SUP 0x5555",
+            "void interesting_function() {",
+            "}",
+        ]
+    )

-    assert len(blocks) == 3
+    assert len(parser.alerts) == 0
+    assert len(parser.functions) == 2


-def test_indented():
-    """Offsets for functions inside of a class will probably be indented."""
-    with sample_file("basic_class.cpp") as f:
-        blocks = find_code_blocks(f)
+def test_different_markers_same_module(parser):
+    """Now, if something is a regular function but then a stub,
+    what do we say about that?"""
+    parser.read_lines(
+        [
+            "// FUNCTION: HOWDY 0x1234",
+            "// STUB: HOWDY 0x5555",
+            "void interesting_function() {",
+            "}",
+        ]
+    )

-    # TODO: We don't properly detect the end of these functions
-    # because the closing brace is indented. However... knowing where each
-    # function ends is less important (for now) than capturing
-    # all the functions that are there.
+    # Use first marker declaration, don't replace
+    assert len(parser.functions) == 1
+    assert parser.functions[0].is_stub is False

-    assert len(blocks) == 2
-    assert blocks[0].offset == int("0x12345678", 16)
-    assert blocks[0].start_line == 15
-    # assert blocks[0].end_line == 18
-
-    assert blocks[1].offset == int("0xdeadbeef", 16)
-    assert blocks[1].start_line == 22
-    # assert blocks[1].end_line == 24
+    # Should alert to this
+    assert len(parser.alerts) == 1
+    assert parser.alerts[0].code == ParserError.DUPLICATE_MODULE


-def test_inline():
-    with sample_file("inline.cpp") as f:
-        blocks = find_code_blocks(f)
+def test_unexpected_synthetic(parser):
+    """FUNCTION then SYNTHETIC should fail to report either one"""
+    parser.read_lines(
+        [
+            "// FUNCTION: HOWDY 0x1234",
+            "// SYNTHETIC: HOWDY 0x5555",
+            "void interesting_function() {",
+            "}",
+        ]
+    )

-    assert len(blocks) == 2
-    for block in blocks:
-        assert block.start_line is not None
-        assert block.start_line == block.end_line
+    assert parser.state == ReaderState.SEARCH
+    assert len(parser.functions) == 0
+    assert len(parser.alerts) == 1
+    assert parser.alerts[0].code == ParserError.INCOMPATIBLE_MARKER


-def test_multiple_offsets():
-    """If multiple offset marks appear before for a code block, take them
-    all but ensure module name (case-insensitive) is distinct.
-    Use first module occurrence in case of duplicates."""
-    with sample_file("multiple_offsets.cpp") as f:
-        blocks = find_code_blocks(f)
+@pytest.mark.skip(reason="not implemented yet")
+def test_duplicate_offset(parser):
+    """Repeating the same module/offset in the same file is probably a typo"""
+    parser.read_lines(
+        [
+            "// GLOBAL: HELLO 0x1234",
+            "int x = 1;",
+            "// GLOBAL: HELLO 0x1234",
+            "int y = 2;",
+        ]
+    )

-    assert len(blocks) == 4
-    assert blocks[0].module == "TEST"
-    assert blocks[0].start_line == 9
+    assert len(parser.alerts) == 1
+    assert parser.alerts[0].code == ParserError.DUPLICATE_OFFSET

-    assert blocks[1].module == "HELLO"
-    assert blocks[1].start_line == 9

-    # Duplicate modules are ignored
-    assert blocks[2].start_line == 16
-    assert blocks[2].offset == 0x2345
+def test_multiple_variables(parser):
+    """Theoretically the same global variable can appear in multiple modules"""
+    parser.read_lines(
+        [
+            "// GLOBAL: HELLO 0x1234",
+            "// GLOBAL: WUZZUP 0x555",
+            "const char *g_greeting;",
+        ]
+    )
+    assert len(parser.alerts) == 0
+    assert len(parser.variables) == 2

-    assert blocks[3].module == "TEST"
-    assert blocks[3].offset == 0x2002
+
+def test_multiple_variables_same_module(parser):
+    """Should not overwrite offset"""
+    parser.read_lines(
+        [
+            "// GLOBAL: HELLO 0x1234",
+            "// GLOBAL: HELLO 0x555",
+            "const char *g_greeting;",
+        ]
+    )
+    assert len(parser.alerts) == 1
+    assert parser.alerts[0].code == ParserError.DUPLICATE_MODULE
+    assert len(parser.variables) == 1
+    assert parser.variables[0].offset == 0x1234
+
+
+def test_multiple_vtables(parser):
+    parser.read_lines(
+        [
+            "// VTABLE: HELLO 0x1234",
+            "// VTABLE: TEST 0x5432",
+            "class MxString : public MxCore {",
+        ]
+    )
+    assert len(parser.alerts) == 0
+    assert len(parser.vtables) == 2
+    assert parser.vtables[0].class_name == "MxString"
+
+
+def test_multiple_vtables_same_module(parser):
+    """Should not overwrite offset"""
+    parser.read_lines(
+        [
+            "// VTABLE: HELLO 0x1234",
+            "// VTABLE: HELLO 0x5432",
+            "class MxString : public MxCore {",
+        ]
+    )
+    assert len(parser.alerts) == 1
+    assert parser.alerts[0].code == ParserError.DUPLICATE_MODULE
+    assert len(parser.vtables) == 1
+    assert parser.vtables[0].offset == 0x1234
+
+
+def test_synthetic(parser):
+    parser.read_lines(
+        [
+            "// SYNTHETIC: TEST 0x1234",
+            "// TestClass::TestMethod",
+        ]
+    )
+    assert len(parser.functions) == 1
+    assert parser.functions[0].lookup_by_name is True
+    assert parser.functions[0].name == "TestClass::TestMethod"
+
+
+def test_synthetic_same_module(parser):
+    parser.read_lines(
+        [
+            "// SYNTHETIC: TEST 0x1234",
+            "// SYNTHETIC: TEST 0x555",
+            "// TestClass::TestMethod",
+        ]
+    )
+    assert len(parser.alerts) == 1
+    assert parser.alerts[0].code == ParserError.DUPLICATE_MODULE
+    assert len(parser.functions) == 1
+    assert parser.functions[0].offset == 0x1234
+
+
+def test_synthetic_no_comment(parser):
+    """Synthetic marker followed by a code line (i.e. non-comment)"""
+    parser.read_lines(
+        [
+            "// SYNTHETIC: TEST 0x1234",
+            "int x = 123;",
+        ]
+    )
+    assert len(parser.functions) == 0
+    assert len(parser.alerts) == 1
+    assert parser.alerts[0].code == ParserError.BAD_SYNTHETIC
+    assert parser.state == ReaderState.SEARCH
+
+
+def test_single_line_function(parser):
+    parser.read_lines(
+        [
+            "// FUNCTION: TEST 0x1234",
+            "int hello() { return 1234; }",
+        ]
+    )
+    assert len(parser.functions) == 1
+    assert parser.functions[0].line_number == 2
+    assert parser.functions[0].end_line == 2
+
+
+def test_indented_function(parser):
+    """Track the number of whitespace characters when we begin the function
+    and check that against each closing curly brace we read.
+    Should not report a syntax warning if the function is indented"""
+    parser.read_lines(
+        [
+            "    // FUNCTION: TEST 0x1234",
+            "    void indented()",
+            "    {",
+            "        // TODO",
+            "    }",
+            "    // FUNCTION: NEXT 0x555",
+        ]
+    )
+    assert len(parser.alerts) == 0
+
+
+@pytest.mark.xfail(reason="todo")
+def test_indented_no_curly_hint(parser):
+    """Same as above, but opening curly brace is on the same line.
+    Without the hint of how many whitespace characters to check, can we
+    still identify the end of the function?"""
+    parser.read_lines(
+        [
+            "    // FUNCTION: TEST 0x1234",
+            "    void indented() {",
+            "    }",
+            "    // FUNCTION: NEXT 0x555",
+        ]
+    )
+    assert len(parser.alerts) == 0
+
+
+def test_implicit_lookup_by_name(parser):
+    """FUNCTION (or STUB) offsets must directly precede the function signature.
+    If we detect a comment instead, we assume that this is a lookup-by-name
+    function and end here."""
+    parser.read_lines(
+        [
+            "// FUNCTION: TEST 0x1234",
+            "// TestClass::TestMethod()",
+        ]
+    )
+    assert parser.state == ReaderState.SEARCH
+    assert len(parser.functions) == 1
+    assert parser.functions[0].lookup_by_name is True
+    assert parser.functions[0].name == "TestClass::TestMethod()"
+
+
+def test_function_with_spaces(parser):
+    """There should not be any blank lines between the end of FUNCTION markers
+    and the start or name of the function. If there is a blank line, we can
+    safely ignore it but should alert to this."""
+    parser.read_lines(
+        [
+            "// FUNCTION: TEST 0x1234",
+            "   ",
+            "inline void test_function() { };",
+        ]
+    )
+    assert len(parser.functions) == 1
+    assert len(parser.alerts) == 1
+    assert parser.alerts[0].code == ParserError.UNEXPECTED_BLANK_LINE
+
+
+def test_function_with_spaces_implicit(parser):
+    """Same as above, but for implicit lookup-by-name"""
+    parser.read_lines(
+        [
+            "// FUNCTION: TEST 0x1234",
+            "   ",
+            "// Implicit::Method",
+        ]
+    )
+    assert len(parser.functions) == 1
+    assert len(parser.alerts) == 1
+    assert parser.alerts[0].code == ParserError.UNEXPECTED_BLANK_LINE
+
+
+@pytest.mark.xfail(reason="will assume implicit lookup-by-name function")
+def test_function_is_commented(parser):
+    """In an ideal world, we would recognize that there is no code here.
+    Some editors (or users) might comment out the function on each line like this
+    but hopefully it is rare."""
+    parser.read_lines(
+        [
+            "// FUNCTION: TEST 0x1234",
+            "// int my_function()",
+            "// {",
+            "//     return 5;",
+            "// }",
+        ]
+    )
+
+    assert len(parser.functions) == 0