(Proposal) Adjustments to "decomp" language (#308)

* Adjustments to "decomp" language

* Fix a comment

* Fix accidental clang-formatting

* Fix order

* Fix order

* Remove junk

* Fix OFFSET

* Adjustments based on new suggestions

* Annotate globals

* Globals in ISLE

* More globals

* Merge from parser2 branch

* Allow prepending space for exact marker match

* To eliminate noise, require the 0x prefix on offset for marker match

* fix test from previous

* Count tab stops for indented functions to reduce MISSED_END_OF_FUNCTION noise

* FUNCTION to SYNTHETIC where needed

* Missed marker conversion on SetAtomId

* pylint cleanup, remove unused code

* Fix unexpected function end, add more unit tests

* Be more strict about synthetic name syntax

* Revert "Missed marker conversion on SetAtomId"

This reverts commit d87d665127.

* Revert "FUNCTION to SYNTHETIC where needed"

This reverts commit 8c815418d2.

* Implicit lookup by name for functions

* Fix VTABLE SYNTHETIC and other decomp markers

* Get vtable class name

* Vtable marker should identify struct

* No colon for SIZE comment

* Update README.md

* Update README.md

* Update CONTRIBUTING.md

* Update README.md

* Update README.md

* Update CONTRIBUTING.md

* Update README.md

* Update CONTRIBUTING.md

* Fix destructor/annotation

* Update README.md

* Update README.md

* Update README.md

* Update README.md

* Update README.md

---------

Co-authored-by: disinvite <disinvite@users.noreply.github.com>
This commit is contained in:
Christian Semmler
2023-12-06 07:10:45 -05:00
committed by GitHub
parent 4f5b70013f
commit 494a556f8e
407 changed files with 3505 additions and 2493 deletions

View File

@@ -1,127 +1,360 @@
import os
from typing import List, TextIO
from isledecomp.parser import find_code_blocks
from isledecomp.parser.util import CodeBlock
SAMPLE_DIR = os.path.join(os.path.dirname(__file__), "samples")
import pytest
from isledecomp.parser.parser import (
ReaderState,
DecompParser,
)
from isledecomp.parser.error import ParserError
def sample_file(filename: str) -> TextIO:
"""Wrapper for opening the samples from the directory that does not
depend on the cwd where we run the test"""
full_path = os.path.join(SAMPLE_DIR, filename)
return open(full_path, "r", encoding="utf-8")
@pytest.fixture(name="parser")
def fixture_parser():
return DecompParser()
def code_blocks_are_sorted(blocks: List[CodeBlock]) -> bool:
"""Helper to make this more idiomatic"""
just_offsets = [block.offset for block in blocks]
return just_offsets == sorted(just_offsets)
def test_missing_sig(parser):
"""In the hopefully rare scenario that the function signature and marker
are swapped, we still have enough to match witch reccmp"""
parser.read_lines(
[
"void my_function()",
"// FUNCTION: TEST 0x1234",
"{",
"}",
]
)
assert parser.state == ReaderState.SEARCH
assert len(parser.functions) == 1
assert parser.functions[0].line_number == 3
assert len(parser.alerts) == 1
assert parser.alerts[0].code == ParserError.MISSED_START_OF_FUNCTION
# Tests are below #
def test_not_exact_syntax(parser):
"""Alert to inexact syntax right here in the parser instead of kicking it downstream.
Doing this means we don't have to save the actual text."""
parser.read_line("// function: test 0x1234")
assert len(parser.alerts) == 1
assert parser.alerts[0].code == ParserError.BAD_DECOMP_MARKER
def test_sanity():
"""Read a very basic file"""
with sample_file("basic_file.cpp") as f:
blocks = find_code_blocks(f)
def test_invalid_marker(parser):
"""We matched a decomp marker, but it's not one we care about"""
parser.read_line("// BANANA: TEST 0x1234")
assert parser.state == ReaderState.SEARCH
assert len(blocks) == 3
assert code_blocks_are_sorted(blocks) is True
# n.b. The parser returns line numbers as 1-based
# Function starts when we see the opening curly brace
assert blocks[0].start_line == 8
assert blocks[0].end_line == 10
assert len(parser.alerts) == 1
assert parser.alerts[0].code == ParserError.BOGUS_MARKER
def test_oneline():
"""(Assuming clang-format permits this) This sample has a function
on a single line. This will test the end-of-function detection"""
with sample_file("oneline_function.cpp") as f:
blocks = find_code_blocks(f)
assert len(blocks) == 2
assert blocks[0].start_line == 5
assert blocks[0].end_line == 5
def test_incompatible_marker(parser):
"""The marker we just read cannot be handled in the current parser state"""
parser.read_lines(
[
"// FUNCTION: TEST 0x1234",
"// GLOBAL: TEST 0x5000",
]
)
assert parser.state == ReaderState.SEARCH
assert len(parser.alerts) == 1
assert parser.alerts[0].code == ParserError.INCOMPATIBLE_MARKER
def test_missing_offset():
"""What if the function doesn't have an offset comment?"""
with sample_file("missing_offset.cpp") as f:
blocks = find_code_blocks(f)
# TODO: For now, the function without the offset will just be ignored.
# Would be the same outcome if the comment was present but mangled and
# we failed to match it. We should detect these cases in the future.
assert len(blocks) == 1
def test_variable(parser):
"""Should identify a global variable"""
parser.read_lines(
[
"// GLOBAL: HELLO 0x1234",
"int g_value = 5;",
]
)
assert len(parser.variables) == 1
def test_jumbled_case():
"""The parser just reports what it sees. It is the responsibility of
the downstream tools to do something about a jumbled file.
Just verify that we are reading it correctly."""
with sample_file("out_of_order.cpp") as f:
blocks = find_code_blocks(f)
assert len(blocks) == 3
assert code_blocks_are_sorted(blocks) is False
def test_synthetic_plus_marker(parser):
"""Marker tracking preempts synthetic name detection.
Should fail with error and not log the synthetic"""
parser.read_lines(
[
"// SYNTHETIC: HEY 0x555",
"// FUNCTION: HOWDY 0x1234",
]
)
assert len(parser.functions) == 0
assert len(parser.alerts) == 1
assert parser.alerts[0].code == ParserError.INCOMPATIBLE_MARKER
def test_bad_file():
with sample_file("poorly_formatted.cpp") as f:
blocks = find_code_blocks(f)
def test_different_markers_different_module(parser):
"""Does it make any sense for a function to be a stub in one module,
but not in another? I don't know. But it's no problem for us."""
parser.read_lines(
[
"// FUNCTION: HOWDY 0x1234",
"// STUB: SUP 0x5555",
"void interesting_function() {",
"}",
]
)
assert len(blocks) == 3
assert len(parser.alerts) == 0
assert len(parser.functions) == 2
def test_indented():
"""Offsets for functions inside of a class will probably be indented."""
with sample_file("basic_class.cpp") as f:
blocks = find_code_blocks(f)
def test_different_markers_same_module(parser):
"""Now, if something is a regular function but then a stub,
what do we say about that?"""
parser.read_lines(
[
"// FUNCTION: HOWDY 0x1234",
"// STUB: HOWDY 0x5555",
"void interesting_function() {",
"}",
]
)
# TODO: We don't properly detect the end of these functions
# because the closing brace is indented. However... knowing where each
# function ends is less important (for now) than capturing
# all the functions that are there.
# Use first marker declaration, don't replace
assert len(parser.functions) == 1
assert parser.functions[0].is_stub is False
assert len(blocks) == 2
assert blocks[0].offset == int("0x12345678", 16)
assert blocks[0].start_line == 15
# assert blocks[0].end_line == 18
assert blocks[1].offset == int("0xdeadbeef", 16)
assert blocks[1].start_line == 22
# assert blocks[1].end_line == 24
# Should alert to this
assert len(parser.alerts) == 1
assert parser.alerts[0].code == ParserError.DUPLICATE_MODULE
def test_inline():
with sample_file("inline.cpp") as f:
blocks = find_code_blocks(f)
def test_unexpected_synthetic(parser):
"""FUNCTION then SYNTHETIC should fail to report either one"""
parser.read_lines(
[
"// FUNCTION: HOWDY 0x1234",
"// SYNTHETIC: HOWDY 0x5555",
"void interesting_function() {",
"}",
]
)
assert len(blocks) == 2
for block in blocks:
assert block.start_line is not None
assert block.start_line == block.end_line
assert parser.state == ReaderState.SEARCH
assert len(parser.functions) == 0
assert len(parser.alerts) == 1
assert parser.alerts[0].code == ParserError.INCOMPATIBLE_MARKER
def test_multiple_offsets():
"""If multiple offset marks appear before for a code block, take them
all but ensure module name (case-insensitive) is distinct.
Use first module occurrence in case of duplicates."""
with sample_file("multiple_offsets.cpp") as f:
blocks = find_code_blocks(f)
@pytest.mark.skip(reason="not implemented yet")
def test_duplicate_offset(parser):
"""Repeating the same module/offset in the same file is probably a typo"""
parser.read_lines(
[
"// GLOBAL: HELLO 0x1234",
"int x = 1;",
"// GLOBAL: HELLO 0x1234",
"int y = 2;",
]
)
assert len(blocks) == 4
assert blocks[0].module == "TEST"
assert blocks[0].start_line == 9
assert len(parser.alerts) == 1
assert parser.alerts[0].code == ParserError.DUPLICATE_OFFSET
assert blocks[1].module == "HELLO"
assert blocks[1].start_line == 9
# Duplicate modules are ignored
assert blocks[2].start_line == 16
assert blocks[2].offset == 0x2345
def test_multiple_variables(parser):
"""Theoretically the same global variable can appear in multiple modules"""
parser.read_lines(
[
"// GLOBAL: HELLO 0x1234",
"// GLOBAL: WUZZUP 0x555",
"const char *g_greeting;",
]
)
assert len(parser.alerts) == 0
assert len(parser.variables) == 2
assert blocks[3].module == "TEST"
assert blocks[3].offset == 0x2002
def test_multiple_variables_same_module(parser):
"""Should not overwrite offset"""
parser.read_lines(
[
"// GLOBAL: HELLO 0x1234",
"// GLOBAL: HELLO 0x555",
"const char *g_greeting;",
]
)
assert len(parser.alerts) == 1
assert parser.alerts[0].code == ParserError.DUPLICATE_MODULE
assert len(parser.variables) == 1
assert parser.variables[0].offset == 0x1234
def test_multiple_vtables(parser):
parser.read_lines(
[
"// VTABLE: HELLO 0x1234",
"// VTABLE: TEST 0x5432",
"class MxString : public MxCore {",
]
)
assert len(parser.alerts) == 0
assert len(parser.vtables) == 2
assert parser.vtables[0].class_name == "MxString"
def test_multiple_vtables_same_module(parser):
"""Should not overwrite offset"""
parser.read_lines(
[
"// VTABLE: HELLO 0x1234",
"// VTABLE: HELLO 0x5432",
"class MxString : public MxCore {",
]
)
assert len(parser.alerts) == 1
assert parser.alerts[0].code == ParserError.DUPLICATE_MODULE
assert len(parser.vtables) == 1
assert parser.vtables[0].offset == 0x1234
def test_synthetic(parser):
parser.read_lines(
[
"// SYNTHETIC: TEST 0x1234",
"// TestClass::TestMethod",
]
)
assert len(parser.functions) == 1
assert parser.functions[0].lookup_by_name is True
assert parser.functions[0].name == "TestClass::TestMethod"
def test_synthetic_same_module(parser):
parser.read_lines(
[
"// SYNTHETIC: TEST 0x1234",
"// SYNTHETIC: TEST 0x555",
"// TestClass::TestMethod",
]
)
assert len(parser.alerts) == 1
assert parser.alerts[0].code == ParserError.DUPLICATE_MODULE
assert len(parser.functions) == 1
assert parser.functions[0].offset == 0x1234
def test_synthetic_no_comment(parser):
"""Synthetic marker followed by a code line (i.e. non-comment)"""
parser.read_lines(
[
"// SYNTHETIC: TEST 0x1234",
"int x = 123;",
]
)
assert len(parser.functions) == 0
assert len(parser.alerts) == 1
assert parser.alerts[0].code == ParserError.BAD_SYNTHETIC
assert parser.state == ReaderState.SEARCH
def test_single_line_function(parser):
parser.read_lines(
[
"// FUNCTION: TEST 0x1234",
"int hello() { return 1234; }",
]
)
assert len(parser.functions) == 1
assert parser.functions[0].line_number == 2
assert parser.functions[0].end_line == 2
def test_indented_function(parser):
"""Track the number of whitespace characters when we begin the function
and check that against each closing curly brace we read.
Should not report a syntax warning if the function is indented"""
parser.read_lines(
[
" // FUNCTION: TEST 0x1234",
" void indented()",
" {",
" // TODO",
" }",
" // FUNCTION: NEXT 0x555",
]
)
assert len(parser.alerts) == 0
@pytest.mark.xfail(reason="todo")
def test_indented_no_curly_hint(parser):
"""Same as above, but opening curly brace is on the same line.
Without the hint of how many whitespace characters to check, can we
still identify the end of the function?"""
parser.read_lines(
[
" // FUNCTION: TEST 0x1234",
" void indented() {",
" }",
" // FUNCTION: NEXT 0x555",
]
)
assert len(parser.alerts) == 0
def test_implicit_lookup_by_name(parser):
"""FUNCTION (or STUB) offsets must directly precede the function signature.
If we detect a comment instead, we assume that this is a lookup-by-name
function and end here."""
parser.read_lines(
[
"// FUNCTION: TEST 0x1234",
"// TestClass::TestMethod()",
]
)
assert parser.state == ReaderState.SEARCH
assert len(parser.functions) == 1
assert parser.functions[0].lookup_by_name is True
assert parser.functions[0].name == "TestClass::TestMethod()"
def test_function_with_spaces(parser):
"""There should not be any spaces between the end of FUNCTION markers
and the start or name of the function. If it's a blank line, we can safely
ignore but should alert to this."""
parser.read_lines(
[
"// FUNCTION: TEST 0x1234",
" ",
"inline void test_function() { };",
]
)
assert len(parser.functions) == 1
assert len(parser.alerts) == 1
assert parser.alerts[0].code == ParserError.UNEXPECTED_BLANK_LINE
def test_function_with_spaces_implicit(parser):
"""Same as above, but for implicit lookup-by-name"""
parser.read_lines(
[
"// FUNCTION: TEST 0x1234",
" ",
"// Implicit::Method",
]
)
assert len(parser.functions) == 1
assert len(parser.alerts) == 1
assert parser.alerts[0].code == ParserError.UNEXPECTED_BLANK_LINE
@pytest.mark.xfail(reason="will assume implicit lookup-by-name function")
def test_function_is_commented(parser):
"""In an ideal world, we would recognize that there is no code here.
Some editors (or users) might comment the function on each line like this
but hopefully it is rare."""
parser.read_lines(
[
"// FUNCTION: TEST 0x1234",
"// int my_function()",
"// {",
"// return 5;",
"// }",
]
)
assert len(parser.functions) == 0