diff --git a/tools/checkorder/checkorder.py b/tools/checkorder/checkorder.py index 1ac8391f..d2f0b23e 100644 --- a/tools/checkorder/checkorder.py +++ b/tools/checkorder/checkorder.py @@ -2,8 +2,7 @@ import os import sys import argparse from isledecomp.dir import walk_source_dir, is_file_cpp -from isledecomp.parser import find_code_blocks -from isledecomp.parser.util import is_exact_offset_comment +from isledecomp.parser import DecompParser def sig_truncate(sig: str) -> str: @@ -16,23 +15,22 @@ def check_file(filename: str, verbose: bool = False) -> bool: """Open and read the given file, then check whether the code blocks are in order. If verbose, print each block.""" + parser = DecompParser() with open(filename, "r", encoding="utf-8") as f: - code_blocks = find_code_blocks(f) + parser.read_lines(f) - bad_comments = [ - (block.start_line, block.offset_comment) - for block in code_blocks - if not is_exact_offset_comment(block.offset_comment) - ] - just_offsets = [block.offset for block in code_blocks] + just_offsets = [block.offset for block in parser.functions] sorted_offsets = sorted(just_offsets) file_out_of_order = just_offsets != sorted_offsets + # TODO: When we add parser error severity, actual errors that obstruct + # parsing should probably be shown here regardless of verbose mode + # If we detect inexact comments, don't print anything unless we are # in verbose mode. If the file is out of order, we always print the # file name. 
- should_report = (len(bad_comments) > 0 and verbose) or file_out_of_order + should_report = (len(parser.alerts) > 0 and verbose) or file_out_of_order if not should_report and not file_out_of_order: return False @@ -44,22 +42,22 @@ def check_file(filename: str, verbose: bool = False) -> bool: order_lookup = {k: i for i, k in enumerate(sorted_offsets)} prev_offset = 0 - for block in code_blocks: + for fun in parser.functions: msg = " ".join( [ - " " if block.offset > prev_offset else "!", + " " if fun.offset > prev_offset else "!", - f"{block.offset:08x}", + f"{fun.offset:08x}", - f"{block.end_line - block.start_line:4} lines", - f"{order_lookup[block.offset]:3}", + f"{fun.end_line - fun.line_number:4} lines", + f"{order_lookup[fun.offset]:3}", " ", - sig_truncate(block.signature), + sig_truncate(fun.name), ] ) print(msg) - prev_offset = block.offset + prev_offset = fun.offset - for line_no, line in bad_comments: - print(f"* line {line_no:3} bad offset comment ({line})") + for alert in parser.alerts: + print(f"* line {alert.line_number:4} {alert.code} ({alert.line})") print() diff --git a/tools/isledecomp/isledecomp/parser/__init__.py b/tools/isledecomp/isledecomp/parser/__init__.py index 0d504619..c9394d4a 100644 --- a/tools/isledecomp/isledecomp/parser/__init__.py +++ b/tools/isledecomp/isledecomp/parser/__init__.py @@ -1 +1 @@ -from .parser import find_code_blocks +from .parser import DecompParser diff --git a/tools/isledecomp/isledecomp/parser/error.py b/tools/isledecomp/isledecomp/parser/error.py new file mode 100644 index 00000000..8bda90da --- /dev/null +++ b/tools/isledecomp/isledecomp/parser/error.py @@ -0,0 +1,33 @@ +from enum import Enum + + +class ParserError(Enum): + # WARN: Stub function exceeds some line number threshold + UNLIKELY_STUB = 100 + + # WARN: Decomp marker is close enough to be recognized, but does not follow syntax exactly + BAD_DECOMP_MARKER = 101 + + # WARN: Multiple markers in sequence do not have distinct modules + DUPLICATE_MODULE = 102 + + # WARN: 
Detected a duplicate module/offset pair in the current file + DUPLICATE_OFFSET = 103 + + # WARN: We read a line that matches the decomp marker pattern, but we are not set up + # to handle it + BOGUS_MARKER = 104 + + # WARN: Under a synthetic marker we expected a comment but found a code line instead + SYNTHETIC_NOT_COMMENT = 110 + + # WARN: New function marker appeared while we were inside a function + MISSED_END_OF_FUNCTION = 117 + + # ERROR: We found a marker unexpectedly + UNEXPECTED_MARKER = 200 + + # ERROR: We found a marker where we expected to find one, but it is incompatible + # with the preceding markers. + # For example, a GLOBAL cannot follow FUNCTION/STUB + INCOMPATIBLE_MARKER = 201 diff --git a/tools/isledecomp/isledecomp/parser/node.py b/tools/isledecomp/isledecomp/parser/node.py new file mode 100644 index 00000000..f9fbe3b5 --- /dev/null +++ b/tools/isledecomp/isledecomp/parser/node.py @@ -0,0 +1,41 @@ +from dataclasses import dataclass +from enum import Enum + + +@dataclass +class ParserNode: + line_number: int + + +@dataclass +class ParserAlert(ParserNode): + code: int + line: str + + +@dataclass +class ParserSymbol(ParserNode): + module: str + offset: int + + +@dataclass +class ParserFunction(ParserSymbol): + name: str + is_stub: bool = False + is_synthetic: bool = False + is_template: bool = False + end_line: int = -1 + + +@dataclass +class ParserVariable(ParserSymbol): + name: str + size: int = -1 + is_static: bool = False + + +@dataclass +class ParserVtable(ParserSymbol): + class_name: str + num_entries: int = -1 diff --git a/tools/isledecomp/isledecomp/parser/parser.py b/tools/isledecomp/isledecomp/parser/parser.py index 1039ac41..aebe1c86 100644 --- a/tools/isledecomp/isledecomp/parser/parser.py +++ b/tools/isledecomp/isledecomp/parser/parser.py @@ -1,145 +1,346 @@ # C++ file parser -from typing import List, TextIO +from typing import List, TextIO, Iterable from enum import Enum from .util import ( - CodeBlock, - OffsetMatch, + DecompMarker, 
is_blank_or_comment, - match_offset_comment, + match_marker, + is_marker_exact, get_template_function_name, remove_trailing_comment, - distinct_by_module, ) +from .node import ( + ParserAlert, + ParserNode, + ParserFunction, + ParserVariable, + ParserVtable, +) +from .error import ParserError class ReaderState(Enum): - WANT_OFFSET = 0 + SEARCH = 0 WANT_SIG = 1 IN_FUNC = 2 IN_TEMPLATE = 3 WANT_CURLY = 4 - FUNCTION_DONE = 5 + IN_GLOBAL = 5 + IN_FUNC_GLOBAL = 6 + IN_VTABLE = 7 -def find_code_blocks(stream: TextIO) -> List[CodeBlock]: - """Read the IO stream (file) line-by-line and give the following report: - Foreach code block (function) in the file, what are its starting and - ending line numbers, and what is the given offset in the original - binary. We expect the result to be ordered by line number because we - are reading the file from start to finish.""" +def marker_is_stub(marker: DecompMarker) -> bool: + return marker.type.upper() == "STUB" - blocks: List[CodeBlock] = [] - offset_matches: List[OffsetMatch] = [] +def marker_is_variable(marker: DecompMarker) -> bool: + return marker.type.upper() == "GLOBAL" - function_sig = None - start_line = None - end_line = None - state = ReaderState.WANT_OFFSET - # 1-based to match cvdump and your text editor - # I know it says 0, but we will increment before each readline() - line_no = 0 - can_seek = True +def marker_is_synthetic(marker: DecompMarker) -> bool: + return marker.type.upper() in ("SYNTHETIC", "TEMPLATE") - while True: - # Do this before reading again so that an EOF will not - # cause us to miss the last function of the file. - if state == ReaderState.FUNCTION_DONE: - # Our list of offset marks could have duplicates on - # module name, so we'll eliminate those now. 
- for offset_match in distinct_by_module(offset_matches): - block = CodeBlock( - offset=offset_match.address, - signature=function_sig, - start_line=start_line, + +def marker_is_function(marker: DecompMarker) -> bool: + return marker.type.upper() in ("FUNCTION", "STUB") + + +def marker_is_vtable(marker: DecompMarker) -> bool: + return marker.type.upper() == "VTABLE" + + +class MarkerDict: + def __init__(self): + self.markers: dict = {} + + def insert(self, marker: DecompMarker) -> bool: + module = marker.module.upper() + # Return True if this insert would overwrite + if module in self.markers: + return True + + self.markers[module] = (marker.type, marker.offset) + return False + + def iter(self): + for module in self.markers: + (marker_type, offset) = self.markers[module] + yield DecompMarker(marker_type, module, offset) + + def empty(self): + self.markers = {} + + +class DecompParser: + def __init__(self): + self.fun_markers = MarkerDict() + self.var_markers = MarkerDict() + self.tbl_markers = MarkerDict() + self.reset() + + def reset(self): + # Output values + self.functions = [] + self.vtables = [] + self.variables = [] + self.alerts = [] + + # Internal state machine stuff + self.line_number: int = 0 + self.state: ReaderState = ReaderState.SEARCH + + self.last_line: str = "" + self.fun_markers.empty() + self.var_markers.empty() + self.tbl_markers.empty() + self.function_start: int = 0 + self.function_sig: str = "" + + def _recover(self): + """We hit a syntax error and need to reset temp structures""" + self.state = ReaderState.SEARCH + self.fun_markers.empty() + self.var_markers.empty() + self.tbl_markers.empty() + + def _syntax_warning(self, code): + self.alerts.append( + ParserAlert( + line_number=self.line_number, + code=code, + line=self.last_line.strip(), + ) + ) + + def _syntax_error(self, code): + self._syntax_warning(code) + self._recover() + + def _function_starts_here(self): + self.function_start = self.line_number + + def _function_marker(self, 
marker: DecompMarker): + if self.fun_markers.insert(marker): + self._syntax_warning(ParserError.DUPLICATE_MODULE) + self.state = ReaderState.WANT_SIG + + def _synthetic_marker(self, marker: DecompMarker): + if self.fun_markers.insert(marker): + self._syntax_warning(ParserError.DUPLICATE_MODULE) + self.state = ReaderState.IN_TEMPLATE + + def _function_done(self, unexpected: bool = False): + end_line = self.line_number + if unexpected: + end_line -= 1 + + for marker in self.fun_markers.iter(): + self.functions.append( + ParserFunction( + line_number=self.function_start, + module=marker.module, + offset=marker.offset, + is_stub=marker_is_stub(marker), + is_template=marker_is_synthetic(marker), + name=self.function_sig, end_line=end_line, - offset_comment=offset_match.comment, - module=offset_match.module, - is_template=offset_match.is_template, - is_stub=offset_match.is_stub, ) - blocks.append(block) - offset_matches = [] - state = ReaderState.WANT_OFFSET + ) - if can_seek: - line_no += 1 - line = stream.readline() - if line == "": - break + self.fun_markers.empty() + self.state = ReaderState.SEARCH - new_match = match_offset_comment(line) - if new_match is not None: - # We will allow multiple offsets if we have just begun - # the code block, but not after we hit the curly brace. 
- if state in ( - ReaderState.WANT_OFFSET, - ReaderState.IN_TEMPLATE, + def _vtable_marker(self, marker: DecompMarker): + if self.tbl_markers.insert(marker): + self._syntax_warning(ParserError.DUPLICATE_MODULE) + self.state = ReaderState.IN_VTABLE + + def _vtable_done(self): + for marker in self.tbl_markers.iter(): + self.vtables.append( + ParserVtable( + line_number=self.line_number, + module=marker.module, + offset=marker.offset, + class_name=self.last_line.strip(), + ) + ) + + self.tbl_markers.empty() + self.state = ReaderState.SEARCH + + def _variable_marker(self, marker: DecompMarker): + if self.var_markers.insert(marker): + self._syntax_warning(ParserError.DUPLICATE_MODULE) + + if self.state in (ReaderState.IN_FUNC, ReaderState.IN_FUNC_GLOBAL): + self.state = ReaderState.IN_FUNC_GLOBAL + else: + self.state = ReaderState.IN_GLOBAL + + def _variable_done(self): + for marker in self.var_markers.iter(): + self.variables.append( + ParserVariable( + line_number=self.line_number, + module=marker.module, + offset=marker.offset, + name=self.last_line.strip(), + ) + ) + + self.var_markers.empty() + if self.state == ReaderState.IN_FUNC_GLOBAL: + self.state = ReaderState.IN_FUNC + else: + self.state = ReaderState.SEARCH + + def _handle_marker(self, marker: DecompMarker): + # Cannot handle any markers between function sig and opening curly brace + if self.state == ReaderState.WANT_CURLY: + self._syntax_error(ParserError.UNEXPECTED_MARKER) + return + + # TODO: How uncertain are we of detecting the end of a function + # in a clang-formatted file? For now we assume we have missed the + # end if we detect a non-GLOBAL marker while state is IN_FUNC. + # Maybe these cases should be syntax errors instead + + if marker_is_function(marker): + if self.state in ( + ReaderState.SEARCH, ReaderState.WANT_SIG, ): - # If we detected an offset marker unexpectedly, - # we are handling it here so we can continue seeking. 
- can_seek = True - - offset_matches.append(new_match) - - if new_match.is_template: - state = ReaderState.IN_TEMPLATE - else: - state = ReaderState.WANT_SIG - else: + # We will allow multiple offsets if we have just begun + # the code block, but not after we hit the curly brace. + self._function_marker(marker) + elif self.state == ReaderState.IN_FUNC: # We hit another offset unexpectedly. # We can recover easily by just ending the function here. - end_line = line_no - 1 - state = ReaderState.FUNCTION_DONE + self._syntax_warning(ParserError.MISSED_END_OF_FUNCTION) + self._function_done(unexpected=True) - # Pause reading here so we handle the offset marker - # on the next loop iteration - can_seek = False + # Start the next function right after so we can + # read the next line. + self._function_marker(marker) + else: + self._syntax_error(ParserError.INCOMPATIBLE_MARKER) - elif state == ReaderState.IN_TEMPLATE: + elif marker_is_synthetic(marker): + if self.state in (ReaderState.SEARCH, ReaderState.IN_TEMPLATE): + self._synthetic_marker(marker) + elif self.state == ReaderState.IN_FUNC: + self._syntax_warning(ParserError.MISSED_END_OF_FUNCTION) + self._function_done(unexpected=True) + self._synthetic_marker(marker) + else: + self._syntax_error(ParserError.INCOMPATIBLE_MARKER) + + elif marker_is_variable(marker): + if self.state in ( + ReaderState.SEARCH, + ReaderState.IN_GLOBAL, + ReaderState.IN_FUNC, + ReaderState.IN_FUNC_GLOBAL, + ): + self._variable_marker(marker) + else: + self._syntax_error(ParserError.INCOMPATIBLE_MARKER) + + elif marker_is_vtable(marker): + if self.state in (ReaderState.SEARCH, ReaderState.IN_VTABLE): + self._vtable_marker(marker) + elif self.state == ReaderState.IN_FUNC: + self._syntax_warning(ParserError.MISSED_END_OF_FUNCTION) + self._function_done(unexpected=True) + self._vtable_marker(marker) + else: + self._syntax_error(ParserError.INCOMPATIBLE_MARKER) + + else: + self._syntax_warning(ParserError.BOGUS_MARKER) + + def read_line(self, line: str): + self.last_line = line # TODO: Useful 
or hack for error reporting? + self.line_number += 1 + + marker = match_marker(line) + if marker is not None: + # TODO: what's the best place for this? + # Does it belong with reading or marker handling? + if not is_marker_exact(self.last_line): + self._syntax_warning(ParserError.BAD_DECOMP_MARKER) + self._handle_marker(marker) + return + + if self.state == ReaderState.IN_TEMPLATE: # TEMPLATE functions are a special case. The signature is # given on the next line (in a // comment) - function_sig = get_template_function_name(line) - start_line = line_no - end_line = line_no - state = ReaderState.FUNCTION_DONE + self.function_sig = get_template_function_name(line) + self._function_starts_here() + self._function_done() - elif state == ReaderState.WANT_SIG: + elif self.state == ReaderState.WANT_SIG: # Skip blank lines or comments that come after the offset # marker. There is not a formal procedure for this, so just # assume the next "code line" is the function signature if not is_blank_or_comment(line): # Inline functions may end with a comment. Strip that out # to help parsing. - function_sig = remove_trailing_comment(line.strip()) + self.function_sig = remove_trailing_comment(line.strip()) # Now check to see if the opening curly bracket is on the # same line. clang-format should prevent this (BraceWrapping) # but it is easy to detect. # If the entire function is on one line, handle that too. 
- if function_sig.endswith("{"): - start_line = line_no - state = ReaderState.IN_FUNC - elif function_sig.endswith("}") or function_sig.endswith("};"): - start_line = line_no - end_line = line_no - state = ReaderState.FUNCTION_DONE + if self.function_sig.endswith("{"): + self._function_starts_here() + self.state = ReaderState.IN_FUNC + elif self.function_sig.endswith("}") or self.function_sig.endswith( + "};" + ): + self._function_starts_here() + self._function_done() else: - state = ReaderState.WANT_CURLY + self.state = ReaderState.WANT_CURLY - elif state == ReaderState.WANT_CURLY: + elif self.state == ReaderState.WANT_CURLY: if line.strip() == "{": - start_line = line_no - state = ReaderState.IN_FUNC + self._function_starts_here() + self.state = ReaderState.IN_FUNC - elif state == ReaderState.IN_FUNC: + elif self.state == ReaderState.IN_FUNC: # Naive but reasonable assumption that functions will end with # a curly brace on its own line with no prepended spaces. if line.startswith("}"): - end_line = line_no - state = ReaderState.FUNCTION_DONE + self._function_done() - return blocks + elif self.state in (ReaderState.IN_GLOBAL, ReaderState.IN_FUNC_GLOBAL): + if not is_blank_or_comment(line): + self._variable_done() + + elif self.state == ReaderState.IN_VTABLE: + if not is_blank_or_comment(line): + self._vtable_done() + + def read_lines(self, lines: Iterable): + for line in lines: + self.read_line(line) + + +def find_code_blocks(stream: TextIO) -> List[ParserNode]: + """Read the IO stream (file) line-by-line and give the following report: + Foreach code block (function) in the file, what are its starting and + ending line numbers, and what is the given offset in the original + binary. We expect the result to be ordered by line number because we + are reading the file from start to finish.""" + + # TODO: this will be replaced shortly. 
shim for now to avoid + # making more changes elsewhere + p = DecompParser() + for line in stream: + p.read_line(line) + + return p.functions diff --git a/tools/isledecomp/isledecomp/parser/util.py b/tools/isledecomp/isledecomp/parser/util.py index 59fca75b..02d3c976 100644 --- a/tools/isledecomp/isledecomp/parser/util.py +++ b/tools/isledecomp/isledecomp/parser/util.py @@ -4,41 +4,15 @@ import re from typing import List from collections import namedtuple +DecompMarker = namedtuple("DecompMarker", ["type", "module", "offset"]) -CodeBlock = namedtuple( - "CodeBlock", - [ - "offset", - "signature", - "start_line", - "end_line", - "offset_comment", - "module", - "is_template", - "is_stub", - ], -) -OffsetMatch = namedtuple( - "OffsetMatch", ["module", "address", "is_template", "is_stub", "comment"] -) - -# This has not been formally established, but considering that "STUB" -# is a temporary state for a function, we assume it will appear last, -# after any other modifiers (i.e. TEMPLATE) - -# To match a reasonable variance of formatting for the offset comment -offsetCommentRegex = re.compile( - r"\s*//\s*OFFSET:\s*(\w+)\s+(?:0x)?([a-f0-9]+)(\s+TEMPLATE)?(\s+STUB)?", # nopep8 +markerRegex = re.compile( + r"\s*//\s*(\w+):\s*(\w+)\s+((?:0x)?[a-f0-9]+)", flags=re.I, ) -# To match the exact syntax (text upper case, hex lower case, with spaces) -# that is used in most places -offsetCommentExactRegex = re.compile( - r"^// OFFSET: [A-Z0-9]+ (0x[a-f0-9]+)( TEMPLATE)?( STUB)?$" -) # nopep8 - +markerExactRegex = re.compile(r"// ([A-Z]+): ([A-Z0-9]+) (0x[a-f0-9]+)$") # The goal here is to just read whatever is on the next line, so some # flexibility in the formatting seems OK @@ -78,39 +52,15 @@ def is_blank_or_comment(line: str) -> bool: ) -def is_exact_offset_comment(line: str) -> bool: - """If the offset comment does not match our (unofficial) syntax - we may want to alert the user to fix it for style points.""" - return offsetCommentExactRegex.match(line) is not None - - -def 
match_offset_comment(line: str) -> OffsetMatch | None: - match = offsetCommentRegex.match(line) +def match_marker(line: str) -> DecompMarker | None: + match = markerRegex.match(line) if match is None: return None - return OffsetMatch( - module=match.group(1), - address=int(match.group(2), 16), - is_template=match.group(3) is not None, - is_stub=match.group(4) is not None, - comment=line.strip(), + return DecompMarker( + type=match.group(1), module=match.group(2), offset=int(match.group(3), 16) ) -def distinct_by_module(offsets: List) -> List: - """Given a list of offset markers, return a list with distinct - module names. If module names (case-insensitive) are repeated, - choose the offset that appears first.""" - - if len(offsets) < 2: - return offsets - - # Dict maintains insertion order in python >=3.7 - offsets_dict = {} - for offset in offsets: - module_upper = offset.module.upper() - if module_upper not in offsets_dict: - offsets_dict[module_upper] = offset - - return list(offsets_dict.values()) +def is_marker_exact(line: str) -> bool: + return markerExactRegex.match(line) is not None diff --git a/tools/isledecomp/tests/samples/basic_class.cpp b/tools/isledecomp/tests/samples/basic_class.cpp index 23ce3c39..4316ad4a 100644 --- a/tools/isledecomp/tests/samples/basic_class.cpp +++ b/tools/isledecomp/tests/samples/basic_class.cpp @@ -3,6 +3,7 @@ // A very simple class +// VTABLE: TEST 0x1001002 class TestClass { public: TestClass(); @@ -10,14 +11,14 @@ public: virtual MxResult Tickle() override; // vtable+08 - // OFFSET: TEST 0x12345678 + // FUNCTION: TEST 0x12345678 inline const char* ClassName() const // vtable+0c { // 0xabcd1234 return "TestClass"; } - // OFFSET: TEST 0xdeadbeef + // FUNCTION: TEST 0xdeadbeef inline MxBool IsA(const char* name) const override // vtable+10 { return !strcmp(name, TestClass::ClassName()); diff --git a/tools/isledecomp/tests/samples/basic_file.cpp b/tools/isledecomp/tests/samples/basic_file.cpp index 6a4017b5..99930de8 100644 --- 
a/tools/isledecomp/tests/samples/basic_file.cpp +++ b/tools/isledecomp/tests/samples/basic_file.cpp @@ -3,19 +3,19 @@ // A very simple well-formed code file -// OFFSET: TEST 0x1234 +// FUNCTION: TEST 0x1234 void function01() { // TODO } -// OFFSET: TEST 0x2345 +// FUNCTION: TEST 0x2345 void function02() { // TODO } -// OFFSET: TEST 0x3456 +// FUNCTION: TEST 0x3456 void function03() { // TODO diff --git a/tools/isledecomp/tests/samples/global_variables.cpp b/tools/isledecomp/tests/samples/global_variables.cpp new file mode 100644 index 00000000..3be0316a --- /dev/null +++ b/tools/isledecomp/tests/samples/global_variables.cpp @@ -0,0 +1,14 @@ +// Sample for python unit tests +// Not part of the decomp + +// Global variables inside and outside of functions + +// GLOBAL: TEST 0x1000 +const char *g_message = "test"; + +// FUNCTION: TEST 0x1234 +void function01() +{ + // GLOBAL: TEST 0x5555 + static int g_hello = 123; +} diff --git a/tools/isledecomp/tests/samples/inline.cpp b/tools/isledecomp/tests/samples/inline.cpp index 0bfedf6d..8a36c89a 100644 --- a/tools/isledecomp/tests/samples/inline.cpp +++ b/tools/isledecomp/tests/samples/inline.cpp @@ -1,8 +1,8 @@ // Sample for python unit tests // Not part of the decomp -// OFFSET: TEST 0x10000001 +// FUNCTION: TEST 0x10000001 inline const char* OneLineWithComment() const { return "MxDSObject"; }; // hi there -// OFFSET: TEST 0x10000002 +// FUNCTION: TEST 0x10000002 inline const char* OneLine() const { return "MxDSObject"; }; diff --git a/tools/isledecomp/tests/samples/missing_offset.cpp b/tools/isledecomp/tests/samples/missing_offset.cpp index 332fed2c..3f6b3811 100644 --- a/tools/isledecomp/tests/samples/missing_offset.cpp +++ b/tools/isledecomp/tests/samples/missing_offset.cpp @@ -9,7 +9,7 @@ int no_offset_comment() return -1; } -// OFFSET: TEST 0xdeadbeef +// FUNCTION: TEST 0xdeadbeef void regular_ole_function() { printf("hi there"); diff --git a/tools/isledecomp/tests/samples/multiple_offsets.cpp 
b/tools/isledecomp/tests/samples/multiple_offsets.cpp index eecdd95b..dc3c5bfa 100644 --- a/tools/isledecomp/tests/samples/multiple_offsets.cpp +++ b/tools/isledecomp/tests/samples/multiple_offsets.cpp @@ -3,22 +3,22 @@ // Handling multiple offset markers -// OFFSET: TEST 0x1234 -// OFFSET: HELLO 0x5555 +// FUNCTION: TEST 0x1234 +// FUNCTION: HELLO 0x5555 void different_modules() { // TODO } -// OFFSET: TEST 0x2345 -// OFFSET: TEST 0x1234 +// FUNCTION: TEST 0x2345 +// FUNCTION: TEST 0x1234 void same_module() { // TODO } -// OFFSET: TEST 0x2002 -// OFFSET: test 0x1001 +// FUNCTION: TEST 0x2002 +// FUNCTION: test 0x1001 void same_case_insensitive() { // TODO diff --git a/tools/isledecomp/tests/samples/oneline_function.cpp b/tools/isledecomp/tests/samples/oneline_function.cpp index 8d7fdc5a..feb82314 100644 --- a/tools/isledecomp/tests/samples/oneline_function.cpp +++ b/tools/isledecomp/tests/samples/oneline_function.cpp @@ -1,10 +1,10 @@ // Sample for python unit tests // Not part of the decomp -// OFFSET: TEST 0x1234 +// FUNCTION: TEST 0x1234 void short_function() { static char* msg = "oneliner"; } -// OFFSET: TEST 0x5555 +// FUNCTION: TEST 0x5555 void function_after_one_liner() { // This function comes after the previous that is on a single line. 
diff --git a/tools/isledecomp/tests/samples/out_of_order.cpp b/tools/isledecomp/tests/samples/out_of_order.cpp index 749c4f2b..951c99e7 100644 --- a/tools/isledecomp/tests/samples/out_of_order.cpp +++ b/tools/isledecomp/tests/samples/out_of_order.cpp @@ -1,19 +1,19 @@ // Sample for python unit tests // Not part of the decomp -// OFFSET: TEST 0x1001 +// FUNCTION: TEST 0x1001 void function_order01() { // TODO } -// OFFSET: TEST 0x1003 +// FUNCTION: TEST 0x1003 void function_order03() { // TODO } -// OFFSET: TEST 0x1002 +// FUNCTION: TEST 0x1002 void function_order02() { // TODO diff --git a/tools/isledecomp/tests/samples/poorly_formatted.cpp b/tools/isledecomp/tests/samples/poorly_formatted.cpp index 32dd774c..69f365ec 100644 --- a/tools/isledecomp/tests/samples/poorly_formatted.cpp +++ b/tools/isledecomp/tests/samples/poorly_formatted.cpp @@ -4,18 +4,18 @@ // While it's reasonable to expect a well-formed file (and clang-format // will make sure we get one), this will put the parser through its paces. 
-// OFFSET: TEST 0x1234 +// FUNCTION: TEST 0x1234 void curly_with_spaces() { static char* msg = "hello"; } -// OFFSET: TEST 0x5555 +// FUNCTION: TEST 0x5555 void weird_closing_curly() { int x = 123; } -// OFFSET: HELLO 0x5656 +// FUNCTION: HELLO 0x5656 void bad_indenting() { if (0) { diff --git a/tools/isledecomp/tests/test_parser.py b/tools/isledecomp/tests/test_parser.py index 48bb0e44..7731918f 100644 --- a/tools/isledecomp/tests/test_parser.py +++ b/tools/isledecomp/tests/test_parser.py @@ -1,127 +1,170 @@ -import os -from typing import List, TextIO -from isledecomp.parser import find_code_blocks -from isledecomp.parser.util import CodeBlock - -SAMPLE_DIR = os.path.join(os.path.dirname(__file__), "samples") +import pytest +from isledecomp.parser.parser import ( + ReaderState, + DecompParser, +) +from isledecomp.parser.util import DecompMarker +from isledecomp.parser.error import ParserError -def sample_file(filename: str) -> TextIO: - """Wrapper for opening the samples from the directory that does not - depend on the cwd where we run the test""" - full_path = os.path.join(SAMPLE_DIR, filename) - return open(full_path, "r", encoding="utf-8") +@pytest.fixture +def parser(): + return DecompParser() -def code_blocks_are_sorted(blocks: List[CodeBlock]) -> bool: - """Helper to make this more idiomatic""" - just_offsets = [block.offset for block in blocks] - return just_offsets == sorted(just_offsets) +@pytest.mark.skip(reason="todo") +def test_missing_sig(parser): + """Bad syntax: function signature is missing""" + parser.read_lines(["// FUNCTION: TEST 0x1234", "{"]) + assert parser.state == ReaderState.IN_FUNC + assert len(parser.alerts) == 1 + parser.read_line("}") + assert len(parser.functions) == 1 + assert parser.functions[0] != "{" -# Tests are below # +def test_not_exact_syntax(parser): + """Alert to inexact syntax right here in the parser instead of kicking it downstream. 
+ Doing this means we don't have to save the actual text.""" + parser.read_line("// function: test 1234") + assert len(parser.alerts) == 1 + assert parser.alerts[0].code == ParserError.BAD_DECOMP_MARKER -def test_sanity(): - """Read a very basic file""" - with sample_file("basic_file.cpp") as f: - blocks = find_code_blocks(f) +def test_invalid_marker(parser): + """We matched a decomp marker, but it's not one we care about""" + parser.read_line("// BANANA: TEST 0x1234") + assert parser.state == ReaderState.SEARCH - assert len(blocks) == 3 - assert code_blocks_are_sorted(blocks) is True - # n.b. The parser returns line numbers as 1-based - # Function starts when we see the opening curly brace - assert blocks[0].start_line == 8 - assert blocks[0].end_line == 10 + assert len(parser.alerts) == 1 + assert parser.alerts[0].code == ParserError.BOGUS_MARKER -def test_oneline(): - """(Assuming clang-format permits this) This sample has a function - on a single line. This will test the end-of-function detection""" - with sample_file("oneline_function.cpp") as f: - blocks = find_code_blocks(f) - - assert len(blocks) == 2 - assert blocks[0].start_line == 5 - assert blocks[0].end_line == 5 +def test_unexpected_marker(parser): + parser.read_lines( + [ + "// FUNCTION: TEST 0x1234", + "// GLOBAL: TEST 0x5000", + ] + ) + assert parser.state == ReaderState.SEARCH + assert len(parser.alerts) == 1 + assert parser.alerts[0].code == ParserError.INCOMPATIBLE_MARKER -def test_missing_offset(): - """What if the function doesn't have an offset comment?""" - with sample_file("missing_offset.cpp") as f: - blocks = find_code_blocks(f) - - # TODO: For now, the function without the offset will just be ignored. - # Would be the same outcome if the comment was present but mangled and - # we failed to match it. We should detect these cases in the future. 
- assert len(blocks) == 1 +def test_variable(parser): + parser.read_lines( + [ + "// GLOBAL: HELLO 0x1234", + "int g_value = 5;", + ] + ) + assert len(parser.variables) == 1 -def test_jumbled_case(): - """The parser just reports what it sees. It is the responsibility of - the downstream tools to do something about a jumbled file. - Just verify that we are reading it correctly.""" - with sample_file("out_of_order.cpp") as f: - blocks = find_code_blocks(f) - - assert len(blocks) == 3 - assert code_blocks_are_sorted(blocks) is False +def test_synthetic_plus_marker(parser): + """Should fail with error and not log the synthetic""" + parser.read_lines( + [ + "// SYNTHETIC: HEY 0x555", + "// FUNCTION: HOWDY 0x1234", + ] + ) + assert len(parser.functions) == 0 + assert len(parser.alerts) == 1 + assert parser.alerts[0].code == ParserError.INCOMPATIBLE_MARKER -def test_bad_file(): - with sample_file("poorly_formatted.cpp") as f: - blocks = find_code_blocks(f) +def test_different_markers_different_module(parser): + """Does it make any sense for a function to be a stub in one module, + but not in another? I don't know. But it's no problem for us.""" + parser.read_lines( + [ + "// FUNCTION: HOWDY 0x1234", + "// STUB: SUP 0x5555", + "void interesting_function() {", + "}", + ] + ) - assert len(blocks) == 3 + assert len(parser.alerts) == 0 + assert len(parser.functions) == 2 -def test_indented(): - """Offsets for functions inside of a class will probably be indented.""" - with sample_file("basic_class.cpp") as f: - blocks = find_code_blocks(f) +def test_different_markers_same_module(parser): + """Now, if something is a regular function but then a stub, + what do we say about that?""" + parser.read_lines( + [ + "// FUNCTION: HOWDY 0x1234", + "// STUB: HOWDY 0x5555", + "void interesting_function() {", + "}", + ] + ) - # TODO: We don't properly detect the end of these functions - # because the closing brace is indented. However... 
knowing where each - # function ends is less important (for now) than capturing - # all the functions that are there. + # Use first marker declaration, don't replace + assert len(parser.functions) == 1 + assert parser.functions[0].is_stub is False - assert len(blocks) == 2 - assert blocks[0].offset == int("0x12345678", 16) - assert blocks[0].start_line == 15 - # assert blocks[0].end_line == 18 - - assert blocks[1].offset == int("0xdeadbeef", 16) - assert blocks[1].start_line == 22 - # assert blocks[1].end_line == 24 + # Should alert to this + assert len(parser.alerts) == 1 + assert parser.alerts[0].code == ParserError.DUPLICATE_MODULE -def test_inline(): - with sample_file("inline.cpp") as f: - blocks = find_code_blocks(f) +def test_unexpected_synthetic(parser): + """FUNCTION then SYNTHETIC should fail to report either one""" + parser.read_lines( + [ + "// FUNCTION: HOWDY 0x1234", + "// SYNTHETIC: HOWDY 0x5555", + "void interesting_function() {", + "}", + ] + ) - assert len(blocks) == 2 - for block in blocks: - assert block.start_line is not None - assert block.start_line == block.end_line + assert parser.state == ReaderState.SEARCH + assert len(parser.functions) == 0 + assert len(parser.alerts) == 1 + assert parser.alerts[0].code == ParserError.INCOMPATIBLE_MARKER -def test_multiple_offsets(): - """If multiple offset marks appear before for a code block, take them - all but ensure module name (case-insensitive) is distinct. 
- Use first module occurrence in case of duplicates.""" - with sample_file("multiple_offsets.cpp") as f: - blocks = find_code_blocks(f) +@pytest.mark.skip(reason="not implemented yet") +def test_duplicate_offset(parser): + """Repeating the same module/offset in the same file is probably a typo""" + parser.read_lines( + [ + "// GLOBAL: HELLO 0x1234", + "int x = 1;", + "// GLOBAL: HELLO 0x1234", + "int y = 2;", + ] + ) - assert len(blocks) == 4 - assert blocks[0].module == "TEST" - assert blocks[0].start_line == 9 + assert len(parser.alerts) == 1 + assert parser.alerts[0].code == ParserError.DUPLICATE_OFFSET - assert blocks[1].module == "HELLO" - assert blocks[1].start_line == 9 - # Duplicate modules are ignored - assert blocks[2].start_line == 16 - assert blocks[2].offset == 0x2345 +def test_multiple_variables(parser): + """Theoretically the same global variable can appear in multiple modules""" + parser.read_lines( + [ + "// GLOBAL: HELLO 0x1234", + "// GLOBAL: WUZZUP 0x555", + "const char *g_greeting;", + ] + ) + assert len(parser.alerts) == 0 + assert len(parser.variables) == 2 - assert blocks[3].module == "TEST" - assert blocks[3].offset == 0x2002 + +def test_multiple_vtables(parser): + parser.read_lines( + [ + "// VTABLE: HELLO 0x1234", + "// VTABLE: TEST 0x5432", + "class MxString : public MxCore {", + ] + ) + assert len(parser.alerts) == 0 + assert len(parser.vtables) == 2 diff --git a/tools/isledecomp/tests/test_parser_samples.py b/tools/isledecomp/tests/test_parser_samples.py new file mode 100644 index 00000000..a045e3cc --- /dev/null +++ b/tools/isledecomp/tests/test_parser_samples.py @@ -0,0 +1,141 @@ +import os +import pytest +from typing import List, TextIO +from isledecomp.parser import DecompParser +from isledecomp.parser.node import ParserSymbol + +SAMPLE_DIR = os.path.join(os.path.dirname(__file__), "samples") + + +def sample_file(filename: str) -> TextIO: + """Wrapper for opening the samples from the directory that does not + depend on the cwd 
where we run the test""" + full_path = os.path.join(SAMPLE_DIR, filename) + return open(full_path, "r", encoding="utf-8") + + +def code_blocks_are_sorted(blocks: List[ParserSymbol]) -> bool: + """Helper to make this more idiomatic""" + just_offsets = [block.offset for block in blocks] + return just_offsets == sorted(just_offsets) + + +@pytest.fixture +def parser(): + return DecompParser() + + +# Tests are below # + + +def test_sanity(parser): + """Read a very basic file""" + with sample_file("basic_file.cpp") as f: + parser.read_lines(f) + + assert len(parser.functions) == 3 + assert code_blocks_are_sorted(parser.functions) is True + # n.b. The parser returns line numbers as 1-based + # Function starts when we see the opening curly brace + assert parser.functions[0].line_number == 8 + assert parser.functions[0].end_line == 10 + + +def test_oneline(parser): + """(Assuming clang-format permits this) This sample has a function + on a single line. This will test the end-of-function detection""" + with sample_file("oneline_function.cpp") as f: + parser.read_lines(f) + + assert len(parser.functions) == 2 + assert parser.functions[0].line_number == 5 + assert parser.functions[0].end_line == 5 + + +def test_missing_offset(parser): + """What if the function doesn't have an offset comment?""" + with sample_file("missing_offset.cpp") as f: + parser.read_lines(f) + + # TODO: For now, the function without the offset will just be ignored. + # Would be the same outcome if the comment was present but mangled and + # we failed to match it. We should detect these cases in the future. + assert len(parser.functions) == 1 + + +def test_jumbled_case(parser): + """The parser just reports what it sees. It is the responsibility of + the downstream tools to do something about a jumbled file. 
+ Just verify that we are reading it correctly.""" + with sample_file("out_of_order.cpp") as f: + parser.read_lines(f) + + assert len(parser.functions) == 3 + assert code_blocks_are_sorted(parser.functions) is False + + +def test_bad_file(parser): + with sample_file("poorly_formatted.cpp") as f: + parser.read_lines(f) + + assert len(parser.functions) == 3 + + +def test_indented(parser): + """Offsets for functions inside of a class will probably be indented.""" + with sample_file("basic_class.cpp") as f: + parser.read_lines(f) + + # TODO: We don't properly detect the end of these functions + # because the closing brace is indented. However... knowing where each + # function ends is less important (for now) than capturing + # all the functions that are there. + + assert len(parser.functions) == 2 + assert parser.functions[0].offset == int("0x12345678", 16) + assert parser.functions[0].line_number == 16 + # assert parser.functions[0].end_line == 19 + + assert parser.functions[1].offset == int("0xdeadbeef", 16) + assert parser.functions[1].line_number == 23 + # assert parser.functions[1].end_line == 25 + + +def test_inline(parser): + with sample_file("inline.cpp") as f: + parser.read_lines(f) + + assert len(parser.functions) == 2 + for fun in parser.functions: + assert fun.line_number is not None + assert fun.line_number == fun.end_line + + +def test_multiple_offsets(parser): + """If multiple offset marks appear before for a code block, take them + all but ensure module name (case-insensitive) is distinct. 
+ Use first module occurrence in case of duplicates.""" + with sample_file("multiple_offsets.cpp") as f: + parser.read_lines(f) + + assert len(parser.functions) == 4 + assert parser.functions[0].module == "TEST" + assert parser.functions[0].line_number == 9 + + assert parser.functions[1].module == "HELLO" + assert parser.functions[1].line_number == 9 + + # Duplicate modules are ignored + assert parser.functions[2].line_number == 16 + assert parser.functions[2].offset == 0x2345 + + assert parser.functions[3].module == "TEST" + assert parser.functions[3].offset == 0x2002 + + +def test_variables(parser): + with sample_file("global_variables.cpp") as f: + parser.read_lines(f) + + assert len(parser.functions) == 1 + assert len(parser.variables) == 2 diff --git a/tools/isledecomp/tests/test_parser_statechange.py b/tools/isledecomp/tests/test_parser_statechange.py new file mode 100644 index 00000000..62c19175 --- /dev/null +++ b/tools/isledecomp/tests/test_parser_statechange.py @@ -0,0 +1,150 @@ +import pytest +from isledecomp.parser.parser import ( + ReaderState as _rs, + DecompParser, +) +from isledecomp.parser.util import DecompMarker +from isledecomp.parser.error import ParserError as _pe + +# fmt: off +state_change_marker_cases = [ + (_rs.SEARCH, "FUNCTION", _rs.WANT_SIG, None), + (_rs.SEARCH, "GLOBAL", _rs.IN_GLOBAL, None), + (_rs.SEARCH, "STUB", _rs.WANT_SIG, None), + (_rs.SEARCH, "SYNTHETIC", _rs.IN_TEMPLATE, None), + (_rs.SEARCH, "TEMPLATE", _rs.IN_TEMPLATE, None), + (_rs.SEARCH, "VTABLE", _rs.IN_VTABLE, None), + + (_rs.WANT_SIG, "FUNCTION", _rs.WANT_SIG, None), + (_rs.WANT_SIG, "GLOBAL", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER), + (_rs.WANT_SIG, "STUB", _rs.WANT_SIG, None), + (_rs.WANT_SIG, "SYNTHETIC", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER), + (_rs.WANT_SIG, "TEMPLATE", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER), + (_rs.WANT_SIG, "VTABLE", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER), + + (_rs.IN_FUNC, "FUNCTION", _rs.WANT_SIG, _pe.MISSED_END_OF_FUNCTION), + (_rs.IN_FUNC, 
"GLOBAL", _rs.IN_FUNC_GLOBAL, None), + (_rs.IN_FUNC, "STUB", _rs.WANT_SIG, _pe.MISSED_END_OF_FUNCTION), + (_rs.IN_FUNC, "SYNTHETIC", _rs.IN_TEMPLATE, _pe.MISSED_END_OF_FUNCTION), + (_rs.IN_FUNC, "TEMPLATE", _rs.IN_TEMPLATE, _pe.MISSED_END_OF_FUNCTION), + (_rs.IN_FUNC, "VTABLE", _rs.IN_VTABLE, _pe.MISSED_END_OF_FUNCTION), + + (_rs.IN_TEMPLATE, "FUNCTION", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER), + (_rs.IN_TEMPLATE, "GLOBAL", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER), + (_rs.IN_TEMPLATE, "STUB", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER), + (_rs.IN_TEMPLATE, "SYNTHETIC", _rs.IN_TEMPLATE, None), + (_rs.IN_TEMPLATE, "TEMPLATE", _rs.IN_TEMPLATE, None), + (_rs.IN_TEMPLATE, "VTABLE", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER), + + (_rs.WANT_CURLY, "FUNCTION", _rs.SEARCH, _pe.UNEXPECTED_MARKER), + (_rs.WANT_CURLY, "GLOBAL", _rs.SEARCH, _pe.UNEXPECTED_MARKER), + (_rs.WANT_CURLY, "STUB", _rs.SEARCH, _pe.UNEXPECTED_MARKER), + (_rs.WANT_CURLY, "SYNTHETIC", _rs.SEARCH, _pe.UNEXPECTED_MARKER), + (_rs.WANT_CURLY, "TEMPLATE", _rs.SEARCH, _pe.UNEXPECTED_MARKER), + (_rs.WANT_CURLY, "VTABLE", _rs.SEARCH, _pe.UNEXPECTED_MARKER), + + (_rs.IN_GLOBAL, "FUNCTION", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER), + (_rs.IN_GLOBAL, "GLOBAL", _rs.IN_GLOBAL, None), + (_rs.IN_GLOBAL, "STUB", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER), + (_rs.IN_GLOBAL, "SYNTHETIC", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER), + (_rs.IN_GLOBAL, "TEMPLATE", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER), + (_rs.IN_GLOBAL, "VTABLE", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER), + + (_rs.IN_FUNC_GLOBAL, "FUNCTION", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER), + (_rs.IN_FUNC_GLOBAL, "GLOBAL", _rs.IN_FUNC_GLOBAL, None), + (_rs.IN_FUNC_GLOBAL, "STUB", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER), + (_rs.IN_FUNC_GLOBAL, "SYNTHETIC", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER), + (_rs.IN_FUNC_GLOBAL, "TEMPLATE", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER), + (_rs.IN_FUNC_GLOBAL, "VTABLE", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER), + + (_rs.IN_VTABLE, "FUNCTION", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER), + 
(_rs.IN_VTABLE, "GLOBAL", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER), + (_rs.IN_VTABLE, "STUB", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER), + (_rs.IN_VTABLE, "SYNTHETIC", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER), + (_rs.IN_VTABLE, "TEMPLATE", _rs.SEARCH, _pe.INCOMPATIBLE_MARKER), + (_rs.IN_VTABLE, "VTABLE", _rs.IN_VTABLE, None), +] +# fmt: on + + +@pytest.mark.parametrize( + "state, marker_type, new_state, expected_error", state_change_marker_cases +) +def test_state_change_by_marker( + state: _rs, marker_type: str, new_state: _rs, expected_error: None | _pe +): + p = DecompParser() + p.state = state + p._handle_marker(DecompMarker(marker_type, "TEST", 0x1234)) + assert p.state == new_state + + if expected_error is not None: + assert len(p.alerts) > 0 + assert p.alerts[0].code == expected_error + + +# Reading any of these lines should have no effect in ReaderState.SEARCH +search_lines_no_effect = [ + "", + "\t", + " ", + "int x = 0;", + "// Comment", + "/*", + "*/", + "/* Block comment */", + "{", + "}", +] + + +@pytest.mark.parametrize("line", search_lines_no_effect) +def test_state_search_line(line: str): + p = DecompParser() + p.read_line(line) + assert p.state == _rs.SEARCH + assert len(p.alerts) == 0 + + +global_lines = [ + ("// A comment", _rs.IN_GLOBAL), + ("", _rs.IN_GLOBAL), + ("\t", _rs.IN_GLOBAL), + (" ", _rs.IN_GLOBAL), + # TODO: no check for "likely" variable declaration so these all count + ("void function()", _rs.SEARCH), + ("int x = 123;", _rs.SEARCH), + ("just some text", _rs.SEARCH), +] + + +@pytest.mark.parametrize("line, new_state", global_lines) +def test_state_global_line(line: str, new_state: _rs): + p = DecompParser() + p.read_line("// GLOBAL: TEST 0x1234") + assert p.state == _rs.IN_GLOBAL + p.read_line(line) + assert p.state == new_state + + +# mostly same as above +in_func_global_lines = [ + ("// A comment", _rs.IN_FUNC_GLOBAL), + ("", _rs.IN_FUNC_GLOBAL), + ("\t", _rs.IN_FUNC_GLOBAL), + (" ", _rs.IN_FUNC_GLOBAL), + # TODO: no check for "likely" variable 
declaration so these all count + ("void function()", _rs.IN_FUNC), + ("int x = 123;", _rs.IN_FUNC), + ("just some text", _rs.IN_FUNC), +] + + +@pytest.mark.parametrize("line, new_state", in_func_global_lines) +def test_state_in_func_global_line(line: str, new_state: _rs): + p = DecompParser() + p.state = _rs.IN_FUNC + p.read_line("// GLOBAL: TEST 0x1234") + assert p.state == _rs.IN_FUNC_GLOBAL + p.read_line(line) + assert p.state == new_state diff --git a/tools/isledecomp/tests/test_parser_util.py b/tools/isledecomp/tests/test_parser_util.py index 91fd285b..bbbf2d3a 100644 --- a/tools/isledecomp/tests/test_parser_util.py +++ b/tools/isledecomp/tests/test_parser_util.py @@ -1,11 +1,12 @@ from collections import namedtuple from typing import List import pytest +from isledecomp.parser.parser import MarkerDict from isledecomp.parser.util import ( + DecompMarker, is_blank_or_comment, - match_offset_comment, - is_exact_offset_comment, - distinct_by_module, + match_marker, + is_marker_exact, ) @@ -28,76 +29,72 @@ def test_is_blank_or_comment(line: str, expected: bool): assert is_blank_or_comment(line) is expected -offset_comment_samples = [ +marker_samples = [ # (can_parse: bool, exact_match: bool, line: str) - # Should match both expected modules with optional STUB marker - (True, True, "// OFFSET: LEGO1 0xdeadbeef"), - (True, True, "// OFFSET: LEGO1 0xdeadbeef STUB"), - (True, True, "// OFFSET: ISLE 0x12345678"), - (True, True, "// OFFSET: ISLE 0x12345678 STUB"), + (True, True, "// FUNCTION: LEGO1 0xdeadbeef"), + (True, True, "// FUNCTION: ISLE 0x12345678"), # No trailing spaces allowed - (True, False, "// OFFSET: LEGO1 0xdeadbeef "), - (True, False, "// OFFSET: LEGO1 0xdeadbeef STUB "), + (True, False, "// FUNCTION: LEGO1 0xdeadbeef "), # Must have exactly one space between elements - (True, False, "//OFFSET: ISLE 0xdeadbeef"), - (True, False, "// OFFSET:ISLE 0xdeadbeef"), - (True, False, "// OFFSET: ISLE 0xdeadbeef"), - (True, False, "// OFFSET: ISLE 0xdeadbeef"), - 
(True, False, "// OFFSET: ISLE 0xdeadbeef"), - (True, False, "// OFFSET: ISLE 0xdeadbeef STUB"), + (True, False, "//FUNCTION: ISLE 0xdeadbeef"), + (True, False, "// FUNCTION:ISLE 0xdeadbeef"), + (True, False, "// FUNCTION: ISLE 0xdeadbeef"), + (True, False, "// FUNCTION: ISLE 0xdeadbeef"), + (True, False, "// FUNCTION: ISLE 0xdeadbeef"), # Must have 0x prefix for hex number - (True, False, "// OFFSET: ISLE deadbeef"), + (True, False, "// FUNCTION: ISLE deadbeef"), # Offset, module name, and STUB must be uppercase - (True, False, "// offset: ISLE 0xdeadbeef"), - (True, False, "// offset: isle 0xdeadbeef"), - (True, False, "// OFFSET: LEGO1 0xdeadbeef stub"), + (True, False, "// function: ISLE 0xdeadbeef"), + (True, False, "// function: isle 0xdeadbeef"), # Hex string must be lowercase - (True, False, "// OFFSET: ISLE 0xDEADBEEF"), + (True, False, "// FUNCTION: ISLE 0xDEADBEEF"), # TODO: How flexible should we be with matching the module name? - (True, True, "// OFFSET: OMNI 0x12345678"), - (True, True, "// OFFSET: LEG01 0x12345678"), - (True, False, "// OFFSET: hello 0x12345678"), + (True, True, "// FUNCTION: OMNI 0x12345678"), + (True, True, "// FUNCTION: LEG01 0x12345678"), + (True, False, "// FUNCTION: hello 0x12345678"), # Not close enough to match - (False, False, "// OFFSET: ISLE0x12345678"), - (False, False, "// OFFSET: 0x12345678"), + (False, False, "// FUNCTION: ISLE0x12345678"), + (False, False, "// FUNCTION: 0x12345678"), (False, False, "// LEGO1: 0x12345678"), # Hex string shorter than 8 characters - (True, True, "// OFFSET: LEGO1 0x1234"), + (True, True, "// FUNCTION: LEGO1 0x1234"), # TODO: These match but shouldn't. 
- # (False, False, '// OFFSET: LEGO1 0'), - # (False, False, '// OFFSET: LEGO1 0x'), + # (False, False, '// FUNCTION: LEGO1 0'), + # (False, False, '// FUNCTION: LEGO1 0x'), ] -@pytest.mark.parametrize("match, _, line", offset_comment_samples) -def test_offset_match(line: str, match: bool, _): - did_match = match_offset_comment(line) is not None +@pytest.mark.parametrize("match, _, line", marker_samples) +def test_marker_match(line: str, match: bool, _): + did_match = match_marker(line) is not None assert did_match is match -@pytest.mark.parametrize("_, exact, line", offset_comment_samples) -def test_exact_offset_comment(line: str, exact: bool, _): - assert is_exact_offset_comment(line) is exact +@pytest.mark.parametrize("_, exact, line", marker_samples) +def test_marker_exact(line: str, exact: bool, _): + assert is_marker_exact(line) is exact -# Helper for the next test: cut down version of OffsetMatch -MiniOfs = namedtuple("MiniOfs", ["module", "value"]) - -distinct_by_module_samples = [ - # empty set - ([], []), - # same module name - ([MiniOfs("TEST", 123), MiniOfs("TEST", 555)], [MiniOfs("TEST", 123)]), - # same module name, case-insensitive - ([MiniOfs("test", 123), MiniOfs("TEST", 555)], [MiniOfs("test", 123)]), - # duplicates, non-consecutive - ( - [MiniOfs("test", 123), MiniOfs("abc", 111), MiniOfs("TEST", 555)], - [MiniOfs("test", 123), MiniOfs("abc", 111)], - ), -] +def test_marker_dict_simple(): + d = MarkerDict() + d.insert(DecompMarker("FUNCTION", "TEST", 0x1234)) + markers = list(d.iter()) + assert len(markers) == 1 -@pytest.mark.parametrize("sample, expected", distinct_by_module_samples) -def test_distinct_by_module(sample: List[MiniOfs], expected: List[MiniOfs]): - assert distinct_by_module(sample) == expected +def test_marker_dict_ofs_replace(): + d = MarkerDict() + d.insert(DecompMarker("FUNCTION", "TEST", 0x1234)) + d.insert(DecompMarker("FUNCTION", "TEST", 0x555)) + markers = list(d.iter()) + assert len(markers) == 1 + assert markers[0].offset 
== 0x1234 + + +def test_marker_dict_type_replace(): + d = MarkerDict() + d.insert(DecompMarker("FUNCTION", "TEST", 0x1234)) + d.insert(DecompMarker("STUB", "TEST", 0x1234)) + markers = list(d.iter()) + assert len(markers) == 1 + assert markers[0].type == "FUNCTION" diff --git a/tools/reccmp/reccmp.py b/tools/reccmp/reccmp.py index 02c16029..1d4cd5b1 100755 --- a/tools/reccmp/reccmp.py +++ b/tools/reccmp/reccmp.py @@ -10,7 +10,7 @@ import re from isledecomp import ( Bin, - find_code_blocks, + DecompParser, get_file_in_script_dir, OffsetPlaceholderGenerator, print_diff, @@ -313,18 +313,20 @@ if __name__ == "__main__": # Generate basename of original file, used in locating OFFSET lines basename = os.path.basename(os.path.splitext(original)[0]) + parser = DecompParser() for srcfilename in walk_source_dir(source): + parser.reset() with open(srcfilename, "r", encoding="utf-8") as srcfile: - blocks = find_code_blocks(srcfile) + parser.read_lines(srcfile) - for block in blocks: - if block.is_stub: + for fun in parser.functions: + if fun.is_stub: continue - if block.module != basename: + if fun.module != basename: continue - addr = block.offset + addr = fun.offset # Verbose flag handling if verbose: if addr == verbose: @@ -332,13 +334,13 @@ if __name__ == "__main__": else: continue - if block.is_template: - recinfo = syminfo.get_recompiled_address_from_name(block.signature) + if fun.is_template: + recinfo = syminfo.get_recompiled_address_from_name(fun.name) if not recinfo: continue else: recinfo = syminfo.get_recompiled_address( - srcfilename, block.start_line + srcfilename, fun.line_number ) if not recinfo: continue