(Proposal) Adjustments to "decomp" language (#308)

* Adjustments to "decomp" language

* Fix a comment

* Fix accidental clang-formatting

* Fix order

* Fix order

* Remove junk

* Fix OFFSET

* Adjustments based on new suggestions

* Annotate globals

* Globals in ISLE

* More globals

* Merge from parser2 branch

* Allow prepending space for exact marker match

* To eliminate noise, require the 0x prefix on offset for marker match

* fix test from previous

* Count tab stops for indented functions to reduce MISSED_END_OF_FUNCTION noise

* FUNCTION to SYNTHETIC where needed

* Missed marker conversion on SetAtomId

* pylint cleanup, remove unused code

* Fix unexpected function end, add more unit tests

* Be more strict about synthetic name syntax

* Revert "Missed marker conversion on SetAtomId"

This reverts commit d87d665127.

* Revert "FUNCTION to SYNTHETIC where needed"

This reverts commit 8c815418d2.

* Implicit lookup by name for functions

* Fix VTABLE SYNTHETIC and other decomp markers

* Get vtable class name

* Vtable marker should identify struct

* No colon for SIZE comment

* Update README.md

* Update README.md

* Update CONTRIBUTING.md

* Update README.md

* Update README.md

* Update CONTRIBUTING.md

* Update README.md

* Update CONTRIBUTING.md

* Fix destructor/annotation

* Update README.md

* Update README.md

* Update README.md

* Update README.md

* Update README.md

---------

Co-authored-by: disinvite <disinvite@users.noreply.github.com>
This commit is contained in:
Christian Semmler
2023-12-06 07:10:45 -05:00
committed by GitHub
parent 4f5b70013f
commit 494a556f8e
407 changed files with 3505 additions and 2493 deletions

View File

@@ -1 +1 @@
from .parser import find_code_blocks
from .parser import DecompParser

View File

@@ -0,0 +1,41 @@
from enum import Enum
class ParserError(Enum):
    """Codes describing problems found while parsing decomp markers.

    As the per-member comments state, values in the 100s are warnings
    (WARN) and values in the 200s are errors (ERROR).
    """

    # WARN: Stub function exceeds some line number threshold
    UNLIKELY_STUB = 100
    # WARN: Decomp marker is close enough to be recognized, but does not follow syntax exactly
    BAD_DECOMP_MARKER = 101
    # WARN: Multiple markers in sequence do not have distinct modules
    DUPLICATE_MODULE = 102
    # WARN: Detected a duplicate module/offset pair in the current file
    DUPLICATE_OFFSET = 103
    # WARN: We read a line that matches the decomp marker pattern, but we are not set up
    # to handle it
    BOGUS_MARKER = 104
    # WARN: New function marker appeared while we were inside a function
    MISSED_END_OF_FUNCTION = 105
    # WARN: If we find a curly brace right after the function declaration
    # this is wrong but we still have enough to make a match with reccmp
    MISSED_START_OF_FUNCTION = 106
    # WARN: A blank line appeared between the end of FUNCTION markers
    # and the start of the function. We can ignore it, but the line shouldn't be there
    UNEXPECTED_BLANK_LINE = 107
    # ERROR: We found a marker unexpectedly
    UNEXPECTED_MARKER = 200
    # ERROR: We found a marker where we expected to find one, but it is incompatible
    # with the preceding markers.
    # For example, a GLOBAL cannot follow FUNCTION/STUB
    INCOMPATIBLE_MARKER = 201
    # ERROR: The line following a synthetic marker was not a comment
    BAD_SYNTHETIC = 202

View File

@@ -0,0 +1,41 @@
from dataclasses import dataclass
@dataclass
class ParserNode:
    """Base record for parser output; every node carries a source line number."""

    # Line of the source file this node refers to.
    line_number: int
@dataclass
class ParserAlert(ParserNode):
    """A warning or error reported for a particular source line."""

    # NOTE(review): annotated as int, but DecompParser._syntax_warning passes
    # the ParserError enum member it receives straight through -- confirm which
    # type consumers expect.
    code: int
    # Text of the offending line (the parser stores it stripped of whitespace).
    line: str
@dataclass
class ParserSymbol(ParserNode):
    """Base record for anything annotated with a module/offset decomp marker."""

    # Module name taken from the marker.
    module: str
    # Offset taken from the marker, already converted from hex text to an int.
    offset: int
@dataclass
class ParserFunction(ParserSymbol):
    """A function located by the parser, one record per marker that precedes it."""

    # Function signature, or the name read from a comment line for
    # lookup-by-name entries.
    name: str
    # True when the function is identified by name rather than by its code.
    lookup_by_name: bool = False
    # Flags derived from the marker type (STUB / SYNTHETIC or TEMPLATE / TEMPLATE).
    is_stub: bool = False
    is_synthetic: bool = False
    is_template: bool = False
    # Last line of the function; -1 means "not set".
    end_line: int = -1
@dataclass
class ParserVariable(ParserSymbol):
    """A variable annotated with a GLOBAL marker."""

    # The parser stores the stripped source line that ends the marker run here.
    name: str
    # Not populated by the parser code visible alongside this class; the
    # defaults below are what consumers will see -- TODO confirm intended use.
    size: int = -1
    is_static: bool = False
@dataclass
class ParserVtable(ParserSymbol):
    """A vtable annotated with a VTABLE marker."""

    # Name of the class/struct that the vtable belongs to.
    class_name: str
    # Number of vtable entries; -1 means "not set".
    num_entries: int = -1

View File

@@ -1,145 +1,394 @@
# C++ file parser
from typing import List, TextIO
from typing import List, Iterable, Iterator
from enum import Enum
from .util import (
CodeBlock,
OffsetMatch,
DecompMarker,
is_blank_or_comment,
match_offset_comment,
get_template_function_name,
match_marker,
is_marker_exact,
get_class_name,
get_synthetic_name,
remove_trailing_comment,
distinct_by_module,
)
from .node import (
ParserAlert,
ParserFunction,
ParserVariable,
ParserVtable,
)
from .error import ParserError
class ReaderState(Enum):
WANT_OFFSET = 0
SEARCH = 0
WANT_SIG = 1
IN_FUNC = 2
IN_TEMPLATE = 3
WANT_CURLY = 4
FUNCTION_DONE = 5
IN_GLOBAL = 5
IN_FUNC_GLOBAL = 6
IN_VTABLE = 7
def find_code_blocks(stream: TextIO) -> List[CodeBlock]:
"""Read the IO stream (file) line-by-line and give the following report:
Foreach code block (function) in the file, what are its starting and
ending line numbers, and what is the given offset in the original
binary. We expect the result to be ordered by line number because we
are reading the file from start to finish."""
def marker_is_stub(marker: DecompMarker) -> bool:
    """Tell whether the marker's type is STUB, ignoring letter case."""
    marker_kind = marker.type.upper()
    return marker_kind == "STUB"
blocks: List[CodeBlock] = []
offset_matches: List[OffsetMatch] = []
def marker_is_variable(marker: DecompMarker) -> bool:
    """Tell whether the marker's type is GLOBAL, ignoring letter case."""
    marker_kind = marker.type.upper()
    return marker_kind == "GLOBAL"
function_sig = None
start_line = None
end_line = None
state = ReaderState.WANT_OFFSET
# 1-based to match cvdump and your text editor
# I know it says 0, but we will increment before each readline()
line_no = 0
can_seek = True
def marker_is_synthetic(marker: DecompMarker) -> bool:
    """Tell whether the marker's type is SYNTHETIC or TEMPLATE, ignoring letter case."""
    marker_kind = marker.type.upper()
    return marker_kind == "SYNTHETIC" or marker_kind == "TEMPLATE"
while True:
# Do this before reading again so that an EOF will not
# cause us to miss the last function of the file.
if state == ReaderState.FUNCTION_DONE:
# Our list of offset marks could have duplicates on
# module name, so we'll eliminate those now.
for offset_match in distinct_by_module(offset_matches):
block = CodeBlock(
offset=offset_match.address,
signature=function_sig,
start_line=start_line,
def marker_is_template(marker: DecompMarker) -> bool:
    """Tell whether the marker's type is TEMPLATE, ignoring letter case."""
    marker_kind = marker.type.upper()
    return marker_kind == "TEMPLATE"
def marker_is_function(marker: DecompMarker) -> bool:
    """Tell whether the marker's type is FUNCTION or STUB, ignoring letter case."""
    marker_kind = marker.type.upper()
    return marker_kind == "FUNCTION" or marker_kind == "STUB"
def marker_is_vtable(marker: DecompMarker) -> bool:
    """Tell whether the marker's type is VTABLE, ignoring letter case."""
    marker_kind = marker.type.upper()
    return marker_kind == "VTABLE"
class MarkerDict:
    """Holds at most one decomp marker per module name.

    Module names are upper-cased before being used as keys, so two markers
    whose modules differ only by letter case count as the same module.
    """

    def __init__(self):
        self.markers: dict = {}

    def insert(self, marker: DecompMarker) -> bool:
        """Store the marker unless its module already has an entry.

        Returns True when an entry for the module already existed (the
        existing entry is kept), and False when the marker was stored.
        """
        key = marker.module.upper()
        already_present = key in self.markers
        if not already_present:
            self.markers[key] = (marker.type, marker.offset)
        return already_present

    def iter(self) -> Iterator[DecompMarker]:
        """Yield a DecompMarker for each stored entry, using the upper-cased module key."""
        for key, entry in self.markers.items():
            kind, address = entry
            yield DecompMarker(kind, key, address)

    def empty(self):
        """Forget all stored markers by rebinding to a fresh dict."""
        self.markers = {}
class DecompParser:
# pylint: disable=too-many-instance-attributes
# Could combine output lists into a single list to get under the limit,
# but not right now
def __init__(self):
    """Create a parser with empty results, positioned before the first line."""
    # Output collections, filled in while lines are read.
    self.functions: List[ParserFunction] = []
    self.vtables: List[ParserVtable] = []
    self.variables: List[ParserVariable] = []
    self.alerts: List[ParserAlert] = []

    # Reading position and state-machine bookkeeping.
    self.line_number: int = 0
    self.state: ReaderState = ReaderState.SEARCH
    self.last_line: str = ""

    # Code can be shared between modules, so several compatible markers
    # may appear in a row; each kind is collected in its own MarkerDict
    # until the thing they annotate has been read.
    self.fun_markers = MarkerDict()
    self.var_markers = MarkerDict()
    self.tbl_markers = MarkerDict()

    # Column of the opening curly brace of the current function. Functions
    # defined inside a class declaration are indented as a whole, so the
    # closing brace is recognized by finding it at this same column rather
    # than by counting braces on every line.
    self.curly_indent_stops: int = 0

    # Line where the current (non-synthetic) function begins, i.e. where
    # the opening brace was seen, together with its signature. Both are
    # needed once the end of the function is reached.
    self.function_start: int = 0
    self.function_sig: str = ""
def reset(self):
    """Discard all results and return to the freshly constructed state."""
    self.functions, self.vtables = [], []
    self.variables, self.alerts = [], []

    self.line_number = 0
    self.state = ReaderState.SEARCH
    self.last_line = ""

    # Drop any markers still waiting for the code they annotate.
    for pending in (self.fun_markers, self.var_markers, self.tbl_markers):
        pending.empty()

    self.curly_indent_stops = 0
    self.function_start = 0
    self.function_sig = ""
def _recover(self):
    """After a syntax error, go back to SEARCH and drop all pending markers."""
    self.state = ReaderState.SEARCH
    for pending in (self.fun_markers, self.var_markers, self.tbl_markers):
        pending.empty()
def _syntax_warning(self, code):
    """Record an alert with the given code for the line currently being read.

    The parser state is left untouched.
    """
    alert = ParserAlert(
        line_number=self.line_number,
        code=code,
        line=self.last_line.strip(),
    )
    self.alerts.append(alert)
def _syntax_error(self, code):
    """Record an alert for the current line, then reset the temporary parse state."""
    # Order matters only in that the alert is logged before recovery runs.
    self._syntax_warning(code)
    self._recover()
def _function_starts_here(self):
self.function_start = self.line_number
def _function_marker(self, marker: DecompMarker):
    """Queue a FUNCTION/STUB marker and wait for the function signature.

    A marker whose module is already queued is not stored again; a
    DUPLICATE_MODULE warning is recorded instead.
    """
    module_already_seen = self.fun_markers.insert(marker)
    if module_already_seen:
        self._syntax_warning(ParserError.DUPLICATE_MODULE)
    self.state = ReaderState.WANT_SIG
def _synthetic_marker(self, marker: DecompMarker):
    """Queue a SYNTHETIC/TEMPLATE marker and wait for the name comment.

    A marker whose module is already queued is not stored again; a
    DUPLICATE_MODULE warning is recorded instead.
    """
    module_already_seen = self.fun_markers.insert(marker)
    if module_already_seen:
        self._syntax_warning(ParserError.DUPLICATE_MODULE)
    self.state = ReaderState.IN_TEMPLATE
def _function_done(self, lookup_by_name: bool = False, unexpected: bool = False):
end_line = self.line_number
if unexpected:
# If we missed the end of the previous function, assume it ended
# on the previous line and that whatever we are tracking next
# begins on the current line.
end_line -= 1
for marker in self.fun_markers.iter():
self.functions.append(
ParserFunction(
line_number=self.function_start,
module=marker.module,
offset=marker.offset,
lookup_by_name=lookup_by_name,
is_stub=marker_is_stub(marker),
is_synthetic=marker_is_synthetic(marker),
is_template=marker_is_template(marker),
name=self.function_sig,
end_line=end_line,
offset_comment=offset_match.comment,
module=offset_match.module,
is_template=offset_match.is_template,
is_stub=offset_match.is_stub,
)
blocks.append(block)
offset_matches = []
state = ReaderState.WANT_OFFSET
)
if can_seek:
line_no += 1
line = stream.readline()
if line == "":
break
self.fun_markers.empty()
self.curly_indent_stops = 0
self.state = ReaderState.SEARCH
new_match = match_offset_comment(line)
if new_match is not None:
# We will allow multiple offsets if we have just begun
# the code block, but not after we hit the curly brace.
if state in (
ReaderState.WANT_OFFSET,
ReaderState.IN_TEMPLATE,
def _vtable_marker(self, marker: DecompMarker):
    """Queue a VTABLE marker and wait for the class declaration.

    A marker whose module is already queued is not stored again; a
    DUPLICATE_MODULE warning is recorded instead.
    """
    module_already_seen = self.tbl_markers.insert(marker)
    if module_already_seen:
        self._syntax_warning(ParserError.DUPLICATE_MODULE)
    self.state = ReaderState.IN_VTABLE
def _vtable_done(self, class_name: str = None):
    """Emit one ParserVtable per queued VTABLE marker and return to SEARCH.

    When no class name is supplied, the stripped text of the current line
    is used as a best-effort substitute.
    """
    resolved_name = self.last_line.strip() if class_name is None else class_name

    self.vtables.extend(
        ParserVtable(
            line_number=self.line_number,
            module=queued.module,
            offset=queued.offset,
            class_name=resolved_name,
        )
        for queued in self.tbl_markers.iter()
    )

    self.tbl_markers.empty()
    self.state = ReaderState.SEARCH
def _variable_marker(self, marker: DecompMarker):
    """Queue a GLOBAL marker, remembering whether it appeared inside a function.

    A marker whose module is already queued is not stored again; a
    DUPLICATE_MODULE warning is recorded instead.
    """
    module_already_seen = self.var_markers.insert(marker)
    if module_already_seen:
        self._syntax_warning(ParserError.DUPLICATE_MODULE)

    # Inside a function body we must come back to IN_FUNC afterwards,
    # so that case gets its own state.
    inside_function = self.state in (
        ReaderState.IN_FUNC,
        ReaderState.IN_FUNC_GLOBAL,
    )
    self.state = (
        ReaderState.IN_FUNC_GLOBAL if inside_function else ReaderState.IN_GLOBAL
    )
def _variable_done(self):
    """Emit one ParserVariable per queued GLOBAL marker, named after the current line.

    Afterwards the parser resumes the enclosing function if the markers
    were found inside one, and otherwise returns to SEARCH.
    """
    self.variables.extend(
        ParserVariable(
            line_number=self.line_number,
            module=queued.module,
            offset=queued.offset,
            name=self.last_line.strip(),
        )
        for queued in self.var_markers.iter()
    )
    self.var_markers.empty()

    was_inside_function = self.state == ReaderState.IN_FUNC_GLOBAL
    self.state = ReaderState.IN_FUNC if was_inside_function else ReaderState.SEARCH
def _handle_marker(self, marker: DecompMarker):
# Cannot handle any markers between function sig and opening curly brace
if self.state == ReaderState.WANT_CURLY:
self._syntax_error(ParserError.UNEXPECTED_MARKER)
return
# TODO: How uncertain are we of detecting the end of a function
# in a clang-formatted file? For now we assume we have missed the
# end if we detect a non-GLOBAL marker while state is IN_FUNC.
# Maybe these cases should be syntax errors instead
if marker_is_function(marker):
if self.state in (
ReaderState.SEARCH,
ReaderState.WANT_SIG,
):
# If we detected an offset marker unexpectedly,
# we are handling it here so we can continue seeking.
can_seek = True
offset_matches.append(new_match)
if new_match.is_template:
state = ReaderState.IN_TEMPLATE
else:
state = ReaderState.WANT_SIG
else:
# We will allow multiple offsets if we have just begun
# the code block, but not after we hit the curly brace.
self._function_marker(marker)
elif self.state == ReaderState.IN_FUNC:
# We hit another offset unexpectedly.
# We can recover easily by just ending the function here.
end_line = line_no - 1
state = ReaderState.FUNCTION_DONE
self._syntax_warning(ParserError.MISSED_END_OF_FUNCTION)
self._function_done(unexpected=True)
# Pause reading here so we handle the offset marker
# on the next loop iteration
can_seek = False
# Start the next function right after so we can
# read the next line.
self._function_marker(marker)
else:
self._syntax_error(ParserError.INCOMPATIBLE_MARKER)
elif state == ReaderState.IN_TEMPLATE:
elif marker_is_synthetic(marker):
if self.state in (ReaderState.SEARCH, ReaderState.IN_TEMPLATE):
self._synthetic_marker(marker)
elif self.state == ReaderState.IN_FUNC:
self._syntax_warning(ParserError.MISSED_END_OF_FUNCTION)
self._function_done(lookup_by_name=True, unexpected=True)
self._synthetic_marker(marker)
else:
self._syntax_error(ParserError.INCOMPATIBLE_MARKER)
elif marker_is_variable(marker):
if self.state in (
ReaderState.SEARCH,
ReaderState.IN_GLOBAL,
ReaderState.IN_FUNC,
ReaderState.IN_FUNC_GLOBAL,
):
self._variable_marker(marker)
else:
self._syntax_error(ParserError.INCOMPATIBLE_MARKER)
elif marker_is_vtable(marker):
if self.state in (ReaderState.SEARCH, ReaderState.IN_VTABLE):
self._vtable_marker(marker)
elif self.state == ReaderState.IN_FUNC:
self._syntax_warning(ParserError.MISSED_END_OF_FUNCTION)
self._function_done(unexpected=True)
self._vtable_marker(marker)
else:
self._syntax_error(ParserError.INCOMPATIBLE_MARKER)
else:
self._syntax_warning(ParserError.BOGUS_MARKER)
def read_line(self, line: str):
    """Advance the parser by one line of source text.

    The line is first checked for a decomp marker; if one is found it is
    handed to _handle_marker (after a BAD_DECOMP_MARKER warning when the
    syntax is not exact) and nothing else happens for this line. Otherwise
    the line is interpreted according to the current reader state:

    * IN_TEMPLATE: the line must be a comment carrying the synthetic name.
    * WANT_SIG: the line is a blank (warning), a name comment, a lone
      opening brace, or the function signature.
    * WANT_CURLY: waits for the lone opening brace and records its column.
    * IN_FUNC: waits for the closing brace at the recorded column.
    * IN_GLOBAL / IN_FUNC_GLOBAL: the first line that is not blank or a
      comment completes the variable.
    * IN_VTABLE: the first line with a class/struct name completes the vtable.
    """
    self.last_line = line  # TODO: Useful or hack for error reporting?
    self.line_number += 1

    marker = match_marker(line)
    if marker is not None:
        # TODO: what's the best place for this?
        # Does it belong with reading or marker handling?
        if not is_marker_exact(self.last_line):
            self._syntax_warning(ParserError.BAD_DECOMP_MARKER)
        self._handle_marker(marker)
        return

    line_strip = line.strip()
    if self.state == ReaderState.IN_TEMPLATE:
        # TEMPLATE functions are a special case. The signature is
        # given on the next line (in a // comment)
        name = get_synthetic_name(line)
        if name is None:
            self._syntax_error(ParserError.BAD_SYNTHETIC)
        else:
            self.function_sig = name
            self._function_starts_here()
            self._function_done(lookup_by_name=True)

    elif self.state == ReaderState.WANT_SIG:
        # Ignore blanks on the way to function start or function name
        if len(line_strip) == 0:
            self._syntax_warning(ParserError.UNEXPECTED_BLANK_LINE)

        elif line_strip.startswith("//"):
            # If we found a comment, assume implicit lookup-by-name
            # function and end here. We know this is not a decomp marker
            # because it would have been handled already.
            self.function_sig = get_synthetic_name(line)
            self._function_starts_here()
            self._function_done(lookup_by_name=True)

        elif line_strip == "{":
            # We missed the function signature but we can recover from this
            self.function_sig = "(unknown)"
            self._function_starts_here()
            self._syntax_warning(ParserError.MISSED_START_OF_FUNCTION)
            self.state = ReaderState.IN_FUNC

        else:
            # Inline functions may end with a comment. Strip that out
            # to help parsing.
            self.function_sig = remove_trailing_comment(line_strip)

            # Now check to see if the opening curly bracket is on the
            # same line. clang-format should prevent this (BraceWrapping)
            # but it is easy to detect.
            # If the entire function is on one line, handle that too.
            if self.function_sig.endswith("{"):
                self._function_starts_here()
                self.state = ReaderState.IN_FUNC
            elif self.function_sig.endswith(("}", "};")):
                self._function_starts_here()
                self._function_done()
            else:
                self.state = ReaderState.WANT_CURLY

    elif self.state == ReaderState.WANT_CURLY:
        if line_strip == "{":
            self.curly_indent_stops = line.index("{")
            self._function_starts_here()
            self.state = ReaderState.IN_FUNC

    elif self.state == ReaderState.IN_FUNC:
        # The function ends at a line whose first non-blank character is a
        # closing brace sitting at the same column as the opening brace.
        # startswith() with a start offset is used instead of indexing so a
        # line shorter than that column is simply "not a match" rather than
        # an IndexError.
        if line_strip.startswith("}") and line.startswith(
            "}", self.curly_indent_stops
        ):
            self._function_done()

    elif self.state in (ReaderState.IN_GLOBAL, ReaderState.IN_FUNC_GLOBAL):
        if not is_blank_or_comment(line):
            self._variable_done()

    elif self.state == ReaderState.IN_VTABLE:
        vtable_class = get_class_name(line)
        if vtable_class is not None:
            self._vtable_done(class_name=vtable_class)
def read_lines(self, lines: Iterable):
for line in lines:
self.read_line(line)

View File

@@ -1,44 +1,17 @@
# C++ Parser utility functions and data structures
from __future__ import annotations # python <3.10 compatibility
import re
from typing import List
from collections import namedtuple
DecompMarker = namedtuple("DecompMarker", ["type", "module", "offset"])
CodeBlock = namedtuple(
"CodeBlock",
[
"offset",
"signature",
"start_line",
"end_line",
"offset_comment",
"module",
"is_template",
"is_stub",
],
)
OffsetMatch = namedtuple(
"OffsetMatch", ["module", "address", "is_template", "is_stub", "comment"]
)
# This has not been formally established, but considering that "STUB"
# is a temporary state for a function, we assume it will appear last,
# after any other modifiers (i.e. TEMPLATE)
# To match a reasonable variance of formatting for the offset comment
offsetCommentRegex = re.compile(
r"\s*//\s*OFFSET:\s*(\w+)\s+(?:0x)?([a-f0-9]+)(\s+TEMPLATE)?(\s+STUB)?", # nopep8
markerRegex = re.compile(
r"\s*//\s*(\w+):\s*(\w+)\s+(0x[a-f0-9]+)",
flags=re.I,
)
# To match the exact syntax (text upper case, hex lower case, with spaces)
# that is used in most places
offsetCommentExactRegex = re.compile(
r"^// OFFSET: [A-Z0-9]+ (0x[a-f0-9]+)( TEMPLATE)?( STUB)?$"
) # nopep8
markerExactRegex = re.compile(r"\s*// ([A-Z]+): ([A-Z0-9]+) (0x[a-f0-9]+)$")
# The goal here is to just read whatever is on the next line, so some
# flexibility in the formatting seems OK
@@ -50,15 +23,15 @@ templateCommentRegex = re.compile(r"\s*//\s+(.*)")
trailingCommentRegex = re.compile(r"(\s*(?://|/\*).*)$")
def get_template_function_name(line: str) -> str:
"""Parse function signature for special TEMPLATE functions"""
def get_synthetic_name(line: str) -> str | None:
"""Synthetic names appear on a single line comment on the line after the marker.
If that's not what we have, return None"""
template_match = templateCommentRegex.match(line)
# If we don't match, you get whatever is on the line as the signature
if template_match is not None:
return template_match.group(1)
return line
return None
def remove_trailing_comment(line: str) -> str:
@@ -78,39 +51,45 @@ def is_blank_or_comment(line: str) -> bool:
)
def is_exact_offset_comment(line: str) -> bool:
"""If the offset comment does not match our (unofficial) syntax
we may want to alert the user to fix it for style points."""
return offsetCommentExactRegex.match(line) is not None
def match_offset_comment(line: str) -> OffsetMatch | None:
match = offsetCommentRegex.match(line)
def match_marker(line: str) -> DecompMarker | None:
match = markerRegex.match(line)
if match is None:
return None
return OffsetMatch(
module=match.group(1),
address=int(match.group(2), 16),
is_template=match.group(3) is not None,
is_stub=match.group(4) is not None,
comment=line.strip(),
return DecompMarker(
type=match.group(1), module=match.group(2), offset=int(match.group(3), 16)
)
def distinct_by_module(offsets: List) -> List:
"""Given a list of offset markers, return a list with distinct
module names. If module names (case-insensitive) are repeated,
choose the offset that appears first."""
def is_marker_exact(line: str) -> bool:
    """Tell whether the line matches the strict marker pattern (markerExactRegex)."""
    exact_match = markerExactRegex.match(line)
    return exact_match is not None
if len(offsets) < 2:
return offsets
# Dict maintains insertion order in python >=3.7
offsets_dict = {}
for offset in offsets:
module_upper = offset.module.upper()
if module_upper not in offsets_dict:
offsets_dict[module_upper] = offset
template_class_decl_regex = re.compile(
r"\s*(?:\/\/)?\s*(?:class|struct) (\w+)<([\w]+)\s*(\*+)?\s*>"
)
return list(offsets_dict.values())
class_decl_regex = re.compile(r"\s*(?:\/\/)?\s*(?:class|struct) (\w+)")
def get_class_name(line: str) -> str | None:
    """Extract the class or struct name from the line that follows a VTABLE marker.

    The line may be a code line or a comment. Returns None when no
    class/struct declaration is recognized.
    """
    templated = template_class_decl_regex.match(line)
    if templated is None:
        # Not a template class: fall back to the plain declaration pattern.
        plain = class_decl_regex.match(line)
        return None if plain is None else plain.group(1)

    # Template class names are rebuilt to match the output of cvdump:
    # a pointer template type gets exactly one space before its asterisks.
    class_name, template_type, asterisks = templated.groups()
    pointer_suffix = "" if asterisks is None else f" {asterisks}"
    return f"{class_name}<{template_type}{pointer_suffix}>"