mirror of
https://github.com/isledecomp/isle.git
synced 2025-10-26 18:04:06 +00:00
(Proposal) Adjustments to "decomp" language (#308)
* Adjustments to "decomp" language * Fix a comment * Fix accidental clang-formatting * Fix order * Fix order * Remove junk * Fix OFFSET * Adjustments based on new suggestions * Annotate globals * Globals in ISLE * More globals * Merge from parser2 branch * Allow prepending space for exact marker match * To eliminate noise, require the 0x prefix on offset for marker match * fix test from previous * Count tab stops for indented functions to reduce MISSED_END_OF_FUNCTION noise * FUNCTION to SYNTHETIC where needed * Missed marker conversion on SetAtomId * pylint cleanup, remove unused code * Fix unexpected function end, add more unit tests * Be more strict about synthetic name syntax * Revert "Missed marker conversion on SetAtomId" This reverts commitd87d665127. * Revert "FUNCTION to SYNTHETIC where needed" This reverts commit8c815418d2. * Implicit lookup by name for functions * Fix VTABLE SYNTHETIC and other decomp markers * Get vtable class name * Vtable marker should identify struct * No colon for SIZE comment * Update README.md * Update README.md * Update CONTRIBUTING.md * Update README.md * Update README.md * Update CONTRIBUTING.md * Update README.md * Update CONTRIBUTING.md * Fix destructor/annotation * Update README.md * Update README.md * Update README.md * Update README.md * Update README.md --------- Co-authored-by: disinvite <disinvite@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
4f5b70013f
commit
494a556f8e
@@ -1 +1 @@
|
||||
from .parser import find_code_blocks
|
||||
from .parser import DecompParser
|
||||
|
||||
41
tools/isledecomp/isledecomp/parser/error.py
Normal file
41
tools/isledecomp/isledecomp/parser/error.py
Normal file
@@ -0,0 +1,41 @@
|
||||
from enum import Enum
|
||||
|
||||
|
||||
class ParserError(Enum):
    """Codes attached to alerts raised while parsing decomp markers.

    Values in the 100 range are warnings; values in the 200 range are errors.
    """

    # WARN: Stub function exceeds some line number threshold
    UNLIKELY_STUB = 100

    # WARN: Decomp marker is close enough to be recognized, but does not follow syntax exactly
    BAD_DECOMP_MARKER = 101

    # WARN: Multiple markers in sequence do not have distinct modules
    DUPLICATE_MODULE = 102

    # WARN: Detected a duplicate module/offset pair in the current file
    DUPLICATE_OFFSET = 103

    # WARN: We read a line that matches the decomp marker pattern, but we are not set up
    # to handle it
    BOGUS_MARKER = 104

    # WARN: New function marker appeared while we were inside a function
    MISSED_END_OF_FUNCTION = 105

    # WARN: If we find a curly brace right after the function declaration
    # this is wrong but we still have enough to make a match with reccmp
    MISSED_START_OF_FUNCTION = 106

    # WARN: A blank line appeared between the end of FUNCTION markers
    # and the start of the function. We can ignore it, but the line shouldn't be there
    UNEXPECTED_BLANK_LINE = 107

    # ERROR: We found a marker unexpectedly
    UNEXPECTED_MARKER = 200

    # ERROR: We found a marker where we expected to find one, but it is incompatible
    # with the preceding markers.
    # For example, a GLOBAL cannot follow FUNCTION/STUB
    INCOMPATIBLE_MARKER = 201

    # ERROR: The line following a synthetic marker was not a comment
    BAD_SYNTHETIC = 202
|
||||
41
tools/isledecomp/isledecomp/parser/node.py
Normal file
41
tools/isledecomp/isledecomp/parser/node.py
Normal file
@@ -0,0 +1,41 @@
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
class ParserNode:
    """Base record for everything the parser emits; carries a source line number."""

    # Line of the parsed file that this record refers to
    line_number: int
|
||||
|
||||
|
||||
@dataclass
class ParserAlert(ParserNode):
    """A warning or error raised while parsing, with the offending line text."""

    # Alert code.
    # NOTE(review): annotated as int, but DecompParser passes ParserError
    # members here; confirm which type consumers expect.
    code: int
    # Text of the line being read when the alert was raised
    line: str
|
||||
|
||||
|
||||
@dataclass
class ParserSymbol(ParserNode):
    """Base record for an annotated symbol: a module name plus an offset."""

    # Module name taken from the decomp marker
    module: str
    # Offset taken from the decomp marker (parsed from its hex value)
    offset: int
|
||||
|
||||
|
||||
@dataclass
class ParserFunction(ParserSymbol):
    """A function annotated by a FUNCTION, STUB, SYNTHETIC or TEMPLATE marker."""

    # Function signature, or the name given in the comment after the marker
    name: str
    # True when the function is identified by name rather than by its code
    lookup_by_name: bool = False
    # True when the marker type is STUB
    is_stub: bool = False
    # True when the marker type is SYNTHETIC or TEMPLATE
    is_synthetic: bool = False
    # True when the marker type is TEMPLATE
    is_template: bool = False
    # Last line of the function; -1 until it is known
    end_line: int = -1
|
||||
|
||||
|
||||
@dataclass
class ParserVariable(ParserSymbol):
    """A variable annotated by a GLOBAL marker."""

    # Text of the code line that follows the marker
    name: str
    # Defaults to -1; presumably the variable size once known (TODO confirm)
    size: int = -1
    # Defaults to False; presumably marks static variables (TODO confirm)
    is_static: bool = False
|
||||
|
||||
|
||||
@dataclass
class ParserVtable(ParserSymbol):
    """A vtable annotated by a VTABLE marker."""

    # Name of the class or struct that the vtable belongs to
    class_name: str
    # Defaults to -1; presumably the number of vtable entries once known
    # (TODO confirm)
    num_entries: int = -1
|
||||
@@ -1,145 +1,394 @@
|
||||
# C++ file parser
|
||||
|
||||
from typing import List, TextIO
|
||||
from typing import List, Iterable, Iterator
|
||||
from enum import Enum
|
||||
from .util import (
|
||||
CodeBlock,
|
||||
OffsetMatch,
|
||||
DecompMarker,
|
||||
is_blank_or_comment,
|
||||
match_offset_comment,
|
||||
get_template_function_name,
|
||||
match_marker,
|
||||
is_marker_exact,
|
||||
get_class_name,
|
||||
get_synthetic_name,
|
||||
remove_trailing_comment,
|
||||
distinct_by_module,
|
||||
)
|
||||
from .node import (
|
||||
ParserAlert,
|
||||
ParserFunction,
|
||||
ParserVariable,
|
||||
ParserVtable,
|
||||
)
|
||||
from .error import ParserError
|
||||
|
||||
|
||||
class ReaderState(Enum):
    """States of the line-by-line decomp marker reader.

    The names used by DecompParser are declared first so that they are the
    canonical members; with Enum, a later member that repeats a value
    becomes an alias of the first one.
    """

    # Not tracking anything; looking for the next marker
    SEARCH = 0
    # Function marker(s) read; waiting for the function signature
    WANT_SIG = 1
    # Inside a function body
    IN_FUNC = 2
    # Synthetic/template marker(s) read; waiting for the name comment
    IN_TEMPLATE = 3
    # Signature read; waiting for the opening curly brace
    WANT_CURLY = 4
    # GLOBAL marker(s) read outside of a function
    IN_GLOBAL = 5
    # GLOBAL marker(s) read while inside a function body
    IN_FUNC_GLOBAL = 6
    # VTABLE marker(s) read; waiting for the class declaration
    IN_VTABLE = 7

    # Legacy names kept as aliases so that older references still resolve.
    WANT_OFFSET = 0
    FUNCTION_DONE = 5
|
||||
|
||||
|
||||
def find_code_blocks(stream: TextIO) -> List[CodeBlock]:
|
||||
"""Read the IO stream (file) line-by-line and give the following report:
|
||||
Foreach code block (function) in the file, what are its starting and
|
||||
ending line numbers, and what is the given offset in the original
|
||||
binary. We expect the result to be ordered by line number because we
|
||||
are reading the file from start to finish."""
|
||||
def marker_is_stub(marker: DecompMarker) -> bool:
    """Return True for a STUB marker (type compared case-insensitively)."""
    marker_type = marker.type.upper()
    return marker_type == "STUB"
|
||||
|
||||
blocks: List[CodeBlock] = []
|
||||
|
||||
offset_matches: List[OffsetMatch] = []
|
||||
def marker_is_variable(marker: DecompMarker) -> bool:
    """Return True for a GLOBAL marker (type compared case-insensitively)."""
    marker_type = marker.type.upper()
    return marker_type == "GLOBAL"
|
||||
|
||||
function_sig = None
|
||||
start_line = None
|
||||
end_line = None
|
||||
state = ReaderState.WANT_OFFSET
|
||||
|
||||
# 1-based to match cvdump and your text editor
|
||||
# I know it says 0, but we will increment before each readline()
|
||||
line_no = 0
|
||||
can_seek = True
|
||||
def marker_is_synthetic(marker: DecompMarker) -> bool:
    """Return True for a SYNTHETIC or TEMPLATE marker (case-insensitive)."""
    marker_type = marker.type.upper()
    return marker_type == "SYNTHETIC" or marker_type == "TEMPLATE"
|
||||
|
||||
while True:
|
||||
# Do this before reading again so that an EOF will not
|
||||
# cause us to miss the last function of the file.
|
||||
if state == ReaderState.FUNCTION_DONE:
|
||||
# Our list of offset marks could have duplicates on
|
||||
# module name, so we'll eliminate those now.
|
||||
for offset_match in distinct_by_module(offset_matches):
|
||||
block = CodeBlock(
|
||||
offset=offset_match.address,
|
||||
signature=function_sig,
|
||||
start_line=start_line,
|
||||
|
||||
def marker_is_template(marker: DecompMarker) -> bool:
    """Return True for a TEMPLATE marker (type compared case-insensitively)."""
    marker_type = marker.type.upper()
    return marker_type == "TEMPLATE"
|
||||
|
||||
|
||||
def marker_is_function(marker: DecompMarker) -> bool:
    """Return True for a FUNCTION or STUB marker (case-insensitive)."""
    marker_type = marker.type.upper()
    return marker_type == "FUNCTION" or marker_type == "STUB"
|
||||
|
||||
|
||||
def marker_is_vtable(marker: DecompMarker) -> bool:
    """Return True for a VTABLE marker (type compared case-insensitively)."""
    marker_type = marker.type.upper()
    return marker_type == "VTABLE"
|
||||
|
||||
|
||||
class MarkerDict:
    """Markers collected in sequence, keyed by upper-cased module name.

    Only the first marker seen for each module is kept.
    """

    def __init__(self):
        self.markers: dict = {}

    def insert(self, marker: DecompMarker) -> bool:
        """Store `marker` unless its module is already present.

        Returns True when the module was already present (the existing entry
        is left untouched), False when the marker was stored.
        """
        key = marker.module.upper()
        already_present = key in self.markers
        if not already_present:
            self.markers[key] = (marker.type, marker.offset)

        return already_present

    def iter(self) -> Iterator[DecompMarker]:
        """Yield the stored markers in insertion order.

        The module name of each yielded marker is the upper-cased key.
        """
        for module, stored in self.markers.items():
            marker_type, offset = stored
            yield DecompMarker(marker_type, module, offset)

    def empty(self):
        """Discard all stored markers."""
        self.markers = {}
|
||||
|
||||
|
||||
class DecompParser:
    """Line-oriented state machine that collects decomp annotations.

    Feed source lines through `read_line` or `read_lines`. Results accumulate
    in the `functions`, `vtables` and `variables` lists; syntax problems are
    recorded in `alerts`.
    """

    # pylint: disable=too-many-instance-attributes
    # Could combine output lists into a single list to get under the limit,
    # but not right now
    def __init__(self):
        # The lists to be populated as we parse
        self.functions: List[ParserFunction] = []
        self.vtables: List[ParserVtable] = []
        self.variables: List[ParserVariable] = []
        self.alerts: List[ParserAlert] = []

        # 1-based once the first line has been read
        self.line_number: int = 0
        self.state: ReaderState = ReaderState.SEARCH

        self.last_line: str = ""

        # To allow for multiple markers where code is shared across different
        # modules, save lists of compatible markers that appear in sequence
        self.fun_markers = MarkerDict()
        self.var_markers = MarkerDict()
        self.tbl_markers = MarkerDict()

        # To handle functions that are entirely indented (i.e. those defined
        # in class declarations), remember how many whitespace characters
        # came before the opening curly brace and match that up at the end.
        # This should give us the same or better accuracy for a well-formed file.
        # The alternative is counting the curly braces on each line
        # but that's probably too cumbersome.
        self.curly_indent_stops: int = 0

        # For non-synthetic functions, save the line number where the function begins
        # (i.e. where we see the curly brace) along with the function signature.
        # We will need both when we reach the end of the function.
        self.function_start: int = 0
        self.function_sig: str = ""

    def reset(self):
        """Return the parser to its initial state, discarding all results."""
        self.functions = []
        self.vtables = []
        self.variables = []
        self.alerts = []

        self.line_number = 0
        self.state = ReaderState.SEARCH

        self.last_line = ""

        self.fun_markers.empty()
        self.var_markers.empty()
        self.tbl_markers.empty()

        self.curly_indent_stops = 0
        self.function_start = 0
        self.function_sig = ""

    def _recover(self):
        """We hit a syntax error and need to reset temp structures"""
        self.state = ReaderState.SEARCH
        self.fun_markers.empty()
        self.var_markers.empty()
        self.tbl_markers.empty()

    def _syntax_warning(self, code):
        """Record an alert for the current line without changing state."""
        self.alerts.append(
            ParserAlert(
                line_number=self.line_number,
                code=code,
                line=self.last_line.strip(),
            )
        )

    def _syntax_error(self, code):
        """Record an alert for the current line and drop any pending markers."""
        self._syntax_warning(code)
        self._recover()

    def _function_starts_here(self):
        """Remember the current line as the start of the pending function."""
        self.function_start = self.line_number

    def _function_marker(self, marker: DecompMarker):
        """Queue a FUNCTION/STUB marker and wait for the function signature."""
        if self.fun_markers.insert(marker):
            self._syntax_warning(ParserError.DUPLICATE_MODULE)
        self.state = ReaderState.WANT_SIG

    def _synthetic_marker(self, marker: DecompMarker):
        """Queue a SYNTHETIC/TEMPLATE marker and wait for the name comment."""
        if self.fun_markers.insert(marker):
            self._syntax_warning(ParserError.DUPLICATE_MODULE)
        self.state = ReaderState.IN_TEMPLATE

    def _function_done(self, lookup_by_name: bool = False, unexpected: bool = False):
        """Emit one ParserFunction per queued function marker.

        `unexpected` means the end of the function was not seen and is being
        inferred from the marker on the current line.
        """
        end_line = self.line_number
        if unexpected:
            # If we missed the end of the previous function, assume it ended
            # on the previous line and that whatever we are tracking next
            # begins on the current line.
            end_line -= 1

        for marker in self.fun_markers.iter():
            self.functions.append(
                ParserFunction(
                    line_number=self.function_start,
                    module=marker.module,
                    offset=marker.offset,
                    lookup_by_name=lookup_by_name,
                    is_stub=marker_is_stub(marker),
                    is_synthetic=marker_is_synthetic(marker),
                    is_template=marker_is_template(marker),
                    name=self.function_sig,
                    end_line=end_line,
                )
            )

        self.fun_markers.empty()
        self.curly_indent_stops = 0
        self.state = ReaderState.SEARCH

    def _vtable_marker(self, marker: DecompMarker):
        """Queue a VTABLE marker and wait for the class declaration."""
        if self.tbl_markers.insert(marker):
            self._syntax_warning(ParserError.DUPLICATE_MODULE)
        self.state = ReaderState.IN_VTABLE

    def _vtable_done(self, class_name: str = None):
        """Emit one ParserVtable per queued vtable marker."""
        if class_name is None:
            # Best we can do
            class_name = self.last_line.strip()

        for marker in self.tbl_markers.iter():
            self.vtables.append(
                ParserVtable(
                    line_number=self.line_number,
                    module=marker.module,
                    offset=marker.offset,
                    class_name=class_name,
                )
            )

        self.tbl_markers.empty()
        self.state = ReaderState.SEARCH

    def _variable_marker(self, marker: DecompMarker):
        """Queue a GLOBAL marker, remembering whether we are inside a function."""
        if self.var_markers.insert(marker):
            self._syntax_warning(ParserError.DUPLICATE_MODULE)

        if self.state in (ReaderState.IN_FUNC, ReaderState.IN_FUNC_GLOBAL):
            self.state = ReaderState.IN_FUNC_GLOBAL
        else:
            self.state = ReaderState.IN_GLOBAL

    def _variable_done(self):
        """Emit one ParserVariable per queued marker, named by the current line."""
        for marker in self.var_markers.iter():
            self.variables.append(
                ParserVariable(
                    line_number=self.line_number,
                    module=marker.module,
                    offset=marker.offset,
                    name=self.last_line.strip(),
                )
            )

        self.var_markers.empty()
        # Go back to the enclosing function if the variable was declared in one
        if self.state == ReaderState.IN_FUNC_GLOBAL:
            self.state = ReaderState.IN_FUNC
        else:
            self.state = ReaderState.SEARCH

    def _handle_marker(self, marker: DecompMarker):
        """Dispatch a marker according to its type and the current state."""
        # Cannot handle any markers between function sig and opening curly brace
        if self.state == ReaderState.WANT_CURLY:
            self._syntax_error(ParserError.UNEXPECTED_MARKER)
            return

        # TODO: How uncertain are we of detecting the end of a function
        # in a clang-formatted file? For now we assume we have missed the
        # end if we detect a non-GLOBAL marker while state is IN_FUNC.
        # Maybe these cases should be syntax errors instead

        if marker_is_function(marker):
            if self.state in (
                ReaderState.SEARCH,
                ReaderState.WANT_SIG,
            ):
                # We will allow multiple offsets if we have just begun
                # the code block, but not after we hit the curly brace.
                self._function_marker(marker)
            elif self.state == ReaderState.IN_FUNC:
                # We hit another offset unexpectedly.
                # We can recover easily by just ending the function here.
                self._syntax_warning(ParserError.MISSED_END_OF_FUNCTION)
                self._function_done(unexpected=True)

                # Start the next function right after so we can
                # read the next line.
                self._function_marker(marker)
            else:
                self._syntax_error(ParserError.INCOMPATIBLE_MARKER)

        elif marker_is_synthetic(marker):
            if self.state in (ReaderState.SEARCH, ReaderState.IN_TEMPLATE):
                self._synthetic_marker(marker)
            elif self.state == ReaderState.IN_FUNC:
                self._syntax_warning(ParserError.MISSED_END_OF_FUNCTION)
                self._function_done(lookup_by_name=True, unexpected=True)
                self._synthetic_marker(marker)
            else:
                self._syntax_error(ParserError.INCOMPATIBLE_MARKER)

        elif marker_is_variable(marker):
            if self.state in (
                ReaderState.SEARCH,
                ReaderState.IN_GLOBAL,
                ReaderState.IN_FUNC,
                ReaderState.IN_FUNC_GLOBAL,
            ):
                self._variable_marker(marker)
            else:
                self._syntax_error(ParserError.INCOMPATIBLE_MARKER)

        elif marker_is_vtable(marker):
            if self.state in (ReaderState.SEARCH, ReaderState.IN_VTABLE):
                self._vtable_marker(marker)
            elif self.state == ReaderState.IN_FUNC:
                self._syntax_warning(ParserError.MISSED_END_OF_FUNCTION)
                self._function_done(unexpected=True)
                self._vtable_marker(marker)
            else:
                self._syntax_error(ParserError.INCOMPATIBLE_MARKER)

        else:
            self._syntax_warning(ParserError.BOGUS_MARKER)

    def read_line(self, line: str):
        """Advance the state machine by one line of source text."""
        self.last_line = line  # TODO: Useful or hack for error reporting?
        self.line_number += 1

        marker = match_marker(line)
        if marker is not None:
            # TODO: what's the best place for this?
            # Does it belong with reading or marker handling?
            if not is_marker_exact(self.last_line):
                self._syntax_warning(ParserError.BAD_DECOMP_MARKER)
            self._handle_marker(marker)
            return

        line_strip = line.strip()
        if self.state == ReaderState.IN_TEMPLATE:
            # TEMPLATE functions are a special case. The signature is
            # given on the next line (in a // comment)
            name = get_synthetic_name(line)
            if name is None:
                self._syntax_error(ParserError.BAD_SYNTHETIC)
            else:
                self.function_sig = name
                self._function_starts_here()
                self._function_done(lookup_by_name=True)

        elif self.state == ReaderState.WANT_SIG:
            # Ignore blanks on the way to function start or function name
            if len(line_strip) == 0:
                self._syntax_warning(ParserError.UNEXPECTED_BLANK_LINE)

            elif line_strip.startswith("//"):
                # If we found a comment, assume implicit lookup-by-name
                # function and end here. We know this is not a decomp marker
                # because it would have been handled already.
                self.function_sig = get_synthetic_name(line)
                self._function_starts_here()
                self._function_done(lookup_by_name=True)

            elif line_strip == "{":
                # We missed the function signature but we can recover from this
                self.function_sig = "(unknown)"
                self._function_starts_here()
                self._syntax_warning(ParserError.MISSED_START_OF_FUNCTION)
                self.state = ReaderState.IN_FUNC

            else:
                # Inline functions may end with a comment. Strip that out
                # to help parsing.
                self.function_sig = remove_trailing_comment(line_strip)

                # Now check to see if the opening curly bracket is on the
                # same line. clang-format should prevent this (BraceWrapping)
                # but it is easy to detect.
                # If the entire function is on one line, handle that too.
                if self.function_sig.endswith("{"):
                    self._function_starts_here()
                    self.state = ReaderState.IN_FUNC
                elif self.function_sig.endswith("}") or self.function_sig.endswith(
                    "};"
                ):
                    self._function_starts_here()
                    self._function_done()
                else:
                    self.state = ReaderState.WANT_CURLY

        elif self.state == ReaderState.WANT_CURLY:
            if line_strip == "{":
                self.curly_indent_stops = line.index("{")
                self._function_starts_here()
                self.state = ReaderState.IN_FUNC

        elif self.state == ReaderState.IN_FUNC:
            # The function ends at a closing brace in the same column as the
            # opening brace. Use startswith() with an offset instead of
            # indexing: a line that is shorter than the recorded column then
            # simply fails the test rather than raising IndexError.
            if line_strip.startswith("}") and line.startswith(
                "}", self.curly_indent_stops
            ):
                self._function_done()

        elif self.state in (ReaderState.IN_GLOBAL, ReaderState.IN_FUNC_GLOBAL):
            if not is_blank_or_comment(line):
                self._variable_done()

        elif self.state == ReaderState.IN_VTABLE:
            vtable_class = get_class_name(line)
            if vtable_class is not None:
                self._vtable_done(class_name=vtable_class)

    def read_lines(self, lines: Iterable):
        """Feed every line of `lines` through `read_line`, in order."""
        for line in lines:
            self.read_line(line)
|
||||
|
||||
@@ -1,44 +1,17 @@
|
||||
# C++ Parser utility functions and data structures
|
||||
from __future__ import annotations # python <3.10 compatibility
|
||||
import re
|
||||
from typing import List
|
||||
from collections import namedtuple
|
||||
|
||||
DecompMarker = namedtuple("DecompMarker", ["type", "module", "offset"])
|
||||
|
||||
CodeBlock = namedtuple(
|
||||
"CodeBlock",
|
||||
[
|
||||
"offset",
|
||||
"signature",
|
||||
"start_line",
|
||||
"end_line",
|
||||
"offset_comment",
|
||||
"module",
|
||||
"is_template",
|
||||
"is_stub",
|
||||
],
|
||||
)
|
||||
|
||||
OffsetMatch = namedtuple(
|
||||
"OffsetMatch", ["module", "address", "is_template", "is_stub", "comment"]
|
||||
)
|
||||
|
||||
# This has not been formally established, but considering that "STUB"
|
||||
# is a temporary state for a function, we assume it will appear last,
|
||||
# after any other modifiers (i.e. TEMPLATE)
|
||||
|
||||
# To match a reasonable variance of formatting for the offset comment
|
||||
offsetCommentRegex = re.compile(
|
||||
r"\s*//\s*OFFSET:\s*(\w+)\s+(?:0x)?([a-f0-9]+)(\s+TEMPLATE)?(\s+STUB)?", # nopep8
|
||||
markerRegex = re.compile(
|
||||
r"\s*//\s*(\w+):\s*(\w+)\s+(0x[a-f0-9]+)",
|
||||
flags=re.I,
|
||||
)
|
||||
|
||||
# To match the exact syntax (text upper case, hex lower case, with spaces)
|
||||
# that is used in most places
|
||||
offsetCommentExactRegex = re.compile(
|
||||
r"^// OFFSET: [A-Z0-9]+ (0x[a-f0-9]+)( TEMPLATE)?( STUB)?$"
|
||||
) # nopep8
|
||||
|
||||
markerExactRegex = re.compile(r"\s*// ([A-Z]+): ([A-Z0-9]+) (0x[a-f0-9]+)$")
|
||||
|
||||
# The goal here is to just read whatever is on the next line, so some
|
||||
# flexibility in the formatting seems OK
|
||||
@@ -50,15 +23,15 @@ templateCommentRegex = re.compile(r"\s*//\s+(.*)")
|
||||
trailingCommentRegex = re.compile(r"(\s*(?://|/\*).*)$")
|
||||
|
||||
|
||||
def get_template_function_name(line: str) -> str:
|
||||
"""Parse function signature for special TEMPLATE functions"""
|
||||
def get_synthetic_name(line: str) -> str | None:
    """Synthetic names appear on a single line comment on the line after the marker.
    If that's not what we have, return None"""
    template_match = templateCommentRegex.match(line)

    if template_match is not None:
        return template_match.group(1)

    # Not a "// name" comment. Callers test for None to report the problem,
    # so the raw line must not be returned here.
    return None
|
||||
|
||||
|
||||
def remove_trailing_comment(line: str) -> str:
|
||||
@@ -78,39 +51,45 @@ def is_blank_or_comment(line: str) -> bool:
|
||||
)
|
||||
|
||||
|
||||
def is_exact_offset_comment(line: str) -> bool:
|
||||
"""If the offset comment does not match our (unofficial) syntax
|
||||
we may want to alert the user to fix it for style points."""
|
||||
return offsetCommentExactRegex.match(line) is not None
|
||||
|
||||
|
||||
def match_offset_comment(line: str) -> OffsetMatch | None:
|
||||
match = offsetCommentRegex.match(line)
|
||||
def match_marker(line: str) -> DecompMarker | None:
    """Parse a decomp marker comment of the form `// TYPE: MODULE 0xOFFSET`.

    Returns a DecompMarker whose offset is the hex value converted to int,
    or None when the line does not match the marker pattern.
    """
    match = markerRegex.match(line)
    if match is None:
        return None

    return DecompMarker(
        type=match.group(1), module=match.group(2), offset=int(match.group(3), 16)
    )
|
||||
|
||||
|
||||
def distinct_by_module(offsets: List) -> List:
|
||||
"""Given a list of offset markers, return a list with distinct
|
||||
module names. If module names (case-insensitive) are repeated,
|
||||
choose the offset that appears first."""
|
||||
def is_marker_exact(line: str) -> bool:
    """Return True when `line` matches the strict marker pattern."""
    exact_match = markerExactRegex.match(line)
    return exact_match is not None
|
||||
|
||||
if len(offsets) < 2:
|
||||
return offsets
|
||||
|
||||
# Dict maintains insertion order in python >=3.7
|
||||
offsets_dict = {}
|
||||
for offset in offsets:
|
||||
module_upper = offset.module.upper()
|
||||
if module_upper not in offsets_dict:
|
||||
offsets_dict[module_upper] = offset
|
||||
template_class_decl_regex = re.compile(
|
||||
r"\s*(?:\/\/)?\s*(?:class|struct) (\w+)<([\w]+)\s*(\*+)?\s*>"
|
||||
)
|
||||
|
||||
return list(offsets_dict.values())
|
||||
|
||||
class_decl_regex = re.compile(r"\s*(?:\/\/)?\s*(?:class|struct) (\w+)")
|
||||
|
||||
|
||||
def get_class_name(line: str) -> str | None:
    """Extract the class name for a VTABLE marker from a code line or comment.

    Returns None when the line holds no class or struct declaration.
    """
    templated = template_class_decl_regex.match(line)
    if templated is None:
        plain = class_decl_regex.match(line)
        return None if plain is None else plain.group(1)

    # For template classes, reformat the name to match the output from cvdump:
    # one space between the template type and any asterisks of a pointer type.
    class_name, template_type, asterisks = templated.groups()
    pointer_suffix = "" if asterisks is None else f" {asterisks}"
    return f"{class_name}<{template_type}{pointer_suffix}>"
|
||||
|
||||
Reference in New Issue
Block a user