Checkorder tool to keep functions in original binary order (#228)

* First commit of order tool

* More flexible match on module name. Bugfix on blank_or_comment

* Report inexact offset comments in verbose mode. Bugfix for exact regex

* Refactor checkorder into reusable isledecomp module

* Find bad comments in one pass, add awareness of TEMPLATE

* Refactor of state machine to prepare for reccmp integration

* Use isledecomp lib in reccmp

* Build isledecomp in GH actions, fix mypy complaint

* Ensure unit test cpp files will be ignored by reccmp

* Allow multiple offset markers, pep8 cleanup

* Remove unused variable

* Code style, remove unneeded module and TODO

* Final renaming and type hints

* Fix checkorder issues, add GH action and enforce (#2)

* Fix checkorder issues

* Add GH action

* Test error case

* Works

* Fixes

---------

Co-authored-by: Christian Semmler <mail@csemmler.com>
This commit is contained in:
MS
2023-11-21 03:44:45 -05:00
committed by GitHub
parent 714d36b57d
commit 1ae3b07dc2
84 changed files with 4021 additions and 3209 deletions

View File

View File

@@ -0,0 +1,21 @@
import os
from typing import Iterator
def is_file_cpp(filename: str) -> bool:
    """Tell whether the filename carries a C++ header or source extension.

    Only the final extension is examined and the comparison ignores
    letter case, so both 'foo.cpp' and 'FOO.H' qualify."""
    extension = os.path.splitext(filename)[1].lower()
    return extension == '.h' or extension == '.cpp'
def walk_source_dir(source: str, recursive: bool = True) -> Iterator[str]:
    """Generator yielding the path of each C++ file under a directory.

    The directory is converted to an absolute path before walking, so
    the yielded paths are absolute too. When recursive is False, only
    the first directory reported by os.walk is examined."""
    root = os.path.abspath(source)
    for current_dir, _subdirs, filenames in os.walk(root):
        yield from (os.path.join(current_dir, name)
                    for name in filenames
                    if is_file_cpp(name))

        # One pass of the walk is all we want in non-recursive mode.
        if not recursive:
            break

View File

@@ -0,0 +1 @@
from .parser import find_code_blocks

View File

@@ -0,0 +1,142 @@
# C++ file parser
from typing import List, TextIO
from enum import Enum
from .util import (
CodeBlock,
OffsetMatch,
is_blank_or_comment,
match_offset_comment,
is_exact_offset_comment,
get_template_function_name,
remove_trailing_comment,
distinct_by_module,
)
class ReaderState(Enum):
    """States of the line reader used while scanning a C++ file
    for code blocks."""

    # Looking for the offset comment that introduces a code block
    WANT_OFFSET = 0

    # Offset comment seen; the next code line is taken as the signature
    WANT_SIG = 1

    # Inside the function body, waiting for the closing curly brace
    IN_FUNC = 2

    # TEMPLATE offset comment seen; the signature is on the next line
    IN_TEMPLATE = 3

    # Signature seen, waiting for the opening curly brace on its own line
    WANT_CURLY = 4

    # The whole function has been read and is ready to be recorded
    FUNCTION_DONE = 5
def find_code_blocks(stream: TextIO) -> List[CodeBlock]:
    """Scan the IO stream (file) line-by-line and report the code blocks
    (functions) found in it.

    Each reported block carries its starting and ending line numbers,
    its signature, and the offset in the original binary taken from the
    offset comment. A function preceded by several offset comments
    produces one block per distinct module name. The result is ordered
    by line number because the file is read from start to finish."""
    found: List[CodeBlock] = []
    pending: List[OffsetMatch] = []

    signature = None
    first_line = None
    last_line = None

    # More offset comments may be collected in these states, i.e. up
    # until the curly brace of the function has been reached.
    accepting_offsets = (ReaderState.WANT_OFFSET,
                         ReaderState.IN_TEMPLATE,
                         ReaderState.WANT_SIG)

    state = ReaderState.WANT_OFFSET

    # Line numbers are 1-based to match cvdump and your text editor.
    # The counter starts at 0 because it is bumped before each readline().
    line_no = 0

    # Switched off for one iteration when the current line has to be
    # examined a second time instead of reading the next one.
    advance = True

    while True:
        # Record a finished function before reading again, so that
        # hitting EOF does not lose the last function of the file.
        if state == ReaderState.FUNCTION_DONE:
            # The collected offset comments could repeat a module
            # name, so only one per module is kept.
            found.extend([
                CodeBlock(offset=marker.address,
                          signature=signature,
                          start_line=first_line,
                          end_line=last_line,
                          offset_comment=marker.comment,
                          module=marker.module,
                          is_template=marker.is_template,
                          is_stub=marker.is_stub)
                for marker in distinct_by_module(pending)
            ])
            pending = []
            state = ReaderState.WANT_OFFSET

        if advance:
            line_no += 1
            line = stream.readline()
            if line == '':
                break

        marker = match_offset_comment(line)
        if marker is not None:
            if state in accepting_offsets:
                # This also handles an offset comment that cut the
                # previous function short: resume reading after it.
                advance = True
                pending.append(marker)
                state = (ReaderState.IN_TEMPLATE if marker.is_template
                         else ReaderState.WANT_SIG)
            else:
                # An offset comment showed up where none was expected.
                # Recover by ending the function on the previous line
                # and holding this line for the next iteration.
                last_line = line_no - 1
                state = ReaderState.FUNCTION_DONE
                advance = False
            continue

        if state == ReaderState.IN_TEMPLATE:
            # Special case: for TEMPLATE functions the signature is
            # given in a // comment on the line after the offset.
            signature = get_template_function_name(line)
            first_line = last_line = line_no
            state = ReaderState.FUNCTION_DONE

        elif state == ReaderState.WANT_SIG:
            # There is no formal rule for what follows the offset
            # comment, so blank lines and comments are skipped and the
            # first "code line" is assumed to be the signature.
            if is_blank_or_comment(line):
                continue

            # An inline function may end with a comment; drop it to
            # make the checks below simpler.
            signature = remove_trailing_comment(line.strip())

            if signature.endswith('{'):
                # Opening curly bracket on the signature line.
                # clang-format (BraceWrapping) should prevent this,
                # but it is easy to detect.
                first_line = line_no
                state = ReaderState.IN_FUNC
            elif signature.endswith(('}', '};')):
                # The entire function sits on a single line.
                first_line = last_line = line_no
                state = ReaderState.FUNCTION_DONE
            else:
                state = ReaderState.WANT_CURLY

        elif state == ReaderState.WANT_CURLY:
            if line.strip() == '{':
                first_line = line_no
                state = ReaderState.IN_FUNC

        elif state == ReaderState.IN_FUNC:
            # Naive but reasonable: a function ends with a curly brace
            # at the very start of a line (no leading spaces).
            if line.startswith('}'):
                last_line = line_no
                state = ReaderState.FUNCTION_DONE

    return found

View File

@@ -0,0 +1,97 @@
# C++ Parser utility functions and data structures
from __future__ import annotations # python <3.10 compatibility
import re
from typing import List
from collections import namedtuple
# Record describing one code block (function) found in a source file
CodeBlock = namedtuple(
    'CodeBlock',
    'offset signature start_line end_line '
    'offset_comment module is_template is_stub',
)

# Record holding the pieces parsed out of one offset comment
OffsetMatch = namedtuple(
    'OffsetMatch',
    'module address is_template is_stub comment',
)
# Lenient pattern for the offset comment, accepting a reasonable variance
# of formatting: flexible spacing, any letter case, optional "0x" prefix.
# Where "STUB" belongs has not been formally established, but since it is
# a temporary state for a function we assume it appears last, after any
# other modifiers (i.e. TEMPLATE).
offsetCommentRegex = re.compile(
    r'\s*//\s*OFFSET:\s*(\w+)\s+(?:0x)?([a-f0-9]+)(\s+TEMPLATE)?(\s+STUB)?',  # nopep8
    re.IGNORECASE)

# Strict pattern for the exact syntax used in most places:
# upper case text, lower case hex, single spaces.
offsetCommentExactRegex = re.compile(
    r'^// OFFSET: [A-Z0-9]+ (0x[a-f0-9]+)( TEMPLATE)?( STUB)?$')  # nopep8

# Captures whatever follows the // on a line. The aim is only to read
# the next line, so some flexibility in the formatting seems OK.
templateCommentRegex = re.compile(r'\s*//\s+(.*)')

# Matches a comment (//) or block comment (/*) at the end of a code line,
# together with the spaces in front of it, so that it can be removed.
trailingCommentRegex = re.compile(r'(\s*(?://|/\*).*)$')
def get_template_function_name(line: str) -> str:
    """Extract the function signature for special TEMPLATE functions.

    The text following the // on the line is returned. A line that
    does not look like such a comment is returned unchanged."""
    found = templateCommentRegex.match(line)
    return line if found is None else found.group(1)
def remove_trailing_comment(line: str) -> str:
    """Return the line with a trailing comment (// or /*) removed,
    along with the whitespace in front of that comment."""
    code_only = trailingCommentRegex.sub('', line)
    return code_only
def is_blank_or_comment(line: str) -> bool:
    """Tell whether a line can be skipped while reading ahead from the
    offset comment to the function signature.

    That is the case for blank lines, lines that begin with // or /*,
    and lines that end with */ (surrounding whitespace is ignored)."""
    stripped = line.strip()
    if not stripped:
        return True

    return stripped.startswith(('//', '/*')) or stripped.endswith('*/')
def is_exact_offset_comment(line: str) -> bool:
    """Tell whether the offset comment follows our (unofficial) syntax
    to the letter. When it does not, we may want to alert the user to
    fix it for style points."""
    exact = offsetCommentExactRegex.match(line)
    return exact is not None
def match_offset_comment(line: str) -> OffsetMatch | None:
    """Parse an offset comment from the line.

    Returns None if the line does not hold an offset comment. Otherwise
    returns an OffsetMatch with the module name, the address (read as
    hexadecimal), the TEMPLATE and STUB flags, and the stripped line as
    the comment text."""
    found = offsetCommentRegex.match(line)
    if found is None:
        return None

    module, hex_address, template_flag, stub_flag = found.groups()
    return OffsetMatch(
        module=module,
        address=int(hex_address, 16),
        # The optional groups are None when the modifier is absent
        is_template=template_flag is not None,
        is_stub=stub_flag is not None,
        comment=line.strip(),
    )
def distinct_by_module(offsets: List) -> List:
    """Reduce a list of offset markers to one marker per module name.

    Module names are compared case-insensitively, and when a name is
    repeated the marker that appears first is the one kept. A list
    with fewer than two markers is handed back as-is."""
    if len(offsets) < 2:
        return offsets

    # Dict keeps insertion order in python >=3.7, so the result follows
    # the order in which each module was first seen.
    first_seen = {}
    for marker in offsets:
        first_seen.setdefault(marker.module.upper(), marker)

    return list(first_seen.values())