mirror of https://github.com/isledecomp/isle.git, synced 2025-10-23 00:14:22 +00:00
Checkorder tool to keep functions in original binary order (#228)
* First commit of order tool
* More flexible match on module name. Bugfix on blank_or_comment
* Report inexact offset comments in verbose mode. Bugfix for exact regex
* Refactor checkorder into reusable isledecomp module
* Find bad comments in one pass, add awareness of TEMPLATE
* Refactor of state machine to prepare for reccmp integration
* Use isledecomp lib in reccmp
* Build isledecomp in GH actions, fix mypy complaint
* Ensure unit test cpp files will be ignored by reccmp
* Allow multiple offset markers, pep8 cleanup
* Remove unused variable
* Code style, remove unneeded module and TODO
* Final renaming and type hints
* Fix checkorder issues, add GH action and enforce (#2)
  * Fix checkorder issues
  * Add GH action
  * Test error case
  * Works
  * Fixes

---------

Co-authored-by: Christian Semmler <mail@csemmler.com>
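For orientation: the new tool is run against a file or directory, roughly `python tools/checkorder/checkorder.py [--verbose] [--enforce] <target>` (see `parse_args` in the file below). Here is a minimal sketch of the core check it performs, built on the isledecomp helpers added in this commit; the `LEGO1` directory name is only an example:

```python
# Sketch only: walk a source tree, parse the OFFSET-annotated code blocks in
# each file, and flag any file whose offsets are not already in ascending order.
from isledecomp.dir import walk_source_dir
from isledecomp.parser import find_code_blocks

for path in walk_source_dir("LEGO1"):  # example directory name
    with open(path, "r") as f:
        offsets = [block.offset for block in find_code_blocks(f)]
    if offsets != sorted(offsets):
        print(f"{path} is out of order")
```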
120 tools/checkorder/checkorder.py Normal file
@@ -0,0 +1,120 @@
import os
import sys
import argparse
from isledecomp.dir import (
    walk_source_dir,
    is_file_cpp
)
from isledecomp.parser import find_code_blocks
from isledecomp.parser.util import (
    is_exact_offset_comment
)


def sig_truncate(sig: str) -> str:
    """Helper to truncate function names to 50 chars and append ellipsis
    if needed. Goal is to stay under 80 columns for tool output."""
    return f"{sig[:47]}{'...' if len(sig) >= 50 else ''}"


def check_file(filename: str, verbose: bool = False) -> bool:
    """Open and read the given file, then check whether the code blocks
    are in order. If verbose, print each block."""

    with open(filename, 'r') as f:
        code_blocks = find_code_blocks(f)

    bad_comments = [(block.start_line, block.offset_comment)
                    for block in code_blocks
                    if not is_exact_offset_comment(block.offset_comment)]

    just_offsets = [block.offset for block in code_blocks]
    sorted_offsets = sorted(just_offsets)
    file_out_of_order = just_offsets != sorted_offsets

    # If we detect inexact comments, don't print anything unless we are
    # in verbose mode. If the file is out of order, we always print the
    # file name.
    should_report = ((len(bad_comments) > 0 and verbose)
                     or file_out_of_order)

    if not should_report and not file_out_of_order:
        return False

    # Else: we are alerting to some problem in this file
    print(filename)
    if verbose:
        if file_out_of_order:
            order_lookup = {k: i for i, k in enumerate(sorted_offsets)}
            prev_offset = 0

            for block in code_blocks:
                msg = ' '.join([
                    ' ' if block.offset > prev_offset else '!',
                    f'{block.offset:08x}',
                    f'{block.end_line - block.start_line:4} lines',
                    f'{order_lookup[block.offset]:3}',
                    ' ',
                    sig_truncate(block.signature),
                ])
                print(msg)
                prev_offset = block.offset

        for (line_no, line) in bad_comments:
            print(f'* line {line_no:3} bad offset comment ({line})')

    print()

    return file_out_of_order


def parse_args(test_args: list | None = None) -> dict:
    p = argparse.ArgumentParser()
    p.add_argument('target', help='The file or directory to check.')
    p.add_argument('--enforce', action=argparse.BooleanOptionalAction,
                   default=False,
                   help='Fail with error code if target is out of order.')
    p.add_argument('--verbose', action=argparse.BooleanOptionalAction,
                   default=False,
                   help=('Display each code block in the file and show '
                         'where each consecutive run of blocks is broken.'))

    if test_args is None:
        args = p.parse_args()
    else:
        args = p.parse_args(test_args)

    return vars(args)


def main():
    args = parse_args()

    if os.path.isdir(args['target']):
        files_to_check = list(walk_source_dir(args['target']))
    elif os.path.isfile(args['target']) and is_file_cpp(args['target']):
        files_to_check = [args['target']]
    else:
        sys.exit('Invalid target')

    files_out_of_order = 0

    for file in files_to_check:
        is_jumbled = check_file(file, args['verbose'])
        if is_jumbled:
            files_out_of_order += 1

    if files_out_of_order > 0:
        error_message = ' '.join([
            str(files_out_of_order),
            'files are' if files_out_of_order > 1 else 'file is',
            'out of order'
        ])
        print(error_message)

    if files_out_of_order > 0 and args['enforce']:
        sys.exit(1)


if __name__ == '__main__':
    main()
1 tools/checkorder/requirements.txt Normal file
@@ -0,0 +1 @@
isledecomp
1 tools/isledecomp/.gitignore vendored Normal file
@@ -0,0 +1 @@
isledecomp.egg-info/
0 tools/isledecomp/isledecomp/__init__.py Normal file
21 tools/isledecomp/isledecomp/dir.py Normal file
@@ -0,0 +1,21 @@
import os
from typing import Iterator


def is_file_cpp(filename: str) -> bool:
    (basefile, ext) = os.path.splitext(filename)
    return ext.lower() in ('.h', '.cpp')


def walk_source_dir(source: str, recursive: bool = True) -> Iterator[str]:
    """Generator to walk the given directory recursively and return
    any C++ files found."""

    source = os.path.abspath(source)
    for subdir, dirs, files in os.walk(source):
        for file in files:
            if is_file_cpp(file):
                yield os.path.join(subdir, file)

        if not recursive:
            break
1 tools/isledecomp/isledecomp/parser/__init__.py Normal file
@@ -0,0 +1 @@
from .parser import find_code_blocks
142 tools/isledecomp/isledecomp/parser/parser.py Normal file
@@ -0,0 +1,142 @@
# C++ file parser

from typing import List, TextIO
from enum import Enum
from .util import (
    CodeBlock,
    OffsetMatch,
    is_blank_or_comment,
    match_offset_comment,
    is_exact_offset_comment,
    get_template_function_name,
    remove_trailing_comment,
    distinct_by_module,
)


class ReaderState(Enum):
    WANT_OFFSET = 0
    WANT_SIG = 1
    IN_FUNC = 2
    IN_TEMPLATE = 3
    WANT_CURLY = 4
    FUNCTION_DONE = 5


def find_code_blocks(stream: TextIO) -> List[CodeBlock]:
    """Read the IO stream (file) line-by-line and give the following report:
    For each code block (function) in the file, what are its starting and
    ending line numbers, and what is the given offset in the original
    binary. We expect the result to be ordered by line number because we
    are reading the file from start to finish."""

    blocks: List[CodeBlock] = []

    offset_matches: List[OffsetMatch] = []

    function_sig = None
    start_line = None
    end_line = None
    state = ReaderState.WANT_OFFSET

    # 1-based to match cvdump and your text editor
    # I know it says 0, but we will increment before each readline()
    line_no = 0
    can_seek = True

    while True:
        # Do this before reading again so that an EOF will not
        # cause us to miss the last function of the file.
        if state == ReaderState.FUNCTION_DONE:
            # Our list of offset marks could have duplicates on
            # module name, so we'll eliminate those now.
            for offset_match in distinct_by_module(offset_matches):
                block = CodeBlock(offset=offset_match.address,
                                  signature=function_sig,
                                  start_line=start_line,
                                  end_line=end_line,
                                  offset_comment=offset_match.comment,
                                  module=offset_match.module,
                                  is_template=offset_match.is_template,
                                  is_stub=offset_match.is_stub)
                blocks.append(block)
            offset_matches = []
            state = ReaderState.WANT_OFFSET

        if can_seek:
            line_no += 1
            line = stream.readline()
            if line == '':
                break

        new_match = match_offset_comment(line)
        if new_match is not None:
            # We will allow multiple offsets if we have just begun
            # the code block, but not after we hit the curly brace.
            if state in (ReaderState.WANT_OFFSET, ReaderState.IN_TEMPLATE,
                         ReaderState.WANT_SIG):
                # If we detected an offset marker unexpectedly,
                # we are handling it here so we can continue seeking.
                can_seek = True

                offset_matches.append(new_match)

                if new_match.is_template:
                    state = ReaderState.IN_TEMPLATE
                else:
                    state = ReaderState.WANT_SIG
            else:
                # We hit another offset unexpectedly.
                # We can recover easily by just ending the function here.
                end_line = line_no - 1
                state = ReaderState.FUNCTION_DONE

                # Pause reading here so we handle the offset marker
                # on the next loop iteration
                can_seek = False

        elif state == ReaderState.IN_TEMPLATE:
            # TEMPLATE functions are a special case. The signature is
            # given on the next line (in a // comment)
            function_sig = get_template_function_name(line)
            start_line = line_no
            end_line = line_no
            state = ReaderState.FUNCTION_DONE

        elif state == ReaderState.WANT_SIG:
            # Skip blank lines or comments that come after the offset
            # marker. There is not a formal procedure for this, so just
            # assume the next "code line" is the function signature
            if not is_blank_or_comment(line):
                # Inline functions may end with a comment. Strip that out
                # to help parsing.
                function_sig = remove_trailing_comment(line.strip())

                # Now check to see if the opening curly bracket is on the
                # same line. clang-format should prevent this (BraceWrapping)
                # but it is easy to detect.
                # If the entire function is on one line, handle that too.
                if function_sig.endswith('{'):
                    start_line = line_no
                    state = ReaderState.IN_FUNC
                elif (function_sig.endswith('}') or
                      function_sig.endswith('};')):
                    start_line = line_no
                    end_line = line_no
                    state = ReaderState.FUNCTION_DONE
                else:
                    state = ReaderState.WANT_CURLY

        elif state == ReaderState.WANT_CURLY:
            if line.strip() == '{':
                start_line = line_no
                state = ReaderState.IN_FUNC

        elif state == ReaderState.IN_FUNC:
            # Naive but reasonable assumption that functions will end with
            # a curly brace on its own line with no prepended spaces.
            if line.startswith('}'):
                end_line = line_no
                state = ReaderState.FUNCTION_DONE

    return blocks
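For a concrete sense of what the state machine above returns, here is a small sketch that feeds it an in-memory sample (modeled on `tests/samples/basic_file.cpp`); `find_code_blocks` only needs `readline()`, so `io.StringIO` works:

```python
import io
from isledecomp.parser import find_code_blocks

source = io.StringIO(
    "// OFFSET: TEST 0x1234\n"
    "void function01()\n"
    "{\n"
    "    // TODO\n"
    "}\n"
)
block = find_code_blocks(source)[0]
# start_line is the opening curly brace, 1-based, so this prints: TEST 0x1234 3 5
print(block.module, hex(block.offset), block.start_line, block.end_line)
```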
97 tools/isledecomp/isledecomp/parser/util.py Normal file
@@ -0,0 +1,97 @@
# C++ Parser utility functions and data structures
from __future__ import annotations  # python <3.10 compatibility
import re
from typing import List
from collections import namedtuple


CodeBlock = namedtuple('CodeBlock',
                       ['offset', 'signature', 'start_line', 'end_line',
                        'offset_comment', 'module', 'is_template', 'is_stub'])

OffsetMatch = namedtuple('OffsetMatch', ['module', 'address', 'is_template',
                                         'is_stub', 'comment'])

# This has not been formally established, but considering that "STUB"
# is a temporary state for a function, we assume it will appear last,
# after any other modifiers (i.e. TEMPLATE)

# To match a reasonable variance of formatting for the offset comment
offsetCommentRegex = re.compile(r'\s*//\s*OFFSET:\s*(\w+)\s+(?:0x)?([a-f0-9]+)(\s+TEMPLATE)?(\s+STUB)?',  # nopep8
                                flags=re.I)

# To match the exact syntax (text upper case, hex lower case, with spaces)
# that is used in most places
offsetCommentExactRegex = re.compile(r'^// OFFSET: [A-Z0-9]+ (0x[a-f0-9]+)( TEMPLATE)?( STUB)?$')  # nopep8


# The goal here is to just read whatever is on the next line, so some
# flexibility in the formatting seems OK
templateCommentRegex = re.compile(r'\s*//\s+(.*)')


# To remove any comment (//) or block comment (/*) and its leading spaces
# from the end of a code line
trailingCommentRegex = re.compile(r'(\s*(?://|/\*).*)$')


def get_template_function_name(line: str) -> str:
    """Parse function signature for special TEMPLATE functions"""
    template_match = templateCommentRegex.match(line)

    # If we don't match, you get whatever is on the line as the signature
    if template_match is not None:
        return template_match.group(1)

    return line


def remove_trailing_comment(line: str) -> str:
    return trailingCommentRegex.sub('', line)


def is_blank_or_comment(line: str) -> bool:
    """Helper to read ahead after the offset comment is matched.
    There could be blank lines or other comments before the
    function signature, and we want to skip those."""
    line_strip = line.strip()
    return (len(line_strip) == 0
            or line_strip.startswith('//')
            or line_strip.startswith('/*')
            or line_strip.endswith('*/'))


def is_exact_offset_comment(line: str) -> bool:
    """If the offset comment does not match our (unofficial) syntax
    we may want to alert the user to fix it for style points."""
    return offsetCommentExactRegex.match(line) is not None


def match_offset_comment(line: str) -> OffsetMatch | None:
    match = offsetCommentRegex.match(line)
    if match is None:
        return None

    return OffsetMatch(module=match.group(1),
                       address=int(match.group(2), 16),
                       is_template=match.group(3) is not None,
                       is_stub=match.group(4) is not None,
                       comment=line.strip())


def distinct_by_module(offsets: List) -> List:
    """Given a list of offset markers, return a list with distinct
    module names. If module names (case-insensitive) are repeated,
    choose the offset that appears first."""

    if len(offsets) < 2:
        return offsets

    # Dict maintains insertion order in python >=3.7
    offsets_dict = {}
    for offset in offsets:
        module_upper = offset.module.upper()
        if module_upper not in offsets_dict:
            offsets_dict[module_upper] = offset

    return list(offsets_dict.values())
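A quick sketch of how the two regexes above divide the work: `match_offset_comment` tolerates sloppy markers and extracts their data, while `is_exact_offset_comment` only accepts the canonical style that checkorder reports on in verbose mode:

```python
from isledecomp.parser.util import match_offset_comment, is_exact_offset_comment

sloppy = '//offset: lego1 0xDEADBEEF stub'
m = match_offset_comment(sloppy)
print(m.module, hex(m.address), m.is_stub)                      # lego1 0xdeadbeef True
print(is_exact_offset_comment(sloppy))                          # False
print(is_exact_offset_comment('// OFFSET: LEGO1 0xdeadbeef'))   # True
```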
9 tools/isledecomp/setup.py Normal file
@@ -0,0 +1,9 @@
from setuptools import setup, find_packages

setup(
    name='isledecomp',
    version='0.1.0',
    description='Python tools for the isledecomp project',
    packages=find_packages(),
    tests_require=['pytest'],
)
0 tools/isledecomp/tests/__init__.py Normal file
29 tools/isledecomp/tests/samples/basic_class.cpp Normal file
@@ -0,0 +1,29 @@
// Sample for python unit tests
// Not part of the decomp

// A very simple class

class TestClass {
public:
    TestClass();
    virtual ~TestClass() override;

    virtual MxResult Tickle() override; // vtable+08

    // OFFSET: TEST 0x12345678
    inline const char* ClassName() const // vtable+0c
    {
        // 0xabcd1234
        return "TestClass";
    }

    // OFFSET: TEST 0xdeadbeef
    inline MxBool IsA(const char* name) const override // vtable+10
    {
        return !strcmp(name, TestClass::ClassName());
    }

private:
    int m_hello;
    int m_hiThere;
};
22 tools/isledecomp/tests/samples/basic_file.cpp Normal file
@@ -0,0 +1,22 @@
// Sample for python unit tests
// Not part of the decomp

// A very simple well-formed code file

// OFFSET: TEST 0x1234
void function01()
{
    // TODO
}

// OFFSET: TEST 0x2345
void function02()
{
    // TODO
}

// OFFSET: TEST 0x3456
void function03()
{
    // TODO
}
8 tools/isledecomp/tests/samples/inline.cpp Normal file
@@ -0,0 +1,8 @@
// Sample for python unit tests
// Not part of the decomp

// OFFSET: TEST 0x10000001
inline const char* OneLineWithComment() const { return "MxDSObject"; }; // hi there

// OFFSET: TEST 0x10000002
inline const char* OneLine() const { return "MxDSObject"; };
16 tools/isledecomp/tests/samples/missing_offset.cpp Normal file
@@ -0,0 +1,16 @@
// Sample for python unit tests
// Not part of the decomp

#include <stdio.h>

int no_offset_comment()
{
    static int dummy = 123;
    return -1;
}

// OFFSET: TEST 0xdeadbeef
void regular_ole_function()
{
    printf("hi there");
}
25 tools/isledecomp/tests/samples/multiple_offsets.cpp Normal file
@@ -0,0 +1,25 @@
// Sample for python unit tests
// Not part of the decomp

// Handling multiple offset markers

// OFFSET: TEST 0x1234
// OFFSET: HELLO 0x5555
void different_modules()
{
    // TODO
}

// OFFSET: TEST 0x2345
// OFFSET: TEST 0x1234
void same_module()
{
    // TODO
}

// OFFSET: TEST 0x2002
// OFFSET: test 0x1001
void same_case_insensitive()
{
    // TODO
}
12 tools/isledecomp/tests/samples/oneline_function.cpp Normal file
@@ -0,0 +1,12 @@
// Sample for python unit tests
// Not part of the decomp

// OFFSET: TEST 0x1234
void short_function() { static char* msg = "oneliner"; }

// OFFSET: TEST 0x5555
void function_after_one_liner()
{
    // This function comes after the previous that is on a single line.
    // Do we report the offset for this one correctly?
}
20 tools/isledecomp/tests/samples/out_of_order.cpp Normal file
@@ -0,0 +1,20 @@
// Sample for python unit tests
// Not part of the decomp

// OFFSET: TEST 0x1001
void function_order01()
{
    // TODO
}

// OFFSET: TEST 0x1003
void function_order03()
{
    // TODO
}

// OFFSET: TEST 0x1002
void function_order02()
{
    // TODO
}
23 tools/isledecomp/tests/samples/poorly_formatted.cpp Normal file
@@ -0,0 +1,23 @@
// Sample for python unit tests
// Not part of the decomp

// While it's reasonable to expect a well-formed file (and clang-format
// will make sure we get one), this will put the parser through its paces.

// OFFSET: TEST 0x1234
void curly_with_spaces()
{
    static char* msg = "hello";
}

// OFFSET: TEST 0x5555
void weird_closing_curly()
{
    int x = 123; }

// OFFSET: HELLO 0x5656
void bad_indenting() {
    if (0)
    {
        int y = 5;
    }}
128 tools/isledecomp/tests/test_parser.py Normal file
@@ -0,0 +1,128 @@
import os
import pytest
from typing import List, TextIO
from isledecomp.parser import find_code_blocks
from isledecomp.parser.util import CodeBlock

SAMPLE_DIR = os.path.join(os.path.dirname(__file__), 'samples')


def sample_file(filename: str) -> TextIO:
    """Wrapper for opening the samples from the directory that does not
    depend on the cwd where we run the test"""
    full_path = os.path.join(SAMPLE_DIR, filename)
    return open(full_path, 'r')


def code_blocks_are_sorted(blocks: List[CodeBlock]) -> bool:
    """Helper to make this more idiomatic"""
    just_offsets = [block.offset for block in blocks]
    return just_offsets == sorted(just_offsets)


# Tests are below #


def test_sanity():
    """Read a very basic file"""
    with sample_file('basic_file.cpp') as f:
        blocks = find_code_blocks(f)

    assert len(blocks) == 3
    assert code_blocks_are_sorted(blocks) is True
    # n.b. The parser returns line numbers as 1-based
    # Function starts when we see the opening curly brace
    assert blocks[0].start_line == 8
    assert blocks[0].end_line == 10


def test_oneline():
    """(Assuming clang-format permits this) This sample has a function
    on a single line. This will test the end-of-function detection"""
    with sample_file('oneline_function.cpp') as f:
        blocks = find_code_blocks(f)

    assert len(blocks) == 2
    assert blocks[0].start_line == 5
    assert blocks[0].end_line == 5


def test_missing_offset():
    """What if the function doesn't have an offset comment?"""
    with sample_file('missing_offset.cpp') as f:
        blocks = find_code_blocks(f)

    # TODO: For now, the function without the offset will just be ignored.
    # Would be the same outcome if the comment was present but mangled and
    # we failed to match it. We should detect these cases in the future.
    assert len(blocks) == 1


def test_jumbled_case():
    """The parser just reports what it sees. It is the responsibility of
    the downstream tools to do something about a jumbled file.
    Just verify that we are reading it correctly."""
    with sample_file('out_of_order.cpp') as f:
        blocks = find_code_blocks(f)

    assert len(blocks) == 3
    assert code_blocks_are_sorted(blocks) is False


def test_bad_file():
    with sample_file('poorly_formatted.cpp') as f:
        blocks = find_code_blocks(f)

    assert len(blocks) == 3


def test_indented():
    """Offsets for functions inside of a class will probably be indented."""
    with sample_file('basic_class.cpp') as f:
        blocks = find_code_blocks(f)

    # TODO: We don't properly detect the end of these functions
    # because the closing brace is indented. However... knowing where each
    # function ends is less important (for now) than capturing
    # all the functions that are there.

    assert len(blocks) == 2
    assert blocks[0].offset == int('0x12345678', 16)
    assert blocks[0].start_line == 15
    # assert blocks[0].end_line == 18

    assert blocks[1].offset == int('0xdeadbeef', 16)
    assert blocks[1].start_line == 22
    # assert blocks[1].end_line == 24


def test_inline():
    with sample_file('inline.cpp') as f:
        blocks = find_code_blocks(f)

    assert len(blocks) == 2
    for block in blocks:
        assert block.start_line is not None
        assert block.start_line == block.end_line


def test_multiple_offsets():
    """If multiple offset marks appear before a code block, take them
    all but ensure module name (case-insensitive) is distinct.
    Use first module occurrence in case of duplicates."""
    with sample_file('multiple_offsets.cpp') as f:
        blocks = find_code_blocks(f)

    assert len(blocks) == 4
    assert blocks[0].module == 'TEST'
    assert blocks[0].start_line == 9

    assert blocks[1].module == 'HELLO'
    assert blocks[1].start_line == 9

    # Duplicate modules are ignored
    assert blocks[2].start_line == 16
    assert blocks[2].offset == 0x2345

    assert blocks[3].module == 'TEST'
    assert blocks[3].offset == 0x2002
113 tools/isledecomp/tests/test_parser_util.py Normal file
@@ -0,0 +1,113 @@
import pytest
from collections import namedtuple
from typing import List
from isledecomp.parser.util import (
    is_blank_or_comment,
    match_offset_comment,
    is_exact_offset_comment,
    distinct_by_module,
)


blank_or_comment_param = [
    (True, ''),
    (True, '\t'),
    (True, ' '),
    (False, '\tint abc=123;'),
    (True, '// OFFSET: LEGO1 0xdeadbeef'),
    (True, ' /* Block comment beginning'),
    (True, 'Block comment ending */ '),

    # TODO: does clang-format have anything to say about these cases?
    (False, 'x++; // Comment follows'),
    (False, 'x++; /* Block comment begins'),
]


@pytest.mark.parametrize('expected, line', blank_or_comment_param)
def test_is_blank_or_comment(line: str, expected: bool):
    assert is_blank_or_comment(line) is expected


offset_comment_samples = [
    # (can_parse: bool, exact_match: bool, line: str)
    # Should match both expected modules with optional STUB marker
    (True, True, '// OFFSET: LEGO1 0xdeadbeef'),
    (True, True, '// OFFSET: LEGO1 0xdeadbeef STUB'),
    (True, True, '// OFFSET: ISLE 0x12345678'),
    (True, True, '// OFFSET: ISLE 0x12345678 STUB'),

    # No trailing spaces allowed
    (True, False, '// OFFSET: LEGO1 0xdeadbeef '),
    (True, False, '// OFFSET: LEGO1 0xdeadbeef STUB '),

    # Must have exactly one space between elements
    (True, False, '//OFFSET: ISLE 0xdeadbeef'),
    (True, False, '// OFFSET:ISLE 0xdeadbeef'),
    (True, False, '//  OFFSET: ISLE 0xdeadbeef'),
    (True, False, '// OFFSET:  ISLE 0xdeadbeef'),
    (True, False, '// OFFSET: ISLE  0xdeadbeef'),
    (True, False, '// OFFSET: ISLE 0xdeadbeef  STUB'),

    # Must have 0x prefix for hex number
    (True, False, '// OFFSET: ISLE deadbeef'),

    # Offset, module name, and STUB must be uppercase
    (True, False, '// offset: ISLE 0xdeadbeef'),
    (True, False, '// offset: isle 0xdeadbeef'),
    (True, False, '// OFFSET: LEGO1 0xdeadbeef stub'),

    # Hex string must be lowercase
    (True, False, '// OFFSET: ISLE 0xDEADBEEF'),

    # TODO: How flexible should we be with matching the module name?
    (True, True, '// OFFSET: OMNI 0x12345678'),
    (True, True, '// OFFSET: LEG01 0x12345678'),
    (True, False, '// OFFSET: hello 0x12345678'),

    # Not close enough to match
    (False, False, '// OFFSET: ISLE0x12345678'),
    (False, False, '// OFFSET: 0x12345678'),
    (False, False, '// LEGO1: 0x12345678'),

    # Hex string shorter than 8 characters
    (True, True, '// OFFSET: LEGO1 0x1234'),

    # TODO: These match but shouldn't.
    # (False, False, '// OFFSET: LEGO1 0'),
    # (False, False, '// OFFSET: LEGO1 0x'),
]


@pytest.mark.parametrize('match, exact, line', offset_comment_samples)
def test_offset_match(line: str, match: bool, exact):
    did_match = match_offset_comment(line) is not None
    assert did_match is match


@pytest.mark.parametrize('match, exact, line', offset_comment_samples)
def test_exact_offset_comment(line: str, exact: bool, match):
    assert is_exact_offset_comment(line) is exact


# Helper for the next test: cut down version of OffsetMatch
MiniOfs = namedtuple('MiniOfs', ['module', 'value'])

distinct_by_module_samples = [
    # empty set
    ([], []),
    # same module name
    ([MiniOfs('TEST', 123), MiniOfs('TEST', 555)],
     [MiniOfs('TEST', 123)]),
    # same module name, case-insensitive
    ([MiniOfs('test', 123), MiniOfs('TEST', 555)],
     [MiniOfs('test', 123)]),
    # duplicates, non-consecutive
    ([MiniOfs('test', 123), MiniOfs('abc', 111), MiniOfs('TEST', 555)],
     [MiniOfs('test', 123), MiniOfs('abc', 111)]),
]


@pytest.mark.parametrize('sample, expected', distinct_by_module_samples)
def test_distinct_by_module(sample: List[MiniOfs], expected: List[MiniOfs]):
    assert distinct_by_module(sample) == expected
@@ -12,6 +12,8 @@ import sys
import colorama
import html
import re
from isledecomp.dir import walk_source_dir
from isledecomp.parser import find_code_blocks
from pystache import Renderer

parser = argparse.ArgumentParser(allow_abbrev=False,
@@ -413,145 +415,120 @@ htmlinsert = []

# Generate basename of original file, used in locating OFFSET lines
basename = os.path.basename(os.path.splitext(original)[0])
pattern = '// OFFSET:'

for subdir, dirs, files in os.walk(source):
for file in files:
srcfilename = os.path.join(os.path.abspath(subdir), file)
with open(srcfilename, 'r') as srcfile:
line_no = 0
for srcfilename in walk_source_dir(source):
with open(srcfilename, 'r') as srcfile:
blocks = find_code_blocks(srcfile)

while True:
try:
line = srcfile.readline()
line_no += 1
for block in blocks:
if block.is_stub:
continue

if not line:
break
if block.module != basename:
continue

line = line.strip()
addr = block.offset
# Verbose flag handling
if verbose:
if addr == verbose:
found_verbose_target = True
else:
continue

if line.startswith(pattern) and not line.endswith('STUB'):
par = line[len(pattern):].strip().split()
module = par[0]
if module != basename:
continue
if block.is_template:
recinfo = syminfo.get_recompiled_address_from_name(block.signature)
if not recinfo:
continue
else:
recinfo = syminfo.get_recompiled_address(srcfilename, block.start_line)
if not recinfo:
continue

addr = int(par[1], 16)
# The effective_ratio is the ratio when ignoring differing register
# allocation vs the ratio is the true ratio.
ratio = 0.0
effective_ratio = 0.0
if recinfo.size:
origasm = parse_asm(origfile, addr + recinfo.start, recinfo.size)
recompasm = parse_asm(recompfile, recinfo.addr + recinfo.start, recinfo.size)

# Verbose flag handling
if verbose:
if addr == verbose:
found_verbose_target = True
else:
continue
diff = difflib.SequenceMatcher(None, origasm, recompasm)
ratio = diff.ratio()
effective_ratio = ratio

if line.endswith('TEMPLATE'):
line = srcfile.readline()
line_no += 1
# Name comes after // comment
name = line.strip()[2:].strip()
if ratio != 1.0:
# Check whether we can resolve register swaps which are actually
# perfect matches modulo compiler entropy.
if can_resolve_register_differences(origasm, recompasm):
effective_ratio = 1.0
else:
ratio = 0

recinfo = syminfo.get_recompiled_address_from_name(name)
if not recinfo:
continue
else:
find_open_bracket = line
while '{' not in find_open_bracket:
find_open_bracket = srcfile.readline()
line_no += 1
percenttext = f'{(effective_ratio * 100):.2f}%'
if not plain:
if effective_ratio == 1.0:
percenttext = colorama.Fore.GREEN + percenttext + colorama.Style.RESET_ALL
elif effective_ratio > 0.8:
percenttext = colorama.Fore.YELLOW + percenttext + colorama.Style.RESET_ALL
else:
percenttext = colorama.Fore.RED + percenttext + colorama.Style.RESET_ALL

recinfo = syminfo.get_recompiled_address(srcfilename, line_no)
if not recinfo:
continue
if effective_ratio == 1.0 and ratio != 1.0:
if plain:
percenttext += '*'
else:
percenttext += colorama.Fore.RED + '*' + colorama.Style.RESET_ALL

# The effective_ratio is the ratio when ignoring differing register
# allocation vs the ratio is the true ratio.
ratio = 0.0
effective_ratio = 0.0
if recinfo.size:
origasm = parse_asm(origfile, addr + recinfo.start, recinfo.size)
recompasm = parse_asm(recompfile, recinfo.addr + recinfo.start, recinfo.size)
if args.print_rec_addr:
addrs = f'0x{addr:x} / 0x{recinfo.addr:x}'
else:
addrs = hex(addr)

diff = difflib.SequenceMatcher(None, origasm, recompasm)
ratio = diff.ratio()
effective_ratio = ratio
if not verbose:
print(f' {recinfo.name} ({addrs}) is {percenttext} similar to the original')

if ratio != 1.0:
# Check whether we can resolve register swaps which are actually
# perfect matches modulo compiler entropy.
if can_resolve_register_differences(origasm, recompasm):
effective_ratio = 1.0
else:
ratio = 0
function_count += 1
total_accuracy += ratio
total_effective_accuracy += effective_ratio

percenttext = f'{(effective_ratio * 100):.2f}%'
if not plain:
if effective_ratio == 1.0:
percenttext = colorama.Fore.GREEN + percenttext + colorama.Style.RESET_ALL
elif effective_ratio > 0.8:
percenttext = colorama.Fore.YELLOW + percenttext + colorama.Style.RESET_ALL
else:
percenttext = colorama.Fore.RED + percenttext + colorama.Style.RESET_ALL
if recinfo.size:
udiff = difflib.unified_diff(origasm, recompasm, n=10)

if effective_ratio == 1.0 and ratio != 1.0:
# If verbose, print the diff for that function to the output
if verbose:
if effective_ratio == 1.0:
ok_text = 'OK!' if plain else (colorama.Fore.GREEN + '✨ OK! ✨' + colorama.Style.RESET_ALL)
if ratio == 1.0:
print(f'{addrs}: {recinfo.name} 100% match.\n\n{ok_text}\n\n')
else:
print(f'{addrs}: {recinfo.name} Effective 100%% match. (Differs in register allocation only)\n\n{ok_text} (still differs in register allocation)\n\n')
else:
for line in udiff:
if line.startswith('++') or line.startswith('@@') or line.startswith('--'):
# Skip unneeded parts of the diff for the brief view
pass
elif line.startswith('+'):
if plain:
percenttext += '*'
print(line)
else:
percenttext += colorama.Fore.RED + '*' + colorama.Style.RESET_ALL

if args.print_rec_addr:
addrs = f'0x{addr:x} / 0x{recinfo.addr:x}'
print(colorama.Fore.GREEN + line)
elif line.startswith('-'):
if plain:
print(line)
else:
print(colorama.Fore.RED + line)
else:
addrs = hex(addr)
print(line)
if not plain:
print(colorama.Style.RESET_ALL, end='')

if not verbose:
print(f' {recinfo.name} ({addrs}) is {percenttext} similar to the original')
print(f'\n{recinfo.name} is only {percenttext} similar to the original, diff above')

function_count += 1
total_accuracy += ratio
total_effective_accuracy += effective_ratio
# If html, record the diffs to an HTML file
if html_path:
escaped = html.escape('\\n'.join(udiff).replace('"', '\\"').replace('\n', '\\n'))
htmlinsert.append(f'{{address: "0x{addr:x}", name: "{html.escape(recinfo.name)}", matching: {effective_ratio}, diff: "{escaped}"}}')

if recinfo.size:
udiff = difflib.unified_diff(origasm, recompasm, n=10)

# If verbose, print the diff for that function to the output
if verbose:
if effective_ratio == 1.0:
ok_text = 'OK!' if plain else (colorama.Fore.GREEN + '✨ OK! ✨' + colorama.Style.RESET_ALL)
if ratio == 1.0:
print(f'{addrs}: {recinfo.name} 100% match.\n\n{ok_text}\n\n')
else:
print(f'{addrs}: {recinfo.name} Effective 100%% match. (Differs in register allocation only)\n\n{ok_text} (still differs in register allocation)\n\n')
else:
for line in udiff:
if line.startswith('++') or line.startswith('@@') or line.startswith('--'):
# Skip unneeded parts of the diff for the brief view
pass
elif line.startswith('+'):
if plain:
print(line)
else:
print(colorama.Fore.GREEN + line)
elif line.startswith('-'):
if plain:
print(line)
else:
print(colorama.Fore.RED + line)
else:
print(line)
if not plain:
print(colorama.Style.RESET_ALL, end='')

print(f'\n{recinfo.name} is only {percenttext} similar to the original, diff above')

# If html, record the diffs to an HTML file
if html_path:
escaped = html.escape('\\n'.join(udiff).replace('"', '\\"').replace('\n', '\\n'))
htmlinsert.append(f'{{address: "0x{addr:x}", name: "{html.escape(recinfo.name)}", matching: {effective_ratio}, diff: "{escaped}"}}')

except UnicodeDecodeError:
break

def gen_html(html_file, data):
output_data = Renderer().render_path(get_file_in_script_dir('template.html'),
@@ -1,3 +1,4 @@
capstone
colorama
pystache
isledecomp
pystache