Checkorder tool to keep functions in original binary order (#228)

* First commit of order tool

* More flexible match on module name. Bugfix on blank_or_comment

* Report inexact offset comments in verbose mode. Bugfix for exact regex

* Refactor checkorder into reusable isledecomp module

* Find bad comments in one pass, add awareness of TEMPLATE

* Refactor of state machine to prepare for reccmp integration

* Use isledecomp lib in reccmp

* Build isledecomp in GH actions, fix mypy complaint

* Ensure unit test cpp files will be ignored by reccmp

* Allow multiple offset markers, pep8 cleanup

* Remove unused variable

* Code style, remove unneeded module and TODO

* Final renaming and type hints

* Fix checkorder issues, add GH action and enforce (#2)

* Fix checkorder issues

* Add GH action

* Test error case

* Works

* Fixes

---------

Co-authored-by: Christian Semmler <mail@csemmler.com>
This commit is contained in:
MS
2023-11-21 03:44:45 -05:00
committed by GitHub
parent 714d36b57d
commit 1ae3b07dc2
84 changed files with 4021 additions and 3209 deletions

View File

@@ -0,0 +1,120 @@
import os
import sys
import argparse
from isledecomp.dir import (
walk_source_dir,
is_file_cpp
)
from isledecomp.parser import find_code_blocks
from isledecomp.parser.util import (
is_exact_offset_comment
)
def sig_truncate(sig: str) -> str:
    """Shorten a function signature so tool output stays under 80 columns.

    Signatures of 50 characters or fewer are returned unchanged. Longer
    ones are cut to their first 47 characters followed by '...', so the
    result is never more than 50 characters long.
    """
    max_len = 50
    if len(sig) <= max_len:
        # Fits already; do not drop characters without marking the cut
        return sig
    return f"{sig[:max_len - 3]}..."
def check_file(filename: str, verbose: bool = False) -> bool:
    """Parse one source file and report whether its code blocks are
    out of order with respect to their offsets.

    Nothing is printed for a file that is in order, unless verbose is
    set and the file has offset comments that do not use the exact
    syntax. In verbose mode each block of a jumbled file is listed,
    followed by any inexact offset comments.

    Returns True if the offsets are not in ascending order."""
    with open(filename, 'r') as source:
        code_blocks = find_code_blocks(source)

    inexact_comments = []
    offsets = []
    for block in code_blocks:
        offsets.append(block.offset)
        if not is_exact_offset_comment(block.offset_comment):
            inexact_comments.append((block.start_line, block.offset_comment))

    ranked_offsets = sorted(offsets)
    jumbled = offsets != ranked_offsets

    # A jumbled file is always reported. Inexact comments alone are
    # only worth mentioning in verbose mode.
    if not jumbled and not (verbose and inexact_comments):
        return False

    print(filename)

    if verbose:
        if jumbled:
            rank_of = {offset: rank
                       for rank, offset in enumerate(ranked_offsets)}
            previous = 0

            for block in code_blocks:
                columns = [
                    ' ' if block.offset > previous else '!',
                    f'{block.offset:08x}',
                    f'{block.end_line - block.start_line:4} lines',
                    f'{rank_of[block.offset]:3}',
                    ' ',
                    sig_truncate(block.signature),
                ]
                print(' '.join(columns))
                previous = block.offset

        for line_no, comment in inexact_comments:
            print(f'* line {line_no:3} bad offset comment ({comment})')

    print()

    return jumbled
def parse_args(test_args: 'list | None' = None) -> dict:
    """Parse the command-line options of the checkorder tool.

    Args:
        test_args: Argument strings to parse instead of the process
            command line. When None, argparse reads sys.argv[1:].

    Returns:
        A dict with the keys 'target', 'enforce' and 'verbose'.
    """
    # The annotation is quoted on purpose: an unquoted `list | None` is
    # evaluated when the function is defined and raises TypeError on
    # Python versions before 3.10.
    p = argparse.ArgumentParser()
    p.add_argument('target', help='The file or directory to check.')
    p.add_argument('--enforce', action=argparse.BooleanOptionalAction,
                   default=False,
                   help='Fail with error code if target is out of order.')
    p.add_argument('--verbose', action=argparse.BooleanOptionalAction,
                   default=False,
                   help=('Display each code block in the file and show '
                         'where each consecutive run of blocks is broken.'))

    # ArgumentParser.parse_args(None) already falls back to the process
    # command line, so no branching on test_args is needed.
    return vars(p.parse_args(test_args))
def main():
    """Command-line entry point: check the target file or directory and
    print how many files are out of order. With --enforce, exit with
    status 1 if any file is out of order."""
    args = parse_args()
    target = args['target']

    if os.path.isdir(target):
        files_to_check = list(walk_source_dir(target))
    elif os.path.isfile(target) and is_file_cpp(target):
        files_to_check = [target]
    else:
        sys.exit('Invalid target')

    # check_file prints its own report for each file as we go
    jumbled_count = sum(1 for path in files_to_check
                        if check_file(path, args['verbose']))

    if jumbled_count == 0:
        return

    subject = 'files are' if jumbled_count > 1 else 'file is'
    print(f'{jumbled_count} {subject} out of order')

    if args['enforce']:
        sys.exit(1)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1 @@
isledecomp

1
tools/isledecomp/.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
isledecomp.egg-info/

View File

View File

@@ -0,0 +1,21 @@
import os
from typing import Iterator
def is_file_cpp(filename: str) -> bool:
    """Tell whether the file name carries a '.h' or '.cpp' extension.
    The comparison ignores case."""
    extension = os.path.splitext(filename)[1]
    return extension.lower() in ('.h', '.cpp')
def walk_source_dir(source: str, recursive: bool = True) -> Iterator[str]:
    """Generator that yields the path of each C++ file found under the
    given directory. Paths are built from the absolute form of source.
    If recursive is False, only the directory itself is searched and
    its subdirectories are skipped."""
    root = os.path.abspath(source)
    for folder, _subfolders, names in os.walk(root):
        yield from (os.path.join(folder, name)
                    for name in names
                    if is_file_cpp(name))

        if not recursive:
            # os.walk reports the top directory first, so stop after it
            break

View File

@@ -0,0 +1 @@
from .parser import find_code_blocks

View File

@@ -0,0 +1,142 @@
# C++ file parser
from typing import List, TextIO
from enum import Enum
from .util import (
CodeBlock,
OffsetMatch,
is_blank_or_comment,
match_offset_comment,
is_exact_offset_comment,
get_template_function_name,
remove_trailing_comment,
distinct_by_module,
)
class ReaderState(Enum):
    """States of the line-by-line reader in find_code_blocks."""
    # Starting state: scanning for an offset comment
    WANT_OFFSET = 0
    # Offset comment seen; the next line that is not blank or a comment
    # is taken as the function signature
    WANT_SIG = 1
    # Inside a function body, waiting for a line that starts with '}'
    IN_FUNC = 2
    # TEMPLATE offset comment seen; the line that follows gives the name
    IN_TEMPLATE = 3
    # Signature seen without a brace; waiting for a line that is only '{'
    WANT_CURLY = 4
    # End of a function reached; its blocks are emitted at the top of
    # the next loop pass
    FUNCTION_DONE = 5
def find_code_blocks(stream: TextIO) -> List[CodeBlock]:
    """Read the IO stream (file) line-by-line and give the following report:
    Foreach code block (function) in the file, what are its starting and
    ending line numbers, and what is the given offset in the original
    binary. We expect the result to be ordered by line number because we
    are reading the file from start to finish.

    One CodeBlock is produced per offset marker that precedes a function,
    after markers with a repeated module name have been dropped. The
    stream is only read with readline()."""

    blocks: List[CodeBlock] = []

    # Offset markers collected for the function currently being read
    offset_matches: List[OffsetMatch] = []

    function_sig = None
    start_line = None
    end_line = None
    state = ReaderState.WANT_OFFSET

    # 1-based to match cvdump and your text editor
    # I know it says 0, but we will increment before each readline()
    line_no = 0
    # When False, the next loop pass re-examines the current line
    # instead of reading a new one.
    can_seek = True

    while True:
        # Do this before reading again so that an EOF will not
        # cause us to miss the last function of the file.
        if state == ReaderState.FUNCTION_DONE:
            # Our list of offset marks could have duplicates on
            # module name, so we'll eliminate those now.
            for offset_match in distinct_by_module(offset_matches):
                block = CodeBlock(offset=offset_match.address,
                                  signature=function_sig,
                                  start_line=start_line,
                                  end_line=end_line,
                                  offset_comment=offset_match.comment,
                                  module=offset_match.module,
                                  is_template=offset_match.is_template,
                                  is_stub=offset_match.is_stub)
                blocks.append(block)
            offset_matches = []
            state = ReaderState.WANT_OFFSET

        if can_seek:
            line_no += 1
            line = stream.readline()
            if line == '':
                # NOTE(review): offset markers still pending here (EOF in
                # WANT_SIG, WANT_CURLY or IN_FUNC) are dropped without
                # producing a block.
                break

        new_match = match_offset_comment(line)
        if new_match is not None:
            # We will allow multiple offsets if we have just begun
            # the code block, but not after we hit the curly brace.
            if state in (ReaderState.WANT_OFFSET, ReaderState.IN_TEMPLATE,
                         ReaderState.WANT_SIG):
                # If we detected an offset marker unexpectedly,
                # we are handling it here so we can continue seeking.
                can_seek = True

                offset_matches.append(new_match)

                if new_match.is_template:
                    state = ReaderState.IN_TEMPLATE
                else:
                    state = ReaderState.WANT_SIG
            else:
                # We hit another offset unexpectedly.
                # We can recover easily by just ending the function here.
                # NOTE(review): when this is reached from WANT_CURLY,
                # start_line has not been assigned for this function, so
                # the block keeps the previous function's start_line
                # (or None if there was none).
                end_line = line_no - 1
                state = ReaderState.FUNCTION_DONE

                # Pause reading here so we handle the offset marker
                # on the next loop iteration
                can_seek = False

        elif state == ReaderState.IN_TEMPLATE:
            # TEMPLATE functions are a special case. The signature is
            # given on the next line (in a // comment)
            function_sig = get_template_function_name(line)
            start_line = line_no
            end_line = line_no
            state = ReaderState.FUNCTION_DONE

        elif state == ReaderState.WANT_SIG:
            # Skip blank lines or comments that come after the offset
            # marker. There is not a formal procedure for this, so just
            # assume the next "code line" is the function signature
            if not is_blank_or_comment(line):
                # Inline functions may end with a comment. Strip that out
                # to help parsing.
                function_sig = remove_trailing_comment(line.strip())

                # Now check to see if the opening curly bracket is on the
                # same line. clang-format should prevent this (BraceWrapping)
                # but it is easy to detect.
                # If the entire function is on one line, handle that too.
                if function_sig.endswith('{'):
                    start_line = line_no
                    state = ReaderState.IN_FUNC
                elif (function_sig.endswith('}') or
                      function_sig.endswith('};')):
                    start_line = line_no
                    end_line = line_no
                    state = ReaderState.FUNCTION_DONE
                else:
                    state = ReaderState.WANT_CURLY

        elif state == ReaderState.WANT_CURLY:
            # The function starts on the line holding the opening brace
            if line.strip() == '{':
                start_line = line_no
                state = ReaderState.IN_FUNC

        elif state == ReaderState.IN_FUNC:
            # Naive but reasonable assumption that functions will end with
            # a curly brace on its own line with no prepended spaces.
            if line.startswith('}'):
                end_line = line_no
                state = ReaderState.FUNCTION_DONE

    return blocks

View File

@@ -0,0 +1,97 @@
# C++ Parser utility functions and data structures
from __future__ import annotations # python <3.10 compatibility
import re
from typing import List
from collections import namedtuple
# One function found in a source file. offset is the address taken from
# the offset comment, start_line and end_line are line numbers in the
# source file, and offset_comment is the text of the marker itself.
CodeBlock = namedtuple('CodeBlock',
                       ['offset', 'signature', 'start_line', 'end_line',
                        'offset_comment', 'module', 'is_template', 'is_stub'])

# One parsed offset comment: the module name as written, the address as
# an int, whether the TEMPLATE and STUB modifiers were present, and the
# stripped text of the comment line.
OffsetMatch = namedtuple('OffsetMatch', ['module', 'address', 'is_template',
                                         'is_stub', 'comment'])

# This has not been formally established, but considering that "STUB"
# is a temporary state for a function, we assume it will appear last,
# after any other modifiers (i.e. TEMPLATE)

# To match a reasonable variance of formatting for the offset comment
# (case-insensitive, flexible spacing, optional 0x prefix)
offsetCommentRegex = re.compile(r'\s*//\s*OFFSET:\s*(\w+)\s+(?:0x)?([a-f0-9]+)(\s+TEMPLATE)?(\s+STUB)?',  # nopep8
                                flags=re.I)

# To match the exact syntax (text upper case, hex lower case, with spaces)
# that is used in most places
offsetCommentExactRegex = re.compile(r'^// OFFSET: [A-Z0-9]+ (0x[a-f0-9]+)( TEMPLATE)?( STUB)?$')  # nopep8

# The goal here is to just read whatever is on the next line, so some
# flexibility in the formatting seems OK
templateCommentRegex = re.compile(r'\s*//\s+(.*)')

# To remove any comment (//) or block comment (/*) and its leading spaces
# from the end of a code line
trailingCommentRegex = re.compile(r'(\s*(?://|/\*).*)$')
def get_template_function_name(line: str) -> str:
    """Extract the function name that follows a TEMPLATE offset marker.
    The name is the text after the leading // on the line. A line that
    has no such comment is returned unchanged."""
    found = templateCommentRegex.match(line)
    return line if found is None else found.group(1)
def remove_trailing_comment(line: str) -> str:
    """Cut a // or /* comment, along with the whitespace in front of
    it, off the end of a code line."""
    return re.sub(trailingCommentRegex, '', line)
def is_blank_or_comment(line: str) -> bool:
    """Tell whether a line can be skipped while looking for the function
    signature that follows an offset comment: either it is empty once
    stripped, or it begins with // or /*, or it ends with */."""
    text = line.strip()
    if not text:
        return True
    return text.startswith(('//', '/*')) or text.endswith('*/')
def is_exact_offset_comment(line: str) -> bool:
    """Tell whether an offset comment follows our (unofficial) exact
    syntax. Callers can use a False result to nudge the user into
    tidying up the comment."""
    return bool(offsetCommentExactRegex.match(line))
def match_offset_comment(line: str) -> OffsetMatch | None:
    """Parse a line as an offset comment using the lenient pattern.
    Returns None if the line is not an offset comment. The address is
    read as hexadecimal and the comment is the line with surrounding
    whitespace removed."""
    found = offsetCommentRegex.match(line)
    if found is None:
        return None

    module, hex_digits, template_flag, stub_flag = found.groups()
    return OffsetMatch(module=module,
                       address=int(hex_digits, 16),
                       is_template=template_flag is not None,
                       is_stub=stub_flag is not None,
                       comment=line.strip())
def distinct_by_module(offsets: List) -> List:
    """Reduce a list of offset markers to one marker per module name.
    Module names are compared without regard to case, and the marker
    that appears first wins. A list with fewer than two entries is
    handed back as-is."""
    if len(offsets) < 2:
        return offsets

    # Dicts keep insertion order (python >=3.7), so the first marker
    # seen for each module also fixes its position in the result.
    first_seen = {}
    for marker in offsets:
        first_seen.setdefault(marker.module.upper(), marker)

    return list(first_seen.values())

View File

@@ -0,0 +1,9 @@
from setuptools import setup, find_packages
# Package definition for the isledecomp library.
# find_packages() discovers the packages to include, so they do not
# have to be listed by hand.
setup(
    name='isledecomp',
    version='0.1.0',
    description='Python tools for the isledecomp project',
    packages=find_packages(),
    # pytest is only needed to run the unit tests
    tests_require=['pytest'],
)

View File

View File

@@ -0,0 +1,29 @@
// Sample for python unit tests
// Not part of the decomp
// A very simple class
class TestClass {
public:
TestClass();
virtual ~TestClass() override;
virtual MxResult Tickle() override; // vtable+08
// OFFSET: TEST 0x12345678
inline const char* ClassName() const // vtable+0c
{
// 0xabcd1234
return "TestClass";
}
// OFFSET: TEST 0xdeadbeef
inline MxBool IsA(const char* name) const override // vtable+10
{
return !strcmp(name, TestClass::ClassName());
}
private:
int m_hello;
int m_hiThere;
};

View File

@@ -0,0 +1,22 @@
// Sample for python unit tests
// Not part of the decomp
// A very simple well-formed code file
// OFFSET: TEST 0x1234
void function01()
{
// TODO
}
// OFFSET: TEST 0x2345
void function02()
{
// TODO
}
// OFFSET: TEST 0x3456
void function03()
{
// TODO
}

View File

@@ -0,0 +1,8 @@
// Sample for python unit tests
// Not part of the decomp
// OFFSET: TEST 0x10000001
inline const char* OneLineWithComment() const { return "MxDSObject"; }; // hi there
// OFFSET: TEST 0x10000002
inline const char* OneLine() const { return "MxDSObject"; };

View File

@@ -0,0 +1,16 @@
// Sample for python unit tests
// Not part of the decomp
#include <stdio.h>
int no_offset_comment()
{
static int dummy = 123;
return -1;
}
// OFFSET: TEST 0xdeadbeef
void regular_ole_function()
{
printf("hi there");
}

View File

@@ -0,0 +1,25 @@
// Sample for python unit tests
// Not part of the decomp
// Handling multiple offset markers
// OFFSET: TEST 0x1234
// OFFSET: HELLO 0x5555
void different_modules()
{
// TODO
}
// OFFSET: TEST 0x2345
// OFFSET: TEST 0x1234
void same_module()
{
// TODO
}
// OFFSET: TEST 0x2002
// OFFSET: test 0x1001
void same_case_insensitive()
{
// TODO
}

View File

@@ -0,0 +1,12 @@
// Sample for python unit tests
// Not part of the decomp
// OFFSET: TEST 0x1234
void short_function() { static char* msg = "oneliner"; }
// OFFSET: TEST 0x5555
void function_after_one_liner()
{
// This function comes after the previous that is on a single line.
// Do we report the offset for this one correctly?
}

View File

@@ -0,0 +1,20 @@
// Sample for python unit tests
// Not part of the decomp
// OFFSET: TEST 0x1001
void function_order01()
{
// TODO
}
// OFFSET: TEST 0x1003
void function_order03()
{
// TODO
}
// OFFSET: TEST 0x1002
void function_order02()
{
// TODO
}

View File

@@ -0,0 +1,23 @@
// Sample for python unit tests
// Not part of the decomp
// While it's reasonable to expect a well-formed file (and clang-format
// will make sure we get one), this will put the parser through its paces.
// OFFSET: TEST 0x1234
void curly_with_spaces()
{
static char* msg = "hello";
}
// OFFSET: TEST 0x5555
void weird_closing_curly()
{
int x = 123; }
// OFFSET: HELLO 0x5656
void bad_indenting() {
if (0)
{
int y = 5;
}}

View File

@@ -0,0 +1,128 @@
import os
import pytest
from typing import List, TextIO
from isledecomp.parser import find_code_blocks
from isledecomp.parser.util import CodeBlock
SAMPLE_DIR = os.path.join(os.path.dirname(__file__), 'samples')
def sample_file(filename: str) -> TextIO:
    """Open one of the sample files for reading. The path is resolved
    against the samples directory, so it does not matter which cwd the
    tests are run from."""
    return open(os.path.join(SAMPLE_DIR, filename), 'r')
def code_blocks_are_sorted(blocks: List[CodeBlock]) -> bool:
    """Helper that tells whether the blocks appear in ascending order
    of offset (equal neighbors are allowed)."""
    offsets = [block.offset for block in blocks]
    return all(lower <= upper for lower, upper in zip(offsets, offsets[1:]))
# Tests are below #
def test_sanity():
    """Parse a plain, well-formed file and check the basics"""
    with sample_file('basic_file.cpp') as stream:
        found = find_code_blocks(stream)

    assert len(found) == 3
    assert code_blocks_are_sorted(found) is True

    # n.b. Line numbers from the parser are 1-based, and a function
    # begins at its opening curly brace
    first = found[0]
    assert first.start_line == 8
    assert first.end_line == 10
def test_oneline():
    """This sample has a function written on a single line (assuming
    clang-format lets that through). It exercises the detection of the
    end of a function"""
    with sample_file('oneline_function.cpp') as stream:
        found = find_code_blocks(stream)

    assert len(found) == 2

    one_liner = found[0]
    assert one_liner.start_line == 5
    assert one_liner.end_line == 5
def test_missing_offset():
    """A function that has no offset comment at all"""
    with sample_file('missing_offset.cpp') as stream:
        found = find_code_blocks(stream)

    # TODO: For now, a function with no offset is simply not reported.
    # A comment that is present but too mangled to match leads to the
    # same outcome. We should detect these cases in the future.
    assert len(found) == 1
def test_jumbled_case():
    """Reacting to a jumbled file is the job of the downstream tools;
    the parser only reports what it sees. Here we just make sure the
    file is read correctly."""
    with sample_file('out_of_order.cpp') as stream:
        found = find_code_blocks(stream)

    assert len(found) == 3
    assert code_blocks_are_sorted(found) is False
def test_bad_file():
    """All three functions of the poorly formatted sample are found"""
    with sample_file('poorly_formatted.cpp') as stream:
        found = find_code_blocks(stream)

    assert len(found) == 3
def test_indented():
    """Functions declared inside a class will probably have indented
    offset comments."""
    with sample_file('basic_class.cpp') as stream:
        found = find_code_blocks(stream)

    # TODO: The end of these functions is not detected properly because
    # the closing brace is indented. For now, capturing every function
    # that is there matters more than knowing where each one ends.
    assert len(found) == 2

    first, second = found[0], found[1]

    assert first.offset == 0x12345678
    assert first.start_line == 15
    # assert first.end_line == 18

    assert second.offset == 0xdeadbeef
    assert second.start_line == 22
    # assert second.end_line == 24
def test_inline():
    """Each inline one-liner starts and ends on the same line"""
    with sample_file('inline.cpp') as stream:
        found = find_code_blocks(stream)

    assert len(found) == 2
    for entry in found:
        assert entry.start_line is not None
        assert entry.start_line == entry.end_line
def test_multiple_offsets():
    """When several offset marks precede a code block, all of them are
    taken as long as the module name (case-insensitive) is distinct.
    For duplicates, the first occurrence of the module is used."""
    with sample_file('multiple_offsets.cpp') as stream:
        found = find_code_blocks(stream)

    assert len(found) == 4
    test_block, hello_block, same_module, same_ignoring_case = found

    assert test_block.module == 'TEST'
    assert test_block.start_line == 9

    assert hello_block.module == 'HELLO'
    assert hello_block.start_line == 9

    # Duplicate modules are ignored
    assert same_module.start_line == 16
    assert same_module.offset == 0x2345

    assert same_ignoring_case.module == 'TEST'
    assert same_ignoring_case.offset == 0x2002

View File

@@ -0,0 +1,113 @@
import pytest
from collections import namedtuple
from typing import List
from isledecomp.parser.util import (
is_blank_or_comment,
match_offset_comment,
is_exact_offset_comment,
distinct_by_module,
)
blank_or_comment_param = [
(True, ''),
(True, '\t'),
(True, ' '),
(False, '\tint abc=123;'),
(True, '// OFFSET: LEGO1 0xdeadbeef'),
(True, ' /* Block comment beginning'),
(True, 'Block comment ending */ '),
# TODO: does clang-format have anything to say about these cases?
(False, 'x++; // Comment folows'),
(False, 'x++; /* Block comment begins'),
]
@pytest.mark.parametrize('expected, line', blank_or_comment_param)
def test_is_blank_or_comment(line: str, expected: bool):
    """Each sample line is classified as the table says it should be"""
    assert is_blank_or_comment(line) is expected
offset_comment_samples = [
# (can_parse: bool, exact_match: bool, line: str)
# Should match both expected modules with optional STUB marker
(True, True, '// OFFSET: LEGO1 0xdeadbeef'),
(True, True, '// OFFSET: LEGO1 0xdeadbeef STUB'),
(True, True, '// OFFSET: ISLE 0x12345678'),
(True, True, '// OFFSET: ISLE 0x12345678 STUB'),
# No trailing spaces allowed
(True, False, '// OFFSET: LEGO1 0xdeadbeef '),
(True, False, '// OFFSET: LEGO1 0xdeadbeef STUB '),
# Must have exactly one space between elements
(True, False, '//OFFSET: ISLE 0xdeadbeef'),
(True, False, '// OFFSET:ISLE 0xdeadbeef'),
(True, False, '// OFFSET: ISLE 0xdeadbeef'),
(True, False, '// OFFSET: ISLE 0xdeadbeef'),
(True, False, '// OFFSET: ISLE 0xdeadbeef'),
(True, False, '// OFFSET: ISLE 0xdeadbeef STUB'),
# Must have 0x prefix for hex number
(True, False, '// OFFSET: ISLE deadbeef'),
# Offset, module name, and STUB must be uppercase
(True, False, '// offset: ISLE 0xdeadbeef'),
(True, False, '// offset: isle 0xdeadbeef'),
(True, False, '// OFFSET: LEGO1 0xdeadbeef stub'),
# Hex string must be lowercase
(True, False, '// OFFSET: ISLE 0xDEADBEEF'),
# TODO: How flexible should we be with matching the module name?
(True, True, '// OFFSET: OMNI 0x12345678'),
(True, True, '// OFFSET: LEG01 0x12345678'),
(True, False, '// OFFSET: hello 0x12345678'),
# Not close enough to match
(False, False, '// OFFSET: ISLE0x12345678'),
(False, False, '// OFFSET: 0x12345678'),
(False, False, '// LEGO1: 0x12345678'),
# Hex string shorter than 8 characters
(True, True, '// OFFSET: LEGO1 0x1234'),
# TODO: These match but shouldn't.
# (False, False, '// OFFSET: LEGO1 0'),
# (False, False, '// OFFSET: LEGO1 0x'),
]
@pytest.mark.parametrize('match, exact, line', offset_comment_samples)
def test_offset_match(line: str, match: bool, exact):
    """The lenient matcher accepts exactly the samples marked as
    parseable. The exact column of the table is not used here."""
    assert (match_offset_comment(line) is not None) is match
@pytest.mark.parametrize('match, exact, line', offset_comment_samples)
def test_exact_offset_comment(line: str, exact: bool, match):
    """The strict check accepts exactly the samples marked as exact.
    The match column of the table is not used here."""
    assert is_exact_offset_comment(line) is exact
# Helper for the next test: cut down version of OffsetMatch
MiniOfs = namedtuple('MiniOfs', ['module', 'value'])
distinct_by_module_samples = [
# empty set
([], []),
# same module name
([MiniOfs('TEST', 123), MiniOfs('TEST', 555)],
[MiniOfs('TEST', 123)]),
# same module name, case-insensitive
([MiniOfs('test', 123), MiniOfs('TEST', 555)],
[MiniOfs('test', 123)]),
# duplicates, non-consecutive
([MiniOfs('test', 123), MiniOfs('abc', 111), MiniOfs('TEST', 555)],
[MiniOfs('test', 123), MiniOfs('abc', 111)]),
]
@pytest.mark.parametrize('sample, expected', distinct_by_module_samples)
def test_distinct_by_module(sample: List[MiniOfs], expected: List[MiniOfs]):
    """Each sample list reduces to the expected list of markers"""
    assert distinct_by_module(sample) == expected

View File

@@ -12,6 +12,8 @@ import sys
import colorama
import html
import re
from isledecomp.dir import walk_source_dir
from isledecomp.parser import find_code_blocks
from pystache import Renderer
parser = argparse.ArgumentParser(allow_abbrev=False,
@@ -413,145 +415,120 @@ htmlinsert = []
# Generate basename of original file, used in locating OFFSET lines
basename = os.path.basename(os.path.splitext(original)[0])
pattern = '// OFFSET:'
for subdir, dirs, files in os.walk(source):
for file in files:
srcfilename = os.path.join(os.path.abspath(subdir), file)
with open(srcfilename, 'r') as srcfile:
line_no = 0
for srcfilename in walk_source_dir(source):
with open(srcfilename, 'r') as srcfile:
blocks = find_code_blocks(srcfile)
while True:
try:
line = srcfile.readline()
line_no += 1
for block in blocks:
if block.is_stub:
continue
if not line:
break
if block.module != basename:
continue
line = line.strip()
addr = block.offset
# Verbose flag handling
if verbose:
if addr == verbose:
found_verbose_target = True
else:
continue
if line.startswith(pattern) and not line.endswith('STUB'):
par = line[len(pattern):].strip().split()
module = par[0]
if module != basename:
continue
if block.is_template:
recinfo = syminfo.get_recompiled_address_from_name(block.signature)
if not recinfo:
continue
else:
recinfo = syminfo.get_recompiled_address(srcfilename, block.start_line)
if not recinfo:
continue
addr = int(par[1], 16)
# The effective_ratio is the ratio when ignoring differing register
# allocation vs the ratio is the true ratio.
ratio = 0.0
effective_ratio = 0.0
if recinfo.size:
origasm = parse_asm(origfile, addr + recinfo.start, recinfo.size)
recompasm = parse_asm(recompfile, recinfo.addr + recinfo.start, recinfo.size)
# Verbose flag handling
if verbose:
if addr == verbose:
found_verbose_target = True
else:
continue
diff = difflib.SequenceMatcher(None, origasm, recompasm)
ratio = diff.ratio()
effective_ratio = ratio
if line.endswith('TEMPLATE'):
line = srcfile.readline()
line_no += 1
# Name comes after // comment
name = line.strip()[2:].strip()
if ratio != 1.0:
# Check whether we can resolve register swaps which are actually
# perfect matches modulo compiler entropy.
if can_resolve_register_differences(origasm, recompasm):
effective_ratio = 1.0
else:
ratio = 0
recinfo = syminfo.get_recompiled_address_from_name(name)
if not recinfo:
continue
else:
find_open_bracket = line
while '{' not in find_open_bracket:
find_open_bracket = srcfile.readline()
line_no += 1
percenttext = f'{(effective_ratio * 100):.2f}%'
if not plain:
if effective_ratio == 1.0:
percenttext = colorama.Fore.GREEN + percenttext + colorama.Style.RESET_ALL
elif effective_ratio > 0.8:
percenttext = colorama.Fore.YELLOW + percenttext + colorama.Style.RESET_ALL
else:
percenttext = colorama.Fore.RED + percenttext + colorama.Style.RESET_ALL
recinfo = syminfo.get_recompiled_address(srcfilename, line_no)
if not recinfo:
continue
if effective_ratio == 1.0 and ratio != 1.0:
if plain:
percenttext += '*'
else:
percenttext += colorama.Fore.RED + '*' + colorama.Style.RESET_ALL
# The effective_ratio is the ratio when ignoring differing register
# allocation vs the ratio is the true ratio.
ratio = 0.0
effective_ratio = 0.0
if recinfo.size:
origasm = parse_asm(origfile, addr + recinfo.start, recinfo.size)
recompasm = parse_asm(recompfile, recinfo.addr + recinfo.start, recinfo.size)
if args.print_rec_addr:
addrs = f'0x{addr:x} / 0x{recinfo.addr:x}'
else:
addrs = hex(addr)
diff = difflib.SequenceMatcher(None, origasm, recompasm)
ratio = diff.ratio()
effective_ratio = ratio
if not verbose:
print(f' {recinfo.name} ({addrs}) is {percenttext} similar to the original')
if ratio != 1.0:
# Check whether we can resolve register swaps which are actually
# perfect matches modulo compiler entropy.
if can_resolve_register_differences(origasm, recompasm):
effective_ratio = 1.0
else:
ratio = 0
function_count += 1
total_accuracy += ratio
total_effective_accuracy += effective_ratio
percenttext = f'{(effective_ratio * 100):.2f}%'
if not plain:
if effective_ratio == 1.0:
percenttext = colorama.Fore.GREEN + percenttext + colorama.Style.RESET_ALL
elif effective_ratio > 0.8:
percenttext = colorama.Fore.YELLOW + percenttext + colorama.Style.RESET_ALL
else:
percenttext = colorama.Fore.RED + percenttext + colorama.Style.RESET_ALL
if recinfo.size:
udiff = difflib.unified_diff(origasm, recompasm, n=10)
if effective_ratio == 1.0 and ratio != 1.0:
# If verbose, print the diff for that function to the output
if verbose:
if effective_ratio == 1.0:
ok_text = 'OK!' if plain else (colorama.Fore.GREEN + '✨ OK! ✨' + colorama.Style.RESET_ALL)
if ratio == 1.0:
print(f'{addrs}: {recinfo.name} 100% match.\n\n{ok_text}\n\n')
else:
print(f'{addrs}: {recinfo.name} Effective 100%% match. (Differs in register allocation only)\n\n{ok_text} (still differs in register allocation)\n\n')
else:
for line in udiff:
if line.startswith('++') or line.startswith('@@') or line.startswith('--'):
# Skip unneeded parts of the diff for the brief view
pass
elif line.startswith('+'):
if plain:
percenttext += '*'
print(line)
else:
percenttext += colorama.Fore.RED + '*' + colorama.Style.RESET_ALL
if args.print_rec_addr:
addrs = f'0x{addr:x} / 0x{recinfo.addr:x}'
print(colorama.Fore.GREEN + line)
elif line.startswith('-'):
if plain:
print(line)
else:
print(colorama.Fore.RED + line)
else:
addrs = hex(addr)
print(line)
if not plain:
print(colorama.Style.RESET_ALL, end='')
if not verbose:
print(f' {recinfo.name} ({addrs}) is {percenttext} similar to the original')
print(f'\n{recinfo.name} is only {percenttext} similar to the original, diff above')
function_count += 1
total_accuracy += ratio
total_effective_accuracy += effective_ratio
# If html, record the diffs to an HTML file
if html_path:
escaped = html.escape('\\n'.join(udiff).replace('"', '\\"').replace('\n', '\\n'))
htmlinsert.append(f'{{address: "0x{addr:x}", name: "{html.escape(recinfo.name)}", matching: {effective_ratio}, diff: "{escaped}"}}')
if recinfo.size:
udiff = difflib.unified_diff(origasm, recompasm, n=10)
# If verbose, print the diff for that function to the output
if verbose:
if effective_ratio == 1.0:
ok_text = 'OK!' if plain else (colorama.Fore.GREEN + '✨ OK! ✨' + colorama.Style.RESET_ALL)
if ratio == 1.0:
print(f'{addrs}: {recinfo.name} 100% match.\n\n{ok_text}\n\n')
else:
print(f'{addrs}: {recinfo.name} Effective 100%% match. (Differs in register allocation only)\n\n{ok_text} (still differs in register allocation)\n\n')
else:
for line in udiff:
if line.startswith('++') or line.startswith('@@') or line.startswith('--'):
# Skip unneeded parts of the diff for the brief view
pass
elif line.startswith('+'):
if plain:
print(line)
else:
print(colorama.Fore.GREEN + line)
elif line.startswith('-'):
if plain:
print(line)
else:
print(colorama.Fore.RED + line)
else:
print(line)
if not plain:
print(colorama.Style.RESET_ALL, end='')
print(f'\n{recinfo.name} is only {percenttext} similar to the original, diff above')
# If html, record the diffs to an HTML file
if html_path:
escaped = html.escape('\\n'.join(udiff).replace('"', '\\"').replace('\n', '\\n'))
htmlinsert.append(f'{{address: "0x{addr:x}", name: "{html.escape(recinfo.name)}", matching: {effective_ratio}, diff: "{escaped}"}}')
except UnicodeDecodeError:
break
def gen_html(html_file, data):
output_data = Renderer().render_path(get_file_in_script_dir('template.html'),

View File

@@ -1,3 +1,4 @@
capstone
colorama
pystache
isledecomp
pystache