Python Linting and Code Formatting (#298)

* Create common print_diff function
* Add pylint and black
* Fix linting, move classes to utils
* Add black/pylint to github actions
* Fix linting
* Move Bin and SymInfo into their own files
* Split out format
* Tidy up workflows and pip, add readme
* Lint tests, add tests to readme
2025-10-27 02:14:18 +00:00 · 2023-11-26 07:27:42 +13:00
parent fb0d1ccb62
commit b14116cc93
22 changed files with 1675 additions and 789 deletions
--- a/tools/isledecomp/isledecomp/__init__.py
+++ b/tools/isledecomp/isledecomp/__init__.py
@@ -0,0 +1,5 @@
+from .bin import *
+from .dir import *
+from .parser import *
+from .syminfo import *
+from .utils import *
--- a/tools/isledecomp/isledecomp/bin.py
+++ b/tools/isledecomp/isledecomp/bin.py
@@ -0,0 +1,47 @@
+import struct
+
+
+# Declare a class that can automatically convert virtual executable addresses
+# to file addresses
+class Bin:
+    """Context manager that opens an executable and maps virtual addresses
+    in its .text section to file offsets.
+
+    The header fields are read from fixed file offsets when the context is
+    entered (see the HACK note in __enter__); the file is closed on exit.
+    """
+
+    def __init__(self, filename, logger):
+        """Remember the file name and logger; nothing is opened yet."""
+        self.logger = logger
+        self.logger.debug('Parsing headers of "%s"... ', filename)
+        self.filename = filename
+        self.file = None
+        self.imagebase = None
+        self.textvirt = None
+        self.textraw = None
+
+    def __enter__(self):
+        """Open the file and read ImageBase and the .text section fields.
+
+        Raises OSError if the file cannot be opened and struct.error if it
+        is too short to contain the expected fields.
+        """
+        self.logger.debug("Bin %s Enter", self.filename)
+        self.file = open(self.filename, "rb")
+
+        try:
+            # HACK: Strictly, we should be parsing the header, but we know
+            #      where everything is in these two files so we just jump
+            #      straight there
+
+            # Read ImageBase
+            self.imagebase = self._read_u32(0xB4)
+
+            # Read .text VirtualAddress
+            self.textvirt = self._read_u32(0x184)
+
+            # Read .text PointerToRawData
+            self.textraw = self._read_u32(0x18C)
+        except (OSError, struct.error):
+            # __exit__ is not called when __enter__ raises, so release the
+            # handle here instead of leaking it.
+            self.file.close()
+            self.file = None
+            raise
+
+        self.logger.debug("... Parsing finished")
+        return self
+
+    def __exit__(self, exc_type, exc_value, exc_traceback):
+        """Close the file if it was opened; exceptions are not suppressed."""
+        self.logger.debug("Bin %s Exit", self.filename)
+        if self.file:
+            self.file.close()
+
+    def _read_u32(self, file_offset):
+        """Read one little-endian unsigned 32-bit value at `file_offset`."""
+        self.file.seek(file_offset)
+        # Unsigned ("<I"): these fields are addresses/offsets, so a value
+        # with the top bit set must not be read as a negative number.
+        (value,) = struct.unpack("<I", self.file.read(4))
+        return value
+
+    def get_addr(self, virt):
+        """Convert a virtual address inside .text to a file offset."""
+        return virt - self.imagebase - self.textvirt + self.textraw
+
+    def read(self, offset, size):
+        """Return `size` bytes starting at virtual address `offset`."""
+        self.file.seek(self.get_addr(offset))
+        return self.file.read(size)
--- a/tools/isledecomp/isledecomp/dir.py
+++ b/tools/isledecomp/isledecomp/dir.py
@@ -1,21 +1,63 @@
 import os
+import subprocess
+import sys
 from typing import Iterator


+class WinePathConverter:
+    """Translate paths between Unix form and Wine's Windows form.
+
+    Paths relative to, or located under, the working directory given at
+    construction are converted with string operations; anything else is
+    delegated to the external `winepath` tool.
+    """
+
+    def __init__(self, unix_cwd):
+        """Store `unix_cwd` and ask winepath for its Windows equivalent."""
+        self.unix_cwd = unix_cwd
+        self.win_cwd = self._call_winepath_unix2win(self.unix_cwd)
+
+    def get_wine_path(self, unix_fn: str) -> str:
+        """Return the Windows-style path for the Unix path `unix_fn`."""
+        if unix_fn.startswith("./"):
+            return self.win_cwd + "\\" + unix_fn[2:].replace("/", "\\")
+        if unix_fn.startswith(self.unix_cwd):
+            return (
+                self.win_cwd
+                + "\\"
+                + unix_fn.removeprefix(self.unix_cwd).replace("/", "\\").lstrip("\\")
+            )
+        return self._call_winepath_unix2win(unix_fn)
+
+    def get_unix_path(self, win_fn: str) -> str:
+        """Return the Unix-style path for the Windows path `win_fn`."""
+        if win_fn.startswith(".\\") or win_fn.startswith("./"):
+            return self.unix_cwd + "/" + win_fn[2:].replace("\\", "/")
+        if win_fn.startswith(self.win_cwd):
+            # Strip the separator left over after removing the prefix so the
+            # result is "cwd/rest" rather than "cwd//rest", mirroring
+            # get_wine_path.
+            return (
+                self.unix_cwd
+                + "/"
+                + win_fn.removeprefix(self.win_cwd).replace("\\", "/").lstrip("/")
+            )
+        return self._call_winepath_win2unix(win_fn)
+
+    @staticmethod
+    def _call_winepath_unix2win(fn: str) -> str:
+        """Run `winepath -w` on `fn` and return its stripped output."""
+        return subprocess.check_output(["winepath", "-w", fn], text=True).strip()
+
+    @staticmethod
+    def _call_winepath_win2unix(fn: str) -> str:
+        """Run `winepath` on `fn` and return its stripped output."""
+        return subprocess.check_output(["winepath", fn], text=True).strip()
+
+
 def is_file_cpp(filename: str) -> bool:
-    (basefile, ext) = os.path.splitext(filename)
-    return ext.lower() in ('.h', '.cpp')
+    # True when the extension, compared case-insensitively, is .h or .cpp.
+    (_, ext) = os.path.splitext(filename)
+    return ext.lower() in (".h", ".cpp")


 def walk_source_dir(source: str, recursive: bool = True) -> Iterator[str]:
    """Generator to walk the given directory recursively and return
-       any C++ files found."""
+    any C++ files found."""

    source = os.path.abspath(source)
-    for subdir, dirs, files in os.walk(source):
+    for subdir, _, files in os.walk(source):
        for file in files:
            if is_file_cpp(file):
                yield os.path.join(subdir, file)

+        # os.walk yields the top directory first, so stopping after one
+        # iteration limits the search to `source` itself.
        if not recursive:
            break
+
+
+def get_file_in_script_dir(fn):
+    """Return the path of `fn` inside the directory that holds the running
+    script, as named by sys.argv[0]."""
+    script_path = os.path.abspath(sys.argv[0])
+    script_dir = os.path.dirname(script_path)
+    return os.path.join(script_dir, fn)
--- a/tools/isledecomp/isledecomp/parser/parser.py
+++ b/tools/isledecomp/isledecomp/parser/parser.py
@@ -7,7 +7,6 @@ from .util import (
    OffsetMatch,
    is_blank_or_comment,
    match_offset_comment,
-    is_exact_offset_comment,
    get_template_function_name,
    remove_trailing_comment,
    distinct_by_module,
@@ -25,10 +24,10 @@ class ReaderState(Enum):

 def find_code_blocks(stream: TextIO) -> List[CodeBlock]:
    """Read the IO stream (file) line-by-line and give the following report:
-       Foreach code block (function) in the file, what are its starting and
-       ending line numbers, and what is the given offset in the original
-       binary. We expect the result to be ordered by line number because we
-       are reading the file from start to finish."""
+    Foreach code block (function) in the file, what are its starting and
+    ending line numbers, and what is the given offset in the original
+    binary. We expect the result to be ordered by line number because we
+    are reading the file from start to finish."""

    blocks: List[CodeBlock] = []

@@ -51,14 +50,16 @@ def find_code_blocks(stream: TextIO) -> List[CodeBlock]:
            # Our list of offset marks could have duplicates on
            # module name, so we'll eliminate those now.
            for offset_match in distinct_by_module(offset_matches):
-                block = CodeBlock(offset=offset_match.address,
-                                  signature=function_sig,
-                                  start_line=start_line,
-                                  end_line=end_line,
-                                  offset_comment=offset_match.comment,
-                                  module=offset_match.module,
-                                  is_template=offset_match.is_template,
-                                  is_stub=offset_match.is_stub)
+                block = CodeBlock(
+                    offset=offset_match.address,
+                    signature=function_sig,
+                    start_line=start_line,
+                    end_line=end_line,
+                    offset_comment=offset_match.comment,
+                    module=offset_match.module,
+                    is_template=offset_match.is_template,
+                    is_stub=offset_match.is_stub,
+                )
                blocks.append(block)
            offset_matches = []
            state = ReaderState.WANT_OFFSET
@@ -66,15 +67,18 @@ def find_code_blocks(stream: TextIO) -> List[CodeBlock]:
        if can_seek:
            line_no += 1
            line = stream.readline()
-            if line == '':
+            if line == "":
                break

        new_match = match_offset_comment(line)
        if new_match is not None:
            # We will allow multiple offsets if we have just begun
            # the code block, but not after we hit the curly brace.
-            if state in (ReaderState.WANT_OFFSET, ReaderState.IN_TEMPLATE,
-                         ReaderState.WANT_SIG):
+            if state in (
+                ReaderState.WANT_OFFSET,
+                ReaderState.IN_TEMPLATE,
+                ReaderState.WANT_SIG,
+            ):
                # If we detected an offset marker unexpectedly,
                # we are handling it here so we can continue seeking.
                can_seek = True
@@ -116,11 +120,10 @@ def find_code_blocks(stream: TextIO) -> List[CodeBlock]:
                # same line. clang-format should prevent this (BraceWrapping)
                # but it is easy to detect.
                # If the entire function is on one line, handle that too.
-                if function_sig.endswith('{'):
+                if function_sig.endswith("{"):
                    start_line = line_no
                    state = ReaderState.IN_FUNC
-                elif (function_sig.endswith('}') or
-                        function_sig.endswith('};')):
+                elif function_sig.endswith("}") or function_sig.endswith("};"):
                    start_line = line_no
                    end_line = line_no
                    state = ReaderState.FUNCTION_DONE
@@ -128,14 +131,14 @@ def find_code_blocks(stream: TextIO) -> List[CodeBlock]:
                    state = ReaderState.WANT_CURLY

        elif state == ReaderState.WANT_CURLY:
-            if line.strip() == '{':
+            if line.strip() == "{":
                start_line = line_no
                state = ReaderState.IN_FUNC

        elif state == ReaderState.IN_FUNC:
            # Naive but reasonable assumption that functions will end with
            # a curly brace on its own line with no prepended spaces.
-            if line.startswith('}'):
+            if line.startswith("}"):
                end_line = line_no
                state = ReaderState.FUNCTION_DONE

--- a/tools/isledecomp/isledecomp/parser/util.py
+++ b/tools/isledecomp/isledecomp/parser/util.py
@@ -5,34 +5,49 @@ from typing import List
 from collections import namedtuple


-CodeBlock = namedtuple('CodeBlock',
-                       ['offset', 'signature', 'start_line', 'end_line',
-                        'offset_comment', 'module', 'is_template', 'is_stub'])
+CodeBlock = namedtuple(
+    "CodeBlock",
+    [
+        "offset",
+        "signature",
+        "start_line",
+        "end_line",
+        "offset_comment",
+        "module",
+        "is_template",
+        "is_stub",
+    ],
+)

-OffsetMatch = namedtuple('OffsetMatch', ['module', 'address', 'is_template',
-                                         'is_stub', 'comment'])
+OffsetMatch = namedtuple(
+    "OffsetMatch", ["module", "address", "is_template", "is_stub", "comment"]
+)

 # This has not been formally established, but considering that "STUB"
 # is a temporary state for a function, we assume it will appear last,
 # after any other modifiers (i.e. TEMPLATE)

 # To match a reasonable variance of formatting for the offset comment
-offsetCommentRegex = re.compile(r'\s*//\s*OFFSET:\s*(\w+)\s+(?:0x)?([a-f0-9]+)(\s+TEMPLATE)?(\s+STUB)?',  # nopep8
-                                flags=re.I)
+offsetCommentRegex = re.compile(
+    r"\s*//\s*OFFSET:\s*(\w+)\s+(?:0x)?([a-f0-9]+)(\s+TEMPLATE)?(\s+STUB)?",  # nopep8
+    flags=re.I,
+)

 # To match the exact syntax (text upper case, hex lower case, with spaces)
 # that is used in most places
-offsetCommentExactRegex = re.compile(r'^// OFFSET: [A-Z0-9]+ (0x[a-f0-9]+)( TEMPLATE)?( STUB)?$')  # nopep8
+offsetCommentExactRegex = re.compile(
+    r"^// OFFSET: [A-Z0-9]+ (0x[a-f0-9]+)( TEMPLATE)?( STUB)?$"
+)  # nopep8


 # The goal here is to just read whatever is on the next line, so some
 # flexibility in the formatting seems OK
-templateCommentRegex = re.compile(r'\s*//\s+(.*)')
+templateCommentRegex = re.compile(r"\s*//\s+(.*)")


 # To remove any comment (//) or block comment (/*) and its leading spaces
 # from the end of a code line
-trailingCommentRegex = re.compile(r'(\s*(?://|/\*).*)$')
+trailingCommentRegex = re.compile(r"(\s*(?://|/\*).*)$")


 def get_template_function_name(line: str) -> str:
@@ -47,23 +62,25 @@ def get_template_function_name(line: str) -> str:


 def remove_trailing_comment(line: str) -> str:
-    return trailingCommentRegex.sub('', line)
+    # Strips a trailing // or /* comment together with the whitespace
+    # in front of it.
+    return trailingCommentRegex.sub("", line)


 def is_blank_or_comment(line: str) -> bool:
    """Helper to read ahead after the offset comment is matched.
-       There could be blank lines or other comments before the
-       function signature, and we want to skip those."""
+    There could be blank lines or other comments before the
+    function signature, and we want to skip those."""
    line_strip = line.strip()
-    return (len(line_strip) == 0
-            or line_strip.startswith('//')
-            or line_strip.startswith('/*')
-            or line_strip.endswith('*/'))
+    # The "*/" check treats the closing line of a block comment as a comment;
+    # non-blank lines in the middle of a multi-line block comment are not
+    # detected.
+    return (
+        len(line_strip) == 0
+        or line_strip.startswith("//")
+        or line_strip.startswith("/*")
+        or line_strip.endswith("*/")
+    )


 def is_exact_offset_comment(line: str) -> bool:
    """If the offset comment does not match our (unofficial) syntax
-       we may want to alert the user to fix it for style points."""
+    we may want to alert the user to fix it for style points."""
+    # The pattern is anchored with ^ and $, so the line must match in full;
+    # leading indentation makes the match fail.
    return offsetCommentExactRegex.match(line) is not None


@@ -72,17 +89,19 @@ def match_offset_comment(line: str) -> OffsetMatch | None:
    if match is None:
        return None

-    return OffsetMatch(module=match.group(1),
-                       address=int(match.group(2), 16),
-                       is_template=match.group(3) is not None,
-                       is_stub=match.group(4) is not None,
-                       comment=line.strip())
+    return OffsetMatch(
+        module=match.group(1),
+        address=int(match.group(2), 16),
+        is_template=match.group(3) is not None,
+        is_stub=match.group(4) is not None,
+        comment=line.strip(),
+    )


 def distinct_by_module(offsets: List) -> List:
    """Given a list of offset markers, return a list with distinct
-       module names. If module names (case-insensitive) are repeated,
-       choose the offset that appears first."""
+    module names. If module names (case-insensitive) are repeated,
+    choose the offset that appears first."""

    if len(offsets) < 2:
        return offsets
--- a/tools/isledecomp/isledecomp/syminfo.py
+++ b/tools/isledecomp/isledecomp/syminfo.py
@@ -0,0 +1,138 @@
+import os
+import subprocess
+from .utils import get_file_in_script_dir
+
+
+class RecompiledInfo:
+    # Plain record describing one function symbol; SymInfo fills these in
+    # while parsing the symbol dump.
+    # addr: symbol address plus the recompiled file's imagebase and textvirt
+    addr = None
+    # size: function size taken from the symbol line
+    size = None
+    # name: symbol name taken from the symbol line
+    name = None
+    # start: start offset; 0 unless the debug offsets are used
+    start = None
+
+
+# Declare a class that parses the output of cvdump for fast access later
+class SymInfo:
+    """Index of function symbols and line numbers parsed from cvdump output.
+
+    Per-instance attributes:
+      funcs: symbol address as printed by cvdump -> RecompiledInfo
+      lines: source path -> {line number -> address}
+      names: symbol name -> RecompiledInfo
+    """
+
+    def __init__(self, pdb, sym_recompfile, sym_logger, sym_wine_path_converter=None):
+        """Run cvdump on `pdb` and index its SYMBOLS and LINES sections.
+
+        `sym_recompfile` must expose `imagebase` and `textvirt`. When
+        `sym_wine_path_converter` is given, cvdump is run through wine and
+        paths are converted with it. Raises subprocess.CalledProcessError
+        if cvdump exits with an error.
+        """
+        # Created per instance: as class attributes these dicts were shared
+        # by every SymInfo, so a second instance saw the first one's data.
+        self.funcs = {}
+        self.lines = {}
+        self.names = {}
+
+        self.logger = sym_logger
+        call = [get_file_in_script_dir("cvdump.exe"), "-l", "-s"]
+
+        if sym_wine_path_converter:
+            # Run cvdump through wine and convert path to Windows-friendly wine path
+            call.insert(0, "wine")
+            call.append(sym_wine_path_converter.get_wine_path(pdb))
+        else:
+            call.append(pdb)
+
+        self.logger.info("Parsing %s ...", pdb)
+        self.logger.debug("Command = %s", call)
+        line_dump = subprocess.check_output(call).decode("utf-8").split("\r\n")
+
+        current_section = None
+
+        self.logger.debug("Parsing output of cvdump.exe ...")
+
+        for i, line in enumerate(line_dump):
+            if line.startswith("***"):
+                current_section = line[4:]
+
+            if current_section == "SYMBOLS" and "S_GPROC32" in line:
+                # Fields are taken from fixed column positions of the line.
+                sym_addr = int(line[26:34], 16)
+
+                info = RecompiledInfo()
+                info.addr = (
+                    sym_addr + sym_recompfile.imagebase + sym_recompfile.textvirt
+                )
+
+                use_dbg_offs = False
+                if use_dbg_offs:
+                    debug_offs = line_dump[i + 2]
+                    debug_start = int(debug_offs[22:30], 16)
+                    debug_end = int(debug_offs[43:], 16)
+
+                    info.start = debug_start
+                    info.size = debug_end - debug_start
+                else:
+                    info.start = 0
+                    info.size = int(line[41:49], 16)
+
+                info.name = line[77:]
+
+                self.names[info.name] = info
+                self.funcs[sym_addr] = info
+            elif (
+                current_section == "LINES"
+                and line.startswith("  ")
+                and not line.startswith("   ")
+            ):
+                sourcepath = line.split()[0]
+
+                if sym_wine_path_converter:
+                    # Convert filename to Unix path for file compare
+                    sourcepath = sym_wine_path_converter.get_unix_path(sourcepath)
+
+                file_lines = self.lines.setdefault(sourcepath, {})
+
+                # The line/address pairs start two lines below the file name
+                # and run until a blank line. The bound check stops the scan
+                # cleanly if the output ends without one.
+                j = i + 2
+                while j < len(line_dump):
+                    ll = line_dump[j].split()
+                    if len(ll) == 0:
+                        break
+
+                    for k in range(0, len(ll), 2):
+                        linenum = int(ll[k])
+                        address = int(ll[k + 1], 16)
+                        # Keep the first address seen for a line number.
+                        if linenum not in file_lines:
+                            file_lines[linenum] = address
+
+                    j += 1
+
+        self.logger.debug("... Parsing output of cvdump.exe finished")
+
+    def get_recompiled_address(self, filename, line):
+        """Return the RecompiledInfo for the function at `filename`:`line`,
+        or None (after logging an error) when it cannot be found."""
+        recompiled_addr = None
+
+        self.logger.debug("Looking for %s:%s", filename, line)
+        filename_basename = os.path.basename(filename).lower()
+
+        for fn in self.lines:
+            # Sometimes a PDB is compiled with a relative path while we always
+            # have an absolute path. Therefore we must compare the files
+            # themselves (os.path.samefile) rather than the path strings; the
+            # basename check skips that call for files that cannot match.
+            try:
+                if os.path.basename(
+                    fn
+                ).lower() == filename_basename and os.path.samefile(fn, filename):
+                    filename = fn
+                    break
+            except FileNotFoundError:
+                continue
+
+        if filename in self.lines and line in self.lines[filename]:
+            recompiled_addr = self.lines[filename][line]
+
+            if recompiled_addr in self.funcs:
+                return self.funcs[recompiled_addr]
+            self.logger.error(
+                "Failed to find function symbol with address: %x", recompiled_addr
+            )
+            return None
+        self.logger.error(
+            "Failed to find function symbol with filename and line: %s:%s",
+            filename,
+            line,
+        )
+        return None
+
+    def get_recompiled_address_from_name(self, name):
+        """Return the RecompiledInfo registered under `name`, or None (after
+        logging an error) when there is no such symbol."""
+        self.logger.debug("Looking for %s", name)
+
+        if name in self.names:
+            return self.names[name]
+        self.logger.error("Failed to find function symbol with name: %s", name)
+        return None
--- a/tools/isledecomp/isledecomp/utils.py
+++ b/tools/isledecomp/isledecomp/utils.py
@@ -0,0 +1,42 @@
+import os
+import sys
+import colorama
+
+
+def print_diff(udiff, plain):
+    """Print a brief view of a unified diff and report whether `udiff`
+    yielded any line at all.
+
+    File-header and hunk-header lines are left out. Unless `plain` is set,
+    added lines are printed green and removed lines red.
+    """
+    # Prefixes of the diff lines that the brief view leaves out
+    skipped_prefixes = ("++", "@@", "--")
+    saw_any_line = False
+
+    for entry in udiff:
+        saw_any_line = True
+        if entry.startswith(skipped_prefixes):
+            continue
+
+        if plain:
+            print(entry)
+            continue
+
+        if entry.startswith("+"):
+            tint = colorama.Fore.GREEN
+        elif entry.startswith("-"):
+            tint = colorama.Fore.RED
+        else:
+            tint = ""
+        print(tint + entry)
+        # Reset the color after every colored line
+        print(colorama.Style.RESET_ALL, end="")
+
+    return saw_any_line
+
+
+def get_file_in_script_dir(fn):
+    """Return the path of `fn` inside the directory that holds the running
+    script, as named by sys.argv[0]."""
+    script_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
+    located = os.path.join(script_dir, fn)
+    return located
+
+
+class OffsetPlaceholderGenerator:
+    """Hand out stable "<OFFSETn>" placeholders, one per distinct address."""
+
+    def __init__(self):
+        self.counter = 0
+        self.replacements = {}
+
+    def get(self, replace_addr):
+        """Return the placeholder for `replace_addr`, creating the next
+        numbered one the first time an address is seen."""
+        try:
+            return self.replacements[replace_addr]
+        except KeyError:
+            pass
+
+        self.counter += 1
+        placeholder = f"<OFFSET{self.counter}>"
+        self.replacements[replace_addr] = placeholder
+        return placeholder