mirror of
https://github.com/isledecomp/isle.git
synced 2025-10-22 07:54:23 +00:00

* Add draft for Ghidra function import script * feature: Basic PDB analysis [skip ci] This is a draft with a lot of open questions left. Please do not merge * Refactor: Introduce submodules and reload remedy * refactor types and make them Python 3.9 compatible * run black * WIP: save progress * fix types and small type safety violations * fix another Python 3.9 syntax incompatibility * Implement struct imports [skip ci] - This code is still in dire need of refactoring and tests - There are only single-digit issues left, and 2600 functions can be imported - The biggest remaining error is mismatched stacks * Refactor, implement enums, fix lots of bugs * fix Python 3.9 issue * refactor: address review comments Not sure why VS Code suddenly decides to remove some empty spaces, but they don't make sense anyway * add unit tests for new type parsers, fix linter issue * refactor: db access from pdb_extraction.py * Fix stack layout offset error * fix: Undo incorrect reference change * Fix CI issue * Improve READMEs (fix typos, add information) --------- Co-authored-by: jonschz <jonschz@users.noreply.github.com>
183 lines
6.8 KiB
Python
183 lines
6.8 KiB
Python
import re
|
|
from typing import Iterable, Tuple
|
|
from collections import namedtuple
|
|
from .types import CvdumpTypesParser
|
|
from .symbols import CvdumpSymbolsParser
|
|
|
|
# e.g. `*** PUBLICS`
|
|
_section_change_regex = re.compile(r"\*\*\* (?P<section>[A-Z/ ]{2,})")
|
|
|
|
# e.g. ` 27 00034EC0 28 00034EE2 29 00034EE7 30 00034EF4`
|
|
_line_addr_pairs_findall = re.compile(r"\s+(?P<line_no>\d+) (?P<addr>[A-F0-9]{8})")
|
|
|
|
# We assume no spaces in the file name
|
|
# e.g. ` Z:\lego-island\isle\LEGO1\viewmanager\viewroi.cpp (None), 0001:00034E90-00034E97, line/addr pairs = 2`
|
|
_lines_subsection_header = re.compile(
|
|
r"^\s*(?P<filename>\S+).*?, (?P<section>[A-F0-9]{4}):(?P<start>[A-F0-9]{8})-(?P<end>[A-F0-9]{8}), line/addr pairs = (?P<len>\d+)"
|
|
)
|
|
|
|
# e.g. `S_PUB32: [0001:0003FF60], Flags: 00000000, __read`
|
|
_publics_line_regex = re.compile(
|
|
r"^(?P<type>\w+): \[(?P<section>\w{4}):(?P<offset>\w{8})], Flags: (?P<flags>\w{8}), (?P<name>\S+)"
|
|
)
|
|
|
|
# e.g. ` Debug start: 00000008, Debug end: 0000016E`
|
|
_gproc_debug_regex = re.compile(
|
|
r"\s*Debug start: (?P<start>\w{8}), Debug end: (?P<end>\w{8})"
|
|
)
|
|
|
|
# e.g. ` 00DA 0001:00000000 00000073 60501020`
|
|
_section_contrib_regex = re.compile(
|
|
r"\s*(?P<module>\w{4}) (?P<section>\w{4}):(?P<offset>\w{8}) (?P<size>\w{8}) (?P<flags>\w{8})"
|
|
)
|
|
|
|
# e.g. `S_GDATA32: [0003:000004A4], Type: T_32PRCHAR(0470), g_set`
|
|
_gdata32_regex = re.compile(
|
|
r"S_GDATA32: \[(?P<section>\w{4}):(?P<offset>\w{8})\], Type:\s*(?P<type>\S+), (?P<name>.+)"
|
|
)
|
|
|
|
# e.g. 0003 "CMakeFiles/isle.dir/ISLE/res/isle.rc.res"
|
|
# e.g. 0004 "C:\work\lego-island\isle\3rdparty\smartheap\SHLW32MT.LIB" "check.obj"
|
|
_module_regex = re.compile(r"(?P<id>\w{4})(?: \"(?P<lib>.+?)\")?(?: \"(?P<obj>.+?)\")")
|
|
|
|
# Source line record: covers user functions only.
LinesEntry = namedtuple(
    "LinesEntry",
    ["filename", "line_no", "section", "offset"],
)

# Record from PUBLICS: strings, vtables and functions.
# This is a superset of everything else, and the only place to find the
# C symbols (library functions, smacker, etc).
PublicsEntry = namedtuple(
    "PublicsEntry",
    ["type", "section", "offset", "flags", "name"],
)

# Record from SECTION CONTRIBUTIONS: the (estimated) size of any symbol.
SizeRefEntry = namedtuple(
    "SizeRefEntry",
    ["module", "section", "offset", "size"],
)

# Record from GLOBALS: a global variable.
GdataEntry = namedtuple(
    "GdataEntry",
    ["section", "offset", "type", "name"],
)

# Record from MODULES: numeric id plus the lib and obj file names.
ModuleEntry = namedtuple(
    "ModuleEntry",
    ["id", "lib", "obj"],
)
|
|
|
|
|
|
class CvdumpParser:
    """Line-by-line reader for cvdump text output.

    Feed text through `read_line` (or `read_lines`). A `*** NAME` banner
    selects the current section, and every later line is handed to the
    handler for that section. Results accumulate on these attributes:

    - lines: dict mapping (section, offset) to (filename, line_no)
    - publics: list of PublicsEntry
    - sizerefs: list of SizeRefEntry
    - globals: list of GdataEntry
    - modules: list of ModuleEntry
    - types / symbols_parser: sub-parsers for the TYPES and SYMBOLS sections
    """

    # pylint: disable=too-many-instance-attributes
    def __init__(self) -> None:
        # Name of the section currently being read; empty until the first
        # section banner is seen.
        self._section: str = ""
        # (filename, section number) taken from the latest LINES subheader.
        self._lines_function: Tuple[str, int] = ("", 0)

        # Containers filled in by the per-section handlers.
        self.lines = {}
        self.publics = []
        self.sizerefs = []
        self.globals = []
        self.modules = []

        # TYPES and SYMBOLS lines are delegated to dedicated parsers.
        self.types = CvdumpTypesParser()
        self.symbols_parser = CvdumpSymbolsParser()

    @property
    def symbols(self):
        """Expose the `symbols` attribute of the SYMBOLS sub-parser."""
        delegate = self.symbols_parser
        return delegate.symbols

    def _lines_section(self, line: str):
        """Handle one line of the LINES section.

        Only two things are of interest: the subsection header, which says
        which code file and section we are in, and the line_number/address
        pairs that follow it."""

        # A subheader marks a new function and possibly a new code filename.
        # The section is remembered here because the pair rows do not repeat it.
        header = _lines_subsection_header.match(line)
        if header is not None:
            self._lines_function = (header["filename"], int(header["section"], 16))
            return

        # Any other line: record every pair found on it (possibly none).
        filename, section = self._lines_function
        self.lines.update(
            ((section, int(addr, 16)), (filename, int(line_no)))
            for line_no, addr in _line_addr_pairs_findall.findall(line)
        )

    def _publics_section(self, line: str):
        """Handle one line of PUBLICS and store the symbol it describes.

        The names are MSVC mangled symbol names. String constants and vtable
        addresses can only be found in this section."""
        found = _publics_line_regex.match(line)
        if found is None:
            return

        entry = PublicsEntry(
            found["type"],
            int(found["section"], 16),
            int(found["offset"], 16),
            int(found["flags"], 16),
            found["name"],
        )
        self.publics.append(entry)

    def _globals_section(self, line: str):
        """Handle one line of GLOBALS.

        Only S_GDATA32 symbols are collected for now, as the simplest way to
        get at global variables. S_PROCREF may be useful later."""
        found = _gdata32_regex.match(line)
        if found is None:
            return

        entry = GdataEntry(
            int(found["section"], 16),
            int(found["offset"], 16),
            found["type"],
            found["name"],
        )
        self.globals.append(entry)

    def _section_contributions(self, line: str):
        """Handle one line of SECTION CONTRIBUTIONS.

        This section gives the size of elements across all sections of the
        binary. It is the easiest way to get the data size for .data and
        .rdata members that do not have a primitive data type."""
        found = _section_contrib_regex.match(line)
        if found is None:
            return

        # All four stored columns are hex numbers, in record field order.
        columns = ("module", "section", "offset", "size")
        self.sizerefs.append(SizeRefEntry(*(int(found[col], 16) for col in columns)))

    def _modules_section(self, line: str):
        """Handle one line of MODULES.

        Records the object file (and lib file, if used) linked into the
        binary. The auto-incrementing id is cross-referenced in SECTION
        CONTRIBUTIONS (and perhaps other locations)."""
        found = _module_regex.match(line)
        if found is None:
            return

        entry = ModuleEntry(int(found["id"], 16), found["lib"], found["obj"])
        self.modules.append(entry)

    def read_line(self, line: str):
        """Consume a single line of cvdump output.

        A section banner only updates the current section. Any other line
        goes to the handler of the current section; lines inside a section
        without a handler are ignored."""
        banner = _section_change_regex.match(line)
        if banner is not None:
            self._section = banner.group(1)
            return

        # Pick the handler for the active section, then call it once below.
        current = self._section
        if current == "TYPES":
            handler = self.types.read_line
        elif current == "SYMBOLS":
            handler = self.symbols_parser.read_line
        elif current == "LINES":
            handler = self._lines_section
        elif current == "PUBLICS":
            handler = self._publics_section
        elif current == "SECTION CONTRIBUTIONS":
            handler = self._section_contributions
        elif current == "GLOBALS":
            handler = self._globals_section
        elif current == "MODULES":
            handler = self._modules_section
        else:
            return

        handler(line)

    def read_lines(self, lines: Iterable[str]):
        """Consume every line of `lines`, in order, via `read_line`."""
        feed = self.read_line
        for text in lines:
            feed(text)
|