Files
isle/tools/isledecomp/isledecomp/cvdump/parser.py
jonschz f26c30974a Add Ghidra function import script (#909)
* Add draft for Ghidra function import script

* feature: Basic PDB analysis [skip ci]

This is a draft with a lot of open questions left. Please do not merge

* Refactor: Introduce submodules and reload remedy

* refactor types and make them Python 3.9 compatible

* run black

* WIP: save progress

* fix types and small type safety violations

* fix another Python 3.9 syntax incompatibility

* Implement struct imports [skip ci]

- This code is still in dire need of refactoring and tests
- There are only single-digit issues left, and 2600 functions can be imported
- The biggest remaining error is mismatched stacks

* Refactor, implement enums, fix lots of bugs

* fix Python 3.9 issue

* refactor: address review comments

Not sure why VS Code suddenly decides to remove some empty spaces, but they don't make sense anyway

* add unit tests for new type parsers, fix linter issue

* refactor: db access from pdb_extraction.py

* Fix stack layout offset error

* fix: Undo incorrect reference change

* Fix CI issue

* Improve READMEs (fix typos, add information)

---------

Co-authored-by: jonschz <jonschz@users.noreply.github.com>
2024-06-09 08:41:24 -04:00

183 lines
6.8 KiB
Python

import re
from typing import Iterable, Tuple
from collections import namedtuple
from .types import CvdumpTypesParser
from .symbols import CvdumpSymbolsParser
# Banner that opens each part of the dump, e.g. `*** PUBLICS`
_section_change_regex = re.compile(r"\*\*\* (?P<section>[A-Z/ ]{2,})")

# A single (line number, address) pair. A row usually carries several:
# e.g. ` 27 00034EC0 28 00034EE2 29 00034EE7 30 00034EF4`
_line_addr_pairs_findall = re.compile(r"\s+(?P<line_no>\d+) (?P<addr>[A-F0-9]{8})")

# Header of a LINES subsection. The file name is taken to contain no spaces.
# e.g. ` Z:\lego-island\isle\LEGO1\viewmanager\viewroi.cpp (None), 0001:00034E90-00034E97, line/addr pairs = 2`
_lines_subsection_header = re.compile(
    r"^\s*(?P<filename>\S+).*?, (?P<section>[A-F0-9]{4}):(?P<start>[A-F0-9]{8})-(?P<end>[A-F0-9]{8}), line/addr pairs = (?P<len>\d+)"
)

# One public symbol record.
# e.g. `S_PUB32: [0001:0003FF60], Flags: 00000000, __read`
_publics_line_regex = re.compile(
    r"^(?P<type>\w+): \[(?P<section>\w{4}):(?P<offset>\w{8})], Flags: (?P<flags>\w{8}), (?P<name>\S+)"
)

# Debug start/end offsets reported for a procedure.
# e.g. ` Debug start: 00000008, Debug end: 0000016E`
_gproc_debug_regex = re.compile(
    r"\s*Debug start: (?P<start>\w{8}), Debug end: (?P<end>\w{8})"
)

# One row of SECTION CONTRIBUTIONS: module id, section:offset, size, flags.
# e.g. ` 00DA 0001:00000000 00000073 60501020`
_section_contrib_regex = re.compile(
    r"\s*(?P<module>\w{4}) (?P<section>\w{4}):(?P<offset>\w{8}) (?P<size>\w{8}) (?P<flags>\w{8})"
)

# One global data symbol.
# e.g. `S_GDATA32: [0003:000004A4], Type: T_32PRCHAR(0470), g_set`
_gdata32_regex = re.compile(
    r"S_GDATA32: \[(?P<section>\w{4}):(?P<offset>\w{8})\], Type:\s*(?P<type>\S+), (?P<name>.+)"
)

# One MODULES row: a hex id, an optional quoted lib path, a quoted obj path.
# e.g. `0003 "CMakeFiles/isle.dir/ISLE/res/isle.rc.res"`
# e.g. `0004 "C:\work\lego-island\isle\3rdparty\smartheap\SHLW32MT.LIB" "check.obj"`
_module_regex = re.compile(r"(?P<id>\w{4})(?: \"(?P<lib>.+?)\")?(?: \"(?P<obj>.+?)\")")


# Line-number information; covers user functions only.
LinesEntry = namedtuple("LinesEntry", ["filename", "line_no", "section", "offset"])

# Public symbols: strings, vtables and functions. This is a superset of the
# other listings and the only place where the C symbols (library functions,
# smacker, etc.) can be found.
PublicsEntry = namedtuple(
    "PublicsEntry", ["type", "section", "offset", "flags", "name"]
)

# (Estimated) size of any symbol.
SizeRefEntry = namedtuple("SizeRefEntry", ["module", "section", "offset", "size"])

# Global variables.
GdataEntry = namedtuple("GdataEntry", ["section", "offset", "type", "name"])

# An object file linked into the binary, plus the lib it came from (if any).
ModuleEntry = namedtuple("ModuleEntry", ["id", "lib", "obj"])
class CvdumpParser:
    """Line-oriented parser for a cvdump text listing.

    Feed the listing through `read_line` (one line at a time) or `read_lines`
    (any iterable of lines). A `*** NAME` banner selects the current section;
    every following line is routed to the handler for that section and the
    parsed records accumulate on the public attributes of the instance.
    Lines that belong to a section without a handler are ignored.
    """

    # pylint: disable=too-many-instance-attributes
    def __init__(self) -> None:
        # Name of the section being read; empty until the first banner is seen.
        self._section: str = ""
        # (filename, section number) from the latest LINES subsection header.
        self._lines_function: Tuple[str, int] = ("", 0)

        # Maps (section, offset) -> (filename, line number).
        self.lines = {}
        # List of PublicsEntry.
        self.publics = []
        # List of SizeRefEntry.
        self.sizerefs = []
        # List of GdataEntry.
        self.globals = []
        # List of ModuleEntry.
        self.modules = []

        # TYPES and SYMBOLS lines are delegated to dedicated sub-parsers.
        self.types = CvdumpTypesParser()
        self.symbols_parser = CvdumpSymbolsParser()

    @property
    def symbols(self):
        """The `symbols` collection held by the SYMBOLS sub-parser."""
        return self.symbols_parser.symbols

    def _lines_section(self, line: str) -> None:
        """Parsing entries from the LINES section. We only care about the pairs of
        line_number and address and the subsection header to indicate which code file
        we are in."""

        # Subheader indicates a new function and possibly a new code filename.
        # Save the section here because it is not given on the lines that follow.
        if (match := _lines_subsection_header.match(line)) is not None:
            self._lines_function = (
                match.group("filename"),
                int(match.group("section"), 16),
            )
            return

        # Match any pairs as we find them. Addresses are hexadecimal, line
        # numbers are decimal.
        filename, section = self._lines_function
        for line_no, offset in _line_addr_pairs_findall.findall(line):
            self.lines[(section, int(offset, 16))] = (filename, int(line_no))

    def _publics_section(self, line: str) -> None:
        """Match each line from PUBLICS and pull out the symbol information.
        These are MSVC mangled symbol names. String constants and vtable
        addresses can only be found here."""
        if (match := _publics_line_regex.match(line)) is not None:
            self.publics.append(
                PublicsEntry(
                    type=match.group("type"),
                    section=int(match.group("section"), 16),
                    offset=int(match.group("offset"), 16),
                    flags=int(match.group("flags"), 16),
                    name=match.group("name"),
                )
            )

    def _globals_section(self, line: str) -> None:
        """S_PROCREF may be useful later.
        Right now we just want S_GDATA32 symbols because it is the simplest
        way to access global variables."""
        if (match := _gdata32_regex.match(line)) is not None:
            self.globals.append(
                GdataEntry(
                    section=int(match.group("section"), 16),
                    offset=int(match.group("offset"), 16),
                    type=match.group("type"),
                    name=match.group("name"),
                )
            )

    def _section_contributions(self, line: str) -> None:
        """Gives the size of elements across all sections of the binary.
        This is the easiest way to get the data size for .data and .rdata
        members that do not have a primitive data type.

        The flags column is matched by the pattern but not stored."""
        if (match := _section_contrib_regex.match(line)) is not None:
            self.sizerefs.append(
                SizeRefEntry(
                    module=int(match.group("module"), 16),
                    section=int(match.group("section"), 16),
                    offset=int(match.group("offset"), 16),
                    size=int(match.group("size"), 16),
                )
            )

    def _modules_section(self, line: str) -> None:
        """Record the object file (and lib file, if used) linked into the binary.
        The auto-incrementing id is cross-referenced in SECTION CONTRIBUTIONS
        (and perhaps other locations)"""
        if (match := _module_regex.match(line)) is not None:
            self.modules.append(
                ModuleEntry(
                    id=int(match.group("id"), 16),
                    # None when the row names an object file only.
                    lib=match.group("lib"),
                    obj=match.group("obj"),
                )
            )

    def read_line(self, line: str) -> None:
        """Consume one line of the listing.

        A section banner only switches the active section. Any other line is
        handed to the parser for the active section, if there is one.
        """
        if (match := _section_change_regex.match(line)) is not None:
            # The banner pattern permits spaces inside a name (as in
            # "SECTION CONTRIBUTIONS"), so the capture can carry stray
            # surrounding spaces. Strip them, otherwise the section would not
            # be recognized below and all of its lines would be dropped.
            self._section = match.group("section").strip()
            return

        if self._section == "TYPES":
            self.types.read_line(line)

        elif self._section == "SYMBOLS":
            self.symbols_parser.read_line(line)

        elif self._section == "LINES":
            self._lines_section(line)

        elif self._section == "PUBLICS":
            self._publics_section(line)

        elif self._section == "SECTION CONTRIBUTIONS":
            self._section_contributions(line)

        elif self._section == "GLOBALS":
            self._globals_section(line)

        elif self._section == "MODULES":
            self._modules_section(line)

    def read_lines(self, lines: Iterable[str]) -> None:
        """Consume every line of `lines`, in order, via `read_line`."""
        for line in lines:
            self.read_line(line)