Add Ghidra function import script (#909)

* Add draft for Ghidra function import script

* feature: Basic PDB analysis [skip ci]

This is a draft with a lot of open questions left. Please do not merge

* Refactor: Introduce submodules and reload remedy

* refactor types and make them Python 3.9 compatible

* run black

* WIP: save progress

* fix types and small type safety violations

* fix another Python 3.9 syntax incompatibility

* Implement struct imports [skip ci]

- This code is still in dire need of refactoring and tests
- Only a single-digit number of issues remain, and 2600 functions can be imported
- The biggest remaining error is mismatched stacks

* Refactor, implement enums, fix lots of bugs

* fix Python 3.9 issue

* refactor: address review comments

Not sure why VS Code suddenly decides to remove some empty spaces, but they don't make sense anyway

* add unit tests for new type parsers, fix linter issue

* refactor: db access from pdb_extraction.py

* Fix stack layout offset error

* fix: Undo incorrect reference change

* Fix CI issue

* Improve READMEs (fix typos, add information)

---------

Co-authored-by: jonschz <jonschz@users.noreply.github.com>
jonschz
2024-06-09 14:41:24 +02:00
committed by GitHub
parent 88805f9fcb
commit f26c30974a
21 changed files with 1824 additions and 114 deletions

View File

@@ -4,7 +4,7 @@ import difflib
import struct
import uuid
from dataclasses import dataclass
from typing import Callable, Iterable, List, Optional
from typing import Any, Callable, Iterable, List, Optional
from isledecomp.bin import Bin as IsleBin, InvalidVirtualAddressError
from isledecomp.cvdump.demangler import demangle_string_const
from isledecomp.cvdump import Cvdump, CvdumpAnalysis
@@ -90,7 +90,7 @@ class Compare:
def _load_cvdump(self):
logger.info("Parsing %s ...", self.pdb_file)
cv = (
self.cv = (
Cvdump(self.pdb_file)
.lines()
.globals()
@@ -100,9 +100,9 @@ class Compare:
.types()
.run()
)
res = CvdumpAnalysis(cv)
self.cvdump_analysis = CvdumpAnalysis(self.cv)
for sym in res.nodes:
for sym in self.cvdump_analysis.nodes:
# Skip nodes where we have almost no information.
# These probably came from SECTION CONTRIBUTIONS.
if sym.name() is None and sym.node_type is None:
@@ -116,6 +116,7 @@ class Compare:
continue
addr = self.recomp_bin.get_abs_addr(sym.section, sym.offset)
sym.addr = addr
# If this symbol is the final one in its section, we were not able to
# estimate its size because we didn't have the total size of that section.
@@ -165,7 +166,10 @@ class Compare:
addr, sym.node_type, sym.name(), sym.decorated_name, sym.size()
)
for (section, offset), (filename, line_no) in res.verified_lines.items():
for (section, offset), (
filename,
line_no,
) in self.cvdump_analysis.verified_lines.items():
addr = self.recomp_bin.get_abs_addr(section, offset)
self._lines_db.add_line(filename, line_no, addr)
@@ -736,6 +740,9 @@ class Compare:
def get_variables(self) -> List[MatchInfo]:
return self._db.get_matches_by_type(SymbolType.DATA)
def get_match_options(self, addr: int) -> Optional[dict[str, Any]]:
return self._db.get_match_options(addr)
def compare_address(self, addr: int) -> Optional[DiffReport]:
match = self._db.get_one_match(addr)
if match is None:

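These hunks keep the Cvdump run and its analysis on the Compare instance (self.cv, self.cvdump_analysis) instead of throwing them away as locals, so code outside _load_cvdump can reach the per-symbol data later. A minimal sketch of that access pattern, assuming a fully initialized Compare instance named compare (the printed fields are purely for illustration):

# Hypothetical downstream use: walk the retained analysis nodes.
for node in compare.cvdump_analysis.nodes:
    if node.symbol_entry is None or node.addr is None:
        continue  # only S_GPROC32 functions carry a SymbolsEntry
    # addr was resolved from section:offset during _load_cvdump()
    print(hex(node.addr), node.name(), len(node.symbol_entry.stack_symbols))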
View File

@@ -2,7 +2,7 @@
addresses/symbols that we want to compare between the original and recompiled binaries."""
import sqlite3
import logging
from typing import List, Optional
from typing import Any, List, Optional
from isledecomp.types import SymbolType
from isledecomp.cvdump.demangler import get_vtordisp_name
@@ -335,7 +335,7 @@ class CompareDb:
def skip_compare(self, orig: int):
self._set_opt_bool(orig, "skip")
def get_match_options(self, addr: int) -> Optional[dict]:
def get_match_options(self, addr: int) -> Optional[dict[str, Any]]:
cur = self._db.execute(
"""SELECT name, value FROM `match_options` WHERE addr = ?""", (addr,)
)

View File

@@ -1,3 +1,4 @@
from .symbols import SymbolsEntry
from .analysis import CvdumpAnalysis
from .parser import CvdumpParser
from .runner import Cvdump

View File

@@ -1,5 +1,7 @@
"""For collating the results from parsing cvdump.exe into a more directly useful format."""
from typing import Dict, List, Tuple, Optional
from isledecomp.cvdump import SymbolsEntry
from isledecomp.types import SymbolType
from .parser import CvdumpParser
from .demangler import demangle_string_const, demangle_vtable
@@ -31,6 +33,8 @@ class CvdumpNode:
# Size as reported by SECTION CONTRIBUTIONS section. Not guaranteed to be
# accurate.
section_contribution: Optional[int] = None
addr: Optional[int] = None
symbol_entry: Optional[SymbolsEntry] = None
def __init__(self, section: int, offset: int) -> None:
self.section = section
@@ -87,13 +91,12 @@ class CvdumpAnalysis:
"""Collects the results from CvdumpParser into a list of nodes (i.e. symbols).
These can then be analyzed by a downstream tool."""
nodes = List[CvdumpNode]
verified_lines = Dict[Tuple[str, str], Tuple[str, str]]
verified_lines: Dict[Tuple[str, str], Tuple[str, str]]
def __init__(self, parser: CvdumpParser):
"""Read in as much information as we have from the parser.
The more sections we have, the better our information will be."""
node_dict = {}
node_dict: Dict[Tuple[int, int], CvdumpNode] = {}
# PUBLICS is our roadmap for everything that follows.
for pub in parser.publics:
@@ -158,8 +161,11 @@ class CvdumpAnalysis:
node_dict[key].friendly_name = sym.name
node_dict[key].confirmed_size = sym.size
node_dict[key].node_type = SymbolType.FUNCTION
node_dict[key].symbol_entry = sym
self.nodes = [v for _, v in dict(sorted(node_dict.items())).items()]
self.nodes: List[CvdumpNode] = [
v for _, v in dict(sorted(node_dict.items())).items()
]
self._estimate_size()
def _estimate_size(self):

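The node list is now an explicitly typed list built from node_dict sorted by (section, offset). Keeping that order is presumably what lets _estimate_size derive a symbol's size from the gap to its successor, which would also explain why the last symbol of a section cannot be estimated (see the comment in compare/core.py above). A rough standalone sketch of that idea, not the project's actual implementation:

# Sketch: estimate sizes from the gaps between consecutive symbols of a section.
offsets = [(1, 0x100), (1, 0x130), (1, 0x200)]  # (section, offset), pre-sorted
for (sec, off), (next_sec, next_off) in zip(offsets, offsets[1:]):
    if sec == next_sec:
        print(f"{sec:04x}:{off:08x} -> estimated size {next_off - off:#x}")
# the last symbol in a section has no successor, so its size stays unknown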
View File

@@ -2,6 +2,7 @@ import re
from typing import Iterable, Tuple
from collections import namedtuple
from .types import CvdumpTypesParser
from .symbols import CvdumpSymbolsParser
# e.g. `*** PUBLICS`
_section_change_regex = re.compile(r"\*\*\* (?P<section>[A-Z/ ]{2,})")
@@ -20,11 +21,6 @@ _publics_line_regex = re.compile(
r"^(?P<type>\w+): \[(?P<section>\w{4}):(?P<offset>\w{8})], Flags: (?P<flags>\w{8}), (?P<name>\S+)"
)
# e.g. `(00008C) S_GPROC32: [0001:00034E90], Cb: 00000007, Type: 0x1024, ViewROI::IntrinsicImportance`
_symbol_line_regex = re.compile(
r"\(\w+\) (?P<type>\S+): \[(?P<section>\w{4}):(?P<offset>\w{8})\], Cb: (?P<size>\w+), Type:\s+\S+, (?P<name>.+)"
)
# e.g. ` Debug start: 00000008, Debug end: 0000016E`
_gproc_debug_regex = re.compile(
r"\s*Debug start: (?P<start>\w{8}), Debug end: (?P<end>\w{8})"
@@ -52,9 +48,6 @@ LinesEntry = namedtuple("LinesEntry", "filename line_no section offset")
# only place you can find the C symbols (library functions, smacker, etc)
PublicsEntry = namedtuple("PublicsEntry", "type section offset flags name")
# S_GPROC32 = functions
SymbolsEntry = namedtuple("SymbolsEntry", "type section offset size name")
# (Estimated) size of any symbol
SizeRefEntry = namedtuple("SizeRefEntry", "module section offset size")
@@ -72,12 +65,16 @@ class CvdumpParser:
self.lines = {}
self.publics = []
self.symbols = []
self.sizerefs = []
self.globals = []
self.modules = []
self.types = CvdumpTypesParser()
self.symbols_parser = CvdumpSymbolsParser()
@property
def symbols(self):
return self.symbols_parser.symbols
def _lines_section(self, line: str):
"""Parsing entries from the LINES section. We only care about the pairs of
@@ -127,20 +124,6 @@ class CvdumpParser:
)
)
def _symbols_section(self, line: str):
"""We are interested in S_GPROC32 symbols only."""
if (match := _symbol_line_regex.match(line)) is not None:
if match.group("type") == "S_GPROC32":
self.symbols.append(
SymbolsEntry(
type=match.group("type"),
section=int(match.group("section"), 16),
offset=int(match.group("offset"), 16),
size=int(match.group("size"), 16),
name=match.group("name"),
)
)
def _section_contributions(self, line: str):
"""Gives the size of elements across all sections of the binary.
This is the easiest way to get the data size for .data and .rdata
@@ -177,7 +160,7 @@ class CvdumpParser:
self.types.read_line(line)
elif self._section == "SYMBOLS":
self._symbols_section(line)
self.symbols_parser.read_line(line)
elif self._section == "LINES":
self._lines_section(line)

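With the inline _symbols_section handler removed, SYMBOLS lines are forwarded to the new CvdumpSymbolsParser, and existing callers keep working because symbols is now a property exposing the sub-parser's list. A tiny sketch of that delegation:

parser = CvdumpParser()
# `symbols` is a read-only view onto the sub-parser's list, so code that
# iterated parser.symbols before this change keeps working unmodified.
assert parser.symbols is parser.symbols_parser.symbols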
View File

@@ -0,0 +1,153 @@
from dataclasses import dataclass, field
import logging
import re
from re import Match
from typing import NamedTuple, Optional
logger = logging.getLogger(__name__)
class StackOrRegisterSymbol(NamedTuple):
symbol_type: str
location: str
"""Should always be set/converted to lowercase."""
data_type: str
name: str
# S_GPROC32 = functions
@dataclass
class SymbolsEntry:
# pylint: disable=too-many-instance-attributes
type: str
section: int
offset: int
size: int
func_type: str
name: str
stack_symbols: list[StackOrRegisterSymbol] = field(default_factory=list)
frame_pointer_present: bool = False
addr: Optional[int] = None # Absolute address. Will be set later, if at all
class CvdumpSymbolsParser:
_symbol_line_generic_regex = re.compile(
r"\(\w+\)\s+(?P<symbol_type>[^\s:]+)(?::\s+(?P<second_part>\S.*))?|(?::)$"
)
"""
Parses the first part, e.g. `(00008C) S_GPROC32`, and splits off the second part after the colon (if it exists).
There are three cases:
- no colon, e.g. `(000350) S_END`
- colon but no data, e.g. `(000370) S_COMPILE:`
- colon and data, e.g. `(000304) S_REGISTER: esi, Type: 0x1E14, this`
"""
_symbol_line_function_regex = re.compile(
r"\[(?P<section>\w{4}):(?P<offset>\w{8})\], Cb: (?P<size>\w+), Type:\s+(?P<func_type>[^\s,]+), (?P<name>.+)"
)
"""
Parses the second part of a function symbol, e.g.
`[0001:00034E90], Cb: 00000007, Type: 0x1024, ViewROI::IntrinsicImportance`
"""
# the second part of e.g.
_stack_register_symbol_regex = re.compile(
r"(?P<location>\S+), Type:\s+(?P<data_type>[\w()]+), (?P<name>.+)$"
)
"""
Parses the second part of a stack or register symbol, e.g.
`esi, Type: 0x1E14, this`
"""
_debug_start_end_regex = re.compile(
r"^\s*Debug start: (?P<debug_start>\w+), Debug end: (?P<debug_end>\w+)$"
)
_parent_end_next_regex = re.compile(
r"\s*Parent: (?P<parent_addr>\w+), End: (?P<end_addr>\w+), Next: (?P<next_addr>\w+)$"
)
_flags_frame_pointer_regex = re.compile(r"\s*Flags: Frame Ptr Present$")
_register_stack_symbols = ["S_BPREL32", "S_REGISTER"]
# List the unhandled types so we can check exhaustiveness
_unhandled_symbols = [
"S_COMPILE",
"S_OBJNAME",
"S_THUNK32",
"S_LABEL32",
"S_LDATA32",
"S_LPROC32",
"S_UDT",
]
"""Parser for cvdump output, SYMBOLS section."""
def __init__(self):
self.symbols: list[SymbolsEntry] = []
self.current_function: Optional[SymbolsEntry] = None
def read_line(self, line: str):
if (match := self._symbol_line_generic_regex.match(line)) is not None:
self._parse_generic_case(line, match)
elif (match := self._parent_end_next_regex.match(line)) is not None:
# We do not need this info at the moment, might be useful in the future
pass
elif (match := self._debug_start_end_regex.match(line)) is not None:
# We do not need this info at the moment, might be useful in the future
pass
elif (match := self._flags_frame_pointer_regex.match(line)) is not None:
if self.current_function is None:
logger.error(
"Found a `Flags: Frame Ptr Present` but self.current_function is None"
)
return
self.current_function.frame_pointer_present = True
else:
# Most of these are either `** Module: [...]` or data we do not care about
logger.debug("Unhandled line: %s", line[:-1])
def _parse_generic_case(self, line, line_match: Match[str]):
symbol_type: str = line_match.group("symbol_type")
second_part: Optional[str] = line_match.group("second_part")
if symbol_type == "S_GPROC32":
assert second_part is not None
if (match := self._symbol_line_function_regex.match(second_part)) is None:
logger.error("Invalid function symbol: %s", line[:-1])
return
self.current_function = SymbolsEntry(
type=symbol_type,
section=int(match.group("section"), 16),
offset=int(match.group("offset"), 16),
size=int(match.group("size"), 16),
func_type=match.group("func_type"),
name=match.group("name"),
)
self.symbols.append(self.current_function)
elif symbol_type in self._register_stack_symbols:
assert second_part is not None
if self.current_function is None:
logger.error("Found stack/register outside of function: %s", line[:-1])
return
if (match := self._stack_register_symbol_regex.match(second_part)) is None:
logger.error("Invalid stack/register symbol: %s", line[:-1])
return
new_symbol = StackOrRegisterSymbol(
symbol_type=symbol_type,
location=match.group("location").lower(),
data_type=match.group("data_type"),
name=match.group("name"),
)
self.current_function.stack_symbols.append(new_symbol)
elif symbol_type == "S_END":
self.current_function = None
elif symbol_type in self._unhandled_symbols:
return
else:
logger.error("Unhandled symbol type: %s", line)

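A rough usage sketch of the new parser, feeding it the line formats quoted in the docstrings above (real cvdump output may differ in spacing):

parser = CvdumpSymbolsParser()
parser.read_line(
    "(00008C) S_GPROC32: [0001:00034E90], Cb: 00000007, Type: 0x1024, ViewROI::IntrinsicImportance"
)
parser.read_line("(000304) S_REGISTER: esi, Type: 0x1E14, this")
parser.read_line("(000350) S_END")

func = parser.symbols[0]
# -> ViewROI::IntrinsicImportance 0x34e90 [StackOrRegisterSymbol(symbol_type='S_REGISTER', location='esi', ...)]
print(func.name, hex(func.offset), func.stack_symbols)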
View File

@@ -1,5 +1,9 @@
import re
from typing import Dict, List, NamedTuple, Optional
import logging
from typing import Any, Dict, List, NamedTuple, Optional
logger = logging.getLogger(__name__)
class CvdumpTypeError(Exception):
@@ -42,7 +46,7 @@ class ScalarType(NamedTuple):
class TypeInfo(NamedTuple):
key: str
size: int
size: Optional[int]
name: Optional[str] = None
members: Optional[List[FieldListItem]] = None
@@ -156,6 +160,10 @@ class CvdumpTypesParser:
# LF_FIELDLIST member name (2/2)
MEMBER_RE = re.compile(r"^\s+member name = '(?P<name>.*)'$")
LF_FIELDLIST_ENUMERATE = re.compile(
r"^\s+list\[\d+\] = LF_ENUMERATE,.*value = (?P<value>\d+), name = '(?P<name>[^']+)'$"
)
# LF_ARRAY element type
ARRAY_ELEMENT_RE = re.compile(r"^\s+Element type = (?P<type>.*)")
@@ -169,12 +177,53 @@ class CvdumpTypesParser:
# LF_CLASS/LF_STRUCTURE name and other info
CLASS_NAME_RE = re.compile(
r"^\s+Size = (?P<size>\d+), class name = (?P<name>.+), UDT\((?P<udt>0x\w+)\)"
r"^\s+Size = (?P<size>\d+), class name = (?P<name>(?:[^,]|,\S)+)(?:, UDT\((?P<udt>0x\w+)\))?"
)
# LF_MODIFIER, type being modified
MODIFIES_RE = re.compile(r".*modifies type (?P<type>.*)$")
# LF_ARGLIST number of entries
LF_ARGLIST_ARGCOUNT = re.compile(r".*argument count = (?P<argcount>\d+)$")
# LF_ARGLIST list entry
LF_ARGLIST_ENTRY = re.compile(
r"^\s+list\[(?P<index>\d+)\] = (?P<arg_type>[\w()]+)$"
)
# LF_POINTER element
LF_POINTER_ELEMENT = re.compile(r"^\s+Element type : (?P<element_type>.+)$")
# LF_MFUNCTION attribute key-value pairs
LF_MFUNCTION_ATTRIBUTES = [
re.compile(r"\s*Return type = (?P<return_type>[\w()]+)$"),
re.compile(r"\s*Class type = (?P<class_type>[\w()]+)$"),
re.compile(r"\s*This type = (?P<this_type>[\w()]+)$"),
# Call type may contain whitespace
re.compile(r"\s*Call type = (?P<call_type>[\w()\s]+)$"),
re.compile(r"\s*Parms = (?P<num_params>[\w()]+)$"), # LF_MFUNCTION only
re.compile(r"\s*# Parms = (?P<num_params>[\w()]+)$"), # LF_PROCEDURE only
re.compile(r"\s*Arg list type = (?P<arg_list_type>[\w()]+)$"),
re.compile(
r"\s*This adjust = (?P<this_adjust>[\w()]+)$"
), # TODO: figure out the meaning
re.compile(
r"\s*Func attr = (?P<func_attr>[\w()]+)$"
), # Only for completeness, is always `none`
]
LF_ENUM_ATTRIBUTES = [
re.compile(r"^\s*# members = (?P<num_members>\d+)$"),
re.compile(r"^\s*enum name = (?P<name>.+)$"),
]
LF_ENUM_TYPES = re.compile(
r"^\s*type = (?P<underlying_type>\S+) field list type (?P<field_type>0x\w{4})$"
)
LF_ENUM_UDT = re.compile(r"^\s*UDT\((?P<udt>0x\w+)\)$")
LF_UNION_LINE = re.compile(
r"^.*field list type (?P<field_type>0x\w+),.*Size = (?P<size>\d+)\s*,class name = (?P<name>(?:[^,]|,\S)+),\s.*UDT\((?P<udt>0x\w+)\)$"
)
MODES_OF_INTEREST = {
"LF_ARRAY",
"LF_CLASS",
@@ -183,12 +232,16 @@ class CvdumpTypesParser:
"LF_MODIFIER",
"LF_POINTER",
"LF_STRUCTURE",
"LF_ARGLIST",
"LF_MFUNCTION",
"LF_PROCEDURE",
"LF_UNION",
}
def __init__(self) -> None:
self.mode: Optional[str] = None
self.last_key = ""
self.keys = {}
self.keys: Dict[str, Dict[str, Any]] = {}
def _new_type(self):
"""Prepare a new dict for the type we just parsed.
@@ -211,13 +264,20 @@ class CvdumpTypesParser:
obj = self.keys[self.last_key]
obj["members"][-1]["name"] = name
def _get_field_list(self, type_obj: Dict) -> List[FieldListItem]:
def _add_variant(self, name: str, value: int):
obj = self.keys[self.last_key]
if "variants" not in obj:
obj["variants"] = []
variants: list[dict[str, Any]] = obj["variants"]
variants.append({"name": name, "value": value})
def _get_field_list(self, type_obj: Dict[str, Any]) -> List[FieldListItem]:
"""Return the field list for the given LF_CLASS/LF_STRUCTURE reference"""
if type_obj.get("type") == "LF_FIELDLIST":
field_obj = type_obj
else:
field_list_type = type_obj.get("field_list_type")
field_list_type = type_obj["field_list_type"]
field_obj = self.keys[field_list_type]
members: List[FieldListItem] = []
@@ -253,6 +313,9 @@ class CvdumpTypesParser:
raise CvdumpIntegrityError("No array element type")
array_element_size = self.get(array_type).size
assert (
array_element_size is not None
), "Encountered an array whose type has no size"
n_elements = type_obj["size"] // array_element_size
@@ -285,7 +348,10 @@ class CvdumpTypesParser:
# These type references are just a wrapper around a scalar
if obj.get("type") == "LF_ENUM":
return self.get("T_INT4")
underlying_type = obj.get("underlying_type")
if underlying_type is None:
raise CvdumpKeyError(f"Missing 'underlying_type' in {obj}")
return self.get(underlying_type)
if obj.get("type") == "LF_POINTER":
return self.get("T_32PVOID")
@@ -350,6 +416,9 @@ class CvdumpTypesParser:
obj = self.get(type_key)
total_size = obj.size
assert (
total_size is not None
), "Called get_scalar_gapless() on a type without size"
scalars = self.get_scalars(type_key)
@@ -383,6 +452,11 @@ class CvdumpTypesParser:
return member_list_to_struct_string(members)
def read_line(self, line: str):
if line.endswith("\n"):
line = line[:-1]
if len(line) == 0:
return
if (match := self.INDEX_RE.match(line)) is not None:
type_ = match.group(2)
if type_ not in self.MODES_OF_INTEREST:
@@ -393,6 +467,12 @@ class CvdumpTypesParser:
self.last_key = match.group(1)
self.mode = type_
self._new_type()
if type_ == "LF_ARGLIST":
submatch = self.LF_ARGLIST_ARGCOUNT.match(line)
assert submatch is not None
self.keys[self.last_key]["argcount"] = int(submatch.group("argcount"))
# TODO: This should be validated in another pass
return
if self.mode is None:
@@ -413,41 +493,170 @@ class CvdumpTypesParser:
self._set("size", int(match.group("length")))
elif self.mode == "LF_FIELDLIST":
# If this class has a vtable, create a mock member at offset 0
if (match := self.VTABLE_RE.match(line)) is not None:
# For our purposes, any pointer type will do
self._add_member(0, "T_32PVOID")
self._set_member_name("vftable")
self.read_fieldlist_line(line)
# Superclass is set here in the fieldlist rather than in LF_CLASS
elif (match := self.SUPERCLASS_RE.match(line)) is not None:
self._set("super", normalize_type_id(match.group("type")))
elif self.mode == "LF_ARGLIST":
self.read_arglist_line(line)
# Member offset and type given on the first of two lines.
elif (match := self.LIST_RE.match(line)) is not None:
self._add_member(
int(match.group("offset")), normalize_type_id(match.group("type"))
)
elif self.mode in ["LF_MFUNCTION", "LF_PROCEDURE"]:
self.read_mfunction_line(line)
# Name of the member read on the second of two lines.
elif (match := self.MEMBER_RE.match(line)) is not None:
self._set_member_name(match.group("name"))
elif self.mode in ["LF_CLASS", "LF_STRUCTURE"]:
self.read_class_or_struct_line(line)
else: # LF_CLASS or LF_STRUCTURE
# Match the reference to the associated LF_FIELDLIST
if (match := self.CLASS_FIELD_RE.match(line)) is not None:
if match.group("field_type") == "0x0000":
# Not redundant. UDT might not match the key.
# These cases get reported as UDT mismatch.
self._set("is_forward_ref", True)
else:
field_list_type = normalize_type_id(match.group("field_type"))
self._set("field_list_type", field_list_type)
elif self.mode == "LF_POINTER":
self.read_pointer_line(line)
elif self.mode == "LF_ENUM":
self.read_enum_line(line)
elif self.mode == "LF_UNION":
self.read_union_line(line)
else:
# Check for exhaustiveness
logger.error("Unhandled data in mode: %s", self.mode)
def read_fieldlist_line(self, line: str):
# If this class has a vtable, create a mock member at offset 0
if (match := self.VTABLE_RE.match(line)) is not None:
# For our purposes, any pointer type will do
self._add_member(0, "T_32PVOID")
self._set_member_name("vftable")
# Superclass is set here in the fieldlist rather than in LF_CLASS
elif (match := self.SUPERCLASS_RE.match(line)) is not None:
self._set("super", normalize_type_id(match.group("type")))
# Member offset and type given on the first of two lines.
elif (match := self.LIST_RE.match(line)) is not None:
self._add_member(
int(match.group("offset")), normalize_type_id(match.group("type"))
)
# Name of the member read on the second of two lines.
elif (match := self.MEMBER_RE.match(line)) is not None:
self._set_member_name(match.group("name"))
elif (match := self.LF_FIELDLIST_ENUMERATE.match(line)) is not None:
self._add_variant(match.group("name"), int(match.group("value")))
def read_class_or_struct_line(self, line: str):
# Match the reference to the associated LF_FIELDLIST
if (match := self.CLASS_FIELD_RE.match(line)) is not None:
if match.group("field_type") == "0x0000":
# Not redundant. UDT might not match the key.
# These cases get reported as UDT mismatch.
self._set("is_forward_ref", True)
else:
field_list_type = normalize_type_id(match.group("field_type"))
self._set("field_list_type", field_list_type)
elif line.lstrip().startswith("Derivation list type"):
# We do not care about the second line, but we still match it so we see an error
# when another line fails to match
pass
elif (match := self.CLASS_NAME_RE.match(line)) is not None:
# Last line has the vital information.
# If this is a FORWARD REF, we need to follow the UDT pointer
# to get the actual class details.
elif (match := self.CLASS_NAME_RE.match(line)) is not None:
self._set("name", match.group("name"))
self._set("udt", normalize_type_id(match.group("udt")))
self._set("size", int(match.group("size")))
self._set("name", match.group("name"))
udt = match.group("udt")
if udt is not None:
self._set("udt", normalize_type_id(udt))
self._set("size", int(match.group("size")))
else:
logger.error("Unmatched line in class: %s", line[:-1])
def read_arglist_line(self, line: str):
if (match := self.LF_ARGLIST_ENTRY.match(line)) is not None:
obj = self.keys[self.last_key]
arglist: list = obj.setdefault("args", [])
assert int(match.group("index")) == len(
arglist
), "Argument list out of sync"
arglist.append(match.group("arg_type"))
else:
logger.error("Unmatched line in arglist: %s", line[:-1])
def read_pointer_line(self, line):
if (match := self.LF_POINTER_ELEMENT.match(line)) is not None:
self._set("element_type", match.group("element_type"))
else:
stripped_line = line.strip()
# We don't parse these lines, but we still want to check for exhaustiveness
# in case we missed some relevant data
if not any(
stripped_line.startswith(prefix)
for prefix in ["Pointer", "const Pointer", "L-value", "volatile"]
):
logger.error("Unrecognized pointer attribute: %s", line[:-1])
def read_mfunction_line(self, line: str):
"""
The layout is not consistent, so we want to be as robust as possible here.
- Example 1:
Return type = T_LONG(0012), Call type = C Near
Func attr = none
- Example 2:
Return type = T_CHAR(0010), Class type = 0x101A, This type = 0x101B,
Call type = ThisCall, Func attr = none
"""
obj = self.keys[self.last_key]
key_value_pairs = line.split(",")
for pair in key_value_pairs:
if pair.isspace():
continue
obj |= self.parse_function_attribute(pair)
def parse_function_attribute(self, pair: str) -> dict[str, str]:
for attribute_regex in self.LF_MFUNCTION_ATTRIBUTES:
if (match := attribute_regex.match(pair)) is not None:
return match.groupdict()
logger.error("Unknown attribute in function: %s", pair)
return {}
def read_enum_line(self, line: str):
obj = self.keys[self.last_key]
# We need special comma handling because commas may appear in the name.
# Splitting by "," yields the wrong result.
enum_attributes = line.split(", ")
for pair in enum_attributes:
if pair.endswith(","):
pair = pair[:-1]
if pair.isspace():
continue
obj |= self.parse_enum_attribute(pair)
def parse_enum_attribute(self, attribute: str) -> dict[str, Any]:
for attribute_regex in self.LF_ENUM_ATTRIBUTES:
if (match := attribute_regex.match(attribute)) is not None:
return match.groupdict()
if attribute == "NESTED":
return {"is_nested": True}
if attribute == "FORWARD REF":
return {"is_forward_ref": True}
if attribute.startswith("UDT"):
match = self.LF_ENUM_UDT.match(attribute)
assert match is not None
return {"udt": normalize_type_id(match.group("udt"))}
if (match := self.LF_ENUM_TYPES.match(attribute)) is not None:
result = match.groupdict()
result["underlying_type"] = normalize_type_id(result["underlying_type"])
return result
logger.error("Unknown attribute in enum: %s", attribute)
return {}
def read_union_line(self, line: str):
"""This is a rather barebones handler, only parsing the size"""
if (match := self.LF_UNION_LINE.match(line)) is None:
raise AssertionError(f"Unhandled in union: {line}")
self._set("name", match.group("name"))
if match.group("field_type") == "0x0000":
self._set("is_forward_ref", True)
self._set("size", int(match.group("size")))
self._set("udt", normalize_type_id(match.group("udt")))
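To illustrate the new LF_MFUNCTION/LF_PROCEDURE and LF_ENUM attribute handling, here is a small sketch that runs the attribute parsers on isolated fragments in the formats the regexes above expect (the values are illustrative):

parser = CvdumpTypesParser()
# LF_MFUNCTION / LF_PROCEDURE lines are comma-separated key-value pairs
print(parser.parse_function_attribute(" Call type = ThisCall"))  # {'call_type': 'ThisCall'}
# LF_ENUM attributes mix key-value pairs with bare flags
print(parser.parse_enum_attribute("# members = 64"))             # {'num_members': '64'}
print(parser.parse_enum_attribute("FORWARD REF"))                # {'is_forward_ref': True}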