Add Ghidra function import script (#909)

* Add draft for Ghidra function import script

* feature: Basic PDB analysis [skip ci]

This is a draft with a lot of open questions left. Please do not merge

* Refactor: Introduce submodules and reload remedy

* refactor types and make them Python 3.9 compatible

* run black

* WIP: save progress

* fix types and small type safety violations

* fix another Python 3.9 syntax incompatibility

* Implement struct imports [skip ci]

- This code is still in dire need of refactoring and tests
- Only a single-digit number of issues remain, and 2600 functions can be imported
- The biggest remaining error is mismatched stacks

* Refactor, implement enums, fix lots of bugs

* fix Python 3.9 issue

* refactor: address review comments

Not sure why VS Code suddenly decides to remove some empty spaces, but they don't make sense anyway

* add unit tests for new type parsers, fix linter issue

* refactor: db access from pdb_extraction.py

* Fix stack layout offset error

* fix: Undo incorrect reference change

* Fix CI issue

* Improve READMEs (fix typos, add information)

---------

Co-authored-by: jonschz <jonschz@users.noreply.github.com>
jonschz
2024-06-09 14:41:24 +02:00
committed by GitHub
parent 88805f9fcb
commit f26c30974a
21 changed files with 1824 additions and 114 deletions

View File

@@ -4,7 +4,7 @@ import difflib
import struct
import uuid
from dataclasses import dataclass
from typing import Callable, Iterable, List, Optional
from typing import Any, Callable, Iterable, List, Optional
from isledecomp.bin import Bin as IsleBin, InvalidVirtualAddressError
from isledecomp.cvdump.demangler import demangle_string_const
from isledecomp.cvdump import Cvdump, CvdumpAnalysis
@@ -90,7 +90,7 @@ class Compare:
def _load_cvdump(self):
logger.info("Parsing %s ...", self.pdb_file)
cv = (
self.cv = (
Cvdump(self.pdb_file)
.lines()
.globals()
@@ -100,9 +100,9 @@ class Compare:
.types()
.run()
)
res = CvdumpAnalysis(cv)
self.cvdump_analysis = CvdumpAnalysis(self.cv)
for sym in res.nodes:
for sym in self.cvdump_analysis.nodes:
# Skip nodes where we have almost no information.
# These probably came from SECTION CONTRIBUTIONS.
if sym.name() is None and sym.node_type is None:
@@ -116,6 +116,7 @@ class Compare:
continue
addr = self.recomp_bin.get_abs_addr(sym.section, sym.offset)
sym.addr = addr
# If this symbol is the final one in its section, we were not able to
# estimate its size because we didn't have the total size of that section.
@@ -165,7 +166,10 @@ class Compare:
addr, sym.node_type, sym.name(), sym.decorated_name, sym.size()
)
for (section, offset), (filename, line_no) in res.verified_lines.items():
for (section, offset), (
filename,
line_no,
) in self.cvdump_analysis.verified_lines.items():
addr = self.recomp_bin.get_abs_addr(section, offset)
self._lines_db.add_line(filename, line_no, addr)
@@ -736,6 +740,9 @@ class Compare:
def get_variables(self) -> List[MatchInfo]:
return self._db.get_matches_by_type(SymbolType.DATA)
def get_match_options(self, addr: int) -> Optional[dict[str, Any]]:
return self._db.get_match_options(addr)
def compare_address(self, addr: int) -> Optional[DiffReport]:
match = self._db.get_one_match(addr)
if match is None:

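These hunks keep the Cvdump run and its analysis on the Compare instance (self.cv, self.cvdump_analysis) instead of throwing them away as locals, so code outside _load_cvdump can reach the per-symbol data later. A minimal sketch of that access pattern, assuming a fully initialized Compare instance named compare (the printed fields are purely for illustration):

# Hypothetical downstream use: walk the retained analysis nodes.
for node in compare.cvdump_analysis.nodes:
    if node.symbol_entry is None or node.addr is None:
        continue  # only S_GPROC32 functions carry a SymbolsEntry
    # addr was resolved from section:offset during _load_cvdump()
    print(hex(node.addr), node.name(), len(node.symbol_entry.stack_symbols))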
View File

@@ -2,7 +2,7 @@
addresses/symbols that we want to compare between the original and recompiled binaries."""
import sqlite3
import logging
from typing import List, Optional
from typing import Any, List, Optional
from isledecomp.types import SymbolType
from isledecomp.cvdump.demangler import get_vtordisp_name
@@ -335,7 +335,7 @@ class CompareDb:
def skip_compare(self, orig: int):
self._set_opt_bool(orig, "skip")
def get_match_options(self, addr: int) -> Optional[dict]:
def get_match_options(self, addr: int) -> Optional[dict[str, Any]]:
cur = self._db.execute(
"""SELECT name, value FROM `match_options` WHERE addr = ?""", (addr,)
)

View File

@@ -1,3 +1,4 @@
from .symbols import SymbolsEntry
from .analysis import CvdumpAnalysis
from .parser import CvdumpParser
from .runner import Cvdump

View File

@@ -1,5 +1,7 @@
"""For collating the results from parsing cvdump.exe into a more directly useful format."""
from typing import Dict, List, Tuple, Optional
from isledecomp.cvdump import SymbolsEntry
from isledecomp.types import SymbolType
from .parser import CvdumpParser
from .demangler import demangle_string_const, demangle_vtable
@@ -31,6 +33,8 @@ class CvdumpNode:
# Size as reported by SECTION CONTRIBUTIONS section. Not guaranteed to be
# accurate.
section_contribution: Optional[int] = None
addr: Optional[int] = None
symbol_entry: Optional[SymbolsEntry] = None
def __init__(self, section: int, offset: int) -> None:
self.section = section
@@ -87,13 +91,12 @@ class CvdumpAnalysis:
"""Collects the results from CvdumpParser into a list of nodes (i.e. symbols).
These can then be analyzed by a downstream tool."""
nodes = List[CvdumpNode]
verified_lines = Dict[Tuple[str, str], Tuple[str, str]]
verified_lines: Dict[Tuple[str, str], Tuple[str, str]]
def __init__(self, parser: CvdumpParser):
"""Read in as much information as we have from the parser.
The more sections we have, the better our information will be."""
node_dict = {}
node_dict: Dict[Tuple[int, int], CvdumpNode] = {}
# PUBLICS is our roadmap for everything that follows.
for pub in parser.publics:
@@ -158,8 +161,11 @@ class CvdumpAnalysis:
node_dict[key].friendly_name = sym.name
node_dict[key].confirmed_size = sym.size
node_dict[key].node_type = SymbolType.FUNCTION
node_dict[key].symbol_entry = sym
self.nodes = [v for _, v in dict(sorted(node_dict.items())).items()]
self.nodes: List[CvdumpNode] = [
v for _, v in dict(sorted(node_dict.items())).items()
]
self._estimate_size()
def _estimate_size(self):

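The node list is now an explicitly typed list built from node_dict sorted by (section, offset). Keeping that order is presumably what lets _estimate_size derive a symbol's size from the gap to its successor, which would also explain why the last symbol of a section cannot be estimated (see the comment in compare/core.py above). A rough standalone sketch of that idea, not the project's actual implementation:

# Sketch: estimate sizes from the gaps between consecutive symbols of a section.
offsets = [(1, 0x100), (1, 0x130), (1, 0x200)]  # (section, offset), pre-sorted
for (sec, off), (next_sec, next_off) in zip(offsets, offsets[1:]):
    if sec == next_sec:
        print(f"{sec:04x}:{off:08x} -> estimated size {next_off - off:#x}")
# the last symbol in a section has no successor, so its size stays unknown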
View File

@@ -2,6 +2,7 @@ import re
from typing import Iterable, Tuple
from collections import namedtuple
from .types import CvdumpTypesParser
from .symbols import CvdumpSymbolsParser
# e.g. `*** PUBLICS`
_section_change_regex = re.compile(r"\*\*\* (?P<section>[A-Z/ ]{2,})")
@@ -20,11 +21,6 @@ _publics_line_regex = re.compile(
r"^(?P<type>\w+): \[(?P<section>\w{4}):(?P<offset>\w{8})], Flags: (?P<flags>\w{8}), (?P<name>\S+)"
)
# e.g. `(00008C) S_GPROC32: [0001:00034E90], Cb: 00000007, Type: 0x1024, ViewROI::IntrinsicImportance`
_symbol_line_regex = re.compile(
r"\(\w+\) (?P<type>\S+): \[(?P<section>\w{4}):(?P<offset>\w{8})\], Cb: (?P<size>\w+), Type:\s+\S+, (?P<name>.+)"
)
# e.g. ` Debug start: 00000008, Debug end: 0000016E`
_gproc_debug_regex = re.compile(
r"\s*Debug start: (?P<start>\w{8}), Debug end: (?P<end>\w{8})"
@@ -52,9 +48,6 @@ LinesEntry = namedtuple("LinesEntry", "filename line_no section offset")
# only place you can find the C symbols (library functions, smacker, etc)
PublicsEntry = namedtuple("PublicsEntry", "type section offset flags name")
# S_GPROC32 = functions
SymbolsEntry = namedtuple("SymbolsEntry", "type section offset size name")
# (Estimated) size of any symbol
SizeRefEntry = namedtuple("SizeRefEntry", "module section offset size")
@@ -72,12 +65,16 @@ class CvdumpParser:
self.lines = {}
self.publics = []
self.symbols = []
self.sizerefs = []
self.globals = []
self.modules = []
self.types = CvdumpTypesParser()
self.symbols_parser = CvdumpSymbolsParser()
@property
def symbols(self):
return self.symbols_parser.symbols
def _lines_section(self, line: str):
"""Parsing entries from the LINES section. We only care about the pairs of
@@ -127,20 +124,6 @@ class CvdumpParser:
)
)
def _symbols_section(self, line: str):
"""We are interested in S_GPROC32 symbols only."""
if (match := _symbol_line_regex.match(line)) is not None:
if match.group("type") == "S_GPROC32":
self.symbols.append(
SymbolsEntry(
type=match.group("type"),
section=int(match.group("section"), 16),
offset=int(match.group("offset"), 16),
size=int(match.group("size"), 16),
name=match.group("name"),
)
)
def _section_contributions(self, line: str):
"""Gives the size of elements across all sections of the binary.
This is the easiest way to get the data size for .data and .rdata
@@ -177,7 +160,7 @@ class CvdumpParser:
self.types.read_line(line)
elif self._section == "SYMBOLS":
self._symbols_section(line)
self.symbols_parser.read_line(line)
elif self._section == "LINES":
self._lines_section(line)

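With the inline _symbols_section handler removed, SYMBOLS lines are forwarded to the new CvdumpSymbolsParser, and existing callers keep working because symbols is now a property exposing the sub-parser's list. A tiny sketch of that delegation:

parser = CvdumpParser()
# `symbols` is a read-only view onto the sub-parser's list, so code that
# iterated parser.symbols before this change keeps working unmodified.
assert parser.symbols is parser.symbols_parser.symbols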
View File

@@ -0,0 +1,153 @@
from dataclasses import dataclass, field
import logging
import re
from re import Match
from typing import NamedTuple, Optional
logger = logging.getLogger(__name__)
class StackOrRegisterSymbol(NamedTuple):
symbol_type: str
location: str
"""Should always be set/converted to lowercase."""
data_type: str
name: str
# S_GPROC32 = functions
@dataclass
class SymbolsEntry:
# pylint: disable=too-many-instance-attributes
type: str
section: int
offset: int
size: int
func_type: str
name: str
stack_symbols: list[StackOrRegisterSymbol] = field(default_factory=list)
frame_pointer_present: bool = False
addr: Optional[int] = None # Absolute address. Will be set later, if at all
class CvdumpSymbolsParser:
_symbol_line_generic_regex = re.compile(
r"\(\w+\)\s+(?P<symbol_type>[^\s:]+)(?::\s+(?P<second_part>\S.*))?|(?::)$"
)
"""
Parses the first part, e.g. `(00008C) S_GPROC32`, and splits off the second part after the colon (if it exists).
There are three cases:
- no colon, e.g. `(000350) S_END`
- colon but no data, e.g. `(000370) S_COMPILE:`
- colon and data, e.g. `(000304) S_REGISTER: esi, Type: 0x1E14, this`
"""
_symbol_line_function_regex = re.compile(
r"\[(?P<section>\w{4}):(?P<offset>\w{8})\], Cb: (?P<size>\w+), Type:\s+(?P<func_type>[^\s,]+), (?P<name>.+)"
)
"""
Parses the second part of a function symbol, e.g.
`[0001:00034E90], Cb: 00000007, Type: 0x1024, ViewROI::IntrinsicImportance`
"""
# the second part of e.g.
_stack_register_symbol_regex = re.compile(
r"(?P<location>\S+), Type:\s+(?P<data_type>[\w()]+), (?P<name>.+)$"
)
"""
Parses the second part of a stack or register symbol, e.g.
`esi, Type: 0x1E14, this`
"""
_debug_start_end_regex = re.compile(
r"^\s*Debug start: (?P<debug_start>\w+), Debug end: (?P<debug_end>\w+)$"
)
_parent_end_next_regex = re.compile(
r"\s*Parent: (?P<parent_addr>\w+), End: (?P<end_addr>\w+), Next: (?P<next_addr>\w+)$"
)
_flags_frame_pointer_regex = re.compile(r"\s*Flags: Frame Ptr Present$")
_register_stack_symbols = ["S_BPREL32", "S_REGISTER"]
# List the unhandled types so we can check exhaustiveness
_unhandled_symbols = [
"S_COMPILE",
"S_OBJNAME",
"S_THUNK32",
"S_LABEL32",
"S_LDATA32",
"S_LPROC32",
"S_UDT",
]
"""Parser for cvdump output, SYMBOLS section."""
def __init__(self):
self.symbols: list[SymbolsEntry] = []
self.current_function: Optional[SymbolsEntry] = None
def read_line(self, line: str):
if (match := self._symbol_line_generic_regex.match(line)) is not None:
self._parse_generic_case(line, match)
elif (match := self._parent_end_next_regex.match(line)) is not None:
# We do not need this info at the moment, might be useful in the future
pass
elif (match := self._debug_start_end_regex.match(line)) is not None:
# We do not need this info at the moment, might be useful in the future
pass
elif (match := self._flags_frame_pointer_regex.match(line)) is not None:
if self.current_function is None:
logger.error(
"Found a `Flags: Frame Ptr Present` but self.current_function is None"
)
return
self.current_function.frame_pointer_present = True
else:
# Most of these are either `** Module: [...]` or data we do not care about
logger.debug("Unhandled line: %s", line[:-1])
def _parse_generic_case(self, line, line_match: Match[str]):
symbol_type: str = line_match.group("symbol_type")
second_part: Optional[str] = line_match.group("second_part")
if symbol_type == "S_GPROC32":
assert second_part is not None
if (match := self._symbol_line_function_regex.match(second_part)) is None:
logger.error("Invalid function symbol: %s", line[:-1])
return
self.current_function = SymbolsEntry(
type=symbol_type,
section=int(match.group("section"), 16),
offset=int(match.group("offset"), 16),
size=int(match.group("size"), 16),
func_type=match.group("func_type"),
name=match.group("name"),
)
self.symbols.append(self.current_function)
elif symbol_type in self._register_stack_symbols:
assert second_part is not None
if self.current_function is None:
logger.error("Found stack/register outside of function: %s", line[:-1])
return
if (match := self._stack_register_symbol_regex.match(second_part)) is None:
logger.error("Invalid stack/register symbol: %s", line[:-1])
return
new_symbol = StackOrRegisterSymbol(
symbol_type=symbol_type,
location=match.group("location").lower(),
data_type=match.group("data_type"),
name=match.group("name"),
)
self.current_function.stack_symbols.append(new_symbol)
elif symbol_type == "S_END":
self.current_function = None
elif symbol_type in self._unhandled_symbols:
return
else:
logger.error("Unhandled symbol type: %s", line)

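A rough usage sketch of the new parser, feeding it the line formats quoted in the docstrings above (real cvdump output may differ in spacing):

parser = CvdumpSymbolsParser()
parser.read_line(
    "(00008C) S_GPROC32: [0001:00034E90], Cb: 00000007, Type: 0x1024, ViewROI::IntrinsicImportance"
)
parser.read_line("(000304) S_REGISTER: esi, Type: 0x1E14, this")
parser.read_line("(000350) S_END")

func = parser.symbols[0]
# -> ViewROI::IntrinsicImportance 0x34e90 [StackOrRegisterSymbol(symbol_type='S_REGISTER', location='esi', ...)]
print(func.name, hex(func.offset), func.stack_symbols)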
View File

@@ -1,5 +1,9 @@
import re
from typing import Dict, List, NamedTuple, Optional
import logging
from typing import Any, Dict, List, NamedTuple, Optional
logger = logging.getLogger(__name__)
class CvdumpTypeError(Exception):
@@ -42,7 +46,7 @@ class ScalarType(NamedTuple):
class TypeInfo(NamedTuple):
key: str
size: int
size: Optional[int]
name: Optional[str] = None
members: Optional[List[FieldListItem]] = None
@@ -156,6 +160,10 @@ class CvdumpTypesParser:
# LF_FIELDLIST member name (2/2)
MEMBER_RE = re.compile(r"^\s+member name = '(?P<name>.*)'$")
LF_FIELDLIST_ENUMERATE = re.compile(
r"^\s+list\[\d+\] = LF_ENUMERATE,.*value = (?P<value>\d+), name = '(?P<name>[^']+)'$"
)
# LF_ARRAY element type
ARRAY_ELEMENT_RE = re.compile(r"^\s+Element type = (?P<type>.*)")
@@ -169,12 +177,53 @@ class CvdumpTypesParser:
# LF_CLASS/LF_STRUCTURE name and other info
CLASS_NAME_RE = re.compile(
r"^\s+Size = (?P<size>\d+), class name = (?P<name>.+), UDT\((?P<udt>0x\w+)\)"
r"^\s+Size = (?P<size>\d+), class name = (?P<name>(?:[^,]|,\S)+)(?:, UDT\((?P<udt>0x\w+)\))?"
)
# LF_MODIFIER, type being modified
MODIFIES_RE = re.compile(r".*modifies type (?P<type>.*)$")
# LF_ARGLIST number of entries
LF_ARGLIST_ARGCOUNT = re.compile(r".*argument count = (?P<argcount>\d+)$")
# LF_ARGLIST list entry
LF_ARGLIST_ENTRY = re.compile(
r"^\s+list\[(?P<index>\d+)\] = (?P<arg_type>[\w()]+)$"
)
# LF_POINTER element
LF_POINTER_ELEMENT = re.compile(r"^\s+Element type : (?P<element_type>.+)$")
# LF_MFUNCTION attribute key-value pairs
LF_MFUNCTION_ATTRIBUTES = [
re.compile(r"\s*Return type = (?P<return_type>[\w()]+)$"),
re.compile(r"\s*Class type = (?P<class_type>[\w()]+)$"),
re.compile(r"\s*This type = (?P<this_type>[\w()]+)$"),
# Call type may contain whitespace
re.compile(r"\s*Call type = (?P<call_type>[\w()\s]+)$"),
re.compile(r"\s*Parms = (?P<num_params>[\w()]+)$"), # LF_MFUNCTION only
re.compile(r"\s*# Parms = (?P<num_params>[\w()]+)$"), # LF_PROCEDURE only
re.compile(r"\s*Arg list type = (?P<arg_list_type>[\w()]+)$"),
re.compile(
r"\s*This adjust = (?P<this_adjust>[\w()]+)$"
), # TODO: figure out the meaning
re.compile(
r"\s*Func attr = (?P<func_attr>[\w()]+)$"
), # Only for completeness, is always `none`
]
LF_ENUM_ATTRIBUTES = [
re.compile(r"^\s*# members = (?P<num_members>\d+)$"),
re.compile(r"^\s*enum name = (?P<name>.+)$"),
]
LF_ENUM_TYPES = re.compile(
r"^\s*type = (?P<underlying_type>\S+) field list type (?P<field_type>0x\w{4})$"
)
LF_ENUM_UDT = re.compile(r"^\s*UDT\((?P<udt>0x\w+)\)$")
LF_UNION_LINE = re.compile(
r"^.*field list type (?P<field_type>0x\w+),.*Size = (?P<size>\d+)\s*,class name = (?P<name>(?:[^,]|,\S)+),\s.*UDT\((?P<udt>0x\w+)\)$"
)
MODES_OF_INTEREST = {
"LF_ARRAY",
"LF_CLASS",
@@ -183,12 +232,16 @@ class CvdumpTypesParser:
"LF_MODIFIER",
"LF_POINTER",
"LF_STRUCTURE",
"LF_ARGLIST",
"LF_MFUNCTION",
"LF_PROCEDURE",
"LF_UNION",
}
def __init__(self) -> None:
self.mode: Optional[str] = None
self.last_key = ""
self.keys = {}
self.keys: Dict[str, Dict[str, Any]] = {}
def _new_type(self):
"""Prepare a new dict for the type we just parsed.
@@ -211,13 +264,20 @@ class CvdumpTypesParser:
obj = self.keys[self.last_key]
obj["members"][-1]["name"] = name
def _get_field_list(self, type_obj: Dict) -> List[FieldListItem]:
def _add_variant(self, name: str, value: int):
obj = self.keys[self.last_key]
if "variants" not in obj:
obj["variants"] = []
variants: list[dict[str, Any]] = obj["variants"]
variants.append({"name": name, "value": value})
def _get_field_list(self, type_obj: Dict[str, Any]) -> List[FieldListItem]:
"""Return the field list for the given LF_CLASS/LF_STRUCTURE reference"""
if type_obj.get("type") == "LF_FIELDLIST":
field_obj = type_obj
else:
field_list_type = type_obj.get("field_list_type")
field_list_type = type_obj["field_list_type"]
field_obj = self.keys[field_list_type]
members: List[FieldListItem] = []
@@ -253,6 +313,9 @@ class CvdumpTypesParser:
raise CvdumpIntegrityError("No array element type")
array_element_size = self.get(array_type).size
assert (
array_element_size is not None
), "Encountered an array whose type has no size"
n_elements = type_obj["size"] // array_element_size
@@ -285,7 +348,10 @@ class CvdumpTypesParser:
# These type references are just a wrapper around a scalar
if obj.get("type") == "LF_ENUM":
return self.get("T_INT4")
underlying_type = obj.get("underlying_type")
if underlying_type is None:
raise CvdumpKeyError(f"Missing 'underlying_type' in {obj}")
return self.get(underlying_type)
if obj.get("type") == "LF_POINTER":
return self.get("T_32PVOID")
@@ -350,6 +416,9 @@ class CvdumpTypesParser:
obj = self.get(type_key)
total_size = obj.size
assert (
total_size is not None
), "Called get_scalar_gapless() on a type without size"
scalars = self.get_scalars(type_key)
@@ -383,6 +452,11 @@ class CvdumpTypesParser:
return member_list_to_struct_string(members)
def read_line(self, line: str):
if line.endswith("\n"):
line = line[:-1]
if len(line) == 0:
return
if (match := self.INDEX_RE.match(line)) is not None:
type_ = match.group(2)
if type_ not in self.MODES_OF_INTEREST:
@@ -393,6 +467,12 @@ class CvdumpTypesParser:
self.last_key = match.group(1)
self.mode = type_
self._new_type()
if type_ == "LF_ARGLIST":
submatch = self.LF_ARGLIST_ARGCOUNT.match(line)
assert submatch is not None
self.keys[self.last_key]["argcount"] = int(submatch.group("argcount"))
# TODO: This should be validated in another pass
return
if self.mode is None:
@@ -413,41 +493,170 @@ class CvdumpTypesParser:
self._set("size", int(match.group("length")))
elif self.mode == "LF_FIELDLIST":
# If this class has a vtable, create a mock member at offset 0
if (match := self.VTABLE_RE.match(line)) is not None:
# For our purposes, any pointer type will do
self._add_member(0, "T_32PVOID")
self._set_member_name("vftable")
self.read_fieldlist_line(line)
# Superclass is set here in the fieldlist rather than in LF_CLASS
elif (match := self.SUPERCLASS_RE.match(line)) is not None:
self._set("super", normalize_type_id(match.group("type")))
elif self.mode == "LF_ARGLIST":
self.read_arglist_line(line)
# Member offset and type given on the first of two lines.
elif (match := self.LIST_RE.match(line)) is not None:
self._add_member(
int(match.group("offset")), normalize_type_id(match.group("type"))
)
elif self.mode in ["LF_MFUNCTION", "LF_PROCEDURE"]:
self.read_mfunction_line(line)
# Name of the member read on the second of two lines.
elif (match := self.MEMBER_RE.match(line)) is not None:
self._set_member_name(match.group("name"))
elif self.mode in ["LF_CLASS", "LF_STRUCTURE"]:
self.read_class_or_struct_line(line)
else: # LF_CLASS or LF_STRUCTURE
# Match the reference to the associated LF_FIELDLIST
if (match := self.CLASS_FIELD_RE.match(line)) is not None:
if match.group("field_type") == "0x0000":
# Not redundant. UDT might not match the key.
# These cases get reported as UDT mismatch.
self._set("is_forward_ref", True)
else:
field_list_type = normalize_type_id(match.group("field_type"))
self._set("field_list_type", field_list_type)
elif self.mode == "LF_POINTER":
self.read_pointer_line(line)
elif self.mode == "LF_ENUM":
self.read_enum_line(line)
elif self.mode == "LF_UNION":
self.read_union_line(line)
else:
# Check for exhaustiveness
logger.error("Unhandled data in mode: %s", self.mode)
def read_fieldlist_line(self, line: str):
# If this class has a vtable, create a mock member at offset 0
if (match := self.VTABLE_RE.match(line)) is not None:
# For our purposes, any pointer type will do
self._add_member(0, "T_32PVOID")
self._set_member_name("vftable")
# Superclass is set here in the fieldlist rather than in LF_CLASS
elif (match := self.SUPERCLASS_RE.match(line)) is not None:
self._set("super", normalize_type_id(match.group("type")))
# Member offset and type given on the first of two lines.
elif (match := self.LIST_RE.match(line)) is not None:
self._add_member(
int(match.group("offset")), normalize_type_id(match.group("type"))
)
# Name of the member read on the second of two lines.
elif (match := self.MEMBER_RE.match(line)) is not None:
self._set_member_name(match.group("name"))
elif (match := self.LF_FIELDLIST_ENUMERATE.match(line)) is not None:
self._add_variant(match.group("name"), int(match.group("value")))
def read_class_or_struct_line(self, line: str):
# Match the reference to the associated LF_FIELDLIST
if (match := self.CLASS_FIELD_RE.match(line)) is not None:
if match.group("field_type") == "0x0000":
# Not redundant. UDT might not match the key.
# These cases get reported as UDT mismatch.
self._set("is_forward_ref", True)
else:
field_list_type = normalize_type_id(match.group("field_type"))
self._set("field_list_type", field_list_type)
elif line.lstrip().startswith("Derivation list type"):
# We do not care about the second line, but we still match it so we see an error
# when another line fails to match
pass
elif (match := self.CLASS_NAME_RE.match(line)) is not None:
# Last line has the vital information.
# If this is a FORWARD REF, we need to follow the UDT pointer
# to get the actual class details.
elif (match := self.CLASS_NAME_RE.match(line)) is not None:
self._set("name", match.group("name"))
self._set("udt", normalize_type_id(match.group("udt")))
self._set("size", int(match.group("size")))
self._set("name", match.group("name"))
udt = match.group("udt")
if udt is not None:
self._set("udt", normalize_type_id(udt))
self._set("size", int(match.group("size")))
else:
logger.error("Unmatched line in class: %s", line[:-1])
def read_arglist_line(self, line: str):
if (match := self.LF_ARGLIST_ENTRY.match(line)) is not None:
obj = self.keys[self.last_key]
arglist: list = obj.setdefault("args", [])
assert int(match.group("index")) == len(
arglist
), "Argument list out of sync"
arglist.append(match.group("arg_type"))
else:
logger.error("Unmatched line in arglist: %s", line[:-1])
def read_pointer_line(self, line):
if (match := self.LF_POINTER_ELEMENT.match(line)) is not None:
self._set("element_type", match.group("element_type"))
else:
stripped_line = line.strip()
# We don't parse these lines, but we still want to check for exhaustiveness
# in case we missed some relevant data
if not any(
stripped_line.startswith(prefix)
for prefix in ["Pointer", "const Pointer", "L-value", "volatile"]
):
logger.error("Unrecognized pointer attribute: %s", line[:-1])
def read_mfunction_line(self, line: str):
"""
The layout is not consistent, so we want to be as robust as possible here.
- Example 1:
Return type = T_LONG(0012), Call type = C Near
Func attr = none
- Example 2:
Return type = T_CHAR(0010), Class type = 0x101A, This type = 0x101B,
Call type = ThisCall, Func attr = none
"""
obj = self.keys[self.last_key]
key_value_pairs = line.split(",")
for pair in key_value_pairs:
if pair.isspace():
continue
obj |= self.parse_function_attribute(pair)
def parse_function_attribute(self, pair: str) -> dict[str, str]:
for attribute_regex in self.LF_MFUNCTION_ATTRIBUTES:
if (match := attribute_regex.match(pair)) is not None:
return match.groupdict()
logger.error("Unknown attribute in function: %s", pair)
return {}
def read_enum_line(self, line: str):
obj = self.keys[self.last_key]
# We need special comma handling because commas may appear in the name.
# Splitting by "," yields the wrong result.
enum_attributes = line.split(", ")
for pair in enum_attributes:
if pair.endswith(","):
pair = pair[:-1]
if pair.isspace():
continue
obj |= self.parse_enum_attribute(pair)
def parse_enum_attribute(self, attribute: str) -> dict[str, Any]:
for attribute_regex in self.LF_ENUM_ATTRIBUTES:
if (match := attribute_regex.match(attribute)) is not None:
return match.groupdict()
if attribute == "NESTED":
return {"is_nested": True}
if attribute == "FORWARD REF":
return {"is_forward_ref": True}
if attribute.startswith("UDT"):
match = self.LF_ENUM_UDT.match(attribute)
assert match is not None
return {"udt": normalize_type_id(match.group("udt"))}
if (match := self.LF_ENUM_TYPES.match(attribute)) is not None:
result = match.groupdict()
result["underlying_type"] = normalize_type_id(result["underlying_type"])
return result
logger.error("Unknown attribute in enum: %s", attribute)
return {}
def read_union_line(self, line: str):
"""This is a rather barebones handler, only parsing the size"""
if (match := self.LF_UNION_LINE.match(line)) is None:
raise AssertionError(f"Unhandled in union: {line}")
self._set("name", match.group("name"))
if match.group("field_type") == "0x0000":
self._set("is_forward_ref", True)
self._set("size", int(match.group("size")))
self._set("udt", normalize_type_id(match.group("udt")))
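To illustrate the new LF_MFUNCTION/LF_PROCEDURE and LF_ENUM attribute handling, here is a small sketch that runs the attribute parsers on isolated fragments in the formats the regexes above expect (the values are illustrative):

parser = CvdumpTypesParser()
# LF_MFUNCTION / LF_PROCEDURE lines are comma-separated key-value pairs
print(parser.parse_function_attribute(" Call type = ThisCall"))  # {'call_type': 'ThisCall'}
# LF_ENUM attributes mix key-value pairs with bare flags
print(parser.parse_enum_attribute("# members = 64"))             # {'num_members': '64'}
print(parser.parse_enum_attribute("FORWARD REF"))                # {'is_forward_ref': True}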