Data comparison tool (#618)

* Parse cvdump TYPES section. Add datacmp tool. * Corrections * Use static * Revert "Use static" This reverts commit e0a4324e00. * Handle partially initialized variable * Shuffle order of legounksavedatawriter * Revert "Shuffle order of legounksavedatawriter" This reverts commit 506e06f117. --------- Co-authored-by: Christian Semmler <mail@csemmler.com>
2025-10-26 09:54:18 +00:00 · 2024-03-05 03:45:09 -05:00
parent 068760056a
commit ec1fcce08c
15 changed files with 1347 additions and 82 deletions
--- a/tools/isledecomp/isledecomp/compare/core.py
+++ b/tools/isledecomp/isledecomp/compare/core.py
@@ -95,6 +95,7 @@ class Compare:
            .publics()
            .symbols()
            .section_contributions()
+            .types()
            .run()
        )
        res = CvdumpAnalysis(cv)
@@ -454,6 +455,25 @@ class Compare:

    ## Public API

+    def is_pointer_match(self, orig_addr, recomp_addr) -> bool:
+        """Check whether these pointers point at the same thing"""
+
+        # Null pointers considered matching
+        if orig_addr == 0 and recomp_addr == 0:
+            return True
+
+        match = self._db.get_by_orig(orig_addr)
+        if match is None:
+            return False
+
+        return match.recomp_addr == recomp_addr
+
+    def get_by_orig(self, addr: int) -> Optional[MatchInfo]:
+        return self._db.get_by_orig(addr)
+
+    def get_by_recomp(self, addr: int) -> Optional[MatchInfo]:
+        return self._db.get_by_recomp(addr)
+
    def get_all(self) -> List[MatchInfo]:
        return self._db.get_all()

@@ -463,6 +483,9 @@ class Compare:
    def get_vtables(self) -> List[MatchInfo]:
        return self._db.get_matches_by_type(SymbolType.VTABLE)

+    def get_variables(self) -> List[MatchInfo]:
+        return self._db.get_matches_by_type(SymbolType.DATA)
+
    def compare_address(self, addr: int) -> Optional[DiffReport]:
        match = self._db.get_one_match(addr)
        if match is None:
--- a/tools/isledecomp/isledecomp/cvdump/init.py
+++ b/tools/isledecomp/isledecomp/cvdump/init.py
@@ -1,3 +1,4 @@
 from .analysis import CvdumpAnalysis
 from .parser import CvdumpParser
 from .runner import Cvdump
+from .types import CvdumpTypesParser
--- a/tools/isledecomp/isledecomp/cvdump/analysis.py
+++ b/tools/isledecomp/isledecomp/cvdump/analysis.py
@@ -1,45 +1,9 @@
 """For collating the results from parsing cvdump.exe into a more directly useful format."""
-from typing import List, Optional, Tuple
+from typing import List, Optional
 from isledecomp.types import SymbolType
 from .parser import CvdumpParser
 from .demangler import demangle_string_const, demangle_vtable
-
-
-def data_type_info(type_name: str) -> Optional[Tuple[int, bool]]:
-    """cvdump type aliases are listed here:
-    https://github.com/microsoft/microsoft-pdb/blob/master/include/cvinfo.h
-    For the given type, return tuple(size, is_pointer) if possible."""
-    # pylint: disable=too-many-return-statements
-    # TODO: refactor to be as simple as possble
-
-    # Ignore complex types. We can get the size of those from the TYPES section.
-    if not type_name.startswith("T"):
-        return None
-
-    # if 32-bit pointer
-    if type_name.startswith("T_32P"):
-        return (4, True)
-
-    if type_name.endswith("QUAD") or type_name.endswith("64"):
-        return (8, False)
-
-    if (
-        type_name.endswith("LONG")
-        or type_name.endswith("INT4")
-        or type_name.endswith("32")
-    ):
-        return (4, False)
-
-    if type_name.endswith("SHORT") or type_name.endswith("WCHAR"):
-        return (2, False)
-
-    if "CHAR" in type_name:
-        return (1, False)
-
-    if type_name in ("T_NOTYPE", "T_VOID"):
-        return (0, False)
-
-    return None
+from .types import CvdumpKeyError, CvdumpIntegrityError


 class CvdumpNode:
@@ -146,11 +110,21 @@ class CvdumpAnalysis:
            node_dict[key].node_type = SymbolType.DATA
            node_dict[key].friendly_name = glo.name

-            if (g_info := data_type_info(glo.type)) is not None:
-                (size, is_pointer) = g_info
-                node_dict[key].confirmed_size = size
-                if is_pointer:
-                    node_dict[key].node_type = SymbolType.POINTER
+            try:
+                # Check our types database for type information.
+                # If we did not parse the TYPES section, we can only
+                # get information for built-in "T_" types.
+                g_info = parser.types.get(glo.type)
+                node_dict[key].confirmed_size = g_info.size
+                # Previously we set the symbol type to POINTER here if
+                # the variable was known to be a pointer. We can derive this
+                # information later when it's time to compare the variable,
+                # so let's set these to symbol type DATA instead.
+                # POINTER will be reserved for non-variable pointer data.
+                # e.g. thunks, unwind section.
+            except (CvdumpKeyError, CvdumpIntegrityError):
+                # No big deal if we don't have complete type information.
+                pass

        for lin in parser.lines:
            key = (lin.section, lin.offset)
--- a/tools/isledecomp/isledecomp/cvdump/parser.py
+++ b/tools/isledecomp/isledecomp/cvdump/parser.py
@@ -1,9 +1,10 @@
 import re
 from typing import Iterable, Tuple
 from collections import namedtuple
+from .types import CvdumpTypesParser

 # e.g. `*** PUBLICS`
-_section_change_regex = re.compile(r"^\*\*\* (?P<section>[A-Z/ ]+)")
+_section_change_regex = re.compile(r"^\*\*\* (?P<section>[A-Z/ ]+)$")

 # e.g. `     27 00034EC0     28 00034EE2     29 00034EE7     30 00034EF4`
 _line_addr_pairs_findall = re.compile(r"\s+(?P<line_no>\d+) (?P<addr>[A-F0-9]{8})")
@@ -76,6 +77,8 @@ class CvdumpParser:
        self.globals = []
        self.modules = []

+        self.types = CvdumpTypesParser()
+
    def _lines_section(self, line: str):
        """Parsing entries from the LINES section. We only care about the pairs of
        line_number and address and the subsection header to indicate which code file
@@ -198,6 +201,9 @@ class CvdumpParser:
        elif self._section == "MODULES":
            self._modules_section(line)

+        elif self._section == "TYPES":
+            self.types.read_line(line)
+
    def read_lines(self, lines: Iterable[str]):
        for line in lines:
            self.read_line(line)
--- a/tools/isledecomp/isledecomp/cvdump/runner.py
+++ b/tools/isledecomp/isledecomp/cvdump/runner.py
@@ -14,6 +14,7 @@ class DumpOpt(Enum):
    PUBLICS = 3
    SECTION_CONTRIB = 4
    MODULES = 5
+    TYPES = 6


 cvdump_opt_map = {
@@ -23,6 +24,7 @@ cvdump_opt_map = {
    DumpOpt.PUBLICS: "-p",
    DumpOpt.SECTION_CONTRIB: "-seccontrib",
    DumpOpt.MODULES: "-m",
+    DumpOpt.TYPES: "-t",
 }


@@ -55,6 +57,10 @@ class Cvdump:
        self._options.add(DumpOpt.MODULES)
        return self

+    def types(self):
+        self._options.add(DumpOpt.TYPES)
+        return self
+
    def cmd_line(self) -> List[str]:
        cvdump_exe = lib_path_join("cvdump.exe")
        flags = [cvdump_opt_map[opt] for opt in self._options]
--- a/tools/isledecomp/isledecomp/cvdump/types.py
+++ b/tools/isledecomp/isledecomp/cvdump/types.py
@@ -0,0 +1,433 @@
+import re
+from typing import Dict, Iterator, List, NamedTuple, Optional
+
+
+class CvdumpTypeError(Exception):
+    pass
+
+
+class CvdumpKeyError(KeyError):
+    pass
+
+
+class CvdumpIntegrityError(Exception):
+    pass
+
+
+class FieldListItem(NamedTuple):
+    """Member of a class or structure"""
+
+    offset: int
+    name: str
+    type: str
+
+
+class ScalarType(NamedTuple):
+    offset: int
+    name: Optional[str]
+    type: str
+
+    @property
+    def size(self) -> int:
+        return scalar_type_size(self.type)
+
+    @property
+    def format_char(self) -> str:
+        return scalar_type_format_char(self.type)
+
+    @property
+    def is_pointer(self) -> bool:
+        return scalar_type_pointer(self.type)
+
+
+class TypeInfo(NamedTuple):
+    key: str
+    size: int
+    name: Optional[str] = None
+    members: Optional[List[FieldListItem]] = None
+
+    def is_scalar(self) -> bool:
+        # TODO: distinction between a class with zero members and no vtable?
+        return self.members is None
+
+
+def normalize_type_id(key: str) -> str:
+    """Helper for TYPES parsing to ensure a consistent format.
+    If key begins with "T_" it is a built-in type.
+    Else it is a hex string. We prefer lower case letters and
+    no leading zeroes. (UDT identifier pads to 8 characters.)"""
+    if key.startswith("T_"):
+        # Remove numeric value for "T_" type. We don't use this.
+        return key[: key.index("(")] if "(" in key else key
+
+    return hex(int(key, 16)).lower()
+
+
+def scalar_type_pointer(type_name: str) -> bool:
+    return type_name.startswith("T_32P")
+
+
+def scalar_type_size(type_name: str) -> int:
+    if scalar_type_pointer(type_name):
+        return 4
+
+    if "CHAR" in type_name:
+        return 2 if "WCHAR" in type_name else 1
+
+    if "SHORT" in type_name:
+        return 2
+
+    if "QUAD" in type_name or "64" in type_name:
+        return 8
+
+    return 4
+
+
+def scalar_type_signed(type_name: str) -> bool:
+    if scalar_type_pointer(type_name):
+        return False
+
+    # According to cvinfo.h, T_WCHAR is unsigned
+    return not type_name.startswith("T_U") and not type_name.startswith("T_W")
+
+
+def scalar_type_format_char(type_name: str) -> str:
+    if scalar_type_pointer(type_name):
+        return "L"
+
+    # "Really a char"
+    if type_name.startswith("T_RCHAR"):
+        return "c"
+
+    # floats
+    if type_name.startswith("T_REAL"):
+        return "d" if "64" in type_name else "f"
+
+    size = scalar_type_size(type_name)
+    char = ({1: "b", 2: "h", 4: "l", 8: "q"}).get(size, "l")
+
+    return char if scalar_type_signed(type_name) else char.upper()
+
+
+def member_string_iter(
+    members: List[ScalarType], size: Optional[int] = None
+) -> Iterator[str]:
+    if len(members) == 0:
+        yield "x" * (size or 0)
+
+    last_offset = 0
+    last_size = 0
+    for m in members:
+        padding = m.offset - last_offset - last_size
+        if padding > 0:
+            yield "x" * padding
+
+        yield m.format_char
+        last_offset = m.offset
+        last_size = m.size
+
+    if size is not None:
+        padding = size - (last_offset + last_size)
+        if padding > 0:
+            yield "x" * padding
+
+
+def member_list_to_struct_string(
+    members: List[ScalarType], size: Optional[int] = None
+) -> str:
+    """Create a string for use with struct.unpack
+    Will pad to `size` bytes if present."""
+    if len(members) == 0:
+        return "x" * (size or 0)
+
+    format_string = "".join(list(member_string_iter(members, size)))
+    if len(format_string) > 0:
+        return "<" + format_string
+
+    return ""
+
+
+def join_member_names(parent: str, child: Optional[str]) -> str:
+    """Helper method to combine parent/child member names.
+    Child member name is None if the child is a scalar type."""
+
+    if child is None:
+        return parent
+
+    # If the child is an array index, join without the dot
+    if child.startswith("["):
+        return f"{parent}{child}"
+
+    return f"{parent}.{child}"
+
+
+class CvdumpTypesParser:
+    """Parser for cvdump output, TYPES section.
+    Tricky enough that it demands its own parser."""
+
+    # Marks the start of a new type
+    INDEX_RE = re.compile(r"(?P<key>0x\w+) : .* (?P<type>LF_\w+)")
+
+    # LF_FIELDLIST class/struct member (1/2)
+    LIST_RE = re.compile(
+        r"\s+list\[\d+\] = LF_MEMBER, (?P<scope>\w+), type = (?P<type>.*), offset = (?P<offset>\d+)"
+    )
+
+    # LF_FIELDLIST vtable indicator
+    VTABLE_RE = re.compile(r"^\s+list\[\d+\] = LF_VFUNCTAB")
+
+    # LF_FIELDLIST superclass indicator
+    SUPERCLASS_RE = re.compile(
+        r"^\s+list\[\d+\] = LF_BCLASS, (?P<scope>\w+), type = (?P<type>.*), offset = (?P<offset>\d+)"
+    )
+
+    # LF_FIELDLIST member name (2/2)
+    MEMBER_RE = re.compile(r"^\s+member name = '(?P<name>.*)'$")
+
+    # LF_ARRAY element type
+    ARRAY_ELEMENT_RE = re.compile(r"^\s+Element type = (?P<type>.*)")
+
+    # LF_ARRAY total array size
+    ARRAY_LENGTH_RE = re.compile(r"^\s+length = (?P<length>\d+)")
+
+    # LF_CLASS/LF_STRUCTURE field list reference
+    CLASS_FIELD_RE = re.compile(
+        r"^\s+# members = \d+,  field list type (?P<field_type>0x\w+),"
+    )
+
+    # LF_CLASS/LF_STRUCTURE name and other info
+    CLASS_NAME_RE = re.compile(
+        r"^\s+Size = (?P<size>\d+), class name = (?P<name>.+), UDT\((?P<udt>0x\w+)\)"
+    )
+
+    # LF_MODIFIER, type being modified
+    MODIFIES_RE = re.compile(r".*modifies type (?P<type>.*)$")
+
+    def __init__(self) -> None:
+        self.mode = ""
+        self.last_key = ""
+        self.keys = {}
+
+    def _new_type(self):
+        """Prepare a new dict for the type we just parsed.
+        The id is self.last_key and the "type" of type is self.mode.
+        e.g. LF_CLASS"""
+        self.keys[self.last_key] = {"type": self.mode}
+
+    def _set(self, key: str, value):
+        self.keys[self.last_key][key] = value
+
+    def _add_member(self, offset: int, type_: str):
+        obj = self.keys[self.last_key]
+        if "members" not in obj:
+            obj["members"] = []
+
+        obj["members"].append({"offset": offset, "type": type_})
+
+    def _set_member_name(self, name: str):
+        """Set name for most recently added member."""
+        obj = self.keys[self.last_key]
+        obj["members"][-1]["name"] = name
+
+    def _get_field_list(self, type_obj: Dict) -> List[FieldListItem]:
+        """Return the field list for the given LF_CLASS/LF_STRUCTURE reference"""
+
+        if type_obj.get("type") == "LF_FIELDLIST":
+            field_obj = type_obj
+        else:
+            field_list_type = type_obj.get("field_list_type")
+            field_obj = self.keys[field_list_type]
+
+        members: List[FieldListItem] = []
+
+        super_id = field_obj.get("super")
+        if super_id is not None:
+            # May need to resolve forward ref.
+            superclass = self.get(super_id)
+            if superclass.members is not None:
+                members = superclass.members
+
+        raw_members = field_obj.get("members", [])
+        members += [
+            FieldListItem(
+                offset=m["offset"],
+                type=m["type"],
+                name=m["name"],
+            )
+            for m in raw_members
+        ]
+
+        return sorted(members, key=lambda m: m.offset)
+
+    def _mock_array_members(self, type_obj: Dict) -> List[FieldListItem]:
+        """LF_ARRAY elements provide the element type and the total size.
+        We want the list of "members" as if this was a struct."""
+
+        if type_obj.get("type") != "LF_ARRAY":
+            raise CvdumpTypeError("Type is not an LF_ARRAY")
+
+        array_type = type_obj.get("array_type")
+        if array_type is None:
+            raise CvdumpIntegrityError("No array element type")
+
+        array_element_size = self.get(array_type).size
+
+        n_elements = type_obj["size"] // array_element_size
+
+        return [
+            FieldListItem(
+                offset=i * array_element_size,
+                type=array_type,
+                name=f"[{i}]",
+            )
+            for i in range(n_elements)
+        ]
+
+    def get(self, type_key: str) -> TypeInfo:
+        """Convert our dictionary values read from the cvdump output
+        into a consistent format for the given type."""
+
+        # Scalar type. Handled here because it makes the recursive steps
+        # much simpler.
+        if type_key.startswith("T_"):
+            size = scalar_type_size(type_key)
+            return TypeInfo(
+                key=type_key,
+                size=size,
+            )
+
+        # Go to our dictionary to find it.
+        obj = self.keys.get(type_key.lower())
+        if obj is None:
+            raise CvdumpKeyError(type_key)
+
+        # These type references are just a wrapper around a scalar
+        if obj.get("type") == "LF_ENUM":
+            return self.get("T_INT4")
+
+        if obj.get("type") == "LF_POINTER":
+            return self.get("T_32PVOID")
+
+        if obj.get("is_forward_ref", False):
+            # Get the forward reference to follow.
+            # If this is LF_CLASS/LF_STRUCTURE, it is the UDT value.
+            # For LF_MODIFIER, it is the type being modified.
+            forward_ref = obj.get("udt", None) or obj.get("modifies", None)
+            if forward_ref is None:
+                raise CvdumpIntegrityError(f"Null forward ref for type {type_key}")
+
+            return self.get(forward_ref)
+
+        # Else it is not a forward reference, so build out the object here.
+        if obj.get("type") == "LF_ARRAY":
+            members = self._mock_array_members(obj)
+        else:
+            members = self._get_field_list(obj)
+
+        return TypeInfo(
+            key=type_key,
+            size=obj.get("size"),
+            name=obj.get("name"),
+            members=members,
+        )
+
+    def get_by_name(self, name: str) -> TypeInfo:
+        """Find the complex type with the given name."""
+        # TODO
+        raise NotImplementedError
+
+    def get_scalars(self, type_key: str) -> List[ScalarType]:
+        """Reduce the given type to a list of scalars so we can
+        compare each component value."""
+
+        obj = self.get(type_key)
+        if obj.is_scalar():
+            # Use obj.key here for alias types like LF_POINTER
+            return [ScalarType(offset=0, type=obj.key, name=None)]
+
+        # mypy?
+        assert obj.members is not None
+
+        # Dedupe repeated offsets if this is a union type
+        unique_offsets = {m.offset: m for m in obj.members}
+        unique_members = [m for _, m in unique_offsets.items()]
+
+        return [
+            ScalarType(
+                offset=m.offset + cm.offset,
+                type=cm.type,
+                name=join_member_names(m.name, cm.name),
+            )
+            for m in unique_members
+            for cm in self.get_scalars(m.type)
+        ]
+
+    def get_format_string(self, type_key: str) -> str:
+        obj = self.get(type_key)
+        members = self.get_scalars(type_key)
+        # We need both to pad the data to size
+        return member_list_to_struct_string(members, obj.size)
+
+    def read_line(self, line: str):
+        if (match := self.INDEX_RE.match(line)) is not None:
+            self.last_key = normalize_type_id(match.group("key"))
+            self.mode = match.group("type")
+            self._new_type()
+
+            # We don't need to read anything else from here (for now)
+            if self.mode in ("LF_ENUM", "LF_POINTER"):
+                self._set("size", 4)
+
+        if self.mode == "LF_MODIFIER":
+            if (match := self.MODIFIES_RE.match(line)) is not None:
+                # For convenience, because this is essentially the same thing
+                # as an LF_CLASS forward ref.
+                self._set("is_forward_ref", True)
+                self._set("modifies", normalize_type_id(match.group("type")))
+
+        if self.mode == "LF_ARRAY":
+            if (match := self.ARRAY_ELEMENT_RE.match(line)) is not None:
+                self._set("array_type", normalize_type_id(match.group("type")))
+
+            if (match := self.ARRAY_LENGTH_RE.match(line)) is not None:
+                self._set("size", int(match.group("length")))
+
+        if self.mode == "LF_FIELDLIST":
+            # If this class has a vtable, create a mock member at offset 0
+            if (match := self.VTABLE_RE.match(line)) is not None:
+                # For our purposes, any pointer type will do
+                self._add_member(0, "T_32PVOID")
+                self._set_member_name("vftable")
+
+            # Superclass is set here in the fieldlist rather than in LF_CLASS
+            if (match := self.SUPERCLASS_RE.match(line)) is not None:
+                self._set("super", normalize_type_id(match.group("type")))
+
+            # Member offset and type given on the first of two lines.
+            if (match := self.LIST_RE.match(line)) is not None:
+                self._add_member(
+                    int(match.group("offset")), normalize_type_id(match.group("type"))
+                )
+
+            # Name of the member read on the second of two lines.
+            if (match := self.MEMBER_RE.match(line)) is not None:
+                self._set_member_name(match.group("name"))
+
+        if self.mode in ("LF_STRUCTURE", "LF_CLASS"):
+            # Match the reference to the associated LF_FIELDLIST
+            if (match := self.CLASS_FIELD_RE.match(line)) is not None:
+                if match.group("field_type") == "0x0000":
+                    # Not redundant. UDT might not match the key.
+                    # These cases get reported as UDT mismatch.
+                    self._set("is_forward_ref", True)
+                else:
+                    field_list_type = normalize_type_id(match.group("field_type"))
+                    self._set("field_list_type", field_list_type)
+
+            # Last line has the vital information.
+            # If this is a FORWARD REF, we need to follow the UDT pointer
+            # to get the actual class details.
+            if (match := self.CLASS_NAME_RE.match(line)) is not None:
+                self._set("name", match.group("name"))
+                self._set("udt", normalize_type_id(match.group("udt")))
+                self._set("size", int(match.group("size")))