Ghidra Import: Support virtual inheritance (#1071)

* Implement/fix Ghidra imports for multiple and virtual inheritance

Unfortunately, the handling in Ghidra is still far from perfect. This is a good place to start, though.

* Support offsets in vbase pointers

* Support `this adjust`

* minor stylistic improvement

* Improvements to documentation

---------

Co-authored-by: jonschz <jonschz@users.noreply.github.com>
This commit is contained in:
jonschz
2024-07-30 19:02:15 +02:00
committed by GitHub
parent 1f251ff817
commit 412200ecbc
6 changed files with 549 additions and 72 deletions

View File

@@ -10,6 +10,12 @@ from ghidra.program.model.listing import Function, Parameter
from ghidra.program.flatapi import FlatProgramAPI from ghidra.program.flatapi import FlatProgramAPI
from ghidra.program.model.listing import ParameterImpl from ghidra.program.model.listing import ParameterImpl
from ghidra.program.model.symbol import SourceType from ghidra.program.model.symbol import SourceType
from ghidra.program.model.data import (
TypeDef,
TypedefDataType,
Pointer,
ComponentOffsetSettingsDefinition,
)
from lego_util.pdb_extraction import ( from lego_util.pdb_extraction import (
PdbFunction, PdbFunction,
@@ -17,12 +23,13 @@ from lego_util.pdb_extraction import (
CppStackSymbol, CppStackSymbol,
) )
from lego_util.ghidra_helper import ( from lego_util.ghidra_helper import (
add_pointer_type, add_data_type_or_reuse_existing,
get_or_add_pointer_type,
get_ghidra_namespace, get_ghidra_namespace,
sanitize_name, sanitize_name,
) )
from lego_util.exceptions import StackOffsetMismatchError from lego_util.exceptions import StackOffsetMismatchError, Lego1Exception
from lego_util.type_importer import PdbTypeImporter from lego_util.type_importer import PdbTypeImporter
@@ -91,7 +98,10 @@ class PdbFunctionImporter:
if ( if (
(not return_type_match) (not return_type_match)
and (self.return_type.getLength() > 4) and (self.return_type.getLength() > 4)
and (add_pointer_type(self.api, self.return_type) == ghidra_return_type) and (
get_or_add_pointer_type(self.api, self.return_type)
== ghidra_return_type
)
and any( and any(
param param
for param in ghidra_function.getParameters() for param in ghidra_function.getParameters()
@@ -103,19 +113,22 @@ class PdbFunctionImporter:
) )
return_type_match = True return_type_match = True
# match arguments: decide if thiscall or not # match arguments: decide if thiscall or not, and whether the `this` type matches
thiscall_matches = ( thiscall_matches = (
self.signature.call_type == ghidra_function.getCallingConventionName() self.signature.call_type == ghidra_function.getCallingConventionName()
) )
ghidra_params_without_this = list(ghidra_function.getParameters())
if thiscall_matches and self.signature.call_type == "__thiscall":
this_argument = ghidra_params_without_this.pop(0)
thiscall_matches = self._this_type_match(this_argument)
if self.is_stub: if self.is_stub:
# We do not import the argument list for stubs, so it should be excluded in matches # We do not import the argument list for stubs, so it should be excluded in matches
args_match = True args_match = True
elif thiscall_matches: elif thiscall_matches:
if self.signature.call_type == "__thiscall": args_match = self._parameter_lists_match(ghidra_params_without_this)
args_match = self._matches_thiscall_parameters(ghidra_function)
else:
args_match = self._matches_non_thiscall_parameters(ghidra_function)
else: else:
args_match = False args_match = False
@@ -136,16 +149,22 @@ class PdbFunctionImporter:
and args_match and args_match
) )
def _matches_non_thiscall_parameters(self, ghidra_function: Function) -> bool: def _this_type_match(self, this_parameter: Parameter) -> bool:
return self._parameter_lists_match(ghidra_function.getParameters()) if this_parameter.getName() != "this":
logger.info("Expected first argument to be `this` in __thiscall")
return False
def _matches_thiscall_parameters(self, ghidra_function: Function) -> bool: if self.signature.this_adjust != 0:
ghidra_params = list(ghidra_function.getParameters()) # In this case, the `this` argument should be custom defined
if not isinstance(this_parameter.getDataType(), TypeDef):
logger.info(
"`this` argument is not a typedef while `this adjust` = %d",
self.signature.this_adjust,
)
return False
# We are not checking for the _correct_ `this` type here, which we could do in the future
# remove the `this` argument which we don't generate ourselves return True
ghidra_params.pop(0)
return self._parameter_lists_match(ghidra_params)
def _parameter_lists_match(self, ghidra_params: "list[Parameter]") -> bool: def _parameter_lists_match(self, ghidra_params: "list[Parameter]") -> bool:
# Remove return storage pointer from comparison if present. # Remove return storage pointer from comparison if present.
@@ -194,6 +213,25 @@ class PdbFunctionImporter:
def overwrite_ghidra_function(self, ghidra_function: Function): def overwrite_ghidra_function(self, ghidra_function: Function):
"""Replace the function declaration in Ghidra by the one derived from C++.""" """Replace the function declaration in Ghidra by the one derived from C++."""
if ghidra_function.hasCustomVariableStorage():
# Unfortunately, calling `ghidra_function.setCustomVariableStorage(False)`
# leads to two `this` parameters. Therefore, we first need to remove all `this` parameters
# and then re-generate a new one
ghidra_function.replaceParameters(
Function.FunctionUpdateType.DYNAMIC_STORAGE_ALL_PARAMS, # this implicitly sets custom variable storage to False
True,
SourceType.USER_DEFINED,
[
param
for param in ghidra_function.getParameters()
if param.getName() != "this"
],
)
if ghidra_function.hasCustomVariableStorage():
raise Lego1Exception("Failed to disable custom variable storage.")
ghidra_function.setName(self.name, SourceType.USER_DEFINED) ghidra_function.setName(self.name, SourceType.USER_DEFINED)
ghidra_function.setParentNamespace(self.namespace) ghidra_function.setParentNamespace(self.namespace)
ghidra_function.setReturnType(self.return_type, SourceType.USER_DEFINED) ghidra_function.setReturnType(self.return_type, SourceType.USER_DEFINED)
@@ -203,16 +241,18 @@ class PdbFunctionImporter:
logger.debug( logger.debug(
"%s is a stub, skipping parameter import", self.get_full_name() "%s is a stub, skipping parameter import", self.get_full_name()
) )
return else:
ghidra_function.replaceParameters(
Function.FunctionUpdateType.DYNAMIC_STORAGE_ALL_PARAMS,
True, # force
SourceType.USER_DEFINED,
self.arguments,
)
self._import_parameter_names(ghidra_function)
ghidra_function.replaceParameters( # Special handling for `this adjust` and virtual inheritance
Function.FunctionUpdateType.DYNAMIC_STORAGE_ALL_PARAMS, if self.signature.this_adjust != 0:
True, # force self._set_this_adjust(ghidra_function)
SourceType.USER_DEFINED,
self.arguments,
)
self._import_parameter_names(ghidra_function)
def _import_parameter_names(self, ghidra_function: Function): def _import_parameter_names(self, ghidra_function: Function):
# When we call `ghidra_function.replaceParameters`, Ghidra will generate the layout. # When we call `ghidra_function.replaceParameters`, Ghidra will generate the layout.
@@ -284,3 +324,50 @@ class PdbFunctionImporter:
), ),
None, None,
) )
def _set_this_adjust(
    self,
    ghidra_function: Function,
):
    """
    Replace the type of the auto-generated `this` parameter by an offset pointer typedef.

    Only meant for functions whose `this adjust` is non-zero. The offset can only be set on a
    typedef of the pointer, so a typedef named `<class>PtrOffset0x<this adjust in hex>` is
    created (or an existing one is reused) and assigned to `this`. Custom variable storage
    is enabled first so that the auto-generated `this` parameter can be modified.

    If no register parameter named `this` is found, an error is logged and nothing else happens.

    @param ghidra_function The Ghidra function whose `this` parameter is retyped.
    """
    # Necessary in order to overwrite the auto-generated `this`
    ghidra_function.setCustomVariableStorage(True)

    # Pick the first register parameter called `this`, if any
    this_parameter = None
    for candidate in ghidra_function.getParameters():
        if candidate.isRegisterVariable() and candidate.getName() == "this":
            this_parameter = candidate
            break

    if this_parameter is None:
        logger.error(
            "Failed to find `this` parameter in a function with `this adjust = %d`",
            self.signature.this_adjust,
        )
        return

    current_ghidra_type = this_parameter.getDataType()
    assert isinstance(current_ghidra_type, Pointer)

    # The typedef is named after the pointee and the (hex) adjustment
    class_name = current_ghidra_type.getDataType().getName()
    typedef_name = f"{class_name}PtrOffset0x{self.signature.this_adjust:x}"

    offset_typedef = TypedefDataType(
        current_ghidra_type.getCategoryPath(),
        typedef_name,
        current_ghidra_type,
    )
    # Store the adjustment as the component offset in the typedef's default settings
    ComponentOffsetSettingsDefinition.DEF.setValue(
        offset_typedef.getDefaultSettings(), self.signature.this_adjust
    )
    # From here on, work with whatever `add_data_type_or_reuse_existing` hands back
    offset_typedef = add_data_type_or_reuse_existing(self.api, offset_typedef)

    this_parameter.setDataType(offset_typedef, SourceType.USER_DEFINED)

View File

@@ -11,10 +11,8 @@ from lego_util.exceptions import (
# Disable spurious warnings in vscode / pylance # Disable spurious warnings in vscode / pylance
# pyright: reportMissingModuleSource=false # pyright: reportMissingModuleSource=false
from ghidra.program.model.data import PointerDataType
from ghidra.program.model.data import DataTypeConflictHandler
from ghidra.program.flatapi import FlatProgramAPI from ghidra.program.flatapi import FlatProgramAPI
from ghidra.program.model.data import DataType from ghidra.program.model.data import DataType, DataTypeConflictHandler, PointerDataType
from ghidra.program.model.symbol import Namespace from ghidra.program.model.symbol import Namespace
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -37,9 +35,15 @@ def get_ghidra_type(api: FlatProgramAPI, type_name: str):
raise MultipleTypesFoundInGhidraError(type_name, result) raise MultipleTypesFoundInGhidraError(type_name, result)
def add_pointer_type(api: FlatProgramAPI, pointee: DataType) -> DataType: def get_or_add_pointer_type(api: FlatProgramAPI, pointee: DataType) -> DataType:
new_data_type = PointerDataType(pointee) new_pointer_data_type = PointerDataType(pointee)
new_data_type.setCategoryPath(pointee.getCategoryPath()) new_pointer_data_type.setCategoryPath(pointee.getCategoryPath())
return add_data_type_or_reuse_existing(api, new_pointer_data_type)
def add_data_type_or_reuse_existing(
api: FlatProgramAPI, new_data_type: DataType
) -> DataType:
result_data_type = ( result_data_type = (
api.getCurrentProgram() api.getCurrentProgram()
.getDataTypeManager() .getDataTypeManager()
@@ -47,7 +51,7 @@ def add_pointer_type(api: FlatProgramAPI, pointee: DataType) -> DataType:
) )
if result_data_type is not new_data_type: if result_data_type is not new_data_type:
logger.debug( logger.debug(
"New pointer replaced by existing one. Fresh pointer: %s (class: %s)", "Reusing existing data type instead of new one: %s (class: %s)",
result_data_type, result_data_type,
result_data_type.__class__, result_data_type.__class__,
) )

View File

@@ -36,6 +36,8 @@ class FunctionSignature:
return_type: str return_type: str
class_type: Optional[str] class_type: Optional[str]
stack_symbols: list[CppStackOrRegisterSymbol] stack_symbols: list[CppStackOrRegisterSymbol]
# if non-zero: an offset to the `this` parameter in a __thiscall
this_adjust: int
@dataclass @dataclass
@@ -119,6 +121,9 @@ class PdbFunctionExtractor:
call_type = self._call_type_map[function_type["call_type"]] call_type = self._call_type_map[function_type["call_type"]]
# parse as hex number, default to 0
this_adjust = int(function_type.get("this_adjust", "0"), 16)
return FunctionSignature( return FunctionSignature(
original_function_symbol=fn, original_function_symbol=fn,
call_type=call_type, call_type=call_type,
@@ -126,6 +131,7 @@ class PdbFunctionExtractor:
return_type=function_type["return_type"], return_type=function_type["return_type"],
class_type=class_type, class_type=class_type,
stack_symbols=stack_symbols, stack_symbols=stack_symbols,
this_adjust=this_adjust,
) )
def get_function_list(self) -> list[PdbFunction]: def get_function_list(self) -> list[PdbFunction]:

View File

@@ -1,5 +1,5 @@
import logging import logging
from typing import Any, Callable, TypeVar from typing import Any, Callable, Iterator, Optional, TypeVar
# Disable spurious warnings in vscode / pylance # Disable spurious warnings in vscode / pylance
# pyright: reportMissingModuleSource=false # pyright: reportMissingModuleSource=false
@@ -7,6 +7,7 @@ from typing import Any, Callable, TypeVar
# pylint: disable=too-many-return-statements # a `match` would be better, but for now we are stuck with Python 3.9 # pylint: disable=too-many-return-statements # a `match` would be better, but for now we are stuck with Python 3.9
# pylint: disable=no-else-return # Not sure why this rule even is a thing, this is great for checking exhaustiveness # pylint: disable=no-else-return # Not sure why this rule even is a thing, this is great for checking exhaustiveness
from isledecomp.cvdump.types import VirtualBasePointer
from lego_util.exceptions import ( from lego_util.exceptions import (
ClassOrNamespaceNotFoundInGhidraError, ClassOrNamespaceNotFoundInGhidraError,
TypeNotFoundError, TypeNotFoundError,
@@ -15,7 +16,8 @@ from lego_util.exceptions import (
StructModificationError, StructModificationError,
) )
from lego_util.ghidra_helper import ( from lego_util.ghidra_helper import (
add_pointer_type, add_data_type_or_reuse_existing,
get_or_add_pointer_type,
create_ghidra_namespace, create_ghidra_namespace,
get_ghidra_namespace, get_ghidra_namespace,
get_ghidra_type, get_ghidra_type,
@@ -33,6 +35,8 @@ from ghidra.program.model.data import (
EnumDataType, EnumDataType,
StructureDataType, StructureDataType,
StructureInternal, StructureInternal,
TypedefDataType,
ComponentOffsetSettingsDefinition,
) )
from ghidra.util.task import ConsoleTaskMonitor from ghidra.util.task import ConsoleTaskMonitor
@@ -56,10 +60,19 @@ class PdbTypeImporter:
def types(self): def types(self):
return self.extraction.compare.cv.types return self.extraction.compare.cv.types
def import_pdb_type_into_ghidra(self, type_index: str) -> DataType: def import_pdb_type_into_ghidra(
self, type_index: str, slim_for_vbase: bool = False
) -> DataType:
""" """
Recursively imports a type from the PDB into Ghidra. Recursively imports a type from the PDB into Ghidra.
@param type_index Either a scalar type like `T_INT4(...)` or a PDB reference like `0x10ba` @param type_index Either a scalar type like `T_INT4(...)` or a PDB reference like `0x10ba`
@param slim_for_vbase If true, the current invocation
imports a superclass of some class where virtual inheritance is involved (directly or indirectly).
This case requires special handling: Let's say we have `class C: B` and `class B: virtual A`. Then cvdump
reports a size for B that includes both B's fields as well as the A contained at an offset within B,
which is not the correct structure to be contained in C. Therefore, we need to create a "slim" version of B
that fits inside C.
This value should always be `False` when the referenced type is not (a pointer to) a class.
""" """
type_index_lower = type_index.lower() type_index_lower = type_index.lower()
if type_index_lower.startswith("t_"): if type_index_lower.startswith("t_"):
@@ -76,14 +89,19 @@ class PdbTypeImporter:
# follow forward reference (class, struct, union) # follow forward reference (class, struct, union)
if type_pdb.get("is_forward_ref", False): if type_pdb.get("is_forward_ref", False):
return self._import_forward_ref_type(type_index_lower, type_pdb) return self._import_forward_ref_type(
type_index_lower, type_pdb, slim_for_vbase
)
if type_category == "LF_POINTER": if type_category == "LF_POINTER":
return add_pointer_type( return get_or_add_pointer_type(
self.api, self.import_pdb_type_into_ghidra(type_pdb["element_type"]) self.api,
self.import_pdb_type_into_ghidra(
type_pdb["element_type"], slim_for_vbase
),
) )
elif type_category in ["LF_CLASS", "LF_STRUCTURE"]: elif type_category in ["LF_CLASS", "LF_STRUCTURE"]:
return self._import_class_or_struct(type_pdb) return self._import_class_or_struct(type_pdb, slim_for_vbase)
elif type_category == "LF_ARRAY": elif type_category == "LF_ARRAY":
return self._import_array(type_pdb) return self._import_array(type_pdb)
elif type_category == "LF_ENUM": elif type_category == "LF_ENUM":
@@ -120,7 +138,10 @@ class PdbTypeImporter:
return get_ghidra_type(self.api, scalar_cpp_type) return get_ghidra_type(self.api, scalar_cpp_type)
def _import_forward_ref_type( def _import_forward_ref_type(
self, type_index, type_pdb: dict[str, Any] self,
type_index,
type_pdb: dict[str, Any],
slim_for_vbase: bool = False,
) -> DataType: ) -> DataType:
referenced_type = type_pdb.get("udt") or type_pdb.get("modifies") referenced_type = type_pdb.get("udt") or type_pdb.get("modifies")
if referenced_type is None: if referenced_type is None:
@@ -136,7 +157,7 @@ class PdbTypeImporter:
type_index, type_index,
referenced_type, referenced_type,
) )
return self.import_pdb_type_into_ghidra(referenced_type) return self.import_pdb_type_into_ghidra(referenced_type, slim_for_vbase)
def _import_array(self, type_pdb: dict[str, Any]) -> DataType: def _import_array(self, type_pdb: dict[str, Any]) -> DataType:
inner_type = self.import_pdb_type_into_ghidra(type_pdb["array_type"]) inner_type = self.import_pdb_type_into_ghidra(type_pdb["array_type"])
@@ -182,12 +203,18 @@ class PdbTypeImporter:
return result return result
def _import_class_or_struct(self, type_in_pdb: dict[str, Any]) -> DataType: def _import_class_or_struct(
self,
type_in_pdb: dict[str, Any],
slim_for_vbase: bool = False,
) -> DataType:
field_list_type: str = type_in_pdb["field_list_type"] field_list_type: str = type_in_pdb["field_list_type"]
field_list = self.types.keys[field_list_type.lower()] field_list = self.types.keys[field_list_type.lower()]
class_size: int = type_in_pdb["size"] class_size: int = type_in_pdb["size"]
class_name_with_namespace: str = sanitize_name(type_in_pdb["name"]) class_name_with_namespace: str = sanitize_name(type_in_pdb["name"])
if slim_for_vbase:
class_name_with_namespace += "_vbase_slim"
if class_name_with_namespace in self.handled_structs: if class_name_with_namespace in self.handled_structs:
logger.debug( logger.debug(
@@ -205,11 +232,11 @@ class PdbTypeImporter:
self._get_or_create_namespace(class_name_with_namespace) self._get_or_create_namespace(class_name_with_namespace)
data_type = self._get_or_create_struct_data_type( new_ghidra_struct = self._get_or_create_struct_data_type(
class_name_with_namespace, class_size class_name_with_namespace, class_size
) )
if (old_size := data_type.getLength()) != class_size: if (old_size := new_ghidra_struct.getLength()) != class_size:
logger.warning( logger.warning(
"Existing class %s had incorrect size %d. Setting to %d...", "Existing class %s had incorrect size %d. Setting to %d...",
class_name_with_namespace, class_name_with_namespace,
@@ -220,39 +247,189 @@ class PdbTypeImporter:
logger.info("Adding class data type %s", class_name_with_namespace) logger.info("Adding class data type %s", class_name_with_namespace)
logger.debug("Class information: %s", type_in_pdb) logger.debug("Class information: %s", type_in_pdb)
data_type.deleteAll() components: list[dict[str, Any]] = []
data_type.growStructure(class_size) components.extend(self._get_components_from_base_classes(field_list))
# can be missing when no new fields are declared
components.extend(self._get_components_from_members(field_list))
components.extend(
self._get_components_from_vbase(
field_list, class_name_with_namespace, new_ghidra_struct
)
)
components.sort(key=lambda c: c["offset"])
if slim_for_vbase:
# Make a "slim" version: shrink the size to the fields that are actually present.
# This makes a difference when the current class uses virtual inheritance
assert (
len(components) > 0
), f"Error: {class_name_with_namespace} should not be empty. There must be at least one direct or indirect vbase pointer."
last_component = components[-1]
class_size = last_component["offset"] + last_component["type"].getLength()
self._overwrite_struct(
class_name_with_namespace,
new_ghidra_struct,
class_size,
components,
)
logger.info("Finished importing class %s", class_name_with_namespace)
return new_ghidra_struct
def _get_components_from_base_classes(self, field_list) -> Iterator[dict[str, Any]]:
    """
    Yield one struct component per non-virtual base class listed in `field_list`.

    Each yielded dict has the keys `type` (the imported Ghidra type of the base class),
    `offset` (as stored under `field_list["super"]`) and `name` (`base` for a base at
    offset 0, otherwise `base_<type name>`).

    @param field_list The parsed field list of the class being imported.
    """
    base_offsets_by_type: dict[str, int] = field_list.get("super", {})

    # If we have virtual inheritance _and_ a non-virtual base class here, we play safe and
    # import the slim version of every superclass. This is technically not needed if only
    # one of the superclasses uses virtual inheritance, but no such instance is known.
    # The flag does not depend on the individual superclass, so it is computed once.
    import_slim_version = "vbase" in field_list

    for super_type, offset in base_offsets_by_type.items():
        ghidra_type = self.import_pdb_type_into_ghidra(
            super_type, slim_for_vbase=import_slim_version
        )

        if offset == 0:
            component_name = "base"
        else:
            component_name = f"base_{ghidra_type.getName()}"

        yield {
            "type": ghidra_type,
            "offset": offset,
            "name": component_name,
        }
def _get_components_from_members(self, field_list: dict[str, Any]):
    """
    Yield a copy of every entry of `field_list["members"]` with its `type` replaced
    by the corresponding imported Ghidra data type.

    @param field_list The parsed field list of the class being imported.
    """
    # "members" can be missing (or empty) when no new fields are declared
    for entry in field_list.get("members") or []:
        imported_type = self.import_pdb_type_into_ghidra(entry["type"])
        # Build a new dict so the parsed member entry itself is left untouched
        yield {**entry, "type": imported_type}
def _get_components_from_vbase(
    self,
    field_list: dict[str, Any],
    class_name_with_namespace: str,
    current_type: StructureInternal,
) -> Iterator[dict[str, Any]]:
    """
    Yield the `vbase_offset` component if the class has at least one direct virtual base.

    Nothing is yielded when `field_list` has no `vbase` entry or when all of its
    virtual bases are indirect. Otherwise a single component is produced: a pointer to
    the struct built by `_import_vbaseptr`, placed at `vbasepointer.vboffset`.

    @param field_list The parsed field list of the class being imported.
    @param class_name_with_namespace Name of the class being imported.
    @param current_type The Ghidra struct of the class being imported.
    """
    vbasepointer: Optional[VirtualBasePointer] = field_list.get("vbase", None)
    if vbasepointer is None:
        return

    has_direct_vbase = any(base.direct for base in vbasepointer.bases)
    if not has_direct_vbase:
        return

    vbase_table_struct = self._import_vbaseptr(
        current_type, class_name_with_namespace, vbasepointer
    )
    yield {
        "type": get_or_add_pointer_type(self.api, vbase_table_struct),
        "offset": vbasepointer.vboffset,
        "name": "vbase_offset",
    }
def _import_vbaseptr(
    self,
    current_type: StructureInternal,
    class_name_with_namespace: str,
    vbasepointer: VirtualBasePointer,
) -> StructureInternal:
    """
    Build the `<class>::VBasePtr` struct and return it.

    The struct starts with `o_self`, a pointer to `current_type`, at offset 0. It is followed
    by one entry `o_<type name>` per virtual base, located at `vbase.index * 4`. Each of those
    entries is a typedef of a pointer to the virtual base that carries a component offset setting.

    @param current_type The Ghidra struct of the class that owns the virtual base pointer.
    @param class_name_with_namespace Name of that class; used to name the new struct.
    @param vbasepointer The parsed virtual base information of that class.
    """
    pointer_size = 4  # hard-code to 4 because of 32 bit
    struct_name = f"{class_name_with_namespace}::VBasePtr"

    self_pointer = get_or_add_pointer_type(self.api, current_type)
    components = [{"offset": 0, "type": self_pointer, "name": "o_self"}]

    for vbase in vbasepointer.bases:
        base_struct = self.import_pdb_type_into_ghidra(vbase.type)
        type_name = base_struct.getName()

        base_pointer = get_or_add_pointer_type(self.api, base_struct)
        offset_typedef = TypedefDataType(
            base_pointer.getCategoryPath(),
            f"{type_name}PtrOffset",
            base_pointer,
        )
        # Set a default value of -4 for the pointer offset. While this appears to be correct
        # in many cases, it does not always lead to the best decompile. It can be fine-tuned
        # by hand; the call to `add_data_type_or_reuse_existing` below makes sure that we
        # don't overwrite this value on re-running the import.
        ComponentOffsetSettingsDefinition.DEF.setValue(
            offset_typedef.getDefaultSettings(), -4
        )
        offset_typedef = add_data_type_or_reuse_existing(self.api, offset_typedef)

        components.append(
            {
                "offset": vbase.index * pointer_size,
                "type": offset_typedef,
                "name": f"o_{type_name}",
            }
        )

    # One pointer-sized slot per component, including `o_self`
    size = len(components) * pointer_size
    vbaseptr_struct = self._get_or_create_struct_data_type(struct_name, size)
    self._overwrite_struct(struct_name, vbaseptr_struct, size, components)

    return vbaseptr_struct
def _overwrite_struct(
self,
class_name_with_namespace: str,
new_ghidra_struct: StructureInternal,
class_size: int,
components: list[dict[str, Any]],
):
new_ghidra_struct.deleteAll()
new_ghidra_struct.growStructure(class_size)
# this case happened e.g. for IUnknown, which linked to an (incorrect) existing library, and some other types as well. # this case happened e.g. for IUnknown, which linked to an (incorrect) existing library, and some other types as well.
# Unfortunately, we don't get proper error handling for read-only types. # Unfortunately, we don't get proper error handling for read-only types.
# However, we really do NOT want to do this every time because the type might be self-referential and partially imported. # However, we really do NOT want to do this every time because the type might be self-referential and partially imported.
if data_type.getLength() != class_size: if new_ghidra_struct.getLength() != class_size:
data_type = self._delete_and_recreate_struct_data_type( new_ghidra_struct = self._delete_and_recreate_struct_data_type(
class_name_with_namespace, class_size, data_type class_name_with_namespace, class_size, new_ghidra_struct
) )
# can be missing when no new fields are declared
components: list[dict[str, Any]] = field_list.get("members") or []
super_type = field_list.get("super")
if super_type is not None:
components.insert(0, {"type": super_type, "offset": 0, "name": "base"})
for component in components: for component in components:
ghidra_type = self.import_pdb_type_into_ghidra(component["type"]) offset: int = component["offset"]
logger.debug("Adding component to class: %s", component) logger.debug(
"Adding component %s to class: %s", component, class_name_with_namespace
)
try: try:
# for better logs # Make sure there is room for the new structure and that we have no collision.
data_type.replaceAtOffset( existing_type = new_ghidra_struct.getComponentAt(offset)
component["offset"], ghidra_type, -1, component["name"], None assert (
existing_type is not None
), f"Struct collision: Offset {offset} in {class_name_with_namespace} is overlapped by another component"
if existing_type.getDataType().getName() != "undefined":
# collision of structs beginning in the same place -> likely due to unions
logger.warning(
"Struct collision: Offset %d of %s already has a field (likely an inline union)",
offset,
class_name_with_namespace,
)
new_ghidra_struct.replaceAtOffset(
offset,
component["type"],
-1, # set to -1 for fixed-size components
component["name"], # name
None, # comment
) )
except Exception as e: except Exception as e:
raise StructModificationError(type_in_pdb) from e raise StructModificationError(class_name_with_namespace) from e
logger.info("Finished importing class %s", class_name_with_namespace)
return data_type
def _get_or_create_namespace(self, class_name_with_namespace: str): def _get_or_create_namespace(self, class_name_with_namespace: str):
colon_split = class_name_with_namespace.split("::") colon_split = class_name_with_namespace.split("::")

View File

@@ -1,3 +1,4 @@
from dataclasses import dataclass
import re import re
import logging import logging
from typing import Any, Dict, List, NamedTuple, Optional from typing import Any, Dict, List, NamedTuple, Optional
@@ -26,6 +27,19 @@ class FieldListItem(NamedTuple):
type: str type: str
@dataclass
class VirtualBaseClass:
    """One `LF_VBCLASS` / `LF_IVBCLASS` entry parsed from a cvdump field list."""

    # Type id of the virtual base, as captured from the `base type = ...` part of the first line
    type: str
    # Value of `vbind` from the second line; the parser stores -1 until that line has been read
    index: int
    # True for `LF_VBCLASS` (direct), False for `LF_IVBCLASS` (indirect)
    direct: bool
@dataclass
class VirtualBasePointer:
    """Virtual base information collected for a single field list."""

    # Value of `vbpoff`; the parser stores -1 until the first `virtual base ptr` line has been read
    vboffset: int
    # All direct and indirect virtual bases; the parser keeps this list sorted by `index`
    bases: list[VirtualBaseClass]
class ScalarType(NamedTuple): class ScalarType(NamedTuple):
offset: int offset: int
name: Optional[str] name: Optional[str]
@@ -157,6 +171,16 @@ class CvdumpTypesParser:
r"^\s+list\[\d+\] = LF_BCLASS, (?P<scope>\w+), type = (?P<type>.*), offset = (?P<offset>\d+)" r"^\s+list\[\d+\] = LF_BCLASS, (?P<scope>\w+), type = (?P<type>.*), offset = (?P<offset>\d+)"
) )
# LF_FIELDLIST virtual direct/indirect base pointer, line 1/2
VBCLASS_RE = re.compile(
r"^\s+list\[\d+\] = LF_(?P<indirect>I?)VBCLASS, .* base type = (?P<type>.*)$"
)
# LF_FIELDLIST virtual direct/indirect base pointer, line 2/2
VBCLASS_LINE_2_RE = re.compile(
r"^\s+virtual base ptr = .+, vbpoff = (?P<vboffset>\d+), vbind = (?P<vbindex>\d+)$"
)
# LF_FIELDLIST member name (2/2) # LF_FIELDLIST member name (2/2)
MEMBER_RE = re.compile(r"^\s+member name = '(?P<name>.*)'$") MEMBER_RE = re.compile(r"^\s+member name = '(?P<name>.*)'$")
@@ -206,7 +230,7 @@ class CvdumpTypesParser:
re.compile(r"\s*Arg list type = (?P<arg_list_type>[\w()]+)$"), re.compile(r"\s*Arg list type = (?P<arg_list_type>[\w()]+)$"),
re.compile( re.compile(
r"\s*This adjust = (?P<this_adjust>[\w()]+)$" r"\s*This adjust = (?P<this_adjust>[\w()]+)$"
), # TODO: figure out the meaning ), # By how much the incoming pointers are shifted in virtual inheritance; hex value without `0x` prefix
re.compile( re.compile(
r"\s*Func attr = (?P<func_attr>[\w()]+)$" r"\s*Func attr = (?P<func_attr>[\w()]+)$"
), # Only for completeness, is always `none` ), # Only for completeness, is always `none`
@@ -282,12 +306,12 @@ class CvdumpTypesParser:
members: List[FieldListItem] = [] members: List[FieldListItem] = []
super_id = field_obj.get("super") super_ids = field_obj.get("super", [])
if super_id is not None: for super_id in super_ids:
# May need to resolve forward ref. # May need to resolve forward ref.
superclass = self.get(super_id) superclass = self.get(super_id)
if superclass.members is not None: if superclass.members is not None:
members = superclass.members members += superclass.members
raw_members = field_obj.get("members", []) raw_members = field_obj.get("members", [])
members += [ members += [
@@ -526,7 +550,57 @@ class CvdumpTypesParser:
# Superclass is set here in the fieldlist rather than in LF_CLASS # Superclass is set here in the fieldlist rather than in LF_CLASS
elif (match := self.SUPERCLASS_RE.match(line)) is not None: elif (match := self.SUPERCLASS_RE.match(line)) is not None:
self._set("super", normalize_type_id(match.group("type"))) superclass_list: dict[str, int] = self.keys[self.last_key].setdefault(
"super", {}
)
superclass_list[normalize_type_id(match.group("type"))] = int(
match.group("offset")
)
# virtual base class (direct or indirect)
elif (match := self.VBCLASS_RE.match(line)) is not None:
virtual_base_pointer = self.keys[self.last_key].setdefault(
"vbase",
VirtualBasePointer(
vboffset=-1, # default to -1 until we parse the correct value
bases=[],
),
)
assert isinstance(
virtual_base_pointer, VirtualBasePointer
) # type checker only
virtual_base_pointer.bases.append(
VirtualBaseClass(
type=match.group("type"),
index=-1, # default to -1 until we parse the correct value
direct=match.group("indirect") != "I",
)
)
elif (match := self.VBCLASS_LINE_2_RE.match(line)) is not None:
virtual_base_pointer = self.keys[self.last_key].get("vbase", None)
assert isinstance(
virtual_base_pointer, VirtualBasePointer
), "Parsed the second line of an (I)VBCLASS without the first one"
vboffset = int(match.group("vboffset"))
if virtual_base_pointer.vboffset == -1:
# default value
virtual_base_pointer.vboffset = vboffset
elif virtual_base_pointer.vboffset != vboffset:
# vboffset is always equal to 4 in our examples. We are not sure if there can be multiple
# virtual base pointers, and if so, how the layout is supposed to look.
# We therefore assume that there is always only one virtual base pointer.
logger.error(
"Unhandled: Found multiple virtual base pointers at offsets %d and %d",
virtual_base_pointer.vboffset,
vboffset,
)
virtual_base_pointer.bases[-1].index = int(match.group("vbindex"))
# these come out of order, and the lists are so short that it's fine to sort them every time
virtual_base_pointer.bases.sort(key=lambda x: x.index)
# Member offset and type given on the first of two lines. # Member offset and type given on the first of two lines.
elif (match := self.LIST_RE.match(line)) is not None: elif (match := self.LIST_RE.match(line)) is not None:
@@ -579,7 +653,7 @@ class CvdumpTypesParser:
else: else:
logger.error("Unmatched line in arglist: %s", line[:-1]) logger.error("Unmatched line in arglist: %s", line[:-1])
def read_pointer_line(self, line): def read_pointer_line(self, line: str):
if (match := self.LF_POINTER_ELEMENT.match(line)) is not None: if (match := self.LF_POINTER_ELEMENT.match(line)) is not None:
self._set("element_type", match.group("element_type")) self._set("element_type", match.group("element_type"))
else: else:

View File

@@ -6,6 +6,9 @@ from isledecomp.cvdump.types import (
CvdumpTypesParser, CvdumpTypesParser,
CvdumpKeyError, CvdumpKeyError,
CvdumpIntegrityError, CvdumpIntegrityError,
FieldListItem,
VirtualBaseClass,
VirtualBasePointer,
) )
TEST_LINES = """ TEST_LINES = """
@@ -245,10 +248,111 @@ NESTED, enum name = JukeBox::JukeBoxScript, UDT(0x00003cc2)
list[12] = LF_MEMBER, private, type = T_USHORT(0021), offset = 12 list[12] = LF_MEMBER, private, type = T_USHORT(0021), offset = 12
member name = 'm_length' member name = 'm_length'
0x4dee : Length = 406, Leaf = 0x1203 LF_FIELDLIST
list[0] = LF_VBCLASS, public, direct base type = 0x15EA
virtual base ptr = 0x43E9, vbpoff = 4, vbind = 3
list[1] = LF_IVBCLASS, public, indirect base type = 0x1183
virtual base ptr = 0x43E9, vbpoff = 4, vbind = 1
list[2] = LF_IVBCLASS, public, indirect base type = 0x1468
virtual base ptr = 0x43E9, vbpoff = 4, vbind = 2
list[3] = LF_VFUNCTAB, type = 0x2B95
list[4] = LF_ONEMETHOD, public, VANILLA, index = 0x15C2, name = 'LegoRaceMap'
list[5] = LF_ONEMETHOD, public, VIRTUAL, index = 0x15C3, name = '~LegoRaceMap'
list[6] = LF_ONEMETHOD, public, VIRTUAL, index = 0x15C5, name = 'Notify'
list[7] = LF_ONEMETHOD, public, VIRTUAL, index = 0x15C4, name = 'ParseAction'
list[8] = LF_ONEMETHOD, public, VIRTUAL, index = 0x4DED, name = 'VTable0x70'
list[9] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x15C2,
vfptr offset = 0, name = 'FUN_1005d4b0'
list[10] = LF_MEMBER, private, type = T_UCHAR(0020), offset = 8
member name = 'm_parentClass2Field1'
list[11] = LF_MEMBER, private, type = T_32PVOID(0403), offset = 12
member name = 'm_parentClass2Field2'
0x4def : Length = 34, Leaf = 0x1504 LF_CLASS
# members = 21, field list type 0x4dee, CONSTRUCTOR,
Derivation list type 0x0000, VT shape type 0x12a0
Size = 436, class name = LegoRaceMap, UDT(0x00004def)
0x4db6 : Length = 30, Leaf = 0x1504 LF_CLASS 0x4db6 : Length = 30, Leaf = 0x1504 LF_CLASS
# members = 16, field list type 0x4db5, CONSTRUCTOR, OVERLOAD, # members = 16, field list type 0x4db5, CONSTRUCTOR, OVERLOAD,
Derivation list type 0x0000, VT shape type 0x1266 Derivation list type 0x0000, VT shape type 0x1266
Size = 16, class name = MxString, UDT(0x00004db6) Size = 16, class name = MxString, UDT(0x00004db6)
0x5591 : Length = 570, Leaf = 0x1203 LF_FIELDLIST
list[0] = LF_VBCLASS, public, direct base type = 0x15EA
virtual base ptr = 0x43E9, vbpoff = 4, vbind = 3
list[1] = LF_IVBCLASS, public, indirect base type = 0x1183
virtual base ptr = 0x43E9, vbpoff = 4, vbind = 1
list[2] = LF_IVBCLASS, public, indirect base type = 0x1468
virtual base ptr = 0x43E9, vbpoff = 4, vbind = 2
list[3] = LF_VFUNCTAB, type = 0x4E11
list[4] = LF_ONEMETHOD, public, VANILLA, index = 0x1ABD, name = 'LegoCarRaceActor'
list[5] = LF_ONEMETHOD, public, VIRTUAL, index = 0x1AE0, name = 'ClassName'
list[6] = LF_ONEMETHOD, public, VIRTUAL, index = 0x1AE1, name = 'IsA'
list[7] = LF_ONEMETHOD, public, VIRTUAL, index = 0x1ADD, name = 'VTable0x6c'
list[8] = LF_ONEMETHOD, public, VIRTUAL, index = 0x1ADB, name = 'VTable0x70'
list[9] = LF_ONEMETHOD, public, VIRTUAL, index = 0x1ADA, name = 'SwitchBoundary'
list[10] = LF_ONEMETHOD, public, VIRTUAL, index = 0x1ADC, name = 'VTable0x9c'
list[11] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x558E,
vfptr offset = 0, name = 'FUN_10080590'
list[12] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x1AD8,
vfptr offset = 4, name = 'FUN_10012bb0'
list[13] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x1AD9,
vfptr offset = 8, name = 'FUN_10012bc0'
list[14] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x1AD8,
vfptr offset = 12, name = 'FUN_10012bd0'
list[15] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x1AD9,
vfptr offset = 16, name = 'FUN_10012be0'
list[16] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x1AD8,
vfptr offset = 20, name = 'FUN_10012bf0'
list[17] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x1AD9,
vfptr offset = 24, name = 'FUN_10012c00'
list[18] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x1ABD,
vfptr offset = 28, name = 'VTable0x1c'
list[19] = LF_MEMBER, protected, type = T_REAL32(0040), offset = 8
member name = 'm_parentClass1Field1'
list[25] = LF_ONEMETHOD, public, VIRTUAL, (compgenx), index = 0x15D1, name = '~LegoCarRaceActor'
0x5592 : Length = 38, Leaf = 0x1504 LF_CLASS
# members = 26, field list type 0x5591, CONSTRUCTOR,
Derivation list type 0x0000, VT shape type 0x34c7
Size = 416, class name = LegoCarRaceActor, UDT(0x00005592)
0x5593 : Length = 638, Leaf = 0x1203 LF_FIELDLIST
list[0] = LF_BCLASS, public, type = 0x5592, offset = 0
list[1] = LF_BCLASS, public, type = 0x4DEF, offset = 32
list[2] = LF_IVBCLASS, public, indirect base type = 0x1183
virtual base ptr = 0x43E9, vbpoff = 4, vbind = 1
list[3] = LF_IVBCLASS, public, indirect base type = 0x1468
virtual base ptr = 0x43E9, vbpoff = 4, vbind = 2
list[4] = LF_IVBCLASS, public, indirect base type = 0x15EA
virtual base ptr = 0x43E9, vbpoff = 4, vbind = 3
list[5] = LF_ONEMETHOD, public, VANILLA, index = 0x15CD, name = 'LegoRaceCar'
list[6] = LF_ONEMETHOD, public, VIRTUAL, index = 0x15CE, name = '~LegoRaceCar'
list[7] = LF_ONEMETHOD, public, VIRTUAL, index = 0x15D2, name = 'Notify'
list[8] = LF_ONEMETHOD, public, VIRTUAL, index = 0x15E8, name = 'ClassName'
list[9] = LF_ONEMETHOD, public, VIRTUAL, index = 0x15E9, name = 'IsA'
list[10] = LF_ONEMETHOD, public, VIRTUAL, index = 0x15D5, name = 'ParseAction'
list[11] = LF_ONEMETHOD, public, VIRTUAL, index = 0x15D3, name = 'SetWorldSpeed'
list[12] = LF_ONEMETHOD, public, VIRTUAL, index = 0x15DF, name = 'VTable0x6c'
list[13] = LF_ONEMETHOD, public, VIRTUAL, index = 0x15D3, name = 'VTable0x70'
list[14] = LF_ONEMETHOD, public, VIRTUAL, index = 0x15DC, name = 'VTable0x94'
list[15] = LF_ONEMETHOD, public, VIRTUAL, index = 0x15E5, name = 'SwitchBoundary'
list[16] = LF_ONEMETHOD, public, VIRTUAL, index = 0x15DD, name = 'VTable0x9c'
list[17] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x15D4,
vfptr offset = 32, name = 'SetMaxLinearVelocity'
list[18] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x15D4,
vfptr offset = 36, name = 'FUN_10012ff0'
list[19] = LF_ONEMETHOD, public, INTRODUCING VIRTUAL, index = 0x5588,
vfptr offset = 40, name = 'HandleSkeletonKicks'
list[20] = LF_MEMBER, private, type = T_UCHAR(0020), offset = 84
member name = 'm_childClassField'
0x5594 : Length = 34, Leaf = 0x1504 LF_CLASS
# members = 30, field list type 0x5593, CONSTRUCTOR,
Derivation list type 0x0000, VT shape type 0x2d1e
Size = 512, class name = LegoRaceCar, UDT(0x000055bb)
""" """
@@ -309,6 +413,31 @@ def test_members(parser: CvdumpTypesParser):
(12, "m_length", "T_USHORT"), (12, "m_length", "T_USHORT"),
] ]
# LegoRaceCar with multiple superclasses
assert parser.get("0x5594").members == [
FieldListItem(offset=0, name="vftable", type="T_32PVOID"),
FieldListItem(offset=0, name="vftable", type="T_32PVOID"),
FieldListItem(offset=8, name="m_parentClass1Field1", type="T_REAL32"),
FieldListItem(offset=8, name="m_parentClass2Field1", type="T_UCHAR"),
FieldListItem(offset=12, name="m_parentClass2Field2", type="T_32PVOID"),
FieldListItem(offset=84, name="m_childClassField", type="T_UCHAR"),
]
def test_virtual_base_classes(parser: CvdumpTypesParser):
    """Verify the virtual base pointer parsed for field list 0x5591.

    0x5591 is LegoCarRaceActor's field list in the test dump. The dump lists
    its virtual bases with vbind 3, 1, 2 and vbpoff 4; the parsed entry is
    expected to hold them in ascending index order under a single virtual
    base pointer at offset 4.
    """
    field_list = parser.keys.get("0x5591")
    assert field_list is not None

    # (type, vbind index, direct): only 0x15EA comes from an LF_VBCLASS
    # (direct) record, the other two are LF_IVBCLASS (indirect) records.
    base_specs = (
        ("0x1183", 1, False),
        ("0x1468", 2, False),
        ("0x15EA", 3, True),
    )
    expected = VirtualBasePointer(
        vboffset=4,
        bases=[
            VirtualBaseClass(type=base_type, index=vbindex, direct=is_direct)
            for base_type, vbindex, is_direct in base_specs
        ],
    )
    assert field_list["vbase"] == expected
def test_members_recursive(parser: CvdumpTypesParser): def test_members_recursive(parser: CvdumpTypesParser):
"""Make sure that we unwrap the dependency tree correctly.""" """Make sure that we unwrap the dependency tree correctly."""