mirror of
https://github.com/isledecomp/isle.git
synced 2025-10-24 17:04:17 +00:00
Reccmp comparison engine refactor (#405)
* Reccmp comparison engine refactor * Remove redundant references to 'entry' symbol
This commit is contained in:
@@ -1,3 +1,4 @@
|
||||
import logging
|
||||
import struct
|
||||
from typing import List, Optional
|
||||
from dataclasses import dataclass
|
||||
@@ -51,12 +52,17 @@ class ImageSectionHeader:
|
||||
number_of_line_numbers: int
|
||||
characteristics: int
|
||||
|
||||
@property
def extent(self):
    """Return the highest possible offset of this section.

    This is whichever is larger: the size of the raw data or the virtual size.
    """
    raw_size = self.size_of_raw_data
    virtual_size = self.virtual_size
    return raw_size if raw_size >= virtual_size else virtual_size
|
||||
|
||||
def match_name(self, name: str) -> bool:
    """Tell whether this section's stored name equals `name`.

    `name` is ASCII-encoded and packed with the "8s" struct format before
    it is compared against `self.name`.
    """
    encoded = name.encode("ascii")
    packed = struct.pack("8s", encoded)
    return self.name == packed
|
||||
|
||||
def contains_vaddr(self, vaddr: int) -> bool:
|
||||
ofs = vaddr - self.virtual_address
|
||||
return 0 <= ofs < max(self.size_of_raw_data, self.virtual_size)
|
||||
return 0 <= ofs < self.extent
|
||||
|
||||
def addr_is_uninitialized(self, vaddr: int) -> bool:
|
||||
"""We cannot rely on the IMAGE_SCN_CNT_UNINITIALIZED_DATA flag (0x80) in
|
||||
@@ -71,25 +77,29 @@ class ImageSectionHeader:
|
||||
)
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Bin:
|
||||
"""Parses a PE format EXE and allows reading data from a virtual address.
|
||||
Reference: https://learn.microsoft.com/en-us/windows/win32/debug/pe-format"""
|
||||
|
||||
# pylint: disable=too-many-instance-attributes
|
||||
|
||||
def __init__(self, filename: str, logger=None) -> None:
|
||||
self.logger = logger
|
||||
self._debuglog(f'Parsing headers of "{filename}"... ')
|
||||
def __init__(self, filename: str, find_str: bool = False) -> None:
|
||||
logger.debug('Parsing headers of "%s"... ', filename)
|
||||
self.filename = filename
|
||||
self.file = None
|
||||
self.imagebase = None
|
||||
self.entry = None
|
||||
self.sections: List[ImageSectionHeader] = []
|
||||
self.last_section = None
|
||||
self.find_str = find_str
|
||||
self._potential_strings = {}
|
||||
self._relocated_addrs = set()
|
||||
|
||||
def __enter__(self):
|
||||
self._debuglog(f"Bin {self.filename} Enter")
|
||||
logger.debug("Bin %s Enter", self.filename)
|
||||
self.file = open(self.filename, "rb")
|
||||
|
||||
(mz_str,) = struct.unpack("2s", self.file.read(2))
|
||||
@@ -123,28 +133,71 @@ class Bin:
|
||||
|
||||
self._populate_relocations()
|
||||
|
||||
# This is a (semi) expensive lookup that is not necessary in every case.
|
||||
# We can find strings in the original if we have coverage using STRING markers.
|
||||
# For the recomp, we can find strings using the PDB.
|
||||
if self.find_str:
|
||||
self._prepare_string_search()
|
||||
|
||||
text_section = self._get_section_by_name(".text")
|
||||
self.last_section = text_section
|
||||
|
||||
self._debuglog("... Parsing finished")
|
||||
logger.debug("... Parsing finished")
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_value, exc_traceback):
|
||||
self._debuglog(f"Bin {self.filename} Exit")
|
||||
logger.debug("Bin %s Exit", self.filename)
|
||||
if self.file:
|
||||
self.file.close()
|
||||
|
||||
def _debuglog(self, msg):
|
||||
"""Write to the logger, if present"""
|
||||
if self.logger is not None:
|
||||
self.logger.debug(msg)
|
||||
|
||||
def get_relocated_addresses(self) -> List[int]:
    """Return the recorded relocated addresses as a new list in ascending order."""
    ordered = list(self._relocated_addrs)
    ordered.sort()
    return ordered
|
||||
|
||||
def find_string(self, target: bytes) -> Optional[int]:
    """Search the candidate string addresses for `target`.

    Only addresses stored in `self._potential_strings` under the first byte
    of `target` are examined.

    Args:
        target: The string to look for, as bytes. A `str` is also accepted
            and is ASCII-encoded first; previously a `str` always failed
            with TypeError because the body mixes it with bytes.

    Returns:
        An address whose data matches `target` (including the null
        terminator), or None if no candidate matches.

    Raises:
        UnicodeEncodeError: if a `str` target contains non-ASCII characters.
    """
    if isinstance(target, str):
        target = target.encode("ascii")

    # Pad with null terminator to make sure we don't
    # match on a subset of the full string
    if not target.endswith(b"\x00"):
        target += b"\x00"

    # Indexing bytes yields an int, which is how the candidates are keyed.
    candidates = self._potential_strings.get(target[0])
    if candidates is None:
        return None

    for addr in candidates:
        if target == self.read(addr, len(target)):
            return addr

    return None
|
||||
|
||||
def is_relocated_addr(self, vaddr) -> bool:
    """Report whether `vaddr` is one of the recorded relocated addresses."""
    known_addrs = self._relocated_addrs
    return vaddr in known_addrs
|
||||
|
||||
def _prepare_string_search(self):
|
||||
"""We are intersted in deduplicated string constants found in the
|
||||
.rdata and .data sections. For each relocated address in these sections,
|
||||
read the first byte and save the address if that byte is an ASCII character.
|
||||
When we search for an arbitrary string later, we can narrow down the list
|
||||
of potential locations by a lot."""
|
||||
|
||||
def is_ascii(b):
|
||||
return b" " <= b < b"\x7f"
|
||||
|
||||
sect_data = self._get_section_by_name(".data")
|
||||
sect_rdata = self._get_section_by_name(".rdata")
|
||||
potentials = filter(
|
||||
lambda a: sect_data.contains_vaddr(a) or sect_rdata.contains_vaddr(a),
|
||||
self.get_relocated_addresses(),
|
||||
)
|
||||
|
||||
for addr in potentials:
|
||||
c = self.read(addr, 1)
|
||||
if c is not None and is_ascii(c):
|
||||
k = ord(c)
|
||||
if k not in self._potential_strings:
|
||||
self._potential_strings[k] = set()
|
||||
|
||||
self._potential_strings[k].add(addr)
|
||||
|
||||
def _populate_relocations(self):
|
||||
"""The relocation table in .reloc gives each virtual address where the next four
|
||||
bytes are, itself, another virtual address. During loading, these values will be
|
||||
@@ -212,6 +265,9 @@ class Bin:
|
||||
|
||||
return section
|
||||
|
||||
def get_section_extent_by_index(self, index: int) -> int:
    """Return the `extent` of the section stored at position `index - 1`."""
    section = self.sections[index - 1]
    return section.extent
|
||||
|
||||
def get_section_offset_by_index(self, index: int) -> int:
|
||||
"""The symbols output from cvdump gives addresses in this format: AAAA.BBBBBBBB
|
||||
where A is the index (1-based) into the section table and B is the local offset.
|
||||
@@ -242,6 +298,15 @@ class Bin:
|
||||
+ self.last_section.pointer_to_raw_data
|
||||
)
|
||||
|
||||
def is_valid_section(self, section: int) -> bool:
    """Check whether a 1-based section number refers to a known section.

    The PDB will refer to sections that are not listed in the headers
    and so should ignore these references.

    Args:
        section: 1-based index into `self.sections`.

    Returns:
        True only if `1 <= section <= len(self.sections)`.
    """
    # An explicit range check is required: probing self.sections[section - 1]
    # lets 0 and negative numbers wrap around to the end of the list, which
    # would report them as valid.
    return 1 <= section <= len(self.sections)
|
||||
|
||||
def is_valid_vaddr(self, vaddr: int) -> bool:
|
||||
"""Does this virtual address point to anything in the exe?"""
|
||||
section = next(
|
||||
|
||||
Reference in New Issue
Block a user