Reccmp comparison engine refactor (#405)

* Reccmp comparison engine refactor

* Remove redundant references to 'entry' symbol
This commit is contained in:
MS
2024-01-04 18:12:55 -05:00
committed by GitHub
parent eeb980fa0f
commit ce68a7b1f4
19 changed files with 987 additions and 279 deletions

View File

@@ -1,3 +1,4 @@
import logging
import struct
from typing import List, Optional
from dataclasses import dataclass
@@ -51,12 +52,17 @@ class ImageSectionHeader:
number_of_line_numbers: int
characteristics: int
@property
def extent(self):
    """Return the highest offset this section can reach.

    This is the larger of the section's raw-data size and its virtual size.
    """
    raw_size, virt_size = self.size_of_raw_data, self.virtual_size
    return raw_size if raw_size >= virt_size else virt_size
def match_name(self, name: str) -> bool:
    """Report whether this section's stored name equals `name`.

    `name` is ASCII-encoded and fitted to a fixed 8-byte field (truncated
    if longer, null-padded if shorter) before being compared to `self.name`.
    """
    field = name.encode("ascii")[:8].ljust(8, b"\x00")
    return self.name == field
def contains_vaddr(self, vaddr: int) -> bool:
ofs = vaddr - self.virtual_address
return 0 <= ofs < max(self.size_of_raw_data, self.virtual_size)
return 0 <= ofs < self.extent
def addr_is_uninitialized(self, vaddr: int) -> bool:
"""We cannot rely on the IMAGE_SCN_CNT_UNINITIALIZED_DATA flag (0x80) in
@@ -71,25 +77,29 @@ class ImageSectionHeader:
)
logger = logging.getLogger(__name__)
class Bin:
"""Parses a PE format EXE and allows reading data from a virtual address.
Reference: https://learn.microsoft.com/en-us/windows/win32/debug/pe-format"""
# pylint: disable=too-many-instance-attributes
def __init__(self, filename: str, logger=None) -> None:
self.logger = logger
self._debuglog(f'Parsing headers of "{filename}"... ')
def __init__(self, filename: str, find_str: bool = False) -> None:
logger.debug('Parsing headers of "%s"... ', filename)
self.filename = filename
self.file = None
self.imagebase = None
self.entry = None
self.sections: List[ImageSectionHeader] = []
self.last_section = None
self.find_str = find_str
self._potential_strings = {}
self._relocated_addrs = set()
def __enter__(self):
self._debuglog(f"Bin {self.filename} Enter")
logger.debug("Bin %s Enter", self.filename)
self.file = open(self.filename, "rb")
(mz_str,) = struct.unpack("2s", self.file.read(2))
@@ -123,28 +133,71 @@ class Bin:
self._populate_relocations()
# This is a (semi) expensive lookup that is not necessary in every case.
# We can find strings in the original if we have coverage using STRING markers.
# For the recomp, we can find strings using the PDB.
if self.find_str:
self._prepare_string_search()
text_section = self._get_section_by_name(".text")
self.last_section = text_section
self._debuglog("... Parsing finished")
logger.debug("... Parsing finished")
return self
def __exit__(self, exc_type, exc_value, exc_traceback):
self._debuglog(f"Bin {self.filename} Exit")
logger.debug("Bin %s Exit", self.filename)
if self.file:
self.file.close()
def _debuglog(self, msg):
"""Write to the logger, if present"""
if self.logger is not None:
self.logger.debug(msg)
def get_relocated_addresses(self) -> List[int]:
    """Return every address in `self._relocated_addrs` as a new ascending list."""
    addrs = list(self._relocated_addrs)
    addrs.sort()
    return addrs
def find_string(self, target: "str | bytes") -> Optional[int]:
    """Return the virtual address of a string constant equal to `target`.

    Candidate addresses come from `self._potential_strings`, which maps the
    first byte of a possible string to the set of addresses where it starts.
    Returns None when no candidate matches.

    `target` may be bytes or str. A str is encoded as ASCII before the
    search, so a str containing non-ASCII characters raises
    UnicodeEncodeError.
    """
    # The comparison below is done on bytes; without this conversion a str
    # argument would fail on the bytes operations that follow.
    if isinstance(target, str):
        target = target.encode("ascii")

    # Pad with null terminator to make sure we don't
    # match on a subset of the full string
    if not target.endswith(b"\x00"):
        target += b"\x00"

    # Indexing bytes yields an int, which is the key type of the index.
    candidates = self._potential_strings.get(target[0])
    if candidates is None:
        return None

    for addr in candidates:
        if target == self.read(addr, len(target)):
            return addr

    return None
def is_relocated_addr(self, vaddr) -> bool:
    """Report whether `vaddr` is one of the addresses in `self._relocated_addrs`."""
    known_addrs = self._relocated_addrs
    return vaddr in known_addrs
def _prepare_string_search(self):
    """Index the addresses that could be the start of a string constant.

    We are interested in deduplicated string constants found in the .rdata
    and .data sections. For each relocated address inside either section,
    read its first byte and record the address under that byte's value if
    the byte is a printable ASCII character (0x20 through 0x7e).
    A later search for an arbitrary string then only has to check the
    addresses recorded for the string's first byte."""
    data_section = self._get_section_by_name(".data")
    rdata_section = self._get_section_by_name(".rdata")

    for addr in self.get_relocated_addresses():
        in_data = data_section.contains_vaddr(addr)
        if not (in_data or rdata_section.contains_vaddr(addr)):
            continue

        first_byte = self.read(addr, 1)
        if first_byte is None:
            continue
        if not (b" " <= first_byte < b"\x7f"):
            continue

        # Group candidate addresses by the integer value of their first byte.
        self._potential_strings.setdefault(ord(first_byte), set()).add(addr)
def _populate_relocations(self):
"""The relocation table in .reloc gives each virtual address where the next four
bytes are, itself, another virtual address. During loading, these values will be
@@ -212,6 +265,9 @@ class Bin:
return section
def get_section_extent_by_index(self, index: int) -> int:
    """Return the `extent` of the section at the 1-based position `index`
    in `self.sections`."""
    section = self.sections[index - 1]
    return section.extent
def get_section_offset_by_index(self, index: int) -> int:
"""The symbols output from cvdump gives addresses in this format: AAAA.BBBBBBBB
where A is the index (1-based) into the section table and B is the local offset.
@@ -242,6 +298,15 @@ class Bin:
+ self.last_section.pointer_to_raw_data
)
def is_valid_section(self, section: int) -> bool:
    """Report whether `section` is a 1-based index into `self.sections`.

    The PDB will refer to sections that are not listed in the headers, so
    callers should ignore references for which this returns False.
    """
    # Explicit bounds check: indexing with `section - 1` would wrap around
    # for section <= 0 (Python negative indexing) and wrongly report the
    # reference as valid.
    return 1 <= section <= len(self.sections)
def is_valid_vaddr(self, vaddr: int) -> bool:
"""Does this virtual address point to anything in the exe?"""
section = next(