Brute force string search for BETA10 (#1097)

* Brute force string search for BETA10

* improved string check

* Skip this unless source binary is debug

* remove misplaced comment
This commit is contained in:
MS
2024-09-01 16:34:58 -04:00
committed by GitHub
parent 2af5f87051
commit 30be1ed4b8
2 changed files with 53 additions and 2 deletions

View File

@@ -465,6 +465,22 @@ class Bin:
for (func_addr, name_addr) in combined for (func_addr, name_addr) in combined
] ]
def iter_string(self, encoding: str = "ascii") -> Iterator[Tuple[int, str]]:
    """Yield candidate strings found at relocated addresses inside .data.

    Every address in ``self._relocated_addrs`` that falls within the
    ``.data`` section is passed to ``self.read_string``. Addresses for
    which that call returns ``None``, or whose bytes cannot be decoded
    using ``encoding``, are silently skipped.

    Args:
        encoding: Codec used to decode the raw bytes (default ``"ascii"``).

    Yields:
        ``(address, decoded_text)`` tuples, in the iteration order of
        ``self._relocated_addrs``.
    """
    data_section = self.get_section_by_name(".data")

    for vaddr in self._relocated_addrs:
        # Only addresses that land in .data are string candidates.
        if not data_section.contains_vaddr(vaddr):
            continue

        raw_bytes = self.read_string(vaddr)
        if raw_bytes is None:
            continue

        try:
            text = raw_bytes.decode(encoding)
        except UnicodeDecodeError:
            # Not valid text in the requested encoding; not a string.
            continue

        yield vaddr, text
def get_section_by_name(self, name: str) -> Section: def get_section_by_name(self, name: str) -> Section:
section = next( section = next(
filter(lambda section: section.match_name(name), self.sections), filter(lambda section: section.match_name(name), self.sections),

View File

@@ -82,8 +82,9 @@ class Compare:
self._load_cvdump() self._load_cvdump()
self._load_markers() self._load_markers()
self._find_original_strings() # Detect floats first to eliminate potential overlap with string data
self._find_float_const() self._find_float_const()
self._find_original_strings()
self._match_imports() self._match_imports()
self._match_exports() self._match_exports()
self._match_thunks() self._match_thunks()
@@ -314,7 +315,7 @@ class Compare:
"""Go to the original binary and look for the specified string constants """Go to the original binary and look for the specified string constants
to find a match. This is a (relatively) expensive operation so we only to find a match. This is a (relatively) expensive operation so we only
look at strings that we have not already matched via a STRING annotation.""" look at strings that we have not already matched via a STRING annotation."""
# Release builds give each de-duped string a symbol so they are easy to find and match.
for string in self._db.get_unmatched_strings(): for string in self._db.get_unmatched_strings():
addr = self.orig_bin.find_string(string.encode("latin1")) addr = self.orig_bin.find_string(string.encode("latin1"))
if addr is None: if addr is None:
@@ -324,6 +325,40 @@ class Compare:
self._db.match_string(addr, string) self._db.match_string(addr, string)
def is_real_string(s: str) -> bool:
    """Heuristic to ignore values that only look like strings.

    This is mostly about short strings (len <= 4) that could be byte or
    word values rather than text.

    Args:
        s: Candidate string, already decoded.

    Returns:
        False if ``s`` is empty, contains the character 0x10, is a single
        character other than "0", or is at most 4 characters long and
        contains a non-printable character other than tab, newline or
        carriage return. True otherwise.
    """
    # 0x10 is the MSB of the address space for DLLs (LEGO1), so this is a pointer
    if not s or "\x10" in s:
        return False

    # assert(0) is common
    if len(s) == 1 and s != "0":
        return False

    # str.isprintable() on the whole string would fail on newlines or tabs,
    # so test each character and allow the common whitespace escapes.
    # (Searching repr(s) for "\\x" also matched a literal backslash
    # followed by "x", wrongly rejecting such strings.)
    if len(s) <= 4 and any(
        not ch.isprintable() and ch not in "\t\n\r" for ch in s
    ):
        return False

    return True
# Debug builds do not de-dupe the strings, so we need to find them via brute force scan.
# We could try to match the string addrs if there is only one in orig and recomp.
# When we sanitize the asm, the result is the same regardless.
if self.orig_bin.is_debug:
for addr, string in self.orig_bin.iter_string("latin1"):
if is_real_string(string):
self._db.set_orig_symbol(
addr, SymbolType.STRING, string, len(string)
)
for addr, string in self.recomp_bin.iter_string("latin1"):
if is_real_string(string):
self._db.set_recomp_symbol(
addr, SymbolType.STRING, string, None, len(string)
)
def _find_float_const(self): def _find_float_const(self):
"""Add floating point constants in each binary to the database. """Add floating point constants in each binary to the database.
We are not matching anything right now because these values are not We are not matching anything right now because these values are not