Identify and handle jump tables (#732)

This commit is contained in:
MS
2024-03-26 14:06:35 -04:00
committed by GitHub
parent 1e3ca11886
commit ededdf31c3
4 changed files with 502 additions and 27 deletions

View File

@@ -0,0 +1,235 @@
"""Pre-parser for x86 instructions. Will identify data/jump tables used with
switch statements and local jump/call destinations."""
import re
import bisect
import struct
from enum import Enum, auto
from collections import namedtuple
from typing import List, NamedTuple, Optional, Tuple, Union
from capstone import Cs, CS_ARCH_X86, CS_MODE_32
from .const import JUMP_MNEMONICS
# Module-wide capstone disassembler, configured for 32-bit x86 code.
disassembler = Cs(CS_ARCH_X86, CS_MODE_32)
# Lightweight record for a single disassembled instruction. Instances are
# built by unpacking the tuples yielded by `disassembler.disasm_lite`.
DisasmLiteInst = namedtuple("DisasmLiteInst", "address, size, mnemonic, op_str")
# Matches an operand string containing "+ 0x<hex>]" and captures the hex
# displacement, e.g. the table address in `dword ptr [eax*4 + 0x100014d4]`.
displacement_regex = re.compile(r".*\+ (0x[0-9a-f]+)\]")
class SectionType(Enum):
    """Kind of content found in one contiguous region of a function."""

    # Disassembled instructions.
    CODE = 1
    # Table of single bytes (switch data read through a `mov`).
    DATA_TAB = 2
    # Table of dword addresses (jump table used by an indirect jump).
    ADDR_TAB = 3
class FuncSection(NamedTuple):
    """One contiguous section of a function, as produced by InstructGen.

    For a CODE section, `contents` holds DisasmLiteInst records.
    For ADDR_TAB and DATA_TAB sections it holds (address, value) pairs of
    ints: the address of each table entry and the dword or byte stored there.
    """

    type: SectionType
    # Table entries are (int, int) pairs built with zip(range(...), values).
    contents: List[Union[DisasmLiteInst, Tuple[int, int]]]
class InstructGen:
    """Pre-parse the bytes of one x86 function into typed sections.

    The first byte of the function is assumed to be code. While instructions
    are read, every jump destination and every pointer displacement that falls
    inside the function is recorded as a "confirmed address". Those addresses
    are used to split the function into CODE, ADDR_TAB (jump table) and
    DATA_TAB (switch data) sections, so table bytes are not reported as
    instructions.

    The analysis runs from the constructor; read the result from `sections`.
    """

    # pylint: disable=too-many-instance-attributes

    def __init__(self, blob: bytes, start: int) -> None:
        """Analyze `blob`, the raw bytes of a function located at `start`."""
        self.blob = blob
        self.start = start
        # Exclusive end address of the function.
        self.end = len(blob) + start
        # Exclusive end address of the section currently being read.
        self.section_end: int = self.end
        # Every disassembly run performed so far, kept for reuse.
        self.code_tracks: List[List[DisasmLiteInst]] = []

        # Todo: Could be refactored later
        self.cur_addr: int = 0
        self.cur_section_type: SectionType = SectionType.CODE
        self.section_start = start

        self.sections: List[FuncSection] = []

        # Maps an address inside the function to the SectionType that
        # is believed to begin there.
        self.confirmed_addrs = {}

        self.analysis()

    def _finish_section(
        self,
        type_: SectionType,
        stuff: List[Union[DisasmLiteInst, Tuple[int, int]]],
    ) -> None:
        """Append a completed section of the given type to `sections`."""
        self.sections.append(FuncSection(type_, stuff))

    def _insert_confirmed_addr(self, addr: int, type_: SectionType) -> None:
        """Record that a section of `type_` begins at `addr`.

        Addresses outside of the function are ignored. An address that was
        already recorded has its type overwritten.
        """
        # Ignore address outside the bounds of the function
        if not self.start <= addr < self.end:
            return

        self.confirmed_addrs[addr] = type_

        # This newly inserted address might signal the end of this section.
        # For example, a jump table at the end of the function means we should
        # stop reading instructions once we hit that address.
        # However, if there is a jump table in between code sections, we might
        # read a jump to an address back to the beginning of the function
        # (e.g. a loop that spans the entire function)
        # so ignore this address because we have already passed it.
        if type_ != self.cur_section_type and addr > self.cur_addr:
            self.section_end = min(self.section_end, addr)

    def _next_section(self, addr: int) -> Optional[SectionType]:
        """We have reached the start of a new section. Tell what kind of
        data we are looking at (code or other) and how much we should read.

        Updates `cur_section_type` and `section_end` as a side effect.
        Returns None if `addr` is not a known section start, which ends
        the analysis.
        """
        # Assume the start of every function is code.
        if addr == self.start:
            self.section_end = self.end
            return SectionType.CODE

        # The start of a new section must be an address that we've seen.
        new_type = self.confirmed_addrs.get(addr)
        if new_type is None:
            return None

        self.cur_section_type = new_type

        # The confirmed addrs dict is sorted by insertion order
        # i.e. the order in which we read the addresses
        # So we have to sort and then find the next item
        # to see where this section should end.

        # If we are in a CODE section, ignore contiguous CODE addresses.
        # These are not the start of a new section.
        # However: if we are not in CODE, any upcoming address is a new section.
        # Do this so we can detect contiguous non-CODE sections.
        in_code = new_type == SectionType.CODE
        boundaries = sorted(
            conf_addr
            for conf_addr, conf_type in self.confirmed_addrs.items()
            if not in_code or conf_type != SectionType.CODE
        )

        # First boundary strictly greater than addr, if there is one.
        index = bisect.bisect_right(boundaries, addr)
        if index < len(boundaries):
            self.section_end = boundaries[index]
        else:
            self.section_end = self.end

        return new_type

    def _get_code_for(self, addr: int) -> List[DisasmLiteInst]:
        """Start disassembling at the given address.

        Returns the instructions from `addr` onward. The list may run past
        the end of the current section and may be empty if nothing could be
        decoded at `addr`.
        """
        # If we are reading a code block beyond the first, see if we already
        # have disassembled instructions beginning at the specified address.
        # For a CODE/ADDR/CODE function, we might get lucky and produce the
        # correct instruction after the jump table's junk instructions.
        for track in self.code_tracks:
            for i, inst in enumerate(track):
                if inst.address == addr:
                    return track[i:]

        # If we are here, we don't have the instructions.
        # Todo: Could try to be clever here and disassemble only
        # as much as we probably need (i.e. if a jump table is between CODE
        # blocks, there are probably only a few bad instructions after the
        # jump table is finished. We could disassemble up to the next verified
        # code address and stitch it together)
        blob_cropped = self.blob[addr - self.start :]
        instructions = [
            DisasmLiteInst(*inst)
            for inst in disassembler.disasm_lite(blob_cropped, addr)
        ]

        self.code_tracks.append(instructions)
        return instructions

    def _handle_jump(self, inst: DisasmLiteInst) -> None:
        """Record the destination of a jump instruction, if we can tell it."""
        # If this is a regular jump and its destination is within the
        # bounds of the binary data (i.e. presumed function size)
        # add it to our list of confirmed addresses.
        # startswith() instead of op_str[0] so that an empty operand string
        # is simply ignored rather than raising IndexError.
        if inst.op_str.startswith("0"):
            value = int(inst.op_str, 16)
            self._insert_confirmed_addr(value, SectionType.CODE)

        # If this is jumping into a table of addresses, save the destination
        elif (match := displacement_regex.match(inst.op_str)) is not None:
            value = int(match.group(1), 16)
            self._insert_confirmed_addr(value, SectionType.ADDR_TAB)

    def _read_code_section(self) -> bool:
        """Read the CODE section that starts at `cur_addr`.

        Returns False if no instruction could be disassembled at `cur_addr`,
        in which case no section is recorded and the caller should stop.
        """
        instructions = self._get_code_for(self.cur_addr)

        # If we didn't get any instructions back, something is wrong.
        # i.e. We can only read part of the full instruction that is up next.
        if not instructions:
            # The analysis stops here. cur_addr is still moved past the
            # unreadable byte, as it always has been, so the attribute keeps
            # the same final value for anyone who inspects it afterwards.
            # Todo: Maybe we could just call it quits here
            self.cur_addr += 1
            return False

        for inst in instructions:
            # section_end is updated as we read instructions.
            # If we are into a jump/data table and would read
            # a junk instruction, stop here.
            if self.cur_addr >= self.section_end:
                break

            if inst.mnemonic in JUMP_MNEMONICS:
                self._handle_jump(inst)
                # Todo: log calls too (unwind section)
            elif inst.mnemonic == "mov":
                # Todo: maintain pairing of data/jump tables
                if (match := displacement_regex.match(inst.op_str)) is not None:
                    value = int(match.group(1), 16)
                    self._insert_confirmed_addr(value, SectionType.DATA_TAB)

            # Do this instead of copying instruction address.
            # If there is only one instruction, we would get stuck here.
            self.cur_addr += inst.size

        # We are at the end of the section or the entire function.
        # Cut out only the valid instructions for this section
        # and save it for later.
        # Todo: don't need to iter on every instruction here.
        # They are already in order.
        instruction_slice = [
            inst for inst in instructions if inst.address < self.section_end
        ]
        self._finish_section(SectionType.CODE, instruction_slice)
        return True

    def _read_addr_table(self) -> None:
        """Read the jump table at `cur_addr` as little-endian dwords."""
        # Clamp to multiple of 4 (dwords)
        read_size = ((self.section_end - self.cur_addr) // 4) * 4
        offsets = range(self.section_start, self.section_start + read_size, 4)
        blob_ofs = self.cur_addr - self.start
        dwords = self.blob[blob_ofs : blob_ofs + read_size]
        addrs = [addr for (addr,) in struct.iter_unpack("<L", dwords)]

        for addr in addrs:
            # Todo: the fact that these are jump table destinations
            # should factor into the label name.
            self._insert_confirmed_addr(addr, SectionType.CODE)

        self._finish_section(SectionType.ADDR_TAB, list(zip(offsets, addrs)))
        # Inserting the destinations above may have lowered section_end.
        self.cur_addr = self.section_end

    def _read_data_table(self) -> None:
        """Read the data table at `cur_addr` one byte at a time."""
        # Todo: variable data size?
        read_size = self.section_end - self.cur_addr
        offsets = range(self.section_start, self.section_start + read_size)
        blob_ofs = self.cur_addr - self.start
        # Iterating over bytes yields each byte as an int.
        data = list(self.blob[blob_ofs : blob_ofs + read_size])

        self._finish_section(SectionType.DATA_TAB, list(zip(offsets, data)))
        self.cur_addr = self.section_end

    def analysis(self):
        """Walk the function from `start` and fill in `sections`.

        Stops at the first address that is not a known section start, or when
        a CODE section cannot be disassembled at all.
        """
        self.cur_addr = self.start

        while (sect_type := self._next_section(self.cur_addr)) is not None:
            self.section_start = self.cur_addr

            if sect_type == SectionType.CODE:
                if not self._read_code_section():
                    break
            elif sect_type == SectionType.ADDR_TAB:
                self._read_addr_table()
            else:
                self._read_data_table()

View File

@@ -11,13 +11,13 @@ from functools import cache
from typing import Callable, List, Optional, Tuple from typing import Callable, List, Optional, Tuple
from collections import namedtuple from collections import namedtuple
from isledecomp.bin import InvalidVirtualAddressError from isledecomp.bin import InvalidVirtualAddressError
from capstone import Cs, CS_ARCH_X86, CS_MODE_32
from .const import JUMP_MNEMONICS, SINGLE_OPERAND_INSTS from .const import JUMP_MNEMONICS, SINGLE_OPERAND_INSTS
from .instgen import InstructGen, SectionType
disassembler = Cs(CS_ARCH_X86, CS_MODE_32)
ptr_replace_regex = re.compile(r"\[(0x[0-9a-f]+)\]") ptr_replace_regex = re.compile(r"\[(0x[0-9a-f]+)\]")
displace_replace_regex = re.compile(r"\+ (0x[0-9a-f]+)\]")
# For matching an immediate value on its own. # For matching an immediate value on its own.
# Preceded by start-of-string (first operand) or comma-space (second operand) # Preceded by start-of-string (first operand) or comma-space (second operand)
immediate_replace_regex = re.compile(r"(?:^|, )(0x[0-9a-f]+)") immediate_replace_regex = re.compile(r"(?:^|, )(0x[0-9a-f]+)")
@@ -172,16 +172,25 @@ class ParseAsm:
else: else:
op_str = ptr_replace_regex.sub(self.hex_replace_always, inst.op_str) op_str = ptr_replace_regex.sub(self.hex_replace_always, inst.op_str)
# We only want relocated addresses for pointer displacement.
# i.e. ptr [register + something]
# Otherwise we would use a placeholder for every stack variable,
# vtable call, or this->member access.
op_str = displace_replace_regex.sub(self.hex_replace_relocated, op_str)
op_str = immediate_replace_regex.sub(self.hex_replace_relocated, op_str) op_str = immediate_replace_regex.sub(self.hex_replace_relocated, op_str)
return (inst.mnemonic, op_str) return (inst.mnemonic, op_str)
def parse_asm(self, data: bytes, start_addr: Optional[int] = 0) -> List[str]: def parse_asm(self, data: bytes, start_addr: Optional[int] = 0) -> List[str]:
asm = [] asm = []
for raw_inst in disassembler.disasm_lite(data, start_addr): ig = InstructGen(data, start_addr)
for sect_type, sect_contents in ig.sections:
if sect_type == SectionType.CODE:
for inst in sect_contents:
# Use heuristics to disregard some differences that aren't representative # Use heuristics to disregard some differences that aren't representative
# of the accuracy of a function (e.g. global offsets) # of the accuracy of a function (e.g. global offsets)
inst = DisasmLiteInst(*raw_inst)
# If there is no pointer or immediate value in the op_str, # If there is no pointer or immediate value in the op_str,
# there is nothing to sanitize. # there is nothing to sanitize.
@@ -201,5 +210,14 @@ class ParseAsm:
# mnemonic + " " + op_str # mnemonic + " " + op_str
asm.append((hex(inst.address), " ".join(result))) asm.append((hex(inst.address), " ".join(result)))
elif sect_type == SectionType.ADDR_TAB:
asm.append(("", "Jump table:"))
for i, (ofs, _) in enumerate(sect_contents):
asm.append((hex(ofs), f"Jump_dest_{i}"))
elif sect_type == SectionType.DATA_TAB:
asm.append(("", "Data table:"))
for ofs, b in sect_contents:
asm.append((hex(ofs), hex(b)))
return asm return asm

View File

@@ -0,0 +1,212 @@
from isledecomp.compare.asm.instgen import InstructGen, SectionType
def test_ret():
    """A function consisting of a lone `ret` must still yield one section."""
    single_ret = b"\xc3"
    gen = InstructGen(single_ret, 0)
    section_count = len(gen.sections)
    assert section_count == 1
# Score::Notify from LEGO1, loaded at 0x10001410 by test_score_notify.
# Has a jump table at 0x100014d4 followed by switch data at 0x100014ec,
# then 0xCC padding bytes.
SCORE_NOTIFY = (
b"\x53\x56\x57\x8b\xd9\x33\xff\x8b\x74\x24\x10\x56\xe8\xbf\xe1\x01"
b"\x00\x80\xbb\xf6\x00\x00\x00\x00\x0f\x84\x9c\x00\x00\x00\x8b\x4e"
b"\x04\x49\x83\xf9\x17\x0f\x87\x8f\x00\x00\x00\x33\xc0\x8a\x81\xec"
b"\x14\x00\x10\xff\x24\x85\xd4\x14\x00\x10\x8b\xcb\xbf\x01\x00\x00"
b"\x00\xe8\x7a\x05\x00\x00\x8b\xc7\x5f\x5e\x5b\xc2\x04\x00\x56\x8b"
b"\xcb\xe8\xaa\x00\x00\x00\x8b\xf8\x8b\xc7\x5f\x5e\x5b\xc2\x04\x00"
b"\x80\x7e\x18\x20\x75\x07\x8b\xcb\xe8\xc3\xfe\xff\xff\xbf\x01\x00"
b"\x00\x00\x8b\xc7\x5f\x5e\x5b\xc2\x04\x00\x56\x8b\xcb\xe8\x3e\x02"
b"\x00\x00\x8b\xf8\x8b\xc7\x5f\x5e\x5b\xc2\x04\x00\x6a\x09\xa1\x4c"
b"\x45\x0f\x10\x6a\x07\x50\xe8\x35\x45\x01\x00\x83\xc4\x0c\x8b\x83"
b"\xf8\x00\x00\x00\x85\xc0\x74\x0d\x50\xe8\xa2\x42\x01\x00\x8b\xc8"
b"\xe8\x9b\x9b\x03\x00\xbf\x01\x00\x00\x00\x8b\xc7\x5f\x5e\x5b\xc2"
b"\x04\x00\x8b\xff\x4a\x14\x00\x10\x5e\x14\x00\x10\x70\x14\x00\x10"
b"\x8a\x14\x00\x10\x9c\x14\x00\x10\xca\x14\x00\x10\x00\x01\x05\x05"
b"\x05\x05\x02\x05\x05\x05\x05\x05\x05\x05\x05\x05\x03\x05\x05\x05"
b"\x05\x05\x05\x04\xcc\xcc\xcc\xcc\xcc\xcc\xcc\xcc\xcc\xcc\xcc\xcc"
)
def test_score_notify():
    """Score::Notify function from 0x10001410 in LEGO1.

    A representative function with a jump table (at 0x100014d4)
    followed by switch data (at 0x100014ec)."""
    gen = InstructGen(SCORE_NOTIFY, 0x10001410)

    # All three sections must be found, in this order.
    assert len(gen.sections) == 3
    code_sect, addr_sect, data_sect = gen.sections
    found_types = (code_sect.type, addr_sect.type, data_sect.type)
    assert found_types == (SectionType.CODE, SectionType.ADDR_TAB, SectionType.DATA_TAB)

    # The CODE section must stop right before the jump table.
    last_inst = code_sect.contents[-1]
    assert last_inst.address == 0x100014D2
    # n.b. 0x100014d2 is the dummy instruction `mov edi, edi`
    # Ghidra does more thorough analysis and ignores this.
    # The last real instruction should be at 0x100014cf. Not a big deal
    # to include this because it is not junk data.

    # The switch has 6 destination addresses.
    assert len(addr_sect.contents) == 6

    # TODO: The data table at the end includes all of the 0xCC padding bytes.
# Code / jump table / code fragment stitched together from two LEGO1 snippets.
# test_smack_case loads it at 0x1000.
SMACK_CASE = (
# LEGO1: 0x100cdc43 (modified so jump table points at +0x1016)
b"\x2e\xff\x24\x8d\x16\x10\x00\x00"
# LEGO1: 0x100cdb62 (instructions before and after jump table)
b"\x8b\xf8\xeb\x1a\x87\xdb\x87\xc9\x87\xdb\x87\xc9\x87\xdb\x50\xdc"
b"\x0c\x10\xd0\xe2\x0c\x10\xb0\xe8\x0c\x10\x50\xe9\x0c\x10\xa0\x10"
b"\x27\x10\x10\x3c\x11\x77\x17\x8a\xc8"
)
def test_smack_case():
    """Code, then a jump table, then code again.

    The two code sections must be separated properly, the junk instructions
    decoded from the table dropped, and disassembly resumed at the correct
    address after the table."""
    gen = InstructGen(SMACK_CASE, 0x1000)

    assert len(gen.sections) == 3
    before_table, _, after_table = gen.sections
    assert before_table.type == after_table.type == SectionType.CODE

    # The instruction right after the table must have been captured.
    first_after = after_table.contents[0]
    assert first_after.mnemonic == "mov"
# BETA10 0x1004c9cc
# Short, complete function laid out as CODE / ADDR_TAB / CODE.
# Used by test_beta_case.
BETA_FUNC = (
b"\x55\x8b\xec\x83\xec\x08\x53\x56\x57\x89\x4d\xfc\x8b\x45\xfc\x33"
b"\xc9\x8a\x88\x19\x02\x00\x00\x89\x4d\xf8\xe9\x1e\x00\x00\x00\xe9"
b"\x41\x00\x00\x00\xe9\x3c\x00\x00\x00\xe9\x37\x00\x00\x00\xe9\x32"
b"\x00\x00\x00\xe9\x2d\x00\x00\x00\xe9\x28\x00\x00\x00\x83\x7d\xf8"
b"\x04\x0f\x87\x1e\x00\x00\x00\x8b\x45\xf8\xff\x24\x85\x1d\xca\x04"
b"\x10\xeb\xc9\x04\x10\xf0\xc9\x04\x10\xf5\xc9\x04\x10\xfa\xc9\x04"
b"\x10\xff\xc9\x04\x10\xb0\x01\xe9\x00\x00\x00\x00\x5f\x5e\x5b\xc9"
b"\xc2\x04\x00"
)
def test_beta_case():
    """A short, complete function laid out as CODE / ADDR / CODE."""
    gen = InstructGen(BETA_FUNC, 0x1004C9CC)

    # The JMP into the jump table immediately precedes the jump table.
    # Unless that is detected and the section switched correctly,
    # only 1 section would be produced.
    assert len(gen.sections) == 3
    before_table, _, after_table = gen.sections
    assert before_table.type == after_table.type == SectionType.CODE

    # The instruction right after the table must have been captured.
    first_after = after_table.contents[0]
    assert first_after.mnemonic == "mov"
# LEGO1 0x1000fb50
# Used by test_thunk_case: the blob deliberately ends in the middle of
# an instruction.
# TODO: The test data here is longer than it needs to be.
THUNK_TEST = (
b"\x2b\x49\xfc\xe9\x08\x00\x00\x00\xcc\xcc\xcc\xcc\xcc\xcc\xcc\xcc"
b"\x56\x8b\xf1\xe8\xd8\xc5\x00\x00\x8b\xce\xe8\xb1\xdc\x01\x00\xf6"
b"\x44\x24\x08\x01\x74\x0c\x8d\x46\xe0\x50\xe8\xe1\x66\x07\x00\x83"
b"\xc4\x04\x8d\x46\xe0\x5e\xc2\x04\x00\xcc\xcc\xcc\xcc\xcc\xcc\xcc"
b"\x2b\x49\xfc\xe9\x08\x00\x00\x00\xcc\xcc\xcc\xcc\xcc\xcc\xcc\xcc"
b"\xb8\x7c\x05\x0f\x10\xc3\xcc\xcc\xcc\xcc\xcc\xcc\xcc\xcc\xcc\xcc"
b"\x2b\x49\xfc\xe9\x08\x00\x00\x00\xcc\xcc\xcc\xcc\xcc\xcc\xcc\xcc"
b"\x8b\x54"
# The problem is here: the last two bytes are the start of the next
# function 0x1000fbc0. This is not enough data to read an instruction.
)
def test_thunk_case():
    """Incorrectly annotated adjuster thunk.

    The blob holds far more bytes than this function should cover."""
    gen = InstructGen(THUNK_TEST, 0x1000FB50)

    # There are no switch cases, so code is the only section.
    # This input caused an infinite loop during testing,
    # so the goal is just to finish.
    section_count = len(gen.sections)
    assert section_count == 1

    # TODO: We might detect the 0xCC padding bytes and cut off the function.
    # If we did that, we would correctly read only 2 instructions.
    # assert len(gen.sections[0].contents) == 2
# LEGO1 0x1006f080, Infocenter::HandleEndAction
# Used by test_action_case: three switches with three jump tables and one
# data table. The jump tables at 0x1006f478 and 0x1006f48c are contiguous.
HANDLE_END_ACTION = (
b"\x53\x56\x57\x8b\xf1\x8b\x5c\x24\x10\x8b\x0d\x84\x45\x0f\x10\x8b"
b"\x7b\x0c\x8b\x47\x20\x39\x01\x75\x29\x81\x7f\x1c\xf3\x01\x00\x00"
b"\x75\x20\xe8\x59\x66\xfa\xff\x6a\x00\x8b\x40\x18\x6a\x00\x6a\x10"
b"\x50\xff\x15\x38\xb5\x10\x10\xb8\x01\x00\x00\x00\x5f\x5e\x5b\xc2"
b"\x04\x00\x39\x46\x0c\x0f\x85\xa2\x00\x00\x00\x8b\x47\x1c\x83\xf8"
b"\x28\x74\x18\x83\xf8\x29\x74\x13\x83\xf8\x2a\x74\x0e\x83\xf8\x2b"
b"\x74\x09\x83\xf8\x2c\x0f\x85\x82\x00\x00\x00\x66\x8b\x86\xd4\x01"
b"\x00\x00\x66\x85\xc0\x74\x09\x66\x48\x66\x89\x86\xd4\x01\x00\x00"
b"\x66\x83\xbe\xd4\x01\x00\x00\x00\x75\x63\x6a\x0b\xe8\xff\x67\xfa"
b"\xff\x66\x8b\x86\xfc\x00\x00\x00\x83\xc4\x04\x50\xe8\x3f\x66\xfa"
b"\xff\x8b\xc8\xe8\x58\xa6\xfc\xff\x0f\xbf\x86\xfc\x00\x00\x00\x48"
b"\x83\xf8\x04\x77\x2f\xff\x24\x85\x78\xf4\x06\x10\x68\x1d\x02\x00"
b"\x00\xeb\x1a\x68\x1e\x02\x00\x00\xeb\x13\x68\x1f\x02\x00\x00\xeb"
b"\x0c\x68\x20\x02\x00\x00\xeb\x05\x68\x21\x02\x00\x00\x8b\xce\xe8"
b"\x9c\x21\x00\x00\x6a\x01\x8b\xce\xe8\x53\x1c\x00\x00\x8d\x8e\x0c"
b"\x01\x00\x00\x53\x8b\x01\xff\x50\x04\x85\xc0\x0f\x85\xef\x02\x00"
b"\x00\x8b\x56\x0c\x8b\x4f\x20\x3b\xd1\x74\x0e\x8b\x1d\x74\x45\x0f"
b"\x10\x39\x0b\x0f\x85\xd7\x02\x00\x00\x81\x7f\x1c\x02\x02\x00\x00"
b"\x75\x1a\x6a\x00\x52\x6a\x10\xe8\xa4\x65\xfa\xff\x8b\xc8\xe8\x0d"
b"\xa2\xfb\xff\x66\xc7\x86\xd6\x01\x00\x00\x00\x00\x8b\x96\x00\x01"
b"\x00\x00\x8d\x42\x74\x8b\x18\x83\xfb\x0c\x0f\x87\x9b\x02\x00\x00"
b"\x33\xc9\x8a\x8b\xac\xf4\x06\x10\xff\x24\x8d\x8c\xf4\x06\x10\x8b"
b"\x86\x08\x01\x00\x00\x83\xf8\x05\x77\x07\xff\x24\x85\xbc\xf4\x06"
b"\x10\x8b\xce\xe8\xb8\x1a\x00\x00\x8b\x86\x00\x01\x00\x00\x68\xf4"
b"\x01\x00\x00\x8b\xce\xc7\x40\x74\x0b\x00\x00\x00\xe8\xef\x20\x00"
b"\x00\x8b\x86\x00\x01\x00\x00\xc7\x86\x08\x01\x00\x00\xff\xff\xff"
b"\xff\x83\x78\x78\x00\x0f\x85\x40\x02\x00\x00\xb8\x01\x00\x00\x00"
b"\x5f\x66\xc7\x86\xd2\x01\x00\x00\x01\x00\x5e\x5b\xc2\x04\x00\x6a"
b"\x00\x8b\xce\x6a\x01\xe8\xd6\x19\x00\x00\xb8\x01\x00\x00\x00\x5f"
b"\x5e\x5b\xc2\x04\x00\x6a\x01\x8b\xce\x6a\x02\xe8\xc0\x19\x00\x00"
b"\xb8\x01\x00\x00\x00\x5f\x5e\x5b\xc2\x04\x00\x8b\xce\xe8\x3e\x1a"
b"\x00\x00\x8b\x86\x00\x01\x00\x00\x68\x1c\x02\x00\x00\x8b\xce\xc7"
b"\x40\x74\x0b\x00\x00\x00\xe8\x75\x20\x00\x00\xb8\x01\x00\x00\x00"
b"\x5f\xc7\x86\x08\x01\x00\x00\xff\xff\xff\xff\x5e\x5b\xc2\x04\x00"
b"\x8b\xce\xe8\x09\x1a\x00\x00\x8b\x86\x00\x01\x00\x00\x68\x1b\x02"
b"\x00\x00\x8b\xce\xc7\x40\x74\x0b\x00\x00\x00\xe8\x40\x20\x00\x00"
b"\xb8\x01\x00\x00\x00\x5f\xc7\x86\x08\x01\x00\x00\xff\xff\xff\xff"
b"\x5e\x5b\xc2\x04\x00\xc7\x00\x0b\x00\x00\x00\x8b\x86\x08\x01\x00"
b"\x00\x83\xf8\x04\x74\x0c\x83\xf8\x05\x74\x0e\x68\xf4\x01\x00\x00"
b"\xeb\x0c\x68\x1c\x02\x00\x00\xeb\x05\x68\x1b\x02\x00\x00\x8b\xce"
b"\xe8\xfb\x1f\x00\x00\xb8\x01\x00\x00\x00\x5f\xc7\x86\x08\x01\x00"
b"\x00\xff\xff\xff\xff\x5e\x5b\xc2\x04\x00\x6a\x00\xa1\xa0\x76\x0f"
b"\x10\x50\xe8\x39\x65\xfa\xff\x83\xc4\x08\xa1\xa4\x76\x0f\x10\x6a"
b"\x00\x50\xe8\x29\x65\xfa\xff\x83\xc4\x08\xe8\xf1\x63\xfa\xff\x8b"
b"\xc8\xe8\x6a\x02\x01\x00\xb8\x01\x00\x00\x00\x5f\x5e\x5b\xc2\x04"
b"\x00\x8b\x47\x1c\x83\xf8\x46\x74\x09\x83\xf8\x47\x0f\x85\x09\x01"
b"\x00\x00\x6a\x00\x6a\x00\x6a\x32\x6a\x03\xe8\x91\x65\xfa\xff\x8b"
b"\xc8\xe8\xfa\xc7\xfd\xff\x8b\x86\x00\x01\x00\x00\x5f\x5e\x5b\xc7"
b"\x40\x74\x0e\x00\x00\x00\xb8\x01\x00\x00\x00\xc2\x04\x00\x8b\x47"
b"\x1c\x39\x86\xf8\x00\x00\x00\x0f\x85\xce\x00\x00\x00\xe8\xbe\x63"
b"\xfa\xff\x83\x78\x10\x02\x74\x19\x66\x8b\x86\xfc\x00\x00\x00\x66"
b"\x85\xc0\x74\x0d\x50\xe8\xa6\x63\xfa\xff\x8b\xc8\xe8\xbf\xa3\xfc"
b"\xff\x6a\x00\x6a\x00\x6a\x32\x6a\x03\xe8\x32\x65\xfa\xff\x8b\xc8"
b"\xe8\x9b\xc7\xfd\xff\x8b\x86\x00\x01\x00\x00\x5f\x5e\x5b\xc7\x40"
b"\x74\x0e\x00\x00\x00\xb8\x01\x00\x00\x00\xc2\x04\x00\x83\x7a\x78"
b"\x00\x75\x32\x8b\x86\xf8\x00\x00\x00\x83\xf8\x28\x74\x27\x83\xf8"
b"\x29\x74\x22\x83\xf8\x2a\x74\x1d\x83\xf8\x2b\x74\x18\x83\xf8\x2c"
b"\x74\x13\x66\xc7\x86\xd0\x01\x00\x00\x01\x00\x6a\x0b\xe8\xee\x64"
b"\xfa\xff\x83\xc4\x04\x8b\x86\x00\x01\x00\x00\x6a\x01\x68\xdc\x44"
b"\x0f\x10\xc7\x40\x74\x02\x00\x00\x00\xe8\x22\x64\xfa\xff\x83\xc4"
b"\x08\xb8\x01\x00\x00\x00\x5f\x5e\x5b\xc2\x04\x00\x8b\x47\x1c\x39"
b"\x86\xf8\x00\x00\x00\x75\x14\x6a\x00\x6a\x00\x6a\x32\x6a\x03\xe8"
b"\x9c\x64\xfa\xff\x8b\xc8\xe8\x05\xc7\xfd\xff\xb8\x01\x00\x00\x00"
b"\x5f\x5e\x5b\xc2\x04\x00\x8b\xff\x3c\xf1\x06\x10\x43\xf1\x06\x10"
b"\x4a\xf1\x06\x10\x51\xf1\x06\x10\x58\xf1\x06\x10\xdf\xf1\x06\x10"
b"\xd5\xf2\x06\x10\x1a\xf3\x06\x10\x51\xf3\x06\x10\x8e\xf3\x06\x10"
b"\xed\xf3\x06\x10\x4c\xf4\x06\x10\x6b\xf4\x06\x10\x00\x01\x02\x07"
b"\x03\x04\x07\x07\x07\x07\x07\x05\x06\x8d\x49\x00\x3f\xf2\x06\x10"
b"\x55\xf2\x06\x10\xf1\xf1\x06\x10\xf1\xf1\x06\x10\x6b\xf2\x06\x10"
b"\xa0\xf2\x06\x10\xcc\xcc\xcc\xcc\xcc\xcc\xcc\xcc\xcc\xcc\xcc\xcc"
)
def test_action_case():
    """Three switches: 3 jump tables and 1 data table."""
    gen = InstructGen(HANDLE_END_ACTION, 0x1006F080)

    # Two of the jump tables (0x1006f478 with 5, 0x1006f48c with 8)
    # are contiguous.
    section_count = len(gen.sections)
    assert section_count == 5

View File

@@ -81,13 +81,23 @@ def test_jump_displacement():
assert op_str == "-0x2" assert op_str == "-0x2"
@pytest.mark.xfail(reason="Not implemented yet")
def test_jmp_table(): def test_jmp_table():
"""Should detect the characteristic jump table instruction """To ignore cases where it would be inappropriate to replace pointer
(for a switch statement) and use placeholder.""" displacement (i.e. the vast majority of them) we require the address
to be relocated. This excludes any address less than the imagebase."""
p = ParseAsm() p = ParseAsm()
inst = mock_inst("jmp", "dword ptr [eax*4 + 0x5555]") inst = mock_inst("jmp", "dword ptr [eax*4 + 0x5555]")
(_, op_str) = p.sanitize(inst) (_, op_str) = p.sanitize(inst)
# i.e. no change
assert op_str == "dword ptr [eax*4 + 0x5555]"
def relocate_lookup(addr: int) -> bool:
return addr == 0x5555
# Now add the relocation lookup
p = ParseAsm(relocate_lookup=relocate_lookup)
(_, op_str) = p.sanitize(inst)
# Should replace it now
assert op_str == "dword ptr [eax*4 + <OFFSET1>]" assert op_str == "dword ptr [eax*4 + <OFFSET1>]"