diff --git a/CHANGELOG.md b/CHANGELOG.md index 0dd23f2a5..bb72fbbf7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -75,6 +75,7 @@ The table below shows which release corresponds to each branch, and what date th ## 5.0.0 (`dev`) +- [#2580][2580] Use capstone to implement `ELF.libc_start_main_return` - [#2419][2419] riscv: avoid compressed instructions (if you need compressed, use .option rvc) - [#2551][2551] Detect when kitty is being used as terminal - [#2519][2519] Drop Python 2.7 support / Require Python 3.10 @@ -94,6 +95,7 @@ The table below shows which release corresponds to each branch, and what date th - [#2575][2575] Detect when Terminator is being used as terminal - [#2578][2578] Add gnome-terminal, Alacritty, Ttilix for run_in_new_terminal +[2580]: https://github.com/Gallopsled/pwntools/pull/2580 [2419]: https://github.com/Gallopsled/pwntools/pull/2419 [2551]: https://github.com/Gallopsled/pwntools/pull/2551 [2519]: https://github.com/Gallopsled/pwntools/pull/2519 diff --git a/pwnlib/asm.py b/pwnlib/asm.py index 8716e8976..5ad59caa5 100644 --- a/pwnlib/asm.py +++ b/pwnlib/asm.py @@ -388,10 +388,10 @@ def _bfdname(): 'sparc64' : 'elf64-sparc', } - if arch in bfdnames: - return bfdnames[arch] - else: + name = bfdnames.get(arch) + if not name: raise Exception("Cannot find bfd name for architecture %r" % arch) + return name def _bfdarch(): @@ -409,10 +409,7 @@ def _bfdarch(): 'loongarch64': 'loongarch64' } - if arch in convert: - return convert[arch] - - return arch + return convert.get(arch, arch) def _run(cmd, stdin = None): log.debug('%s', subprocess.list2cmdline(cmd)) @@ -1015,3 +1012,62 @@ def disasm(data, vma = 0, byte = True, offset = True, instructions = True): lines.append(line) return re.sub(',([^ ])', r', \1', '\n'.join(lines)) + +@LocalContext +def get_cs_disassembler(eabi=None): + import capstone as cs + E = { + 'big': cs.CS_MODE_BIG_ENDIAN, + 'little': cs.CS_MODE_LITTLE_ENDIAN, + }[context.endianness] + + B = {16: cs.CS_MODE_16, 32: 
cs.CS_MODE_32, 64: cs.CS_MODE_64}[context.bits] + + try: + CS_ARCH_AARCH64 = cs.CS_ARCH_AARCH64 + except Exception: + CS_ARCH_AARCH64 = cs.CS_ARCH_ARM64 + + try: + CS_ARCH_SYSTEMZ = cs.CS_ARCH_SYSTEMZ + except Exception: + CS_ARCH_SYSTEMZ = cs.CS_ARCH_SYSZ + + params = { + 'i386' : (cs.CS_ARCH_X86, B), + 'amd64' : (cs.CS_ARCH_X86, B), + 'thumb' : (cs.CS_ARCH_ARM, cs.CS_MODE_THUMB + E), + 'arm' : (cs.CS_ARCH_ARM, cs.CS_MODE_ARM + E), + 'aarch64': (CS_ARCH_AARCH64, cs.CS_MODE_ARM + E), + 'armhf' : (cs.CS_ARCH_ARM, cs.CS_MODE_THUMB + E), + 'mips' : (cs.CS_ARCH_MIPS, cs.CS_MODE_32 + E), + 'mips64' : (cs.CS_ARCH_MIPS, cs.CS_MODE_64 + E), + 'sparc' : (cs.CS_ARCH_SPARC, cs.CS_MODE_32 + E), + 'sparc64': (cs.CS_ARCH_SPARC, cs.CS_MODE_64 + E), + 'ppc' : (cs.CS_ARCH_PPC, B + E), + 'powerpc': (cs.CS_ARCH_PPC, E + cs.CS_MODE_32), + 'powerpc64': (cs.CS_ARCH_PPC, E + cs.CS_MODE_64), + 'em_s390': (CS_ARCH_SYSTEMZ, cs.CS_MODE_BIG_ENDIAN + cs.CS_MODE_64), + #'ia64': None, + #'m68k': cs.CS_ARCH_M68K, + #'xcore': cs.CS_ARCH_XCORE, + #'tms320c64x': cs.CS_ARCH_TMS320C64X, + #'m680x': cs.CS_ARCH_M680X, + #'evm': cs.CS_ARCH_EVM, + #'mos65xx': cs.CS_ARCH_MOS65XX, + #'bpf': cs.CS_ARCH_BPF, + #'riscv': cs.CS_ARCH_RISCV, + #'tricore': cs.CS_ARCH_TRICORE, + #'wasm': cs.CS_ARCH_WASM, + #'sh': cs.CS_ARCH_SH, + } + + arch = context.arch + if arch == 'arm' and eabi == 'hf': arch = 'armhf' + param = params.get(arch) + if not param: + raise Exception(f"unsupported {context.arch} for capstone") + arch, mode = param + md = cs.Cs(arch, mode) + md.detail = True + return md diff --git a/pwnlib/context/__init__.py b/pwnlib/context/__init__.py index 4ffed2cf0..cf389fb10 100644 --- a/pwnlib/context/__init__.py +++ b/pwnlib/context/__init__.py @@ -412,6 +412,7 @@ class ContextType(object): 'avr': little_8, 'amd64': little_64, 'arm': little_32, + 'armhf': little_32, 'cris': little_32, 'i386': little_32, 'ia64': big_64, @@ -427,6 +428,7 @@ class ContextType(object): 's390': big_32, 'sparc': big_32, 'sparc64': 
big_64, + 'em_s390': big_64, 'thumb': little_32, 'vax': little_32, 'none': {}, diff --git a/pwnlib/elf/elf.py b/pwnlib/elf/elf.py index ba07a2cb2..b3a0c844c 100644 --- a/pwnlib/elf/elf.py +++ b/pwnlib/elf/elf.py @@ -54,6 +54,7 @@ import re import subprocess import tempfile +import capstone as cs from io import BytesIO @@ -1143,6 +1144,12 @@ def _populate_kernel_version(self): self.config['version'] = self.version + def cs_disasm(self, md: cs.Cs, address, n_bytes): + if self.arch == 'arm' and address & 1: + address -= 1 + + return md.disasm(self.read(address, n_bytes), address) + @property def libc_start_main_return(self): """:class:`int`: Address of the return address into __libc_start_main from main. @@ -1157,62 +1164,88 @@ def libc_start_main_return(self): to list all calls inside __libc_start_main, find the call to exit after the call to main and select the previous call. """ - if '__libc_start_main' not in self.functions: + func = self.functions.get('__libc_start_main') + exit_addr = self.symbols.get('exit') + if not (func and exit_addr): return 0 - if 'exit' not in self.symbols: - return 0 + # `__libc_start_call_main` is usually smaller than `__libc_start_main`, + # (except for powerpc which uses a bigger `generic_start_main`), so + # we might disassemble a bit too much, but it's a good dynamic estimate. + callee_size = func.size + # most architectures' call instructions take an immediate as their first operand, except s390 + imm_index = 0 + eabi = None # If there's no delay slot, execution continues on the next instruction after a call.
call_return_offset = 1 + call_instructions = set([cs.CS_GRP_CALL]) if self.arch in ['arm', 'thumb']: - call_instructions = set(['blx', 'bl']) + # FIXME: I have no idea why setting self.arch = 'armhf' does not work + if b'armhf' in self.linker: eabi = 'hf' + if exit_addr & 1: exit_addr -= 1 elif self.arch == 'aarch64': - call_instructions = set(['blr', 'bl']) + pass elif self.arch in ['mips', 'mips64']: - call_instructions = set(['bal', 'jalr']) # Account for the delay slot. call_return_offset = 2 elif self.arch in ['i386', 'amd64', 'ia64']: - call_instructions = set(['call']) + pass + elif self.arch in ['ppc', 'powerpc', 'powerpc64']: + callee_size *= 2 + # powerpc often jumps to the local entry point after TOC setup + if exit_addr & 1 == 0: exit_addr += 8 + pass + elif self.arch in ['em_s390', 's390']: + imm_index = 1 + pass else: log.error('Unsupported architecture %s in ELF.libc_start_main_return', self.arch) return 0 - lines = self.functions['__libc_start_main'].disasm().split('\n') - exit_addr = hex(self.symbols['exit']) - calls = [(index, line) for index, line in enumerate(lines) if set(line.split()) & call_instructions] + from pwnlib.asm import get_cs_disassembler + md = get_cs_disassembler(arch=self.arch, endian=self.endian, bits=self.bits, eabi=eabi) + dis = list(self.cs_disasm(md, func.address, func.size)) - def find_ret_main_addr(lines, calls): - exit_calls = [index for index, line in enumerate(calls) if exit_addr in line[1]] - if len(exit_calls) != 1: + filter_calls = lambda dis: ((i, x) for i, x in enumerate(dis) if call_instructions & set(x.groups)) + + if self.arch in ['ppc', 'powerpc', 'powerpc64']: + filter_calls = lambda dis: ((i, x) for i, x in enumerate(dis) if x.mnemonic in ['bctrl', 'bl']) + # FIXME: `bal` was not included in CS_GRP_CALL.
This is fixed on capstone v6.alpha + elif self.arch in ['mips', 'mips64']: + filter_calls = lambda dis: ((i, x) for i, x in enumerate(dis) if x.mnemonic in ['bal', 'jalr']) + + calls = list(filter_calls(dis)) + + def find_ret_main_addr(caller_dis, calls): + call_to_main = -1 + for i, insn in calls: + if cs.CS_GRP_CALL in insn.groups and insn.operands[imm_index].imm == exit_addr: + break + call_to_main = i + else: return 0 - call_to_main = calls[exit_calls[0] - 1] - return_from_main = lines[call_to_main[0] + call_return_offset].lstrip() - return_from_main = int(return_from_main[ : return_from_main.index(':') ], 16) - return return_from_main + return_from_main = caller_dis[call_to_main + call_return_offset] + return return_from_main.address # Starting with glibc-2.34 calling `main` is split out into `__libc_start_call_main` - ret_addr = find_ret_main_addr(lines, calls) + ret_addr = find_ret_main_addr(dis, calls) # Pre glibc-2.34 case - `main` is called directly if ret_addr: return ret_addr # `__libc_start_main` -> `__libc_start_call_main` -> `main` # Find a direct call which calls `exit` once. That's probably `__libc_start_call_main`. - direct_call_pattern = re.compile(r'['+r'|'.join(call_instructions)+r']\s+(0x[0-9a-zA-Z]+)') - for line in calls: - match = direct_call_pattern.search(line[1]) - if not match: - continue + for _, insn in calls: + op = insn.operands[imm_index] + if op.type != cs.CS_OP_IMM: continue + + target_addr = op.imm + callee_dis = list(self.cs_disasm(md, target_addr, callee_size)) + callee_calls = filter_calls(callee_dis) - target_addr = int(match.group(1), 0) - # `__libc_start_call_main` is usually smaller than `__libc_start_main`, so - # we might disassemble a bit too much, but it's a good dynamic estimate. 
- callee_lines = self.disasm(target_addr, self.functions['__libc_start_main'].size).split('\n') - callee_calls = [(index, line) for index, line in enumerate(callee_lines) if set(line.split()) & call_instructions] - ret_addr = find_ret_main_addr(callee_lines, callee_calls) + ret_addr = find_ret_main_addr(callee_dis, callee_calls) if ret_addr: return ret_addr return 0