Skip to content

[experiment]: Use capstone to implement ELF.libc_start_main_return #2580

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ The table below shows which release corresponds to each branch, and what date th

## 5.0.0 (`dev`)

- [#2580][2580] Use capstone to implement `ELF.libc_start_main_return`
- [#2419][2419] riscv: avoid compressed instructions (if you need compressed, use .option rvc)
- [#2551][2551] Detect when kitty is being used as terminal
- [#2519][2519] Drop Python 2.7 support / Require Python 3.10
Expand All @@ -94,6 +95,7 @@ The table below shows which release corresponds to each branch, and what date th
- [#2575][2575] Detect when Terminator is being used as terminal
- [#2578][2578] Add gnome-terminal, Alacritty, Tilix for run_in_new_terminal

[2580]: https://github.com/Gallopsled/pwntools/pull/2580
[2419]: https://github.com/Gallopsled/pwntools/pull/2419
[2551]: https://github.com/Gallopsled/pwntools/pull/2551
[2519]: https://github.com/Gallopsled/pwntools/pull/2519
Expand Down
70 changes: 63 additions & 7 deletions pwnlib/asm.py
Original file line number Diff line number Diff line change
Expand Up @@ -388,10 +388,10 @@ def _bfdname():
'sparc64' : 'elf64-sparc',
}

if arch in bfdnames:
return bfdnames[arch]
else:
name = bfdnames.get(arch)
if not name:
raise Exception("Cannot find bfd name for architecture %r" % arch)
return name


def _bfdarch():
Expand All @@ -409,10 +409,7 @@ def _bfdarch():
'loongarch64': 'loongarch64'
}

if arch in convert:
return convert[arch]

return arch
return convert.get(arch, arch)

def _run(cmd, stdin = None):
log.debug('%s', subprocess.list2cmdline(cmd))
Expand Down Expand Up @@ -1015,3 +1012,62 @@ def disasm(data, vma = 0, byte = True, offset = True, instructions = True):
lines.append(line)

return re.sub(',([^ ])', r', \1', '\n'.join(lines))

@LocalContext
def get_cs_disassembler(eabi=None):
    """Build a capstone disassembler for the current context.

    The target is taken from ``context.arch``, ``context.endianness`` and
    ``context.bits``.

    Arguments:
        eabi(str): Optional EABI flavour.  When it is ``'hf'`` and
            ``context.arch`` is ``'arm'``, the ``'armhf'`` table entry is
            selected instead of the ``'arm'`` one.

    Returns:
        A ``capstone.Cs`` instance with ``detail`` switched on.

    Raises:
        Exception: If the context's architecture or word size has no
            capstone mapping in the table below.
    """
    # Imported lazily so that capstone is only needed when this is called.
    import capstone as cs

    endian_mode = {
        'big': cs.CS_MODE_BIG_ENDIAN,
        'little': cs.CS_MODE_LITTLE_ENDIAN,
    }[context.endianness]

    width_mode = {
        16: cs.CS_MODE_16,
        32: cs.CS_MODE_32,
        64: cs.CS_MODE_64,
    }.get(context.bits)
    if width_mode is None:
        # Word sizes outside the table (e.g. an 8-bit context) get the same
        # "unsupported" error as an unknown architecture, not a KeyError.
        raise Exception(f"unsupported {context.arch} for capstone")

    # These two constants exist under different names depending on the
    # installed capstone (presumably renamed between releases).  Try the
    # first spelling and fall back to the second; the comparison is against
    # None because architecture constants may legitimately be 0.
    arch_aarch64 = getattr(cs, 'CS_ARCH_AARCH64', None)
    if arch_aarch64 is None:
        arch_aarch64 = cs.CS_ARCH_ARM64

    arch_systemz = getattr(cs, 'CS_ARCH_SYSTEMZ', None)
    if arch_systemz is None:
        arch_systemz = cs.CS_ARCH_SYSZ

    # context.arch -> (capstone architecture, capstone mode flags)
    params = {
        'i386': (cs.CS_ARCH_X86, width_mode),
        'amd64': (cs.CS_ARCH_X86, width_mode),
        'thumb': (cs.CS_ARCH_ARM, cs.CS_MODE_THUMB | endian_mode),
        'arm': (cs.CS_ARCH_ARM, cs.CS_MODE_ARM | endian_mode),
        'aarch64': (arch_aarch64, cs.CS_MODE_ARM | endian_mode),
        'armhf': (cs.CS_ARCH_ARM, cs.CS_MODE_THUMB | endian_mode),
        'mips': (cs.CS_ARCH_MIPS, cs.CS_MODE_32 | endian_mode),
        'mips64': (cs.CS_ARCH_MIPS, cs.CS_MODE_64 | endian_mode),
        'sparc': (cs.CS_ARCH_SPARC, cs.CS_MODE_32 | endian_mode),
        'sparc64': (cs.CS_ARCH_SPARC, cs.CS_MODE_64 | endian_mode),
        'ppc': (cs.CS_ARCH_PPC, width_mode | endian_mode),
        'powerpc': (cs.CS_ARCH_PPC, endian_mode | cs.CS_MODE_32),
        'powerpc64': (cs.CS_ARCH_PPC, endian_mode | cs.CS_MODE_64),
        'em_s390': (arch_systemz, cs.CS_MODE_BIG_ENDIAN | cs.CS_MODE_64),
        # Not wired up (yet):
        #'ia64': None,
        #'m68k': cs.CS_ARCH_M68K,
        #'xcore': cs.CS_ARCH_XCORE,
        #'tms320c64x': cs.CS_ARCH_TMS320C64X,
        #'m680x': cs.CS_ARCH_M680X,
        #'evm': cs.CS_ARCH_EVM,
        #'mos65xx': cs.CS_ARCH_MOS65XX,
        #'bpf': cs.CS_ARCH_BPF,
        #'riscv': cs.CS_ARCH_RISCV,
        #'tricore': cs.CS_ARCH_TRICORE,
        #'wasm': cs.CS_ARCH_WASM,
        #'sh': cs.CS_ARCH_SH,
    }

    arch_name = context.arch
    if arch_name == 'arm' and eabi == 'hf':
        arch_name = 'armhf'

    param = params.get(arch_name)
    if param is None:
        raise Exception(f"unsupported {context.arch} for capstone")

    cs_arch, cs_mode = param
    md = cs.Cs(cs_arch, cs_mode)
    # Callers inspect instruction groups and operands, which capstone only
    # fills in when detail mode is enabled.
    md.detail = True
    return md
2 changes: 2 additions & 0 deletions pwnlib/context/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -412,6 +412,7 @@ class ContextType(object):
'avr': little_8,
'amd64': little_64,
'arm': little_32,
'armhf': little_32,
'cris': little_32,
'i386': little_32,
'ia64': big_64,
Expand All @@ -427,6 +428,7 @@ class ContextType(object):
's390': big_32,
'sparc': big_32,
'sparc64': big_64,
'em_s390': big_64,
'thumb': little_32,
'vax': little_32,
'none': {},
Expand Down
91 changes: 62 additions & 29 deletions pwnlib/elf/elf.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
import re
import subprocess
import tempfile
import capstone as cs

from io import BytesIO

Expand Down Expand Up @@ -1143,6 +1144,12 @@ def _populate_kernel_version(self):

self.config['version'] = self.version

def cs_disasm(self, md: cs.Cs, address, n_bytes):
    """Disassemble ``n_bytes`` of this ELF starting at ``address``.

    Arguments:
        md(capstone.Cs): Disassembler used to decode the bytes.
        address(int): Address to start reading from.  When ``self.arch``
            is ``'arm'`` and the address is odd, it is lowered by one
            before reading (presumably the low bit is a Thumb marker
            rather than part of the location -- confirm).
        n_bytes(int): Number of bytes to read and decode.

    Returns:
        The result of ``md.disasm`` over the bytes read, using the
        (possibly adjusted) address as the base address.
    """
    odd_arm_address = self.arch == 'arm' and address & 1
    start = address - 1 if odd_arm_address else address
    code = self.read(start, n_bytes)
    return md.disasm(code, start)

@property
def libc_start_main_return(self):
""":class:`int`: Address of the return address into __libc_start_main from main.
Expand All @@ -1157,62 +1164,88 @@ def libc_start_main_return(self):
to list all calls inside __libc_start_main, find the call to exit
after the call to main and select the previous call.
"""
if '__libc_start_main' not in self.functions:
func = self.functions.get('__libc_start_main')
exit_addr = self.symbols.get('exit')
if not (func and exit_addr):
return 0

if 'exit' not in self.symbols:
return 0
# `__libc_start_call_main` is usually smaller than `__libc_start_main`,
# (except for powerpc which uses a bigger `generic_start_main`), so
# we might disassemble a bit too much, but it's a good dynamic estimate.
callee_size = func.size
# On most architectures the call target is the first operand (an immediate); s390 is the exception
imm_index = 0
eabi = None

# If there's no delay slot, execution continues on the next instruction after a call.
call_return_offset = 1
call_instructions = set([cs.CS_GRP_CALL])
if self.arch in ['arm', 'thumb']:
call_instructions = set(['blx', 'bl'])
# FIXME: I have no idea why setting self.arch = 'armhf' does not work
if b'armhf' in self.linker: eabi = 'hf'
if exit_addr & 1: exit_addr -= 1
elif self.arch == 'aarch64':
call_instructions = set(['blr', 'bl'])
pass
elif self.arch in ['mips', 'mips64']:
call_instructions = set(['bal', 'jalr'])
# Account for the delay slot.
call_return_offset = 2
elif self.arch in ['i386', 'amd64', 'ia64']:
call_instructions = set(['call'])
pass
elif self.arch in ['ppc', 'powerpc', 'powerpc64']:
callee_size *= 2
# powerpc often jumps to the local entry point after TOC setup
if exit_addr & 1 == 0: exit_addr += 8
pass
elif self.arch in ['em_s390', 's390']:
imm_index = 1
pass
else:
log.error('Unsupported architecture %s in ELF.libc_start_main_return', self.arch)
return 0

lines = self.functions['__libc_start_main'].disasm().split('\n')
exit_addr = hex(self.symbols['exit'])
calls = [(index, line) for index, line in enumerate(lines) if set(line.split()) & call_instructions]
from pwnlib.asm import get_cs_disassembler
md = get_cs_disassembler(arch=self.arch, endian=self.endian, bits=self.bits, eabi=eabi)
dis = list(self.cs_disasm(md, func.address, func.size))

def find_ret_main_addr(lines, calls):
exit_calls = [index for index, line in enumerate(calls) if exit_addr in line[1]]
if len(exit_calls) != 1:
filter_calls = lambda dis: ((i, x) for i, x in enumerate(dis) if call_instructions & set(x.groups))

if self.arch in ['ppc', 'powerpc', 'powerpc64']:
filter_calls = lambda dis: ((i, x) for i, x in enumerate(dis) if x.mnemonic in ['bctrl', 'bl'])
# FIXME: `bal` was not included in CS_GRP_CALL. This is fixed on capstone v6.alpha
elif self.arch in ['mips', 'mips64']:
filter_calls = lambda dis: ((i, x) for i, x in enumerate(dis) if x.mnemonic in ['bal', 'jalr'])

calls = list(filter_calls(dis))

def find_ret_main_addr(caller_dis, calls):
call_to_main = -1
for i, insn in calls:
if cs.CS_GRP_CALL in insn.groups and insn.operands[imm_index].imm == exit_addr:
break
call_to_main = i
else:
return 0

call_to_main = calls[exit_calls[0] - 1]
return_from_main = lines[call_to_main[0] + call_return_offset].lstrip()
return_from_main = int(return_from_main[ : return_from_main.index(':') ], 16)
return return_from_main
return_from_main = caller_dis[call_to_main + call_return_offset]
return return_from_main.address

# Starting with glibc-2.34 calling `main` is split out into `__libc_start_call_main`
ret_addr = find_ret_main_addr(lines, calls)
ret_addr = find_ret_main_addr(dis, calls)
# Pre glibc-2.34 case - `main` is called directly
if ret_addr:
return ret_addr

# `__libc_start_main` -> `__libc_start_call_main` -> `main`
# Find a direct call which calls `exit` once. That's probably `__libc_start_call_main`.
direct_call_pattern = re.compile(r'['+r'|'.join(call_instructions)+r']\s+(0x[0-9a-zA-Z]+)')
for line in calls:
match = direct_call_pattern.search(line[1])
if not match:
continue
for _, insn in calls:
op = insn.operands[imm_index]
if op.type != cs.CS_OP_IMM: continue

target_addr = op.imm
callee_dis = list(self.cs_disasm(md, target_addr, callee_size))
callee_calls = filter_calls(callee_dis)

target_addr = int(match.group(1), 0)
# `__libc_start_call_main` is usually smaller than `__libc_start_main`, so
# we might disassemble a bit too much, but it's a good dynamic estimate.
callee_lines = self.disasm(target_addr, self.functions['__libc_start_main'].size).split('\n')
callee_calls = [(index, line) for index, line in enumerate(callee_lines) if set(line.split()) & call_instructions]
ret_addr = find_ret_main_addr(callee_lines, callee_calls)
ret_addr = find_ret_main_addr(callee_dis, callee_calls)
if ret_addr:
return ret_addr
return 0
Expand Down
Loading