# Patch summary (free text ahead of the first "diff --git" header).
#
#   .github/workflows/build.yml          new "Build" workflow: on push / pull_request it runs the
#                                        local build-cxx-common action inside the cxx-common
#                                        vcpkg-builder container.
#   actions/build-cxx-common/action.yml  new composite action: checks out lifting-bits/cxx-common,
#                                        clones and bootstraps vcpkg, registers the GitHub NuGet
#                                        source, installs sqlite3 + cpprestsdk, then runs
#                                        build_dependencies.sh for llvm-<matrix.llvm>.
#   datasets/fetch_amp_challengebins.sh  adds --yes to the gpg decrypt call.
#   tool_run_scripts/anvill.py           per-file Ghidra "-processor" overrides taken from the
#                                        "language_id_overrides" rule, an extra FixGlobalRegister
#                                        post-script, "-remove_next_pc_assignments" on a second
#                                        command line, a one-off initialize_ghidra_cache() run, and
#                                        input discovery by executable bit instead of *.elf / *.o.
#   tool_run_scripts/stats.py            timed-out runs are no longer counted as failures.
#   tool_run_scripts/toolcmd.py          raw-string regexes; ToolCmd becomes an ABC with an
#                                        abstract make_tool_cmd().
#
# NOTE(review): line breaks, indentation and blank context lines were restored from a
# whitespace-collapsed copy of this patch - check the context lines against the target files
# before applying. The "index" blob ids are the original ones and do not describe edited content.
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
new file mode 100644
index 0000000..8f89c69
--- /dev/null
+++ b/.github/workflows/build.yml
@@ -0,0 +1,49 @@
+#
+# Copyright (c) 2019-present, Trail of Bits, Inc.
+# All rights reserved.
+#
+# This source code is licensed in accordance with the terms specified in
+# the LICENSE file found in the root directory of this source tree.
+#
+
+name: Build
+
+on:
+  # Run this workflow once every 6 hours against the master branch
+  #schedule:
+  #  - cron: "0 */6 * * *"
+
+  push:
+    branches:
+      - "*"
+
+    tags:
+      - "*"
+
+  pull_request:
+    branches:
+      - "*"
+
+jobs:
+  build_linux:
+    strategy:
+      matrix:
+        image:
+          - { name: "ubuntu", tag: "22.04" }
+        llvm: ["16"]
+        cxxcommon_version: ["v0.3.2"]
+
+    runs-on: ubuntu-22.04
+    container:
+      image: docker.pkg.github.com/lifting-bits/cxx-common/vcpkg-builder-${{ matrix.image.name }}:${{ matrix.image.tag }}
+      credentials:
+        username: ${{ github.actor }}
+        password: ${{ secrets.GITHUB_TOKEN }}
+
+    steps:
+      - uses: actions/checkout@v3
+      - uses: ./actions/build-cxx-common
+        with:
+          # NOTE(review): action.yml marks `target-export-path` as required, but it is
+          # not passed here, so `--export-dir` gets an empty value. Confirm the path.
+          gh-token: ${{ secrets.GITHUB_TOKEN }}
diff --git a/actions/build-cxx-common/action.yml b/actions/build-cxx-common/action.yml
new file mode 100644
index 0000000..173eca7
--- /dev/null
+++ b/actions/build-cxx-common/action.yml
@@ -0,0 +1,54 @@
+name: "Lifting tools CXX-Common"
+description: "Build cxx-common latest for lifting-tools"
+inputs:
+  target-export-path: # id of input
+    description: "where to export the cxx-common build"
+    required: true
+  gh-token:
+    description: "token for the target nuget cache"
+    required: true
+runs:
+  using: "composite"
+  steps:
+    - uses: actions/checkout@v3
+      with:
+        submodules: true
+        path: cxx-common
+        repository: "lifting-bits/cxx-common"
+
+    - name: Initialize vcpkg
+      shell: bash
+      working-directory: cxx-common
+      run: |
+        { read -r vcpkg_repo_url && read -r vcpkg_commit; } <./vcpkg_info.txt || exit 1
+        git clone "${vcpkg_repo_url}"
+        git -C vcpkg checkout "${vcpkg_commit}"
+        echo "VCPKG_ROOT=$(pwd)/vcpkg" >> $GITHUB_ENV
+        echo "VCPKG_DISABLE_METRICS=1" >> $GITHUB_ENV
+        echo "VCPKG_USE_NUGET_CACHE=1" >> $GITHUB_ENV
+        echo "VCPKG_BINARY_SOURCES=clear;nuget,GitHub,readwrite;nugettimeout,3601" >> $GITHUB_ENV
+        echo "VCPKG_NUGET_REPOSITORY=''" >> $GITHUB_ENV
+    - name: "Bootstrap vcpkg"
+      shell: "bash"
+      working-directory: cxx-common
+      run: |
+        ./vcpkg/bootstrap-vcpkg.sh
+    - name: "Setup NuGet Credentials"
+      shell: "bash"
+      working-directory: cxx-common
+      run: |
+        mono `./vcpkg/vcpkg fetch nuget | tail -n 1` \
+          sources add \
+          -source "https://nuget.pkg.github.com/lifting-bits/index.json" \
+          -storepasswordincleartext \
+          -name "GitHub" \
+          -username "lifting-bits" \
+          -password "${{ inputs.gh-token }}"
+        mono `./vcpkg/vcpkg fetch nuget | tail -n 1` \
+          setapikey "${{ inputs.gh-token }}" \
+          -source "https://nuget.pkg.github.com/lifting-bits/index.json"
+
+        ./vcpkg/vcpkg install sqlite3 cpprestsdk --debug
+
+        # Expects the calling job to define matrix.llvm; the export path is quoted so an empty input cannot swallow the next argument.
+        ./build_dependencies.sh --release --export-dir "${{ inputs.target-export-path }}" llvm-${{ matrix.llvm }} --debug
diff --git a/datasets/fetch_amp_challengebins.sh b/datasets/fetch_amp_challengebins.sh
index 6beb2e1..0b26a52 100755
--- a/datasets/fetch_amp_challengebins.sh
+++ b/datasets/fetch_amp_challengebins.sh
@@ -37,7 +37,7 @@ while [[ $# -gt 0 ]] ; do
 done
 
 curl -LO https://tob-amp-share.nyc3.digitaloceanspaces.com/challenge-binaries-latest.tar.xz.gpg
-gpg --no-tty --batch --pinentry-mode loopback --passphrase "${TOB_AMP_PASSPHRASE}" \
+gpg --no-tty --batch --yes --pinentry-mode loopback --passphrase "${TOB_AMP_PASSPHRASE}" \
     -o challenge-binaries-latest.tar.xz \
     --decrypt challenge-binaries-latest.tar.xz.gpg
 rm -rf challenge-binaries-latest.tar.xz.gpg
diff --git a/tool_run_scripts/anvill.py b/tool_run_scripts/anvill.py
index 5ac0c03..74b7d23 100755
--- a/tool_run_scripts/anvill.py
+++ b/tool_run_scripts/anvill.py
@@ -35,8 +35,16 @@
 
 
 class AnvillGhidraCmd(ToolCmd):
+    def __init__(self, tool, infile, outdir, source_base, index, stats, language_overrides):
+        # Set before super().__init__(): presumably the base constructor builds the
+        # command via make_tool_cmd(), which reads lang_overrides - TODO confirm.
+        self.lang_overrides = language_overrides
+        super().__init__(tool, infile, outdir, source_base, index, stats)
+
     def make_tool_cmd(self):
         f = self.infile.stem
+        fullname = self.infile.name
+
         jsonfile = f"{self.index}-{f}.pb"
         self.tmpout = self.outdir.joinpath("work").joinpath(jsonfile)
 
@@ -48,10 +56,13 @@ def make_tool_cmd(self):
             "/tmp",
             f"dummy_ghidra_proj{self.index}-{f}",
             "-readOnly",
-            "-deleteProject",
-            "-import",
+            "-deleteProject"]
+            + (["-processor", self.lang_overrides[fullname]] if fullname in self.lang_overrides else [])
+            + ["-import",
             str(self.infile),
             "-postScript",
+            "FixGlobalRegister",
+            "-postScript",
             "anvillHeadlessExportScript",
             str(self.tmpout),
         ])
@@ -146,6 +157,7 @@ def make_tool_cmd(self):
             str(self.tmpout),
             "-stats_out",
             str(self.stats_file),
+            "-remove_next_pc_assignments",
             "-logtostderr",
         ])
 
@@ -201,11 +213,40 @@ def save(self):
             reprofile.write(" ".join(self.cmd))
             reprofile.write("\n")
 
+# Run the script with no input to trigger script compilation so it gets saved in the cache
+def initialize_ghidra_cache(ghidra_dir):
+    """Run analyzeHeadless once with no input file so the export script is compiled and cached.
+
+    ghidra_dir is the Ghidra install directory; the launcher is looked up at
+    <ghidra_dir>/support/analyzeHeadless. Exits with status 1 if it cannot be started.
+    """
+    args = [os.path.join(ghidra_dir, "support", "analyzeHeadless")]
+    args.extend([
+        "/tmp",
+        "dummy_ghidra_proj_init",
+        "-readOnly",
+        "-deleteProject",
+        "-preScript",
+        "anvillHeadlessExportScript",
+    ])
+
+    try:
+        # Neither check= nor timeout= is passed, so CalledProcessError and
+        # TimeoutExpired cannot be raised here; only a failed launch is fatal.
+        result = subprocess.run(args=args)
+    except OSError as oe:
+        log.error(f"Could not initialize ghidra: {oe}")
+        sys.exit(1)
+
+    # A non-zero exit used to be ignored silently; keep going but leave a trace.
+    if result.returncode != 0:
+        log.warning(f"Ghidra cache initialization exited with code {result.returncode}")
+
 
-def run_anvill_ghidra(ghidra_dir, output_dir, failonly, source_path, stats, input_and_idx):
+def run_anvill_ghidra(ghidra_dir, output_dir, failonly, source_path, stats, language_id_overrides, input_and_idx):
     idx, input_file = input_and_idx
     cmd = AnvillGhidraCmd(ghidra_dir, input_file, output_dir,
-                          source_path, idx, stats)
+                          source_path, idx, stats, language_id_overrides)
 
     retcode = cmd.run()
     log.debug(f"Anvill run returned {retcode}")
@@ -251,7 +292,7 @@ def get_anvill_version(cmd):
         log.error(f"Could not get anvill version: {cpe}")
         sys.exit(1)
     except subprocess.TimeoutExpired as tme:
-        log.error(f"Could not get anvill version: timeout execption")
+        log.error(f"Could not get anvill version: timeout exception")
         sys.exit(1)
 
     return rt.stdout.decode("utf-8")
@@ -261,25 +302,33 @@ def anvill_python_main(args, source_path, dest_path):
 
     num_cpus = os.cpu_count()
     anvill_stats = Stats()
+
+    language_id_overrides = {}
+
     if args.test_options:
         with open(args.test_options, "r") as rf:
             anvill_stats.load_rules(rf)
+        if "language_id_overrides" in anvill_stats.rules:
+            language_id_overrides = anvill_stats.rules['language_id_overrides']
 
     # get all the bitcode
     log.info(f"Listing files in {str(source_path)}")
-    sources = list(source_path.rglob("*.elf"))
-    # Sometimes we forget the .elf suffix
-    sources.extend(list(source_path.rglob("*.o")))
-    log.info(f"Found {len(sources)} ELF files")
+    # Filter for files that are executable
+    sources = [source for source in source_path.rglob("*") if source.is_file() and os.access(source, os.X_OK) and not source.name.startswith(".")]
+
+    log.info(f"Found {len(sources)} Executable files")
 
     # load test to ignore
     anvill_stats.set_stat("start_time", str(datetime.now()))
 
     max_items_python = len(sources)
 
+    # initialize ghidra cache to pre-compile the script
+    initialize_ghidra_cache(os.path.expanduser(args.ghidra_install_dir))
+
     # workspace for anvill-python
     apply_anvill_ghidra = partial(
-        run_anvill_ghidra, os.path.expanduser(args.ghidra_install_dir), dest_path, args.only_fails, source_path, anvill_stats)
+        run_anvill_ghidra, os.path.expanduser(args.ghidra_install_dir), dest_path, args.only_fails, source_path, anvill_stats, language_id_overrides)
 
     with ThreadPool(num_cpus) as p:
         with tqdm(total=max_items_python) as pbar:
diff --git a/tool_run_scripts/stats.py b/tool_run_scripts/stats.py
index 26df3ee..8b24038 100644
--- a/tool_run_scripts/stats.py
+++ b/tool_run_scripts/stats.py
@@ -80,10 +80,12 @@ def print_fails(self, fail_count=5, output=None, verbose=True):
 
     def get_fail_count(self):
+        """Return program runs that were not ignored, did not succeed and did not time out."""
         success_runs = len(self.stats.get("output.success", []))
+        timed_out_runs = self.stats.get("program_timeouts", 0)
         program_runs = self.stats.get("program_runs", 0)
         ignored_outputs = len(self.stats.get("outputignore_success", []))
         ignored_outputs += len(self.stats.get("outputignore_fail", []))
-        return (program_runs - ignored_outputs) - success_runs
+        return (program_runs - ignored_outputs) - (success_runs + timed_out_runs)
 
     def print_stats(self, output=None):
         # emit start/end time
diff --git a/tool_run_scripts/toolcmd.py b/tool_run_scripts/toolcmd.py
index 4310642..624a053 100644
--- a/tool_run_scripts/toolcmd.py
+++ b/tool_run_scripts/toolcmd.py
@@ -1,3 +1,4 @@
+from abc import ABC, abstractmethod
 import re
 import logging
 import signal
@@ -8,12 +9,12 @@
 log.addHandler(logging.StreamHandler())
 log.setLevel(logging.DEBUG)
 
-FILE_NAME_RE = re.compile("([^/\s]+\.[^/\s]+:\d+)")
-PYTHON_ERROR_RE = re.compile('([^/\s]+\.py)", line (\d+)')
-ASAN_ERROR_RE = re.compile('AddressSanitizer: [a-zA-Z\-]+ .*/([^:]+:[\d]+)')
-CLANG_ERROR_RE = re.compile("error: ([\w']+) *([\w']*) *([\w']+) *([\w']+)")
+FILE_NAME_RE = re.compile(r"([^/\s]+\.[^/\s]+:\d+)")
+PYTHON_ERROR_RE = re.compile(r'([^/\s]+\.py)", line (\d+)')
+ASAN_ERROR_RE = re.compile(r'AddressSanitizer: [a-zA-Z\-]+ .*/([^:]+:[\d]+)')
+CLANG_ERROR_RE = re.compile(r"error: ([\w']+) *([\w']*) *([\w']+) *([\w']+)")
 
-class ToolCmd:
+class ToolCmd(ABC):
     def __init__(self, tool, infile, outdir, source_base, index, stats):
         self.source_base = source_base
         self.index = index
@@ -32,8 +33,9 @@ def set_output(self, rc, out, err):
         self.out = out
         self.err = err
 
+    @abstractmethod
     def make_tool_cmd(self):
-        raise RuntimeError("Please override make_tool_cmd")
+        pass
 
     def clang_traceback(self, msg):
         if not msg:
@@ -124,7 +126,7 @@ def __del__(self):
             log.debug(f"Unlinking on delete {self.tmpout}")
             try:
                 os.unlink(self.tmpout)
-            except FileNotFoundError as fnf:
+            except FileNotFoundError:
                 log.debug(f"Tried to delete a file that doesn't exist: {self.tmpout}")
 
     def run(self):