Terminal bench manual trigger github action (#2312)

arjun37602 · Arjun Balaji · web-flow · commit 292f678c8de3 · 2025-07-16T11:18:08.000-07:00
* half way thru tb set up

* tb yaml

* TB

* tb bench scripts

* trigger on push change

* typo

* trigger push

* python v

* run local branch instead of prod

* unnecessary env variables removed

* allow log inspection

* implement _env

* npm, rust, cargo, clone github directly

* Update setup_amazon_q.sh

* clean up disk + check amt of free space

* get gcc dependencies

* big timeout

* pipe config files from gh runner to docker

* configure env + working with sso

* changed default

* default to latest

* fixing qchat location + forcing correct auth

* set env vars not just config file

* env vars all caps

* confirm env variables are visible

* roleName + code simplify

* environment variable fix + local working

* use the correct git hash

* larger runner for storage

* use full hash instead of short hash

* fail if hash invalid

* Force to run on manual trigger

* responding to PR comments

---------

Co-authored-by: Arjun Balaji <arjbal@amazon.com>
diff --git a/.github/workflows/terminal-bench.yaml b/.github/workflows/terminal-bench.yaml
@@ -0,0 +1,84 @@
+# Manually triggered workflow that runs terminal-bench against the Q CLI.
+# Template taken from https://github.com/actions/starter-workflows/blob/main/automation/manual.yml for reference
+
+name: Terminal-Bench
+
+# Controls when the action will run. Workflow runs when manually triggered using the UI
+on:
+  workflow_dispatch:
+    inputs:
+      name:  # NOTE(review): this input is not referenced by any step below
+        description: 'Run terminal-bench workflow to test Q CLI in real terminal environments.' 
+        default: 'all'
+        required: true
+        type: string
+        
+jobs:
+  run-benchmark:
+    # larger runner; avoids disk storage issues
+    runs-on: ubuntu-latest-8-cores
+    # job-level env vars; main.py reads them from os.environ and passes them on to the agent's env
+    env:
+      CHAT_DOWNLOAD_ROLE_ARN: ${{ secrets.CHAT_DOWNLOAD_ROLE_ARN }}
+      CHAT_BUILD_BUCKET_NAME: ${{ secrets.CHAT_BUILD_BUCKET_NAME }}
+    permissions:
+      id-token: write  # lets the job request the OIDC token used by the AWS credentials step
+      contents: read
+    steps:
+
+    # clear unnecessary storage to ensure docker containers have space; df -h logs what is left
+    - name: Cleanup and free disk space
+      run: |
+        sudo rm -rf /usr/share/dotnet
+        sudo rm -rf /opt/ghc
+        sudo rm -rf "/usr/local/share/boost"
+        sudo rm -rf "$AGENT_TOOLSDIRECTORY"
+        sudo rm -rf /usr/local/lib/android
+        sudo rm -rf /usr/share/swift
+        sudo apt-get clean
+        df -h
+
+    - name: Checkout repository
+      uses: actions/checkout@v4
+
+    # Captures the full commit hash; setup_amazon_q.sh uses it to build the S3 prefix of the qchat build
+    - name: Set git hash
+      run: |
+        if [ -n "$GITHUB_SHA" ]; then
+          git_hash=$(git rev-parse "$GITHUB_SHA")
+        else
+          git_hash="latest"
+        fi
+        # append to the GITHUB_ENV file so that later steps see GIT_HASH as an env var
+        echo "GIT_HASH=$git_hash" >> $GITHUB_ENV
+        echo "Git hash set to: $git_hash"
+
+    - name: Set up Python
+      uses: actions/setup-python@v4
+      with:
+        python-version: '3.13'
+
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install terminal-bench
+    
+    # OIDC enabled for github for ArjunPersonal; the role to assume comes from the AWS_TB_ROLE secret
+    - name: Configure AWS credentials
+      uses: aws-actions/configure-aws-credentials@v4
+      with:
+        role-to-assume: ${{ secrets.AWS_TB_ROLE }}
+        aws-region: us-east-1
+
+    - name: Run terminal benchmark
+      run: |
+        cd terminal-bench-test
+        tb run --agent-import-path main:AmazonQCLIAgent --dataset-name terminal-bench-core --dataset-version head
+
+    # always() uploads the results even when the run fails, to allow for easy log inspection
+    - name: Upload results
+      if: always()
+      uses: actions/upload-artifact@v4
+      with:
+        name: benchmark-results
+        path: terminal-bench-test/runs/
diff --git a/terminal-bench-test/main.py b/terminal-bench-test/main.py
@@ -0,0 +1,51 @@
+import os
+import shlex
+from pathlib import Path
+
+from terminal_bench.agents.installed_agents.abstract_installed_agent import (
+    AbstractInstalledAgent,
+)
+from terminal_bench.terminal.models import TerminalCommand
+
+
+class AmazonQCLIAgent(AbstractInstalledAgent):
+
+    @staticmethod
+    def name() -> str:
+        return "Amazon Q CLI"
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    """
+    Makes necessary env vars available in docker containers
+    """
+    @property
+    def _env(self) -> dict[str, str]:
+        # SIGv4 = 1 for AWS credentials
+        env = {
+            "AMAZON_Q_SIGV4": 1,
+            "AWS_ACCESS_KEY_ID": os.environ.get("AWS_ACCESS_KEY_ID", ''),
+            "AWS_SECRET_ACCESS_KEY": os.environ.get("AWS_SECRET_ACCESS_KEY", ''),
+            "AWS_SESSION_TOKEN": os.environ.get("AWS_SESSION_TOKEN", ''),
+            "GIT_HASH": os.environ.get("GIT_HASH", ''),
+            "CHAT_DOWNLOAD_ROLE_ARN": os.environ.get("CHAT_DOWNLOAD_ROLE_ARN", ''),
+            "CHAT_BUILD_BUCKET_NAME": os.environ.get("CHAT_BUILD_BUCKET_NAME", '')
+        }
+        return env
+
+    @property
+    def _install_agent_script_path(self) -> os.PathLike:
+        return Path(__file__).parent / "setup_amazon_q.sh"
+
+    def _run_agent_commands(self, task_description: str) -> list[TerminalCommand]:
+        escaped_description = shlex.quote(task_description)
+        
+        return [
+        # q chat with 30 min max timeout and also we wait on input. Using qchat because of sigv4. 
+            TerminalCommand(
+                command=f"qchat chat --no-interactive --trust-all-tools {escaped_description}",
+                max_timeout_sec=1800, 
+                block=True,
+            )
+        ]
diff --git a/terminal-bench-test/setup_amazon_q.sh b/terminal-bench-test/setup_amazon_q.sh
@@ -0,0 +1,59 @@
+#!/bin/bash
+set -e
+# Fail early with a clear message instead of a confusing AWS error further down.
+[ -n "$GIT_HASH" ] && [ -n "$CHAT_DOWNLOAD_ROLE_ARN" ] && [ -n "$CHAT_BUILD_BUCKET_NAME" ] \
+  || { echo "ERROR: GIT_HASH, CHAT_DOWNLOAD_ROLE_ARN and CHAT_BUILD_BUCKET_NAME must be set"; exit 1; }
+apt-get update
+apt-get install -y curl wget unzip jq
+
+echo "Installing AWS CLI..."
+curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
+unzip -q awscliv2.zip
+./aws/install --bin-dir /usr/local/bin --install-dir /usr/local/aws-cli
+
+# Create the AWS credentials and config files from environment variables
+mkdir -p ~/.aws
+cat > ~/.aws/credentials << EOF
+[default]
+aws_access_key_id = ${AWS_ACCESS_KEY_ID}
+aws_secret_access_key = ${AWS_SECRET_ACCESS_KEY}
+aws_session_token = ${AWS_SESSION_TOKEN}
+EOF
+chmod 600 ~/.aws/credentials
+cat > ~/.aws/config << EOF
+[default]
+region = us-east-1
+EOF
+chmod 600 ~/.aws/config
+
+# Assume role and capture temporary credentials (needed for s3 access). Errors are not hidden; jq -e rejects missing fields.
+echo "Assuming AWS s3 role"
+TEMP_CREDENTIALS=$(aws sts assume-role --role-arn "${CHAT_DOWNLOAD_ROLE_ARN}" --role-session-name S3AccessSession)
+QCHAT_ACCESSKEY=$(echo "$TEMP_CREDENTIALS" | jq -er '.Credentials.AccessKeyId')
+Q_SECRET_ACCESS_KEY=$(echo "$TEMP_CREDENTIALS" | jq -er '.Credentials.SecretAccessKey')
+Q_SESSION_TOKEN=$(echo "$TEMP_CREDENTIALS" | jq -er '.Credentials.SessionToken')
+
+# Download specific build from S3 based on commit hash
+echo "Downloading Amazon Q CLI build from S3..."
+S3_PREFIX="main/${GIT_HASH}/x86_64-unknown-linux-musl"
+echo "Downloading qchat.zip from s3://.../${S3_PREFIX}/qchat.zip"
+
+# No fallback build: if nothing exists for this hash the copy fails and set -e aborts.
+AWS_ACCESS_KEY_ID="$QCHAT_ACCESSKEY" AWS_SECRET_ACCESS_KEY="$Q_SECRET_ACCESS_KEY" AWS_SESSION_TOKEN="$Q_SESSION_TOKEN" \
+  aws s3 cp "s3://${CHAT_BUILD_BUCKET_NAME}/${S3_PREFIX}/qchat.zip" ./qchat.zip --region us-east-1
+
+echo "Extracting qchat.zip..."
+unzip -q qchat.zip
+
+# Copy the qchat executable to /usr/local/bin so it is on PATH, and symlink it as q
+if cp qchat /usr/local/bin/ && chmod +x /usr/local/bin/qchat; then
+    ln -sf /usr/local/bin/qchat /usr/local/bin/q
+    echo "qchat installed successfully"
+else
+    echo "ERROR: Failed to install qchat"
+    exit 1
+fi
+
+echo "Cleaning q zip"
+rm -f qchat.zip
+rm -rf qchat