Support V0 and V1 and remove unused mmlu

Yikun · Yikun · commit 0e32b8daa2db · 2025-06-03T00:45:16.000+08:00
Signed-off-by: Yikun Jiang &lt;yikunkero@gmail.com&gt;
diff --git a/.github/workflows/accuracy_report.yaml b/.github/workflows/accuracy_report.yaml
@@ -60,16 +60,6 @@ jobs:
         env:
           GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
 
-      - name: Query artifact run id for Llama-3.1-8B-Instruct V0 latest artifact
-        id: get_Llama_3_1_8B_Instruct_latest_run_id_V0
-        run: |
-          ARTIFACT_JSON=$(gh api "repos/${{ github.repository }}/actions/artifacts")
-          RUN_ID=$(echo "$ARTIFACT_JSON" | \
-            jq -r '[.artifacts[] | select(.name=="${{ github.event.inputs.vllm-ascend-version }}-Llama-3.1-8B-Instruct-V0-report")] | sort_by(.created_at) | last | .workflow_run.id')
-          echo "runid=$RUN_ID" >> "$GITHUB_OUTPUT"
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
       - name: Query artifact run id for Qwen3-8B-Base V0 latest artifact
         id: get_Qwen3_8B_Base_latest_run_id_V0
         run: |
@@ -98,15 +88,6 @@ jobs:
           repository: vllm-project/vllm-ascend
           run-id: ${{ steps.get_Qwen2_5_7B_Instruct_latest_run_id_V0.outputs.runid }}
 
-      - name: Download meta-llama/Llama-3.1-8B-Instruct Artifact
-        uses: actions/download-artifact@v4
-        with:
-          name: ${{ github.event.inputs.vllm-ascend-version }}-Llama-3.1-8B-Instruct-V0-report
-          path: ./docs/source/developer_guide/evaluation/accuracy_report
-          github-token: ${{ secrets.GITHUB_TOKEN }}
-          repository: vllm-project/vllm-ascend
-          run-id: ${{ steps.get_Llama_3_1_8B_Instruct_latest_run_id_V0.outputs.runid }}
-
       - name: Download Qwen/Qwen3-8B-Base Artifact
         uses: actions/download-artifact@v4
         with:
@@ -120,7 +101,6 @@ jobs:
         working-directory: ./docs/source/developer_guide/evaluation/accuracy_report
         run: |
           cat ./Qwen2.5-VL-7B-Instruct.md
-          cat ./Llama-3.1-8B-Instruct.md
           cat ./Qwen2.5-7B-Instruct.md
           cat ./Qwen3-8B-Base.md
       
@@ -139,12 +119,10 @@ jobs:
 
             - [Workflow run][1]
             - [Qwen2.5-7B-Instruct accuracy report][2]
-            - [Llama-3.1-8B-Instruct accuracy report][3]
-            - [Qwen2.5-VL-7B-Instruct accuracy report][4]
-            - [Qwen3-8B-Base accuracy report][5]
+            - [Qwen2.5-VL-7B-Instruct accuracy report][3]
+            - [Qwen3-8B-Base accuracy report][4]
 
             [1]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
             [2]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ steps.get_Qwen2_5_7B_Instruct_latest_run_id_V0.outputs.runid }}
-            [3]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ steps.get_Llama_3_1_8B_Instruct_latest_run_id_V0.outputs.runid }}
-            [4]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ steps.get_Qwen2_5_VL_7B_Instruct_latest_run_id_V0.outputs.runid }}
-            [5]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ steps.get_Qwen3_8B_Base_latest_run_id_V0.outputs.runid }}
+            [3]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ steps.get_Qwen2_5_VL_7B_Instruct_latest_run_id_V0.outputs.runid }}
+            [4]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ steps.get_Qwen3_8B_Base_latest_run_id_V0.outputs.runid }}
diff --git a/.github/workflows/accuracy_test.yaml b/.github/workflows/accuracy_test.yaml
@@ -62,6 +62,10 @@ defaults:
   run:
     shell: bash -el {0}
 
+concurrency:  # one live run per PR; dispatch runs (no PR number) key on run_id
+  group: pr-${{ github.event.pull_request.number || github.run_id }}
+  cancel-in-progress: true
+
 jobs:
   accuracy_tests:
     # test will be triggered when tag '*-accuracy-test' & 'ready-for-test' or workflow_dispatch job
@@ -73,9 +77,14 @@ jobs:
       contains(github.event.pull_request.labels.*.name, 'ready-for-test') ||
       github.event_name == 'workflow_dispatch'
       }}
-    runs-on: ${{ matrix.runner || 'linux-arm64-npu-2' }}
+    runs-on: >-
+      ${{
+          (matrix.model_name == 'Qwen/Qwen2.5-VL-7B-Instruct' && 'linux-arm64-npu-4') ||
+          'linux-arm64-npu-2'
+      }}
     strategy:
       matrix:
+        vllm_use_version: [0, 1]
         # the accuracy test will run:
         # 1. workflow_dispatch with models input
         #   - all: Qwen/Qwen2.5-7B-Instruct, Qwen/Qwen2.5-VL-7B-Instruct, Qwen/Qwen3-8B-Base
@@ -84,17 +93,24 @@ jobs:
         #   - accuracy-test: Qwen/Qwen2.5-7B-Instruct, Qwen/Qwen2.5-VL-7B-Instruct
         #   - dense-accuracy-test: Qwen/Qwen2.5-7B-Instruct
         #   - vl-accuracy-test: Qwen/Qwen2.5-VL-7B-Instruct
-        include: ${{ fromJSON(
-          (github.event.inputs.models == 'all' && '[{"model_name":"Qwen/Qwen2.5-7B-Instruct"},{"model_name":"Qwen/Qwen2.5-VL-7B-Instruct", "runner":"linux-arm64-npu-4"},{"model_name":"Qwen/Qwen3-8B-Base"}]') ||
-          (github.event.inputs.models == 'Qwen/Qwen2.5-7B-Instruct' && '[{"model_name":"Qwen/Qwen2.5-7B-Instruct"}]') ||
-          (github.event.inputs.models == 'Qwen/Qwen2.5-VL-7B-Instruct' && '[{"model_name":"Qwen/Qwen2.5-VL-7B-Instruct", "runner":"linux-arm64-npu-4"}]') ||
-          (github.event.inputs.models == 'Qwen/Qwen3-8B-Base' && '[{"model_name":"Qwen/Qwen3-8B-Base"}]') ||
-          contains(github.event.pull_request.labels.*.name, 'accuracy-test') && '[{"model_name":"Qwen/Qwen2.5-7B-Instruct"},{"model_name":"Qwen/Qwen2.5-VL-7B-Instruct", "runner":"linux-arm64-npu-4"}]' ||
-          contains(github.event.pull_request.labels.*.name, 'dense-accuracy-test') && '[{"model_name":"Qwen/Qwen2.5-7B-Instruct"}]' ||
-          contains(github.event.pull_request.labels.*.name, 'vl-accuracy-test') && '[{"model_name":"Qwen/Qwen2.5-VL-7B-Instruct", "runner":"linux-arm64-npu-4"}]'
+        model_name: ${{ fromJSON(
+          (github.event.inputs.models == 'all' &&
+            '["Qwen/Qwen2.5-7B-Instruct","Qwen/Qwen2.5-VL-7B-Instruct","Qwen/Qwen3-8B-Base"]') ||
+          (github.event.inputs.models == 'Qwen/Qwen2.5-7B-Instruct' &&
+            '["Qwen/Qwen2.5-7B-Instruct"]') ||
+          (github.event.inputs.models == 'Qwen/Qwen2.5-VL-7B-Instruct' &&
+            '["Qwen/Qwen2.5-VL-7B-Instruct"]') ||
+          (github.event.inputs.models == 'Qwen/Qwen3-8B-Base' &&
+            '["Qwen/Qwen3-8B-Base"]') ||
+          contains(github.event.pull_request.labels.*.name, 'accuracy-test') &&
+            '["Qwen/Qwen2.5-7B-Instruct","Qwen/Qwen2.5-VL-7B-Instruct"]' ||
+          contains(github.event.pull_request.labels.*.name, 'dense-accuracy-test') &&
+            '["Qwen/Qwen2.5-7B-Instruct"]' ||
+          contains(github.event.pull_request.labels.*.name, 'vl-accuracy-test') &&
+            '["Qwen/Qwen2.5-VL-7B-Instruct"]'
+         ) }}
       fail-fast: false
-    name: ${{ matrix.model_name }} accuracy
+    name: ${{ matrix.model_name }} accuracy V${{ matrix.vllm_use_version }}
     container:
       image: m.daocloud.io/quay.io/ascend/cann:8.1.rc1-910b-ubuntu22.04-py3.10
       env:
@@ -189,34 +205,38 @@ jobs:
           echo "vLLM: ${{ env.GHA_VLLM_VERSION }}"
           echo "vLLM Ascend: ${{ env.GHA_VLLM_ASCEND_VERSION || github.ref }}"
 
-      - name: Run Accuracy Test for V0
+      - name: Run Accuracy Test for V${{ matrix.vllm_use_version }}
         id: report
         working-directory: ./benchmarks
         env:
-          VLLM_USE_V1: 0
           PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
+          VLLM_USE_V1: ${{ matrix.vllm_use_version }}
         run: |
           model_base_name=$(basename ${{ matrix.model_name }})
-          echo "model_base_name=$model_base_name"
-          echo "model_base_name=$model_base_name" >> $GITHUB_OUTPUT
-          mkdir -p ./accuracy/V0
+          markdown_name="${model_base_name}-V${{ matrix.vllm_use_version }}"
+          echo "markdown_name=$markdown_name"
+          echo "markdown_name=$markdown_name" >> $GITHUB_OUTPUT
+          mkdir -p ./accuracy
 
           python ./scripts/run_accuracy.py \
             --model "${{ matrix.model_name }}" \
-            --output "./accuracy/V0/${model_base_name}.md" \
+            --output "./accuracy/${markdown_name}.md" \
             --vllm_ascend_version "${{ env.GHA_VLLM_ASCEND_VERSION || github.ref }}" \
             --cann_version "${{ env.GHA_CANN_VERSION }}" \
             --torch_npu_version "${{ env.GHA_TORCH_NPU_VERSION }}" \
             --torch_version "${{ env.GHA_TORCH_VERSION }}" \
             --vllm_version "${{ env.GHA_VLLM_VERSION }}"
 
-          cat ./accuracy/V0/${model_base_name}.md >> $GITHUB_STEP_SUMMARY
+      - name: Generate step summary
+        if: ${{ always() }}
+        run: |
+          cat ./benchmarks/accuracy/${{ steps.report.outputs.markdown_name }}.md >> $GITHUB_STEP_SUMMARY
 
-      - name: Upload Report for V0
+      - name: Upload Report for V${{ matrix.vllm_use_version }}
         uses: actions/upload-artifact@v4
         with:
-          name: "${{ env.GHA_VLLM_ASCEND_VERSION }}-${{ steps.report.outputs.model_base_name }}-V0-report"
-          path: ./benchmarks/accuracy/V0/${{ steps.report.outputs.model_base_name }}.md
+          name: "${{ env.GHA_VLLM_ASCEND_VERSION }}-${{ steps.report.outputs.markdown_name }}-report"
+          path: ./benchmarks/accuracy/${{ steps.report.outputs.markdown_name }}.md
           if-no-files-found: warn
           retention-days: 90
           overwrite: true
diff --git a/benchmarks/scripts/run_accuracy.py b/benchmarks/scripts/run_accuracy.py
@@ -26,23 +26,15 @@
 import lm_eval
 import torch
 
-UNIMODAL_MODEL_NAME = [
-    "Qwen/Qwen2.5-7B-Instruct", "meta-llama/Llama-3.1-8B-Instruct",
-    "Qwen/Qwen3-8B"
-]
-UNIMODAL_TASK = ["ceval-valid", "mmlu", "gsm8k"]
+UNIMODAL_MODEL_NAME = ["Qwen/Qwen2.5-7B-Instruct", "Qwen/Qwen3-8B"]
+UNIMODAL_TASK = ["ceval-valid", "gsm8k"]
 MULTIMODAL_NAME = ["Qwen/Qwen2.5-VL-7B-Instruct"]
 MULTIMODAL_TASK = ["mmmu_val"]
 
 batch_size_dict = {"ceval-valid": 1, "mmlu": 1, "gsm8k": "auto", "mmmu_val": 1}
 
 MODEL_RUN_INFO = {
     "Qwen/Qwen2.5-7B-Instruct":
-    ("export MODEL_ARGS='pretrained={model}, max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6'\n"
-     "lm_eval --model vllm --modlel_args $MODEL_ARGS --tasks {datasets} \ \n"
-     "--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1"
-     ),
-    "meta-llama/Llama-3.1-8B-Instruct":
     ("export MODEL_ARGS='pretrained={model}, max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6'\n"
      "lm_eval --model vllm --modlel_args $MODEL_ARGS --tasks {datasets} \ \n"
      "--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1"
@@ -228,4 +220,7 @@ def main(args):
     parser.add_argument("--vllm_version", type=str, required=False)
     parser.add_argument("--cann_version", type=str, required=False)
     args = parser.parse_args()
+    # TODO(yikun):
+    # 1. Exit with status 1 if accuracy is not as expected
+    # 2. Add ✅ / ❌ to the markdown report to show whether accuracy is as expected
     main(args)