
Commit 06d6f37

Support V0 and V1 and remove unused mmlu
Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
1 parent e2a364a commit 06d6f37

File tree: 3 files changed, +47 −55 lines

.github/workflows/accuracy_report.yaml
.github/workflows/accuracy_test.yaml
benchmarks/scripts/run_accuracy.py


.github/workflows/accuracy_report.yaml

Lines changed: 4 additions & 26 deletions
@@ -60,16 +60,6 @@ jobs:
         env:
           GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
 
-      - name: Query artifact run id for Llama-3.1-8B-Instruct V0 latest artifact
-        id: get_Llama_3_1_8B_Instruct_latest_run_id_V0
-        run: |
-          ARTIFACT_JSON=$(gh api "repos/${{ github.repository }}/actions/artifacts")
-          RUN_ID=$(echo "$ARTIFACT_JSON" | \
-            jq -r '[.artifacts[] | select(.name=="${{ github.event.inputs.vllm-ascend-version }}-Llama-3.1-8B-Instruct-V0-report")] | sort_by(.created_at) | last | .workflow_run.id')
-          echo "runid=$RUN_ID" >> "$GITHUB_OUTPUT"
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
       - name: Query artifact run id for Qwen3-8B-Base V0 latest artifact
         id: get_Qwen3_8B_Base_latest_run_id_V0
         run: |
@@ -98,15 +88,6 @@ jobs:
           repository: vllm-project/vllm-ascend
           run-id: ${{ steps.get_Qwen2_5_7B_Instruct_latest_run_id_V0.outputs.runid }}
 
-      - name: Download meta-llama/Llama-3.1-8B-Instruct Artifact
-        uses: actions/download-artifact@v4
-        with:
-          name: ${{ github.event.inputs.vllm-ascend-version }}-Llama-3.1-8B-Instruct-V0-report
-          path: ./docs/source/developer_guide/evaluation/accuracy_report
-          github-token: ${{ secrets.GITHUB_TOKEN }}
-          repository: vllm-project/vllm-ascend
-          run-id: ${{ steps.get_Llama_3_1_8B_Instruct_latest_run_id_V0.outputs.runid }}
-
       - name: Download Qwen/Qwen3-8B-Base Artifact
         uses: actions/download-artifact@v4
         with:
@@ -120,7 +101,6 @@ jobs:
         working-directory: ./docs/source/developer_guide/evaluation/accuracy_report
         run: |
           cat ./Qwen2.5-VL-7B-Instruct.md
-          cat ./Llama-3.1-8B-Instruct.md
           cat ./Qwen2.5-7B-Instruct.md
           cat ./Qwen3-8B-Base.md
 
@@ -139,12 +119,10 @@ jobs:
 
           - [Workflow run][1]
           - [Qwen2.5-7B-Instruct accuracy report][2]
-          - [Llama-3.1-8B-Instruct accuracy report][3]
-          - [Qwen2.5-VL-7B-Instruct accuracy report][4]
-          - [Qwen3-8B-Base accuracy report][5]
+          - [Qwen2.5-VL-7B-Instruct accuracy report][3]
+          - [Qwen3-8B-Base accuracy report][4]
 
           [1]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
           [2]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ steps.get_Qwen2_5_7B_Instruct_latest_run_id_V0.outputs.runid }}
-          [3]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ steps.get_Llama_3_1_8B_Instruct_latest_run_id_V0.outputs.runid }}
-          [4]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ steps.get_Qwen2_5_VL_7B_Instruct_latest_run_id_V0.outputs.runid }}
-          [5]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ steps.get_Qwen3_8B_Base_latest_run_id_V0.outputs.runid }}
+          [3]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ steps.get_Qwen2_5_VL_7B_Instruct_latest_run_id_V0.outputs.runid }}
+          [4]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ steps.get_Qwen3_8B_Base_latest_run_id_V0.outputs.runid }}
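The query steps that remain follow the same pattern as the removed Llama step: list the repository's workflow artifacts with gh api, then use jq to pick the newest artifact whose name matches and read the id of the workflow run that produced it. A minimal local sketch of that lookup for the Qwen3-8B-Base V0 report, assuming an authenticated gh CLI; the version string is only a placeholder standing in for the vllm-ascend-version workflow input:

    # Placeholder values for illustration only
    REPO="vllm-project/vllm-ascend"
    VERSION="v0.9.0rc1"   # stands in for github.event.inputs.vllm-ascend-version
    ARTIFACT_JSON=$(gh api "repos/${REPO}/actions/artifacts")
    # Select the newest artifact with the expected name and print its workflow run id
    echo "$ARTIFACT_JSON" | jq -r \
      "[.artifacts[] | select(.name==\"${VERSION}-Qwen3-8B-Base-V0-report\")] | sort_by(.created_at) | last | .workflow_run.id"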

.github/workflows/accuracy_test.yaml

Lines changed: 38 additions & 19 deletions
@@ -62,6 +62,10 @@ defaults:
   run:
     shell: bash -el {0}
 
+concurrency:
+  group: pr-${{ github.event.pull_request.number }}
+  cancel-in-progress: true
+
 jobs:
   accuracy_tests:
     # test will be triggered when tag '*-accuracy-test' & 'ready-for-test' or workflow_dispatch job
@@ -73,9 +77,10 @@ jobs:
       contains(github.event.pull_request.labels.*.name, 'ready-for-test') ||
       github.event_name == 'workflow_dispatch'
      }}
-    runs-on: ${{ matrix.runner || 'linux-arm64-npu-2' }}
+    runs-on: ${{ (matrix.model_name == 'Qwen2.5-VL-7B-Instruct' && 'linux-arm64-npu-4') || (matrix.model_name != 'Qwen2.5-VL-7B-Instruct' && 'linux-arm64-npu-2') }}
     strategy:
       matrix:
+        vllm_use_version: [0, 1]
         # the accuracy test will run:
         # 1. workflow_dispatch with models input
         #   - all: Qwen/Qwen2.5-7B-Instruct, Qwen/Qwen2.5-VL-7B-Instruct, Qwen/Qwen3-8B-Base
@@ -85,16 +90,26 @@ jobs:
         #   - dense-accuracy-test: Qwen/Qwen2.5-7B-Instruct
        #   - vl-accuracy-test: Qwen/Qwen2.5-VL-7B-Instruct
         include: ${{ fromJSON(
-          (github.event.inputs.models == 'all' && '[{"model_name":"Qwen/Qwen2.5-7B-Instruct"},{"model_name":"Qwen/Qwen2.5-VL-7B-Instruct", "runner":"linux-arm64-npu-4"},{"model_name":"Qwen/Qwen3-8B-Base"}]') ||
-          (github.event.inputs.models == 'Qwen/Qwen2.5-7B-Instruct' && '[{"model_name":"Qwen/Qwen2.5-7B-Instruct"}]') ||
-          (github.event.inputs.models == 'Qwen/Qwen2.5-VL-7B-Instruct' && '[{"model_name":"Qwen/Qwen2.5-VL-7B-Instruct", "runner":"linux-arm64-npu-4"}]') ||
-          (github.event.inputs.models == 'Qwen/Qwen3-8B-Base' && '[{"model_name":"Qwen/Qwen3-8B-Base"}]') ||
-          contains(github.event.pull_request.labels.*.name, 'accuracy-test') && '[{"model_name":"Qwen/Qwen2.5-7B-Instruct"},{"model_name":"Qwen/Qwen2.5-VL-7B-Instruct", "runner":"linux-arm64-npu-4"}]' ||
-          contains(github.event.pull_request.labels.*.name, 'dense-accuracy-test') && '[{"model_name":"Qwen/Qwen2.5-7B-Instruct"}]' ||
-          contains(github.event.pull_request.labels.*.name, 'vl-accuracy-test') && '[{"model_name":"Qwen/Qwen2.5-VL-7B-Instruct", "runner":"linux-arm64-npu-4"}]'
+          (github.event.inputs.models == 'all' &&
+            '[{"model_name":"Qwen/Qwen2.5-7B-Instruct"},
+              {"model_name":"Qwen/Qwen2.5-VL-7B-Instruct"},
+              {"model_name":"Qwen/Qwen3-8B-Base"}]') ||
+          (github.event.inputs.models == 'Qwen/Qwen2.5-7B-Instruct' &&
+            '[{"model_name":"Qwen/Qwen2.5-7B-Instruct"}]') ||
+          (github.event.inputs.models == 'Qwen/Qwen2.5-VL-7B-Instruct' &&
+            '[{"model_name":"Qwen/Qwen2.5-VL-7B-Instruct"}]') ||
+          (github.event.inputs.models == 'Qwen/Qwen3-8B-Base' &&
+            '[{"model_name":"Qwen/Qwen3-8B-Base"}]') ||
+          contains(github.event.pull_request.labels.*.name, 'accuracy-test') &&
+            '[{"model_name":"Qwen/Qwen2.5-7B-Instruct"},
+              {"model_name":"Qwen/Qwen2.5-VL-7B-Instruct"}]' ||
+          contains(github.event.pull_request.labels.*.name, 'dense-accuracy-test') &&
+            '[{"model_name":"Qwen/Qwen2.5-7B-Instruct"}]' ||
+          contains(github.event.pull_request.labels.*.name, 'vl-accuracy-test') &&
+            '[{"model_name":"Qwen/Qwen2.5-VL-7B-Instruct"}]'
           ) }}
       fail-fast: false
-    name: ${{ matrix.model_name }} accuracy
+    name: ${{ matrix.model_name }} accuracy V${{ matrix.vllm_use_version }}
     container:
       image: m.daocloud.io/quay.io/ascend/cann:8.1.rc1-910b-ubuntu22.04-py3.10
       env:
@@ -189,34 +204,38 @@ jobs:
           echo "vLLM: ${{ env.GHA_VLLM_VERSION }}"
           echo "vLLM Ascend: ${{ env.GHA_VLLM_ASCEND_VERSION || github.ref }}"
 
-      - name: Run Accuracy Test for V0
+      - name: Run Accuracy Test for V${{ matrix.vllm_use_version }}
         id: report
         working-directory: ./benchmarks
         env:
-          VLLM_USE_V1: 0
           PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
+          VLLM_USE_V1: ${{ matrix.vllm_use_version }}
         run: |
           model_base_name=$(basename ${{ matrix.model_name }})
-          echo "model_base_name=$model_base_name"
-          echo "model_base_name=$model_base_name" >> $GITHUB_OUTPUT
-          mkdir -p ./accuracy/V0
+          markdown_name="${model_base_name}-V${{ matrix.vllm_use_version }}"
+          echo "markdown_name=$markdown_name"
+          echo "markdown_name=$markdown_name" >> $GITHUB_OUTPUT
+          mkdir -p ./accuracy
 
           python ./scripts/run_accuracy.py \
             --model "${{ matrix.model_name }}" \
-            --output "./accuracy/V0/${model_base_name}.md" \
+            --output "./accuracy/${markdown_name}.md" \
             --vllm_ascend_version "${{ env.GHA_VLLM_ASCEND_VERSION || github.ref }}" \
             --cann_version "${{ env.GHA_CANN_VERSION }}" \
             --torch_npu_version "${{ env.GHA_TORCH_NPU_VERSION }}" \
             --torch_version "${{ env.GHA_TORCH_VERSION }}" \
            --vllm_version "${{ env.GHA_VLLM_VERSION }}"
 
-          cat ./accuracy/V0/${model_base_name}.md >> $GITHUB_STEP_SUMMARY
+      - name: Generate step summary
+        if: ${{ always() }}
+        run: |
+          cat ./benchmarks/accuracy/${{ steps.report.outputs.markdown_name }}.md >> $GITHUB_STEP_SUMMARY
 
-      - name: Upload Report for V0
+      - name: Upload Report for V${{ matrix.vllm_use_version }}
         uses: actions/upload-artifact@v4
         with:
-          name: "${{ env.GHA_VLLM_ASCEND_VERSION }}-${{ steps.report.outputs.model_base_name }}-V0-report"
-          path: ./benchmarks/accuracy/V0/${{ steps.report.outputs.model_base_name }}.md
+          name: "${{ env.GHA_VLLM_ASCEND_VERSION }}-${{ steps.report.outputs.markdown_name }}-report"
+          path: ./benchmarks/accuracy/${{ steps.report.outputs.markdown_name }}.md
           if-no-files-found: warn
           retention-days: 90
           overwrite: true
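With the vllm_use_version axis added, every model in the matrix now runs twice, once per engine version, and both the report file and the uploaded artifact carry a -V0 or -V1 suffix instead of living under an accuracy/V0 directory. A rough local sketch of what a single matrix cell executes, assuming a vllm-ascend checkout with the benchmarks directory present; the version strings passed to the script are placeholders for the GHA_* environment values the workflow resolves earlier:

    cd benchmarks
    export VLLM_USE_V1=1                      # mirrors matrix.vllm_use_version (0 or 1)
    export PYTORCH_NPU_ALLOC_CONF=max_split_size_mb:256
    model_name="Qwen/Qwen2.5-7B-Instruct"     # one matrix.model_name value
    model_base_name=$(basename "$model_name")
    markdown_name="${model_base_name}-V${VLLM_USE_V1}"
    mkdir -p ./accuracy
    # Placeholder versions; in CI these come from GHA_VLLM_ASCEND_VERSION, GHA_CANN_VERSION, etc.
    python ./scripts/run_accuracy.py \
      --model "$model_name" \
      --output "./accuracy/${markdown_name}.md" \
      --vllm_ascend_version "main" \
      --cann_version "8.1.rc1" \
      --torch_npu_version "2.5.1" \
      --torch_version "2.5.1" \
      --vllm_version "main"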

benchmarks/scripts/run_accuracy.py

Lines changed: 5 additions & 10 deletions
@@ -26,23 +26,15 @@
 import lm_eval
 import torch
 
-UNIMODAL_MODEL_NAME = [
-    "Qwen/Qwen2.5-7B-Instruct", "meta-llama/Llama-3.1-8B-Instruct",
-    "Qwen/Qwen3-8B"
-]
-UNIMODAL_TASK = ["ceval-valid", "mmlu", "gsm8k"]
+UNIMODAL_MODEL_NAME = ["Qwen/Qwen2.5-7B-Instruct", "Qwen/Qwen3-8B"]
+UNIMODAL_TASK = ["ceval-valid", "gsm8k"]
 MULTIMODAL_NAME = ["Qwen/Qwen2.5-VL-7B-Instruct"]
 MULTIMODAL_TASK = ["mmmu_val"]
 
 batch_size_dict = {"ceval-valid": 1, "mmlu": 1, "gsm8k": "auto", "mmmu_val": 1}
 
 MODEL_RUN_INFO = {
     "Qwen/Qwen2.5-7B-Instruct":
-    ("export MODEL_ARGS='pretrained={model}, max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6'\n"
-     "lm_eval --model vllm --modlel_args $MODEL_ARGS --tasks {datasets} \ \n"
-     "--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1"
-    ),
-    "meta-llama/Llama-3.1-8B-Instruct":
     ("export MODEL_ARGS='pretrained={model}, max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6'\n"
      "lm_eval --model vllm --modlel_args $MODEL_ARGS --tasks {datasets} \ \n"
      "--apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1"
@@ -228,4 +220,7 @@ def main(args):
     parser.add_argument("--vllm_version", type=str, required=False)
     parser.add_argument("--cann_version", type=str, required=False)
     args = parser.parse_args()
+    # TODO(yikun):
+    # 1. add a exit 1 if accuracy is not as expected
+    # 2. Add ✅, ❌ to markdown if accuracy is not as expected
     main(args)
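For the unimodal models, MODEL_RUN_INFO documents the standalone lm_eval command that the report reproduces. A hedged sketch of that invocation for the tasks kept by this commit (ceval-valid and gsm8k, with mmlu dropped), written against lm-evaluation-harness' --model_args flag; the model and parallelism settings are taken from the template above and may need adjusting for the local setup:

    # Assumed example based on the MODEL_RUN_INFO template; not the workflow's own command
    export MODEL_ARGS='pretrained=Qwen/Qwen2.5-7B-Instruct,max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6'
    lm_eval --model vllm --model_args "$MODEL_ARGS" --tasks ceval-valid,gsm8k \
      --apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1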
