[v0.9.1] Support Kimi-k2 Fp16 and W8A8 and W4A8 #1098

#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
# This test will be triggered when:
# 1. A PR carries one accuracy label ('accuracy-test', 'dense-accuracy-test' or
#    'vl-accuracy-test'; only one of them should be applied) plus 'ready-for-test'
# 2. The workflow is dispatched manually (workflow_dispatch) with a models input
# See the detailed rules in the strategy.matrix note
name: Benchmarks / accuracy
on:
  pull_request:
    types: [ labeled ]
  workflow_dispatch:
    inputs:
      vllm-version:
        description: 'vllm version:'
        required: true
        type: choice
        # Please also update this when bumping the matched vLLM version
        # Current supported vLLM versions
        options:
          - main
          - v0.9.1
          - v0.7.3
      vllm-ascend-version:
        description: 'vllm-ascend version:'
        required: true
        type: choice
        options:
          - main
          - v0.7.3-dev
      models:
        description: 'model:'
        required: true
        type: choice
        options:
          - all
          - Qwen/Qwen2.5-7B-Instruct
          - Qwen/Qwen2.5-VL-7B-Instruct
          - Qwen/Qwen3-8B-Base
        default: 'all'
# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
# It's used to activate ascend-toolkit environment variables.
defaults:
  run:
    shell: bash -el {0}

# only cancel in-progress runs of the same workflow
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true
jobs:
  accuracy_tests:
    # The test runs when the PR carries a '*-accuracy-test' label together with
    # 'ready-for-test', or when it is started via workflow_dispatch
    if: >-
      ${{
      (contains(github.event.pull_request.labels.*.name, 'accuracy-test') ||
       contains(github.event.pull_request.labels.*.name, 'vl-accuracy-test') ||
       contains(github.event.pull_request.labels.*.name, 'dense-accuracy-test')) &&
      contains(github.event.pull_request.labels.*.name, 'ready-for-test') ||
      github.event_name == 'workflow_dispatch'
      }}
    runs-on: >-
      ${{
      (matrix.model_name == 'Qwen/Qwen2.5-VL-7B-Instruct' && 'linux-arm64-npu-4') ||
      'linux-arm64-npu-2'
      }}
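    # NOTE (added): the multimodal Qwen2.5-VL model is scheduled on the larger
    # 'linux-arm64-npu-4' runner pool, while the text-only models fit on
    # 'linux-arm64-npu-2'; the numeric suffix is assumed to be the NPU card count.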
    strategy:
      matrix:
        vllm_use_version: [0, 1]
        # the accuracy test will run:
        # 1. workflow_dispatch with models input
        #    - all: Qwen/Qwen2.5-7B-Instruct, Qwen/Qwen2.5-VL-7B-Instruct, Qwen/Qwen3-8B-Base
        #    - a specific model: only that model
        # 2. PR labeled with "*-accuracy-test"
        #    - accuracy-test: Qwen/Qwen2.5-7B-Instruct, Qwen/Qwen2.5-VL-7B-Instruct
        #    - dense-accuracy-test: Qwen/Qwen2.5-7B-Instruct
        #    - vl-accuracy-test: Qwen/Qwen2.5-VL-7B-Instruct
        model_name: ${{ fromJSON(
          (github.event.inputs.models == 'all' &&
            '["Qwen/Qwen2.5-7B-Instruct","Qwen/Qwen2.5-VL-7B-Instruct","Qwen/Qwen3-8B-Base"]') ||
          (github.event.inputs.models == 'Qwen/Qwen2.5-7B-Instruct' &&
            '["Qwen/Qwen2.5-7B-Instruct"]') ||
          (github.event.inputs.models == 'Qwen/Qwen2.5-VL-7B-Instruct' &&
            '["Qwen/Qwen2.5-VL-7B-Instruct"]') ||
          (github.event.inputs.models == 'Qwen/Qwen3-8B-Base' &&
            '["Qwen/Qwen3-8B-Base"]') ||
          contains(github.event.pull_request.labels.*.name, 'accuracy-test') &&
            '["Qwen/Qwen2.5-7B-Instruct","Qwen/Qwen2.5-VL-7B-Instruct"]' ||
          contains(github.event.pull_request.labels.*.name, 'dense-accuracy-test') &&
            '["Qwen/Qwen2.5-7B-Instruct"]' ||
          contains(github.event.pull_request.labels.*.name, 'vl-accuracy-test') &&
            '["Qwen/Qwen2.5-VL-7B-Instruct"]'
          ) }}
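        # Example (added): a PR labeled 'dense-accuracy-test' + 'ready-for-test' resolves
        # model_name to Qwen/Qwen2.5-7B-Instruct, which combined with vllm_use_version
        # [0, 1] yields two jobs (one per engine version).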
        # Remove exclude after https://github.com/vllm-project/vllm-ascend/issues/1044 resolved
        exclude:
          - model_name: Qwen/Qwen2.5-VL-7B-Instruct
            vllm_use_version: 1
      fail-fast: false
    name: ${{ matrix.model_name }} accuracy V${{ matrix.vllm_use_version }}
    container:
      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
      env:
        HF_ENDPOINT: https://hf-mirror.com
        HF_TOKEN: ${{ secrets.HF_TOKEN }}
        DATASET_SOURCE: ModelScope
        VLLM_USE_MODELSCOPE: True
        # 1. If a version is specified (workflow_dispatch), run the accuracy test on that branch
        # 2. If no version is given (labeled PR), run the accuracy test on the default ref:
        #    The branch, tag or SHA to checkout. When checking out the repository that
        #    triggered a workflow, this defaults to the reference or SHA for that event.
        #    Otherwise, uses the default branch.
        GHA_VLLM_ASCEND_VERSION: ${{ github.event.inputs.vllm-ascend-version }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Check npu and CANN info
        run: |
          npu-smi info
          cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
      - name: Config mirrors
        run: |
          sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
          pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
          apt-get update -y
          apt install git -y
          git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/
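          # NOTE (added): the insteadOf rule rewrites later https://github.com/ fetches
          # through the gh-proxy mirror, presumably to keep GitHub access reliable from
          # this CI network.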
      - name: Install system dependencies
        run: |
          apt-get -y install `cat packages.txt`
          apt-get -y install gcc g++ cmake libnuma-dev
      - name: Checkout vllm-project/vllm repo
        uses: actions/checkout@v4
        with:
          repository: vllm-project/vllm
          path: ./vllm-empty
          # Please also update this when bumping the matched vLLM version
          ref: ${{ github.event.inputs.vllm-version || 'v0.9.1' }}
      - name: Install vllm-project/vllm from source
        working-directory: ./vllm-empty
        run: VLLM_TARGET_DEVICE=empty pip install -e .
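      # NOTE (added): VLLM_TARGET_DEVICE=empty installs the vLLM Python package without
      # building any device-specific kernels; the Ascend backend comes from the
      # vllm-ascend plugin installed below.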
      - name: Checkout vllm-project/vllm-ascend repo
        uses: actions/checkout@v4
        with:
          repository: vllm-project/vllm-ascend
          path: ./vllm-ascend
          ref: ${{ env.GHA_VLLM_ASCEND_VERSION }}
      - name: Install vllm-project/vllm-ascend
        working-directory: ./vllm-ascend
        run: |
          pip install -r requirements-dev.txt
          pip install -e .
      - name: Install lm-eval, ray, and datasets
        run: |
          pip install lm-eval
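      # NOTE (added): only lm-eval is installed explicitly here; datasets comes in as an
      # lm-eval dependency, and ray is assumed to already be present from the vLLM
      # install above; add explicit installs if that assumption does not hold.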
      - name: Collect version info
        run: |
          for dir in /usr/local/Ascend/ascend-toolkit/*; do
            dname=$(basename "$dir")
            if [ "$dname" != "latest" ]; then
              TOOLKIT_DIR="$dname"
              break
            fi
          done
          INFO_FILE="/usr/local/Ascend/ascend-toolkit/${TOOLKIT_DIR}/$(uname -i)-linux/ascend_toolkit_install.info"
          GHA_CANN_VERSION=$(grep "version=" "$INFO_FILE" \
            | head -n1 \
            | cut -d'=' -f2 \
            | tr -d '"')
          {
            echo "GHA_CANN_VERSION=$GHA_CANN_VERSION"
            pip show torch | grep "Version:" | awk '{print "GHA_TORCH_VERSION="$2}'
            pip show torch_npu | grep "Version:" | awk '{print "GHA_TORCH_NPU_VERSION="$2}'
            pip show vllm | grep "Version:" | awk '{print "GHA_VLLM_VERSION="$2}' | sed 's/+.*//'
            echo "GHA_VLLM_ASCEND_VERSION=${{ github.event.inputs.vllm-ascend-version || github.ref }}"
          } >> "$GITHUB_ENV"
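      # NOTE (added): each KEY=VALUE line appended to $GITHUB_ENV above becomes an
      # environment variable (and an env.* expression value) for every later step,
      # which is how these version strings reach the report and upload steps below.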
      - name: Print versions
        run: |
          echo "CANN: ${{ env.GHA_CANN_VERSION }}"
          echo "Torch NPU: ${{ env.GHA_TORCH_NPU_VERSION }}"
          echo "Torch: ${{ env.GHA_TORCH_VERSION }}"
          echo "vLLM: ${{ env.GHA_VLLM_VERSION }}"
          echo "vLLM Ascend: ${{ env.GHA_VLLM_ASCEND_VERSION }}"
      - name: Run Accuracy Test for V${{ matrix.vllm_use_version }}
        id: report
        working-directory: ./benchmarks
        env:
          PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
          VLLM_USE_V1: ${{ matrix.vllm_use_version }}
        run: |
          model_base_name=$(basename ${{ matrix.model_name }})
          markdown_name="${model_base_name}-V${{ matrix.vllm_use_version }}"
          echo "markdown_name=$markdown_name"
          echo "markdown_name=$markdown_name" >> $GITHUB_OUTPUT
          mkdir -p ./accuracy
          python ./scripts/run_accuracy.py \
            --model "${{ matrix.model_name }}" \
            --output "./accuracy/${markdown_name}.md" \
            --vllm_ascend_version "${{ env.GHA_VLLM_ASCEND_VERSION || github.ref }}" \
            --cann_version "${{ env.GHA_CANN_VERSION }}" \
            --torch_npu_version "${{ env.GHA_TORCH_NPU_VERSION }}" \
            --torch_version "${{ env.GHA_TORCH_VERSION }}" \
            --vllm_version "${{ env.GHA_VLLM_VERSION }}"
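      # Example (added, illustrative): the Qwen/Qwen3-8B-Base entry with VLLM_USE_V1=1
      # writes its report to ./benchmarks/accuracy/Qwen3-8B-Base-V1.md, the same path
      # read by the summary and upload steps below.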
      - name: Generate step summary
        if: ${{ always() }}
        run: |
          cat ./benchmarks/accuracy/${{ steps.report.outputs.markdown_name }}.md >> $GITHUB_STEP_SUMMARY
      - name: Sanitize version string for artifact naming
        run: |
          SAFE_VLLM_ASCEND_VERSION="${GHA_VLLM_ASCEND_VERSION//\//-}"
          echo "SAFE_VLLM_ASCEND_VERSION=$SAFE_VLLM_ASCEND_VERSION" >> "$GITHUB_ENV"
      - name: Upload Report for V${{ matrix.vllm_use_version }}
        if: ${{ github.event_name == 'workflow_dispatch' }}
        uses: actions/upload-artifact@v4
        with:
          name: "${{ env.SAFE_VLLM_ASCEND_VERSION }}-${{ steps.report.outputs.markdown_name }}-report"
          path: ./benchmarks/accuracy/${{ steps.report.outputs.markdown_name }}.md
          if-no-files-found: warn
          retention-days: 90
          overwrite: true