Commit 72a0c3f

add eval config for Qwen3-235B-A22B-Thinking-2507-FP8

Signed-off-by: Huamin Li <3ericli@gmail.com>
1 parent 99722d5

File tree

4 files changed: +27 −2 lines
Lines changed: 11 additions & 0 deletions

@@ -0,0 +1,11 @@
+model_name: "Qwen/Qwen3-235B-A22B-Thinking-2507-FP8"
+backend: "vllm"
+tasks:
+- name: "mmlu_pro"
+  metrics:
+  - name: "exact_match,custom-extract"
+    value: 0.77
+num_fewshot: 5
+limit: 250 # will run on 250 * 14 subjects = 3500 samples
+max_model_len: 8096
+gen_kwargs: "top_p=1,top_k=0,max_gen_toks=1536"
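The config's comment notes that `limit: 250` expands to 3500 evaluated samples because MMLU-Pro spans 14 subject categories. A minimal sketch of that arithmetic, mirroring the new config as a plain dict (the dict and the `MMLU_PRO_SUBJECTS` constant are illustrative, not part of the harness):

```python
# Hypothetical in-memory mirror of the new eval config (keys and values
# copied from the diff above; this is not how the harness loads YAML).
eval_config = {
    "model_name": "Qwen/Qwen3-235B-A22B-Thinking-2507-FP8",
    "backend": "vllm",
    "tasks": [
        {
            "name": "mmlu_pro",
            "metrics": [{"name": "exact_match,custom-extract", "value": 0.77}],
        }
    ],
    "num_fewshot": 5,
    "limit": 250,
    "max_model_len": 8096,
    "gen_kwargs": "top_p=1,top_k=0,max_gen_toks=1536",
}

# MMLU-Pro has 14 subjects; a per-subject limit of 250 therefore
# evaluates 250 * 14 = 3500 samples in total, as the config comment says.
MMLU_PRO_SUBJECTS = 14
total_samples = eval_config["limit"] * MMLU_PRO_SUBJECTS
print(total_samples)  # prints 3500
```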
Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
+Qwen3-235B-A22B-Thinking-2507-FP8.yaml

.buildkite/lm-eval-harness/test_lm_eval_correctness.py

Lines changed: 2 additions & 0 deletions

@@ -40,7 +40,9 @@ def launch_lm_eval(eval_config, tp_size):
         # existing text models in CI, so only apply it for mm.
         apply_chat_template=backend == "vllm-vlm",
         batch_size=batch_size,
+        gen_kwargs=eval_config.get("gen_kwargs", None),
     )
+
     return results

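The `.get("gen_kwargs", None)` default keeps older configs without the key working, while new configs can pass a string like `"top_p=1,top_k=0,max_gen_toks=1536"` through to the harness. A rough, hedged sketch of how such a comma-separated `key=value` string could be turned into typed keyword arguments (this helper is illustrative; it is not lm-eval's own parser):

```python
def parse_gen_kwargs(spec: str) -> dict:
    """Parse a comma-separated "key=value" string (e.g. the config's
    "top_p=1,top_k=0,max_gen_toks=1536") into a dict with numeric values
    coerced to int/float where possible. Illustrative sketch only."""
    kwargs = {}
    for pair in spec.split(","):
        key, _, raw = pair.partition("=")
        try:
            value = int(raw)          # try integer first (top_k=0 -> 0)
        except ValueError:
            try:
                value = float(raw)    # then float (temperature=0.6 -> 0.6)
            except ValueError:
                value = raw           # fall back to the raw string
        kwargs[key.strip()] = value
    return kwargs

print(parse_gen_kwargs("top_p=1,top_k=0,max_gen_toks=1536"))
# prints {'top_p': 1, 'top_k': 0, 'max_gen_toks': 1536}
```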
.buildkite/test-pipeline.yaml

Lines changed: 13 additions & 1 deletion

@@ -1084,7 +1084,7 @@ steps:
   - tests/weight_loading
   commands:
   - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
-
+
- label: NixlConnector PD accuracy tests (Distributed) # 30min
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/tests"

@@ -1126,6 +1126,18 @@ steps:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4

+##### H100 test #####
+- label: LM Eval Medium Models (H100) # optional
+  gpu: h100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-medium-h100.txt --tp-size=4
+
##### H200 test #####
- label: Distributed Tests (H200) # optional
  gpu: h200

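The new step's `source_file_dependencies` list (`csrc/`, `vllm/model_executor/layers/quantization`) gates when the optional H100 job runs: it fires only if a changed file falls under one of those paths. A simplified prefix-match sketch of that gating (the function name and matching rule are assumptions for illustration, not vLLM's actual CI code):

```python
def step_should_run(changed_files, source_file_dependencies):
    """Simplified sketch: a step runs when any changed file path starts
    with one of the step's declared dependency prefixes."""
    return any(
        path.startswith(dep)
        for path in changed_files
        for dep in source_file_dependencies
    )

deps = ["csrc/", "vllm/model_executor/layers/quantization"]
print(step_should_run(["vllm/model_executor/layers/quantization/fp8.py"], deps))  # prints True
print(step_should_run(["docs/index.md"], deps))  # prints False
```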