Commit 1b51135

MatthewBonanni authored and ZhengHongming888 committed
Add TP parameter to attention tests (vllm-project#27683)
Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
1 parent 109e978 commit 1b51135

4 files changed: +92 −11 lines changed

.buildkite/test-pipeline.yaml

Lines changed: 1 addition & 2 deletions

@@ -347,8 +347,7 @@ steps:
   - vllm/v1/attention
   - tests/v1/attention
   commands:
-  - export VLLM_DISABLE_FLASHINFER_PREFILL=1 # TODO: FI prefill is bugged and causes incorrectness, fix this
-  - pytest -v -s v1/attention
+  - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
 
 - label: V1 Test others (CPU) # 5 mins
   source_file_dependencies:

tests/v1/attention/test_attention_backends.py

Lines changed: 53 additions & 5 deletions

@@ -295,6 +295,7 @@ def _test_backend_correctness(
     block_size: int = 16,
     atol: float = 1e-2,
     rtol: float = 1e-2,
+    tensor_parallel_size: int = 1,
 ):
     """
     Test that all backends produce similar outputs to a reference implementation
@@ -310,13 +311,38 @@ def _test_backend_correctness(
     4. Running each vLLM attention backend with the new queries and the
        simulated paged KV cache.
     5. Comparing the vLLM backend's output to the ground-truth SDPA output.
+
+    Note: When tensor_parallel_size > 1, we simulate the head partitioning
+    by overriding the model config to use fewer heads, without requiring
+    multiple GPUs. This tests that backends work correctly with different
+    head counts.
     """
     current_platform.seed_everything(42)
+
+    hf_config_override = None
+    if tensor_parallel_size > 1:
+        from vllm.config import ModelConfig
+
+        temp_config = ModelConfig(model=model, max_model_len=1)
+        original_num_heads = temp_config.hf_text_config.num_attention_heads
+        original_num_kv_heads = getattr(
+            temp_config.hf_text_config, "num_key_value_heads", None
+        )
+        hf_config_override = {
+            "num_attention_heads": original_num_heads // tensor_parallel_size,
+        }
+        if original_num_kv_heads is not None:
+            hf_config_override["num_key_value_heads"] = max(
+                1, original_num_kv_heads // tensor_parallel_size
+            )
+
     vllm_config = create_vllm_config(
         model_name=model,
+        tensor_parallel_size=1,  # Always use TP=1 to avoid multi-GPU requirements
         max_model_len=max(batch_spec.seq_lens),
         block_size=block_size,
         num_gpu_blocks=8192,
+        hf_config_override=hf_config_override,
     )
     device = torch.device("cuda:0")
 
@@ -503,7 +529,10 @@ def error_msg(msg: str, backend_name: str):
     ],
 )
 @pytest.mark.parametrize("model", ["meta-llama/Meta-Llama-3-8B"])
-def test_causal_backend_correctness(batch_spec_name: str, model: str):
+@pytest.mark.parametrize("tensor_parallel_size", [1, 2, 4])
+def test_causal_backend_correctness(
+    batch_spec_name: str, model: str, tensor_parallel_size: int
+):
     """Test backend's correctness with causal attention."""
 
     def causal_mask_mod(
@@ -523,12 +552,23 @@ def causal_mask_mod(
     SMALL_BLOCK_BACKENDS = [
         x for x in BACKENDS_TO_TEST if x not in LARGE_BLOCK_BACKENDS
     ]
-    _test_backend_correctness(batch_spec, model, SMALL_BLOCK_BACKENDS, causal_mask_mod)
+    _test_backend_correctness(
+        batch_spec,
+        model,
+        SMALL_BLOCK_BACKENDS,
+        causal_mask_mod,
+        tensor_parallel_size=tensor_parallel_size,
+    )
 
     # Fast FlexAttention needs to run with block_size=128
     if LARGE_BLOCK_BACKENDS:
         _test_backend_correctness(
-            batch_spec, model, LARGE_BLOCK_BACKENDS, causal_mask_mod, block_size=128
+            batch_spec,
+            model,
+            LARGE_BLOCK_BACKENDS,
+            causal_mask_mod,
+            block_size=128,
+            tensor_parallel_size=tensor_parallel_size,
         )
 
 
@@ -545,7 +585,10 @@ def causal_mask_mod(
     ["small_decode", "small_prefill", "mixed_medium", "large_decode", "large_prefill"],
 )
 @pytest.mark.parametrize("model", ["microsoft/Phi-tiny-MoE-instruct"])
-def test_sliding_window_backend_correctness(batch_spec_name: str, model: str):
+@pytest.mark.parametrize("tensor_parallel_size", [1, 2, 4])
+def test_sliding_window_backend_correctness(
+    batch_spec_name: str, model: str, tensor_parallel_size: int
+):
     """Test backend's correctness with sliding window attention."""
 
     def sliding_window_mask_mod(
@@ -575,7 +618,11 @@ def sliding_window_mask_mod(
         x for x in SLIDING_WINDOW_BACKENDS_TO_TEST if x not in LARGE_BLOCK_BACKENDS
     ]
     _test_backend_correctness(
-        batch_spec, model, SMALL_BLOCK_BACKENDS, sliding_window_mask_mod_fn
+        batch_spec,
+        model,
+        SMALL_BLOCK_BACKENDS,
+        sliding_window_mask_mod_fn,
+        tensor_parallel_size=tensor_parallel_size,
     )
 
     # Fast FlexAttention needs to run with block_size=128
@@ -586,4 +633,5 @@ def sliding_window_mask_mod(
             LARGE_BLOCK_BACKENDS,
             sliding_window_mask_mod_fn,
             block_size=128,
+            tensor_parallel_size=tensor_parallel_size,
         )
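
The override logic added to _test_backend_correctness above mimics tensor-parallel head partitioning purely by shrinking the head counts in the HF config, so the tests still run on a single GPU. Below is a minimal sketch of that arithmetic; the helper name is illustrative (not part of the test), and the 32 query heads / 8 KV heads are assumed here as a typical GQA layout for a Llama-3-8B-class model.

def simulated_tp_head_override(num_heads: int, num_kv_heads: int, tp: int) -> dict:
    # Mirror the test: divide query heads by TP and clamp KV heads to >= 1.
    return {
        "num_attention_heads": num_heads // tp,
        "num_key_value_heads": max(1, num_kv_heads // tp),
    }

# Assuming 32 query heads and 8 KV heads:
print(simulated_tp_head_override(32, 8, 2))  # {'num_attention_heads': 16, 'num_key_value_heads': 4}
print(simulated_tp_head_override(32, 8, 4))  # {'num_attention_heads': 8, 'num_key_value_heads': 2}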

tests/v1/attention/test_mla_backends.py

Lines changed: 29 additions & 2 deletions

@@ -394,8 +394,11 @@ def run_attention_backend(
         "spec_decode_medium",
     ],
 )
-@pytest.mark.parametrize("model", ["deepseek-ai/DeepSeek-V2-Lite-Chat"])
-def test_backend_correctness(dist_init, batch_spec_name: str, model: str):
+@pytest.mark.parametrize("model", ["deepseek-ai/DeepSeek-R1"])
+@pytest.mark.parametrize("tensor_parallel_size", [1, 4, 8, 16])
+def test_backend_correctness(
+    dist_init, batch_spec_name: str, model: str, tensor_parallel_size: int
+):
     """
     Test that all backends produce similar outputs to a reference implementation
     using torch.nn.functional.scaled_dot_product_attention.
@@ -410,6 +413,11 @@ def test_backend_correctness(dist_init, batch_spec_name: str, model: str):
     4. Running each vLLM attention backend with the new queries and the
        simulated paged KV cache.
     5. Comparing the vLLM backend's output to the ground-truth SDPA output.
+
+    Note: When tensor_parallel_size > 1, we simulate the head partitioning
+    by overriding the model config to use fewer heads, without requiring
+    multiple GPUs. This tests that backends work correctly with different
+    head counts.
     """
 
     batch_spec = BATCH_SPECS[batch_spec_name]
@@ -423,11 +431,30 @@ def test_backend_correctness(dist_init, batch_spec_name: str, model: str):
     # Add 1 for null block at index 0, and some buffer
     num_gpu_blocks = required_blocks + 1 + 100
 
+    hf_config_override = None
+    if tensor_parallel_size > 1:
+        from vllm.config import ModelConfig
+
+        temp_config = ModelConfig(model=model, max_model_len=1)
+        original_num_heads = temp_config.hf_text_config.num_attention_heads
+        original_num_kv_heads = getattr(
+            temp_config.hf_text_config, "num_key_value_heads", None
+        )
+        hf_config_override = {
+            "num_attention_heads": original_num_heads // tensor_parallel_size,
+        }
+        if original_num_kv_heads is not None:
+            hf_config_override["num_key_value_heads"] = max(
+                1, original_num_kv_heads // tensor_parallel_size
+            )
+
     vllm_config = create_vllm_config(
         model_name=model,
+        tensor_parallel_size=1,  # Always use TP=1 to avoid multi-GPU requirements
         max_model_len=max(batch_spec.seq_lens),
         num_gpu_blocks=num_gpu_blocks,
         block_size=default_block_size,
+        hf_config_override=hf_config_override,
     )
 
     # For spec decode tests, add a speculative_config to set the reorder_batch_threshold

tests/v1/attention/test_sparse_mla_backends.py

Lines changed: 9 additions & 2 deletions

@@ -113,7 +113,10 @@ def _quantize_dequantize_fp8_ds_mla(
 
 @pytest.mark.parametrize("batch_name", list(SPARSE_BACKEND_BATCH_SPECS.keys()))
 @pytest.mark.parametrize("kv_cache_dtype", ["fp8_ds_mla", "auto"])
-def test_sparse_backend_decode_correctness(dist_init, batch_name, kv_cache_dtype):
+@pytest.mark.parametrize("tensor_parallel_size", [1, 2, 4])
+def test_sparse_backend_decode_correctness(
+    dist_init, batch_name, kv_cache_dtype, tensor_parallel_size
+):
     if not torch.cuda.is_available():
         pytest.skip("CUDA is required for sparse MLA decode test")
 
@@ -135,8 +138,11 @@ def test_sparse_backend_decode_correctness(dist_init, batch_name, kv_cache_dtype
     total_cache_tokens = sum(batch_spec.seq_lens)
     block_size = 64
 
+    # Note: We use TP=1 to avoid multi-GPU requirements in CI.
+    # The test simulates head partitioning via mocked methods below.
     vllm_config = create_vllm_config(
         model_name="deepseek-ai/DeepSeek-V2-Lite-Chat",
+        tensor_parallel_size=1,
         max_model_len=max_seqlen,
         num_gpu_blocks=max(2048, cdiv(total_cache_tokens, block_size) + 1),
         block_size=block_size,
@@ -156,7 +162,8 @@ def test_sparse_backend_decode_correctness(dist_init, batch_name, kv_cache_dtype
     )
     model_config.dtype = dtype
     model_config.get_num_attention_heads = MethodType(
-        lambda self, parallel_config: num_heads, model_config
+        lambda self, parallel_config: max(1, num_heads // tensor_parallel_size),
+        model_config,
     )
     model_config.get_num_kv_heads = MethodType(
         lambda self, parallel_config: 1, model_config
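
The sparse MLA test takes a different route from the hf_config_override used in the other files: it patches the config object's accessor with types.MethodType so the bound method reports a per-rank head count. A minimal, self-contained sketch of that binding pattern follows; DummyConfig and the head count of 128 are illustrative stand-ins, not vLLM classes.

from types import MethodType


class DummyConfig:
    # Stand-in for the real model config object patched in the test.
    def get_num_attention_heads(self, parallel_config) -> int:
        return 128


cfg = DummyConfig()
tensor_parallel_size = 4

# Rebind the accessor on this one instance so it returns the per-rank
# head count, clamped to at least one head.
cfg.get_num_attention_heads = MethodType(
    lambda self, parallel_config: max(1, 128 // tensor_parallel_size), cfg
)

print(cfg.get_num_attention_heads(None))  # 32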
