@@ -28,7 +28,7 @@
 from tests.e2e.model_utils import check_outputs_equal
 
 MODELS = [
-    "Qwen/Qwen3-0.6B",
+    "deepseek-ai/DeepSeek-V2-Lite",
 ]
@@ -39,15 +39,19 @@ def test_models_with_multistream_overlap_shared_expert(
     max_tokens: int,
 ) -> None:
     prompts = [
-        "Hello, my name is", "The president of the United States is",
-        "The capital of France is", "The future of AI is"
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
     ]
 
     sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0)
     with VllmRunner(
         model,
         max_model_len=1024,
         enforce_eager=True,
+        data_parallel_size=2,
Collaborator:
dp works in this case?

+        enable_expert_parallel=True,
Contributor (comment on lines +53 to +54):

high

These parameters are duplicated across the three VllmRunner instantiations in this test (see also lines 66-67 and 79-80). This code duplication makes the test harder to maintain and more fragile. If these parameters need to be updated in the future, it's easy to forget to update all instances, which could lead to incorrect test behavior and mask real bugs.

To improve robustness, I recommend refactoring to remove the duplication. You can define a dictionary with the common parameters and reuse it for each VllmRunner call.

Example:

common_args = {
    "model": model,
    "max_model_len": 1024,
    "data_parallel_size": 2,
    "enable_expert_parallel": True,
}

with VllmRunner(
    **common_args,
    enforce_eager=True,
    additional_config={
        "multistream_overlap_shared_expert": True,
    },
) as runner:
    # ...

with VllmRunner(
    **common_args,
    enforce_eager=False,
    additional_config={
        "multistream_overlap_shared_expert": True,
    },
) as runner:
    # ...

with VllmRunner(**common_args, enforce_eager=True) as runner:
    # ...
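A variant of the same refactor, shown only as a sketch: bind the shared arguments once with functools.partial instead of a dict. This assumes VllmRunner accepts the keyword arguments exactly as used in this test; the make_runner and vllm_ms_eager_outputs names are illustrative, not from the original code.

from functools import partial

# Bind the arguments shared by all three configurations once; each
# with-block below only passes the options that differ per case.
make_runner = partial(
    VllmRunner,
    model,
    max_model_len=1024,
    data_parallel_size=2,
    enable_expert_parallel=True,
)

with make_runner(
    enforce_eager=True,
    additional_config={"multistream_overlap_shared_expert": True},
) as runner:
    vllm_ms_eager_outputs = runner.model.generate(prompts, sampling_params)

with make_runner(enforce_eager=True) as runner:
    vllm_eager_outputs = runner.model.generate(prompts, sampling_params)

Either form keeps the three configurations in sync; partial additionally fixes the positional model argument, while the dict keeps every argument visible at each call site.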

         additional_config={
             "multistream_overlap_shared_expert": True,
         },
@@ -59,6 +63,8 @@ def test_models_with_multistream_overlap_shared_expert(
         model,
         max_model_len=1024,
         enforce_eager=False,
+        data_parallel_size=2,
+        enable_expert_parallel=True,
         additional_config={
             "multistream_overlap_shared_expert": True,
         },
@@ -70,6 +76,8 @@ def test_models_with_multistream_overlap_shared_expert(
         model,
         max_model_len=1024,
         enforce_eager=True,
+        data_parallel_size=2,
+        enable_expert_parallel=True,
     ) as runner:
         vllm_eager_outputs = runner.model.generate(prompts, sampling_params)