Commit 3a0cc9f

wangxiyuan authored and wangxiaoteng888 committed
Refactor e2e CI (vllm-project#2276)
Refactor the E2E CI to make it clearer and faster:
1. Remove some useless e2e tests.
2. Remove some useless functions.
3. Make sure every test runs through VllmRunner to avoid OOM errors.
4. Make sure every ops test ends with an empty_cache call to avoid OOM errors.
5. Run the tests one by one to avoid resource-limit errors.

- vLLM version: v0.10.1.1
- vLLM main: vllm-project/vllm@a344a5a

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
1 parent 044bc67 commit 3a0cc9f
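
The commit message describes the new test conventions without showing them, so here is a minimal sketch of what a test written under these rules might look like. It is not code from this commit: it assumes `VllmRunner` takes the model name as its first argument, that `generate_greedy(prompts, max_tokens)` matches the signature kept in `tests/e2e/conftest.py`, and that `torch.npu.empty_cache()` (from `torch_npu`) is the cache-release call used on Ascend; the prompt is a placeholder.

```python
# Hypothetical sketch of the refactored test pattern; not part of this commit.
import pytest
import torch

from tests.e2e.conftest import VllmRunner

MODELS = ["Qwen/Qwen3-0.6B"]  # placeholder model choice


@pytest.mark.parametrize("model", MODELS)
def test_basic_generation(model: str) -> None:
    # Running through VllmRunner as a context manager means __exit__ calls
    # cleanup_dist_env_and_memory(), so engine memory is reclaimed per test.
    with VllmRunner(model, max_model_len=1024, enforce_eager=True) as runner:
        outputs = runner.generate_greedy(["Hello, my name is"], 16)
    assert len(outputs) == 1


def test_some_custom_op() -> None:
    # ... exercise a custom op here ...
    # Commit rule: ops tests end by releasing the NPU cache so that OOM does
    # not build up across tests in the same job (assumed torch_npu API).
    if hasattr(torch, "npu"):
        torch.npu.empty_cache()
```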

41 files changed: +372 −1755 lines

.github/workflows/vllm_ascend_test.yaml

Lines changed: 29 additions & 21 deletions
@@ -191,27 +191,29 @@ jobs:
           VLLM_WORKER_MULTIPROC_METHOD: spawn
           VLLM_USE_MODELSCOPE: True
         run: |
-          pytest -sv tests/e2e/singlecard/test_offline_inference.py
-          # pytest -sv tests/e2e/singlecard/test_ilama_lora.py
-          pytest -sv tests/e2e/singlecard/test_guided_decoding.py
+          # We found that if running aclgraph tests in batch, it will cause AclmdlRICaptureBegin error. So we run
+          # the test separately.
+
+          pytest -sv tests/e2e/singlecard/test_aclgraph.py
+          pytest -sv tests/e2e/singlecard/test_ascend_scheduler.py
           pytest -sv tests/e2e/singlecard/test_camem.py
+          pytest -sv tests/e2e/singlecard/test_chunked.py
           pytest -sv tests/e2e/singlecard/test_embedding.py
+          pytest -sv tests/e2e/singlecard/test_guided_decoding.py
+          # TODO: Fix lora accuracy error
+          # pytest -sv tests/e2e/singlecard/test_ilama_lora.py
+          pytest -sv tests/e2e/singlecard/test_profile_execute_duration.py
+          pytest -sv tests/e2e/singlecard/test_quantization.py
+          pytest -sv tests/e2e/singlecard/test_sampler.py
+          pytest -sv tests/e2e/singlecard/test_vlm.py

           # ------------------------------------ v1 spec decode test ------------------------------------ #
           pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py
           # TODO: revert me when test_v1_spec_decode.py::test_ngram_correctness is fixed
           pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py

-          # All other tests, ignore: 310p test, accuracy test.
-          pytest -sv tests/e2e/singlecard/ \
-            --ignore=tests/e2e/singlecard/test_offline_inference.py \
-            --ignore=tests/e2e/singlecard/test_ilama_lora.py \
-            --ignore=tests/e2e/singlecard/test_guided_decoding.py \
-            --ignore=tests/e2e/singlecard/test_camem.py \
-            --ignore=tests/e2e/singlecard/test_embedding.py \
-            --ignore=tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py \
-            --ignore=tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py \
-            --ignore=tests/e2e/singlecard/test_offline_inference_310p.py
+          pytest -sv tests/e2e/singlecard/ops/
+
   e2e-2-cards:
     needs: [e2e]
     if: ${{ needs.e2e.result == 'success' }}

@@ -273,17 +275,23 @@ jobs:
           VLLM_WORKER_MULTIPROC_METHOD: spawn
           VLLM_USE_MODELSCOPE: True
         run: |
+          pytest -sv tests/e2e/multicard/test_data_parallel.py
+          pytest -sv tests/e2e/multicard/test_expert_parallel.py
+          # external_launcher test is not stable enough. Fix it later
+          # pytest -sv tests/e2e/multicard/test_external_launcher.py
+          pytest -sv tests/e2e/multicard/test_fused_moe_allgather_ep.py
           # pytest -sv tests/e2e/multicard/test_ilama_lora_tp2.py
-          # Fixme: run VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py will raise error.
+
           # To avoid oom, we need to run the test in a single process.
-          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_multistream_moe
           pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ
-          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeekV3_dbo
+          pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_multistream_moe
+          #pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_pangu
+          #pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen3_W8A8
           pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen3_W4A8DYNAMIC
           pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W4A8DYNAMIC
           pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_sp_for_qwen3_moe
-          pytest -sv tests/e2e/multicard/test_data_parallel.py
-          pytest -sv tests/e2e/multicard/ --ignore=tests/e2e/multicard/test_ilama_lora_tp2.py \
-            --ignore=tests/e2e/multicard/test_offline_inference_distributed.py \
-            --ignore=tests/e2e/multicard/test_data_parallel.py \
-            --ignore=tests/e2e/multicard/test_offline_inference_310p.py
+
+          #pytest -sv tests/e2e/multicard/test_pipeline_parallel.py
+          #pytest -sv tests/e2e/multicard/test_prefix_caching.py
+          #pytest -sv tests/e2e/multicard/test_qwen3_moe.py
+          #pytest -sv tests/e2e/multicard/test_torchair_graph_mode.py

.github/workflows/vllm_ascend_test_310p.yaml

Lines changed: 3 additions & 3 deletions
@@ -111,7 +111,7 @@ jobs:
           PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
         run: |
           if [[ "${{ matrix.os }}" == "linux-aarch64-310p-1" ]]; then
-            pytest -sv tests/e2e/singlecard/test_offline_inference_310p.py
+            pytest -sv tests/e2e/310p/test_offline_inference_310p.py
           else
-            pytest -sv tests/e2e/multicard/test_offline_inference_310p.py
-          fi
+            pytest -sv tests/e2e/310p/test_offline_inference_parallel_310p.py
+          fi

tests/e2e/singlecard/test_offline_inference_310p.py renamed to tests/e2e/310p/test_offline_inference_310p.py

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@
 import vllm_ascend  # noqa: F401
 from tests.e2e.conftest import VllmRunner

-MODELS = ["Qwen/Qwen3-0.6B-Base", "Qwen/Qwen2.5-7B-Instruct"]
+MODELS = ["Qwen/Qwen3-0.6B", "Qwen/Qwen2.5-7B-Instruct"]


 @pytest.mark.parametrize("model", MODELS)

tests/e2e/conftest.py

Lines changed: 23 additions & 113 deletions
@@ -33,13 +33,11 @@
 from transformers.models.auto.auto_factory import _BaseAutoModelClass
 from vllm import LLM, SamplingParams
 from vllm.config import TaskOption, _get_and_verify_dtype
-from vllm.inputs import ExplicitEncoderDecoderPrompt, TextPrompt, TokensPrompt
+from vllm.inputs import TextPrompt
 from vllm.outputs import RequestOutput
-from vllm.sampling_params import BeamSearchParams
 from vllm.transformers_utils.utils import maybe_model_redirect
-from vllm.utils import is_list_of

-from tests.e2e.model_utils import (PROMPT_TEMPLATES, TokensTextLogprobs,
+from tests.e2e.model_utils import (TokensTextLogprobs,
                                    TokensTextLogprobsPromptLogprobs)
 # TODO: remove this part after the patch merged into vllm, if
 # we not explicitly patch here, some of them might be effectiveless

@@ -62,7 +60,6 @@
 PromptVideoInput = _PromptMultiModalInput[np.ndarray]

 _TEST_DIR = os.path.dirname(__file__)
-_TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]


 def cleanup_dist_env_and_memory(shutdown_ray: bool = False):

@@ -89,13 +86,13 @@ def __init__(
         # Use smaller max model length, otherwise bigger model cannot run due
         # to kv cache size limit.
         max_model_len: int = 1024,
-        dtype: str = "half",
+        dtype: str = "auto",
         disable_log_stats: bool = True,
         tensor_parallel_size: int = 1,
         block_size: int = 16,
         enable_chunked_prefill: bool = False,
         swap_space: int = 4,
-        enforce_eager: Optional[bool] = True,
+        enforce_eager: Optional[bool] = False,
         quantization: Optional[str] = None,
         **kwargs,
     ) -> None:

@@ -220,26 +217,6 @@ def generate_w_logprobs(
                 if sampling_params.prompt_logprobs is None else
                 toks_str_logsprobs_prompt_logprobs)

-    def generate_encoder_decoder_w_logprobs(
-        self,
-        encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]],
-        sampling_params: SamplingParams,
-    ) -> Union[List[TokensTextLogprobs],
-               List[TokensTextLogprobsPromptLogprobs]]:
-        '''
-        Logprobs generation for vLLM encoder/decoder models
-        '''
-
-        assert sampling_params.logprobs is not None
-        req_outputs = self.model.generate(encoder_decoder_prompts,
-                                          sampling_params=sampling_params)
-        toks_str_logsprobs_prompt_logprobs = (
-            self._final_steps_generate_w_logprobs(req_outputs))
-        # Omit prompt logprobs if not required by sampling params
-        return ([x[0:-1] for x in toks_str_logsprobs_prompt_logprobs]
-                if sampling_params.prompt_logprobs is None else
-                toks_str_logsprobs_prompt_logprobs)
-
     def generate_greedy(
         self,
         prompts: List[str],

@@ -284,53 +261,6 @@ def generate_greedy_logprobs(
                                         audios=audios,
                                         videos=videos)

-    def generate_encoder_decoder_greedy_logprobs(
-        self,
-        encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]],
-        max_tokens: int,
-        num_logprobs: int,
-        num_prompt_logprobs: Optional[int] = None,
-    ) -> Union[List[TokensTextLogprobs],
-               List[TokensTextLogprobsPromptLogprobs]]:
-        greedy_logprobs_params = SamplingParams(
-            temperature=0.0,
-            max_tokens=max_tokens,
-            logprobs=num_logprobs,
-            prompt_logprobs=(num_prompt_logprobs),
-        )
-        '''
-        Greedy logprobs generation for vLLM encoder/decoder models
-        '''
-
-        return self.generate_encoder_decoder_w_logprobs(
-            encoder_decoder_prompts, greedy_logprobs_params)
-
-    def generate_beam_search(
-        self,
-        prompts: Union[List[str], List[List[int]]],
-        beam_width: int,
-        max_tokens: int,
-    ) -> List[Tuple[List[List[int]], List[str]]]:
-        if is_list_of(prompts, str, check="all"):
-            prompts = [TextPrompt(prompt=prompt) for prompt in prompts]
-        else:
-            prompts = [
-                TokensPrompt(prompt_token_ids=tokens) for tokens in prompts
-            ]
-        outputs = self.model.beam_search(
-            prompts,
-            BeamSearchParams(beam_width=beam_width, max_tokens=max_tokens))
-        returned_outputs = []
-        for output in outputs:
-            token_ids = [x.tokens for x in output.sequences]
-            texts = [x.text for x in output.sequences]
-            returned_outputs.append((token_ids, texts))
-        return returned_outputs
-
-    def classify(self, prompts: List[str]) -> List[List[float]]:
-        req_outputs = self.model.classify(prompts)
-        return [req_output.outputs.probs for req_output in req_outputs]
-
     def encode(
         self,
         prompts: List[str],

@@ -346,14 +276,6 @@ def encode(
         req_outputs = self.model.embed(inputs)
         return [req_output.outputs.embedding for req_output in req_outputs]

-    def score(
-        self,
-        text_1: Union[str, List[str]],
-        text_2: Union[str, List[str]],
-    ) -> List[float]:
-        req_outputs = self.model.score(text_1, text_2)
-        return [req_output.outputs.score for req_output in req_outputs]
-
     def __enter__(self):
         return self

@@ -362,35 +284,6 @@ def __exit__(self, exc_type, exc_value, traceback):
         cleanup_dist_env_and_memory()


-@pytest.fixture(scope="session")
-def vllm_runner():
-    return VllmRunner
-
-
-@pytest.fixture(params=list(PROMPT_TEMPLATES.keys()))
-def prompt_template(request):
-    return PROMPT_TEMPLATES[request.param]
-
-
-def _read_prompts(filename: str) -> list[str]:
-    with open(filename) as f:
-        prompts = f.readlines()
-        return prompts
-
-
-@pytest.fixture
-def example_prompts() -> list[str]:
-    prompts = []
-    for filename in _TEST_PROMPTS:
-        prompts += _read_prompts(filename)
-    return prompts
-
-
-@pytest.fixture(scope="session")
-def ilama_lora_files():
-    return snapshot_download(repo_id="vllm-ascend/ilama-text2sql-spider")
-
-
 class HfRunner:

     def get_default_device(self):

@@ -515,5 +408,22 @@ def __exit__(self, exc_type, exc_value, traceback):


 @pytest.fixture(scope="session")
-def hf_runner():
-    return HfRunner
+def ilama_lora_files():
+    return snapshot_download(repo_id="vllm-ascend/ilama-text2sql-spider")
+
+
+def qwen_prompt(questions: List[str]) -> List[str]:
+    placeholder = "<|image_pad|>"
+    return [("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+             f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
+             f"{q}<|im_end|>\n<|im_start|>assistant\n") for q in questions]
+
+
+PROMPT_TEMPLATES = {
+    "qwen2.5vl": qwen_prompt,
+}
+
+
+@pytest.fixture(params=list(PROMPT_TEMPLATES.keys()))
+def prompt_template(request):
+    return PROMPT_TEMPLATES[request.param]
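
As context for the relocated `prompt_template` fixture and the new `qwen_prompt` helper, a hypothetical consumer might look like the sketch below; the question string, the assertions, and the commented-out engine call are illustrative only and are not part of this commit.

```python
# Hypothetical usage of the prompt_template fixture defined in tests/e2e/conftest.py.
def test_prompt_template_wraps_question(prompt_template) -> None:
    prompts = prompt_template(["What is shown in this image?"])
    # qwen_prompt wraps each question in the Qwen2.5-VL chat template,
    # including the vision placeholder tokens.
    assert prompts[0].startswith("<|im_start|>system")
    assert "<|vision_start|><|image_pad|><|vision_end|>" in prompts[0]
    # The formatted prompts would then be fed to a multimodal run, e.g.:
    # with VllmRunner("Qwen/Qwen2.5-VL-3B-Instruct") as runner:
    #     runner.generate_greedy(prompts, 32, images=images)
```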
