Commit 65767d7

[CI] Refactor e2e CI
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
1 parent 600b08f commit 65767d7


41 files changed: +372 -1755 lines

.github/workflows/vllm_ascend_test.yaml

Lines changed: 28 additions & 20 deletions
@@ -191,27 +191,29 @@ jobs:
       VLLM_WORKER_MULTIPROC_METHOD: spawn
       VLLM_USE_MODELSCOPE: True
     run: |
-      pytest -sv tests/e2e/singlecard/test_offline_inference.py
-      # pytest -sv tests/e2e/singlecard/test_ilama_lora.py
-      pytest -sv tests/e2e/singlecard/test_guided_decoding.py
+      # We found that running the aclgraph tests in a batch causes an AclmdlRICaptureBegin error,
+      # so we run them separately.
+
+      pytest -sv tests/e2e/singlecard/test_aclgraph.py
+      pytest -sv tests/e2e/singlecard/test_ascend_scheduler.py
       pytest -sv tests/e2e/singlecard/test_camem.py
+      pytest -sv tests/e2e/singlecard/test_chunked.py
       pytest -sv tests/e2e/singlecard/test_embedding.py
+      pytest -sv tests/e2e/singlecard/test_guided_decoding.py
+      # TODO: Fix lora accuracy error
+      # pytest -sv tests/e2e/singlecard/test_ilama_lora.py
+      pytest -sv tests/e2e/singlecard/test_profile_execute_duration.py
+      pytest -sv tests/e2e/singlecard/test_quantization.py
+      pytest -sv tests/e2e/singlecard/test_sampler.py
+      pytest -sv tests/e2e/singlecard/test_vlm.py

       # ------------------------------------ v1 spec decode test ------------------------------------ #
       pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py
       # TODO: revert me when test_v1_spec_decode.py::test_ngram_correctness is fixed
       pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py

-      # All other tests, ignore: 310p test, accuracy test.
-      pytest -sv tests/e2e/singlecard/ \
-        --ignore=tests/e2e/singlecard/test_offline_inference.py \
-        --ignore=tests/e2e/singlecard/test_ilama_lora.py \
-        --ignore=tests/e2e/singlecard/test_guided_decoding.py \
-        --ignore=tests/e2e/singlecard/test_camem.py \
-        --ignore=tests/e2e/singlecard/test_embedding.py \
-        --ignore=tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py \
-        --ignore=tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py \
-        --ignore=tests/e2e/singlecard/test_offline_inference_310p.py
+      pytest -sv tests/e2e/singlecard/ops/
+
  e2e-2-cards:
    needs: [e2e]
    if: ${{ needs.e2e.result == 'success' }}
@@ -273,18 +275,24 @@ jobs:
       VLLM_WORKER_MULTIPROC_METHOD: spawn
       VLLM_USE_MODELSCOPE: True
     run: |
+      pytest -sv tests/e2e/multicard/test_data_parallel.py
+      pytest -sv tests/e2e/multicard/test_expert_parallel.py
+      pytest -sv tests/e2e/multicard/test_external_launcher.py
+      pytest -sv tests/e2e/multicard/test_fused_moe_allgather_ep.py
       # pytest -sv tests/e2e/multicard/test_ilama_lora_tp2.py
-      # Fixme: run VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py will raise error.
+
       # To avoid oom, we need to run the test in a single process.
-      pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_multistream_moe
       pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ
+      pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_multistream_moe
+      pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_pangu
       pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeekV3_dbo
       pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_alltoallv
+      pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen3_W8A8
       pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen3_W4A8DYNAMIC
       pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W4A8DYNAMIC
       pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_sp_for_qwen3_moe
-      pytest -sv tests/e2e/multicard/test_data_parallel.py
-      pytest -sv tests/e2e/multicard/ --ignore=tests/e2e/multicard/test_ilama_lora_tp2.py \
-        --ignore=tests/e2e/multicard/test_offline_inference_distributed.py \
-        --ignore=tests/e2e/multicard/test_data_parallel.py \
-        --ignore=tests/e2e/multicard/test_offline_inference_310p.py
+
+      pytest -sv tests/e2e/multicard/test_pipeline_parallel.py
+      pytest -sv tests/e2e/multicard/test_prefix_caching.py
+      pytest -sv tests/e2e/multicard/test_qwen3_moe.py
+      pytest -sv tests/e2e/multicard/test_torchair_graph_mode.py
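Now that the catch-all pytest invocations with --ignore flags are gone, every e2e test file has to be listed explicitly in the workflow, so a newly added test can silently drop out of CI. A small helper along the following lines could flag that; it is a hypothetical sketch, not part of this commit, and assumes it is run from the repository root with the layout shown above.

# check_ci_coverage.py - hypothetical helper, not part of this commit.
# Reports singlecard e2e test files that the workflow never mentions,
# since the catch-all pytest invocation with --ignore flags was removed.
from pathlib import Path

WORKFLOW = Path(".github/workflows/vllm_ascend_test.yaml")
TEST_DIR = Path("tests/e2e/singlecard")


def main() -> None:
    workflow_text = WORKFLOW.read_text()
    # Only checks top-level test files; subdirectories such as ops/ are
    # invoked by directory in the workflow.
    missing = [
        str(path) for path in sorted(TEST_DIR.glob("test_*.py"))
        if str(path) not in workflow_text
    ]
    if missing:
        print("Test files not referenced by the workflow:")
        for path in missing:
            print(f"  {path}")
    else:
        print("All singlecard e2e test files are referenced by the workflow.")


if __name__ == "__main__":
    main()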

.github/workflows/vllm_ascend_test_310p.yaml

Lines changed: 3 additions & 3 deletions
@@ -111,7 +111,7 @@ jobs:
       PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
     run: |
       if [[ "${{ matrix.os }}" == "linux-aarch64-310p-1" ]]; then
-        pytest -sv tests/e2e/singlecard/test_offline_inference_310p.py
+        pytest -sv tests/e2e/310p/test_offline_inference_310p.py
       else
-        pytest -sv tests/e2e/multicard/test_offline_inference_310p.py
-      fi
+        pytest -sv tests/e2e/310p/test_offline_inference_parallel_310p.py
+      fi

tests/e2e/singlecard/test_offline_inference_310p.py renamed to tests/e2e/310p/test_offline_inference_310p.py

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@
 import vllm_ascend  # noqa: F401
 from tests.e2e.conftest import VllmRunner

-MODELS = ["Qwen/Qwen3-0.6B-Base", "Qwen/Qwen2.5-7B-Instruct"]
+MODELS = ["Qwen/Qwen3-0.6B", "Qwen/Qwen2.5-7B-Instruct"]


 @pytest.mark.parametrize("model", MODELS)

tests/e2e/conftest.py

Lines changed: 23 additions & 113 deletions
@@ -33,13 +33,11 @@
 from transformers.models.auto.auto_factory import _BaseAutoModelClass
 from vllm import LLM, SamplingParams
 from vllm.config import TaskOption, _get_and_verify_dtype
-from vllm.inputs import ExplicitEncoderDecoderPrompt, TextPrompt, TokensPrompt
+from vllm.inputs import TextPrompt
 from vllm.outputs import RequestOutput
-from vllm.sampling_params import BeamSearchParams
 from vllm.transformers_utils.utils import maybe_model_redirect
-from vllm.utils import is_list_of

-from tests.e2e.model_utils import (PROMPT_TEMPLATES, TokensTextLogprobs,
+from tests.e2e.model_utils import (TokensTextLogprobs,
                                    TokensTextLogprobsPromptLogprobs)
 # TODO: remove this part after the patch merged into vllm, if
 # we not explicitly patch here, some of them might be effectiveless
@@ -62,7 +60,6 @@
 PromptVideoInput = _PromptMultiModalInput[np.ndarray]

 _TEST_DIR = os.path.dirname(__file__)
-_TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]


 def cleanup_dist_env_and_memory(shutdown_ray: bool = False):
@@ -89,13 +86,13 @@ def __init__(
         # Use smaller max model length, otherwise bigger model cannot run due
         # to kv cache size limit.
         max_model_len: int = 1024,
-        dtype: str = "half",
+        dtype: str = "auto",
         disable_log_stats: bool = True,
         tensor_parallel_size: int = 1,
         block_size: int = 16,
         enable_chunked_prefill: bool = False,
         swap_space: int = 4,
-        enforce_eager: Optional[bool] = True,
+        enforce_eager: Optional[bool] = False,
         quantization: Optional[str] = None,
         **kwargs,
     ) -> None:
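In the hunk above, VllmRunner's defaults change from dtype="half" to "auto" and from enforce_eager=True to False, so e2e tests now run in graph mode with the model's native dtype unless they opt out. A test that still wants the old behaviour would pass the arguments explicitly, roughly as in this sketch; the model name and the generate_greedy call follow patterns visible elsewhere in this diff, and the exact signature may differ.

# Hypothetical opt-out of the new VllmRunner defaults; not part of this commit.
from tests.e2e.conftest import VllmRunner

with VllmRunner("Qwen/Qwen3-0.6B",   # model used by the 310p test above
                dtype="half",        # previous default, now opt-in
                enforce_eager=True   # previous default, now opt-in
                ) as runner:
    outputs = runner.generate_greedy(["Hello, my name is"], max_tokens=16)
    print(outputs[0])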
@@ -220,26 +217,6 @@ def generate_w_logprobs(
                 if sampling_params.prompt_logprobs is None else
                 toks_str_logsprobs_prompt_logprobs)

-    def generate_encoder_decoder_w_logprobs(
-        self,
-        encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]],
-        sampling_params: SamplingParams,
-    ) -> Union[List[TokensTextLogprobs],
-               List[TokensTextLogprobsPromptLogprobs]]:
-        '''
-        Logprobs generation for vLLM encoder/decoder models
-        '''
-
-        assert sampling_params.logprobs is not None
-        req_outputs = self.model.generate(encoder_decoder_prompts,
-                                          sampling_params=sampling_params)
-        toks_str_logsprobs_prompt_logprobs = (
-            self._final_steps_generate_w_logprobs(req_outputs))
-        # Omit prompt logprobs if not required by sampling params
-        return ([x[0:-1] for x in toks_str_logsprobs_prompt_logprobs]
-                if sampling_params.prompt_logprobs is None else
-                toks_str_logsprobs_prompt_logprobs)
-
     def generate_greedy(
         self,
         prompts: List[str],
@@ -284,53 +261,6 @@ def generate_greedy_logprobs(
                                           audios=audios,
                                           videos=videos)

-    def generate_encoder_decoder_greedy_logprobs(
-        self,
-        encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]],
-        max_tokens: int,
-        num_logprobs: int,
-        num_prompt_logprobs: Optional[int] = None,
-    ) -> Union[List[TokensTextLogprobs],
-               List[TokensTextLogprobsPromptLogprobs]]:
-        greedy_logprobs_params = SamplingParams(
-            temperature=0.0,
-            max_tokens=max_tokens,
-            logprobs=num_logprobs,
-            prompt_logprobs=(num_prompt_logprobs),
-        )
-        '''
-        Greedy logprobs generation for vLLM encoder/decoder models
-        '''
-
-        return self.generate_encoder_decoder_w_logprobs(
-            encoder_decoder_prompts, greedy_logprobs_params)
-
-    def generate_beam_search(
-        self,
-        prompts: Union[List[str], List[List[int]]],
-        beam_width: int,
-        max_tokens: int,
-    ) -> List[Tuple[List[List[int]], List[str]]]:
-        if is_list_of(prompts, str, check="all"):
-            prompts = [TextPrompt(prompt=prompt) for prompt in prompts]
-        else:
-            prompts = [
-                TokensPrompt(prompt_token_ids=tokens) for tokens in prompts
-            ]
-        outputs = self.model.beam_search(
-            prompts,
-            BeamSearchParams(beam_width=beam_width, max_tokens=max_tokens))
-        returned_outputs = []
-        for output in outputs:
-            token_ids = [x.tokens for x in output.sequences]
-            texts = [x.text for x in output.sequences]
-            returned_outputs.append((token_ids, texts))
-        return returned_outputs
-
-    def classify(self, prompts: List[str]) -> List[List[float]]:
-        req_outputs = self.model.classify(prompts)
-        return [req_output.outputs.probs for req_output in req_outputs]
-
     def encode(
         self,
         prompts: List[str],
@@ -346,14 +276,6 @@ def encode(
         req_outputs = self.model.embed(inputs)
         return [req_output.outputs.embedding for req_output in req_outputs]

-    def score(
-        self,
-        text_1: Union[str, List[str]],
-        text_2: Union[str, List[str]],
-    ) -> List[float]:
-        req_outputs = self.model.score(text_1, text_2)
-        return [req_output.outputs.score for req_output in req_outputs]
-
     def __enter__(self):
         return self

@@ -362,35 +284,6 @@ def __exit__(self, exc_type, exc_value, traceback):
         cleanup_dist_env_and_memory()


-@pytest.fixture(scope="session")
-def vllm_runner():
-    return VllmRunner
-
-
-@pytest.fixture(params=list(PROMPT_TEMPLATES.keys()))
-def prompt_template(request):
-    return PROMPT_TEMPLATES[request.param]
-
-
-def _read_prompts(filename: str) -> list[str]:
-    with open(filename) as f:
-        prompts = f.readlines()
-    return prompts
-
-
-@pytest.fixture
-def example_prompts() -> list[str]:
-    prompts = []
-    for filename in _TEST_PROMPTS:
-        prompts += _read_prompts(filename)
-    return prompts
-
-
-@pytest.fixture(scope="session")
-def ilama_lora_files():
-    return snapshot_download(repo_id="vllm-ascend/ilama-text2sql-spider")
-
-
 class HfRunner:

     def get_default_device(self):
@@ -515,5 +408,22 @@ def __exit__(self, exc_type, exc_value, traceback):


 @pytest.fixture(scope="session")
-def hf_runner():
-    return HfRunner
+def ilama_lora_files():
+    return snapshot_download(repo_id="vllm-ascend/ilama-text2sql-spider")
+
+
+def qwen_prompt(questions: List[str]) -> List[str]:
+    placeholder = "<|image_pad|>"
+    return [("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+             f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
+             f"{q}<|im_end|>\n<|im_start|>assistant\n") for q in questions]
+
+
+PROMPT_TEMPLATES = {
+    "qwen2.5vl": qwen_prompt,
+}
+
+
+@pytest.fixture(params=list(PROMPT_TEMPLATES.keys()))
+def prompt_template(request):
+    return PROMPT_TEMPLATES[request.param]
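With this hunk, PROMPT_TEMPLATES is no longer imported from tests.e2e.model_utils; the qwen2.5vl template and the prompt_template fixture are now defined directly in conftest.py, and ilama_lora_files takes the place of the removed hf_runner fixture. A test would consume the relocated fixture as before; the snippet below is a hypothetical usage sketch, not code from the commit.

# Hypothetical consumer of the relocated prompt_template fixture.
def test_qwen25vl_prompt_format(prompt_template):
    questions = ["What is shown in the image?", "Describe the scene."]
    prompts = prompt_template(questions)
    # qwen_prompt wraps each question in the Qwen2.5-VL chat template
    # with a single <|image_pad|> placeholder.
    assert len(prompts) == len(questions)
    assert all("<|image_pad|>" in p for p in prompts)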
