Commit 731e588

fix prefix cache and ep
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
1 parent: cd5b26f

File tree

3 files changed: +25 -16 lines changed


.github/workflows/vllm_ascend_test_full.yaml

Lines changed: 15 additions & 15 deletions
```diff
@@ -129,12 +129,12 @@ jobs:
           # We found that if running aclgraph tests in batch, it will cause AclmdlRICaptureBegin error. So we run
           # the test separately.

-          #pytest -sv tests/e2e/singlecard/test_aclgraph.py
-          #pytest -sv tests/e2e/singlecard/test_ascend_scheduler.py
-          #pytest -sv tests/e2e/singlecard/test_camem.py
-          #pytest -sv tests/e2e/singlecard/test_chunked.py
-          #pytest -sv tests/e2e/singlecard/test_embedding.py
-          #pytest -sv tests/e2e/singlecard/test_guided_decoding.py
+          pytest -sv tests/e2e/singlecard/test_aclgraph.py
+          pytest -sv tests/e2e/singlecard/test_ascend_scheduler.py
+          pytest -sv tests/e2e/singlecard/test_camem.py
+          pytest -sv tests/e2e/singlecard/test_chunked.py
+          pytest -sv tests/e2e/singlecard/test_embedding.py
+          pytest -sv tests/e2e/singlecard/test_guided_decoding.py
           #pytest -sv tests/e2e/singlecard/test_ilama_lora.py
           pytest -sv tests/e2e/singlecard/test_profile_execute_duration.py
           pytest -sv tests/e2e/singlecard/test_quantization.py
@@ -211,21 +211,21 @@ jobs:
         VLLM_USE_MODELSCOPE: True
       run: |
         #pytest -sv tests/e2e/multicard/test_data_parallel.py
-        #pytest -sv tests/e2e/multicard/test_expert_parallel.py
+        pytest -sv tests/e2e/multicard/test_expert_parallel.py
         # external_launcher test is not stable enough. Fix it later
         # pytest -sv tests/e2e/multicard/test_external_launcher.py
         #pytest -sv tests/e2e/multicard/test_fused_moe_allgather_ep.py
         #pytest -sv tests/e2e/multicard/test_ilama_lora_tp2.py

         # To avoid oom, we need to run the test in a single process.
-        pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ
-        pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_multistream_moe
-        pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen3_W8A8
-        pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen3_W4A8DYNAMIC
-        pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W4A8DYNAMIC
-        pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_sp_for_qwen3_moe
-        pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen_Dense_with_flashcomm_v1
-        pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen_Dense_with_prefetch_mlp_weight
+        #pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ
+        #pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_multistream_moe
+        #pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen3_W8A8
+        #pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen3_W4A8DYNAMIC
+        #pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W4A8DYNAMIC
+        #pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_sp_for_qwen3_moe
+        #pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen_Dense_with_flashcomm_v1
+        #pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen_Dense_with_prefetch_mlp_weight

         #pytest -sv tests/e2e/multicard/test_pipeline_parallel.py
         pytest -sv tests/e2e/multicard/test_prefix_caching.py
```
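For anyone reproducing this CI change locally, the sketch below runs the re-enabled expert-parallel suite through pytest's Python entry point rather than the workflow shell. The test path comes from the workflow above; a vllm-ascend checkout with its test dependencies installed on an Ascend machine is assumed.

```python
# Minimal local-reproduction sketch (not part of the commit): invoke the
# re-enabled expert-parallel e2e suite the same way the workflow does.
# Assumes a vllm-ascend checkout with test dependencies installed.
import sys

import pytest

if __name__ == "__main__":
    # Equivalent to: pytest -sv tests/e2e/multicard/test_expert_parallel.py
    sys.exit(pytest.main(["-sv", "tests/e2e/multicard/test_expert_parallel.py"]))
```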

vllm_ascend/platform.py

Lines changed: 7 additions & 0 deletions
```diff
@@ -25,6 +25,7 @@
 from torch.distributed.distributed_c10d import PrefixStore
 from vllm.logger import logger
 from vllm.platforms import Platform, PlatformEnum
+from vllm.utils import cdiv

 from vllm_ascend.ascend_config import (check_ascend_config, get_ascend_config,
                                        init_ascend_config)
@@ -245,6 +246,12 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         if cache_config:
             if cache_config.block_size is None:
                 cache_config.block_size = 128
+            else:
+                target_block_size = cdiv(cache_config.block_size, 64) * 64
+                if target_block_size > 256:
+                    target_block_size = 256
+                cache_config.block_size = target_block_size
+
             if cache_config.enable_prefix_caching and cache_config.block_size != 128:
                 logger.warning(
                     "If prefix caching is enabled, block size must be set to 128."
```

vllm_ascend/worker/block_table.py

Lines changed: 3 additions & 1 deletion
```diff
@@ -166,7 +166,9 @@ def compute_slot_mapping(self, req_indices: np.ndarray,
             self.slot_mapping_np[:req_indices.shape[0]] = np.where(
                 mask, slot_mapping, -1)
         else:
-            if self.block_size == self.kernel_sizes[0]:
+            assert self.kernel_sizes is not None
+            if self.block_size == self.kernel_sizes[0] or self.kernel_sizes[
+                    0] == 0:
                 # IMPORTANT: In hybrid mode, positions are in logical block space,
                 # but we need to map them to the correct logical block table indices
                 logical_block_idx = positions // self.block_size
```
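The widened condition now also treats `kernel_sizes[0] == 0` as "no separate kernel block size" and falls through to the same logical-to-physical mapping. For readers unfamiliar with that mapping, below is a minimal NumPy sketch of the slot-mapping arithmetic this branch feeds into, under the standard vLLM paged-KV layout; the function and argument names are illustrative, not the actual `BlockTable` API.

```python
# Illustrative sketch of paged-KV slot mapping (names are assumptions,
# not the vllm_ascend BlockTable API).
import numpy as np

def compute_slot_mapping(positions: np.ndarray, block_table_row: np.ndarray,
                         block_size: int) -> np.ndarray:
    # Positions are in logical token space; map each one to a physical slot.
    logical_block_idx = positions // block_size   # which logical block
    block_offset = positions % block_size         # offset within the block
    physical_block = block_table_row[logical_block_idx]
    return physical_block * block_size + block_offset

# Tokens 0..5 with block_size=2, logical blocks mapped to physical 7, 3, 9:
print(compute_slot_mapping(np.arange(6), np.array([7, 3, 9]), 2))
# -> [14 15  6  7 18 19]
```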
