
Commit 585a494

[Core] Disable the chunked prefill feature in Non-MLA LLMs (#2894)
### What this PR does / why we need it?

This PR forcibly disables the chunked prefill feature for Non-MLA models, as the performance of the operators supporting this functionality is currently suboptimal. Only if the user has explicitly enabled chunked prefill in the ascend_scheduler_config is the feature still allowed.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

CI passed with newly added and existing tests.

Related: #2659

- vLLM version: main
- vLLM main: vllm-project/vllm@d21a36f

Signed-off-by: rjg-lyh <1318825571@qq.com>
1 parent 756b8a1 commit 585a494
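For context, a minimal usage sketch of the behavior described above (not part of this commit): on the Ascend backend, chunked prefill for a non-MLA model stays disabled unless the user opts in through ascend_scheduler_config. The model name and exact additional_config keys below are illustrative assumptions.

```python
# Hypothetical opt-in sketch; the model name and config keys are assumptions,
# not taken from this commit.
from vllm import LLM

llm = LLM(
    model="Qwen/Qwen2.5-7B-Instruct",  # a non-MLA model, for illustration
    additional_config={
        "ascend_scheduler_config": {
            "enabled": True,
            # Without this flag, this commit forcibly disables chunked prefill
            # for non-MLA models; with it, chunked prefill stays on, but a
            # performance-degradation warning is logged.
            "enable_chunked_prefill": True,
        },
    },
)
```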

File tree

3 files changed: +29 -23 lines changed

tests/ut/core/test_schedule_config.py

Lines changed: 0 additions & 18 deletions

```diff
@@ -36,7 +36,6 @@ def test_initialize_from_config_with_default(self):
             self.basic_scheduler_config, {})
         self.assertEqual(ascend_config.enable_chunked_prefill, False)
         self.assertEqual(ascend_config.policy, "fcfs")
-        self.assertEqual(ascend_config.num_scheduler_steps, 1)
         self.assertEqual(ascend_config.scheduler_cls,
                          "vllm_ascend.core.scheduler.AscendScheduler")
         self.assertEqual(ascend_config.max_num_encoder_input_tokens, 8192)
@@ -49,15 +48,13 @@ def test_initialize_from_config_with_override(self):
             AscendSchedulerConfig(
                 enable_chunked_prefill=False,
                 policy="fcfs",
-                num_scheduler_steps=1,
                 scheduler_cls="vllm_ascend.core.scheduler.AscendScheduler",
                 max_num_batched_tokens=2048,
                 max_model_len=2048,
             ),
         )
         self.assertEqual(ascend_config.enable_chunked_prefill, False)
         self.assertEqual(ascend_config.policy, "fcfs")
-        self.assertEqual(ascend_config.num_scheduler_steps, 1)
         self.assertEqual(ascend_config.scheduler_cls,
                          "vllm_ascend.core.scheduler.AscendScheduler")
         self.assertEqual(ascend_config.max_num_batched_tokens, 2048)
@@ -85,21 +82,6 @@ def test_not_implemented_multimodal(self):
         self.assertIn("currently AscendScheduler only supports LLM models",
                       str(context.exception))
 
-    def test_not_implemented_multi_step(self):
-        with self.assertRaises(NotImplementedError) as context:
-            AscendSchedulerConfig.initialize_from_config(
-                self.basic_scheduler_config,
-                AscendSchedulerConfig(
-                    num_scheduler_steps=2,
-                    max_num_batched_tokens=2048,
-                    max_model_len=2048,
-                ),
-            )
-        self.assertIn(
-            "currently AscendScheduler doesn't support multi-step",
-            str(context.exception),
-        )
-
     def test_not_implemented_send_delta_data(self):
         with self.assertRaises(NotImplementedError) as context:
             AscendSchedulerConfig.initialize_from_config(
```

vllm_ascend/core/schedule_config.py

Lines changed: 0 additions & 5 deletions

```diff
@@ -25,7 +25,6 @@
 class AscendSchedulerConfig(SchedulerConfig):
     enable_chunked_prefill: bool = False
     policy: str = "fcfs"
-    num_scheduler_steps: int = 1
     scheduler_cls: Union[str, Type[object]] = (
         "vllm_ascend.core.scheduler.AscendScheduler")
     enable_pd_transfer: bool = False
@@ -44,7 +43,6 @@ def initialize_from_config(
         # Override default values into original SchedulerConfig
         scheduler_config["enable_chunked_prefill"] = False
         scheduler_config["policy"] = "fcfs"
-        scheduler_config["num_scheduler_steps"] = 1
         scheduler_config["scheduler_cls"] = (
             "vllm_ascend.core.scheduler.AscendScheduler")
         scheduler_config["enable_pd_transfer"] = False
@@ -76,9 +74,6 @@ def __post_init__(self) -> None:
         if self.is_multimodal_model:
             raise NotImplementedError(
                 "currently AscendScheduler only supports LLM models.")
-        if self.num_scheduler_steps > 1:
-            raise NotImplementedError(
-                "currently AscendScheduler doesn't support multi-step.")
         if self.send_delta_data:
             raise NotImplementedError(
                 "currently AscendScheduler doesn't support send_delta_data.")
```

vllm_ascend/platform.py

Lines changed: 29 additions & 0 deletions

```diff
@@ -128,6 +128,35 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         model_config = vllm_config.model_config
         parallel_config = vllm_config.parallel_config
         cache_config = vllm_config.cache_config
+        decoding_config = vllm_config.decoding_config
+        scheduler_config = vllm_config.scheduler_config
+        ascend_scheduler_config = ascend_config.ascend_scheduler_config
+
+        if model_config is not None and not model_config.use_mla:
+            logger.info(
+                "Non-MLA LLMs forcibly disable the chunked prefill feature,"
+                "as the performance of operators supporting this feature "
+                "functionality is currently suboptimal.")
+            if not model_config.is_multimodal_model and \
+                decoding_config.backend == "auto" and \
+                not scheduler_config.delay_factor > 0 and \
+                not scheduler_config.send_delta_data and \
+                scheduler_config.policy == "fcfs":
+                ascend_scheduler_config.enabled = True
+                chunked_prefill_enabled_in_ascend_scheduler = getattr(
+                    ascend_scheduler_config, "enable_chunked_prefill", False)
+                if chunked_prefill_enabled_in_ascend_scheduler:
+                    logger.warning(
+                        "Chunked prefill feature is enabled in ascend_scheduler,"
+                        "but note that the operator supporting this feature "
+                        "would lead to performance degradation.")
+                # In this situation, max_num_batched_tokens would have been rewritten.
+                # So we must make sure max_num_batched_tokens is not smaller than max_model_len.
+                if (scheduler_config.max_num_batched_tokens
+                        < scheduler_config.max_model_len
+                        and not chunked_prefill_enabled_in_ascend_scheduler):
+                    scheduler_config.max_num_batched_tokens = scheduler_config.max_model_len
+
         kv_cache_dtype = vllm_config.additional_config.get(
             "kv_cache_dtype", None)
         if kv_cache_dtype is not None:
```
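To make the new gating easier to follow, here is a condensed, hypothetical restatement of the non-MLA branch added above as a standalone helper. The function name and duck-typed arguments are illustrative assumptions; the real logic lives in check_and_update_config in vllm_ascend/platform.py.

```python
# Condensed sketch of the non-MLA branch above; not the actual implementation.
def apply_non_mla_chunked_prefill_policy(model_config, decoding_config,
                                         scheduler_config,
                                         ascend_scheduler_config) -> None:
    if model_config is None or model_config.use_mla:
        return  # MLA models are left untouched.

    # A plain text LLM running with default decoding and scheduling settings
    # is handed to AscendScheduler instead of vLLM's default scheduler.
    if (not model_config.is_multimodal_model
            and decoding_config.backend == "auto"
            and not scheduler_config.delay_factor > 0
            and not scheduler_config.send_delta_data
            and scheduler_config.policy == "fcfs"):
        ascend_scheduler_config.enabled = True
        user_opted_in = getattr(ascend_scheduler_config,
                                "enable_chunked_prefill", False)
        # With chunked prefill off, a whole prompt must fit into one batch,
        # so max_num_batched_tokens may not drop below max_model_len.
        if (not user_opted_in and scheduler_config.max_num_batched_tokens
                < scheduler_config.max_model_len):
            scheduler_config.max_num_batched_tokens = scheduler_config.max_model_len
```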

0 commit comments