We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent 4a12841, commit 4b2a9c4 (Copy full SHA for 4b2a9c4)
vllm_ascend/attention/mla_v1.py
@@ -198,10 +198,10 @@ def __init__(self,
198
if self.speculative_config:
199
spec_token_num = self.speculative_config.num_speculative_tokens
200
self.decode_threshold += spec_token_num
201
- self.reorder_batch_threshold = self.decode_threshold
202
assert self.decode_threshold <= 16, f"decode_threshold exceeded \
203
npu_fused_infer_attention_score TND layout's limit of 16, \
204
got {self.decode_threshold}"
+ self.reorder_batch_threshold = self.decode_threshold
205
206
if self.chunked_prefill_enabled:
207
self.chunked_prefill_workspace_size = min(
0 commit comments