We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent bdce49a commit db96d97 — Copy full SHA for db96d97
vllm/v1/attention/backends/mla/flashmla.py
@@ -71,7 +71,7 @@ class FlashMLAMetadata(MLACommonMetadata[FlashMLADecodeMetadata]):
71
class FlashMLAMetadataBuilder(MLACommonMetadataBuilder[FlashMLAMetadata]):
72
cudagraph_support: ClassVar[AttentionCGSupport] = AttentionCGSupport.UNIFORM_BATCH
73
query_len_support: ClassVar[QueryLenSupport] = QueryLenSupport.UNIFORM
74
- reorder_batch_threshold: int = 512 # process small prefills with decode pathway
+ reorder_batch_threshold: int = 128 # process small prefills with decode pathway
75
# ^ TODO(matt): tune this
76
77
def __init__(
0 commit comments