We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent bdce49a commit db96d97 — Copy full SHA for db96d97
vllm/v1/attention/backends/mla/flashmla.py
@@ -71,7 +71,7 @@ class FlashMLAMetadata(MLACommonMetadata[FlashMLADecodeMetadata]):
71
class FlashMLAMetadataBuilder(MLACommonMetadataBuilder[FlashMLAMetadata]):
72
cudagraph_support: ClassVar[AttentionCGSupport] = AttentionCGSupport.UNIFORM_BATCH
73
query_len_support: ClassVar[QueryLenSupport] = QueryLenSupport.UNIFORM
74
- reorder_batch_threshold: int = 512 # process small prefills with decode pathway
+ reorder_batch_threshold: int = 128 # process small prefills with decode pathway
75
# ^ TODO(matt): tune this
76
77
def __init__(
0 commit comments