Skip to content

Commit fade2e2

Browse files
committed
Override inductor default mm with batch invariant one for B200
1 parent c9791f1 commit fade2e2

File tree

2 files changed

+19
-2
lines changed

2 files changed

+19
-2
lines changed

vllm/config/compilation.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,9 @@
1818
from vllm.platforms import current_platform
1919
from vllm.utils.import_utils import resolve_obj_by_qualname
2020
from vllm.utils.torch_utils import is_torch_equal_or_newer
21+
from vllm.model_executor.layers.batch_invariant import (
22+
vllm_is_batch_invariant,
23+
)
2124

2225
if TYPE_CHECKING:
2326
from vllm.config import VllmConfig
@@ -579,6 +582,13 @@ def __post_init__(self) -> None:
579582
self.inductor_compile_config["combo_kernels"] = True
580583
self.inductor_compile_config["benchmark_combo_kernel"] = True
581584

585+
# Batch invariance on Blackwell doesn't work with cuda graphs
586+
if vllm_is_batch_invariant() and current_platform.get_device_capability() >= (10, 0):
587+
logger.warning(
588+
"Disabling Cudagraphs: Batch invariance on Blackwell doesn't work with cuda graphs"
589+
)
590+
self.cudagraph_mode = CUDAGraphMode.NONE
591+
582592
# migrate the deprecated flags
583593
if not self.use_cudagraph:
584594
logger.warning(

vllm/model_executor/layers/batch_invariant.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@ def matmul_kernel_persistent(
140140

141141

142142
def matmul_persistent(
143-
a: torch.Tensor, b: torch.Tensor, bias: torch.Tensor | None = None
143+
a: torch.Tensor, b: torch.Tensor, bias: torch.Tensor | None = None, out = None
144144
):
145145
# Check constraints.
146146
assert a.shape[1] == b.shape[0], "Incompatible dimensions"
@@ -213,7 +213,11 @@ def grid(META):
213213
HAS_BIAS=bias is not None,
214214
**configs[dtype],
215215
)
216-
return c
216+
217+
if out is not None:
218+
out.copy_(c)
219+
else:
220+
return c
217221

218222

219223
@triton.jit
@@ -466,6 +470,9 @@ def mean_dim(
466470
def mm_batch_invariant(a, b):
    """Batch-invariant drop-in for torch.mm, backed by the persistent matmul kernel."""
    product = matmul_persistent(a, b)
    return product
468472

473+
def mm_batch_invariant_out(a, b, out=None):
    """Batch-invariant replacement for the ``out=`` overload of torch.mm.

    Computes ``a @ b`` with the persistent (batch-invariant) matmul kernel.
    When ``out`` is provided the product is written into it.

    Returns:
        The ``out`` tensor when one was supplied (mirroring torch.mm's
        ``out=`` contract), otherwise a freshly allocated result tensor.
    """
    result = matmul_persistent(a, b, bias=None, out=out)
    # matmul_persistent copies into `out` and returns None in that case;
    # torch.mm's out= variant is expected to return the output tensor, so
    # hand `out` back explicitly instead of propagating None.
    return out if out is not None else result
475+
469476

470477
def matmul_batch_invariant(a, b, *, out=None):
471478
# torch.matmul can handle various dimensions

0 commit comments

Comments
 (0)