Skip to content

Commit 3878d3c

Browse files
committed
Override inductor default mm with batch invariant one for B200
1 parent c9791f1 commit 3878d3c

File tree

2 files changed

+10
-2
lines changed

2 files changed

+10
-2
lines changed

tests/v1/generation/test_batch_invariance.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -456,7 +456,6 @@ def test_simple_generation(backend, monkeypatch: pytest.MonkeyPatch):
456456
model=model,
457457
max_num_seqs=1,
458458
tensor_parallel_size=int(os.getenv("VLLM_TP_SIZE", "1")),
459-
enforce_eager=True,
460459
gpu_memory_utilization=0.9,
461460
max_model_len=2048,
462461
dtype="bfloat16",
@@ -998,7 +997,6 @@ def LLM_with_max_seqs(
998997
dtype="bfloat16",
999998
tensor_parallel_size=int(os.getenv("VLLM_TP_SIZE", "1")),
1000999
enable_prefix_caching=False,
1001-
enforce_eager=True,
10021000
# Enable for MOE models
10031001
# enable_expert_parallel=True,
10041002
)

vllm/config/compilation.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@
1515
from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
1616
from vllm.config.utils import config
1717
from vllm.logger import init_logger
18+
from vllm.model_executor.layers.batch_invariant import (
19+
vllm_is_batch_invariant,
20+
)
1821
from vllm.platforms import current_platform
1922
from vllm.utils.import_utils import resolve_obj_by_qualname
2023
from vllm.utils.torch_utils import is_torch_equal_or_newer
@@ -579,6 +582,13 @@ def __post_init__(self) -> None:
579582
self.inductor_compile_config["combo_kernels"] = True
580583
self.inductor_compile_config["benchmark_combo_kernel"] = True
581584

585+
# Batch invariance on Blackwell doesn't work with CUDA graphs
586+
if vllm_is_batch_invariant() and current_platform.is_device_capability(100):
587+
logger.warning(
588+
"Disabling Cudagraphs: Batch invariance on Blackwell doesn't work with cuda graphs"
589+
)
590+
self.cudagraph_mode = CUDAGraphMode.NONE
591+
582592
# migrate the deprecated flags
583593
if not self.use_cudagraph:
584594
logger.warning(

0 commit comments

Comments
 (0)