
Commit 782148d

feat: introduce vLLM compatibility layer for improved model execution
- Added a new module `vllm_compat.py` to manage vLLM-specific configuration and group coordination for tensor parallelism.
- Integrated vLLM context management into `ModelRunnerBase` to ensure proper configuration during model loading.
- Updated linear-layer operations to use the new vLLM group coordination, enhancing distributed execution.
- Implemented graph-capture support in the multi-block model runner, optimizing CUDA stream management for performance.
1 parent 313883e commit 782148d

4 files changed: 131 additions & 43 deletions
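
Taken together, the four changes below wire vLLM into three points of the runner lifecycle: model construction, tensor-parallel collectives, and CUDA graph capture, with explicit teardown. A minimal sketch of that flow, assuming a Diffulex `Config` plus stand-in `build_model`/`capture_fn` callables (only the `diffulex.vllm_compat` calls come from this commit):

```python
import torch

from diffulex.vllm_compat import (
    get_vllm_tp_group,
    reset_vllm_compat_state,
    vllm_current_config,
    vllm_graph_capture,
)


def run_lifecycle(config, build_model, capture_fn):
    # 1. Build the model while a minimal VllmConfig is installed, so any
    #    vLLM layers constructed here see Diffulex's TP/DP sizes.
    with vllm_current_config(config):
        model = build_model(config)

    # 2. Collectives: linear layers prefer the cached vLLM coordinator and
    #    fall back to torch.distributed when it is None.
    tp_group = get_vllm_tp_group()

    # 3. Capture a CUDA graph inside vLLM's capture-side contexts.
    graph = torch.cuda.CUDAGraph()
    stream = torch.cuda.Stream()
    pool = torch.cuda.graph_pool_handle()
    with vllm_graph_capture(stream, pool) as capture_stream:
        with torch.cuda.graph(graph, pool=pool, stream=capture_stream):
            capture_fn(model)
    stream.synchronize()

    # 4. Teardown mirrors ModelRunnerBase.exit(): drop the coordinator
    #    before resetting Diffulex's own parallel state.
    reset_vllm_compat_state()
    return graph, tp_group
```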


diffulex/engine/model_runner.py

Lines changed: 4 additions & 1 deletion
```diff
@@ -18,6 +18,7 @@
 from diffulex.model import AutoModelForDiffusionLM
 from diffulex.engine.strategy_registry import DiffulexStrategyRegistry
 from diffulex.logger import get_logger
+from diffulex.vllm_compat import reset_vllm_compat_state, vllm_current_config


 logger = get_logger(__name__)
@@ -100,7 +101,8 @@ def __init__(self, config: Config, rank: int, event: Event | list[Event]):
         torch.set_default_dtype(self.default_dtype)
         torch.set_default_device(f"cuda:{device_id}")

-        self.model = self.load_model(config)
+        with vllm_current_config(config):
+            self.model = self.load_model(config)
         self.sampler = self.load_sampler(config)
         self.allocate_kv_cache()
         self.warmup_model()
@@ -146,6 +148,7 @@ def exit(self):
             dist.destroy_process_group()
         except Exception:
             logger.debug("Failed to destroy process group on rank %s.", self.rank, exc_info=True)
+        reset_vllm_compat_state()
         reset_parallel_state()

     def start_worker_loop(self):
```
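
The `vllm_current_config` wrapper matters because vLLM layer modules read a process-global "current vLLM config" at construction time. A small sketch of what becomes observable inside the block (`get_current_vllm_config` is vLLM's accessor for that global; the print is illustrative only):

```python
from vllm.config import get_current_vllm_config

from diffulex.vllm_compat import vllm_current_config


def show_effective_parallel_config(config) -> None:
    # Inside the context, vLLM modules constructed here observe the
    # tensor/data-parallel sizes Diffulex derived from `config`.
    with vllm_current_config(config):
        vllm_config = get_current_vllm_config()
        print(vllm_config.parallel_config.tensor_parallel_size)
```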

diffulex/layer/linear.py

Lines changed: 2 additions & 40 deletions
```diff
@@ -6,49 +6,11 @@
 import torch.distributed as dist

 from diffulex.distributed.parallel_state import fetch_parallel_state
-
-_VLLM_TP_GROUP = None
-_VLLM_TP_GROUP_FAILED = False
-
-
-def _tp_group_ranks_for_vllm() -> list[list[int]]:
-    state = fetch_parallel_state()
-    tp_size = int(state.tp_size)
-    if tp_size <= 1:
-        return [[int(state.global_rank)]]
-    dp_size = int(state.dp_size)
-    return [
-        list(range(dp_rank * tp_size, (dp_rank + 1) * tp_size))
-        for dp_rank in range(dp_size)
-    ]
-
-
-def _get_vllm_tp_group():
-    global _VLLM_TP_GROUP, _VLLM_TP_GROUP_FAILED
-    if _VLLM_TP_GROUP is not None:
-        return _VLLM_TP_GROUP
-    if _VLLM_TP_GROUP_FAILED:
-        return None
-
-    try:
-        from vllm.distributed.parallel_state import GroupCoordinator, set_custom_all_reduce
-
-        set_custom_all_reduce(True)
-        _VLLM_TP_GROUP = GroupCoordinator(
-            group_ranks=_tp_group_ranks_for_vllm(),
-            local_rank=int(torch.cuda.current_device()),
-            torch_distributed_backend=dist.get_backend(),
-            use_device_communicator=True,
-            group_name="tp",
-        )
-        return _VLLM_TP_GROUP
-    except Exception:
-        _VLLM_TP_GROUP_FAILED = True
-        return None
+from diffulex.vllm_compat import get_vllm_tp_group


 def tp_all_reduce(x: torch.Tensor, group) -> torch.Tensor:
-    vllm_tp_group = _get_vllm_tp_group()
+    vllm_tp_group = get_vllm_tp_group()
     if vllm_tp_group is not None:
         return vllm_tp_group.all_reduce(x)
     dist.all_reduce(x, group=group)
```
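
A typical caller is a row-parallel linear layer, where each TP rank holds a partial matmul result that must be summed across ranks. A hedged usage sketch, assuming the elided tail of `tp_all_reduce` (outside this hunk) returns the reduced tensor after the in-place `dist.all_reduce` fallback:

```python
import torch

from diffulex.layer.linear import tp_all_reduce


def row_parallel_output(x_partial: torch.Tensor, tp_process_group) -> torch.Tensor:
    # With vLLM present this routes through GroupCoordinator.all_reduce
    # (out-of-place, custom kernels); otherwise torch.distributed reduces
    # x_partial in place on the supplied process group.
    return tp_all_reduce(x_partial, tp_process_group)
```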

diffulex/strategy_template/multi_block/engine/model_runner.py

Lines changed: 4 additions & 2 deletions
```diff
@@ -17,6 +17,7 @@
 from diffulex.engine.status import DllmReqStatus
 from diffulex.engine.model_runner import ModelRunnerBase
 from diffulex.logger import get_logger
+from diffulex.vllm_compat import vllm_graph_capture

 logger = get_logger(__name__)

@@ -147,8 +148,9 @@ def run_once() -> None:

         torch.cuda.synchronize()
         self._graph_capture_barrier()
-        with torch.cuda.graph(graph, pool=pool, stream=stream):
-            run_once()
+        with vllm_graph_capture(stream, pool) as capture_stream:
+            with torch.cuda.graph(graph, pool=pool, stream=capture_stream):
+                run_once()
         stream.synchronize()
         return graph
```
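
The wrapper only changes the capture phase; replay is untouched. For context, a minimal capture-then-replay sketch under the same pattern (hypothetical `run_once` that reads from and writes into static buffers):

```python
import torch

from diffulex.vllm_compat import vllm_graph_capture


def capture_and_replay(run_once, static_input: torch.Tensor) -> torch.cuda.CUDAGraph:
    graph = torch.cuda.CUDAGraph()
    stream = torch.cuda.Stream()
    pool = torch.cuda.graph_pool_handle()

    # Capture: when the vLLM coordinator is live, its graph_capture() context
    # supplies the capture stream; otherwise the raw stream is used directly.
    with vllm_graph_capture(stream, pool) as capture_stream:
        with torch.cuda.graph(graph, pool=pool, stream=capture_stream):
            run_once()
    stream.synchronize()

    # Replay: refresh the static input buffer, then relaunch the recorded kernels.
    static_input.copy_(torch.randn_like(static_input))
    graph.replay()
    return graph
```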

diffulex/vllm_compat.py

Lines changed: 121 additions & 0 deletions
```diff
@@ -0,0 +1,121 @@
+from __future__ import annotations
+
+from contextlib import contextmanager, nullcontext
+from typing import Iterator
+
+import torch
+import torch.distributed as dist
+
+from diffulex.config import Config
+from diffulex.distributed.parallel_state import fetch_parallel_state
+from diffulex.logger import get_logger
+
+
+logger = get_logger(__name__)
+
+_VLLM_TP_GROUP = None
+_VLLM_TP_GROUP_FAILED = False
+
+
+def _tp_group_ranks_for_vllm() -> list[list[int]]:
+    state = fetch_parallel_state()
+    tp_size = int(state.tp_size)
+    if tp_size <= 1:
+        return [[int(state.global_rank)]]
+    dp_size = int(state.dp_size)
+    return [
+        list(range(dp_rank * tp_size, (dp_rank + 1) * tp_size))
+        for dp_rank in range(dp_size)
+    ]
+
+
+def get_vllm_tp_group():
+    """Return a vLLM GroupCoordinator matching Diffulex TP ranks.
+
+    dInfer routes custom all-reduce and CUDA graph capture through the vLLM /
+    sglang group coordinator. Diffulex owns process-group initialization, so we
+    build the coordinator on top of the already initialized torch distributed
+    group and fall back silently when vLLM is unavailable.
+    """
+    global _VLLM_TP_GROUP, _VLLM_TP_GROUP_FAILED
+    if _VLLM_TP_GROUP is not None:
+        return _VLLM_TP_GROUP
+    if _VLLM_TP_GROUP_FAILED:
+        return None
+
+    try:
+        from vllm.distributed.parallel_state import GroupCoordinator, set_custom_all_reduce
+
+        set_custom_all_reduce(True)
+        _VLLM_TP_GROUP = GroupCoordinator(
+            group_ranks=_tp_group_ranks_for_vllm(),
+            local_rank=int(torch.cuda.current_device()),
+            torch_distributed_backend=dist.get_backend(),
+            use_device_communicator=True,
+            group_name="tp",
+        )
+        return _VLLM_TP_GROUP
+    except Exception:
+        _VLLM_TP_GROUP_FAILED = True
+        logger.debug("Failed to initialize vLLM TP coordinator.", exc_info=True)
+        return None
+
+
+@contextmanager
+def vllm_current_config(config: Config) -> Iterator[None]:
+    """Temporarily install a minimal vLLM config during module construction."""
+    try:
+        from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
+
+        parallel_config = ParallelConfig(
+            tensor_parallel_size=int(config.tensor_parallel_size),
+            pipeline_parallel_size=1,
+            data_parallel_size=int(config.data_parallel_size),
+            enable_expert_parallel=bool(int(config.expert_parallel_size) > 1),
+            disable_custom_all_reduce=False,
+            distributed_timeout_seconds=int(config.distributed_timeout_seconds),
+        )
+        with set_current_vllm_config(VllmConfig(parallel_config=parallel_config)):
+            yield
+    except Exception:
+        logger.debug("Using Diffulex model init without vLLM current config.", exc_info=True)
+        yield
+
+
+@contextmanager
+def vllm_graph_capture(stream: torch.cuda.Stream, pool) -> Iterator[torch.cuda.Stream]:
+    """Enter vLLM graph-capture side contexts when available."""
+    try:
+        from vllm.distributed.device_communicators.pynccl_allocator import set_graph_pool_id
+
+        set_graph_pool_id(pool)
+    except Exception:
+        logger.debug("Failed to set vLLM graph pool id.", exc_info=True)
+
+    group = get_vllm_tp_group()
+    if group is None or not hasattr(group, "graph_capture"):
+        with torch.cuda.stream(stream):
+            yield stream
+        return
+
+    try:
+        context = getattr(group, "graph_capture")()
+        with context as graph_context:
+            yield getattr(graph_context, "stream", stream)
+    except Exception:
+        logger.debug("vLLM graph_capture context failed; using raw CUDA stream.", exc_info=True)
+        with torch.cuda.stream(stream):
+            yield stream
+
+
+def reset_vllm_compat_state() -> None:
+    global _VLLM_TP_GROUP, _VLLM_TP_GROUP_FAILED
+    group = _VLLM_TP_GROUP
+    _VLLM_TP_GROUP = None
+    _VLLM_TP_GROUP_FAILED = False
+    if group is not None and hasattr(group, "destroy"):
+        try:
+            group.destroy()
+        except Exception:
+            logger.debug("Failed to destroy vLLM TP coordinator.", exc_info=True)
+
```

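Every vLLM import in the new module sits behind a `try`/`except`, so on a machine without vLLM (or before `torch.distributed` is initialized) the layer degrades to a no-op shim. A quick sanity sketch of that fallback path, assuming a single process with no distributed init:

```python
from diffulex.vllm_compat import get_vllm_tp_group, reset_vllm_compat_state

# The first call attempts coordinator construction, catches the failure, and
# caches it; later calls short-circuit to None via _VLLM_TP_GROUP_FAILED.
assert get_vllm_tp_group() is None
assert get_vllm_tp_group() is None

# Reset clears both the cached coordinator and the failure flag, so a
# restarted worker can retry coordinator construction from scratch.
reset_vllm_compat_state()
```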