Commit 5ceb86f

add torch.xpu.stream to ddp
1 parent 6768bdb commit 5ceb86f

File tree

2 files changed: +12 -4 lines changed

  • src/lightning/fabric/strategies/ddp.py
  • src/lightning/pytorch/strategies/ddp.py


src/lightning/fabric/strategies/ddp.py

Lines changed: 6 additions & 2 deletions
@@ -115,13 +115,17 @@ def setup_environment(self) -> None:
     def setup_module(self, module: Module) -> DistributedDataParallel:
         """Wraps the model into a :class:`~torch.nn.parallel.distributed.DistributedDataParallel` module."""
         device_ids = self._determine_ddp_device_ids()
+        ctx = None
         if self.root_device.type == "cuda":
             # https://pytorch.org/docs/stable/notes/cuda.html#id5
             ctx = torch.cuda.stream(torch.cuda.Stream()) if device_ids is not None else nullcontext()
+        if self.root_device.type == "xpu":
+            ctx = torch.xpu.stream(torch.xpu.Stream()) if device_ids is not None else nullcontext()
+        if ctx is None:
+            return DistributedDataParallel(module=module, device_ids=device_ids, **self._ddp_kwargs)
+        else:
             with ctx:
                 return DistributedDataParallel(module=module, device_ids=device_ids, **self._ddp_kwargs)
-        else:
-            return DistributedDataParallel(module=module, device_ids=device_ids, **self._ddp_kwargs)
 
     def module_to_device(self, module: Module) -> None:
         module.to(self.root_device)
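
For illustration only (not part of the commit): the new branching in setup_module amounts to a dispatch from device type to a stream context. The sketch below factors that dispatch into a hypothetical free function, _ddp_init_ctx; it assumes a PyTorch build that exposes torch.xpu whenever an XPU device is actually used.

# Minimal sketch of the device-type -> stream-context dispatch added above.
from contextlib import nullcontext
from typing import ContextManager, Optional

import torch


def _ddp_init_ctx(root_device: torch.device, device_ids: Optional[list]) -> ContextManager:
    """Pick the context used while constructing DistributedDataParallel.

    CUDA and XPU devices get a fresh side stream (mirroring the diff); any
    other device type, or missing device_ids, falls back to a no-op context.
    """
    if device_ids is None:
        return nullcontext()
    if root_device.type == "cuda":
        # https://pytorch.org/docs/stable/notes/cuda.html#id5
        return torch.cuda.stream(torch.cuda.Stream())
    if root_device.type == "xpu":
        # Assumes a PyTorch build with XPU (Intel GPU) support.
        return torch.xpu.stream(torch.xpu.Stream())
    return nullcontext()


# On a CPU-only device the helper degrades to a no-op context:
with _ddp_init_ctx(torch.device("cpu"), device_ids=None):
    pass  # DistributedDataParallel(module, device_ids=device_ids, ...) would be built here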

src/lightning/pytorch/strategies/ddp.py

Lines changed: 6 additions & 2 deletions
@@ -183,13 +183,17 @@ def _setup_model(self, model: Module) -> DistributedDataParallel:
         """Wraps the model into a :class:`~torch.nn.parallel.distributed.DistributedDataParallel` module."""
         device_ids = self.determine_ddp_device_ids()
         log.debug(f"setting up DDP model with device ids: {device_ids}, kwargs: {self._ddp_kwargs}")
+        ctx = None
         if self.root_device.type == "cuda":
             # https://pytorch.org/docs/stable/notes/cuda.html#id5
             ctx = torch.cuda.stream(torch.cuda.Stream()) if device_ids is not None else nullcontext()
+        if self.root_device.type == "xpu":
+            ctx = torch.xpu.stream(torch.xpu.Stream()) if device_ids is not None else nullcontext()
+        if ctx is None:
+            return DistributedDataParallel(module=model, device_ids=device_ids, **self._ddp_kwargs)
+        else:
             with ctx:
                 return DistributedDataParallel(module=model, device_ids=device_ids, **self._ddp_kwargs)
-        else:
-            return DistributedDataParallel(module=model, device_ids=device_ids, **self._ddp_kwargs)
 
     def setup_distributed(self) -> None:
         log.debug(f"{self.__class__.__name__}: setting up distributed...")
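
The PyTorch-strategy change mirrors the Fabric one. As a rough, self-contained illustration of the side-stream pattern referenced by the linked CUDA note (assumptions not taken from this commit: a single-rank NCCL process group and at least one CUDA device), wrapping a model by hand would look like:

# Illustrative only: construct DDP inside a fresh side stream, as the patched
# strategies do when device ids are known. Assumes rank 0, world size 1, NCCL.
import os

import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel

os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
dist.init_process_group("nccl", rank=0, world_size=1)

model = torch.nn.Linear(8, 8).to("cuda:0")

with torch.cuda.stream(torch.cuda.Stream()):
    ddp_model = DistributedDataParallel(model, device_ids=[0])

dist.destroy_process_group()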
