From 4b1c2f3c21aeac22973c7dd8599bf56595d8cc18 Mon Sep 17 00:00:00 2001 From: Corey Adams Date: Thu, 20 Jul 2023 08:58:37 -0500 Subject: [PATCH 01/26] Enable Intel XPU as an accelerator and automatic GPU --- src/lightning/fabric/accelerators/__init__.py | 1 + src/lightning/fabric/accelerators/xpu.py | 106 ++++++++++++++++++ src/lightning/fabric/connector.py | 9 +- .../fabric/utilities/device_parser.py | 25 +++-- .../pytorch/accelerators/__init__.py | 2 + src/lightning/pytorch/accelerators/xpu.py | 106 ++++++++++++++++++ .../connectors/accelerator_connector.py | 5 + src/lightning/pytorch/trainer/setup.py | 9 +- 8 files changed, 248 insertions(+), 15 deletions(-) create mode 100644 src/lightning/fabric/accelerators/xpu.py create mode 100644 src/lightning/pytorch/accelerators/xpu.py diff --git a/src/lightning/fabric/accelerators/__init__.py b/src/lightning/fabric/accelerators/__init__.py index 35d2cc6eb17b0..c57d77ad2676e 100644 --- a/src/lightning/fabric/accelerators/__init__.py +++ b/src/lightning/fabric/accelerators/__init__.py @@ -16,6 +16,7 @@ from lightning.fabric.accelerators.mps import MPSAccelerator # noqa: F401 from lightning.fabric.accelerators.registry import _AcceleratorRegistry, call_register_accelerators from lightning.fabric.accelerators.xla import XLAAccelerator # noqa: F401 +from lightning.fabric.accelerators.xpu import XPUAccelerator _ACCELERATORS_BASE_MODULE = "lightning.fabric.accelerators" ACCELERATOR_REGISTRY = _AcceleratorRegistry() diff --git a/src/lightning/fabric/accelerators/xpu.py b/src/lightning/fabric/accelerators/xpu.py new file mode 100644 index 0000000000000..c3394c6e80107 --- /dev/null +++ b/src/lightning/fabric/accelerators/xpu.py @@ -0,0 +1,106 @@ +# Copyright The Lightning AI team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os +import warnings +from contextlib import contextmanager +from functools import lru_cache +from typing import cast, Generator, List, Optional, Union + +import torch +from lightning_utilities.core.rank_zero import rank_zero_info + +from lightning.fabric.accelerators.accelerator import Accelerator +from lightning.fabric.accelerators.registry import _AcceleratorRegistry +from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_1_12 + +from lightning_utilities.core.imports import RequirementCache + +from typing import Any, Dict, List, Union + +class XPUAccelerator(Accelerator): + """Support for a Intel Discrete Graphics Cards 'XPU'.""" + + def __init__(self, *args: Any, **kwargs: Any) -> None: + if not _IPEX_AVAILABLE: + raise ModuleNotFoundError(str(_IPEX_AVAILABLE)) + super().__init__(*args, **kwargs) + + @staticmethod + def parse_devices(devices: Any) -> Any: + # Put parsing logic here how devices can be passed into the Trainer + # via the `devices` argument + from lightning.fabric.utilities.device_parser import _parse_gpu_ids + + return _parse_gpu_ids(devices, include_xpu=True) + + @staticmethod + def get_parallel_devices(devices: Any) -> Any: + # Here, convert the device indices to actual device objects + import intel_extension_for_pytorch as ipex + + return [torch.device("xpu", idx) for idx in devices] + + @staticmethod + def auto_device_count() -> int: + # Return a value for auto-device selection when `Trainer(devices="auto")` + return num_xpu_devices() + + @staticmethod + def is_available() -> bool: + import intel_extension_for_pytorch as ipex + return ipex.xpu.is_available() + + def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: + # Return optional device statistics for loggers + return {} + + def setup_device(self, device: torch.device) -> None: + pass + + def teardown(self) -> None: + pass + + @classmethod + def register_accelerators(cls, accelerator_registry): + accelerator_registry.register( + "xpu", + cls, + description=cls.__class__.__name__, + ) + + +_IPEX_AVAILABLE = RequirementCache("intel_extension_for_pytorch>=1.13", "intel_extension_for_pytorch") + + +@lru_cache(1) +def num_xpu_devices() -> int: + """Returns the number of available CUDA devices. + + Unlike :func:`torch.cuda.device_count`, this function does its best not to create a CUDA context for fork support, + if the platform allows it. + """ + if _IPEX_AVAILABLE: + import intel_extension_for_pytorch as ipex + return ipex.xpu.device_count() + else: + return 0 + +def _get_all_visible_xpu_devices() -> List[int]: + """Returns a list of all visible Intel XPU devices. + + Devices masked by the environment variabale ``ZE_AFFINITY_MASK`` won't be returned here. For example, assume you + have 8 physical GPUs. If ``ZE_AFFINITY_MASK="1,3,6"``, then this function will return the list ``[0, 1, 2]`` + because these are the three visible GPUs after applying the mask ``ZE_AFFINITY_MASK``. 
+ """ + return list(range(num_xpu_devices())) \ No newline at end of file diff --git a/src/lightning/fabric/connector.py b/src/lightning/fabric/connector.py index 255d439e6df92..b5aa3aa4c38b0 100644 --- a/src/lightning/fabric/connector.py +++ b/src/lightning/fabric/connector.py @@ -23,6 +23,7 @@ from lightning.fabric.accelerators.cuda import CUDAAccelerator from lightning.fabric.accelerators.mps import MPSAccelerator from lightning.fabric.accelerators.xla import XLAAccelerator +from lightning.fabric.accelerators.xpu import XPUAccelerator from lightning.fabric.plugins import ( CheckpointIO, DeepSpeedPrecision, @@ -313,6 +314,8 @@ def _choose_auto_accelerator(self) -> str: return "mps" if CUDAAccelerator.is_available(): return "cuda" + if XPUAccelerator.is_available(): + return "xpu" return "cpu" @staticmethod @@ -321,6 +324,8 @@ def _choose_gpu_accelerator_backend() -> str: return "mps" if CUDAAccelerator.is_available(): return "cuda" + if XPUAccelerator.is_available(): + return "xpu" raise RuntimeError("No supported gpu backend found!") def _set_parallel_devices_and_init_accelerator(self) -> None: @@ -377,8 +382,8 @@ def _choose_strategy(self) -> Union[Strategy, str]: if self._num_nodes_flag > 1: return "ddp" if len(self._parallel_devices) <= 1: - if isinstance(self._accelerator_flag, (CUDAAccelerator, MPSAccelerator)) or ( - isinstance(self._accelerator_flag, str) and self._accelerator_flag in ("cuda", "gpu", "mps") + if isinstance(self._accelerator_flag, (CUDAAccelerator, MPSAccelerator, XPUAccelerator)) or ( + isinstance(self._accelerator_flag, str) and self._accelerator_flag in ("cuda", "gpu", "mps", "xpu") ): device = _determine_root_gpu_device(self._parallel_devices) else: diff --git a/src/lightning/fabric/utilities/device_parser.py b/src/lightning/fabric/utilities/device_parser.py index 65e363cb06d65..4e1a6b2b4fafa 100644 --- a/src/lightning/fabric/utilities/device_parser.py +++ b/src/lightning/fabric/utilities/device_parser.py @@ -49,6 +49,7 @@ def _parse_gpu_ids( gpus: Optional[Union[int, str, List[int]]], include_cuda: bool = False, include_mps: bool = False, + include_xpu: bool = False, ) -> Optional[List[int]]: """ Parses the GPU IDs given in the format as accepted by the @@ -62,6 +63,7 @@ def _parse_gpu_ids( Any int N > 0 indicates that GPUs [0..N) should be used. include_cuda: A boolean value indicating whether to include CUDA devices for GPU parsing. include_mps: A boolean value indicating whether to include MPS devices for GPU parsing. + include_xpu: A boolean value indicating whether to include XPU devices for GPU parsing. Returns: A list of GPUs to be used or ``None`` if no GPUs were requested @@ -71,7 +73,7 @@ def _parse_gpu_ids( If no GPUs are available but the value of gpus variable indicates request for GPUs .. note:: - ``include_cuda`` and ``include_mps`` default to ``False`` so that you only + ``include_cuda`` and ``include_mps`` and ``include_xpu`` default to ``False`` so that you only have to specify which device type to use and all other devices are not disabled. """ # Check that gpus param is None, Int, String or Sequence of Ints @@ -84,14 +86,14 @@ def _parse_gpu_ids( # We know the user requested GPUs therefore if some of the # requested GPUs are not available an exception is thrown. 
gpus = _normalize_parse_gpu_string_input(gpus) - gpus = _normalize_parse_gpu_input_to_list(gpus, include_cuda=include_cuda, include_mps=include_mps) + gpus = _normalize_parse_gpu_input_to_list(gpus, include_cuda=include_cuda, include_mps=include_mps, include_xpu=include_xpu) if not gpus: raise MisconfigurationException("GPUs requested but none are available.") if ( TorchElasticEnvironment.detect() and len(gpus) != 1 - and len(_get_all_available_gpus(include_cuda=include_cuda, include_mps=include_mps)) == 1 + and len(_get_all_available_gpus(include_cuda=include_cuda, include_mps=include_mps, include_xpu=include_xpu)) == 1 ): # Omit sanity check on torchelastic because by default it shows one visible GPU per process return gpus @@ -99,7 +101,7 @@ def _parse_gpu_ids( # Check that GPUs are unique. Duplicate GPUs are not supported by the backend. _check_unique(gpus) - return _sanitize_gpu_ids(gpus, include_cuda=include_cuda, include_mps=include_mps) + return _sanitize_gpu_ids(gpus, include_cuda=include_cuda, include_mps=include_mps, include_xpu=include_xpu) def _normalize_parse_gpu_string_input(s: Union[int, str, List[int]]) -> Union[int, List[int]]: @@ -112,7 +114,7 @@ def _normalize_parse_gpu_string_input(s: Union[int, str, List[int]]) -> Union[in return int(s.strip()) -def _sanitize_gpu_ids(gpus: List[int], include_cuda: bool = False, include_mps: bool = False) -> List[int]: +def _sanitize_gpu_ids(gpus: List[int], include_cuda: bool = False, include_mps: bool = False, include_xpu: bool = False) -> List[int]: """Checks that each of the GPUs in the list is actually available. Raises a MisconfigurationException if any of the GPUs is not available. @@ -126,9 +128,9 @@ def _sanitize_gpu_ids(gpus: List[int], include_cuda: bool = False, include_mps: MisconfigurationException: If machine has fewer available GPUs than requested. 
""" - if sum((include_cuda, include_mps)) == 0: + if sum((include_cuda, include_mps, include_xpu)) == 0: raise ValueError("At least one gpu type should be specified!") - all_available_gpus = _get_all_available_gpus(include_cuda=include_cuda, include_mps=include_mps) + all_available_gpus = _get_all_available_gpus(include_cuda=include_cuda, include_mps=include_mps, include_xpu=include_xpu) for gpu in gpus: if gpu not in all_available_gpus: raise MisconfigurationException( @@ -138,7 +140,7 @@ def _sanitize_gpu_ids(gpus: List[int], include_cuda: bool = False, include_mps: def _normalize_parse_gpu_input_to_list( - gpus: Union[int, List[int], Tuple[int, ...]], include_cuda: bool, include_mps: bool + gpus: Union[int, List[int], Tuple[int, ...]], include_cuda: bool, include_mps: bool, include_xpu: bool ) -> Optional[List[int]]: assert gpus is not None if isinstance(gpus, (MutableSequence, tuple)): @@ -148,19 +150,20 @@ def _normalize_parse_gpu_input_to_list( if not gpus: # gpus==0 return None if gpus == -1: - return _get_all_available_gpus(include_cuda=include_cuda, include_mps=include_mps) + return _get_all_available_gpus(include_cuda=include_cuda, include_mps=include_mps, include_xpu=include_xpu) return list(range(gpus)) -def _get_all_available_gpus(include_cuda: bool = False, include_mps: bool = False) -> List[int]: +def _get_all_available_gpus(include_cuda: bool = False, include_mps: bool = False, include_xpu: bool = False) -> List[int]: """ Returns: A list of all available GPUs """ cuda_gpus = accelerators.cuda._get_all_visible_cuda_devices() if include_cuda else [] mps_gpus = accelerators.mps._get_all_available_mps_gpus() if include_mps else [] - return cuda_gpus + mps_gpus + xpu_gpus = accelerators.xpu._get_all_visible_xpu_devices() if include_xpu else [] + return cuda_gpus + mps_gpus + xpu_gpus def _check_unique(device_ids: List[int]) -> None: diff --git a/src/lightning/pytorch/accelerators/__init__.py b/src/lightning/pytorch/accelerators/__init__.py index bd3dce572448e..917bf7282957d 100644 --- a/src/lightning/pytorch/accelerators/__init__.py +++ b/src/lightning/pytorch/accelerators/__init__.py @@ -17,6 +17,8 @@ from lightning.pytorch.accelerators.cuda import CUDAAccelerator # noqa: F401 from lightning.pytorch.accelerators.mps import MPSAccelerator # noqa: F401 from lightning.pytorch.accelerators.xla import XLAAccelerator # noqa: F401 +from lightning.pytorch.accelerators.xpu import XPUAccelerator # noqa: F401 + ACCELERATORS_BASE_MODULE = "lightning.pytorch.accelerators" AcceleratorRegistry = _AcceleratorRegistry() diff --git a/src/lightning/pytorch/accelerators/xpu.py b/src/lightning/pytorch/accelerators/xpu.py new file mode 100644 index 0000000000000..d8199b711b26a --- /dev/null +++ b/src/lightning/pytorch/accelerators/xpu.py @@ -0,0 +1,106 @@ +# Copyright The Lightning AI team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os +import warnings +from contextlib import contextmanager +from functools import lru_cache +from typing import cast, Generator, List, Optional, Union + +import torch +from lightning_utilities.core.rank_zero import rank_zero_info + +from lightning.fabric.accelerators.registry import _AcceleratorRegistry +from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_1_12 + +from lightning_utilities.core.imports import RequirementCache + +from lightning.pytorch.accelerators.accelerator import Accelerator +from typing import Any, Dict, List, Union + +class XPUAccelerator(Accelerator): + """Support for a Intel Discrete Graphics Cards 'XPU'.""" + + def __init__(self, *args: Any, **kwargs: Any) -> None: + if not _IPEX_AVAILABLE: + raise ModuleNotFoundError(str(_IPEX_AVAILABLE)) + super().__init__(*args, **kwargs) + + @staticmethod + def parse_devices(devices: Any) -> Any: + # Put parsing logic here how devices can be passed into the Trainer + # via the `devices` argument + from lightning.fabric.utilities.device_parser import _parse_gpu_ids + + return _parse_gpu_ids(devices, include_xpu=True) + + @staticmethod + def get_parallel_devices(devices: Any) -> Any: + # Here, convert the device indices to actual device objects + import intel_extension_for_pytorch as ipex + + return [torch.device("xpu", idx) for idx in devices] + + @staticmethod + def auto_device_count() -> int: + # Return a value for auto-device selection when `Trainer(devices="auto")` + return num_xpu_devices() + + @staticmethod + def is_available() -> bool: + import intel_extension_for_pytorch as ipex + return ipex.xpu.is_available() + + def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: + # Return optional device statistics for loggers + return {} + + def setup_device(self, device: torch.device) -> None: + pass + + def teardown(self) -> None: + pass + + @classmethod + def register_accelerators(cls, accelerator_registry): + accelerator_registry.register( + "xpu", + cls, + description=cls.__class__.__name__, + ) + + +_IPEX_AVAILABLE = RequirementCache("intel_extension_for_pytorch>=1.13", "intel_extension_for_pytorch") + + +@lru_cache(1) +def num_xpu_devices() -> int: + """Returns the number of available CUDA devices. + + Unlike :func:`torch.cuda.device_count`, this function does its best not to create a CUDA context for fork support, + if the platform allows it. + """ + if _IPEX_AVAILABLE: + import intel_extension_for_pytorch as ipex + return ipex.xpu.device_count() + else: + return 0 + +def _get_all_visible_xpu_devices() -> List[int]: + """Returns a list of all visible Intel XPU devices. + + Devices masked by the environment variabale ``ZE_AFFINITY_MASK`` won't be returned here. For example, assume you + have 8 physical GPUs. If ``ZE_AFFINITY_MASK="1,3,6"``, then this function will return the list ``[0, 1, 2]`` + because these are the three visible GPUs after applying the mask ``ZE_AFFINITY_MASK``. 
+ """ + return list(range(num_xpu_devices())) \ No newline at end of file diff --git a/src/lightning/pytorch/trainer/connectors/accelerator_connector.py b/src/lightning/pytorch/trainer/connectors/accelerator_connector.py index 7e21f6216c1e3..f96de4f68f32c 100644 --- a/src/lightning/pytorch/trainer/connectors/accelerator_connector.py +++ b/src/lightning/pytorch/trainer/connectors/accelerator_connector.py @@ -36,6 +36,7 @@ from lightning.pytorch.accelerators.cuda import CUDAAccelerator from lightning.pytorch.accelerators.mps import MPSAccelerator from lightning.pytorch.accelerators.xla import XLAAccelerator +from lightning.pytorch.accelerators.xpu import XPUAccelerator from lightning.pytorch.plugins import ( CheckpointIO, DeepSpeedPrecisionPlugin, @@ -352,6 +353,8 @@ def _choose_auto_accelerator(self) -> str: return "mps" if CUDAAccelerator.is_available(): return "cuda" + if XPUAccelerator.is_available(): + return "xpu" return "cpu" @staticmethod @@ -360,6 +363,8 @@ def _choose_gpu_accelerator_backend() -> str: return "mps" if CUDAAccelerator.is_available(): return "cuda" + if XPUAccelerator.is_available(): + return "xpu" raise MisconfigurationException("No supported gpu backend found!") def _set_parallel_devices_and_init_accelerator(self) -> None: diff --git a/src/lightning/pytorch/trainer/setup.py b/src/lightning/pytorch/trainer/setup.py index 36f9c27e70983..5fd686b3765d6 100644 --- a/src/lightning/pytorch/trainer/setup.py +++ b/src/lightning/pytorch/trainer/setup.py @@ -17,7 +17,7 @@ import lightning.pytorch as pl from lightning.fabric.utilities.warnings import PossibleUserWarning -from lightning.pytorch.accelerators import CUDAAccelerator, MPSAccelerator, XLAAccelerator +from lightning.pytorch.accelerators import CUDAAccelerator, MPSAccelerator, XLAAccelerator, XPUAccelerator from lightning.pytorch.loggers.logger import DummyLogger from lightning.pytorch.profilers import ( AdvancedProfiler, @@ -148,11 +148,14 @@ def _log_device_info(trainer: "pl.Trainer") -> None: elif MPSAccelerator.is_available(): gpu_available = True gpu_type = " (mps)" + elif XPUAccelerator.is_available(): + gpu_available = True + gpu_type = " (xpu)" else: gpu_available = False gpu_type = "" - gpu_used = isinstance(trainer.accelerator, (CUDAAccelerator, MPSAccelerator)) + gpu_used = isinstance(trainer.accelerator, (CUDAAccelerator, MPSAccelerator, XPUAccelerator)) rank_zero_info(f"GPU available: {gpu_available}{gpu_type}, used: {gpu_used}") num_tpu_cores = trainer.num_devices if isinstance(trainer.accelerator, XLAAccelerator) else 0 @@ -183,6 +186,8 @@ def _log_device_info(trainer: "pl.Trainer") -> None: and not isinstance(trainer.accelerator, CUDAAccelerator) or MPSAccelerator.is_available() and not isinstance(trainer.accelerator, MPSAccelerator) + or XPUAccelerator.is_available() + and not isinstance(trainer.accelerator, XPUAccelerator) ): rank_zero_warn( "GPU available but not used. 
You can set it by doing `Trainer(accelerator='gpu')`.", From 54af860aafde7c2588064b4b6eac421aa65bce6e Mon Sep 17 00:00:00 2001 From: Corey Adams Date: Wed, 8 Nov 2023 16:26:03 -0600 Subject: [PATCH 02/26] Fixing some things since my last updates: accelerator structure changed a bit, mpi environment seems to be broken --- src/lightning/fabric/accelerators/__init__.py | 3 --- src/lightning/fabric/accelerators/xpu.py | 9 +++------ src/lightning/fabric/plugins/environments/mpi.py | 5 ++++- src/lightning/pytorch/accelerators/xpu.py | 1 - 4 files changed, 7 insertions(+), 11 deletions(-) diff --git a/src/lightning/fabric/accelerators/__init__.py b/src/lightning/fabric/accelerators/__init__.py index 122737f21cd56..62319379b5bf4 100644 --- a/src/lightning/fabric/accelerators/__init__.py +++ b/src/lightning/fabric/accelerators/__init__.py @@ -18,11 +18,8 @@ from lightning.fabric.accelerators.mps import MPSAccelerator # noqa: F401 from lightning.fabric.accelerators.registry import _AcceleratorRegistry from lightning.fabric.accelerators.xla import XLAAccelerator # noqa: F401 -<<<<<<< HEAD from lightning.fabric.accelerators.xpu import XPUAccelerator -======= from lightning.fabric.utilities.registry import _register_classes ->>>>>>> 964364b3bbfd27c448633025ae7919ef613b83ef ACCELERATOR_REGISTRY = _AcceleratorRegistry() _register_classes(ACCELERATOR_REGISTRY, "register_accelerators", sys.modules[__name__], Accelerator) diff --git a/src/lightning/fabric/accelerators/xpu.py b/src/lightning/fabric/accelerators/xpu.py index c3394c6e80107..fa88852c320d3 100644 --- a/src/lightning/fabric/accelerators/xpu.py +++ b/src/lightning/fabric/accelerators/xpu.py @@ -22,8 +22,6 @@ from lightning.fabric.accelerators.accelerator import Accelerator from lightning.fabric.accelerators.registry import _AcceleratorRegistry -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_1_12 - from lightning_utilities.core.imports import RequirementCache from typing import Any, Dict, List, Union @@ -72,15 +70,14 @@ def teardown(self) -> None: pass @classmethod - def register_accelerators(cls, accelerator_registry): + def register_accelerators(cls, accelerator_registry: _AcceleratorRegistry) -> None: accelerator_registry.register( "xpu", cls, - description=cls.__class__.__name__, + description=cls.__name__, ) - -_IPEX_AVAILABLE = RequirementCache("intel_extension_for_pytorch>=1.13", "intel_extension_for_pytorch") +_IPEX_AVAILABLE = RequirementCache("intel_extension_for_pytorch>=2.0", "intel_extension_for_pytorch") @lru_cache(1) diff --git a/src/lightning/fabric/plugins/environments/mpi.py b/src/lightning/fabric/plugins/environments/mpi.py index e40fe8b027790..2cc9f5dd9d78d 100644 --- a/src/lightning/fabric/plugins/environments/mpi.py +++ b/src/lightning/fabric/plugins/environments/mpi.py @@ -109,7 +109,10 @@ def _init_comm_local(self) -> None: hostname = socket.gethostname() all_hostnames = self._comm_world.gather(hostname, root=0) # sort all the hostnames, and find unique ones - unique_hosts = sorted(set(all_hostnames)) + if self.global_rank() == 0: + unique_hosts = sorted(set(all_hostnames)) + else: + unique_hosts = None unique_hosts = self._comm_world.bcast(unique_hosts, root=0) # find the index for this host in the list of hosts: self._node_rank = unique_hosts.index(hostname) diff --git a/src/lightning/pytorch/accelerators/xpu.py b/src/lightning/pytorch/accelerators/xpu.py index d8199b711b26a..97ced93bb610f 100644 --- a/src/lightning/pytorch/accelerators/xpu.py +++ b/src/lightning/pytorch/accelerators/xpu.py @@ -21,7 
+21,6 @@ from lightning_utilities.core.rank_zero import rank_zero_info from lightning.fabric.accelerators.registry import _AcceleratorRegistry -from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_1_12 from lightning_utilities.core.imports import RequirementCache From bd1215099b155111d03599a35832b9145f54564f Mon Sep 17 00:00:00 2001 From: Corey Adams Date: Fri, 9 Feb 2024 15:32:44 -0600 Subject: [PATCH 03/26] Enable DDP for XPU. THere is a bug, probably in the CCL layer, where broadcasting strings isn't working. This commit includes a workaround for that case. --- src/lightning/fabric/strategies/ddp.py | 9 ++++++++- src/lightning/fabric/utilities/distributed.py | 11 ++++++++++- src/lightning/pytorch/strategies/ddp.py | 12 ++++++++++-- 3 files changed, 28 insertions(+), 4 deletions(-) diff --git a/src/lightning/fabric/strategies/ddp.py b/src/lightning/fabric/strategies/ddp.py index 3b1de32a5d98b..3e4f68dd17b57 100644 --- a/src/lightning/fabric/strategies/ddp.py +++ b/src/lightning/fabric/strategies/ddp.py @@ -123,8 +123,15 @@ def setup_environment(self) -> None: def setup_module(self, module: Module) -> DistributedDataParallel: """Wraps the model into a :class:`~torch.nn.parallel.distributed.DistributedDataParallel` module.""" device_ids = self._determine_ddp_device_ids() + print(self.root_device) + print(self.root_device.type) # https://pytorch.org/docs/stable/notes/cuda.html#id5 - ctx = torch.cuda.stream(torch.cuda.Stream()) if device_ids is not None else nullcontext() + if self.root_device.type == "cuda": + ctx = torch.cuda.stream(torch.cuda.Stream()) if device_ids is not None else nullcontext() + elif self.root_device.type == "xpu": + ctx = torch.xpu.stream(torch.xpu.Stream()) if device_ids is not None else nullcontext() + else: + ctx = nullcontext() with ctx: return DistributedDataParallel(module=module, device_ids=device_ids, **self._ddp_kwargs) diff --git a/src/lightning/fabric/utilities/distributed.py b/src/lightning/fabric/utilities/distributed.py index 16157c66274be..b52335fdc10fa 100644 --- a/src/lightning/fabric/utilities/distributed.py +++ b/src/lightning/fabric/utilities/distributed.py @@ -288,6 +288,10 @@ def _init_dist_connection( os.environ["MASTER_ADDR"] = cluster_environment.main_address os.environ["MASTER_PORT"] = str(cluster_environment.main_port) log.info(f"Initializing distributed: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}") + + if torch_distributed_backend.lower() == "ccl": + import oneccl_bindings_for_pytorch + torch.distributed.init_process_group(torch_distributed_backend, rank=global_rank, world_size=world_size, **kwargs) # On rank=0 let everyone know training is starting @@ -300,7 +304,12 @@ def _init_dist_connection( def _get_default_process_group_backend_for_device(device: torch.device) -> str: - return "nccl" if device.type == "cuda" else "gloo" + if device.type == "cuda": + return "nncl" + elif device.type == "xpu": + return "ccl" + else: + return "gloo" class _DatasetSamplerWrapper(Dataset): diff --git a/src/lightning/pytorch/strategies/ddp.py b/src/lightning/pytorch/strategies/ddp.py index 9031b6ee177f3..e612287f63daf 100644 --- a/src/lightning/pytorch/strategies/ddp.py +++ b/src/lightning/pytorch/strategies/ddp.py @@ -190,7 +190,12 @@ def _setup_model(self, model: Module) -> DistributedDataParallel: device_ids = self.determine_ddp_device_ids() log.debug(f"setting up DDP model with device ids: {device_ids}, kwargs: {self._ddp_kwargs}") # https://pytorch.org/docs/stable/notes/cuda.html#id5 - ctx = 
torch.cuda.stream(torch.cuda.Stream()) if device_ids is not None else nullcontext() + if self.root_device.type == "cuda": + ctx = torch.cuda.stream(torch.cuda.Stream()) if device_ids is not None else nullcontext() + elif self.root_device.type == "xpu": + ctx = torch.xpu.stream(torch.xpu.Stream()) if device_ids is not None else nullcontext() + else: + ctx = nullcontext() with ctx: return DistributedDataParallel(module=model, device_ids=device_ids, **self._ddp_kwargs) @@ -304,7 +309,10 @@ def broadcast(self, obj: TBroadcast, src: int = 0) -> TBroadcast: return obj obj = [obj] - torch.distributed.broadcast_object_list(obj, src, group=_group.WORLD) + if self.root_device.type != "xpu" and type(obj[0]) == str: + # I don't know why this is true. I will have to investigate. In the meantime, + # This is getting called by the profiler which can be worked around: + torch.distributed.broadcast_object_list(obj, src, group=_group.WORLD) return obj[0] @override From 711281a28d0f4299f52c737a27b6fd0d6272eb22 Mon Sep 17 00:00:00 2001 From: Corey adams Date: Fri, 9 Feb 2024 15:37:13 -0600 Subject: [PATCH 04/26] Update throughput_monitor.py Syncronize xpu devices --- src/lightning/pytorch/callbacks/throughput_monitor.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/lightning/pytorch/callbacks/throughput_monitor.py b/src/lightning/pytorch/callbacks/throughput_monitor.py index 71a85e431bb7d..c1a23b65563e5 100644 --- a/src/lightning/pytorch/callbacks/throughput_monitor.py +++ b/src/lightning/pytorch/callbacks/throughput_monitor.py @@ -130,7 +130,9 @@ def _update(self, trainer: "Trainer", pl_module: "LightningModule", batch: Any, if trainer.strategy.root_device.type == "cuda": # required or else perf_counter() won't be correct torch.cuda.synchronize() - + elif trainer.strategy.root_device.type == "xpu": + torch.xpu.synchronize() + elapsed = time.perf_counter() - self._t0s[stage] if self.length_fn is not None: self._lengths[stage] += self.length_fn(batch) From 0c36f3cd59e7883accfdaef2c56ab2a491cd5082 Mon Sep 17 00:00:00 2001 From: Corey adams Date: Fri, 9 Feb 2024 15:49:34 -0600 Subject: [PATCH 05/26] Update accelerator_connector.py Add xpu warning --- .../pytorch/trainer/connectors/accelerator_connector.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/lightning/pytorch/trainer/connectors/accelerator_connector.py b/src/lightning/pytorch/trainer/connectors/accelerator_connector.py index 82469e4efa97f..15f1a16bf8244 100644 --- a/src/lightning/pytorch/trainer/connectors/accelerator_connector.py +++ b/src/lightning/pytorch/trainer/connectors/accelerator_connector.py @@ -319,6 +319,13 @@ def _check_config_and_set_final_flags( f" but accelerator set to {self._accelerator_flag}, please choose one device type" ) self._accelerator_flag = "cuda" + if self._strategy_flag.parallel_devices[0].type == "xpu": + if self._accelerator_flag and self._accelerator_flag not in ("auto", "xpu", "gpu"): + raise MisconfigurationException( + f"GPU parallel_devices set through {self._strategy_flag.__class__.__name__} class," + f" but accelerator set to {self._accelerator_flag}, please choose one device type" + ) + self._accelerator_flag = "xpu" self._parallel_devices = self._strategy_flag.parallel_devices def _check_device_config_and_set_final_flags(self, devices: Union[List[int], str, int], num_nodes: int) -> None: From 4477301828812941b7287f32e54b8df1e83d3256 Mon Sep 17 00:00:00 2001 From: Corey adams Date: Fri, 9 Feb 2024 15:51:56 -0600 Subject: [PATCH 06/26] Update module.py Include XPU in on-gpu 
check. --- src/lightning/pytorch/core/module.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lightning/pytorch/core/module.py b/src/lightning/pytorch/core/module.py index faaab7e15fd05..32be2d4a945b7 100644 --- a/src/lightning/pytorch/core/module.py +++ b/src/lightning/pytorch/core/module.py @@ -285,7 +285,7 @@ def on_gpu(self) -> bool: Useful to set flags around the LightningModule for different CPU vs GPU behavior. """ - return self.device.type == "cuda" + return self.device.type == "cuda" or self.device.type == "xpu" @property def automatic_optimization(self) -> bool: From 6b766440f26a7272ce98a78090403c238a5abfec Mon Sep 17 00:00:00 2001 From: Corey adams Date: Fri, 9 Feb 2024 15:53:35 -0600 Subject: [PATCH 07/26] Update saving.py Include XPU in map location --- src/lightning/pytorch/core/saving.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/lightning/pytorch/core/saving.py b/src/lightning/pytorch/core/saving.py index 78e449abd3e34..6211c83e0bee6 100644 --- a/src/lightning/pytorch/core/saving.py +++ b/src/lightning/pytorch/core/saving.py @@ -34,7 +34,7 @@ from lightning.fabric.utilities.cloud_io import _load as pl_load from lightning.fabric.utilities.data import AttributeDict from lightning.fabric.utilities.types import _MAP_LOCATION_TYPE, _PATH -from lightning.pytorch.accelerators import CUDAAccelerator, MPSAccelerator, XLAAccelerator +from lightning.pytorch.accelerators import CUDAAccelerator, MPSAccelerator, XLAAccelerator, XPUAccelerator from lightning.pytorch.utilities.imports import _OMEGACONF_AVAILABLE from lightning.pytorch.utilities.migration import pl_legacy_patch from lightning.pytorch.utilities.migration.utils import _pl_migrate_checkpoint @@ -109,6 +109,8 @@ def _default_map_location(storage: "UntypedStorage", location: str) -> Optional[ and not CUDAAccelerator.is_available() or location.startswith("xla") and not XLAAccelerator.is_available() + or location.startswith("xpu") + and nott XPUAccelerator.is_available() ): return storage.cpu() return None # default behavior by `torch.load()` From 0fac1707399c167bc6977d74a3bff2edd7168d18 Mon Sep 17 00:00:00 2001 From: Corey Adams Date: Fri, 9 Feb 2024 16:15:55 -0600 Subject: [PATCH 08/26] Add further support for XPU devices from Intel --- src/lightning/data/processing/functions.py | 4 +++ src/lightning/data/utilities/env.py | 4 +++ src/lightning/fabric/accelerators/xpu.py | 4 +-- src/lightning/fabric/cli.py | 2 +- src/lightning/fabric/fabric.py | 2 +- src/lightning/fabric/strategies/strategy.py | 1 + .../fabric/utilities/device_dtype_mixin.py | 25 +++++++++++++++++++ src/lightning/fabric/utilities/seed.py | 6 ++++- 8 files changed, 43 insertions(+), 5 deletions(-) diff --git a/src/lightning/data/processing/functions.py b/src/lightning/data/processing/functions.py index 00905aa40dcd4..0bf76216d444f 100644 --- a/src/lightning/data/processing/functions.py +++ b/src/lightning/data/processing/functions.py @@ -111,6 +111,10 @@ def _find_device(self) -> None: num_gpus = torch.cuda.device_count() device = int(global_rank) % num_gpus self._device = f"cuda:{device}" + elif hasattr(torch, "xpu"): + num_gpus = torch.xpu.device_count() + device = int(global_rank) % num_gpus + self._device = f"xpu:{device}" class LambdaDataChunkRecipe(DataChunkRecipe): diff --git a/src/lightning/data/utilities/env.py b/src/lightning/data/utilities/env.py index c9406963d909b..79766df190e73 100644 --- a/src/lightning/data/utilities/env.py +++ b/src/lightning/data/utilities/env.py @@ -41,6 +41,10 @@ def 
detect(cls) -> "_DistributedEnv": # TODO: Add support for other accelerators num_nodes = (world_size // torch.cuda.device_count()) if torch.cuda.is_available() else 1 + # Add XPU accelerators if there are not CUDA and XPU is available: + if hasattr(torch, "xpu") and torch.xpu.is_available(): + num_nodes = world_size // torch.xpu.device_count() + if num_nodes > 1: # validate the world size is divisble by the number of GPUs assert world_size % torch.cuda.device_count() == 0 diff --git a/src/lightning/fabric/accelerators/xpu.py b/src/lightning/fabric/accelerators/xpu.py index fa88852c320d3..9752f480c5cdc 100644 --- a/src/lightning/fabric/accelerators/xpu.py +++ b/src/lightning/fabric/accelerators/xpu.py @@ -82,9 +82,9 @@ def register_accelerators(cls, accelerator_registry: _AcceleratorRegistry) -> No @lru_cache(1) def num_xpu_devices() -> int: - """Returns the number of available CUDA devices. + """Returns the number of available XPU devices. - Unlike :func:`torch.cuda.device_count`, this function does its best not to create a CUDA context for fork support, + Unlike :func:`torch.xpu.device_count`, this function does its best not to create a XPU context for fork support, if the platform allows it. """ if _IPEX_AVAILABLE: diff --git a/src/lightning/fabric/cli.py b/src/lightning/fabric/cli.py index 0ff777232a7fb..f691095e0526f 100644 --- a/src/lightning/fabric/cli.py +++ b/src/lightning/fabric/cli.py @@ -30,7 +30,7 @@ _CLICK_AVAILABLE = RequirementCache("click") -_SUPPORTED_ACCELERATORS = ("cpu", "gpu", "cuda", "mps", "tpu") +_SUPPORTED_ACCELERATORS = ("cpu", "gpu", "cuda", "mps", "tpu", "xpu") def _get_supported_strategies() -> List[str]: diff --git a/src/lightning/fabric/fabric.py b/src/lightning/fabric/fabric.py index bc07e633a911e..b3d9de055feba 100644 --- a/src/lightning/fabric/fabric.py +++ b/src/lightning/fabric/fabric.py @@ -98,7 +98,7 @@ class Fabric: Args: accelerator: The hardware to run on. Possible choices are: - ``"cpu"``, ``"cuda"``, ``"mps"``, ``"gpu"``, ``"tpu"``, ``"auto"``. + ``"cpu"``, ``"cuda"``, ``"mps"``, ``"xpu"``, ``"gpu"``, ``"tpu"``, ``"auto"``. strategy: Strategy for how to run across multiple devices. Possible choices are: ``"dp"``, ``"ddp"``, ``"ddp_spawn"``, ``"deepspeed"``, ``"fsdp"``. devices: Number of devices to train on (``int``), which GPUs to train on (``list`` or ``str``), or ``"auto"``. 
diff --git a/src/lightning/fabric/strategies/strategy.py b/src/lightning/fabric/strategies/strategy.py index 6c29b95d2a481..233712a55fa45 100644 --- a/src/lightning/fabric/strategies/strategy.py +++ b/src/lightning/fabric/strategies/strategy.py @@ -329,6 +329,7 @@ def load_checkpoint( """ torch.cuda.empty_cache() + if hasattr(torch, "xpu") : torch.xpu.empty_cache() checkpoint = self.checkpoint_io.load_checkpoint(path) if not state: return checkpoint diff --git a/src/lightning/fabric/utilities/device_dtype_mixin.py b/src/lightning/fabric/utilities/device_dtype_mixin.py index a9e614cbbd1d2..ec12ee46b0336 100644 --- a/src/lightning/fabric/utilities/device_dtype_mixin.py +++ b/src/lightning/fabric/utilities/device_dtype_mixin.py @@ -43,6 +43,10 @@ def device(self) -> torch.device: # make this more explicit to always include the index if device.type == "cuda" and device.index is None: return torch.device(f"cuda:{torch.cuda.current_device()}") + + if hasattr(torch, "xpu") and device.type == "xpu" and device.index is None: + return torch.device(f"xpu:{torch.xpu.current_device()}") + return device @@ -75,6 +79,27 @@ def cuda(self, device: Optional[Union[torch.device, int]] = None) -> Self: _update_properties(self, device=device) return super().cuda(device=device) + @override + def xpu(self, device: Optional[Union[torch.device, int]] = None) -> Self: + """Moves all model parameters and buffers to the XPU GPU. This also makes associated parameters and buffers + different objects. So it should be called before constructing optimizer if the module will live on GPU while + being optimized. + + Arguments: + device: If specified, all parameters will be copied to that device. If `None`, the current XPU device + index will be used. + + Returns: + Module: self + + """ + if device is None: + device = torch.device("xpu", torch.xpu.current_device()) + elif isinstance(device, int): + device = torch.device("xpu", index=device) + _update_properties(self, device=device) + return super().xpu(device=device) + @override def cpu(self) -> Self: """See :meth:`torch.nn.Module.cpu`.""" diff --git a/src/lightning/fabric/utilities/seed.py b/src/lightning/fabric/utilities/seed.py index b274bce88fcdf..4e78f01079b18 100644 --- a/src/lightning/fabric/utilities/seed.py +++ b/src/lightning/fabric/utilities/seed.py @@ -102,7 +102,7 @@ def pl_worker_init_function(worker_id: int, rank: Optional[int] = None) -> None: random.seed(stdlib_seed) -def _collect_rng_states(include_cuda: bool = True) -> Dict[str, Any]: +def _collect_rng_states(include_cuda: bool = True, include_xpu: bool = True) -> Dict[str, Any]: r"""Collect the global random state of :mod:`torch`, :mod:`torch.cuda`, :mod:`numpy` and Python.""" states = { "torch": torch.get_rng_state(), @@ -111,6 +111,8 @@ def _collect_rng_states(include_cuda: bool = True) -> Dict[str, Any]: } if include_cuda: states["torch.cuda"] = torch.cuda.get_rng_state_all() if torch.cuda.is_available() else [] + if include_xpu and hasattr(torch, "xpu"): + states["torch.xpu"] = torch.xpu.get_rng_state_all() if torch.xpu.is_available() else [] return states @@ -121,6 +123,8 @@ def _set_rng_states(rng_state_dict: Dict[str, Any]) -> None: # torch.cuda rng_state is only included since v1.8. 
if "torch.cuda" in rng_state_dict: torch.cuda.set_rng_state_all(rng_state_dict["torch.cuda"]) + if "torch.xpu" in rng_state_dict and hasattr(torch, "xpu"): + torch.xpu.set_rng_state_all(rng_state_dict["torch.xpu"]) np.random.set_state(rng_state_dict["numpy"]) version, state, gauss = rng_state_dict["python"] python_set_rng_state((version, tuple(state), gauss)) From 365126d92fb5c122fdd644466d5e9bc88dc5c81e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 9 Feb 2024 22:34:07 +0000 Subject: [PATCH 09/26] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/lightning/data/utilities/env.py | 2 +- src/lightning/fabric/accelerators/xpu.py | 21 +++++++---------- src/lightning/fabric/strategies/ddp.py | 2 +- .../fabric/utilities/device_dtype_mixin.py | 2 +- .../fabric/utilities/device_parser.py | 10 ++++---- src/lightning/fabric/utilities/distributed.py | 7 +++--- .../pytorch/accelerators/__init__.py | 1 - src/lightning/pytorch/accelerators/xpu.py | 23 +++++++------------ .../pytorch/callbacks/throughput_monitor.py | 2 +- 9 files changed, 28 insertions(+), 42 deletions(-) diff --git a/src/lightning/data/utilities/env.py b/src/lightning/data/utilities/env.py index 79766df190e73..e5cbbdc74cb5c 100644 --- a/src/lightning/data/utilities/env.py +++ b/src/lightning/data/utilities/env.py @@ -43,7 +43,7 @@ def detect(cls) -> "_DistributedEnv": # Add XPU accelerators if there are not CUDA and XPU is available: if hasattr(torch, "xpu") and torch.xpu.is_available(): - num_nodes = world_size // torch.xpu.device_count() + num_nodes = world_size // torch.xpu.device_count() if num_nodes > 1: # validate the world size is divisble by the number of GPUs diff --git a/src/lightning/fabric/accelerators/xpu.py b/src/lightning/fabric/accelerators/xpu.py index 9752f480c5cdc..d3e377eadd2c2 100644 --- a/src/lightning/fabric/accelerators/xpu.py +++ b/src/lightning/fabric/accelerators/xpu.py @@ -11,20 +11,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import os -import warnings -from contextlib import contextmanager from functools import lru_cache -from typing import cast, Generator, List, Optional, Union +from typing import Any, Dict, List, Union import torch -from lightning_utilities.core.rank_zero import rank_zero_info +from lightning_utilities.core.imports import RequirementCache from lightning.fabric.accelerators.accelerator import Accelerator from lightning.fabric.accelerators.registry import _AcceleratorRegistry -from lightning_utilities.core.imports import RequirementCache -from typing import Any, Dict, List, Union class XPUAccelerator(Accelerator): """Support for a Intel Discrete Graphics Cards 'XPU'.""" @@ -45,8 +40,7 @@ def parse_devices(devices: Any) -> Any: @staticmethod def get_parallel_devices(devices: Any) -> Any: # Here, convert the device indices to actual device objects - import intel_extension_for_pytorch as ipex - + return [torch.device("xpu", idx) for idx in devices] @staticmethod @@ -65,7 +59,7 @@ def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: def setup_device(self, device: torch.device) -> None: pass - + def teardown(self) -> None: pass @@ -86,12 +80,12 @@ def num_xpu_devices() -> int: Unlike :func:`torch.xpu.device_count`, this function does its best not to create a XPU context for fork support, if the platform allows it. + """ if _IPEX_AVAILABLE: import intel_extension_for_pytorch as ipex return ipex.xpu.device_count() - else: - return 0 + return 0 def _get_all_visible_xpu_devices() -> List[int]: """Returns a list of all visible Intel XPU devices. @@ -99,5 +93,6 @@ def _get_all_visible_xpu_devices() -> List[int]: Devices masked by the environment variabale ``ZE_AFFINITY_MASK`` won't be returned here. For example, assume you have 8 physical GPUs. If ``ZE_AFFINITY_MASK="1,3,6"``, then this function will return the list ``[0, 1, 2]`` because these are the three visible GPUs after applying the mask ``ZE_AFFINITY_MASK``. 
+ """ - return list(range(num_xpu_devices())) \ No newline at end of file + return list(range(num_xpu_devices())) diff --git a/src/lightning/fabric/strategies/ddp.py b/src/lightning/fabric/strategies/ddp.py index 3e4f68dd17b57..7433b418d49f3 100644 --- a/src/lightning/fabric/strategies/ddp.py +++ b/src/lightning/fabric/strategies/ddp.py @@ -127,7 +127,7 @@ def setup_module(self, module: Module) -> DistributedDataParallel: print(self.root_device.type) # https://pytorch.org/docs/stable/notes/cuda.html#id5 if self.root_device.type == "cuda": - ctx = torch.cuda.stream(torch.cuda.Stream()) if device_ids is not None else nullcontext() + ctx = torch.cuda.stream(torch.cuda.Stream()) if device_ids is not None else nullcontext() elif self.root_device.type == "xpu": ctx = torch.xpu.stream(torch.xpu.Stream()) if device_ids is not None else nullcontext() else: diff --git a/src/lightning/fabric/utilities/device_dtype_mixin.py b/src/lightning/fabric/utilities/device_dtype_mixin.py index ec12ee46b0336..71d597e68afba 100644 --- a/src/lightning/fabric/utilities/device_dtype_mixin.py +++ b/src/lightning/fabric/utilities/device_dtype_mixin.py @@ -43,7 +43,7 @@ def device(self) -> torch.device: # make this more explicit to always include the index if device.type == "cuda" and device.index is None: return torch.device(f"cuda:{torch.cuda.current_device()}") - + if hasattr(torch, "xpu") and device.type == "xpu" and device.index is None: return torch.device(f"xpu:{torch.xpu.current_device()}") diff --git a/src/lightning/fabric/utilities/device_parser.py b/src/lightning/fabric/utilities/device_parser.py index 44199a790d207..b0b429c478e5b 100644 --- a/src/lightning/fabric/utilities/device_parser.py +++ b/src/lightning/fabric/utilities/device_parser.py @@ -115,8 +115,8 @@ def _normalize_parse_gpu_string_input(s: Union[int, str, List[int]]) -> Union[in def _sanitize_gpu_ids(gpus: List[int], include_cuda: bool = False, include_mps: bool = False, include_xpu: bool = False) -> List[int]: - """Checks that each of the GPUs in the list is actually available. Raises a MisconfigurationException if any of - the GPUs is not available. + """Checks that each of the GPUs in the list is actually available. Raises a MisconfigurationException if any of the + GPUs is not available. 
Args: gpus: List of ints corresponding to GPU indices @@ -156,15 +156,15 @@ def _normalize_parse_gpu_input_to_list( return list(range(gpus)) -def _get_all_available_gpus(include_cuda: bool = False, include_mps: bool = False, include_xpu: bool = False) -> List[int]: +def _get_all_available_gpus(include_cuda: bool = False, include_mps: bool = False, include_xpu: bool = False) -> List[int]: """ Returns: A list of all available GPUs """ from lightning.fabric.accelerators.cuda import _get_all_visible_cuda_devices - from lightning.fabric.accelerators.mps import _get_all_available_mps_gpus - from lightning.fabric.accelerators.xpu import _get_all_visible_xpu_devices + from lightning.fabric.accelerators.mps import _get_all_available_mps_gpus + from lightning.fabric.accelerators.xpu import _get_all_visible_xpu_devices cuda_gpus = _get_all_visible_cuda_devices() if include_cuda else [] mps_gpus = _get_all_available_mps_gpus() if include_mps else [] diff --git a/src/lightning/fabric/utilities/distributed.py b/src/lightning/fabric/utilities/distributed.py index b52335fdc10fa..7dca4f3f9e3f8 100644 --- a/src/lightning/fabric/utilities/distributed.py +++ b/src/lightning/fabric/utilities/distributed.py @@ -290,7 +290,7 @@ def _init_dist_connection( log.info(f"Initializing distributed: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}") if torch_distributed_backend.lower() == "ccl": - import oneccl_bindings_for_pytorch + pass torch.distributed.init_process_group(torch_distributed_backend, rank=global_rank, world_size=world_size, **kwargs) @@ -306,10 +306,9 @@ def _init_dist_connection( def _get_default_process_group_backend_for_device(device: torch.device) -> str: if device.type == "cuda": return "nncl" - elif device.type == "xpu": + if device.type == "xpu": return "ccl" - else: - return "gloo" + return "gloo" class _DatasetSamplerWrapper(Dataset): diff --git a/src/lightning/pytorch/accelerators/__init__.py b/src/lightning/pytorch/accelerators/__init__.py index 92437ff0f59ca..cbeb82d1cb32e 100644 --- a/src/lightning/pytorch/accelerators/__init__.py +++ b/src/lightning/pytorch/accelerators/__init__.py @@ -22,6 +22,5 @@ from lightning.pytorch.accelerators.xla import XLAAccelerator # noqa: F401 from lightning.pytorch.accelerators.xpu import XPUAccelerator # noqa: F401 - AcceleratorRegistry = _AcceleratorRegistry() _register_classes(AcceleratorRegistry, "register_accelerators", sys.modules[__name__], Accelerator) diff --git a/src/lightning/pytorch/accelerators/xpu.py b/src/lightning/pytorch/accelerators/xpu.py index 97ced93bb610f..547ed1301ecfa 100644 --- a/src/lightning/pytorch/accelerators/xpu.py +++ b/src/lightning/pytorch/accelerators/xpu.py @@ -11,21 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import os -import warnings -from contextlib import contextmanager from functools import lru_cache -from typing import cast, Generator, List, Optional, Union +from typing import Any, Dict, List, Union import torch -from lightning_utilities.core.rank_zero import rank_zero_info - -from lightning.fabric.accelerators.registry import _AcceleratorRegistry - from lightning_utilities.core.imports import RequirementCache from lightning.pytorch.accelerators.accelerator import Accelerator -from typing import Any, Dict, List, Union + class XPUAccelerator(Accelerator): """Support for a Intel Discrete Graphics Cards 'XPU'.""" @@ -46,8 +39,7 @@ def parse_devices(devices: Any) -> Any: @staticmethod def get_parallel_devices(devices: Any) -> Any: # Here, convert the device indices to actual device objects - import intel_extension_for_pytorch as ipex - + return [torch.device("xpu", idx) for idx in devices] @staticmethod @@ -66,7 +58,7 @@ def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: def setup_device(self, device: torch.device) -> None: pass - + def teardown(self) -> None: pass @@ -88,12 +80,12 @@ def num_xpu_devices() -> int: Unlike :func:`torch.cuda.device_count`, this function does its best not to create a CUDA context for fork support, if the platform allows it. + """ if _IPEX_AVAILABLE: import intel_extension_for_pytorch as ipex return ipex.xpu.device_count() - else: - return 0 + return 0 def _get_all_visible_xpu_devices() -> List[int]: """Returns a list of all visible Intel XPU devices. @@ -101,5 +93,6 @@ def _get_all_visible_xpu_devices() -> List[int]: Devices masked by the environment variabale ``ZE_AFFINITY_MASK`` won't be returned here. For example, assume you have 8 physical GPUs. If ``ZE_AFFINITY_MASK="1,3,6"``, then this function will return the list ``[0, 1, 2]`` because these are the three visible GPUs after applying the mask ``ZE_AFFINITY_MASK``. 
+ """ - return list(range(num_xpu_devices())) \ No newline at end of file + return list(range(num_xpu_devices())) diff --git a/src/lightning/pytorch/callbacks/throughput_monitor.py b/src/lightning/pytorch/callbacks/throughput_monitor.py index c1a23b65563e5..0fb936d2e9225 100644 --- a/src/lightning/pytorch/callbacks/throughput_monitor.py +++ b/src/lightning/pytorch/callbacks/throughput_monitor.py @@ -132,7 +132,7 @@ def _update(self, trainer: "Trainer", pl_module: "LightningModule", batch: Any, torch.cuda.synchronize() elif trainer.strategy.root_device.type == "xpu": torch.xpu.synchronize() - + elapsed = time.perf_counter() - self._t0s[stage] if self.length_fn is not None: self._lengths[stage] += self.length_fn(batch) From 5b6c3a8172319586424f2fee8a255d4fee375c22 Mon Sep 17 00:00:00 2001 From: Corey Adams Date: Fri, 9 Feb 2024 16:37:34 -0600 Subject: [PATCH 10/26] Fix typo --- src/lightning/pytorch/core/saving.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lightning/pytorch/core/saving.py b/src/lightning/pytorch/core/saving.py index 6211c83e0bee6..5aa3727822ebb 100644 --- a/src/lightning/pytorch/core/saving.py +++ b/src/lightning/pytorch/core/saving.py @@ -110,7 +110,7 @@ def _default_map_location(storage: "UntypedStorage", location: str) -> Optional[ or location.startswith("xla") and not XLAAccelerator.is_available() or location.startswith("xpu") - and nott XPUAccelerator.is_available() + and not XPUAccelerator.is_available() ): return storage.cpu() return None # default behavior by `torch.load()` From d08bda82fcdd74cf4c578498d2cbe55f661421cf Mon Sep 17 00:00:00 2001 From: Corey Adams Date: Mon, 12 Feb 2024 11:00:14 -0600 Subject: [PATCH 11/26] Fix type error in memory stats. Enable oneccl in distributed mode --- src/lightning/fabric/utilities/distributed.py | 1 + src/lightning/pytorch/accelerators/xpu.py | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/lightning/fabric/utilities/distributed.py b/src/lightning/fabric/utilities/distributed.py index 7dca4f3f9e3f8..3fd27d208f75d 100644 --- a/src/lightning/fabric/utilities/distributed.py +++ b/src/lightning/fabric/utilities/distributed.py @@ -290,6 +290,7 @@ def _init_dist_connection( log.info(f"Initializing distributed: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}") if torch_distributed_backend.lower() == "ccl": + import oneccl_bindings_for_pytorch pass torch.distributed.init_process_group(torch_distributed_backend, rank=global_rank, world_size=world_size, **kwargs) diff --git a/src/lightning/pytorch/accelerators/xpu.py b/src/lightning/pytorch/accelerators/xpu.py index 547ed1301ecfa..bf3e538794da8 100644 --- a/src/lightning/pytorch/accelerators/xpu.py +++ b/src/lightning/pytorch/accelerators/xpu.py @@ -18,6 +18,7 @@ from lightning_utilities.core.imports import RequirementCache from lightning.pytorch.accelerators.accelerator import Accelerator +from lightning.fabric.utilities.types import _DEVICE class XPUAccelerator(Accelerator): @@ -52,9 +53,9 @@ def is_available() -> bool: import intel_extension_for_pytorch as ipex return ipex.xpu.is_available() - def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: + def get_device_stats(self, device: _DEVICE) -> Dict[str, Any]: # Return optional device statistics for loggers - return {} + return torch.xpu.memory_stats(device) def setup_device(self, device: torch.device) -> None: pass From cdc9ae516dfe5c32615b5a9198a250c9368dcfb3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" 
<66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 12 Feb 2024 17:01:39 +0000 Subject: [PATCH 12/26] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/lightning/fabric/utilities/distributed.py | 1 - src/lightning/pytorch/accelerators/xpu.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/lightning/fabric/utilities/distributed.py b/src/lightning/fabric/utilities/distributed.py index 3fd27d208f75d..7dca4f3f9e3f8 100644 --- a/src/lightning/fabric/utilities/distributed.py +++ b/src/lightning/fabric/utilities/distributed.py @@ -290,7 +290,6 @@ def _init_dist_connection( log.info(f"Initializing distributed: GLOBAL_RANK: {global_rank}, MEMBER: {global_rank + 1}/{world_size}") if torch_distributed_backend.lower() == "ccl": - import oneccl_bindings_for_pytorch pass torch.distributed.init_process_group(torch_distributed_backend, rank=global_rank, world_size=world_size, **kwargs) diff --git a/src/lightning/pytorch/accelerators/xpu.py b/src/lightning/pytorch/accelerators/xpu.py index bf3e538794da8..f189493a9c620 100644 --- a/src/lightning/pytorch/accelerators/xpu.py +++ b/src/lightning/pytorch/accelerators/xpu.py @@ -12,13 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. from functools import lru_cache -from typing import Any, Dict, List, Union +from typing import Any, Dict, List import torch from lightning_utilities.core.imports import RequirementCache -from lightning.pytorch.accelerators.accelerator import Accelerator from lightning.fabric.utilities.types import _DEVICE +from lightning.pytorch.accelerators.accelerator import Accelerator class XPUAccelerator(Accelerator): From 68ffe7a8dbfd1ddbc8c6796330eb1cf87268f880 Mon Sep 17 00:00:00 2001 From: Corey Adams Date: Mon, 12 Feb 2024 11:11:19 -0600 Subject: [PATCH 13/26] Address precommit.ci errors --- src/lightning/fabric/accelerators/__init__.py | 2 +- src/lightning/fabric/strategies/strategy.py | 3 ++- .../fabric/utilities/device_parser.py | 27 +++++++++++++++---- src/lightning/pytorch/strategies/ddp.py | 2 +- 4 files changed, 26 insertions(+), 8 deletions(-) diff --git a/src/lightning/fabric/accelerators/__init__.py b/src/lightning/fabric/accelerators/__init__.py index 62319379b5bf4..8f5bb9c296bc6 100644 --- a/src/lightning/fabric/accelerators/__init__.py +++ b/src/lightning/fabric/accelerators/__init__.py @@ -18,7 +18,7 @@ from lightning.fabric.accelerators.mps import MPSAccelerator # noqa: F401 from lightning.fabric.accelerators.registry import _AcceleratorRegistry from lightning.fabric.accelerators.xla import XLAAccelerator # noqa: F401 -from lightning.fabric.accelerators.xpu import XPUAccelerator +from lightning.fabric.accelerators.xpu import XPUAccelerator # noqa: F401 from lightning.fabric.utilities.registry import _register_classes ACCELERATOR_REGISTRY = _AcceleratorRegistry() diff --git a/src/lightning/fabric/strategies/strategy.py b/src/lightning/fabric/strategies/strategy.py index 233712a55fa45..ca5379510427e 100644 --- a/src/lightning/fabric/strategies/strategy.py +++ b/src/lightning/fabric/strategies/strategy.py @@ -329,7 +329,8 @@ def load_checkpoint( """ torch.cuda.empty_cache() - if hasattr(torch, "xpu") : torch.xpu.empty_cache() + if hasattr(torch, "xpu"): + torch.xpu.empty_cache() checkpoint = self.checkpoint_io.load_checkpoint(path) if not state: return checkpoint diff --git a/src/lightning/fabric/utilities/device_parser.py 
b/src/lightning/fabric/utilities/device_parser.py index b0b429c478e5b..b8e75ee38cfbb 100644 --- a/src/lightning/fabric/utilities/device_parser.py +++ b/src/lightning/fabric/utilities/device_parser.py @@ -85,7 +85,10 @@ def _parse_gpu_ids( # We know the user requested GPUs therefore if some of the # requested GPUs are not available an exception is thrown. gpus = _normalize_parse_gpu_string_input(gpus) - gpus = _normalize_parse_gpu_input_to_list(gpus, include_cuda=include_cuda, include_mps=include_mps, include_xpu=include_xpu) + gpus = _normalize_parse_gpu_input_to_list(gpus, + include_cuda=include_cuda, + include_mps=include_mps, + include_xpu=include_xpu) if not gpus: raise MisconfigurationException("GPUs requested but none are available.") @@ -93,7 +96,9 @@ def _parse_gpu_ids( torch.distributed.is_available() and torch.distributed.is_torchelastic_launched() and len(gpus) != 1 - and len(_get_all_available_gpus(include_cuda=include_cuda, include_mps=include_mps, include_xpu=include_xpu)) == 1 + and len(_get_all_available_gpus(include_cuda=include_cuda, + include_mps=include_mps, + include_xpu=include_xpu)) == 1 ): # Omit sanity check on torchelastic because by default it shows one visible GPU per process return gpus @@ -114,7 +119,12 @@ def _normalize_parse_gpu_string_input(s: Union[int, str, List[int]]) -> Union[in return int(s.strip()) -def _sanitize_gpu_ids(gpus: List[int], include_cuda: bool = False, include_mps: bool = False, include_xpu: bool = False) -> List[int]: +def _sanitize_gpu_ids( + gpus: List[int], + include_cuda: bool = False, + include_mps: bool = False, + include_xpu: bool = False +) -> List[int]: """Checks that each of the GPUs in the list is actually available. Raises a MisconfigurationException if any of the GPUs is not available. @@ -141,7 +151,10 @@ def _sanitize_gpu_ids(gpus: List[int], include_cuda: bool = False, include_mps: def _normalize_parse_gpu_input_to_list( - gpus: Union[int, List[int], Tuple[int, ...]], include_cuda: bool, include_mps: bool, include_xpu: bool + gpus: Union[int, List[int], Tuple[int, ...]], + include_cuda: bool, + include_mps: bool, + include_xpu: bool ) -> Optional[List[int]]: assert gpus is not None if isinstance(gpus, (MutableSequence, tuple)): @@ -156,7 +169,11 @@ def _normalize_parse_gpu_input_to_list( return list(range(gpus)) -def _get_all_available_gpus(include_cuda: bool = False, include_mps: bool = False, include_xpu: bool = False) -> List[int]: +def _get_all_available_gpus( + include_cuda: bool = False, + include_mps: bool = False, + include_xpu: bool = False +) -> List[int]: """ Returns: A list of all available GPUs diff --git a/src/lightning/pytorch/strategies/ddp.py b/src/lightning/pytorch/strategies/ddp.py index e612287f63daf..f95bbbda4ad11 100644 --- a/src/lightning/pytorch/strategies/ddp.py +++ b/src/lightning/pytorch/strategies/ddp.py @@ -309,7 +309,7 @@ def broadcast(self, obj: TBroadcast, src: int = 0) -> TBroadcast: return obj obj = [obj] - if self.root_device.type != "xpu" and type(obj[0]) == str: + if self.root_device.type != "xpu" and isinstance(obj[0], str): # I don't know why this is true. I will have to investigate.
In the meantime, # This is getting called by the profiler which can be worked around: torch.distributed.broadcast_object_list(obj, src, group=_group.WORLD) From 5103a1606ad839c39217b2b6fca88a49b50a10b1 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 12 Feb 2024 17:12:33 +0000 Subject: [PATCH 14/26] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/lightning/fabric/accelerators/__init__.py | 2 +- src/lightning/fabric/strategies/strategy.py | 2 +- .../fabric/utilities/device_parser.py | 26 +++++++++---------- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/lightning/fabric/accelerators/__init__.py b/src/lightning/fabric/accelerators/__init__.py index 8f5bb9c296bc6..1a3dda20828a7 100644 --- a/src/lightning/fabric/accelerators/__init__.py +++ b/src/lightning/fabric/accelerators/__init__.py @@ -18,7 +18,7 @@ from lightning.fabric.accelerators.mps import MPSAccelerator # noqa: F401 from lightning.fabric.accelerators.registry import _AcceleratorRegistry from lightning.fabric.accelerators.xla import XLAAccelerator # noqa: F401 -from lightning.fabric.accelerators.xpu import XPUAccelerator # noqa: F401 +from lightning.fabric.accelerators.xpu import XPUAccelerator # noqa: F401 from lightning.fabric.utilities.registry import _register_classes ACCELERATOR_REGISTRY = _AcceleratorRegistry() diff --git a/src/lightning/fabric/strategies/strategy.py b/src/lightning/fabric/strategies/strategy.py index ca5379510427e..d2810efb4f077 100644 --- a/src/lightning/fabric/strategies/strategy.py +++ b/src/lightning/fabric/strategies/strategy.py @@ -329,7 +329,7 @@ def load_checkpoint( """ torch.cuda.empty_cache() - if hasattr(torch, "xpu"): + if hasattr(torch, "xpu"): torch.xpu.empty_cache() checkpoint = self.checkpoint_io.load_checkpoint(path) if not state: diff --git a/src/lightning/fabric/utilities/device_parser.py b/src/lightning/fabric/utilities/device_parser.py index b8e75ee38cfbb..13de96aac61b9 100644 --- a/src/lightning/fabric/utilities/device_parser.py +++ b/src/lightning/fabric/utilities/device_parser.py @@ -85,9 +85,9 @@ def _parse_gpu_ids( # We know the user requested GPUs therefore if some of the # requested GPUs are not available an exception is thrown. gpus = _normalize_parse_gpu_string_input(gpus) - gpus = _normalize_parse_gpu_input_to_list(gpus, - include_cuda=include_cuda, - include_mps=include_mps, + gpus = _normalize_parse_gpu_input_to_list(gpus, + include_cuda=include_cuda, + include_mps=include_mps, include_xpu=include_xpu) if not gpus: raise MisconfigurationException("GPUs requested but none are available.") @@ -96,8 +96,8 @@ def _parse_gpu_ids( torch.distributed.is_available() and torch.distributed.is_torchelastic_launched() and len(gpus) != 1 - and len(_get_all_available_gpus(include_cuda=include_cuda, - include_mps=include_mps, + and len(_get_all_available_gpus(include_cuda=include_cuda, + include_mps=include_mps, include_xpu=include_xpu)) == 1 ): # Omit sanity check on torchelastic because by default it shows one visible GPU per process @@ -120,9 +120,9 @@ def _normalize_parse_gpu_string_input(s: Union[int, str, List[int]]) -> Union[in def _sanitize_gpu_ids( - gpus: List[int], - include_cuda: bool = False, - include_mps: bool = False, + gpus: List[int], + include_cuda: bool = False, + include_mps: bool = False, include_xpu: bool = False ) -> List[int]: """Checks that each of the GPUs in the list is actually available. 
Raises a MisconfigurationException if any of the @@ -151,9 +151,9 @@ def _sanitize_gpu_ids( def _normalize_parse_gpu_input_to_list( - gpus: Union[int, List[int], Tuple[int, ...]], - include_cuda: bool, - include_mps: bool, + gpus: Union[int, List[int], Tuple[int, ...]], + include_cuda: bool, + include_mps: bool, include_xpu: bool ) -> Optional[List[int]]: assert gpus is not None @@ -170,8 +170,8 @@ def _normalize_parse_gpu_input_to_list( def _get_all_available_gpus( - include_cuda: bool = False, - include_mps: bool = False, + include_cuda: bool = False, + include_mps: bool = False, include_xpu: bool = False ) -> List[int]: """ From 0aad4170171dc6ae8606ecc4f9690d29731d31c8 Mon Sep 17 00:00:00 2001 From: Corey Adams Date: Mon, 12 Feb 2024 11:15:10 -0600 Subject: [PATCH 15/26] Missed a line that was too long --- src/lightning/fabric/utilities/device_parser.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/lightning/fabric/utilities/device_parser.py b/src/lightning/fabric/utilities/device_parser.py index b8e75ee38cfbb..e8bfaf409a210 100644 --- a/src/lightning/fabric/utilities/device_parser.py +++ b/src/lightning/fabric/utilities/device_parser.py @@ -141,7 +141,10 @@ def _sanitize_gpu_ids( """ if sum((include_cuda, include_mps, include_xpu)) == 0: raise ValueError("At least one gpu type should be specified!") - all_available_gpus = _get_all_available_gpus(include_cuda=include_cuda, include_mps=include_mps, include_xpu=include_xpu) + all_available_gpus = _get_all_available_gpus( + include_cuda=include_cuda, + include_mps=include_mps, + include_xpu=include_xpu) for gpu in gpus: if gpu not in all_available_gpus: raise MisconfigurationException( From 036cc1b6dd093a1829a33fe6cc5e9b2494fa0b67 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 12 Feb 2024 17:17:35 +0000 Subject: [PATCH 16/26] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/lightning/fabric/utilities/device_parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lightning/fabric/utilities/device_parser.py b/src/lightning/fabric/utilities/device_parser.py index 7b94336f606db..476caeb2fbeba 100644 --- a/src/lightning/fabric/utilities/device_parser.py +++ b/src/lightning/fabric/utilities/device_parser.py @@ -142,8 +142,8 @@ def _sanitize_gpu_ids( if sum((include_cuda, include_mps, include_xpu)) == 0: raise ValueError("At least one gpu type should be specified!") all_available_gpus = _get_all_available_gpus( - include_cuda=include_cuda, - include_mps=include_mps, + include_cuda=include_cuda, + include_mps=include_mps, include_xpu=include_xpu) for gpu in gpus: if gpu not in all_available_gpus: From 6936c24c90ea61ab573b6039c119cd127d483a8c Mon Sep 17 00:00:00 2001 From: Corey Adams Date: Mon, 12 Feb 2024 11:22:35 -0600 Subject: [PATCH 17/26] Wrap ipex imports more carefully --- src/lightning/fabric/accelerators/xpu.py | 8 ++++++-- src/lightning/pytorch/accelerators/xpu.py | 8 ++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/lightning/fabric/accelerators/xpu.py b/src/lightning/fabric/accelerators/xpu.py index d3e377eadd2c2..2c45e13f95185 100644 --- a/src/lightning/fabric/accelerators/xpu.py +++ b/src/lightning/fabric/accelerators/xpu.py @@ -50,8 +50,12 @@ def auto_device_count() -> int: @staticmethod def is_available() -> bool: - import intel_extension_for_pytorch as ipex - return ipex.xpu.is_available() + # Carefully check before 
trying to import: + if _IPEX_AVAILABLE: + import intel_extension_for_pytorch as ipex + return ipex.xpu.is_available() + else: + return False def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: # Return optional device statistics for loggers diff --git a/src/lightning/pytorch/accelerators/xpu.py b/src/lightning/pytorch/accelerators/xpu.py index f189493a9c620..0e69df0f33dc9 100644 --- a/src/lightning/pytorch/accelerators/xpu.py +++ b/src/lightning/pytorch/accelerators/xpu.py @@ -50,8 +50,12 @@ def auto_device_count() -> int: @staticmethod def is_available() -> bool: - import intel_extension_for_pytorch as ipex - return ipex.xpu.is_available() + # Carefully check before trying to import: + if _IPEX_AVAILABLE: + import intel_extension_for_pytorch as ipex + return ipex.xpu.is_available() + else: + return False def get_device_stats(self, device: _DEVICE) -> Dict[str, Any]: # Return optional device statistics for loggers From 6ab7e1a27dd07b19619e615909748e27d1d52751 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 12 Feb 2024 17:26:07 +0000 Subject: [PATCH 18/26] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/lightning/fabric/accelerators/xpu.py | 3 +-- src/lightning/pytorch/accelerators/xpu.py | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/lightning/fabric/accelerators/xpu.py b/src/lightning/fabric/accelerators/xpu.py index 2c45e13f95185..7fbbb9405c05a 100644 --- a/src/lightning/fabric/accelerators/xpu.py +++ b/src/lightning/fabric/accelerators/xpu.py @@ -54,8 +54,7 @@ def is_available() -> bool: if _IPEX_AVAILABLE: import intel_extension_for_pytorch as ipex return ipex.xpu.is_available() - else: - return False + return False def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: # Return optional device statistics for loggers diff --git a/src/lightning/pytorch/accelerators/xpu.py b/src/lightning/pytorch/accelerators/xpu.py index 0e69df0f33dc9..4d5a43b1b8d6c 100644 --- a/src/lightning/pytorch/accelerators/xpu.py +++ b/src/lightning/pytorch/accelerators/xpu.py @@ -54,8 +54,7 @@ def is_available() -> bool: if _IPEX_AVAILABLE: import intel_extension_for_pytorch as ipex return ipex.xpu.is_available() - else: - return False + return False def get_device_stats(self, device: _DEVICE) -> Dict[str, Any]: # Return optional device statistics for loggers From eee4061fc35b4fc828fdc87ebcf3e5a25e555192 Mon Sep 17 00:00:00 2001 From: Corey Adams Date: Mon, 12 Feb 2024 11:36:45 -0600 Subject: [PATCH 19/26] Fix typo of nncl to nccl --- src/lightning/fabric/utilities/distributed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lightning/fabric/utilities/distributed.py b/src/lightning/fabric/utilities/distributed.py index 7dca4f3f9e3f8..a929c2501f4b7 100644 --- a/src/lightning/fabric/utilities/distributed.py +++ b/src/lightning/fabric/utilities/distributed.py @@ -305,7 +305,7 @@ def _init_dist_connection( def _get_default_process_group_backend_for_device(device: torch.device) -> str: if device.type == "cuda": - return "nncl" + return "nccl" if device.type == "xpu": return "ccl" return "gloo" From be667293e3eca5550b879e0de8e43ce21923467b Mon Sep 17 00:00:00 2001 From: Corey Adams Date: Mon, 12 Feb 2024 11:59:05 -0600 Subject: [PATCH 20/26] Add function typing and return signature to one XPU function. Add override decorator in line with other accelerators. 
--- src/lightning/fabric/accelerators/xpu.py | 8 ++++++++ src/lightning/pytorch/accelerators/xpu.py | 10 +++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/src/lightning/fabric/accelerators/xpu.py b/src/lightning/fabric/accelerators/xpu.py index 7fbbb9405c05a..1528c14ff8611 100644 --- a/src/lightning/fabric/accelerators/xpu.py +++ b/src/lightning/fabric/accelerators/xpu.py @@ -30,6 +30,7 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: super().__init__(*args, **kwargs) @staticmethod + @override def parse_devices(devices: Any) -> Any: # Put parsing logic here how devices can be passed into the Trainer # via the `devices` argument @@ -38,17 +39,20 @@ def parse_devices(devices: Any) -> Any: return _parse_gpu_ids(devices, include_xpu=True) @staticmethod + @override def get_parallel_devices(devices: Any) -> Any: # Here, convert the device indices to actual device objects return [torch.device("xpu", idx) for idx in devices] @staticmethod + @override def auto_device_count() -> int: # Return a value for auto-device selection when `Trainer(devices="auto")` return num_xpu_devices() @staticmethod + @override def is_available() -> bool: # Carefully check before trying to import: if _IPEX_AVAILABLE: @@ -56,17 +60,21 @@ def is_available() -> bool: return ipex.xpu.is_available() return False + @override def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: # Return optional device statistics for loggers return {} + @override def setup_device(self, device: torch.device) -> None: pass + @override def teardown(self) -> None: pass @classmethod + @override def register_accelerators(cls, accelerator_registry: _AcceleratorRegistry) -> None: accelerator_registry.register( "xpu", diff --git a/src/lightning/pytorch/accelerators/xpu.py b/src/lightning/pytorch/accelerators/xpu.py index 4d5a43b1b8d6c..ff072ce8dcf99 100644 --- a/src/lightning/pytorch/accelerators/xpu.py +++ b/src/lightning/pytorch/accelerators/xpu.py @@ -30,6 +30,7 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: super().__init__(*args, **kwargs) @staticmethod + @override def parse_devices(devices: Any) -> Any: # Put parsing logic here how devices can be passed into the Trainer # via the `devices` argument @@ -38,17 +39,20 @@ def parse_devices(devices: Any) -> Any: return _parse_gpu_ids(devices, include_xpu=True) @staticmethod + @override def get_parallel_devices(devices: Any) -> Any: # Here, convert the device indices to actual device objects return [torch.device("xpu", idx) for idx in devices] @staticmethod + @override def auto_device_count() -> int: # Return a value for auto-device selection when `Trainer(devices="auto")` return num_xpu_devices() @staticmethod + @override def is_available() -> bool: # Carefully check before trying to import: if _IPEX_AVAILABLE: @@ -56,18 +60,22 @@ def is_available() -> bool: return ipex.xpu.is_available() return False + @override def get_device_stats(self, device: _DEVICE) -> Dict[str, Any]: # Return optional device statistics for loggers return torch.xpu.memory_stats(device) + @override def setup_device(self, device: torch.device) -> None: pass + @override def teardown(self) -> None: pass @classmethod - def register_accelerators(cls, accelerator_registry): + @override + def register_accelerators(cls, accelerator_registry: _AcceleratorRegistry) -> None: accelerator_registry.register( "xpu", cls, From b5ab23781f29a35a0c375a860d6c48271caa9bd3 Mon Sep 17 00:00:00 2001 From: Corey Adams Date: Mon, 12 Feb 2024 12:19:15 -0600 Subject: [PATCH 21/26] Fix 
missing import --- src/lightning/fabric/accelerators/xpu.py | 1 + src/lightning/pytorch/accelerators/xpu.py | 1 + 2 files changed, 2 insertions(+) diff --git a/src/lightning/fabric/accelerators/xpu.py b/src/lightning/fabric/accelerators/xpu.py index 1528c14ff8611..5dd3784b52a2b 100644 --- a/src/lightning/fabric/accelerators/xpu.py +++ b/src/lightning/fabric/accelerators/xpu.py @@ -13,6 +13,7 @@ # limitations under the License. from functools import lru_cache from typing import Any, Dict, List, Union +from typing_extensions import override import torch from lightning_utilities.core.imports import RequirementCache diff --git a/src/lightning/pytorch/accelerators/xpu.py b/src/lightning/pytorch/accelerators/xpu.py index ff072ce8dcf99..f37dd59324241 100644 --- a/src/lightning/pytorch/accelerators/xpu.py +++ b/src/lightning/pytorch/accelerators/xpu.py @@ -13,6 +13,7 @@ # limitations under the License. from functools import lru_cache from typing import Any, Dict, List +from typing_extensions import override import torch from lightning_utilities.core.imports import RequirementCache From d62c74fee54acd5431c4341cd80aa721b648c7fd Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 12 Feb 2024 18:20:10 +0000 Subject: [PATCH 22/26] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/lightning/fabric/accelerators/xpu.py | 2 +- src/lightning/pytorch/accelerators/xpu.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lightning/fabric/accelerators/xpu.py b/src/lightning/fabric/accelerators/xpu.py index 5dd3784b52a2b..24b213377bc5d 100644 --- a/src/lightning/fabric/accelerators/xpu.py +++ b/src/lightning/fabric/accelerators/xpu.py @@ -13,10 +13,10 @@ # limitations under the License. from functools import lru_cache from typing import Any, Dict, List, Union -from typing_extensions import override import torch from lightning_utilities.core.imports import RequirementCache +from typing_extensions import override from lightning.fabric.accelerators.accelerator import Accelerator from lightning.fabric.accelerators.registry import _AcceleratorRegistry diff --git a/src/lightning/pytorch/accelerators/xpu.py b/src/lightning/pytorch/accelerators/xpu.py index f37dd59324241..ea792d7be997b 100644 --- a/src/lightning/pytorch/accelerators/xpu.py +++ b/src/lightning/pytorch/accelerators/xpu.py @@ -13,10 +13,10 @@ # limitations under the License. 
from functools import lru_cache from typing import Any, Dict, List -from typing_extensions import override import torch from lightning_utilities.core.imports import RequirementCache +from typing_extensions import override from lightning.fabric.utilities.types import _DEVICE from lightning.pytorch.accelerators.accelerator import Accelerator From 89865b5997a78cabeccfbded0dd6d329fbb41938 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 3 Apr 2024 20:45:16 +0000 Subject: [PATCH 23/26] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/lightning/fabric/accelerators/xpu.py | 4 +++ .../fabric/utilities/device_dtype_mixin.py | 1 - .../fabric/utilities/device_parser.py | 31 ++++++------------- src/lightning/pytorch/accelerators/xpu.py | 3 ++ 4 files changed, 17 insertions(+), 22 deletions(-) diff --git a/src/lightning/fabric/accelerators/xpu.py b/src/lightning/fabric/accelerators/xpu.py index 24b213377bc5d..5da88d34cb088 100644 --- a/src/lightning/fabric/accelerators/xpu.py +++ b/src/lightning/fabric/accelerators/xpu.py @@ -58,6 +58,7 @@ def is_available() -> bool: # Carefully check before trying to import: if _IPEX_AVAILABLE: import intel_extension_for_pytorch as ipex + return ipex.xpu.is_available() return False @@ -83,6 +84,7 @@ def register_accelerators(cls, accelerator_registry: _AcceleratorRegistry) -> No description=cls.__name__, ) + _IPEX_AVAILABLE = RequirementCache("intel_extension_for_pytorch>=2.0", "intel_extension_for_pytorch") @@ -96,9 +98,11 @@ def num_xpu_devices() -> int: """ if _IPEX_AVAILABLE: import intel_extension_for_pytorch as ipex + return ipex.xpu.device_count() return 0 + def _get_all_visible_xpu_devices() -> List[int]: """Returns a list of all visible Intel XPU devices. diff --git a/src/lightning/fabric/utilities/device_dtype_mixin.py b/src/lightning/fabric/utilities/device_dtype_mixin.py index 71d597e68afba..79afbd1ef7aec 100644 --- a/src/lightning/fabric/utilities/device_dtype_mixin.py +++ b/src/lightning/fabric/utilities/device_dtype_mixin.py @@ -47,7 +47,6 @@ def device(self) -> torch.device: if hasattr(torch, "xpu") and device.type == "xpu" and device.index is None: return torch.device(f"xpu:{torch.xpu.current_device()}") - return device @override diff --git a/src/lightning/fabric/utilities/device_parser.py b/src/lightning/fabric/utilities/device_parser.py index 476caeb2fbeba..5962b33504835 100644 --- a/src/lightning/fabric/utilities/device_parser.py +++ b/src/lightning/fabric/utilities/device_parser.py @@ -85,10 +85,9 @@ def _parse_gpu_ids( # We know the user requested GPUs therefore if some of the # requested GPUs are not available an exception is thrown. 
gpus = _normalize_parse_gpu_string_input(gpus) - gpus = _normalize_parse_gpu_input_to_list(gpus, - include_cuda=include_cuda, - include_mps=include_mps, - include_xpu=include_xpu) + gpus = _normalize_parse_gpu_input_to_list( + gpus, include_cuda=include_cuda, include_mps=include_mps, include_xpu=include_xpu + ) if not gpus: raise MisconfigurationException("GPUs requested but none are available.") @@ -96,9 +95,8 @@ def _parse_gpu_ids( torch.distributed.is_available() and torch.distributed.is_torchelastic_launched() and len(gpus) != 1 - and len(_get_all_available_gpus(include_cuda=include_cuda, - include_mps=include_mps, - include_xpu=include_xpu)) == 1 + and len(_get_all_available_gpus(include_cuda=include_cuda, include_mps=include_mps, include_xpu=include_xpu)) + == 1 ): # Omit sanity check on torchelastic because by default it shows one visible GPU per process return gpus @@ -120,10 +118,7 @@ def _normalize_parse_gpu_string_input(s: Union[int, str, List[int]]) -> Union[in def _sanitize_gpu_ids( - gpus: List[int], - include_cuda: bool = False, - include_mps: bool = False, - include_xpu: bool = False + gpus: List[int], include_cuda: bool = False, include_mps: bool = False, include_xpu: bool = False ) -> List[int]: """Checks that each of the GPUs in the list is actually available. Raises a MisconfigurationException if any of the GPUs is not available. @@ -142,9 +137,8 @@ def _sanitize_gpu_ids( if sum((include_cuda, include_mps, include_xpu)) == 0: raise ValueError("At least one gpu type should be specified!") all_available_gpus = _get_all_available_gpus( - include_cuda=include_cuda, - include_mps=include_mps, - include_xpu=include_xpu) + include_cuda=include_cuda, include_mps=include_mps, include_xpu=include_xpu + ) for gpu in gpus: if gpu not in all_available_gpus: raise MisconfigurationException( @@ -154,10 +148,7 @@ def _sanitize_gpu_ids( def _normalize_parse_gpu_input_to_list( - gpus: Union[int, List[int], Tuple[int, ...]], - include_cuda: bool, - include_mps: bool, - include_xpu: bool + gpus: Union[int, List[int], Tuple[int, ...]], include_cuda: bool, include_mps: bool, include_xpu: bool ) -> Optional[List[int]]: assert gpus is not None if isinstance(gpus, (MutableSequence, tuple)): @@ -173,9 +164,7 @@ def _normalize_parse_gpu_input_to_list( def _get_all_available_gpus( - include_cuda: bool = False, - include_mps: bool = False, - include_xpu: bool = False + include_cuda: bool = False, include_mps: bool = False, include_xpu: bool = False ) -> List[int]: """ Returns: diff --git a/src/lightning/pytorch/accelerators/xpu.py b/src/lightning/pytorch/accelerators/xpu.py index ea792d7be997b..60fe3ead43e30 100644 --- a/src/lightning/pytorch/accelerators/xpu.py +++ b/src/lightning/pytorch/accelerators/xpu.py @@ -58,6 +58,7 @@ def is_available() -> bool: # Carefully check before trying to import: if _IPEX_AVAILABLE: import intel_extension_for_pytorch as ipex + return ipex.xpu.is_available() return False @@ -97,9 +98,11 @@ def num_xpu_devices() -> int: """ if _IPEX_AVAILABLE: import intel_extension_for_pytorch as ipex + return ipex.xpu.device_count() return 0 + def _get_all_visible_xpu_devices() -> List[int]: """Returns a list of all visible Intel XPU devices. 
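A quick sketch (illustrative only, not part of any patch, and assuming a host where at least three XPU devices are visible) of how the device-parsing helpers touched above are expected to resolve the ``devices`` argument once ``include_xpu=True`` is passed:

    from lightning.fabric.utilities.device_parser import _parse_gpu_ids

    # An integer requests the first N visible XPUs: devices=3 -> [0, 1, 2]
    print(_parse_gpu_ids(3, include_xpu=True))

    # A list picks explicit indices; _sanitize_gpu_ids raises a
    # MisconfigurationException if any requested index is not available
    print(_parse_gpu_ids([0, 2], include_xpu=True))

    # -1 (or "-1") selects every visible XPU device
    print(_parse_gpu_ids(-1, include_xpu=True))
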
From 5820c75810fdc92b9c92efea6b144210a61007ad Mon Sep 17 00:00:00 2001 From: Corey Adams Date: Wed, 3 Apr 2024 15:49:50 -0500 Subject: [PATCH 24/26] Fix typing error in register_accelerator --- src/lightning/fabric/accelerators/xpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lightning/fabric/accelerators/xpu.py b/src/lightning/fabric/accelerators/xpu.py index 24b213377bc5d..fce52827fcf02 100644 --- a/src/lightning/fabric/accelerators/xpu.py +++ b/src/lightning/fabric/accelerators/xpu.py @@ -76,7 +76,7 @@ def teardown(self) -> None: @classmethod @override - def register_accelerators(cls, accelerator_registry: _AcceleratorRegistry) -> None: + def register_accelerators(cls, accelerator_registry: Any) -> None: accelerator_registry.register( "xpu", cls, From b3b28328ef5c26b3828df97edbee5fbb332810e0 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 3 Apr 2024 20:51:13 +0000 Subject: [PATCH 25/26] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/lightning/fabric/accelerators/xpu.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/lightning/fabric/accelerators/xpu.py b/src/lightning/fabric/accelerators/xpu.py index e11658aa60c51..0d88468ccabd0 100644 --- a/src/lightning/fabric/accelerators/xpu.py +++ b/src/lightning/fabric/accelerators/xpu.py @@ -19,7 +19,6 @@ from typing_extensions import override from lightning.fabric.accelerators.accelerator import Accelerator -from lightning.fabric.accelerators.registry import _AcceleratorRegistry class XPUAccelerator(Accelerator): From 8a443c76c11d765ebefe9938dfc5160e1480339d Mon Sep 17 00:00:00 2001 From: Corey adams Date: Wed, 3 Apr 2024 15:56:40 -0500 Subject: [PATCH 26/26] Update xpu.py --- src/lightning/pytorch/accelerators/xpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lightning/pytorch/accelerators/xpu.py b/src/lightning/pytorch/accelerators/xpu.py index 60fe3ead43e30..321bb3f206473 100644 --- a/src/lightning/pytorch/accelerators/xpu.py +++ b/src/lightning/pytorch/accelerators/xpu.py @@ -77,7 +77,7 @@ def teardown(self) -> None: @classmethod @override - def register_accelerators(cls, accelerator_registry: _AcceleratorRegistry) -> None: + def register_accelerators(cls, accelerator_registry: Any) -> None: accelerator_registry.register( "xpu", cls,
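
With the full series applied, end-to-end selection of the new accelerator should look roughly like the sketch below. This is an untested usage outline rather than part of the patches: it assumes a host with Intel GPUs, intel_extension_for_pytorch installed, and, for the multi-device case, oneccl_bindings_for_pytorch available so that the default ``ccl`` process-group backend can initialize.

    import lightning.pytorch as pl
    from lightning.fabric import Fabric

    # Single XPU device; "xpu" is resolved through the accelerator registry
    trainer = pl.Trainer(accelerator="xpu", devices=1)

    # Multi-XPU DDP run; _get_default_process_group_backend_for_device
    # returns "ccl" for xpu root devices
    trainer = pl.Trainer(accelerator="xpu", devices=4, strategy="ddp")

    # Let auto_device_count() (i.e. ipex.xpu.device_count()) pick the count
    trainer = pl.Trainer(accelerator="xpu", devices="auto")

    # The same accelerator is registered on the Fabric side
    fabric = Fabric(accelerator="xpu", devices=2)
    fabric.launch()

On a machine without intel_extension_for_pytorch installed, XPUAccelerator.is_available() returns False, so automatic accelerator selection should fall back to other device types without raising.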