From 8d37a9a10e79c28f8ef2d9793cdf9d8c775175cf Mon Sep 17 00:00:00 2001 From: awaelchli Date: Sun, 10 Mar 2024 13:09:02 +0100 Subject: [PATCH 01/24] basic system check --- .../fabric/utilities/system_check.py | 116 ++++++++++++++++++ 1 file changed, 116 insertions(+) create mode 100644 src/lightning/fabric/utilities/system_check.py diff --git a/src/lightning/fabric/utilities/system_check.py b/src/lightning/fabric/utilities/system_check.py new file mode 100644 index 0000000000000..2e0ea1a0cc2c8 --- /dev/null +++ b/src/lightning/fabric/utilities/system_check.py @@ -0,0 +1,116 @@ +import os +import subprocess +from datetime import timedelta +from functools import lru_cache +from pathlib import Path +from typing import Any +import torch.distributed as dist +import torch.multiprocessing as mp + +import torch + + +def main(): + # if not dist.is_available(): + # raise RuntimeError("Requires PyTorch distributed to be available.") + + if num_cuda_devices() == 0: + print0("Warning: Skipping system check because no GPUs were detected.") + + if num_cuda_devices() == 1: + describe_nvidia_smi() + pass + + if num_cuda_devices() > 1: + describe_nvidia_smi() + describe_gpu_connectivity() + mp.spawn(_check_cuda_distributed, nprocs=num_cuda_devices()) + + +def _check_cuda_distributed(local_rank: int) -> None: + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = "29500" + os.environ["WORLD_SIZE"] = str(num_cuda_devices()) + os.environ["RANK"] = str(local_rank) + os.environ["LOCAL_RANK"] = str(local_rank) + + system_check_dir = Path("./.system_check") + system_check_dir.mkdir(exist_ok=True) + + dist.init_process_group( + backend="nccl", + world_size=num_cuda_devices(), + rank=local_rank, + timeout=timedelta(seconds=30), + ) + + device = torch.device("cuda", local_rank) + torch.cuda.set_device(local_rank) + + dist.barrier() + payload = torch.rand(100, 100, device=device) + dist.all_reduce(payload) + + +@lru_cache() +def rank() -> int: + import torch.distributed as dist + + return dist.get_rank() + + +@lru_cache() +def world_size() -> int: + import torch.distributed as dist + + return dist.get_world_size() + + +def print0(*args: Any, **kwargs: Any) -> None: + if rank() == 0: + print(*args, **kwargs) + + +@lru_cache() +def num_cuda_devices() -> int: + import torch + + return torch.cuda.device_count() + + +def is_torch_available() -> bool: + try: + import torch # noqa: F401 + except (ImportError, ModuleNotFoundError): + return False + return True + + +def collect_nvidia_smi_topo() -> str: + return subprocess.run(["nvidia-smi", "topo", "-m"], capture_output=True, text=True).stdout + + +def collect_nvidia_smi() -> str: + return subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout + + +def describe_nvidia_smi(): + print( + "Below is the output of `nvidia-smi`. It shows information about the GPUs that are installed on this machine," + " the driver version, and the maximum supported CUDA version it can run." + ) + print() + print(collect_nvidia_smi()) + + +def describe_gpu_connectivity(): + print( + "The matrix below shows how the GPUs in this machine are connected." + " NVLink (NV) is the fastest connection, and is only available on high-end systems like V100 or A100." 
+ ) + print() + print(collect_nvidia_smi_topo()) + + +if __name__ == '__main__': + main() From a65e7a34b81657279ec49299e854405357371cdb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 10 Mar 2024 08:37:59 -0400 Subject: [PATCH 02/24] update --- .../fabric/utilities/system_check.py | 39 +++++++++++++------ 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/src/lightning/fabric/utilities/system_check.py b/src/lightning/fabric/utilities/system_check.py index 2e0ea1a0cc2c8..627a016021d91 100644 --- a/src/lightning/fabric/utilities/system_check.py +++ b/src/lightning/fabric/utilities/system_check.py @@ -1,4 +1,6 @@ import os +import logging +import shutil import subprocess from datetime import timedelta from functools import lru_cache @@ -9,8 +11,12 @@ import torch +system_check_dir = Path("./system_check") + def main(): + setup_logging() + # if not dist.is_available(): # raise RuntimeError("Requires PyTorch distributed to be available.") @@ -33,9 +39,8 @@ def _check_cuda_distributed(local_rank: int) -> None: os.environ["WORLD_SIZE"] = str(num_cuda_devices()) os.environ["RANK"] = str(local_rank) os.environ["LOCAL_RANK"] = str(local_rank) - - system_check_dir = Path("./.system_check") - system_check_dir.mkdir(exist_ok=True) + os.environ["NCCL_DEBUG"] = "INFO" + os.environ["NCCL_DEBUG_FILE"] = str(system_check_dir / f"nccl-rank-{local_rank}.txt") dist.init_process_group( backend="nccl", @@ -52,6 +57,18 @@ def _check_cuda_distributed(local_rank: int) -> None: dist.all_reduce(payload) +def setup_logging() -> None: + if system_check_dir.is_dir(): + shutil.rmtree(system_check_dir) + system_check_dir.mkdir() + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + file_handler = logging.FileHandler(str(system_check_dir / "logs.txt")) + file_handler.setLevel(logging.INFO) + logger.addHandler(file_handler) + + @lru_cache() def rank() -> int: import torch.distributed as dist @@ -95,21 +112,21 @@ def collect_nvidia_smi() -> str: def describe_nvidia_smi(): - print( + logger = logging.getLogger() + logger.info( "Below is the output of `nvidia-smi`. It shows information about the GPUs that are installed on this machine," - " the driver version, and the maximum supported CUDA version it can run." + " the driver version, and the maximum supported CUDA version it can run.\n" ) - print() - print(collect_nvidia_smi()) + logger.info(collect_nvidia_smi()) def describe_gpu_connectivity(): - print( + logger = logging.getLogger() + logger.info( "The matrix below shows how the GPUs in this machine are connected." - " NVLink (NV) is the fastest connection, and is only available on high-end systems like V100 or A100." 
+ " NVLink (NV) is the fastest connection, and is only available on high-end systems like V100 or A100.\n" ) - print() - print(collect_nvidia_smi_topo()) + logger.info(collect_nvidia_smi_topo()) if __name__ == '__main__': From 80b05465fc7a973ce90361a764513ce86cb00a97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 10 Mar 2024 09:02:30 -0400 Subject: [PATCH 03/24] update --- .../fabric/utilities/system_check.py | 26 +++++-------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/src/lightning/fabric/utilities/system_check.py b/src/lightning/fabric/utilities/system_check.py index 627a016021d91..fe3d3a81d6f69 100644 --- a/src/lightning/fabric/utilities/system_check.py +++ b/src/lightning/fabric/utilities/system_check.py @@ -8,7 +8,6 @@ from typing import Any import torch.distributed as dist import torch.multiprocessing as mp - import torch system_check_dir = Path("./system_check") @@ -42,16 +41,18 @@ def _check_cuda_distributed(local_rank: int) -> None: os.environ["NCCL_DEBUG"] = "INFO" os.environ["NCCL_DEBUG_FILE"] = str(system_check_dir / f"nccl-rank-{local_rank}.txt") + device = torch.device("cuda", local_rank) + torch.cuda.set_device(local_rank) + dist.init_process_group( backend="nccl", world_size=num_cuda_devices(), rank=local_rank, - timeout=timedelta(seconds=30), + # NCCL gets initialized in the first collective call (e.g., barrier below), + # which must be successful for this timeout to work. + timeout=timedelta(seconds=10), ) - device = torch.device("cuda", local_rank) - torch.cuda.set_device(local_rank) - dist.barrier() payload = torch.rand(100, 100, device=device) dist.all_reduce(payload) @@ -76,13 +77,6 @@ def rank() -> int: return dist.get_rank() -@lru_cache() -def world_size() -> int: - import torch.distributed as dist - - return dist.get_world_size() - - def print0(*args: Any, **kwargs: Any) -> None: if rank() == 0: print(*args, **kwargs) @@ -95,14 +89,6 @@ def num_cuda_devices() -> int: return torch.cuda.device_count() -def is_torch_available() -> bool: - try: - import torch # noqa: F401 - except (ImportError, ModuleNotFoundError): - return False - return True - - def collect_nvidia_smi_topo() -> str: return subprocess.run(["nvidia-smi", "topo", "-m"], capture_output=True, text=True).stdout From d0028b6fae5a94c1d54329ade1f2df0ab0cabeec Mon Sep 17 00:00:00 2001 From: awaelchli Date: Sun, 10 Mar 2024 14:14:02 +0100 Subject: [PATCH 04/24] prints --- .../fabric/utilities/system_check.py | 82 ++++++++++--------- 1 file changed, 42 insertions(+), 40 deletions(-) diff --git a/src/lightning/fabric/utilities/system_check.py b/src/lightning/fabric/utilities/system_check.py index fe3d3a81d6f69..c79b494e5e2bc 100644 --- a/src/lightning/fabric/utilities/system_check.py +++ b/src/lightning/fabric/utilities/system_check.py @@ -10,109 +10,111 @@ import torch.multiprocessing as mp import torch -system_check_dir = Path("./system_check") +SYSTEM_CHECK_DIR = Path("./system_check") -def main(): - setup_logging() +def main() -> None: + _setup_logging() + num_cuda_devices = torch.cuda.device_count() - # if not dist.is_available(): - # raise RuntimeError("Requires PyTorch distributed to be available.") + if num_cuda_devices == 0: + print("Warning: Skipping system check because no GPUs were detected.") - if num_cuda_devices() == 0: - print0("Warning: Skipping system check because no GPUs were detected.") - - if num_cuda_devices() == 1: - describe_nvidia_smi() + if num_cuda_devices == 1: + # TODO + _describe_nvidia_smi() pass - if num_cuda_devices() > 1: - 
describe_nvidia_smi() - describe_gpu_connectivity() - mp.spawn(_check_cuda_distributed, nprocs=num_cuda_devices()) + if num_cuda_devices > 1: + _describe_nvidia_smi() + _describe_gpu_connectivity() + mp.spawn(_check_cuda_distributed, nprocs=num_cuda_devices, args=(num_cuda_devices,)) -def _check_cuda_distributed(local_rank: int) -> None: +def _check_cuda_distributed(local_rank: int, world_size: int) -> None: os.environ["MASTER_ADDR"] = "localhost" os.environ["MASTER_PORT"] = "29500" - os.environ["WORLD_SIZE"] = str(num_cuda_devices()) + os.environ["WORLD_SIZE"] = str(world_size) os.environ["RANK"] = str(local_rank) os.environ["LOCAL_RANK"] = str(local_rank) os.environ["NCCL_DEBUG"] = "INFO" - os.environ["NCCL_DEBUG_FILE"] = str(system_check_dir / f"nccl-rank-{local_rank}.txt") + os.environ["NCCL_DEBUG_FILE"] = str(SYSTEM_CHECK_DIR / f"nccl-rank-{local_rank}.txt") device = torch.device("cuda", local_rank) torch.cuda.set_device(local_rank) + _print0("Setting up the process group ... ", end="") dist.init_process_group( backend="nccl", - world_size=num_cuda_devices(), + world_size=world_size, rank=local_rank, # NCCL gets initialized in the first collective call (e.g., barrier below), # which must be successful for this timeout to work. timeout=timedelta(seconds=10), ) - + _print0("Done.") + + _print0( + "Synchronizing GPUs. If this step doesn't finish within 30 seconds, there is a problem with your" + " multi-GPU setup." + ) dist.barrier() + _print0("Done.") + payload = torch.rand(100, 100, device=device) + _print0("Running all-reduce test ... ", end="") dist.all_reduce(payload) + _print0("Done.") -def setup_logging() -> None: - if system_check_dir.is_dir(): - shutil.rmtree(system_check_dir) - system_check_dir.mkdir() +def _setup_logging() -> None: + if SYSTEM_CHECK_DIR.is_dir(): + shutil.rmtree(SYSTEM_CHECK_DIR) + SYSTEM_CHECK_DIR.mkdir() logger = logging.getLogger() logger.setLevel(logging.INFO) - file_handler = logging.FileHandler(str(system_check_dir / "logs.txt")) + file_handler = logging.FileHandler(str(SYSTEM_CHECK_DIR / "logs.txt")) file_handler.setLevel(logging.INFO) logger.addHandler(file_handler) @lru_cache() -def rank() -> int: +def _rank() -> int: import torch.distributed as dist return dist.get_rank() -def print0(*args: Any, **kwargs: Any) -> None: - if rank() == 0: +def _print0(*args: Any, **kwargs: Any) -> None: + if _rank() == 0: print(*args, **kwargs) -@lru_cache() -def num_cuda_devices() -> int: - import torch - - return torch.cuda.device_count() - - -def collect_nvidia_smi_topo() -> str: +def _collect_nvidia_smi_topo() -> str: return subprocess.run(["nvidia-smi", "topo", "-m"], capture_output=True, text=True).stdout -def collect_nvidia_smi() -> str: +def _collect_nvidia_smi() -> str: return subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout -def describe_nvidia_smi(): +def _describe_nvidia_smi() -> None: logger = logging.getLogger() logger.info( "Below is the output of `nvidia-smi`. It shows information about the GPUs that are installed on this machine," " the driver version, and the maximum supported CUDA version it can run.\n" ) - logger.info(collect_nvidia_smi()) + logger.info(_collect_nvidia_smi()) -def describe_gpu_connectivity(): +def _describe_gpu_connectivity() -> None: logger = logging.getLogger() logger.info( "The matrix below shows how the GPUs in this machine are connected." 
" NVLink (NV) is the fastest connection, and is only available on high-end systems like V100 or A100.\n" ) - logger.info(collect_nvidia_smi_topo()) + logger.info(_collect_nvidia_smi_topo()) if __name__ == '__main__': From afc4bdc4a12bb99346e4b52da52968e5adfca477 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 10 Mar 2024 09:20:45 -0400 Subject: [PATCH 05/24] update --- src/lightning/fabric/utilities/system_check.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/src/lightning/fabric/utilities/system_check.py b/src/lightning/fabric/utilities/system_check.py index c79b494e5e2bc..15cd9361ea53d 100644 --- a/src/lightning/fabric/utilities/system_check.py +++ b/src/lightning/fabric/utilities/system_check.py @@ -52,14 +52,13 @@ def _check_cuda_distributed(local_rank: int, world_size: int) -> None: # which must be successful for this timeout to work. timeout=timedelta(seconds=10), ) - _print0("Done.") + _print0("done.") _print0( - "Synchronizing GPUs. If this step doesn't finish within 30 seconds, there is a problem with your" + "Synchronizing GPUs. If the program hangs for more than 30 seconds, there is a problem with your" " multi-GPU setup." ) dist.barrier() - _print0("Done.") payload = torch.rand(100, 100, device=device) _print0("Running all-reduce test ... ", end="") @@ -79,15 +78,8 @@ def _setup_logging() -> None: logger.addHandler(file_handler) -@lru_cache() -def _rank() -> int: - import torch.distributed as dist - - return dist.get_rank() - - def _print0(*args: Any, **kwargs: Any) -> None: - if _rank() == 0: + if int(os.getenv("RANK", 0)) == 0: print(*args, **kwargs) From 760ed2cfcc6ec4d751bbc2e1ca069a5d34a01b3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 10 Mar 2024 09:51:37 -0400 Subject: [PATCH 06/24] update --- .../fabric/utilities/system_check.py | 43 +++++++++++++++++-- 1 file changed, 40 insertions(+), 3 deletions(-) diff --git a/src/lightning/fabric/utilities/system_check.py b/src/lightning/fabric/utilities/system_check.py index 15cd9361ea53d..6335caabac2f6 100644 --- a/src/lightning/fabric/utilities/system_check.py +++ b/src/lightning/fabric/utilities/system_check.py @@ -3,17 +3,17 @@ import shutil import subprocess from datetime import timedelta -from functools import lru_cache from pathlib import Path from typing import Any import torch.distributed as dist import torch.multiprocessing as mp import torch +import time SYSTEM_CHECK_DIR = Path("./system_check") -def main() -> None: +def main(timeout: int = 60) -> None: _setup_logging() num_cuda_devices = torch.cuda.device_count() @@ -28,7 +28,27 @@ def main() -> None: if num_cuda_devices > 1: _describe_nvidia_smi() _describe_gpu_connectivity() - mp.spawn(_check_cuda_distributed, nprocs=num_cuda_devices, args=(num_cuda_devices,)) + + context = mp.spawn( + _check_cuda_distributed, + nprocs=num_cuda_devices, + args=(num_cuda_devices,), + join=False, + ) + + start = time.time() + joined = False + while not joined and (time.time() - start < timeout): + joined = context.join(timeout=5) + time.sleep(1) + + if not joined: + for pid in context.pids(): + _kill_process(pid) + print("not successful") # TODO + + # TODO: relative dir + print(f"Find detailed logs at {SYSTEM_CHECK_DIR}") def _check_cuda_distributed(local_rank: int, world_size: int) -> None: @@ -53,6 +73,10 @@ def _check_cuda_distributed(local_rank: int, world_size: int) -> None: timeout=timedelta(seconds=10), ) _print0("done.") + + # TODO: remove + # if local_rank > 0: + # return _print0( 
"Synchronizing GPUs. If the program hangs for more than 30 seconds, there is a problem with your" @@ -109,5 +133,18 @@ def _describe_gpu_connectivity() -> None: logger.info(_collect_nvidia_smi_topo()) +def _kill_process(pid: int) -> None: + import psutil # TODO + + try: + process = psutil.Process(pid) + if process.is_running(): + process.kill() + except psutil.NoSuchProcess: + pass + except psutil.AccessDenied: + pass + + if __name__ == '__main__': main() From 62b608fb108bbf48c6ea1d4d974230351dffe74d Mon Sep 17 00:00:00 2001 From: awaelchli Date: Sun, 10 Mar 2024 15:36:42 +0100 Subject: [PATCH 07/24] update --- src/lightning/fabric/utilities/system_check.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/lightning/fabric/utilities/system_check.py b/src/lightning/fabric/utilities/system_check.py index 6335caabac2f6..4fa2774234b4c 100644 --- a/src/lightning/fabric/utilities/system_check.py +++ b/src/lightning/fabric/utilities/system_check.py @@ -48,7 +48,8 @@ def main(timeout: int = 60) -> None: print("not successful") # TODO # TODO: relative dir - print(f"Find detailed logs at {SYSTEM_CHECK_DIR}") + relative_dir = SYSTEM_CHECK_DIR.relative_to(Path.cwd()) + print(f"Find detailed logs at {relative_dir}") def _check_cuda_distributed(local_rank: int, world_size: int) -> None: From 1d6d8909606eade0105c48aeba2b3c540e254d32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 10 Mar 2024 11:30:00 -0400 Subject: [PATCH 08/24] update --- .../fabric/utilities/system_check.py | 66 ++++++++++++------- 1 file changed, 41 insertions(+), 25 deletions(-) diff --git a/src/lightning/fabric/utilities/system_check.py b/src/lightning/fabric/utilities/system_check.py index 4fa2774234b4c..95b8e491cb98f 100644 --- a/src/lightning/fabric/utilities/system_check.py +++ b/src/lightning/fabric/utilities/system_check.py @@ -29,30 +29,48 @@ def main(timeout: int = 60) -> None: _describe_nvidia_smi() _describe_gpu_connectivity() - context = mp.spawn( - _check_cuda_distributed, + success = _check_cuda_distributed(timeout) + + if not success: + print( + f"The multi-GPU NCCL test did not finish within {timeout} seconds." + " It looks like there is an issue with your multi-GPU setup." + " Now trying to run again with `NCCL_P2P_DISABLE=1` set." + ) + os.environ["NCCL_P2P_DISABLE"] = "1" + success = _check_cuda_distributed(timeout) + if not success: + print( + f"Disabling peer-to-peer transport did not fix the issue." 
+ ) + else: + print("Multi-GPU test successful.") + + print(f"Find detailed logs at {SYSTEM_CHECK_DIR.absolute()}") + + +def _check_cuda_distributed(timeout: int) -> bool: + num_cuda_devices = torch.cuda.device_count() + context = mp.spawn( + _run_all_reduce_test, nprocs=num_cuda_devices, args=(num_cuda_devices,), join=False, ) - start = time.time() - joined = False - while not joined and (time.time() - start < timeout): - joined = context.join(timeout=5) - time.sleep(1) - - if not joined: - for pid in context.pids(): - _kill_process(pid) - print("not successful") # TODO + start = time.time() + success = False + while not success and (time.time() - start < timeout): + success = context.join(timeout=5) + time.sleep(1) - # TODO: relative dir - relative_dir = SYSTEM_CHECK_DIR.relative_to(Path.cwd()) - print(f"Find detailed logs at {relative_dir}") + if not success: + for pid in context.pids(): + _kill_process(pid) + return success -def _check_cuda_distributed(local_rank: int, world_size: int) -> None: +def _run_all_reduce_test(local_rank: int, world_size: int) -> None: os.environ["MASTER_ADDR"] = "localhost" os.environ["MASTER_PORT"] = "29500" os.environ["WORLD_SIZE"] = str(world_size) @@ -76,19 +94,17 @@ def _check_cuda_distributed(local_rank: int, world_size: int) -> None: _print0("done.") # TODO: remove - # if local_rank > 0: - # return + if local_rank > 0: + return - _print0( - "Synchronizing GPUs. If the program hangs for more than 30 seconds, there is a problem with your" - " multi-GPU setup." - ) + _print0("Synchronizing GPUs ... ", end="") dist.barrier() + _print0("done.") payload = torch.rand(100, 100, device=device) _print0("Running all-reduce test ... ", end="") dist.all_reduce(payload) - _print0("Done.") + _print0("done.") def _setup_logging() -> None: @@ -103,9 +119,9 @@ def _setup_logging() -> None: logger.addHandler(file_handler) -def _print0(*args: Any, **kwargs: Any) -> None: +def _print0(string: str, **kwargs: Any) -> None: if int(os.getenv("RANK", 0)) == 0: - print(*args, **kwargs) + print(string, **kwargs) def _collect_nvidia_smi_topo() -> str: From e613adc12648c6f59f31998dbd637a404699809e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 10 Mar 2024 11:30:41 -0400 Subject: [PATCH 09/24] update --- src/lightning/fabric/utilities/system_check.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/lightning/fabric/utilities/system_check.py b/src/lightning/fabric/utilities/system_check.py index 95b8e491cb98f..a453e554078c8 100644 --- a/src/lightning/fabric/utilities/system_check.py +++ b/src/lightning/fabric/utilities/system_check.py @@ -93,9 +93,9 @@ def _run_all_reduce_test(local_rank: int, world_size: int) -> None: ) _print0("done.") - # TODO: remove - if local_rank > 0: - return + # # TODO: remove + # if local_rank > 0: + # return _print0("Synchronizing GPUs ... 
", end="") dist.barrier() From fa7d3c1c321ea2e1121670374bbf7b1c796c4f52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 10 Mar 2024 17:06:42 -0400 Subject: [PATCH 10/24] update --- .../fabric/utilities/system_check.py | 69 ++++++++++++------- 1 file changed, 44 insertions(+), 25 deletions(-) diff --git a/src/lightning/fabric/utilities/system_check.py b/src/lightning/fabric/utilities/system_check.py index a453e554078c8..c739fb8a226d7 100644 --- a/src/lightning/fabric/utilities/system_check.py +++ b/src/lightning/fabric/utilities/system_check.py @@ -9,8 +9,12 @@ import torch.multiprocessing as mp import torch import time +from lightning_utilities.core.imports import RequirementCache -SYSTEM_CHECK_DIR = Path("./system_check") + +_psutil_available = RequirementCache("psutil") +_logger = logging.getLogger(__name__) +_system_check_dir = Path("./system_check") def main(timeout: int = 60) -> None: @@ -40,23 +44,24 @@ def main(timeout: int = 60) -> None: os.environ["NCCL_P2P_DISABLE"] = "1" success = _check_cuda_distributed(timeout) if not success: - print( - f"Disabling peer-to-peer transport did not fix the issue." - ) + print(f"Disabling peer-to-peer transport did not fix the issue.") else: print("Multi-GPU test successful.") - print(f"Find detailed logs at {SYSTEM_CHECK_DIR.absolute()}") + print(f"Find detailed logs at {_system_check_dir.absolute()}") def _check_cuda_distributed(timeout: int) -> bool: + if not _psutil_available: + raise ModuleNotFoundError(str(_psutil_available)) + num_cuda_devices = torch.cuda.device_count() context = mp.spawn( - _run_all_reduce_test, - nprocs=num_cuda_devices, - args=(num_cuda_devices,), - join=False, - ) + _run_all_reduce_test, + nprocs=num_cuda_devices, + args=(num_cuda_devices,), + join=False, + ) start = time.time() success = False @@ -77,7 +82,7 @@ def _run_all_reduce_test(local_rank: int, world_size: int) -> None: os.environ["RANK"] = str(local_rank) os.environ["LOCAL_RANK"] = str(local_rank) os.environ["NCCL_DEBUG"] = "INFO" - os.environ["NCCL_DEBUG_FILE"] = str(SYSTEM_CHECK_DIR / f"nccl-rank-{local_rank}.txt") + os.environ["NCCL_DEBUG_FILE"] = str(_system_check_dir / f"nccl-rank-{local_rank}.txt") device = torch.device("cuda", local_rank) torch.cuda.set_device(local_rank) @@ -108,20 +113,22 @@ def _run_all_reduce_test(local_rank: int, world_size: int) -> None: def _setup_logging() -> None: - if SYSTEM_CHECK_DIR.is_dir(): - shutil.rmtree(SYSTEM_CHECK_DIR) - SYSTEM_CHECK_DIR.mkdir() + if _system_check_dir.is_dir(): + shutil.rmtree(_system_check_dir) + _system_check_dir.mkdir() - logger = logging.getLogger() - logger.setLevel(logging.INFO) - file_handler = logging.FileHandler(str(SYSTEM_CHECK_DIR / "logs.txt")) + _logger.setLevel(logging.INFO) + file_handler = logging.FileHandler(str(_system_check_dir / "logs.txt")) file_handler.setLevel(logging.INFO) - logger.addHandler(file_handler) + console_handler = logging.StreamHandler() + console_handler.setLevel(logging.INFO) + _logger.addHandler(file_handler) + _logger.addHandler(console_handler) def _print0(string: str, **kwargs: Any) -> None: if int(os.getenv("RANK", 0)) == 0: - print(string, **kwargs) + _logger.info(string, **kwargs) def _collect_nvidia_smi_topo() -> str: @@ -132,22 +139,34 @@ def _collect_nvidia_smi() -> str: return subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout +# def _collect_nvidia_driver_version() -> str: +# result = subprocess.run( +# [ +# "nvidia-smi", +# "--query-gpu=driver_version", +# "--id=0", +# "--format=csv,noheader", +# ], +# 
capture_output=True, +# text=True, +# ) +# return result.stdout + + def _describe_nvidia_smi() -> None: - logger = logging.getLogger() - logger.info( + _logger.info( "Below is the output of `nvidia-smi`. It shows information about the GPUs that are installed on this machine," " the driver version, and the maximum supported CUDA version it can run.\n" ) - logger.info(_collect_nvidia_smi()) + _logger.info(_collect_nvidia_smi()) def _describe_gpu_connectivity() -> None: - logger = logging.getLogger() - logger.info( + _logger.info( "The matrix below shows how the GPUs in this machine are connected." " NVLink (NV) is the fastest connection, and is only available on high-end systems like V100 or A100.\n" ) - logger.info(_collect_nvidia_smi_topo()) + _logger.info(_collect_nvidia_smi_topo()) def _kill_process(pid: int) -> None: From 5ad949148fd36d2894472db9765e96245a4402e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 10 Mar 2024 17:52:47 -0400 Subject: [PATCH 11/24] update --- .../fabric/utilities/system_check.py | 69 +++++++++---------- 1 file changed, 33 insertions(+), 36 deletions(-) diff --git a/src/lightning/fabric/utilities/system_check.py b/src/lightning/fabric/utilities/system_check.py index c739fb8a226d7..72eb5857a8088 100644 --- a/src/lightning/fabric/utilities/system_check.py +++ b/src/lightning/fabric/utilities/system_check.py @@ -9,6 +9,7 @@ import torch.multiprocessing as mp import torch import time +import pkg_resources from lightning_utilities.core.imports import RequirementCache @@ -19,15 +20,16 @@ def main(timeout: int = 60) -> None: _setup_logging() + _collect_packages() + num_cuda_devices = torch.cuda.device_count() + if num_cuda_devices == 0: - print("Warning: Skipping system check because no GPUs were detected.") + _print0("Warning: Skipping system check because no GPUs were detected.") if num_cuda_devices == 1: - # TODO _describe_nvidia_smi() - pass if num_cuda_devices > 1: _describe_nvidia_smi() @@ -36,7 +38,7 @@ def main(timeout: int = 60) -> None: success = _check_cuda_distributed(timeout) if not success: - print( + _print0( f"The multi-GPU NCCL test did not finish within {timeout} seconds." " It looks like there is an issue with your multi-GPU setup." " Now trying to run again with `NCCL_P2P_DISABLE=1` set." @@ -44,11 +46,11 @@ def main(timeout: int = 60) -> None: os.environ["NCCL_P2P_DISABLE"] = "1" success = _check_cuda_distributed(timeout) if not success: - print(f"Disabling peer-to-peer transport did not fix the issue.") + _print0(f"Disabling peer-to-peer transport did not fix the issue.") else: - print("Multi-GPU test successful.") + _print0("Multi-GPU test successful.") - print(f"Find detailed logs at {_system_check_dir.absolute()}") + _print0(f"Find detailed logs at {_system_check_dir.absolute()}") def _check_cuda_distributed(timeout: int) -> bool: @@ -87,7 +89,7 @@ def _run_all_reduce_test(local_rank: int, world_size: int) -> None: device = torch.device("cuda", local_rank) torch.cuda.set_device(local_rank) - _print0("Setting up the process group ... ", end="") + _print0("Setting up the process group ...") dist.init_process_group( backend="nccl", world_size=world_size, @@ -96,20 +98,17 @@ def _run_all_reduce_test(local_rank: int, world_size: int) -> None: # which must be successful for this timeout to work. timeout=timedelta(seconds=10), ) - _print0("done.") - # # TODO: remove + # TODO: remove # if local_rank > 0: # return - - _print0("Synchronizing GPUs ... ", end="") + + _print0("Synchronizing GPUs ... 
") dist.barrier() - _print0("done.") payload = torch.rand(100, 100, device=device) - _print0("Running all-reduce test ... ", end="") + _print0("Running all-reduce test ...") dist.all_reduce(payload) - _print0("done.") def _setup_logging() -> None: @@ -126,9 +125,9 @@ def _setup_logging() -> None: _logger.addHandler(console_handler) -def _print0(string: str, **kwargs: Any) -> None: +def _print0(string: str) -> None: if int(os.getenv("RANK", 0)) == 0: - _logger.info(string, **kwargs) + _logger.info(string) def _collect_nvidia_smi_topo() -> str: @@ -139,20 +138,6 @@ def _collect_nvidia_smi() -> str: return subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout -# def _collect_nvidia_driver_version() -> str: -# result = subprocess.run( -# [ -# "nvidia-smi", -# "--query-gpu=driver_version", -# "--id=0", -# "--format=csv,noheader", -# ], -# capture_output=True, -# text=True, -# ) -# return result.stdout - - def _describe_nvidia_smi() -> None: _logger.info( "Below is the output of `nvidia-smi`. It shows information about the GPUs that are installed on this machine," @@ -164,23 +149,35 @@ def _describe_nvidia_smi() -> None: def _describe_gpu_connectivity() -> None: _logger.info( "The matrix below shows how the GPUs in this machine are connected." - " NVLink (NV) is the fastest connection, and is only available on high-end systems like V100 or A100.\n" + " NVLink (NV) is the fastest connection, and is only available on high-end systems like V100, A100, etc.\n" ) _logger.info(_collect_nvidia_smi_topo()) def _kill_process(pid: int) -> None: - import psutil # TODO + import psutil try: process = psutil.Process(pid) if process.is_running(): process.kill() - except psutil.NoSuchProcess: - pass - except psutil.AccessDenied: + except (psutil.NoSuchProcess, psutil.AccessDenied): pass +def _collect_packages() -> None: + packages = {} + for dist in pkg_resources.working_set: + package = dist.as_requirement() + packages[package.key] = package.specs[0][1] + + longest = max(len(p) for p in packages) + with open(_system_check_dir / "packages.txt", "w") as file: + for name in sorted(packages.keys()): + version = packages[name] + pad = " " * (longest - len(name)) + file.write(f"{name}{pad} {version}\n") + + if __name__ == '__main__': main() From c22fb2bd6d73857679554dd8bfd028bbe18e54c3 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Sun, 10 Mar 2024 23:03:06 +0100 Subject: [PATCH 12/24] update --- src/lightning/fabric/cli.py | 6 ++++++ .../fabric/utilities/system_check.py | 21 ++----------------- 2 files changed, 8 insertions(+), 19 deletions(-) diff --git a/src/lightning/fabric/cli.py b/src/lightning/fabric/cli.py index d8c6fe47b6630..5c6f0e4f09311 100644 --- a/src/lightning/fabric/cli.py +++ b/src/lightning/fabric/cli.py @@ -30,6 +30,7 @@ from lightning.fabric.utilities.device_parser import _parse_gpu_ids from lightning.fabric.utilities.distributed import _suggested_max_num_threads from lightning.fabric.utilities.load import _load_distributed_checkpoint +from lightning.fabric.utilities import system_check _log = logging.getLogger(__name__) @@ -188,6 +189,11 @@ def _consolidate(checkpoint_folder: str, output_file: Optional[str]) -> None: checkpoint = _load_distributed_checkpoint(config.checkpoint_folder) torch.save(checkpoint, config.output_file) + @_main.command("system-check") + def _system_check(): + """Run a system check to test your multi-GPU setup.""" + system_check.main() + def _set_env_variables(args: Namespace) -> None: """Set the environment variables for the new processes. 
diff --git a/src/lightning/fabric/utilities/system_check.py b/src/lightning/fabric/utilities/system_check.py index 72eb5857a8088..b282fbc7f7196 100644 --- a/src/lightning/fabric/utilities/system_check.py +++ b/src/lightning/fabric/utilities/system_check.py @@ -4,12 +4,10 @@ import subprocess from datetime import timedelta from pathlib import Path -from typing import Any import torch.distributed as dist import torch.multiprocessing as mp import torch import time -import pkg_resources from lightning_utilities.core.imports import RequirementCache @@ -20,11 +18,9 @@ def main(timeout: int = 60) -> None: _setup_logging() - _collect_packages() - + num_cuda_devices = torch.cuda.device_count() - if num_cuda_devices == 0: _print0("Warning: Skipping system check because no GPUs were detected.") @@ -123,6 +119,7 @@ def _setup_logging() -> None: console_handler.setLevel(logging.INFO) _logger.addHandler(file_handler) _logger.addHandler(console_handler) + _logger.propagate = False def _print0(string: str) -> None: @@ -165,19 +162,5 @@ def _kill_process(pid: int) -> None: pass -def _collect_packages() -> None: - packages = {} - for dist in pkg_resources.working_set: - package = dist.as_requirement() - packages[package.key] = package.specs[0][1] - - longest = max(len(p) for p in packages) - with open(_system_check_dir / "packages.txt", "w") as file: - for name in sorted(packages.keys()): - version = packages[name] - pad = " " * (longest - len(name)) - file.write(f"{name}{pad} {version}\n") - - if __name__ == '__main__': main() From 7a7f7e6928d44a3e3240b0a71b6b0193143495bc Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 10 Mar 2024 22:20:11 +0000 Subject: [PATCH 13/24] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/lightning/fabric/cli.py | 2 +- .../fabric/utilities/system_check.py | 20 +++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/lightning/fabric/cli.py b/src/lightning/fabric/cli.py index 5c6f0e4f09311..67a6ff2b6d7ef 100644 --- a/src/lightning/fabric/cli.py +++ b/src/lightning/fabric/cli.py @@ -26,11 +26,11 @@ from lightning.fabric.accelerators import CPUAccelerator, CUDAAccelerator, MPSAccelerator from lightning.fabric.plugins.precision.precision import _PRECISION_INPUT_STR, _PRECISION_INPUT_STR_ALIAS from lightning.fabric.strategies import STRATEGY_REGISTRY +from lightning.fabric.utilities import system_check from lightning.fabric.utilities.consolidate_checkpoint import _process_cli_args from lightning.fabric.utilities.device_parser import _parse_gpu_ids from lightning.fabric.utilities.distributed import _suggested_max_num_threads from lightning.fabric.utilities.load import _load_distributed_checkpoint -from lightning.fabric.utilities import system_check _log = logging.getLogger(__name__) diff --git a/src/lightning/fabric/utilities/system_check.py b/src/lightning/fabric/utilities/system_check.py index b282fbc7f7196..22e0e7c35958c 100644 --- a/src/lightning/fabric/utilities/system_check.py +++ b/src/lightning/fabric/utilities/system_check.py @@ -1,16 +1,16 @@ -import os import logging +import os import shutil import subprocess +import time from datetime import timedelta from pathlib import Path + +import torch import torch.distributed as dist import torch.multiprocessing as mp -import torch -import time from lightning_utilities.core.imports import RequirementCache - _psutil_available = RequirementCache("psutil") _logger = 
logging.getLogger(__name__) _system_check_dir = Path("./system_check") @@ -30,19 +30,19 @@ def main(timeout: int = 60) -> None: if num_cuda_devices > 1: _describe_nvidia_smi() _describe_gpu_connectivity() - + success = _check_cuda_distributed(timeout) - + if not success: _print0( f"The multi-GPU NCCL test did not finish within {timeout} seconds." " It looks like there is an issue with your multi-GPU setup." " Now trying to run again with `NCCL_P2P_DISABLE=1` set." - ) + ) os.environ["NCCL_P2P_DISABLE"] = "1" success = _check_cuda_distributed(timeout) if not success: - _print0(f"Disabling peer-to-peer transport did not fix the issue.") + _print0("Disabling peer-to-peer transport did not fix the issue.") else: _print0("Multi-GPU test successful.") @@ -90,7 +90,7 @@ def _run_all_reduce_test(local_rank: int, world_size: int) -> None: backend="nccl", world_size=world_size, rank=local_rank, - # NCCL gets initialized in the first collective call (e.g., barrier below), + # NCCL gets initialized in the first collective call (e.g., barrier below), # which must be successful for this timeout to work. timeout=timedelta(seconds=10), ) @@ -162,5 +162,5 @@ def _kill_process(pid: int) -> None: pass -if __name__ == '__main__': +if __name__ == "__main__": main() From 5c5b973e3df6c604393481ecc9a2f3cf8b7d2808 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Sun, 10 Mar 2024 23:24:23 +0100 Subject: [PATCH 14/24] update --- src/lightning/fabric/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lightning/fabric/cli.py b/src/lightning/fabric/cli.py index 67a6ff2b6d7ef..3b5070ee4daf0 100644 --- a/src/lightning/fabric/cli.py +++ b/src/lightning/fabric/cli.py @@ -190,7 +190,7 @@ def _consolidate(checkpoint_folder: str, output_file: Optional[str]) -> None: torch.save(checkpoint, config.output_file) @_main.command("system-check") - def _system_check(): + def _system_check() -> None: """Run a system check to test your multi-GPU setup.""" system_check.main() From 652677f8b27f79df10faeae71f2d344cce1d2ce2 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Sun, 10 Mar 2024 23:28:33 +0100 Subject: [PATCH 15/24] add license --- .../fabric/utilities/consolidate_checkpoint.py | 13 +++++++++++++ src/lightning/fabric/utilities/distributed.py | 13 +++++++++++++ src/lightning/fabric/utilities/seed.py | 13 +++++++++++++ src/lightning/fabric/utilities/spike.py | 13 +++++++++++++ src/lightning/fabric/utilities/system_check.py | 13 +++++++++++++ 5 files changed, 65 insertions(+) diff --git a/src/lightning/fabric/utilities/consolidate_checkpoint.py b/src/lightning/fabric/utilities/consolidate_checkpoint.py index 15d20d8d89ecc..7a94089c7a9cd 100644 --- a/src/lightning/fabric/utilities/consolidate_checkpoint.py +++ b/src/lightning/fabric/utilities/consolidate_checkpoint.py @@ -1,3 +1,16 @@ +# Copyright The Lightning AI team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import logging from argparse import ArgumentParser, Namespace from pathlib import Path diff --git a/src/lightning/fabric/utilities/distributed.py b/src/lightning/fabric/utilities/distributed.py index 30bfe4e254a07..599b185ffd105 100644 --- a/src/lightning/fabric/utilities/distributed.py +++ b/src/lightning/fabric/utilities/distributed.py @@ -1,3 +1,16 @@ +# Copyright The Lightning AI team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import contextlib import logging import os diff --git a/src/lightning/fabric/utilities/seed.py b/src/lightning/fabric/utilities/seed.py index b274bce88fcdf..c7003f1b05593 100644 --- a/src/lightning/fabric/utilities/seed.py +++ b/src/lightning/fabric/utilities/seed.py @@ -1,3 +1,16 @@ +# Copyright The Lightning AI team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import logging import os import random diff --git a/src/lightning/fabric/utilities/spike.py b/src/lightning/fabric/utilities/spike.py index 5dca5990064e8..6ccad8f9bf776 100644 --- a/src/lightning/fabric/utilities/spike.py +++ b/src/lightning/fabric/utilities/spike.py @@ -1,3 +1,16 @@ +# Copyright The Lightning AI team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import json import operator import os diff --git a/src/lightning/fabric/utilities/system_check.py b/src/lightning/fabric/utilities/system_check.py index 22e0e7c35958c..4234cbb2d8516 100644 --- a/src/lightning/fabric/utilities/system_check.py +++ b/src/lightning/fabric/utilities/system_check.py @@ -1,3 +1,16 @@ +# Copyright The Lightning AI team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. import logging import os import shutil From 589a97b5241c1c6ca751d036b83a3208ead3c9af Mon Sep 17 00:00:00 2001 From: awaelchli Date: Sun, 10 Mar 2024 23:38:14 +0100 Subject: [PATCH 16/24] update --- .azure/gpu-tests-fabric.yml | 11 +++++++++-- .azure/gpu-tests-pytorch.yml | 1 + .gitignore | 1 + src/lightning/fabric/cli.py | 6 +++--- tests/tests_fabric/run_standalone_tasks.sh | 23 ++++++++++++++++++++++ 5 files changed, 37 insertions(+), 5 deletions(-) create mode 100644 tests/tests_fabric/run_standalone_tasks.sh diff --git a/.azure/gpu-tests-fabric.yml b/.azure/gpu-tests-fabric.yml index ba86449e92355..b41b714edce32 100644 --- a/.azure/gpu-tests-fabric.yml +++ b/.azure/gpu-tests-fabric.yml @@ -134,14 +134,21 @@ jobs: - bash: python -m coverage run --source ${COVERAGE_SOURCE} -m pytest . -v --durations=50 workingDirectory: tests/tests_fabric/ - displayName: "Testing: fabric standard" + displayName: "Testing: Fabric standard" timeoutInMinutes: "10" - bash: bash ../run_standalone_tests.sh "." workingDirectory: tests/tests_fabric/ env: PL_STANDALONE_TESTS_SOURCE: $(COVERAGE_SOURCE) - displayName: "Testing: fabric standalone" + displayName: "Testing: Fabric standalone tests" + timeoutInMinutes: "10" + + - bash: bash run_standalone_tasks.sh + workingDirectory: tests/tests_fabric + env: + PL_USE_MOCKED_MNIST: "1" + displayName: "Testing: Fabric standalone tasks" timeoutInMinutes: "10" - bash: | diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml index b9ab6ead7f0d1..c795bac955334 100644 --- a/.azure/gpu-tests-pytorch.yml +++ b/.azure/gpu-tests-pytorch.yml @@ -24,6 +24,7 @@ pr: - "examples/run_pl_examples.sh" - "examples/pytorch/basics/backbone_image_classifier.py" - "examples/pytorch/basics/autoencoder.py" + - "tests/run_standalone_*.sh" - "requirements/pytorch/**" - "src/lightning/__init__.py" - "src/lightning/__setup__.py" diff --git a/.gitignore b/.gitignore index de1de44fec235..2ace5d1151c5e 100644 --- a/.gitignore +++ b/.gitignore @@ -175,6 +175,7 @@ wandb *.prof *.tar.gz .neptune/ +system_check/ # dataset generated from bolts in examples. cifar-10-batches-py diff --git a/src/lightning/fabric/cli.py b/src/lightning/fabric/cli.py index 3b5070ee4daf0..ffbcf7ade6a9a 100644 --- a/src/lightning/fabric/cli.py +++ b/src/lightning/fabric/cli.py @@ -189,9 +189,9 @@ def _consolidate(checkpoint_folder: str, output_file: Optional[str]) -> None: checkpoint = _load_distributed_checkpoint(config.checkpoint_folder) torch.save(checkpoint, config.output_file) - @_main.command("system-check") - def _system_check() -> None: - """Run a system check to test your multi-GPU setup.""" + @_main.command("diagnose") + def _diagnose() -> None: + """Diagnose issues with your multi-GPU setup.""" system_check.main() diff --git a/tests/tests_fabric/run_standalone_tasks.sh b/tests/tests_fabric/run_standalone_tasks.sh new file mode 100644 index 0000000000000..63ff0c0d301cd --- /dev/null +++ b/tests/tests_fabric/run_standalone_tasks.sh @@ -0,0 +1,23 @@ +#!/bin/bash +# Copyright The Lightning AI team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -e +# THIS FILE ASSUMES IT IS RUN INSIDE THE tests/tests_pytorch DIRECTORY + +# this environment variable allows special tests to run +export PL_RUN_STANDALONE_TESTS=1 + +# test that a user can manually launch individual processes +echo "Running system check" +fabric diagnose From bf90dff5b4c41ec5fbbd6f6679206cd9948c0273 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Mon, 11 Mar 2024 00:18:55 +0100 Subject: [PATCH 17/24] tests --- src/lightning/fabric/utilities/system_check.py | 12 ++++-------- .../utilities/test_system_check.py | 18 ++++++++++++++++++ 2 files changed, 22 insertions(+), 8 deletions(-) create mode 100644 tests/tests_fabric/utilities/test_system_check.py diff --git a/src/lightning/fabric/utilities/system_check.py b/src/lightning/fabric/utilities/system_check.py index 4234cbb2d8516..ff7ac4e18bced 100644 --- a/src/lightning/fabric/utilities/system_check.py +++ b/src/lightning/fabric/utilities/system_check.py @@ -20,7 +20,7 @@ from pathlib import Path import torch -import torch.distributed as dist +import torch.distributed import torch.multiprocessing as mp from lightning_utilities.core.imports import RequirementCache @@ -99,7 +99,7 @@ def _run_all_reduce_test(local_rank: int, world_size: int) -> None: torch.cuda.set_device(local_rank) _print0("Setting up the process group ...") - dist.init_process_group( + torch.distributed.init_process_group( backend="nccl", world_size=world_size, rank=local_rank, @@ -108,16 +108,12 @@ def _run_all_reduce_test(local_rank: int, world_size: int) -> None: timeout=timedelta(seconds=10), ) - # TODO: remove - # if local_rank > 0: - # return - _print0("Synchronizing GPUs ... 
") - dist.barrier() + torch.distributed.barrier() payload = torch.rand(100, 100, device=device) _print0("Running all-reduce test ...") - dist.all_reduce(payload) + torch.distributed.all_reduce(payload) def _setup_logging() -> None: diff --git a/tests/tests_fabric/utilities/test_system_check.py b/tests/tests_fabric/utilities/test_system_check.py new file mode 100644 index 0000000000000..a8578f110d352 --- /dev/null +++ b/tests/tests_fabric/utilities/test_system_check.py @@ -0,0 +1,18 @@ +import os +from unittest import mock + +import torch + +from lightning.fabric.utilities.system_check import _run_all_reduce_test + + +@mock.patch.dict(os.environ, {}, clear=True) +@mock.patch("lightning.fabric.utilities.system_check.torch.device", return_value=torch.device("cpu")) +@mock.patch("lightning.fabric.utilities.system_check.torch.cuda.set_device") +@mock.patch("lightning.fabric.utilities.system_check.torch.distributed") +def test_run_all_reduce_test(dist_mock, set_device_mock, __): + _run_all_reduce_test(local_rank=1, world_size=4) + set_device_mock.assert_called_once() + dist_mock.init_process_group.assert_called_once() + dist_mock.barrier.assert_called_once() + dist_mock.all_reduce.assert_called_once() From ea01ac28a372e824a4c14daf8abeaa2ee8f8460a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 10 Mar 2024 23:19:32 +0000 Subject: [PATCH 18/24] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/tests_fabric/utilities/test_system_check.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/tests_fabric/utilities/test_system_check.py b/tests/tests_fabric/utilities/test_system_check.py index a8578f110d352..1efeae98b71f6 100644 --- a/tests/tests_fabric/utilities/test_system_check.py +++ b/tests/tests_fabric/utilities/test_system_check.py @@ -2,7 +2,6 @@ from unittest import mock import torch - from lightning.fabric.utilities.system_check import _run_all_reduce_test From a7faaed800139577057766e8256e677ba93958ea Mon Sep 17 00:00:00 2001 From: awaelchli Date: Mon, 11 Mar 2024 00:31:31 +0100 Subject: [PATCH 19/24] troubleshooting --- .../guide/multi_node/barebones.rst | 50 +----------- docs/source-fabric/guide/troubleshooting.rst | 79 +++++++++++++++++++ 2 files changed, 81 insertions(+), 48 deletions(-) create mode 100644 docs/source-fabric/guide/troubleshooting.rst diff --git a/docs/source-fabric/guide/multi_node/barebones.rst b/docs/source-fabric/guide/multi_node/barebones.rst index a251df230174c..6a43460b9865a 100644 --- a/docs/source-fabric/guide/multi_node/barebones.rst +++ b/docs/source-fabric/guide/multi_node/barebones.rst @@ -110,52 +110,6 @@ After executing these commands, you should immediately see an output like this: Troubleshooting *************** - -**My program is stuck initializing at startup. What is causing this?** - -You are seeing a message like this in the logs, but nothing happens: - -.. code-block:: - - Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/4 - -The most likely reasons and how to fix it: - -- **Wrong network interface:** Some servers have multiple network interfaces. - There is usually only one that can send and receive traffic from the network of the other nodes, but sometimes it is not set as the default. - In this case, you need to set it manually: - - .. code-block:: bash - - export GLOO_SOCKET_IFNAME=eno1 - export NCCL_SOCKET_IFNAME=eno1 - fabric run ... 
- - You can find the interface name by parsing the output of the ``ifconfig`` command. - The name of this interface **may differ on each node**. - -- **NCCL can't communicate between the nodes:** - - Follow the steps in the `NCCL troubleshooting guide `_. - In particular, take note of the network section that describes restricting the port range and firewall rules. - - .. code-block:: bash - - echo "net.ipv4.ip_local_port_range = 50000 51000" >> /etc/sysctl.conf - sysctl --system - ufw allow 50000:51000/tcp - - -**My program crashes with an NCCL error, but it is not helpful** - -Launch your command by prepending ``NCCL_DEBUG=INFO`` to get more info. - -.. code-block:: bash - - NCCL_DEBUG=INFO fabric run ... - - ----- - -If you are sick of troubleshooting cluster problems, give :doc:`Lightning cloud <./cloud>` a try! +Please refer to the :doc:`troubleshooting guide <../troubleshooting>` guide if you are experiencing issues related to multi-node training hanging or crashing. +If you are sick of troubleshooting cluster problems, give :doc:`Lightning Studios <./cloud>` a try! For other questions, please don't hesitate to join the `Discord `_. diff --git a/docs/source-fabric/guide/troubleshooting.rst b/docs/source-fabric/guide/troubleshooting.rst new file mode 100644 index 0000000000000..9ba134a3306c3 --- /dev/null +++ b/docs/source-fabric/guide/troubleshooting.rst @@ -0,0 +1,79 @@ +############### +Troubleshooting +############### + + +---- + + +********* +Multi-GPU +********* + + +**My program is stuck initializing at startup. What is causing this?** + +You are seeing a message like this in the logs, but nothing happens: + +.. code-block:: + + Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/4 + +The most likely reasons and how to fix it: + + +.. code-block:: bash + + fabric diagnose + + +---- + + +********** +Multi-node +********** + + +**My program is stuck initializing at startup. What is causing this?** + +You are seeing a message like this in the logs, but nothing happens: + +.. code-block:: + + Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/4 + +The most likely reasons and how to fix it: + +- **Wrong network interface:** Some servers have multiple network interfaces. + There is usually only one that can send and receive traffic from the network of the other nodes, but sometimes it is not set as the default. + In this case, you need to set it manually: + + .. code-block:: bash + + export GLOO_SOCKET_IFNAME=eno1 + export NCCL_SOCKET_IFNAME=eno1 + fabric run ... + + You can find the interface name by parsing the output of the ``ifconfig`` command. + The name of this interface **may differ on each node**. + +- **NCCL can't communicate between the nodes:** + + Follow the steps in the `NCCL troubleshooting guide `_. + In particular, take note of the network section that describes restricting the port range and firewall rules. + + .. code-block:: bash + + echo "net.ipv4.ip_local_port_range = 50000 51000" >> /etc/sysctl.conf + sysctl --system + ufw allow 50000:51000/tcp + + +**My program crashes with an NCCL error, but it is not helpful** + +Launch your command by prepending ``NCCL_DEBUG=INFO`` to get more info. + +.. code-block:: bash + + NCCL_DEBUG=INFO fabric run ... 
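The troubleshooting page added in this patch tells readers to run ``fabric diagnose``, the CLI command wired up to ``system_check.main()`` earlier in the series. For orientation, a minimal sketch of triggering the same check programmatically, assuming only the module path and the ``main(timeout: int = 60)`` signature shown in the diffs above; this is a sketch, not an API documented beyond these patches:

    # Sketch only: this is the function behind `fabric diagnose` in these patches.
    # It logs `nvidia-smi` output and, on multi-GPU machines, runs an NCCL all-reduce
    # test under a timeout, retrying with NCCL peer-to-peer transport disabled if the
    # first attempt does not succeed.
    from lightning.fabric.utilities.system_check import main

    if __name__ == "__main__":
        main(timeout=60)  # same default timeout the CLI entry point relies on
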
From 8c8d23b000acb938afb3dd64d20cfa9d317d82ac Mon Sep 17 00:00:00 2001 From: awaelchli Date: Mon, 11 Mar 2024 02:51:24 +0100 Subject: [PATCH 20/24] docs --- docs/source-fabric/guide/troubleshooting.rst | 48 +++++++++++-------- .../fabric/utilities/system_check.py | 6 ++- 2 files changed, 33 insertions(+), 21 deletions(-) diff --git a/docs/source-fabric/guide/troubleshooting.rst b/docs/source-fabric/guide/troubleshooting.rst index 9ba134a3306c3..28a841dbe6bb8 100644 --- a/docs/source-fabric/guide/troubleshooting.rst +++ b/docs/source-fabric/guide/troubleshooting.rst @@ -2,6 +2,8 @@ Troubleshooting ############### +Learn how to troubleshoot possible causes for common issues related to CUDA, NCCL, and distributed training. + ---- @@ -10,22 +12,38 @@ Troubleshooting Multi-GPU ********* - -**My program is stuck initializing at startup. What is causing this?** - -You are seeing a message like this in the logs, but nothing happens: +If your program is stuck at .. code-block:: Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/4 -The most likely reasons and how to fix it: - +it indicates that PyTorch can't set up the communication between GPUs, and that your system is not configured correctly. +Run the `diagnose` command from the Fabric CLI to investigate: .. code-block:: bash fabric diagnose +This tool will run basic multi-GPU tests using only PyTorch. +Any issues raised here will confirm that the problem is with your system and not with Lightning. +Common solutions: + +- **Wrong driver version:** The NVIDIA driver for your GPU is too old or too new. + You can check the version of the driver by running + + .. code-block:: bash + + nvidia-smi --id=0 --query-gpu=driver_version --format=csv,noheader + + *Solution*: Install a recent driver. + Search online for instructions how to update the driver on your platform. + +- **Peer-to-peer connection is broken:** The GPUs can't communicate with each other. + *Solution*: Try to set the environment variable ``NCCL_P2P_DISABLE=1``. + If you rerun your scipt and it fixes the problem, this means that peer-to-peer transport is not working properly (your training will run but it will be slow). + This is likely because of driver compatibility issues (see above) or because your GPU does not support peer-to-peer (e.g., certain RTX cards). + ---- @@ -34,16 +52,15 @@ The most likely reasons and how to fix it: Multi-node ********** - -**My program is stuck initializing at startup. What is causing this?** - -You are seeing a message like this in the logs, but nothing happens: +Before troubleshooting multi-node connectivity issues, first ensure that multi-GPU within a single machine is working correctly by following the steps above. +If single-node execution works, but multi-node hangs at .. code-block:: Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/4 -The most likely reasons and how to fix it: +it indicates that there is a connection issue between the nodes. +Common solutions: - **Wrong network interface:** Some servers have multiple network interfaces. There is usually only one that can send and receive traffic from the network of the other nodes, but sometimes it is not set as the default. @@ -68,12 +85,3 @@ The most likely reasons and how to fix it: echo "net.ipv4.ip_local_port_range = 50000 51000" >> /etc/sysctl.conf sysctl --system ufw allow 50000:51000/tcp - - -**My program crashes with an NCCL error, but it is not helpful** - -Launch your command by prepending ``NCCL_DEBUG=INFO`` to get more info. - -.. code-block:: bash - - NCCL_DEBUG=INFO fabric run ... 
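The documentation hunk above recommends checking the installed NVIDIA driver version with ``nvidia-smi``. For reference, a small illustrative helper that performs the same query from Python, modelled on the commented-out ``_collect_nvidia_driver_version`` function that briefly appeared in an earlier revision of ``system_check.py`` in this series (illustrative only, not part of the final module):

    # Illustrative sketch: query the driver version for GPU 0, equivalent to
    # `nvidia-smi --id=0 --query-gpu=driver_version --format=csv,noheader`.
    import subprocess

    def collect_nvidia_driver_version() -> str:
        result = subprocess.run(
            ["nvidia-smi", "--id=0", "--query-gpu=driver_version", "--format=csv,noheader"],
            capture_output=True,
            text=True,
        )
        return result.stdout.strip()
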
diff --git a/src/lightning/fabric/utilities/system_check.py b/src/lightning/fabric/utilities/system_check.py index ff7ac4e18bced..e9c52a27be7d8 100644 --- a/src/lightning/fabric/utilities/system_check.py +++ b/src/lightning/fabric/utilities/system_check.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + import logging import os import shutil @@ -54,7 +55,10 @@ def main(timeout: int = 60) -> None: ) os.environ["NCCL_P2P_DISABLE"] = "1" success = _check_cuda_distributed(timeout) - if not success: + if success: + _print0("Disabling peer-to-peer transport fixed the issue.") + # TODO: Give advice + else: _print0("Disabling peer-to-peer transport did not fix the issue.") else: _print0("Multi-GPU test successful.") From 1f7cf52177318e4e0c2bbc50128b280b3db15f94 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Mon, 11 Mar 2024 02:57:22 +0100 Subject: [PATCH 21/24] links --- docs/source-fabric/fundamentals/launch.rst | 9 +++++++++ docs/source-fabric/glossary/index.rst | 6 ++++++ 2 files changed, 15 insertions(+) diff --git a/docs/source-fabric/fundamentals/launch.rst b/docs/source-fabric/fundamentals/launch.rst index efde3f54fe846..784011e533e13 100644 --- a/docs/source-fabric/fundamentals/launch.rst +++ b/docs/source-fabric/fundamentals/launch.rst @@ -237,6 +237,15 @@ Next steps :height: 160 :tag: advanced +.. displayitem:: + :header: Troubleshooting + :description: Learn how to troubleshoot common multi-GPU issues + :button_link: ../guide/troubleshooting.html + :col_css: col-md-4 + :height: 160 + :tag: advanced + + .. raw:: html diff --git a/docs/source-fabric/glossary/index.rst b/docs/source-fabric/glossary/index.rst index b08bc4f830163..4f9a683db8c03 100644 --- a/docs/source-fabric/glossary/index.rst +++ b/docs/source-fabric/glossary/index.rst @@ -8,6 +8,7 @@ Glossary Checkpoint <../guide/checkpoint/index> Weights and Biases <../guide/loggers/wandb> + Troubleshooting <../guide/troubleshooting> .. raw:: html @@ -150,6 +151,11 @@ Glossary :button_link: ../fundamentals/launch.html :col_css: col-md-4 +.. displayitem:: + :header: NCCL + :button_link: ../guide/troubleshoot.html + :col_css: col-md-4 + .. displayitem:: :header: Notebook :button_link: ../launch/notebook.html From 297e9809d2da7ec2abb0f2e7c5e6c371ae0eaac8 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Mon, 11 Mar 2024 03:00:14 +0100 Subject: [PATCH 22/24] link --- docs/source-fabric/glossary/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source-fabric/glossary/index.rst b/docs/source-fabric/glossary/index.rst index 4f9a683db8c03..e5bc92cad9ceb 100644 --- a/docs/source-fabric/glossary/index.rst +++ b/docs/source-fabric/glossary/index.rst @@ -153,7 +153,7 @@ Glossary .. displayitem:: :header: NCCL - :button_link: ../guide/troubleshoot.html + :button_link: ../guide/troubleshooting.html :col_css: col-md-4 .. 
displayitem:: From cd63115a76d15473dd5b258d4108f113ead687b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 14 Mar 2024 20:43:08 -0400 Subject: [PATCH 23/24] update --- .../fabric/utilities/system_check.py | 42 +++++++++++++------ 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/src/lightning/fabric/utilities/system_check.py b/src/lightning/fabric/utilities/system_check.py index e9c52a27be7d8..ed7898302237a 100644 --- a/src/lightning/fabric/utilities/system_check.py +++ b/src/lightning/fabric/utilities/system_check.py @@ -23,6 +23,7 @@ import torch import torch.distributed import torch.multiprocessing as mp +from torch.multiprocessing.spawn import ProcessRaisedException from lightning_utilities.core.imports import RequirementCache _psutil_available = RequirementCache("psutil") @@ -40,6 +41,7 @@ def main(timeout: int = 60) -> None: if num_cuda_devices == 1: _describe_nvidia_smi() + # _check_cuda() if num_cuda_devices > 1: _describe_nvidia_smi() @@ -48,19 +50,24 @@ def main(timeout: int = 60) -> None: success = _check_cuda_distributed(timeout) if not success: + env = { + "NCCL_P2P_DISABLE": "1", + "NCCL_NET_PLUGIN": "none", + } _print0( - f"The multi-GPU NCCL test did not finish within {timeout} seconds." + f"The multi-GPU NCCL test did not succeed." " It looks like there is an issue with your multi-GPU setup." - " Now trying to run again with `NCCL_P2P_DISABLE=1` set." + " Now trying to run again with NCCL features disabled." ) - os.environ["NCCL_P2P_DISABLE"] = "1" + os.environ.update(env) success = _check_cuda_distributed(timeout) if success: - _print0("Disabling peer-to-peer transport fixed the issue.") - # TODO: Give advice + _print0("Disabling the following NCCL features seems to have fixed the issue:") + _print_env_variables(env) else: - _print0("Disabling peer-to-peer transport did not fix the issue.") - else: + _print0("Disabling NCCL features did not fix the issue.") + + if success: _print0("Multi-GPU test successful.") _print0(f"Find detailed logs at {_system_check_dir.absolute()}") @@ -81,7 +88,13 @@ def _check_cuda_distributed(timeout: int) -> bool: start = time.time() success = False while not success and (time.time() - start < timeout): - success = context.join(timeout=5) + try: + success = context.join(timeout=5) + except ProcessRaisedException as e: + _logger.debug(str(e)) + success = False + break + time.sleep(1) if not success: @@ -125,9 +138,9 @@ def _setup_logging() -> None: shutil.rmtree(_system_check_dir) _system_check_dir.mkdir() - _logger.setLevel(logging.INFO) + _logger.setLevel(logging.DEBUG) file_handler = logging.FileHandler(str(_system_check_dir / "logs.txt")) - file_handler.setLevel(logging.INFO) + file_handler.setLevel(logging.DEBUG) console_handler = logging.StreamHandler() console_handler.setLevel(logging.INFO) _logger.addHandler(file_handler) @@ -140,6 +153,11 @@ def _print0(string: str) -> None: _logger.info(string) +def _print_env_variables(env: dict) -> None: + for k, v in env.items(): + _print0(f"{k}={v}") + + def _collect_nvidia_smi_topo() -> str: return subprocess.run(["nvidia-smi", "topo", "-m"], capture_output=True, text=True).stdout @@ -157,11 +175,11 @@ def _describe_nvidia_smi() -> None: def _describe_gpu_connectivity() -> None: - _logger.info( + _logger.debug( "The matrix below shows how the GPUs in this machine are connected." 
" NVLink (NV) is the fastest connection, and is only available on high-end systems like V100, A100, etc.\n" ) - _logger.info(_collect_nvidia_smi_topo()) + _logger.debug(_collect_nvidia_smi_topo()) def _kill_process(pid: int) -> None: From a25eaf8be8e58559ad2f486167832905497b76ca Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 15 Mar 2024 00:43:46 +0000 Subject: [PATCH 24/24] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/lightning/fabric/utilities/system_check.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/lightning/fabric/utilities/system_check.py b/src/lightning/fabric/utilities/system_check.py index ed7898302237a..5453ecae13e61 100644 --- a/src/lightning/fabric/utilities/system_check.py +++ b/src/lightning/fabric/utilities/system_check.py @@ -23,8 +23,8 @@ import torch import torch.distributed import torch.multiprocessing as mp -from torch.multiprocessing.spawn import ProcessRaisedException from lightning_utilities.core.imports import RequirementCache +from torch.multiprocessing.spawn import ProcessRaisedException _psutil_available = RequirementCache("psutil") _logger = logging.getLogger(__name__) @@ -55,7 +55,7 @@ def main(timeout: int = 60) -> None: "NCCL_NET_PLUGIN": "none", } _print0( - f"The multi-GPU NCCL test did not succeed." + "The multi-GPU NCCL test did not succeed." " It looks like there is an issue with your multi-GPU setup." " Now trying to run again with NCCL features disabled." ) @@ -94,7 +94,7 @@ def _check_cuda_distributed(timeout: int) -> bool: _logger.debug(str(e)) success = False break - + time.sleep(1) if not success:
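Taken together, the series adds a ``system_check`` module and documents the ``fabric diagnose`` entry point that exercises it. As a rough sketch only, assuming the module stays importable as ``lightning.fabric.utilities.system_check`` (the path shown in the diffs) and keeps the ``main(timeout=...)`` signature introduced in the later patches, the check could also be driven directly from Python:

# Sketch only: drive the system check directly instead of via `fabric diagnose`.
# Assumes the module path and `main(timeout=...)` signature shown in the diffs above.
from lightning.fabric.utilities.system_check import main

if __name__ == "__main__":
    # Allow up to two minutes for the multi-GPU NCCL test before it is reported as failed.
    main(timeout=120)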