From 8d37a9a10e79c28f8ef2d9793cdf9d8c775175cf Mon Sep 17 00:00:00 2001 From: awaelchli Date: Sun, 10 Mar 2024 13:09:02 +0100 Subject: [PATCH 01/24] basic system check --- .../fabric/utilities/system_check.py | 116 ++++++++++++++++++ 1 file changed, 116 insertions(+) create mode 100644 src/lightning/fabric/utilities/system_check.py diff --git a/src/lightning/fabric/utilities/system_check.py b/src/lightning/fabric/utilities/system_check.py new file mode 100644 index 0000000000000..2e0ea1a0cc2c8 --- /dev/null +++ b/src/lightning/fabric/utilities/system_check.py @@ -0,0 +1,116 @@ +import os +import subprocess +from datetime import timedelta +from functools import lru_cache +from pathlib import Path +from typing import Any +import torch.distributed as dist +import torch.multiprocessing as mp + +import torch + + +def main(): + # if not dist.is_available(): + # raise RuntimeError("Requires PyTorch distributed to be available.") + + if num_cuda_devices() == 0: + print0("Warning: Skipping system check because no GPUs were detected.") + + if num_cuda_devices() == 1: + describe_nvidia_smi() + pass + + if num_cuda_devices() > 1: + describe_nvidia_smi() + describe_gpu_connectivity() + mp.spawn(_check_cuda_distributed, nprocs=num_cuda_devices()) + + +def _check_cuda_distributed(local_rank: int) -> None: + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = "29500" + os.environ["WORLD_SIZE"] = str(num_cuda_devices()) + os.environ["RANK"] = str(local_rank) + os.environ["LOCAL_RANK"] = str(local_rank) + + system_check_dir = Path("./.system_check") + system_check_dir.mkdir(exist_ok=True) + + dist.init_process_group( + backend="nccl", + world_size=num_cuda_devices(), + rank=local_rank, + timeout=timedelta(seconds=30), + ) + + device = torch.device("cuda", local_rank) + torch.cuda.set_device(local_rank) + + dist.barrier() + payload = torch.rand(100, 100, device=device) + dist.all_reduce(payload) + + +@lru_cache() +def rank() -> int: + import torch.distributed as dist + + return dist.get_rank() + + +@lru_cache() +def world_size() -> int: + import torch.distributed as dist + + return dist.get_world_size() + + +def print0(*args: Any, **kwargs: Any) -> None: + if rank() == 0: + print(*args, **kwargs) + + +@lru_cache() +def num_cuda_devices() -> int: + import torch + + return torch.cuda.device_count() + + +def is_torch_available() -> bool: + try: + import torch # noqa: F401 + except (ImportError, ModuleNotFoundError): + return False + return True + + +def collect_nvidia_smi_topo() -> str: + return subprocess.run(["nvidia-smi", "topo", "-m"], capture_output=True, text=True).stdout + + +def collect_nvidia_smi() -> str: + return subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout + + +def describe_nvidia_smi(): + print( + "Below is the output of `nvidia-smi`. It shows information about the GPUs that are installed on this machine," + " the driver version, and the maximum supported CUDA version it can run." + ) + print() + print(collect_nvidia_smi()) + + +def describe_gpu_connectivity(): + print( + "The matrix below shows how the GPUs in this machine are connected." + " NVLink (NV) is the fastest connection, and is only available on high-end systems like V100 or A100." 
+ ) + print() + print(collect_nvidia_smi_topo()) + + +if __name__ == '__main__': + main() From a65e7a34b81657279ec49299e854405357371cdb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 10 Mar 2024 08:37:59 -0400 Subject: [PATCH 02/24] update --- .../fabric/utilities/system_check.py | 39 +++++++++++++------ 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/src/lightning/fabric/utilities/system_check.py b/src/lightning/fabric/utilities/system_check.py index 2e0ea1a0cc2c8..627a016021d91 100644 --- a/src/lightning/fabric/utilities/system_check.py +++ b/src/lightning/fabric/utilities/system_check.py @@ -1,4 +1,6 @@ import os +import logging +import shutil import subprocess from datetime import timedelta from functools import lru_cache @@ -9,8 +11,12 @@ import torch +system_check_dir = Path("./system_check") + def main(): + setup_logging() + # if not dist.is_available(): # raise RuntimeError("Requires PyTorch distributed to be available.") @@ -33,9 +39,8 @@ def _check_cuda_distributed(local_rank: int) -> None: os.environ["WORLD_SIZE"] = str(num_cuda_devices()) os.environ["RANK"] = str(local_rank) os.environ["LOCAL_RANK"] = str(local_rank) - - system_check_dir = Path("./.system_check") - system_check_dir.mkdir(exist_ok=True) + os.environ["NCCL_DEBUG"] = "INFO" + os.environ["NCCL_DEBUG_FILE"] = str(system_check_dir / f"nccl-rank-{local_rank}.txt") dist.init_process_group( backend="nccl", @@ -52,6 +57,18 @@ def _check_cuda_distributed(local_rank: int) -> None: dist.all_reduce(payload) +def setup_logging() -> None: + if system_check_dir.is_dir(): + shutil.rmtree(system_check_dir) + system_check_dir.mkdir() + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + file_handler = logging.FileHandler(str(system_check_dir / "logs.txt")) + file_handler.setLevel(logging.INFO) + logger.addHandler(file_handler) + + @lru_cache() def rank() -> int: import torch.distributed as dist @@ -95,21 +112,21 @@ def collect_nvidia_smi() -> str: def describe_nvidia_smi(): - print( + logger = logging.getLogger() + logger.info( "Below is the output of `nvidia-smi`. It shows information about the GPUs that are installed on this machine," - " the driver version, and the maximum supported CUDA version it can run." + " the driver version, and the maximum supported CUDA version it can run.\n" ) - print() - print(collect_nvidia_smi()) + logger.info(collect_nvidia_smi()) def describe_gpu_connectivity(): - print( + logger = logging.getLogger() + logger.info( "The matrix below shows how the GPUs in this machine are connected." - " NVLink (NV) is the fastest connection, and is only available on high-end systems like V100 or A100." 
+ " NVLink (NV) is the fastest connection, and is only available on high-end systems like V100 or A100.\n" ) - print() - print(collect_nvidia_smi_topo()) + logger.info(collect_nvidia_smi_topo()) if __name__ == '__main__': From 80b05465fc7a973ce90361a764513ce86cb00a97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 10 Mar 2024 09:02:30 -0400 Subject: [PATCH 03/24] update --- .../fabric/utilities/system_check.py | 26 +++++-------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/src/lightning/fabric/utilities/system_check.py b/src/lightning/fabric/utilities/system_check.py index 627a016021d91..fe3d3a81d6f69 100644 --- a/src/lightning/fabric/utilities/system_check.py +++ b/src/lightning/fabric/utilities/system_check.py @@ -8,7 +8,6 @@ from typing import Any import torch.distributed as dist import torch.multiprocessing as mp - import torch system_check_dir = Path("./system_check") @@ -42,16 +41,18 @@ def _check_cuda_distributed(local_rank: int) -> None: os.environ["NCCL_DEBUG"] = "INFO" os.environ["NCCL_DEBUG_FILE"] = str(system_check_dir / f"nccl-rank-{local_rank}.txt") + device = torch.device("cuda", local_rank) + torch.cuda.set_device(local_rank) + dist.init_process_group( backend="nccl", world_size=num_cuda_devices(), rank=local_rank, - timeout=timedelta(seconds=30), + # NCCL gets initialized in the first collective call (e.g., barrier below), + # which must be successful for this timeout to work. + timeout=timedelta(seconds=10), ) - device = torch.device("cuda", local_rank) - torch.cuda.set_device(local_rank) - dist.barrier() payload = torch.rand(100, 100, device=device) dist.all_reduce(payload) @@ -76,13 +77,6 @@ def rank() -> int: return dist.get_rank() -@lru_cache() -def world_size() -> int: - import torch.distributed as dist - - return dist.get_world_size() - - def print0(*args: Any, **kwargs: Any) -> None: if rank() == 0: print(*args, **kwargs) @@ -95,14 +89,6 @@ def num_cuda_devices() -> int: return torch.cuda.device_count() -def is_torch_available() -> bool: - try: - import torch # noqa: F401 - except (ImportError, ModuleNotFoundError): - return False - return True - - def collect_nvidia_smi_topo() -> str: return subprocess.run(["nvidia-smi", "topo", "-m"], capture_output=True, text=True).stdout From d0028b6fae5a94c1d54329ade1f2df0ab0cabeec Mon Sep 17 00:00:00 2001 From: awaelchli Date: Sun, 10 Mar 2024 14:14:02 +0100 Subject: [PATCH 04/24] prints --- .../fabric/utilities/system_check.py | 82 ++++++++++--------- 1 file changed, 42 insertions(+), 40 deletions(-) diff --git a/src/lightning/fabric/utilities/system_check.py b/src/lightning/fabric/utilities/system_check.py index fe3d3a81d6f69..c79b494e5e2bc 100644 --- a/src/lightning/fabric/utilities/system_check.py +++ b/src/lightning/fabric/utilities/system_check.py @@ -10,109 +10,111 @@ import torch.multiprocessing as mp import torch -system_check_dir = Path("./system_check") +SYSTEM_CHECK_DIR = Path("./system_check") -def main(): - setup_logging() +def main() -> None: + _setup_logging() + num_cuda_devices = torch.cuda.device_count() - # if not dist.is_available(): - # raise RuntimeError("Requires PyTorch distributed to be available.") + if num_cuda_devices == 0: + print("Warning: Skipping system check because no GPUs were detected.") - if num_cuda_devices() == 0: - print0("Warning: Skipping system check because no GPUs were detected.") - - if num_cuda_devices() == 1: - describe_nvidia_smi() + if num_cuda_devices == 1: + # TODO + _describe_nvidia_smi() pass - if num_cuda_devices() > 1: - 
describe_nvidia_smi() - describe_gpu_connectivity() - mp.spawn(_check_cuda_distributed, nprocs=num_cuda_devices()) + if num_cuda_devices > 1: + _describe_nvidia_smi() + _describe_gpu_connectivity() + mp.spawn(_check_cuda_distributed, nprocs=num_cuda_devices, args=(num_cuda_devices,)) -def _check_cuda_distributed(local_rank: int) -> None: +def _check_cuda_distributed(local_rank: int, world_size: int) -> None: os.environ["MASTER_ADDR"] = "localhost" os.environ["MASTER_PORT"] = "29500" - os.environ["WORLD_SIZE"] = str(num_cuda_devices()) + os.environ["WORLD_SIZE"] = str(world_size) os.environ["RANK"] = str(local_rank) os.environ["LOCAL_RANK"] = str(local_rank) os.environ["NCCL_DEBUG"] = "INFO" - os.environ["NCCL_DEBUG_FILE"] = str(system_check_dir / f"nccl-rank-{local_rank}.txt") + os.environ["NCCL_DEBUG_FILE"] = str(SYSTEM_CHECK_DIR / f"nccl-rank-{local_rank}.txt") device = torch.device("cuda", local_rank) torch.cuda.set_device(local_rank) + _print0("Setting up the process group ... ", end="") dist.init_process_group( backend="nccl", - world_size=num_cuda_devices(), + world_size=world_size, rank=local_rank, # NCCL gets initialized in the first collective call (e.g., barrier below), # which must be successful for this timeout to work. timeout=timedelta(seconds=10), ) - + _print0("Done.") + + _print0( + "Synchronizing GPUs. If this step doesn't finish within 30 seconds, there is a problem with your" + " multi-GPU setup." + ) dist.barrier() + _print0("Done.") + payload = torch.rand(100, 100, device=device) + _print0("Running all-reduce test ... ", end="") dist.all_reduce(payload) + _print0("Done.") -def setup_logging() -> None: - if system_check_dir.is_dir(): - shutil.rmtree(system_check_dir) - system_check_dir.mkdir() +def _setup_logging() -> None: + if SYSTEM_CHECK_DIR.is_dir(): + shutil.rmtree(SYSTEM_CHECK_DIR) + SYSTEM_CHECK_DIR.mkdir() logger = logging.getLogger() logger.setLevel(logging.INFO) - file_handler = logging.FileHandler(str(system_check_dir / "logs.txt")) + file_handler = logging.FileHandler(str(SYSTEM_CHECK_DIR / "logs.txt")) file_handler.setLevel(logging.INFO) logger.addHandler(file_handler) @lru_cache() -def rank() -> int: +def _rank() -> int: import torch.distributed as dist return dist.get_rank() -def print0(*args: Any, **kwargs: Any) -> None: - if rank() == 0: +def _print0(*args: Any, **kwargs: Any) -> None: + if _rank() == 0: print(*args, **kwargs) -@lru_cache() -def num_cuda_devices() -> int: - import torch - - return torch.cuda.device_count() - - -def collect_nvidia_smi_topo() -> str: +def _collect_nvidia_smi_topo() -> str: return subprocess.run(["nvidia-smi", "topo", "-m"], capture_output=True, text=True).stdout -def collect_nvidia_smi() -> str: +def _collect_nvidia_smi() -> str: return subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout -def describe_nvidia_smi(): +def _describe_nvidia_smi() -> None: logger = logging.getLogger() logger.info( "Below is the output of `nvidia-smi`. It shows information about the GPUs that are installed on this machine," " the driver version, and the maximum supported CUDA version it can run.\n" ) - logger.info(collect_nvidia_smi()) + logger.info(_collect_nvidia_smi()) -def describe_gpu_connectivity(): +def _describe_gpu_connectivity() -> None: logger = logging.getLogger() logger.info( "The matrix below shows how the GPUs in this machine are connected." 
" NVLink (NV) is the fastest connection, and is only available on high-end systems like V100 or A100.\n" ) - logger.info(collect_nvidia_smi_topo()) + logger.info(_collect_nvidia_smi_topo()) if __name__ == '__main__': From afc4bdc4a12bb99346e4b52da52968e5adfca477 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 10 Mar 2024 09:20:45 -0400 Subject: [PATCH 05/24] update --- src/lightning/fabric/utilities/system_check.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/src/lightning/fabric/utilities/system_check.py b/src/lightning/fabric/utilities/system_check.py index c79b494e5e2bc..15cd9361ea53d 100644 --- a/src/lightning/fabric/utilities/system_check.py +++ b/src/lightning/fabric/utilities/system_check.py @@ -52,14 +52,13 @@ def _check_cuda_distributed(local_rank: int, world_size: int) -> None: # which must be successful for this timeout to work. timeout=timedelta(seconds=10), ) - _print0("Done.") + _print0("done.") _print0( - "Synchronizing GPUs. If this step doesn't finish within 30 seconds, there is a problem with your" + "Synchronizing GPUs. If the program hangs for more than 30 seconds, there is a problem with your" " multi-GPU setup." ) dist.barrier() - _print0("Done.") payload = torch.rand(100, 100, device=device) _print0("Running all-reduce test ... ", end="") @@ -79,15 +78,8 @@ def _setup_logging() -> None: logger.addHandler(file_handler) -@lru_cache() -def _rank() -> int: - import torch.distributed as dist - - return dist.get_rank() - - def _print0(*args: Any, **kwargs: Any) -> None: - if _rank() == 0: + if int(os.getenv("RANK", 0)) == 0: print(*args, **kwargs) From 760ed2cfcc6ec4d751bbc2e1ca069a5d34a01b3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 10 Mar 2024 09:51:37 -0400 Subject: [PATCH 06/24] update --- .../fabric/utilities/system_check.py | 43 +++++++++++++++++-- 1 file changed, 40 insertions(+), 3 deletions(-) diff --git a/src/lightning/fabric/utilities/system_check.py b/src/lightning/fabric/utilities/system_check.py index 15cd9361ea53d..6335caabac2f6 100644 --- a/src/lightning/fabric/utilities/system_check.py +++ b/src/lightning/fabric/utilities/system_check.py @@ -3,17 +3,17 @@ import shutil import subprocess from datetime import timedelta -from functools import lru_cache from pathlib import Path from typing import Any import torch.distributed as dist import torch.multiprocessing as mp import torch +import time SYSTEM_CHECK_DIR = Path("./system_check") -def main() -> None: +def main(timeout: int = 60) -> None: _setup_logging() num_cuda_devices = torch.cuda.device_count() @@ -28,7 +28,27 @@ def main() -> None: if num_cuda_devices > 1: _describe_nvidia_smi() _describe_gpu_connectivity() - mp.spawn(_check_cuda_distributed, nprocs=num_cuda_devices, args=(num_cuda_devices,)) + + context = mp.spawn( + _check_cuda_distributed, + nprocs=num_cuda_devices, + args=(num_cuda_devices,), + join=False, + ) + + start = time.time() + joined = False + while not joined and (time.time() - start < timeout): + joined = context.join(timeout=5) + time.sleep(1) + + if not joined: + for pid in context.pids(): + _kill_process(pid) + print("not successful") # TODO + + # TODO: relative dir + print(f"Find detailed logs at {SYSTEM_CHECK_DIR}") def _check_cuda_distributed(local_rank: int, world_size: int) -> None: @@ -53,6 +73,10 @@ def _check_cuda_distributed(local_rank: int, world_size: int) -> None: timeout=timedelta(seconds=10), ) _print0("done.") + + # TODO: remove + # if local_rank > 0: + # return _print0( 
"Synchronizing GPUs. If the program hangs for more than 30 seconds, there is a problem with your" @@ -109,5 +133,18 @@ def _describe_gpu_connectivity() -> None: logger.info(_collect_nvidia_smi_topo()) +def _kill_process(pid: int) -> None: + import psutil # TODO + + try: + process = psutil.Process(pid) + if process.is_running(): + process.kill() + except psutil.NoSuchProcess: + pass + except psutil.AccessDenied: + pass + + if __name__ == '__main__': main() From 62b608fb108bbf48c6ea1d4d974230351dffe74d Mon Sep 17 00:00:00 2001 From: awaelchli Date: Sun, 10 Mar 2024 15:36:42 +0100 Subject: [PATCH 07/24] update --- src/lightning/fabric/utilities/system_check.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/lightning/fabric/utilities/system_check.py b/src/lightning/fabric/utilities/system_check.py index 6335caabac2f6..4fa2774234b4c 100644 --- a/src/lightning/fabric/utilities/system_check.py +++ b/src/lightning/fabric/utilities/system_check.py @@ -48,7 +48,8 @@ def main(timeout: int = 60) -> None: print("not successful") # TODO # TODO: relative dir - print(f"Find detailed logs at {SYSTEM_CHECK_DIR}") + relative_dir = SYSTEM_CHECK_DIR.relative_to(Path.cwd()) + print(f"Find detailed logs at {relative_dir}") def _check_cuda_distributed(local_rank: int, world_size: int) -> None: From 1d6d8909606eade0105c48aeba2b3c540e254d32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 10 Mar 2024 11:30:00 -0400 Subject: [PATCH 08/24] update --- .../fabric/utilities/system_check.py | 66 ++++++++++++------- 1 file changed, 41 insertions(+), 25 deletions(-) diff --git a/src/lightning/fabric/utilities/system_check.py b/src/lightning/fabric/utilities/system_check.py index 4fa2774234b4c..95b8e491cb98f 100644 --- a/src/lightning/fabric/utilities/system_check.py +++ b/src/lightning/fabric/utilities/system_check.py @@ -29,30 +29,48 @@ def main(timeout: int = 60) -> None: _describe_nvidia_smi() _describe_gpu_connectivity() - context = mp.spawn( - _check_cuda_distributed, + success = _check_cuda_distributed(timeout) + + if not success: + print( + f"The multi-GPU NCCL test did not finish within {timeout} seconds." + " It looks like there is an issue with your multi-GPU setup." + " Now trying to run again with `NCCL_P2P_DISABLE=1` set." + ) + os.environ["NCCL_P2P_DISABLE"] = "1" + success = _check_cuda_distributed(timeout) + if not success: + print( + f"Disabling peer-to-peer transport did not fix the issue." 
+ ) + else: + print("Multi-GPU test successful.") + + print(f"Find detailed logs at {SYSTEM_CHECK_DIR.absolute()}") + + +def _check_cuda_distributed(timeout: int) -> bool: + num_cuda_devices = torch.cuda.device_count() + context = mp.spawn( + _run_all_reduce_test, nprocs=num_cuda_devices, args=(num_cuda_devices,), join=False, ) - start = time.time() - joined = False - while not joined and (time.time() - start < timeout): - joined = context.join(timeout=5) - time.sleep(1) - - if not joined: - for pid in context.pids(): - _kill_process(pid) - print("not successful") # TODO + start = time.time() + success = False + while not success and (time.time() - start < timeout): + success = context.join(timeout=5) + time.sleep(1) - # TODO: relative dir - relative_dir = SYSTEM_CHECK_DIR.relative_to(Path.cwd()) - print(f"Find detailed logs at {relative_dir}") + if not success: + for pid in context.pids(): + _kill_process(pid) + return success -def _check_cuda_distributed(local_rank: int, world_size: int) -> None: +def _run_all_reduce_test(local_rank: int, world_size: int) -> None: os.environ["MASTER_ADDR"] = "localhost" os.environ["MASTER_PORT"] = "29500" os.environ["WORLD_SIZE"] = str(world_size) @@ -76,19 +94,17 @@ def _check_cuda_distributed(local_rank: int, world_size: int) -> None: _print0("done.") # TODO: remove - # if local_rank > 0: - # return + if local_rank > 0: + return - _print0( - "Synchronizing GPUs. If the program hangs for more than 30 seconds, there is a problem with your" - " multi-GPU setup." - ) + _print0("Synchronizing GPUs ... ", end="") dist.barrier() + _print0("done.") payload = torch.rand(100, 100, device=device) _print0("Running all-reduce test ... ", end="") dist.all_reduce(payload) - _print0("Done.") + _print0("done.") def _setup_logging() -> None: @@ -103,9 +119,9 @@ def _setup_logging() -> None: logger.addHandler(file_handler) -def _print0(*args: Any, **kwargs: Any) -> None: +def _print0(string: str, **kwargs: Any) -> None: if int(os.getenv("RANK", 0)) == 0: - print(*args, **kwargs) + print(string, **kwargs) def _collect_nvidia_smi_topo() -> str: From e613adc12648c6f59f31998dbd637a404699809e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 10 Mar 2024 11:30:41 -0400 Subject: [PATCH 09/24] update --- src/lightning/fabric/utilities/system_check.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/lightning/fabric/utilities/system_check.py b/src/lightning/fabric/utilities/system_check.py index 95b8e491cb98f..a453e554078c8 100644 --- a/src/lightning/fabric/utilities/system_check.py +++ b/src/lightning/fabric/utilities/system_check.py @@ -93,9 +93,9 @@ def _run_all_reduce_test(local_rank: int, world_size: int) -> None: ) _print0("done.") - # TODO: remove - if local_rank > 0: - return + # # TODO: remove + # if local_rank > 0: + # return _print0("Synchronizing GPUs ... 
", end="") dist.barrier() From fa7d3c1c321ea2e1121670374bbf7b1c796c4f52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 10 Mar 2024 17:06:42 -0400 Subject: [PATCH 10/24] update --- .../fabric/utilities/system_check.py | 69 ++++++++++++------- 1 file changed, 44 insertions(+), 25 deletions(-) diff --git a/src/lightning/fabric/utilities/system_check.py b/src/lightning/fabric/utilities/system_check.py index a453e554078c8..c739fb8a226d7 100644 --- a/src/lightning/fabric/utilities/system_check.py +++ b/src/lightning/fabric/utilities/system_check.py @@ -9,8 +9,12 @@ import torch.multiprocessing as mp import torch import time +from lightning_utilities.core.imports import RequirementCache -SYSTEM_CHECK_DIR = Path("./system_check") + +_psutil_available = RequirementCache("psutil") +_logger = logging.getLogger(__name__) +_system_check_dir = Path("./system_check") def main(timeout: int = 60) -> None: @@ -40,23 +44,24 @@ def main(timeout: int = 60) -> None: os.environ["NCCL_P2P_DISABLE"] = "1" success = _check_cuda_distributed(timeout) if not success: - print( - f"Disabling peer-to-peer transport did not fix the issue." - ) + print(f"Disabling peer-to-peer transport did not fix the issue.") else: print("Multi-GPU test successful.") - print(f"Find detailed logs at {SYSTEM_CHECK_DIR.absolute()}") + print(f"Find detailed logs at {_system_check_dir.absolute()}") def _check_cuda_distributed(timeout: int) -> bool: + if not _psutil_available: + raise ModuleNotFoundError(str(_psutil_available)) + num_cuda_devices = torch.cuda.device_count() context = mp.spawn( - _run_all_reduce_test, - nprocs=num_cuda_devices, - args=(num_cuda_devices,), - join=False, - ) + _run_all_reduce_test, + nprocs=num_cuda_devices, + args=(num_cuda_devices,), + join=False, + ) start = time.time() success = False @@ -77,7 +82,7 @@ def _run_all_reduce_test(local_rank: int, world_size: int) -> None: os.environ["RANK"] = str(local_rank) os.environ["LOCAL_RANK"] = str(local_rank) os.environ["NCCL_DEBUG"] = "INFO" - os.environ["NCCL_DEBUG_FILE"] = str(SYSTEM_CHECK_DIR / f"nccl-rank-{local_rank}.txt") + os.environ["NCCL_DEBUG_FILE"] = str(_system_check_dir / f"nccl-rank-{local_rank}.txt") device = torch.device("cuda", local_rank) torch.cuda.set_device(local_rank) @@ -108,20 +113,22 @@ def _run_all_reduce_test(local_rank: int, world_size: int) -> None: def _setup_logging() -> None: - if SYSTEM_CHECK_DIR.is_dir(): - shutil.rmtree(SYSTEM_CHECK_DIR) - SYSTEM_CHECK_DIR.mkdir() + if _system_check_dir.is_dir(): + shutil.rmtree(_system_check_dir) + _system_check_dir.mkdir() - logger = logging.getLogger() - logger.setLevel(logging.INFO) - file_handler = logging.FileHandler(str(SYSTEM_CHECK_DIR / "logs.txt")) + _logger.setLevel(logging.INFO) + file_handler = logging.FileHandler(str(_system_check_dir / "logs.txt")) file_handler.setLevel(logging.INFO) - logger.addHandler(file_handler) + console_handler = logging.StreamHandler() + console_handler.setLevel(logging.INFO) + _logger.addHandler(file_handler) + _logger.addHandler(console_handler) def _print0(string: str, **kwargs: Any) -> None: if int(os.getenv("RANK", 0)) == 0: - print(string, **kwargs) + _logger.info(string, **kwargs) def _collect_nvidia_smi_topo() -> str: @@ -132,22 +139,34 @@ def _collect_nvidia_smi() -> str: return subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout +# def _collect_nvidia_driver_version() -> str: +# result = subprocess.run( +# [ +# "nvidia-smi", +# "--query-gpu=driver_version", +# "--id=0", +# "--format=csv,noheader", +# ], +# 
capture_output=True, +# text=True, +# ) +# return result.stdout + + def _describe_nvidia_smi() -> None: - logger = logging.getLogger() - logger.info( + _logger.info( "Below is the output of `nvidia-smi`. It shows information about the GPUs that are installed on this machine," " the driver version, and the maximum supported CUDA version it can run.\n" ) - logger.info(_collect_nvidia_smi()) + _logger.info(_collect_nvidia_smi()) def _describe_gpu_connectivity() -> None: - logger = logging.getLogger() - logger.info( + _logger.info( "The matrix below shows how the GPUs in this machine are connected." " NVLink (NV) is the fastest connection, and is only available on high-end systems like V100 or A100.\n" ) - logger.info(_collect_nvidia_smi_topo()) + _logger.info(_collect_nvidia_smi_topo()) def _kill_process(pid: int) -> None: From 5ad949148fd36d2894472db9765e96245a4402e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Sun, 10 Mar 2024 17:52:47 -0400 Subject: [PATCH 11/24] update --- .../fabric/utilities/system_check.py | 69 +++++++++---------- 1 file changed, 33 insertions(+), 36 deletions(-) diff --git a/src/lightning/fabric/utilities/system_check.py b/src/lightning/fabric/utilities/system_check.py index c739fb8a226d7..72eb5857a8088 100644 --- a/src/lightning/fabric/utilities/system_check.py +++ b/src/lightning/fabric/utilities/system_check.py @@ -9,6 +9,7 @@ import torch.multiprocessing as mp import torch import time +import pkg_resources from lightning_utilities.core.imports import RequirementCache @@ -19,15 +20,16 @@ def main(timeout: int = 60) -> None: _setup_logging() + _collect_packages() + num_cuda_devices = torch.cuda.device_count() + if num_cuda_devices == 0: - print("Warning: Skipping system check because no GPUs were detected.") + _print0("Warning: Skipping system check because no GPUs were detected.") if num_cuda_devices == 1: - # TODO _describe_nvidia_smi() - pass if num_cuda_devices > 1: _describe_nvidia_smi() @@ -36,7 +38,7 @@ def main(timeout: int = 60) -> None: success = _check_cuda_distributed(timeout) if not success: - print( + _print0( f"The multi-GPU NCCL test did not finish within {timeout} seconds." " It looks like there is an issue with your multi-GPU setup." " Now trying to run again with `NCCL_P2P_DISABLE=1` set." @@ -44,11 +46,11 @@ def main(timeout: int = 60) -> None: os.environ["NCCL_P2P_DISABLE"] = "1" success = _check_cuda_distributed(timeout) if not success: - print(f"Disabling peer-to-peer transport did not fix the issue.") + _print0(f"Disabling peer-to-peer transport did not fix the issue.") else: - print("Multi-GPU test successful.") + _print0("Multi-GPU test successful.") - print(f"Find detailed logs at {_system_check_dir.absolute()}") + _print0(f"Find detailed logs at {_system_check_dir.absolute()}") def _check_cuda_distributed(timeout: int) -> bool: @@ -87,7 +89,7 @@ def _run_all_reduce_test(local_rank: int, world_size: int) -> None: device = torch.device("cuda", local_rank) torch.cuda.set_device(local_rank) - _print0("Setting up the process group ... ", end="") + _print0("Setting up the process group ...") dist.init_process_group( backend="nccl", world_size=world_size, @@ -96,20 +98,17 @@ def _run_all_reduce_test(local_rank: int, world_size: int) -> None: # which must be successful for this timeout to work. timeout=timedelta(seconds=10), ) - _print0("done.") - # # TODO: remove + # TODO: remove # if local_rank > 0: # return - - _print0("Synchronizing GPUs ... ", end="") + + _print0("Synchronizing GPUs ... 
") dist.barrier() - _print0("done.") payload = torch.rand(100, 100, device=device) - _print0("Running all-reduce test ... ", end="") + _print0("Running all-reduce test ...") dist.all_reduce(payload) - _print0("done.") def _setup_logging() -> None: @@ -126,9 +125,9 @@ def _setup_logging() -> None: _logger.addHandler(console_handler) -def _print0(string: str, **kwargs: Any) -> None: +def _print0(string: str) -> None: if int(os.getenv("RANK", 0)) == 0: - _logger.info(string, **kwargs) + _logger.info(string) def _collect_nvidia_smi_topo() -> str: @@ -139,20 +138,6 @@ def _collect_nvidia_smi() -> str: return subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout -# def _collect_nvidia_driver_version() -> str: -# result = subprocess.run( -# [ -# "nvidia-smi", -# "--query-gpu=driver_version", -# "--id=0", -# "--format=csv,noheader", -# ], -# capture_output=True, -# text=True, -# ) -# return result.stdout - - def _describe_nvidia_smi() -> None: _logger.info( "Below is the output of `nvidia-smi`. It shows information about the GPUs that are installed on this machine," @@ -164,23 +149,35 @@ def _describe_nvidia_smi() -> None: def _describe_gpu_connectivity() -> None: _logger.info( "The matrix below shows how the GPUs in this machine are connected." - " NVLink (NV) is the fastest connection, and is only available on high-end systems like V100 or A100.\n" + " NVLink (NV) is the fastest connection, and is only available on high-end systems like V100, A100, etc.\n" ) _logger.info(_collect_nvidia_smi_topo()) def _kill_process(pid: int) -> None: - import psutil # TODO + import psutil try: process = psutil.Process(pid) if process.is_running(): process.kill() - except psutil.NoSuchProcess: - pass - except psutil.AccessDenied: + except (psutil.NoSuchProcess, psutil.AccessDenied): pass +def _collect_packages() -> None: + packages = {} + for dist in pkg_resources.working_set: + package = dist.as_requirement() + packages[package.key] = package.specs[0][1] + + longest = max(len(p) for p in packages) + with open(_system_check_dir / "packages.txt", "w") as file: + for name in sorted(packages.keys()): + version = packages[name] + pad = " " * (longest - len(name)) + file.write(f"{name}{pad} {version}\n") + + if __name__ == '__main__': main() From c22fb2bd6d73857679554dd8bfd028bbe18e54c3 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Sun, 10 Mar 2024 23:03:06 +0100 Subject: [PATCH 12/24] update --- src/lightning/fabric/cli.py | 6 ++++++ .../fabric/utilities/system_check.py | 21 ++----------------- 2 files changed, 8 insertions(+), 19 deletions(-) diff --git a/src/lightning/fabric/cli.py b/src/lightning/fabric/cli.py index d8c6fe47b6630..5c6f0e4f09311 100644 --- a/src/lightning/fabric/cli.py +++ b/src/lightning/fabric/cli.py @@ -30,6 +30,7 @@ from lightning.fabric.utilities.device_parser import _parse_gpu_ids from lightning.fabric.utilities.distributed import _suggested_max_num_threads from lightning.fabric.utilities.load import _load_distributed_checkpoint +from lightning.fabric.utilities import system_check _log = logging.getLogger(__name__) @@ -188,6 +189,11 @@ def _consolidate(checkpoint_folder: str, output_file: Optional[str]) -> None: checkpoint = _load_distributed_checkpoint(config.checkpoint_folder) torch.save(checkpoint, config.output_file) + @_main.command("system-check") + def _system_check(): + """Run a system check to test your multi-GPU setup.""" + system_check.main() + def _set_env_variables(args: Namespace) -> None: """Set the environment variables for the new processes. 
diff --git a/src/lightning/fabric/utilities/system_check.py b/src/lightning/fabric/utilities/system_check.py index 72eb5857a8088..b282fbc7f7196 100644 --- a/src/lightning/fabric/utilities/system_check.py +++ b/src/lightning/fabric/utilities/system_check.py @@ -4,12 +4,10 @@ import subprocess from datetime import timedelta from pathlib import Path -from typing import Any import torch.distributed as dist import torch.multiprocessing as mp import torch import time -import pkg_resources from lightning_utilities.core.imports import RequirementCache @@ -20,11 +18,9 @@ def main(timeout: int = 60) -> None: _setup_logging() - _collect_packages() - + num_cuda_devices = torch.cuda.device_count() - if num_cuda_devices == 0: _print0("Warning: Skipping system check because no GPUs were detected.") @@ -123,6 +119,7 @@ def _setup_logging() -> None: console_handler.setLevel(logging.INFO) _logger.addHandler(file_handler) _logger.addHandler(console_handler) + _logger.propagate = False def _print0(string: str) -> None: @@ -165,19 +162,5 @@ def _kill_process(pid: int) -> None: pass -def _collect_packages() -> None: - packages = {} - for dist in pkg_resources.working_set: - package = dist.as_requirement() - packages[package.key] = package.specs[0][1] - - longest = max(len(p) for p in packages) - with open(_system_check_dir / "packages.txt", "w") as file: - for name in sorted(packages.keys()): - version = packages[name] - pad = " " * (longest - len(name)) - file.write(f"{name}{pad} {version}\n") - - if __name__ == '__main__': main() From 7a7f7e6928d44a3e3240b0a71b6b0193143495bc Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 10 Mar 2024 22:20:11 +0000 Subject: [PATCH 13/24] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/lightning/fabric/cli.py | 2 +- .../fabric/utilities/system_check.py | 20 +++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/lightning/fabric/cli.py b/src/lightning/fabric/cli.py index 5c6f0e4f09311..67a6ff2b6d7ef 100644 --- a/src/lightning/fabric/cli.py +++ b/src/lightning/fabric/cli.py @@ -26,11 +26,11 @@ from lightning.fabric.accelerators import CPUAccelerator, CUDAAccelerator, MPSAccelerator from lightning.fabric.plugins.precision.precision import _PRECISION_INPUT_STR, _PRECISION_INPUT_STR_ALIAS from lightning.fabric.strategies import STRATEGY_REGISTRY +from lightning.fabric.utilities import system_check from lightning.fabric.utilities.consolidate_checkpoint import _process_cli_args from lightning.fabric.utilities.device_parser import _parse_gpu_ids from lightning.fabric.utilities.distributed import _suggested_max_num_threads from lightning.fabric.utilities.load import _load_distributed_checkpoint -from lightning.fabric.utilities import system_check _log = logging.getLogger(__name__) diff --git a/src/lightning/fabric/utilities/system_check.py b/src/lightning/fabric/utilities/system_check.py index b282fbc7f7196..22e0e7c35958c 100644 --- a/src/lightning/fabric/utilities/system_check.py +++ b/src/lightning/fabric/utilities/system_check.py @@ -1,16 +1,16 @@ -import os import logging +import os import shutil import subprocess +import time from datetime import timedelta from pathlib import Path + +import torch import torch.distributed as dist import torch.multiprocessing as mp -import torch -import time from lightning_utilities.core.imports import RequirementCache - _psutil_available = RequirementCache("psutil") _logger = 
logging.getLogger(__name__) _system_check_dir = Path("./system_check") @@ -30,19 +30,19 @@ def main(timeout: int = 60) -> None: if num_cuda_devices > 1: _describe_nvidia_smi() _describe_gpu_connectivity() - + success = _check_cuda_distributed(timeout) - + if not success: _print0( f"The multi-GPU NCCL test did not finish within {timeout} seconds." " It looks like there is an issue with your multi-GPU setup." " Now trying to run again with `NCCL_P2P_DISABLE=1` set." - ) + ) os.environ["NCCL_P2P_DISABLE"] = "1" success = _check_cuda_distributed(timeout) if not success: - _print0(f"Disabling peer-to-peer transport did not fix the issue.") + _print0("Disabling peer-to-peer transport did not fix the issue.") else: _print0("Multi-GPU test successful.") @@ -90,7 +90,7 @@ def _run_all_reduce_test(local_rank: int, world_size: int) -> None: backend="nccl", world_size=world_size, rank=local_rank, - # NCCL gets initialized in the first collective call (e.g., barrier below), + # NCCL gets initialized in the first collective call (e.g., barrier below), # which must be successful for this timeout to work. timeout=timedelta(seconds=10), ) @@ -162,5 +162,5 @@ def _kill_process(pid: int) -> None: pass -if __name__ == '__main__': +if __name__ == "__main__": main() From 5c5b973e3df6c604393481ecc9a2f3cf8b7d2808 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Sun, 10 Mar 2024 23:24:23 +0100 Subject: [PATCH 14/24] update --- src/lightning/fabric/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lightning/fabric/cli.py b/src/lightning/fabric/cli.py index 67a6ff2b6d7ef..3b5070ee4daf0 100644 --- a/src/lightning/fabric/cli.py +++ b/src/lightning/fabric/cli.py @@ -190,7 +190,7 @@ def _consolidate(checkpoint_folder: str, output_file: Optional[str]) -> None: torch.save(checkpoint, config.output_file) @_main.command("system-check") - def _system_check(): + def _system_check() -> None: """Run a system check to test your multi-GPU setup.""" system_check.main() From 652677f8b27f79df10faeae71f2d344cce1d2ce2 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Sun, 10 Mar 2024 23:28:33 +0100 Subject: [PATCH 15/24] add license --- .../fabric/utilities/consolidate_checkpoint.py | 13 +++++++++++++ src/lightning/fabric/utilities/distributed.py | 13 +++++++++++++ src/lightning/fabric/utilities/seed.py | 13 +++++++++++++ src/lightning/fabric/utilities/spike.py | 13 +++++++++++++ src/lightning/fabric/utilities/system_check.py | 13 +++++++++++++ 5 files changed, 65 insertions(+) diff --git a/src/lightning/fabric/utilities/consolidate_checkpoint.py b/src/lightning/fabric/utilities/consolidate_checkpoint.py index 15d20d8d89ecc..7a94089c7a9cd 100644 --- a/src/lightning/fabric/utilities/consolidate_checkpoint.py +++ b/src/lightning/fabric/utilities/consolidate_checkpoint.py @@ -1,3 +1,16 @@ +# Copyright The Lightning AI team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import logging from argparse import ArgumentParser, Namespace from pathlib import Path diff --git a/src/lightning/fabric/utilities/distributed.py b/src/lightning/fabric/utilities/distributed.py index 30bfe4e254a07..599b185ffd105 100644 --- a/src/lightning/fabric/utilities/distributed.py +++ b/src/lightning/fabric/utilities/distributed.py @@ -1,3 +1,16 @@ +# Copyright The Lightning AI team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import contextlib import logging import os diff --git a/src/lightning/fabric/utilities/seed.py b/src/lightning/fabric/utilities/seed.py index b274bce88fcdf..c7003f1b05593 100644 --- a/src/lightning/fabric/utilities/seed.py +++ b/src/lightning/fabric/utilities/seed.py @@ -1,3 +1,16 @@ +# Copyright The Lightning AI team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import logging import os import random diff --git a/src/lightning/fabric/utilities/spike.py b/src/lightning/fabric/utilities/spike.py index 5dca5990064e8..6ccad8f9bf776 100644 --- a/src/lightning/fabric/utilities/spike.py +++ b/src/lightning/fabric/utilities/spike.py @@ -1,3 +1,16 @@ +# Copyright The Lightning AI team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import json import operator import os diff --git a/src/lightning/fabric/utilities/system_check.py b/src/lightning/fabric/utilities/system_check.py index 22e0e7c35958c..4234cbb2d8516 100644 --- a/src/lightning/fabric/utilities/system_check.py +++ b/src/lightning/fabric/utilities/system_check.py @@ -1,3 +1,16 @@ +# Copyright The Lightning AI team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. import logging import os import shutil From 589a97b5241c1c6ca751d036b83a3208ead3c9af Mon Sep 17 00:00:00 2001 From: awaelchli Date: Sun, 10 Mar 2024 23:38:14 +0100 Subject: [PATCH 16/24] update --- .azure/gpu-tests-fabric.yml | 11 +++++++++-- .azure/gpu-tests-pytorch.yml | 1 + .gitignore | 1 + src/lightning/fabric/cli.py | 6 +++--- tests/tests_fabric/run_standalone_tasks.sh | 23 ++++++++++++++++++++++ 5 files changed, 37 insertions(+), 5 deletions(-) create mode 100644 tests/tests_fabric/run_standalone_tasks.sh diff --git a/.azure/gpu-tests-fabric.yml b/.azure/gpu-tests-fabric.yml index ba86449e92355..b41b714edce32 100644 --- a/.azure/gpu-tests-fabric.yml +++ b/.azure/gpu-tests-fabric.yml @@ -134,14 +134,21 @@ jobs: - bash: python -m coverage run --source ${COVERAGE_SOURCE} -m pytest . -v --durations=50 workingDirectory: tests/tests_fabric/ - displayName: "Testing: fabric standard" + displayName: "Testing: Fabric standard" timeoutInMinutes: "10" - bash: bash ../run_standalone_tests.sh "." workingDirectory: tests/tests_fabric/ env: PL_STANDALONE_TESTS_SOURCE: $(COVERAGE_SOURCE) - displayName: "Testing: fabric standalone" + displayName: "Testing: Fabric standalone tests" + timeoutInMinutes: "10" + + - bash: bash run_standalone_tasks.sh + workingDirectory: tests/tests_fabric + env: + PL_USE_MOCKED_MNIST: "1" + displayName: "Testing: Fabric standalone tasks" timeoutInMinutes: "10" - bash: | diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml index b9ab6ead7f0d1..c795bac955334 100644 --- a/.azure/gpu-tests-pytorch.yml +++ b/.azure/gpu-tests-pytorch.yml @@ -24,6 +24,7 @@ pr: - "examples/run_pl_examples.sh" - "examples/pytorch/basics/backbone_image_classifier.py" - "examples/pytorch/basics/autoencoder.py" + - "tests/run_standalone_*.sh" - "requirements/pytorch/**" - "src/lightning/__init__.py" - "src/lightning/__setup__.py" diff --git a/.gitignore b/.gitignore index de1de44fec235..2ace5d1151c5e 100644 --- a/.gitignore +++ b/.gitignore @@ -175,6 +175,7 @@ wandb *.prof *.tar.gz .neptune/ +system_check/ # dataset generated from bolts in examples. cifar-10-batches-py diff --git a/src/lightning/fabric/cli.py b/src/lightning/fabric/cli.py index 3b5070ee4daf0..ffbcf7ade6a9a 100644 --- a/src/lightning/fabric/cli.py +++ b/src/lightning/fabric/cli.py @@ -189,9 +189,9 @@ def _consolidate(checkpoint_folder: str, output_file: Optional[str]) -> None: checkpoint = _load_distributed_checkpoint(config.checkpoint_folder) torch.save(checkpoint, config.output_file) - @_main.command("system-check") - def _system_check() -> None: - """Run a system check to test your multi-GPU setup.""" + @_main.command("diagnose") + def _diagnose() -> None: + """Diagnose issues with your multi-GPU setup.""" system_check.main() diff --git a/tests/tests_fabric/run_standalone_tasks.sh b/tests/tests_fabric/run_standalone_tasks.sh new file mode 100644 index 0000000000000..63ff0c0d301cd --- /dev/null +++ b/tests/tests_fabric/run_standalone_tasks.sh @@ -0,0 +1,23 @@ +#!/bin/bash +# Copyright The Lightning AI team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -e +# THIS FILE ASSUMES IT IS RUN INSIDE THE tests/tests_pytorch DIRECTORY + +# this environment variable allows special tests to run +export PL_RUN_STANDALONE_TESTS=1 + +# test that a user can manually launch individual processes +echo "Running system check" +fabric diagnose From bf90dff5b4c41ec5fbbd6f6679206cd9948c0273 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Mon, 11 Mar 2024 00:18:55 +0100 Subject: [PATCH 17/24] tests --- src/lightning/fabric/utilities/system_check.py | 12 ++++-------- .../utilities/test_system_check.py | 18 ++++++++++++++++++ 2 files changed, 22 insertions(+), 8 deletions(-) create mode 100644 tests/tests_fabric/utilities/test_system_check.py diff --git a/src/lightning/fabric/utilities/system_check.py b/src/lightning/fabric/utilities/system_check.py index 4234cbb2d8516..ff7ac4e18bced 100644 --- a/src/lightning/fabric/utilities/system_check.py +++ b/src/lightning/fabric/utilities/system_check.py @@ -20,7 +20,7 @@ from pathlib import Path import torch -import torch.distributed as dist +import torch.distributed import torch.multiprocessing as mp from lightning_utilities.core.imports import RequirementCache @@ -99,7 +99,7 @@ def _run_all_reduce_test(local_rank: int, world_size: int) -> None: torch.cuda.set_device(local_rank) _print0("Setting up the process group ...") - dist.init_process_group( + torch.distributed.init_process_group( backend="nccl", world_size=world_size, rank=local_rank, @@ -108,16 +108,12 @@ def _run_all_reduce_test(local_rank: int, world_size: int) -> None: timeout=timedelta(seconds=10), ) - # TODO: remove - # if local_rank > 0: - # return - _print0("Synchronizing GPUs ... 
") - dist.barrier() + torch.distributed.barrier() payload = torch.rand(100, 100, device=device) _print0("Running all-reduce test ...") - dist.all_reduce(payload) + torch.distributed.all_reduce(payload) def _setup_logging() -> None: diff --git a/tests/tests_fabric/utilities/test_system_check.py b/tests/tests_fabric/utilities/test_system_check.py new file mode 100644 index 0000000000000..a8578f110d352 --- /dev/null +++ b/tests/tests_fabric/utilities/test_system_check.py @@ -0,0 +1,18 @@ +import os +from unittest import mock + +import torch + +from lightning.fabric.utilities.system_check import _run_all_reduce_test + + +@mock.patch.dict(os.environ, {}, clear=True) +@mock.patch("lightning.fabric.utilities.system_check.torch.device", return_value=torch.device("cpu")) +@mock.patch("lightning.fabric.utilities.system_check.torch.cuda.set_device") +@mock.patch("lightning.fabric.utilities.system_check.torch.distributed") +def test_run_all_reduce_test(dist_mock, set_device_mock, __): + _run_all_reduce_test(local_rank=1, world_size=4) + set_device_mock.assert_called_once() + dist_mock.init_process_group.assert_called_once() + dist_mock.barrier.assert_called_once() + dist_mock.all_reduce.assert_called_once() From ea01ac28a372e824a4c14daf8abeaa2ee8f8460a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 10 Mar 2024 23:19:32 +0000 Subject: [PATCH 18/24] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/tests_fabric/utilities/test_system_check.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/tests_fabric/utilities/test_system_check.py b/tests/tests_fabric/utilities/test_system_check.py index a8578f110d352..1efeae98b71f6 100644 --- a/tests/tests_fabric/utilities/test_system_check.py +++ b/tests/tests_fabric/utilities/test_system_check.py @@ -2,7 +2,6 @@ from unittest import mock import torch - from lightning.fabric.utilities.system_check import _run_all_reduce_test From a7faaed800139577057766e8256e677ba93958ea Mon Sep 17 00:00:00 2001 From: awaelchli Date: Mon, 11 Mar 2024 00:31:31 +0100 Subject: [PATCH 19/24] troubleshooting --- .../guide/multi_node/barebones.rst | 50 +----------- docs/source-fabric/guide/troubleshooting.rst | 79 +++++++++++++++++++ 2 files changed, 81 insertions(+), 48 deletions(-) create mode 100644 docs/source-fabric/guide/troubleshooting.rst diff --git a/docs/source-fabric/guide/multi_node/barebones.rst b/docs/source-fabric/guide/multi_node/barebones.rst index a251df230174c..6a43460b9865a 100644 --- a/docs/source-fabric/guide/multi_node/barebones.rst +++ b/docs/source-fabric/guide/multi_node/barebones.rst @@ -110,52 +110,6 @@ After executing these commands, you should immediately see an output like this: Troubleshooting *************** - -**My program is stuck initializing at startup. What is causing this?** - -You are seeing a message like this in the logs, but nothing happens: - -.. code-block:: - - Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/4 - -The most likely reasons and how to fix it: - -- **Wrong network interface:** Some servers have multiple network interfaces. - There is usually only one that can send and receive traffic from the network of the other nodes, but sometimes it is not set as the default. - In this case, you need to set it manually: - - .. code-block:: bash - - export GLOO_SOCKET_IFNAME=eno1 - export NCCL_SOCKET_IFNAME=eno1 - fabric run ... 
- - You can find the interface name by parsing the output of the ``ifconfig`` command. - The name of this interface **may differ on each node**. - -- **NCCL can't communicate between the nodes:** - - Follow the steps in the `NCCL troubleshooting guide `_. - In particular, take note of the network section that describes restricting the port range and firewall rules. - - .. code-block:: bash - - echo "net.ipv4.ip_local_port_range = 50000 51000" >> /etc/sysctl.conf - sysctl --system - ufw allow 50000:51000/tcp - - -**My program crashes with an NCCL error, but it is not helpful** - -Launch your command by prepending ``NCCL_DEBUG=INFO`` to get more info. - -.. code-block:: bash - - NCCL_DEBUG=INFO fabric run ... - - ----- - -If you are sick of troubleshooting cluster problems, give :doc:`Lightning cloud <./cloud>` a try! +Please refer to the :doc:`troubleshooting guide <../troubleshooting>` guide if you are experiencing issues related to multi-node training hanging or crashing. +If you are sick of troubleshooting cluster problems, give :doc:`Lightning Studios <./cloud>` a try! For other questions, please don't hesitate to join the `Discord `_. diff --git a/docs/source-fabric/guide/troubleshooting.rst b/docs/source-fabric/guide/troubleshooting.rst new file mode 100644 index 0000000000000..9ba134a3306c3 --- /dev/null +++ b/docs/source-fabric/guide/troubleshooting.rst @@ -0,0 +1,79 @@ +############### +Troubleshooting +############### + + +---- + + +********* +Multi-GPU +********* + + +**My program is stuck initializing at startup. What is causing this?** + +You are seeing a message like this in the logs, but nothing happens: + +.. code-block:: + + Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/4 + +The most likely reasons and how to fix it: + + +.. code-block:: bash + + fabric diagnose + + +---- + + +********** +Multi-node +********** + + +**My program is stuck initializing at startup. What is causing this?** + +You are seeing a message like this in the logs, but nothing happens: + +.. code-block:: + + Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/4 + +The most likely reasons and how to fix it: + +- **Wrong network interface:** Some servers have multiple network interfaces. + There is usually only one that can send and receive traffic from the network of the other nodes, but sometimes it is not set as the default. + In this case, you need to set it manually: + + .. code-block:: bash + + export GLOO_SOCKET_IFNAME=eno1 + export NCCL_SOCKET_IFNAME=eno1 + fabric run ... + + You can find the interface name by parsing the output of the ``ifconfig`` command. + The name of this interface **may differ on each node**. + +- **NCCL can't communicate between the nodes:** + + Follow the steps in the `NCCL troubleshooting guide `_. + In particular, take note of the network section that describes restricting the port range and firewall rules. + + .. code-block:: bash + + echo "net.ipv4.ip_local_port_range = 50000 51000" >> /etc/sysctl.conf + sysctl --system + ufw allow 50000:51000/tcp + + +**My program crashes with an NCCL error, but it is not helpful** + +Launch your command by prepending ``NCCL_DEBUG=INFO`` to get more info. + +.. code-block:: bash + + NCCL_DEBUG=INFO fabric run ... 
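The troubleshooting page added in this patch tells readers to run ``fabric diagnose``, the CLI command wired up to ``system_check.main()`` earlier in the series. For orientation, a minimal sketch of triggering the same check programmatically, assuming only the module path and the ``main(timeout: int = 60)`` signature shown in the diffs above; this is a sketch, not an API documented beyond these patches:

    # Sketch only: this is the function behind `fabric diagnose` in these patches.
    # It logs `nvidia-smi` output and, on multi-GPU machines, runs an NCCL all-reduce
    # test under a timeout, retrying with NCCL peer-to-peer transport disabled if the
    # first attempt does not succeed.
    from lightning.fabric.utilities.system_check import main

    if __name__ == "__main__":
        main(timeout=60)  # same default timeout the CLI entry point relies on
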
From 8c8d23b000acb938afb3dd64d20cfa9d317d82ac Mon Sep 17 00:00:00 2001 From: awaelchli Date: Mon, 11 Mar 2024 02:51:24 +0100 Subject: [PATCH 20/24] docs --- docs/source-fabric/guide/troubleshooting.rst | 48 +++++++++++-------- .../fabric/utilities/system_check.py | 6 ++- 2 files changed, 33 insertions(+), 21 deletions(-) diff --git a/docs/source-fabric/guide/troubleshooting.rst b/docs/source-fabric/guide/troubleshooting.rst index 9ba134a3306c3..28a841dbe6bb8 100644 --- a/docs/source-fabric/guide/troubleshooting.rst +++ b/docs/source-fabric/guide/troubleshooting.rst @@ -2,6 +2,8 @@ Troubleshooting ############### +Learn how to troubleshoot possible causes for common issues related to CUDA, NCCL, and distributed training. + ---- @@ -10,22 +12,38 @@ Troubleshooting Multi-GPU ********* - -**My program is stuck initializing at startup. What is causing this?** - -You are seeing a message like this in the logs, but nothing happens: +If your program is stuck at .. code-block:: Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/4 -The most likely reasons and how to fix it: - +it indicates that PyTorch can't set up the communication between GPUs, and that your system is not configured correctly. +Run the `diagnose` command from the Fabric CLI to investigate: .. code-block:: bash fabric diagnose +This tool will run basic multi-GPU tests using only PyTorch. +Any issues raised here will confirm that the problem is with your system and not with Lightning. +Common solutions: + +- **Wrong driver version:** The NVIDIA driver for your GPU is too old or too new. + You can check the version of the driver by running + + .. code-block:: bash + + nvidia-smi --id=0 --query-gpu=driver_version --format=csv,noheader + + *Solution*: Install a recent driver. + Search online for instructions how to update the driver on your platform. + +- **Peer-to-peer connection is broken:** The GPUs can't communicate with each other. + *Solution*: Try to set the environment variable ``NCCL_P2P_DISABLE=1``. + If you rerun your scipt and it fixes the problem, this means that peer-to-peer transport is not working properly (your training will run but it will be slow). + This is likely because of driver compatibility issues (see above) or because your GPU does not support peer-to-peer (e.g., certain RTX cards). + ---- @@ -34,16 +52,15 @@ The most likely reasons and how to fix it: Multi-node ********** - -**My program is stuck initializing at startup. What is causing this?** - -You are seeing a message like this in the logs, but nothing happens: +Before troubleshooting multi-node connectivity issues, first ensure that multi-GPU within a single machine is working correctly by following the steps above. +If single-node execution works, but multi-node hangs at .. code-block:: Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/4 -The most likely reasons and how to fix it: +it indicates that there is a connection issue between the nodes. +Common solutions: - **Wrong network interface:** Some servers have multiple network interfaces. There is usually only one that can send and receive traffic from the network of the other nodes, but sometimes it is not set as the default. @@ -68,12 +85,3 @@ The most likely reasons and how to fix it: echo "net.ipv4.ip_local_port_range = 50000 51000" >> /etc/sysctl.conf sysctl --system ufw allow 50000:51000/tcp - - -**My program crashes with an NCCL error, but it is not helpful** - -Launch your command by prepending ``NCCL_DEBUG=INFO`` to get more info. - -.. code-block:: bash - - NCCL_DEBUG=INFO fabric run ... 
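The documentation hunk above recommends checking the installed NVIDIA driver version with ``nvidia-smi``. For reference, a small illustrative helper that performs the same query from Python, modelled on the commented-out ``_collect_nvidia_driver_version`` function that briefly appeared in an earlier revision of ``system_check.py`` in this series (illustrative only, not part of the final module):

    # Illustrative sketch: query the driver version for GPU 0, equivalent to
    # `nvidia-smi --id=0 --query-gpu=driver_version --format=csv,noheader`.
    import subprocess

    def collect_nvidia_driver_version() -> str:
        result = subprocess.run(
            ["nvidia-smi", "--id=0", "--query-gpu=driver_version", "--format=csv,noheader"],
            capture_output=True,
            text=True,
        )
        return result.stdout.strip()
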
diff --git a/src/lightning/fabric/utilities/system_check.py b/src/lightning/fabric/utilities/system_check.py index ff7ac4e18bced..e9c52a27be7d8 100644 --- a/src/lightning/fabric/utilities/system_check.py +++ b/src/lightning/fabric/utilities/system_check.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + import logging import os import shutil @@ -54,7 +55,10 @@ def main(timeout: int = 60) -> None: ) os.environ["NCCL_P2P_DISABLE"] = "1" success = _check_cuda_distributed(timeout) - if not success: + if success: + _print0("Disabling peer-to-peer transport fixed the issue.") + # TODO: Give advice + else: _print0("Disabling peer-to-peer transport did not fix the issue.") else: _print0("Multi-GPU test successful.") From 1f7cf52177318e4e0c2bbc50128b280b3db15f94 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Mon, 11 Mar 2024 02:57:22 +0100 Subject: [PATCH 21/24] links --- docs/source-fabric/fundamentals/launch.rst | 9 +++++++++ docs/source-fabric/glossary/index.rst | 6 ++++++ 2 files changed, 15 insertions(+) diff --git a/docs/source-fabric/fundamentals/launch.rst b/docs/source-fabric/fundamentals/launch.rst index efde3f54fe846..784011e533e13 100644 --- a/docs/source-fabric/fundamentals/launch.rst +++ b/docs/source-fabric/fundamentals/launch.rst @@ -237,6 +237,15 @@ Next steps :height: 160 :tag: advanced +.. displayitem:: + :header: Troubleshooting + :description: Learn how to troubleshoot common multi-GPU issues + :button_link: ../guide/troubleshooting.html + :col_css: col-md-4 + :height: 160 + :tag: advanced + + .. raw:: html diff --git a/docs/source-fabric/glossary/index.rst b/docs/source-fabric/glossary/index.rst index b08bc4f830163..4f9a683db8c03 100644 --- a/docs/source-fabric/glossary/index.rst +++ b/docs/source-fabric/glossary/index.rst @@ -8,6 +8,7 @@ Glossary Checkpoint <../guide/checkpoint/index> Weights and Biases <../guide/loggers/wandb> + Troubleshooting <../guide/troubleshooting> .. raw:: html @@ -150,6 +151,11 @@ Glossary :button_link: ../fundamentals/launch.html :col_css: col-md-4 +.. displayitem:: + :header: NCCL + :button_link: ../guide/troubleshoot.html + :col_css: col-md-4 + .. displayitem:: :header: Notebook :button_link: ../launch/notebook.html From 297e9809d2da7ec2abb0f2e7c5e6c371ae0eaac8 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Mon, 11 Mar 2024 03:00:14 +0100 Subject: [PATCH 22/24] link --- docs/source-fabric/glossary/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source-fabric/glossary/index.rst b/docs/source-fabric/glossary/index.rst index 4f9a683db8c03..e5bc92cad9ceb 100644 --- a/docs/source-fabric/glossary/index.rst +++ b/docs/source-fabric/glossary/index.rst @@ -153,7 +153,7 @@ Glossary .. displayitem:: :header: NCCL - :button_link: ../guide/troubleshoot.html + :button_link: ../guide/troubleshooting.html :col_css: col-md-4 .. 
displayitem:: From cd63115a76d15473dd5b258d4108f113ead687b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 14 Mar 2024 20:43:08 -0400 Subject: [PATCH 23/24] update --- .../fabric/utilities/system_check.py | 42 +++++++++++++------ 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/src/lightning/fabric/utilities/system_check.py b/src/lightning/fabric/utilities/system_check.py index e9c52a27be7d8..ed7898302237a 100644 --- a/src/lightning/fabric/utilities/system_check.py +++ b/src/lightning/fabric/utilities/system_check.py @@ -23,6 +23,7 @@ import torch import torch.distributed import torch.multiprocessing as mp +from torch.multiprocessing.spawn import ProcessRaisedException from lightning_utilities.core.imports import RequirementCache _psutil_available = RequirementCache("psutil") @@ -40,6 +41,7 @@ def main(timeout: int = 60) -> None: if num_cuda_devices == 1: _describe_nvidia_smi() + # _check_cuda() if num_cuda_devices > 1: _describe_nvidia_smi() @@ -48,19 +50,24 @@ def main(timeout: int = 60) -> None: success = _check_cuda_distributed(timeout) if not success: + env = { + "NCCL_P2P_DISABLE": "1", + "NCCL_NET_PLUGIN": "none", + } _print0( - f"The multi-GPU NCCL test did not finish within {timeout} seconds." + f"The multi-GPU NCCL test did not succeed." " It looks like there is an issue with your multi-GPU setup." - " Now trying to run again with `NCCL_P2P_DISABLE=1` set." + " Now trying to run again with NCCL features disabled." ) - os.environ["NCCL_P2P_DISABLE"] = "1" + os.environ.update(env) success = _check_cuda_distributed(timeout) if success: - _print0("Disabling peer-to-peer transport fixed the issue.") - # TODO: Give advice + _print0("Disabling the following NCCL features seems to have fixed the issue:") + _print_env_variables(env) else: - _print0("Disabling peer-to-peer transport did not fix the issue.") - else: + _print0("Disabling NCCL features did not fix the issue.") + + if success: _print0("Multi-GPU test successful.") _print0(f"Find detailed logs at {_system_check_dir.absolute()}") @@ -81,7 +88,13 @@ def _check_cuda_distributed(timeout: int) -> bool: start = time.time() success = False while not success and (time.time() - start < timeout): - success = context.join(timeout=5) + try: + success = context.join(timeout=5) + except ProcessRaisedException as e: + _logger.debug(str(e)) + success = False + break + time.sleep(1) if not success: @@ -125,9 +138,9 @@ def _setup_logging() -> None: shutil.rmtree(_system_check_dir) _system_check_dir.mkdir() - _logger.setLevel(logging.INFO) + _logger.setLevel(logging.DEBUG) file_handler = logging.FileHandler(str(_system_check_dir / "logs.txt")) - file_handler.setLevel(logging.INFO) + file_handler.setLevel(logging.DEBUG) console_handler = logging.StreamHandler() console_handler.setLevel(logging.INFO) _logger.addHandler(file_handler) @@ -140,6 +153,11 @@ def _print0(string: str) -> None: _logger.info(string) +def _print_env_variables(env: dict) -> None: + for k, v in env.items(): + _print0(f"{k}={v}") + + def _collect_nvidia_smi_topo() -> str: return subprocess.run(["nvidia-smi", "topo", "-m"], capture_output=True, text=True).stdout @@ -157,11 +175,11 @@ def _describe_nvidia_smi() -> None: def _describe_gpu_connectivity() -> None: - _logger.info( + _logger.debug( "The matrix below shows how the GPUs in this machine are connected." 
" NVLink (NV) is the fastest connection, and is only available on high-end systems like V100, A100, etc.\n" ) - _logger.info(_collect_nvidia_smi_topo()) + _logger.debug(_collect_nvidia_smi_topo()) def _kill_process(pid: int) -> None: From a25eaf8be8e58559ad2f486167832905497b76ca Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 15 Mar 2024 00:43:46 +0000 Subject: [PATCH 24/24] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/lightning/fabric/utilities/system_check.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/lightning/fabric/utilities/system_check.py b/src/lightning/fabric/utilities/system_check.py index ed7898302237a..5453ecae13e61 100644 --- a/src/lightning/fabric/utilities/system_check.py +++ b/src/lightning/fabric/utilities/system_check.py @@ -23,8 +23,8 @@ import torch import torch.distributed import torch.multiprocessing as mp -from torch.multiprocessing.spawn import ProcessRaisedException from lightning_utilities.core.imports import RequirementCache +from torch.multiprocessing.spawn import ProcessRaisedException _psutil_available = RequirementCache("psutil") _logger = logging.getLogger(__name__) @@ -55,7 +55,7 @@ def main(timeout: int = 60) -> None: "NCCL_NET_PLUGIN": "none", } _print0( - f"The multi-GPU NCCL test did not succeed." + "The multi-GPU NCCL test did not succeed." " It looks like there is an issue with your multi-GPU setup." " Now trying to run again with NCCL features disabled." ) @@ -94,7 +94,7 @@ def _check_cuda_distributed(timeout: int) -> bool: _logger.debug(str(e)) success = False break - + time.sleep(1) if not success:
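Taken together, the series adds a ``system_check`` module and documents the ``fabric diagnose`` entry point that exercises it. As a rough sketch only, assuming the module stays importable as ``lightning.fabric.utilities.system_check`` (the path shown in the diffs) and keeps the ``main(timeout=...)`` signature introduced in the later patches, the check could also be driven directly from Python:

# Sketch only: drive the system check directly instead of via `fabric diagnose`.
# Assumes the module path and `main(timeout=...)` signature shown in the diffs above.
from lightning.fabric.utilities.system_check import main

if __name__ == "__main__":
    # Allow up to two minutes for the multi-GPU NCCL test before it is reported as failed.
    main(timeout=120)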