diff --git a/.azure/gpu-tests-fabric.yml b/.azure/gpu-tests-fabric.yml index ba86449e92355..b41b714edce32 100644 --- a/.azure/gpu-tests-fabric.yml +++ b/.azure/gpu-tests-fabric.yml @@ -134,14 +134,21 @@ jobs: - bash: python -m coverage run --source ${COVERAGE_SOURCE} -m pytest . -v --durations=50 workingDirectory: tests/tests_fabric/ - displayName: "Testing: fabric standard" + displayName: "Testing: Fabric standard" timeoutInMinutes: "10" - bash: bash ../run_standalone_tests.sh "." workingDirectory: tests/tests_fabric/ env: PL_STANDALONE_TESTS_SOURCE: $(COVERAGE_SOURCE) - displayName: "Testing: fabric standalone" + displayName: "Testing: Fabric standalone tests" + timeoutInMinutes: "10" + + - bash: bash run_standalone_tasks.sh + workingDirectory: tests/tests_fabric + env: + PL_USE_MOCKED_MNIST: "1" + displayName: "Testing: Fabric standalone tasks" timeoutInMinutes: "10" - bash: | diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml index b9ab6ead7f0d1..c795bac955334 100644 --- a/.azure/gpu-tests-pytorch.yml +++ b/.azure/gpu-tests-pytorch.yml @@ -24,6 +24,7 @@ pr: - "examples/run_pl_examples.sh" - "examples/pytorch/basics/backbone_image_classifier.py" - "examples/pytorch/basics/autoencoder.py" + - "tests/run_standalone_*.sh" - "requirements/pytorch/**" - "src/lightning/__init__.py" - "src/lightning/__setup__.py" diff --git a/.gitignore b/.gitignore index de1de44fec235..2ace5d1151c5e 100644 --- a/.gitignore +++ b/.gitignore @@ -175,6 +175,7 @@ wandb *.prof *.tar.gz .neptune/ +system_check/ # dataset generated from bolts in examples. cifar-10-batches-py diff --git a/docs/source-fabric/fundamentals/launch.rst b/docs/source-fabric/fundamentals/launch.rst index efde3f54fe846..784011e533e13 100644 --- a/docs/source-fabric/fundamentals/launch.rst +++ b/docs/source-fabric/fundamentals/launch.rst @@ -237,6 +237,15 @@ Next steps :height: 160 :tag: advanced +.. displayitem:: + :header: Troubleshooting + :description: Learn how to troubleshoot common multi-GPU issues + :button_link: ../guide/troubleshooting.html + :col_css: col-md-4 + :height: 160 + :tag: advanced + + .. raw:: html diff --git a/docs/source-fabric/glossary/index.rst b/docs/source-fabric/glossary/index.rst index b08bc4f830163..e5bc92cad9ceb 100644 --- a/docs/source-fabric/glossary/index.rst +++ b/docs/source-fabric/glossary/index.rst @@ -8,6 +8,7 @@ Glossary Checkpoint <../guide/checkpoint/index> Weights and Biases <../guide/loggers/wandb> + Troubleshooting <../guide/troubleshooting> .. raw:: html @@ -150,6 +151,11 @@ Glossary :button_link: ../fundamentals/launch.html :col_css: col-md-4 +.. displayitem:: + :header: NCCL + :button_link: ../guide/troubleshooting.html + :col_css: col-md-4 + .. displayitem:: :header: Notebook :button_link: ../launch/notebook.html diff --git a/docs/source-fabric/guide/multi_node/barebones.rst b/docs/source-fabric/guide/multi_node/barebones.rst index a251df230174c..6a43460b9865a 100644 --- a/docs/source-fabric/guide/multi_node/barebones.rst +++ b/docs/source-fabric/guide/multi_node/barebones.rst @@ -110,52 +110,6 @@ After executing these commands, you should immediately see an output like this: Troubleshooting *************** - -**My program is stuck initializing at startup. What is causing this?** - -You are seeing a message like this in the logs, but nothing happens: - -.. code-block:: - - Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/4 - -The most likely reasons and how to fix it: - -- **Wrong network interface:** Some servers have multiple network interfaces. 
-  There is usually only one that can send and receive traffic from the network of the other nodes, but sometimes it is not set as the default.
-  In this case, you need to set it manually:
-
-  .. code-block:: bash
-
-    export GLOO_SOCKET_IFNAME=eno1
-    export NCCL_SOCKET_IFNAME=eno1
-    fabric run ...
-
-  You can find the interface name by parsing the output of the ``ifconfig`` command.
-  The name of this interface **may differ on each node**.
-
-- **NCCL can't communicate between the nodes:**
-
-  Follow the steps in the `NCCL troubleshooting guide `_.
-  In particular, take note of the network section that describes restricting the port range and firewall rules.
-
-  .. code-block:: bash
-
-    echo "net.ipv4.ip_local_port_range = 50000 51000" >> /etc/sysctl.conf
-    sysctl --system
-    ufw allow 50000:51000/tcp
-
-
-**My program crashes with an NCCL error, but it is not helpful**
-
-Launch your command by prepending ``NCCL_DEBUG=INFO`` to get more info.
-
-.. code-block:: bash
-
-    NCCL_DEBUG=INFO fabric run ...
-
-
-----
-
-If you are sick of troubleshooting cluster problems, give :doc:`Lightning cloud <./cloud>` a try!
+Please refer to the :doc:`troubleshooting guide <../troubleshooting>` if your multi-node training hangs or crashes.
+If you are sick of troubleshooting cluster problems, give :doc:`Lightning Studios <./cloud>` a try!
 For other questions, please don't hesitate to join the `Discord `_.
diff --git a/docs/source-fabric/guide/troubleshooting.rst b/docs/source-fabric/guide/troubleshooting.rst
new file mode 100644
index 0000000000000..28a841dbe6bb8
--- /dev/null
+++ b/docs/source-fabric/guide/troubleshooting.rst
@@ -0,0 +1,87 @@
+###############
+Troubleshooting
+###############
+
+Learn how to troubleshoot possible causes for common issues related to CUDA, NCCL, and distributed training.
+
+
+----
+
+
+*********
+Multi-GPU
+*********
+
+If your program is stuck at
+
+.. code-block::
+
+    Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/4
+
+it indicates that PyTorch can't set up the communication between GPUs, and that your system is not configured correctly.
+Run the ``diagnose`` command from the Fabric CLI to investigate:
+
+.. code-block:: bash
+
+    fabric diagnose
+
+This tool will run basic multi-GPU tests using only PyTorch.
+Any issues raised here will confirm that the problem is with your system and not with Lightning.
+Common solutions:
+
+- **Wrong driver version:** The NVIDIA driver for your GPU is too old or too new.
+  You can check the version of the driver by running
+
+  .. code-block:: bash
+
+    nvidia-smi --id=0 --query-gpu=driver_version --format=csv,noheader
+
+  *Solution*: Install a recent driver.
+  Search online for instructions on how to update the driver on your platform.
+
+- **Peer-to-peer connection is broken:** The GPUs can't communicate with each other.
+  *Solution*: Try to set the environment variable ``NCCL_P2P_DISABLE=1``.
+  If you rerun your script and it fixes the problem, this means that peer-to-peer transport is not working properly (your training will run but it will be slow).
+  This is likely because of driver compatibility issues (see above) or because your GPU does not support peer-to-peer (e.g., certain RTX cards).
+
+
+----
+
+
+**********
+Multi-node
+**********
+
+Before troubleshooting multi-node connectivity issues, first ensure that multi-GPU within a single machine is working correctly by following the steps above.
+If single-node execution works, but multi-node hangs at
+
+.. code-block::
+
+    Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/4
+
+it indicates that there is a connection issue between the nodes.
+Common solutions:
+
+- **Wrong network interface:** Some servers have multiple network interfaces.
+  There is usually only one that can send and receive traffic from the network of the other nodes, but sometimes it is not set as the default.
+  In this case, you need to set it manually:
+
+  .. code-block:: bash
+
+    export GLOO_SOCKET_IFNAME=eno1
+    export NCCL_SOCKET_IFNAME=eno1
+    fabric run ...
+
+  You can find the interface name by parsing the output of the ``ifconfig`` command.
+  The name of this interface **may differ on each node**.
+
+- **NCCL can't communicate between the nodes:**
+
+  Follow the steps in the `NCCL troubleshooting guide `_.
+  In particular, take note of the network section that describes restricting the port range and firewall rules.
+
+  .. code-block:: bash
+
+    echo "net.ipv4.ip_local_port_range = 50000 51000" >> /etc/sysctl.conf
+    sysctl --system
+    ufw allow 50000:51000/tcp
diff --git a/src/lightning/fabric/cli.py b/src/lightning/fabric/cli.py
index d8c6fe47b6630..ffbcf7ade6a9a 100644
--- a/src/lightning/fabric/cli.py
+++ b/src/lightning/fabric/cli.py
@@ -26,6 +26,7 @@
 from lightning.fabric.accelerators import CPUAccelerator, CUDAAccelerator, MPSAccelerator
 from lightning.fabric.plugins.precision.precision import _PRECISION_INPUT_STR, _PRECISION_INPUT_STR_ALIAS
 from lightning.fabric.strategies import STRATEGY_REGISTRY
+from lightning.fabric.utilities import system_check
 from lightning.fabric.utilities.consolidate_checkpoint import _process_cli_args
 from lightning.fabric.utilities.device_parser import _parse_gpu_ids
 from lightning.fabric.utilities.distributed import _suggested_max_num_threads
@@ -188,6 +189,11 @@ def _consolidate(checkpoint_folder: str, output_file: Optional[str]) -> None:
         checkpoint = _load_distributed_checkpoint(config.checkpoint_folder)
         torch.save(checkpoint, config.output_file)
 
+    @_main.command("diagnose")
+    def _diagnose() -> None:
+        """Diagnose issues with your multi-GPU setup."""
+        system_check.main()
+
 
 def _set_env_variables(args: Namespace) -> None:
     """Set the environment variables for the new processes.
diff --git a/src/lightning/fabric/utilities/consolidate_checkpoint.py b/src/lightning/fabric/utilities/consolidate_checkpoint.py
index 15d20d8d89ecc..7a94089c7a9cd 100644
--- a/src/lightning/fabric/utilities/consolidate_checkpoint.py
+++ b/src/lightning/fabric/utilities/consolidate_checkpoint.py
@@ -1,3 +1,16 @@
+# Copyright The Lightning AI team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import logging
 from argparse import ArgumentParser, Namespace
 from pathlib import Path
diff --git a/src/lightning/fabric/utilities/distributed.py b/src/lightning/fabric/utilities/distributed.py
index 30bfe4e254a07..599b185ffd105 100644
--- a/src/lightning/fabric/utilities/distributed.py
+++ b/src/lightning/fabric/utilities/distributed.py
@@ -1,3 +1,16 @@
+# Copyright The Lightning AI team.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import contextlib import logging import os diff --git a/src/lightning/fabric/utilities/seed.py b/src/lightning/fabric/utilities/seed.py index b274bce88fcdf..c7003f1b05593 100644 --- a/src/lightning/fabric/utilities/seed.py +++ b/src/lightning/fabric/utilities/seed.py @@ -1,3 +1,16 @@ +# Copyright The Lightning AI team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import logging import os import random diff --git a/src/lightning/fabric/utilities/spike.py b/src/lightning/fabric/utilities/spike.py index 5dca5990064e8..6ccad8f9bf776 100644 --- a/src/lightning/fabric/utilities/spike.py +++ b/src/lightning/fabric/utilities/spike.py @@ -1,3 +1,16 @@ +# Copyright The Lightning AI team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import json import operator import os diff --git a/src/lightning/fabric/utilities/system_check.py b/src/lightning/fabric/utilities/system_check.py new file mode 100644 index 0000000000000..5453ecae13e61 --- /dev/null +++ b/src/lightning/fabric/utilities/system_check.py @@ -0,0 +1,197 @@ +# Copyright The Lightning AI team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import logging +import os +import shutil +import subprocess +import time +from datetime import timedelta +from pathlib import Path + +import torch +import torch.distributed +import torch.multiprocessing as mp +from lightning_utilities.core.imports import RequirementCache +from torch.multiprocessing.spawn import ProcessRaisedException + +_psutil_available = RequirementCache("psutil") +_logger = logging.getLogger(__name__) +_system_check_dir = Path("./system_check") + + +def main(timeout: int = 60) -> None: + _setup_logging() + + num_cuda_devices = torch.cuda.device_count() + + if num_cuda_devices == 0: + _print0("Warning: Skipping system check because no GPUs were detected.") + + if num_cuda_devices == 1: + _describe_nvidia_smi() + # _check_cuda() + + if num_cuda_devices > 1: + _describe_nvidia_smi() + _describe_gpu_connectivity() + + success = _check_cuda_distributed(timeout) + + if not success: + env = { + "NCCL_P2P_DISABLE": "1", + "NCCL_NET_PLUGIN": "none", + } + _print0( + "The multi-GPU NCCL test did not succeed." + " It looks like there is an issue with your multi-GPU setup." + " Now trying to run again with NCCL features disabled." + ) + os.environ.update(env) + success = _check_cuda_distributed(timeout) + if success: + _print0("Disabling the following NCCL features seems to have fixed the issue:") + _print_env_variables(env) + else: + _print0("Disabling NCCL features did not fix the issue.") + + if success: + _print0("Multi-GPU test successful.") + + _print0(f"Find detailed logs at {_system_check_dir.absolute()}") + + +def _check_cuda_distributed(timeout: int) -> bool: + if not _psutil_available: + raise ModuleNotFoundError(str(_psutil_available)) + + num_cuda_devices = torch.cuda.device_count() + context = mp.spawn( + _run_all_reduce_test, + nprocs=num_cuda_devices, + args=(num_cuda_devices,), + join=False, + ) + + start = time.time() + success = False + while not success and (time.time() - start < timeout): + try: + success = context.join(timeout=5) + except ProcessRaisedException as e: + _logger.debug(str(e)) + success = False + break + + time.sleep(1) + + if not success: + for pid in context.pids(): + _kill_process(pid) + return success + + +def _run_all_reduce_test(local_rank: int, world_size: int) -> None: + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = "29500" + os.environ["WORLD_SIZE"] = str(world_size) + os.environ["RANK"] = str(local_rank) + os.environ["LOCAL_RANK"] = str(local_rank) + os.environ["NCCL_DEBUG"] = "INFO" + os.environ["NCCL_DEBUG_FILE"] = str(_system_check_dir / f"nccl-rank-{local_rank}.txt") + + device = torch.device("cuda", local_rank) + torch.cuda.set_device(local_rank) + + _print0("Setting up the process group ...") + torch.distributed.init_process_group( + backend="nccl", + world_size=world_size, + rank=local_rank, + # NCCL gets initialized in the first collective call (e.g., barrier below), + # which must be successful for this timeout to work. + timeout=timedelta(seconds=10), + ) + + _print0("Synchronizing GPUs ... 
") + torch.distributed.barrier() + + payload = torch.rand(100, 100, device=device) + _print0("Running all-reduce test ...") + torch.distributed.all_reduce(payload) + + +def _setup_logging() -> None: + if _system_check_dir.is_dir(): + shutil.rmtree(_system_check_dir) + _system_check_dir.mkdir() + + _logger.setLevel(logging.DEBUG) + file_handler = logging.FileHandler(str(_system_check_dir / "logs.txt")) + file_handler.setLevel(logging.DEBUG) + console_handler = logging.StreamHandler() + console_handler.setLevel(logging.INFO) + _logger.addHandler(file_handler) + _logger.addHandler(console_handler) + _logger.propagate = False + + +def _print0(string: str) -> None: + if int(os.getenv("RANK", 0)) == 0: + _logger.info(string) + + +def _print_env_variables(env: dict) -> None: + for k, v in env.items(): + _print0(f"{k}={v}") + + +def _collect_nvidia_smi_topo() -> str: + return subprocess.run(["nvidia-smi", "topo", "-m"], capture_output=True, text=True).stdout + + +def _collect_nvidia_smi() -> str: + return subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout + + +def _describe_nvidia_smi() -> None: + _logger.info( + "Below is the output of `nvidia-smi`. It shows information about the GPUs that are installed on this machine," + " the driver version, and the maximum supported CUDA version it can run.\n" + ) + _logger.info(_collect_nvidia_smi()) + + +def _describe_gpu_connectivity() -> None: + _logger.debug( + "The matrix below shows how the GPUs in this machine are connected." + " NVLink (NV) is the fastest connection, and is only available on high-end systems like V100, A100, etc.\n" + ) + _logger.debug(_collect_nvidia_smi_topo()) + + +def _kill_process(pid: int) -> None: + import psutil + + try: + process = psutil.Process(pid) + if process.is_running(): + process.kill() + except (psutil.NoSuchProcess, psutil.AccessDenied): + pass + + +if __name__ == "__main__": + main() diff --git a/tests/tests_fabric/run_standalone_tasks.sh b/tests/tests_fabric/run_standalone_tasks.sh new file mode 100644 index 0000000000000..63ff0c0d301cd --- /dev/null +++ b/tests/tests_fabric/run_standalone_tasks.sh @@ -0,0 +1,23 @@ +#!/bin/bash +# Copyright The Lightning AI team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+set -e
+# THIS FILE ASSUMES IT IS RUN INSIDE THE tests/tests_fabric DIRECTORY
+
+# this environment variable allows special tests to run
+export PL_RUN_STANDALONE_TESTS=1
+
+# run the multi-GPU system check through the Fabric CLI
+echo "Running system check"
+fabric diagnose
diff --git a/tests/tests_fabric/utilities/test_system_check.py b/tests/tests_fabric/utilities/test_system_check.py
new file mode 100644
index 0000000000000..1efeae98b71f6
--- /dev/null
+++ b/tests/tests_fabric/utilities/test_system_check.py
@@ -0,0 +1,17 @@
+import os
+from unittest import mock
+
+import torch
+from lightning.fabric.utilities.system_check import _run_all_reduce_test
+
+
+@mock.patch.dict(os.environ, {}, clear=True)
+@mock.patch("lightning.fabric.utilities.system_check.torch.device", return_value=torch.device("cpu"))
+@mock.patch("lightning.fabric.utilities.system_check.torch.cuda.set_device")
+@mock.patch("lightning.fabric.utilities.system_check.torch.distributed")
+def test_run_all_reduce_test(dist_mock, set_device_mock, __):
+    _run_all_reduce_test(local_rank=1, world_size=4)
+    set_device_mock.assert_called_once()
+    dist_mock.init_process_group.assert_called_once()
+    dist_mock.barrier.assert_called_once()
+    dist_mock.all_reduce.assert_called_once()
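
For reference, the per-GPU worker that ``fabric diagnose`` spawns boils down to a plain PyTorch all-reduce check. Below is a minimal standalone sketch of that idea (illustrative only, not part of this patch; the file name, port, and timeout value are assumptions):

.. code-block:: python

    # minimal_nccl_check.py - illustrative sketch of a PyTorch-only all-reduce check
    import os
    from datetime import timedelta

    import torch
    import torch.distributed as dist
    import torch.multiprocessing as mp


    def _worker(local_rank: int, world_size: int) -> None:
        # single-node rendezvous; the port just needs to be free
        os.environ.setdefault("MASTER_ADDR", "localhost")
        os.environ.setdefault("MASTER_PORT", "29500")
        torch.cuda.set_device(local_rank)
        dist.init_process_group(
            backend="nccl",
            world_size=world_size,
            rank=local_rank,
            # NCCL initializes lazily in the first collective, so a short timeout
            # surfaces broken GPU communication instead of hanging forever
            timeout=timedelta(seconds=30),
        )
        dist.barrier()
        payload = torch.rand(100, 100, device=torch.device("cuda", local_rank))
        dist.all_reduce(payload)
        dist.destroy_process_group()


    if __name__ == "__main__":
        world_size = torch.cuda.device_count()
        mp.spawn(_worker, args=(world_size,), nprocs=world_size)

If this hangs or crashes in the same way as ``fabric diagnose``, the problem lies in the driver or NCCL stack rather than in Lightning; rerunning with ``NCCL_P2P_DISABLE=1`` mirrors the fallback that the system check itself attempts.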