diff --git a/.azure/gpu-tests-fabric.yml b/.azure/gpu-tests-fabric.yml index ba86449e92355..b41b714edce32 100644 --- a/.azure/gpu-tests-fabric.yml +++ b/.azure/gpu-tests-fabric.yml @@ -134,14 +134,21 @@ jobs: - bash: python -m coverage run --source ${COVERAGE_SOURCE} -m pytest . -v --durations=50 workingDirectory: tests/tests_fabric/ - displayName: "Testing: fabric standard" + displayName: "Testing: Fabric standard" timeoutInMinutes: "10" - bash: bash ../run_standalone_tests.sh "." workingDirectory: tests/tests_fabric/ env: PL_STANDALONE_TESTS_SOURCE: $(COVERAGE_SOURCE) - displayName: "Testing: fabric standalone" + displayName: "Testing: Fabric standalone tests" + timeoutInMinutes: "10" + + - bash: bash run_standalone_tasks.sh + workingDirectory: tests/tests_fabric + env: + PL_USE_MOCKED_MNIST: "1" + displayName: "Testing: Fabric standalone tasks" timeoutInMinutes: "10" - bash: | diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml index b9ab6ead7f0d1..c795bac955334 100644 --- a/.azure/gpu-tests-pytorch.yml +++ b/.azure/gpu-tests-pytorch.yml @@ -24,6 +24,7 @@ pr: - "examples/run_pl_examples.sh" - "examples/pytorch/basics/backbone_image_classifier.py" - "examples/pytorch/basics/autoencoder.py" + - "tests/run_standalone_*.sh" - "requirements/pytorch/**" - "src/lightning/__init__.py" - "src/lightning/__setup__.py" diff --git a/.gitignore b/.gitignore index de1de44fec235..2ace5d1151c5e 100644 --- a/.gitignore +++ b/.gitignore @@ -175,6 +175,7 @@ wandb *.prof *.tar.gz .neptune/ +system_check/ # dataset generated from bolts in examples. cifar-10-batches-py diff --git a/docs/source-fabric/fundamentals/launch.rst b/docs/source-fabric/fundamentals/launch.rst index efde3f54fe846..784011e533e13 100644 --- a/docs/source-fabric/fundamentals/launch.rst +++ b/docs/source-fabric/fundamentals/launch.rst @@ -237,6 +237,15 @@ Next steps :height: 160 :tag: advanced +.. displayitem:: + :header: Troubleshooting + :description: Learn how to troubleshoot common multi-GPU issues + :button_link: ../guide/troubleshooting.html + :col_css: col-md-4 + :height: 160 + :tag: advanced + + .. raw:: html diff --git a/docs/source-fabric/glossary/index.rst b/docs/source-fabric/glossary/index.rst index b08bc4f830163..e5bc92cad9ceb 100644 --- a/docs/source-fabric/glossary/index.rst +++ b/docs/source-fabric/glossary/index.rst @@ -8,6 +8,7 @@ Glossary Checkpoint <../guide/checkpoint/index> Weights and Biases <../guide/loggers/wandb> + Troubleshooting <../guide/troubleshooting> .. raw:: html @@ -150,6 +151,11 @@ Glossary :button_link: ../fundamentals/launch.html :col_css: col-md-4 +.. displayitem:: + :header: NCCL + :button_link: ../guide/troubleshooting.html + :col_css: col-md-4 + .. displayitem:: :header: Notebook :button_link: ../launch/notebook.html diff --git a/docs/source-fabric/guide/multi_node/barebones.rst b/docs/source-fabric/guide/multi_node/barebones.rst index a251df230174c..6a43460b9865a 100644 --- a/docs/source-fabric/guide/multi_node/barebones.rst +++ b/docs/source-fabric/guide/multi_node/barebones.rst @@ -110,52 +110,6 @@ After executing these commands, you should immediately see an output like this: Troubleshooting *************** - -**My program is stuck initializing at startup. What is causing this?** - -You are seeing a message like this in the logs, but nothing happens: - -.. code-block:: - - Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/4 - -The most likely reasons and how to fix it: - -- **Wrong network interface:** Some servers have multiple network interfaces. 
-  There is usually only one that can send and receive traffic from the network of the other nodes, but sometimes it is not set as the default.
-  In this case, you need to set it manually:
-
-  .. code-block:: bash
-
-    export GLOO_SOCKET_IFNAME=eno1
-    export NCCL_SOCKET_IFNAME=eno1
-    fabric run ...
-
-  You can find the interface name by parsing the output of the ``ifconfig`` command.
-  The name of this interface **may differ on each node**.
-
-- **NCCL can't communicate between the nodes:**
-
-  Follow the steps in the `NCCL troubleshooting guide `_.
-  In particular, take note of the network section that describes restricting the port range and firewall rules.
-
-  .. code-block:: bash
-
-    echo "net.ipv4.ip_local_port_range = 50000 51000" >> /etc/sysctl.conf
-    sysctl --system
-    ufw allow 50000:51000/tcp
-
-
-**My program crashes with an NCCL error, but it is not helpful**
-
-Launch your command by prepending ``NCCL_DEBUG=INFO`` to get more info.
-
-.. code-block:: bash
-
-    NCCL_DEBUG=INFO fabric run ...
-
-
-----
-
-If you are sick of troubleshooting cluster problems, give :doc:`Lightning cloud <./cloud>` a try!
+Please refer to the :doc:`troubleshooting guide <../troubleshooting>` if your multi-node training hangs or crashes.
+If you are sick of troubleshooting cluster problems, give :doc:`Lightning Studios <./cloud>` a try!
 For other questions, please don't hesitate to join the `Discord `_.
diff --git a/docs/source-fabric/guide/troubleshooting.rst b/docs/source-fabric/guide/troubleshooting.rst
new file mode 100644
index 0000000000000..28a841dbe6bb8
--- /dev/null
+++ b/docs/source-fabric/guide/troubleshooting.rst
@@ -0,0 +1,87 @@
+###############
+Troubleshooting
+###############
+
+Learn how to troubleshoot possible causes for common issues related to CUDA, NCCL, and distributed training.
+
+
+----
+
+
+*********
+Multi-GPU
+*********
+
+If your program is stuck at
+
+.. code-block::
+
+    Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/4
+
+it indicates that PyTorch can't set up the communication between GPUs, and that your system is not configured correctly.
+Run the ``diagnose`` command from the Fabric CLI to investigate:
+
+.. code-block:: bash
+
+    fabric diagnose
+
+This tool will run basic multi-GPU tests using only PyTorch.
+Any issues raised here will confirm that the problem is with your system and not with Lightning.
+Common solutions:
+
+- **Wrong driver version:** The NVIDIA driver for your GPU is too old or too new.
+  You can check the version of the driver by running
+
+  .. code-block:: bash
+
+    nvidia-smi --id=0 --query-gpu=driver_version --format=csv,noheader
+
+  *Solution*: Install a recent driver.
+  Search online for instructions on how to update the driver on your platform.
+
+- **Peer-to-peer connection is broken:** The GPUs can't communicate with each other.
+  *Solution*: Try to set the environment variable ``NCCL_P2P_DISABLE=1``.
+  If you rerun your script and it fixes the problem, this means that peer-to-peer transport is not working properly (your training will run but it will be slow).
+  This is likely because of driver compatibility issues (see above) or because your GPU does not support peer-to-peer (e.g., certain RTX cards).
+
+
+----
+
+
+**********
+Multi-node
+**********
+
+Before troubleshooting multi-node connectivity issues, first ensure that multi-GPU within a single machine is working correctly by following the steps above.
+If single-node execution works, but multi-node hangs at
+
+.. code-block::
+
+    Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/4
+
+it indicates that there is a connection issue between the nodes.
+Common solutions:
+
+- **Wrong network interface:** Some servers have multiple network interfaces.
+  There is usually only one that can send and receive traffic from the network of the other nodes, but sometimes it is not set as the default.
+  In this case, you need to set it manually:
+
+  .. code-block:: bash
+
+    export GLOO_SOCKET_IFNAME=eno1
+    export NCCL_SOCKET_IFNAME=eno1
+    fabric run ...
+
+  You can find the interface name by parsing the output of the ``ifconfig`` command.
+  The name of this interface **may differ on each node**.
+
+- **NCCL can't communicate between the nodes:**
+
+  Follow the steps in the `NCCL troubleshooting guide `_.
+  In particular, take note of the network section that describes restricting the port range and firewall rules.
+
+  .. code-block:: bash
+
+    echo "net.ipv4.ip_local_port_range = 50000 51000" >> /etc/sysctl.conf
+    sysctl --system
+    ufw allow 50000:51000/tcp
diff --git a/src/lightning/fabric/cli.py b/src/lightning/fabric/cli.py
index d8c6fe47b6630..ffbcf7ade6a9a 100644
--- a/src/lightning/fabric/cli.py
+++ b/src/lightning/fabric/cli.py
@@ -26,6 +26,7 @@
 from lightning.fabric.accelerators import CPUAccelerator, CUDAAccelerator, MPSAccelerator
 from lightning.fabric.plugins.precision.precision import _PRECISION_INPUT_STR, _PRECISION_INPUT_STR_ALIAS
 from lightning.fabric.strategies import STRATEGY_REGISTRY
+from lightning.fabric.utilities import system_check
 from lightning.fabric.utilities.consolidate_checkpoint import _process_cli_args
 from lightning.fabric.utilities.device_parser import _parse_gpu_ids
 from lightning.fabric.utilities.distributed import _suggested_max_num_threads
@@ -188,6 +189,11 @@ def _consolidate(checkpoint_folder: str, output_file: Optional[str]) -> None:
         checkpoint = _load_distributed_checkpoint(config.checkpoint_folder)
         torch.save(checkpoint, config.output_file)
 
+    @_main.command("diagnose")
+    def _diagnose() -> None:
+        """Diagnose issues with your multi-GPU setup."""
+        system_check.main()
+
 
 def _set_env_variables(args: Namespace) -> None:
     """Set the environment variables for the new processes.
diff --git a/src/lightning/fabric/utilities/consolidate_checkpoint.py b/src/lightning/fabric/utilities/consolidate_checkpoint.py
index 15d20d8d89ecc..7a94089c7a9cd 100644
--- a/src/lightning/fabric/utilities/consolidate_checkpoint.py
+++ b/src/lightning/fabric/utilities/consolidate_checkpoint.py
@@ -1,3 +1,16 @@
+# Copyright The Lightning AI team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import logging
 from argparse import ArgumentParser, Namespace
 from pathlib import Path
diff --git a/src/lightning/fabric/utilities/distributed.py b/src/lightning/fabric/utilities/distributed.py
index 30bfe4e254a07..599b185ffd105 100644
--- a/src/lightning/fabric/utilities/distributed.py
+++ b/src/lightning/fabric/utilities/distributed.py
@@ -1,3 +1,16 @@
+# Copyright The Lightning AI team.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import contextlib import logging import os diff --git a/src/lightning/fabric/utilities/seed.py b/src/lightning/fabric/utilities/seed.py index b274bce88fcdf..c7003f1b05593 100644 --- a/src/lightning/fabric/utilities/seed.py +++ b/src/lightning/fabric/utilities/seed.py @@ -1,3 +1,16 @@ +# Copyright The Lightning AI team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import logging import os import random diff --git a/src/lightning/fabric/utilities/spike.py b/src/lightning/fabric/utilities/spike.py index 5dca5990064e8..6ccad8f9bf776 100644 --- a/src/lightning/fabric/utilities/spike.py +++ b/src/lightning/fabric/utilities/spike.py @@ -1,3 +1,16 @@ +# Copyright The Lightning AI team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import json import operator import os diff --git a/src/lightning/fabric/utilities/system_check.py b/src/lightning/fabric/utilities/system_check.py new file mode 100644 index 0000000000000..5453ecae13e61 --- /dev/null +++ b/src/lightning/fabric/utilities/system_check.py @@ -0,0 +1,197 @@ +# Copyright The Lightning AI team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import logging +import os +import shutil +import subprocess +import time +from datetime import timedelta +from pathlib import Path + +import torch +import torch.distributed +import torch.multiprocessing as mp +from lightning_utilities.core.imports import RequirementCache +from torch.multiprocessing.spawn import ProcessRaisedException + +_psutil_available = RequirementCache("psutil") +_logger = logging.getLogger(__name__) +_system_check_dir = Path("./system_check") + + +def main(timeout: int = 60) -> None: + _setup_logging() + + num_cuda_devices = torch.cuda.device_count() + + if num_cuda_devices == 0: + _print0("Warning: Skipping system check because no GPUs were detected.") + + if num_cuda_devices == 1: + _describe_nvidia_smi() + # _check_cuda() + + if num_cuda_devices > 1: + _describe_nvidia_smi() + _describe_gpu_connectivity() + + success = _check_cuda_distributed(timeout) + + if not success: + env = { + "NCCL_P2P_DISABLE": "1", + "NCCL_NET_PLUGIN": "none", + } + _print0( + "The multi-GPU NCCL test did not succeed." + " It looks like there is an issue with your multi-GPU setup." + " Now trying to run again with NCCL features disabled." + ) + os.environ.update(env) + success = _check_cuda_distributed(timeout) + if success: + _print0("Disabling the following NCCL features seems to have fixed the issue:") + _print_env_variables(env) + else: + _print0("Disabling NCCL features did not fix the issue.") + + if success: + _print0("Multi-GPU test successful.") + + _print0(f"Find detailed logs at {_system_check_dir.absolute()}") + + +def _check_cuda_distributed(timeout: int) -> bool: + if not _psutil_available: + raise ModuleNotFoundError(str(_psutil_available)) + + num_cuda_devices = torch.cuda.device_count() + context = mp.spawn( + _run_all_reduce_test, + nprocs=num_cuda_devices, + args=(num_cuda_devices,), + join=False, + ) + + start = time.time() + success = False + while not success and (time.time() - start < timeout): + try: + success = context.join(timeout=5) + except ProcessRaisedException as e: + _logger.debug(str(e)) + success = False + break + + time.sleep(1) + + if not success: + for pid in context.pids(): + _kill_process(pid) + return success + + +def _run_all_reduce_test(local_rank: int, world_size: int) -> None: + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = "29500" + os.environ["WORLD_SIZE"] = str(world_size) + os.environ["RANK"] = str(local_rank) + os.environ["LOCAL_RANK"] = str(local_rank) + os.environ["NCCL_DEBUG"] = "INFO" + os.environ["NCCL_DEBUG_FILE"] = str(_system_check_dir / f"nccl-rank-{local_rank}.txt") + + device = torch.device("cuda", local_rank) + torch.cuda.set_device(local_rank) + + _print0("Setting up the process group ...") + torch.distributed.init_process_group( + backend="nccl", + world_size=world_size, + rank=local_rank, + # NCCL gets initialized in the first collective call (e.g., barrier below), + # which must be successful for this timeout to work. + timeout=timedelta(seconds=10), + ) + + _print0("Synchronizing GPUs ... 
") + torch.distributed.barrier() + + payload = torch.rand(100, 100, device=device) + _print0("Running all-reduce test ...") + torch.distributed.all_reduce(payload) + + +def _setup_logging() -> None: + if _system_check_dir.is_dir(): + shutil.rmtree(_system_check_dir) + _system_check_dir.mkdir() + + _logger.setLevel(logging.DEBUG) + file_handler = logging.FileHandler(str(_system_check_dir / "logs.txt")) + file_handler.setLevel(logging.DEBUG) + console_handler = logging.StreamHandler() + console_handler.setLevel(logging.INFO) + _logger.addHandler(file_handler) + _logger.addHandler(console_handler) + _logger.propagate = False + + +def _print0(string: str) -> None: + if int(os.getenv("RANK", 0)) == 0: + _logger.info(string) + + +def _print_env_variables(env: dict) -> None: + for k, v in env.items(): + _print0(f"{k}={v}") + + +def _collect_nvidia_smi_topo() -> str: + return subprocess.run(["nvidia-smi", "topo", "-m"], capture_output=True, text=True).stdout + + +def _collect_nvidia_smi() -> str: + return subprocess.run(["nvidia-smi"], capture_output=True, text=True).stdout + + +def _describe_nvidia_smi() -> None: + _logger.info( + "Below is the output of `nvidia-smi`. It shows information about the GPUs that are installed on this machine," + " the driver version, and the maximum supported CUDA version it can run.\n" + ) + _logger.info(_collect_nvidia_smi()) + + +def _describe_gpu_connectivity() -> None: + _logger.debug( + "The matrix below shows how the GPUs in this machine are connected." + " NVLink (NV) is the fastest connection, and is only available on high-end systems like V100, A100, etc.\n" + ) + _logger.debug(_collect_nvidia_smi_topo()) + + +def _kill_process(pid: int) -> None: + import psutil + + try: + process = psutil.Process(pid) + if process.is_running(): + process.kill() + except (psutil.NoSuchProcess, psutil.AccessDenied): + pass + + +if __name__ == "__main__": + main() diff --git a/tests/tests_fabric/run_standalone_tasks.sh b/tests/tests_fabric/run_standalone_tasks.sh new file mode 100644 index 0000000000000..63ff0c0d301cd --- /dev/null +++ b/tests/tests_fabric/run_standalone_tasks.sh @@ -0,0 +1,23 @@ +#!/bin/bash +# Copyright The Lightning AI team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+set -e
+# THIS FILE ASSUMES IT IS RUN INSIDE THE tests/tests_fabric DIRECTORY
+
+# this environment variable allows special tests to run
+export PL_RUN_STANDALONE_TESTS=1
+
+# run the multi-GPU system check through the Fabric CLI
+echo "Running system check"
+fabric diagnose
diff --git a/tests/tests_fabric/utilities/test_system_check.py b/tests/tests_fabric/utilities/test_system_check.py
new file mode 100644
index 0000000000000..1efeae98b71f6
--- /dev/null
+++ b/tests/tests_fabric/utilities/test_system_check.py
@@ -0,0 +1,17 @@
+import os
+from unittest import mock
+
+import torch
+from lightning.fabric.utilities.system_check import _run_all_reduce_test
+
+
+@mock.patch.dict(os.environ, {}, clear=True)
+@mock.patch("lightning.fabric.utilities.system_check.torch.device", return_value=torch.device("cpu"))
+@mock.patch("lightning.fabric.utilities.system_check.torch.cuda.set_device")
+@mock.patch("lightning.fabric.utilities.system_check.torch.distributed")
+def test_run_all_reduce_test(dist_mock, set_device_mock, __):
+    _run_all_reduce_test(local_rank=1, world_size=4)
+    set_device_mock.assert_called_once()
+    dist_mock.init_process_group.assert_called_once()
+    dist_mock.barrier.assert_called_once()
+    dist_mock.all_reduce.assert_called_once()
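
For reference, the per-GPU worker that ``fabric diagnose`` spawns boils down to a plain PyTorch all-reduce check. Below is a minimal standalone sketch of that idea (illustrative only, not part of this patch; the file name, port, and timeout value are assumptions):

.. code-block:: python

    # minimal_nccl_check.py - illustrative sketch of a PyTorch-only all-reduce check
    import os
    from datetime import timedelta

    import torch
    import torch.distributed as dist
    import torch.multiprocessing as mp


    def _worker(local_rank: int, world_size: int) -> None:
        # single-node rendezvous; the port just needs to be free
        os.environ.setdefault("MASTER_ADDR", "localhost")
        os.environ.setdefault("MASTER_PORT", "29500")
        torch.cuda.set_device(local_rank)
        dist.init_process_group(
            backend="nccl",
            world_size=world_size,
            rank=local_rank,
            # NCCL initializes lazily in the first collective, so a short timeout
            # surfaces broken GPU communication instead of hanging forever
            timeout=timedelta(seconds=30),
        )
        dist.barrier()
        payload = torch.rand(100, 100, device=torch.device("cuda", local_rank))
        dist.all_reduce(payload)
        dist.destroy_process_group()


    if __name__ == "__main__":
        world_size = torch.cuda.device_count()
        mp.spawn(_worker, args=(world_size,), nprocs=world_size)

If this hangs or crashes in the same way as ``fabric diagnose``, the problem lies in the driver or NCCL stack rather than in Lightning; rerunning with ``NCCL_P2P_DISABLE=1`` mirrors the fallback that the system check itself attempts.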