Description
#!/bin/bash
# On AWS, the EFA and OFI paths enable NCCL to use optimized networking.
export LD_LIBRARY_PATH=/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/amazon/openmpi/lib:/opt/aws-ofi-nccl/lib:/usr/local/cuda/lib:/usr/local/cuda/lib64:/usr/local/cuda:/usr/local/cuda/targets/x86_64-linux/lib/:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib:$LD_LIBRARY_PATH
export FI_EFA_SET_CUDA_SYNC_MEMOPS=0

torchrun --nnodes=8 --node_rank=0 --nproc_per_node=8 \
    main_training_mamba.py \
    --tokenizer_path="/workspace/mnt/xxx/models/Bamba-9B" \
    --data_path="/workspace/mnt/cm-nfx/datasets/case7/no_preprocess" \
    --datasets="dataset=algorithmic_corpus,dataset=synthetic_code_snippet,dataset=synthetic_qa" \
    --weights="1,1,1" \
    --col_name="text" \
    --file_type="arrow" \
    --num_workers=12 \
    --seq_length=4096 \
    --vocab_size=128256 \
    --logical_shards=960 \
    --ckpt_load_path="/workspace/mnt/xxx/models/Bamba-9B" \
    --ckpt_save_path="/workspace/mnt/xxx/ckpt/bamba-fms" \
    --sharding_strategy="fsdp" \
    --batch_size=2 \
    --learning_rate=3e-4 \
    --num_steps=1000 \
    --report_interval=10 \
    --checkpoint_interval=10 \
    --strip_tokens="" \
    --seed=2023 \
    --bos_token=None \
    --bol_token=None \
    --eol_token=None \
    --eos_token=0
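For context, my understanding is that with --nnodes=8 and the default static rendezvous, the same command has to be launched on every node, each with its own --node_rank but a shared master address and port. A minimal sketch of what I mean (the address, port, and $NODE_RANK below are placeholders, not my actual values):

# Hypothetical sketch: run on all 8 nodes, only --node_rank differs per node.
export MASTER_ADDR=10.0.0.1   # placeholder: reachable address of the node_rank=0 host
export MASTER_PORT=29500      # torchrun's default port for the static rendezvous
torchrun --nnodes=8 --node_rank=$NODE_RANK --nproc_per_node=8 \
    --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT \
    main_training_mamba.py ...   # same training arguments as above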
I am using this script for training, but it gets stuck. What could be the reason? Thank you.
W0109 16:19:56.623000 139977270769472 torch/distributed/run.py:779]
W0109 16:19:56.623000 139977270769472 torch/distributed/run.py:779] *****************************************
W0109 16:19:56.623000 139977270769472 torch/distributed/run.py:779] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
W0109 16:19:56.623000 139977270769472 torch/distributed/run.py:779] *****************************************
Traceback (most recent call last):
File "/opt/conda/envs/bamba/bin/torchrun", line 8, in
sys.exit(main())
File "/opt/conda/envs/bamba/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/init.py", line 348, in wrapper
return f(*args, **kwargs)
File "/opt/conda/envs/bamba/lib/python3.10/site-packages/torch/distributed/run.py", line 901, in main
run(args)
File "/opt/conda/envs/bamba/lib/python3.10/site-packages/torch/distributed/run.py", line 892, in run
elastic_launch(
File "/opt/conda/envs/bamba/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 133, in call
return launch_agent(self._config, self._entrypoint, list(args))
File "/opt/conda/envs/bamba/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 255, in launch_agent
result = agent.run()
File "/opt/conda/envs/bamba/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 124, in wrapper
result = f(*args, **kwargs)
File "/opt/conda/envs/bamba/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 680, in run
result = self._invoke_run(role)
File "/opt/conda/envs/bamba/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 829, in _invoke_run
self._initialize_workers(self._worker_group)
File "/opt/conda/envs/bamba/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 124, in wrapper
result = f(*args, **kwargs)
File "/opt/conda/envs/bamba/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 652, in _initialize_workers
self._rendezvous(worker_group)
File "/opt/conda/envs/bamba/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 124, in wrapper
result = f(*args, **kwargs)
File "/opt/conda/envs/bamba/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 489, in _rendezvous
rdzv_info = spec.rdzv_handler.next_rendezvous()
File "/opt/conda/envs/bamba/lib/python3.10/site-packages/torch/distributed/elastic/rendezvous/static_tcp_rendezvous.py", line 66, in next_rendezvous
self._store = TCPStore( # type: ignore[call-arg]
torch.distributed.DistStoreError: Timed out after 901 seconds waiting for clients. 1/8 clients joined.
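For what it's worth, the timeout above means only 1 of the 8 expected clients joined the TCPStore that the rank-0 node hosts for the rendezvous. As a basic check I can test from the other nodes whether the master's rendezvous port is reachable at all; a sketch, assuming the default port 29500 and a placeholder master address:

# Hypothetical reachability check, run from each worker node.
# 10.0.0.1 stands in for the rank-0 host; 29500 is torchrun's default static-rendezvous port.
timeout 3 bash -c '</dev/tcp/10.0.0.1/29500' && echo "master port reachable" || echo "master port NOT reachable"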