diff --git a/examples/pretrain_starcoder2_1b.slurm b/examples/pretrain_starcoder2_1b.slurm
new file mode 100644
index 0000000000..e7e0e7ecec
--- /dev/null
+++ b/examples/pretrain_starcoder2_1b.slurm
@@ -0,0 +1,144 @@
+#!/bin/bash
+#SBATCH --job-name=1B_starcoder2
+#SBATCH --nodes=32
+#SBATCH --ntasks-per-node=1 # crucial - only 1 task per node for the distributed launcher!
+#SBATCH --gres=gpu:8
+#SBATCH --exclusive
+#SBATCH --partition=production-cluster
+#SBATCH --output=/fsx/loubna/logs/starcoder2_1b/%x-%j.out
+#SBATCH --qos=high
+#SBATCH --array 1-3%1
+
+set -x -e
+source /admin/home/loubna/.bashrc
+
+# a100
+export CUDA_HOME=/usr/local/cuda-11.7
+
+export NCCL_ASYNC_ERROR_HANDLING=1
+
+# AWS specific
+export NCCL_PROTO=simple
+export RDMAV_FORK_SAFE=1
+export FI_EFA_FORK_SAFE=1
+export FI_EFA_USE_DEVICE_RDMA=1
+export FI_PROVIDER=efa
+export FI_LOG_LEVEL=1
+export NCCL_IB_DISABLE=1
+export NCCL_SOCKET_IFNAME=ens
+
+conda activate megatron_bigcode_a100
+
+echo "START TIME: $(date)"
+
+# File Path setup
+SCRIPT_REPO=/fsx/loubna/bigcode_2/code/pr/Megatron-LM
+pushd $SCRIPT_REPO
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+LOG_PATH=$SCRIPT_REPO/main_log.txt
+
+# Training setup
+GPUS_PER_NODE=8
+# so processes know who to talk to
+MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
+MASTER_PORT=6000
+NNODES=$SLURM_NNODES
+NODE_RANK=$SLURM_PROCID
+WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
+
+# File path setup
+CHECKPOINT_PATH=/fsx/bigcode/experiments/pretraining/starcoder2-1B/checkpoints # Adjust: Directory to store the checkpoints
+# Starcoder2 tokenizer and data paths in /fsx/bigcode
+TOKENIZER_FILE=/fsx/loubna/data/tokenizer/starcoder2-smol-internal-1/tokenizer.json
+WEIGHTS_TRAIN=/fsx/loubna/bigcode_2/code/bigcode-data-mix/data/train_data_paths.txt.tmp
+WEIGHTS_VALID=/fsx/loubna/bigcode_2/code/bigcode-data-mix/data/valid_data_paths.txt.tmp
+
+mkdir -p $CHECKPOINT_PATH/tensorboard
+
+GPT_ARGS="\
+    --tensor-model-parallel-size 1 \
+    --pipeline-model-parallel-size 1 \
+    --num-layers 24 \
+    --hidden-size 2048 \
+    --num-attention-heads 16 \
+    --attention-head-type multiquery \
+    --init-method-std 0.02209 \
+    --seq-length 4096 \
+    --max-position-embeddings 4096 \
+    --use-rotary-position-embeddings \
+    --rotary-theta 100000 \
+    --attention-dropout 0.1 \
+    --hidden-dropout 0.1 \
+    --micro-batch-size 4 \
+    --global-batch-size 1024 \
+    --lr 0.0004 \
+    --min-lr 0.00004 \
+    --train-iters 500000 \
+    --lr-decay-iters 500000 \
+    --lr-decay-style cosine \
+    --lr-warmup-iters 2000 \
+    --weight-decay .1 \
+    --adam-beta2 .95 \
+    --clip-grad 1.0 \
+    --bf16 \
+    --use-flash-attn \
+    --fim-rate 0.5 \
+    --fim-split-sample \"\" \
+    --fragment-fim-rate 0.5 \
+    --log-interval 10 \
+    --save-interval 10000 \
+    --eval-interval 10000 \
+    --eval-iters 2 \
+    --valid-num-workers 0 \
+"
+
+TENSORBOARD_ARGS="--tensorboard-dir /fsx/bigcode/experiments/pretraining/starcoder2-1B/tensorboard"
+
+CMD=" \
+    $SCRIPT_REPO/pretrain_gpt.py \
+    $GPT_ARGS \
+    --tokenizer-type TokenizerFromFile \
+    --tokenizer-file $TOKENIZER_FILE \
+    --save $CHECKPOINT_PATH \
+    --load $CHECKPOINT_PATH \
+    --train-weighted-split-paths-path $WEIGHTS_TRAIN \
+    --valid-weighted-split-paths-path $WEIGHTS_VALID \
+    --structured-logs \
+    --structured-logs-dir $CHECKPOINT_PATH/logs \
+    $TENSORBOARD_ARGS \
+    --wandb-entity-name loubnabnl \
+    --wandb-project-name starcoder2-1B \
+    "
+
+export LAUNCHER="python -u -m torch.distributed.run \
+    --nproc_per_node $GPUS_PER_NODE \
+    --nnodes $NNODES \
+    --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \
+    --rdzv_backend c10d \
+    --max_restarts 0 \
+    --tee 3 \
+    "
+
+echo $CMD
+
+# hide duplicated errors using this hack - will be properly fixed in pt-1.12
+# export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json
+
+# This is needed for torch 1.12.1, otherwise it doesn't link correctly; not sure what the issue was.
+#export PATH="/usr/local/cuda-11.6/bin:$PATH"
+#export LD_LIBRARY_PATH="/usr/local/cuda-11.6/lib64:$LD_LIBRARY_PATH"
+#export LD_PRELOAD=$CUDA_HOME/lib/libnccl.so
+#export LD_LIBRARY_PATH=$CUDA_HOME/efa/lib:$CUDA_HOME/lib:$CUDA_HOME/lib64:$LD_LIBRARY_PATH
+
+# srun error handling:
+# --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks
+# --kill-on-bad-exit=1: terminate a step if any task exits with a non-zero exit code
+SRUN_ARGS=" \
+    --wait=60 \
+    --kill-on-bad-exit=1 \
+    "
+
+# py-spy top -s -i -n -- $LAUNCHER --node_rank $SLURM_PROCID --role $SLURMD_NODENAME: $CMD
+clear; srun $SRUN_ARGS --jobid $SLURM_JOB_ID bash -c "$LAUNCHER --node_rank \$SLURM_PROCID --role \$SLURMD_NODENAME: $CMD" 2>&1 | tee $LOG_PATH
+
+echo "END TIME: $(date)"
diff --git a/examples/pretrain_starcoder2_1b_fix_rope.slurm b/examples/pretrain_starcoder2_1b_fix_rope.slurm
new file mode 100644
index 0000000000..43358d78bb
--- /dev/null
+++ b/examples/pretrain_starcoder2_1b_fix_rope.slurm
@@ -0,0 +1,146 @@
+#!/bin/bash
+#SBATCH --job-name=1B_test
+#SBATCH --nodes=32
+#SBATCH --ntasks-per-node=1 # crucial - only 1 task per node for the distributed launcher!
+#SBATCH --gres=gpu:8
+#SBATCH --exclusive
+#SBATCH --partition=production-cluster
+#SBATCH --output=/fsx/loubna/logs/starcoder2_1b/%x-%j.out
+#SBATCH --qos=high
+#SBATCH --array 1-3%1
+
+set -x -e
+source /admin/home/loubna/.bashrc
+
+# a100
+export CUDA_HOME=/usr/local/cuda-11.7
+
+export NCCL_ASYNC_ERROR_HANDLING=1
+
+# AWS specific
+export NCCL_PROTO=simple
+export RDMAV_FORK_SAFE=1
+export FI_EFA_FORK_SAFE=1
+export FI_EFA_USE_DEVICE_RDMA=1
+export FI_PROVIDER=efa
+export FI_LOG_LEVEL=1
+export NCCL_IB_DISABLE=1
+export NCCL_SOCKET_IFNAME=ens
+
+conda activate megatron_bigcode_a100
+
+echo "START TIME: $(date)"
+
+# File Path setup
+SCRIPT_REPO=/fsx/loubna/bigcode_2/code/pr/Megatron-LM
+pushd $SCRIPT_REPO
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+LOG_PATH=$SCRIPT_REPO/main_log.txt
+
+# Training setup
+GPUS_PER_NODE=8
+# so processes know who to talk to
+MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
+MASTER_PORT=6000
+NNODES=$SLURM_NNODES
+NODE_RANK=$SLURM_PROCID
+WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
+
+# File path setup
+CHECKPOINT_PATH=/fsx/bigcode/experiments/pretraining/starcoder2-1B/checkpoints_fix_rope # Adjust: Directory to store the checkpoints
+# Starcoder2 tokenizer and data paths in /fsx/bigcode
+TOKENIZER_FILE=/fsx/loubna/data/tokenizer/starcoder2-smol-internal-1/tokenizer.json
+WEIGHTS_TRAIN=/fsx/loubna/bigcode_2/code/bigcode-data-mix/data/train_data_paths.txt.tmp
+WEIGHTS_VALID=/fsx/loubna/bigcode_2/code/bigcode-data-mix/data/valid_data_paths.txt.tmp
+
+mkdir -p $CHECKPOINT_PATH/tensorboard
+
+GPT_ARGS="\
+    --tensor-model-parallel-size 1 \
+    --pipeline-model-parallel-size 1 \
+    --num-layers 24 \
+    --hidden-size 2048 \
+    --num-attention-heads 16 \
+    --attention-head-type multiquery \
+    --init-method-std 0.02209 \
+    --seq-length 4096 \
+    --max-position-embeddings 4096 \
+    --use-rotary-position-embeddings \
+    --rotary-theta 100000 \
+    --position-embedding-type rotary \
+    --no-position-embedding \
+    --attention-dropout 0.1 \
+    --hidden-dropout 0.1 \
+    --micro-batch-size 4 \
+    --global-batch-size 1024 \
+    --lr 0.0004 \
+    --min-lr 0.00004 \
+    --train-iters 500000 \
+    --lr-decay-iters 500000 \
+    --lr-decay-style cosine \
+    --lr-warmup-iters 2000 \
+    --weight-decay .1 \
+    --adam-beta2 .95 \
+    --clip-grad 1.0 \
+    --bf16 \
+    --use-flash-attn \
+    --fim-rate 0.5 \
+    --fim-split-sample \"\" \
+    --fragment-fim-rate 0.5 \
+    --log-interval 10 \
+    --save-interval 10000 \
+    --eval-interval 10000 \
+    --eval-iters 2 \
+    --valid-num-workers 0 \
+"
+
+TENSORBOARD_ARGS="--tensorboard-dir /fsx/bigcode/experiments/pretraining/starcoder2-1B/tensorboard"
+
+CMD=" \
+    $SCRIPT_REPO/pretrain_gpt.py \
+    $GPT_ARGS \
+    --tokenizer-type TokenizerFromFile \
+    --tokenizer-file $TOKENIZER_FILE \
+    --save $CHECKPOINT_PATH \
+    --load $CHECKPOINT_PATH \
+    --train-weighted-split-paths-path $WEIGHTS_TRAIN \
+    --valid-weighted-split-paths-path $WEIGHTS_VALID \
+    --structured-logs \
+    --structured-logs-dir $CHECKPOINT_PATH/logs \
+    $TENSORBOARD_ARGS \
+    --wandb-entity-name loubnabnl \
+    --wandb-project-name starcoder2-1B \
+    "
+
+export LAUNCHER="python -u -m torch.distributed.run \
+    --nproc_per_node $GPUS_PER_NODE \
+    --nnodes $NNODES \
+    --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \
+    --rdzv_backend c10d \
+    --max_restarts 0 \
+    --tee 3 \
+    "
+
+echo $CMD
+
+# hide duplicated errors using this hack - will be properly fixed in pt-1.12
+# export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json
+
+# This is needed for torch 1.12.1, otherwise it doesn't link correctly; not sure what the issue was.
+#export PATH="/usr/local/cuda-11.6/bin:$PATH"
+#export LD_LIBRARY_PATH="/usr/local/cuda-11.6/lib64:$LD_LIBRARY_PATH"
+#export LD_PRELOAD=$CUDA_HOME/lib/libnccl.so
+#export LD_LIBRARY_PATH=$CUDA_HOME/efa/lib:$CUDA_HOME/lib:$CUDA_HOME/lib64:$LD_LIBRARY_PATH
+
+# srun error handling:
+# --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks
+# --kill-on-bad-exit=1: terminate a step if any task exits with a non-zero exit code
+SRUN_ARGS=" \
+    --wait=60 \
+    --kill-on-bad-exit=1 \
+    "
+
+# py-spy top -s -i -n -- $LAUNCHER --node_rank $SLURM_PROCID --role $SLURMD_NODENAME: $CMD
+clear; srun $SRUN_ARGS --jobid $SLURM_JOB_ID bash -c "$LAUNCHER --node_rank \$SLURM_PROCID --role \$SLURMD_NODENAME: $CMD" 2>&1 | tee $LOG_PATH
+
+echo "END TIME: $(date)"
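
Usage sketch (not part of the diff, and assuming a cluster that actually provides the /fsx paths, production-cluster partition, high QOS, and megatron_bigcode_a100 conda environment referenced above): both files are ordinary sbatch job scripts and would typically be submitted directly, e.g.:

# hypothetical submission commands; every path and cluster setting inside the scripts is site-specific
sbatch examples/pretrain_starcoder2_1b.slurm
sbatch examples/pretrain_starcoder2_1b_fix_rope.slurm

For scale, the settings above give 32 nodes x 8 GPUs = 256 ranks; with tensor and pipeline parallel size 1 that is 256 data-parallel replicas, so a global batch of 1024 with micro-batch 4 corresponds to a single gradient-accumulation step per iteration (256 * 4 * 1 = 1024). The --array 1-3%1 directive simply requeues the job up to three times in sequence, resuming from the latest checkpoint in $CHECKPOINT_PATH.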