From bb82121daf5ae1fc20cde07449acbda96845f0f0 Mon Sep 17 00:00:00 2001
From: Loubna Ben Allal <44069155+loubnabnl@users.noreply.github.com>
Date: Fri, 10 Nov 2023 16:24:33 +0100
Subject: [PATCH 1/4] Create pretrain_starcoder2_1b.slurm

---
 examples/pretrain_starcoder2_1b.slurm | 143 ++++++++++++++++++++++++++
 1 file changed, 143 insertions(+)
 create mode 100644 examples/pretrain_starcoder2_1b.slurm

diff --git a/examples/pretrain_starcoder2_1b.slurm b/examples/pretrain_starcoder2_1b.slurm
new file mode 100644
index 0000000000..14bb4188f2
--- /dev/null
+++ b/examples/pretrain_starcoder2_1b.slurm
@@ -0,0 +1,143 @@
+#!/bin/bash
+#SBATCH --job-name=1b-starcoder2
+#SBATCH --nodes=32
+#SBATCH --ntasks-per-node=1          # crucial - only 1 task per dist per node!
+#SBATCH --gres=gpu:8
+#SBATCH --exclusive
+#SBATCH --partition=production-cluster
+#SBATCH --output=/fsx/loubna/logs/starcoder2_1b/%x-%j.out
+#SBATCH --qos=high
+#SBATCH --array 1-3%1
+
+set -x -e
+source /admin/home/loubna/.bashrc
+
+# a100
+export CUDA_HOME=/usr/local/cuda-11.7
+
+export NCCL_ASYNC_ERROR_HANDLING=1
+
+# AWS specific
+export NCCL_PROTO=simple
+export RDMAV_FORK_SAFE=1
+export FI_EFA_FORK_SAFE=1
+export FI_EFA_USE_DEVICE_RDMA=1
+export FI_PROVIDER=efa
+export FI_LOG_LEVEL=1
+export NCCL_IB_DISABLE=1
+export NCCL_SOCKET_IFNAME=ens
+
+conda activate megatron_bigcode_a100
+
+echo "START TIME: $(date)"
+
+# File Path setup
+SCRIPT_REPO=/fsx/loubna/bigcode_2/code/pr/Megatron-LM
+pushd $SCRIPT_REPO
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+LOG_PATH=$SCRIPT_REPO/main_log.txt
+
+# Training setup
+GPUS_PER_NODE=8
+# so processes know who to talk to
+MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
+MASTER_PORT=6000
+NNODES=$SLURM_NNODES
+NODE_RANK=$SLURM_PROCID
+WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
+
+# File path setup
+CHECKPOINT_PATH=/fsx/bigcode/experiments/pretraining/starcoder2-1B/checkpoints # Adjust: Directory to store the checkpoints
+# Starcoder2 tokenizer and data paths in /fsx/bigcode
+TOKENIZER_FILE=/fsx/loubna/data/tokenizer/starcoder2-smol-internal-1/tokenizer.json
+WEIGHTS_TRAIN=/fsx/loubna/bigcode_2/code/bigcode-data-mix/data/train_data_paths.txt.tmp
+WEIGHTS_VALID=/fsx/loubna/bigcode_2/code/bigcode-data-mix/data/valid_data_paths.txt.tmp
+
+mkdir -p $CHECKPOINT_PATH/tensorboard
+
+GPT_ARGS="\
+    --tensor-model-parallel-size 1 \
+    --pipeline-model-parallel-size 1 \
+    --num-layers 24 \
+    --hidden-size 2048 \
+    --num-attention-heads 16 \
+    --attention-head-type multiquery \
+    --init-method-std 0.02209 \
+    --seq-length 8192 \
+    --max-position-embeddings 8192 \
+    --use-rotary-position-embeddings \
+    --attention-dropout 0.1 \
+    --hidden-dropout 0.1 \
+    --micro-batch-size 2 \
+    --global-batch-size 512 \
+    --lr 0.0004 \
+    --min-lr 0.00004 \
+    --train-iters 500000 \
+    --lr-decay-iters 500000 \
+    --lr-decay-style cosine \
+    --lr-warmup-iters 2000 \
+    --weight-decay .1 \
+    --adam-beta2 .95 \
+    --clip-grad 1.0 \
+    --bf16 \
+    --use-flash-attn \
+    --fim-rate 0.5 \
+    --fim-split-sample \"\" \
+    --fragment-fim-rate 0.5 \
+    --log-interval 10 \
+    --save-interval 10000 \
+    --eval-interval 10000 \
+    --eval-iters 2 \
+    --valid-num-workers 0 \
+"
+
+TENSORBOARD_ARGS="--tensorboard-dir /fsx/bigcode/experiments/pretraining/starcoder2-1B/tensorboard"
+
+CMD=" \
+    $SCRIPT_REPO/pretrain_gpt.py \
+    $GPT_ARGS \
+    --tokenizer-type TokenizerFromFile \
+    --tokenizer-file $TOKENIZER_FILE \
+    --save $CHECKPOINT_PATH \
+    --load $CHECKPOINT_PATH \
+    --train-weighted-split-paths-path $WEIGHTS_TRAIN \
+    --valid-weighted-split-paths-path $WEIGHTS_VALID \
+    --structured-logs \
+    --structured-logs-dir $CHECKPOINT_PATH/logs \
+    $TENSORBOARD_ARGS \
+    --wandb-entity-name loubnabnl \
+    --wandb-project-name starcoder2-1B \
+    "
+
+export LAUNCHER="python -u -m torch.distributed.run \
+    --nproc_per_node $GPUS_PER_NODE \
+    --nnodes $NNODES \
+    --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \
+    --rdzv_backend c10d \
+    --max_restarts 0 \
+    --tee 3 \
+    "
+
+echo $CMD
+
+# hide duplicated errors using this hack - will be properly fixed in pt-1.12
+# export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json
+
+# This is needed for torch1.12.1 otherwise it doesn't link correctly, not sure what the issue was.
+#export PATH="/usr/local/cuda-11.6/bin:$PATH"
+#export LD_LIBRARY_PATH="/usr/local/cuda-11.6/lib64:$LD_LIBRARY_PATH"
+#export LD_PRELOAD=$CUDA_HOME/lib/libnccl.so
+#export LD_LIBRARY_PATH=$CUDA_HOME/efa/lib:$CUDA_HOME/lib:$CUDA_HOME/lib64:$LD_LIBRARY_PATH
+
+# srun error handling:
+# --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks
+# --kill-on-bad-exit=1: terminate a step if any task exits with a non-zero exit code
+SRUN_ARGS=" \
+    --wait=60 \
+    --kill-on-bad-exit=1 \
+    "
+
+# py-spy top -s -i -n -- $LAUNCHER --node_rank $SLURM_PROCID --role $SLURMD_NODENAME: $CMD
+clear; srun $SRUN_ARGS --jobid $SLURM_JOB_ID bash -c "$LAUNCHER --node_rank \$SLURM_PROCID --role \$SLURMD_NODENAME: $CMD" 2>&1 | tee $LOG_PATH
+
+echo "END TIME: $(date)"

From 61ff9d9cb7b11e0e985e2869e92d24ded132005b Mon Sep 17 00:00:00 2001
From: Loubna Ben Allal <44069155+loubnabnl@users.noreply.github.com>
Date: Fri, 10 Nov 2023 18:06:20 +0100
Subject: [PATCH 2/4] Update pretrain_starcoder2_1b.slurm

---
 examples/pretrain_starcoder2_1b.slurm | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/examples/pretrain_starcoder2_1b.slurm b/examples/pretrain_starcoder2_1b.slurm
index 14bb4188f2..e7e0e7ecec 100644
--- a/examples/pretrain_starcoder2_1b.slurm
+++ b/examples/pretrain_starcoder2_1b.slurm
@@ -1,5 +1,5 @@
 #!/bin/bash
-#SBATCH --job-name=1b-starcoder2
+#SBATCH --job-name=1B_starcoder2
 #SBATCH --nodes=32
 #SBATCH --ntasks-per-node=1          # crucial - only 1 task per dist per node!
 #SBATCH --gres=gpu:8
@@ -63,13 +63,14 @@ GPT_ARGS="\
     --num-attention-heads 16 \
     --attention-head-type multiquery \
     --init-method-std 0.02209 \
-    --seq-length 8192 \
-    --max-position-embeddings 8192 \
+    --seq-length 4096 \
+    --max-position-embeddings 4096 \
     --use-rotary-position-embeddings \
+    --rotary-theta 100000 \
     --attention-dropout 0.1 \
     --hidden-dropout 0.1 \
-    --micro-batch-size 2 \
-    --global-batch-size 512 \
+    --micro-batch-size 4 \
+    --global-batch-size 1024 \
     --lr 0.0004 \
     --min-lr 0.00004 \
     --train-iters 500000 \

From aacf49308e1e260eea7ce7909c1dce78bb1e3aaa Mon Sep 17 00:00:00 2001
From: Loubna Ben Allal <44069155+loubnabnl@users.noreply.github.com>
Date: Tue, 14 Nov 2023 15:19:14 +0000
Subject: [PATCH 3/4] Create pretrain_starcoder2_1b_fix_rope.slurm

---
 .../pretrain_starcoder2_1b_fix_rope.slurm | 146 ++++++++++++++++++
 1 file changed, 146 insertions(+)
 create mode 100644 examples/pretrain_starcoder2_1b_fix_rope.slurm

diff --git a/examples/pretrain_starcoder2_1b_fix_rope.slurm b/examples/pretrain_starcoder2_1b_fix_rope.slurm
new file mode 100644
index 0000000000..6d6792e2e7
--- /dev/null
+++ b/examples/pretrain_starcoder2_1b_fix_rope.slurm
@@ -0,0 +1,146 @@
+#!/bin/bash
+#SBATCH --job-name=1B_test
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1          # crucial - only 1 task per dist per node!
+#SBATCH --gres=gpu:8
+#SBATCH --exclusive
+#SBATCH --partition=production-cluster
+#SBATCH --output=/fsx/loubna/logs/starcoder2_1b/%x-%j.out
+#SBATCH --qos=high
+#SBATCH --array 1-3%1
+
+set -x -e
+source /admin/home/loubna/.bashrc
+
+# a100
+export CUDA_HOME=/usr/local/cuda-11.7
+
+export NCCL_ASYNC_ERROR_HANDLING=1
+
+# AWS specific
+export NCCL_PROTO=simple
+export RDMAV_FORK_SAFE=1
+export FI_EFA_FORK_SAFE=1
+export FI_EFA_USE_DEVICE_RDMA=1
+export FI_PROVIDER=efa
+export FI_LOG_LEVEL=1
+export NCCL_IB_DISABLE=1
+export NCCL_SOCKET_IFNAME=ens
+
+conda activate megatron_bigcode_a100
+
+echo "START TIME: $(date)"
+
+# File Path setup
+SCRIPT_REPO=/fsx/loubna/bigcode_2/code/pr/Megatron-LM
+pushd $SCRIPT_REPO
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+LOG_PATH=$SCRIPT_REPO/main_log.txt
+
+# Training setup
+GPUS_PER_NODE=8
+# so processes know who to talk to
+MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
+MASTER_PORT=6000
+NNODES=$SLURM_NNODES
+NODE_RANK=$SLURM_PROCID
+WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
+
+# File path setup
+CHECKPOINT_PATH=/fsx/bigcode/experiments/pretraining/starcoder2-1B/checkpoints_fix # Adjust: Directory to store the checkpoints
+# Starcoder2 tokenizer and data paths in /fsx/bigcode
+TOKENIZER_FILE=/fsx/loubna/data/tokenizer/starcoder2-smol-internal-1/tokenizer.json
+WEIGHTS_TRAIN=/fsx/loubna/bigcode_2/code/bigcode-data-mix/data/train_data_paths.txt.tmp
+WEIGHTS_VALID=/fsx/loubna/bigcode_2/code/bigcode-data-mix/data/valid_data_paths.txt.tmp
+
+mkdir -p $CHECKPOINT_PATH/tensorboard
+
+GPT_ARGS="\
+    --tensor-model-parallel-size 1 \
+    --pipeline-model-parallel-size 1 \
+    --num-layers 24 \
+    --hidden-size 2048 \
+    --num-attention-heads 16 \
+    --attention-head-type multiquery \
+    --init-method-std 0.02209 \
+    --seq-length 4096 \
+    --max-position-embeddings 4096 \
+    --use-rotary-position-embeddings \
+    --rotary-theta 100000 \
+    --position-embedding-type rotary \
+    --no-position-embedding \
+    --attention-dropout 0.1 \
+    --hidden-dropout 0.1 \
+    --micro-batch-size 1 \
+    --global-batch-size 8 \
+    --lr 0.0004 \
+    --min-lr 0.00004 \
+    --train-iters 500000 \
+    --lr-decay-iters 500000 \
+    --lr-decay-style cosine \
+    --lr-warmup-iters 2000 \
+    --weight-decay .1 \
+    --adam-beta2 .95 \
+    --clip-grad 1.0 \
+    --bf16 \
+    --use-flash-attn \
+    --fim-rate 0.5 \
+    --fim-split-sample \"\" \
+    --fragment-fim-rate 0.5 \
+    --log-interval 10 \
+    --save-interval 10000 \
+    --eval-interval 10000 \
+    --eval-iters 2 \
+    --valid-num-workers 0 \
+"
+
+TENSORBOARD_ARGS="--tensorboard-dir /fsx/bigcode/experiments/pretraining/starcoder2-1B/tensorboard"
+
+CMD=" \
+    $SCRIPT_REPO/pretrain_gpt.py \
+    $GPT_ARGS \
+    --tokenizer-type TokenizerFromFile \
+    --tokenizer-file $TOKENIZER_FILE \
+    --save $CHECKPOINT_PATH \
+    --load $CHECKPOINT_PATH \
+    --train-weighted-split-paths-path $WEIGHTS_TRAIN \
+    --valid-weighted-split-paths-path $WEIGHTS_VALID \
+    --structured-logs \
+    --structured-logs-dir $CHECKPOINT_PATH/logs \
+    $TENSORBOARD_ARGS \
+    --wandb-entity-name loubnabnl \
+    --wandb-project-name starcoder2-1B \
+    "
+
+export LAUNCHER="python -u -m torch.distributed.run \
+    --nproc_per_node $GPUS_PER_NODE \
+    --nnodes $NNODES \
+    --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \
+    --rdzv_backend c10d \
+    --max_restarts 0 \
+    --tee 3 \
+    "
+
+echo $CMD
+
+# hide duplicated errors using this hack - will be properly fixed in pt-1.12
+# export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json
+
+# This is needed for torch1.12.1 otherwise it doesn't link correctly, not sure what the issue was.
+#export PATH="/usr/local/cuda-11.6/bin:$PATH"
+#export LD_LIBRARY_PATH="/usr/local/cuda-11.6/lib64:$LD_LIBRARY_PATH"
+#export LD_PRELOAD=$CUDA_HOME/lib/libnccl.so
+#export LD_LIBRARY_PATH=$CUDA_HOME/efa/lib:$CUDA_HOME/lib:$CUDA_HOME/lib64:$LD_LIBRARY_PATH
+
+# srun error handling:
+# --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks
+# --kill-on-bad-exit=1: terminate a step if any task exits with a non-zero exit code
+SRUN_ARGS=" \
+    --wait=60 \
+    --kill-on-bad-exit=1 \
+    "
+
+# py-spy top -s -i -n -- $LAUNCHER --node_rank $SLURM_PROCID --role $SLURMD_NODENAME: $CMD
+clear; srun $SRUN_ARGS --jobid $SLURM_JOB_ID bash -c "$LAUNCHER --node_rank \$SLURM_PROCID --role \$SLURMD_NODENAME: $CMD" 2>&1 | tee $LOG_PATH
+
+echo "END TIME: $(date)"

From b13e6dad7b8a4c54b355ee2476242634445fce96 Mon Sep 17 00:00:00 2001
From: Loubna Ben Allal <44069155+loubnabnl@users.noreply.github.com>
Date: Tue, 14 Nov 2023 15:43:04 +0000
Subject: [PATCH 4/4] Update pretrain_starcoder2_1b_fix_rope.slurm

---
 examples/pretrain_starcoder2_1b_fix_rope.slurm | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/pretrain_starcoder2_1b_fix_rope.slurm b/examples/pretrain_starcoder2_1b_fix_rope.slurm
index 6d6792e2e7..43358d78bb 100644
--- a/examples/pretrain_starcoder2_1b_fix_rope.slurm
+++ b/examples/pretrain_starcoder2_1b_fix_rope.slurm
@@ -1,6 +1,6 @@
 #!/bin/bash
 #SBATCH --job-name=1B_test
-#SBATCH --nodes=1
+#SBATCH --nodes=32
 #SBATCH --ntasks-per-node=1          # crucial - only 1 task per dist per node!
 #SBATCH --gres=gpu:8
 #SBATCH --exclusive
@@ -47,7 +47,7 @@ NODE_RANK=$SLURM_PROCID
 WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
 
 # File path setup
-CHECKPOINT_PATH=/fsx/bigcode/experiments/pretraining/starcoder2-1B/checkpoints_fix # Adjust: Directory to store the checkpoints
+CHECKPOINT_PATH=/fsx/bigcode/experiments/pretraining/starcoder2-1B/checkpoints_fix_rope # Adjust: Directory to store the checkpoints
 # Starcoder2 tokenizer and data paths in /fsx/bigcode
 TOKENIZER_FILE=/fsx/loubna/data/tokenizer/starcoder2-smol-internal-1/tokenizer.json
 WEIGHTS_TRAIN=/fsx/loubna/bigcode_2/code/bigcode-data-mix/data/train_data_paths.txt.tmp
@@ -71,8 +71,8 @@ GPT_ARGS="\
     --no-position-embedding \
     --attention-dropout 0.1 \
     --hidden-dropout 0.1 \
-    --micro-batch-size 1 \
-    --global-batch-size 8 \
+    --micro-batch-size 4 \
+    --global-batch-size 1024 \
     --lr 0.0004 \
     --min-lr 0.00004 \
     --train-iters 500000 \