From bb82121daf5ae1fc20cde07449acbda96845f0f0 Mon Sep 17 00:00:00 2001
From: Loubna Ben Allal <44069155+loubnabnl@users.noreply.github.com>
Date: Fri, 10 Nov 2023 16:24:33 +0100
Subject: [PATCH 1/4] Create pretrain_starcoder2_1b.slurm

---
 examples/pretrain_starcoder2_1b.slurm | 143 ++++++++++++++++++++++++++
 1 file changed, 143 insertions(+)
 create mode 100644 examples/pretrain_starcoder2_1b.slurm

diff --git a/examples/pretrain_starcoder2_1b.slurm b/examples/pretrain_starcoder2_1b.slurm
new file mode 100644
index 0000000000..14bb4188f2
--- /dev/null
+++ b/examples/pretrain_starcoder2_1b.slurm
@@ -0,0 +1,143 @@
+#!/bin/bash
+#SBATCH --job-name=1b-starcoder2
+#SBATCH --nodes=32
+#SBATCH --ntasks-per-node=1          # crucial - only 1 task per dist per node!
+#SBATCH --gres=gpu:8
+#SBATCH --exclusive
+#SBATCH --partition=production-cluster
+#SBATCH --output=/fsx/loubna/logs/starcoder2_1b/%x-%j.out
+#SBATCH --qos=high
+#SBATCH --array 1-3%1
+
+set -x -e
+source /admin/home/loubna/.bashrc
+
+# a100
+export CUDA_HOME=/usr/local/cuda-11.7
+
+export NCCL_ASYNC_ERROR_HANDLING=1
+
+# AWS specific
+export NCCL_PROTO=simple
+export RDMAV_FORK_SAFE=1
+export FI_EFA_FORK_SAFE=1
+export FI_EFA_USE_DEVICE_RDMA=1
+export FI_PROVIDER=efa
+export FI_LOG_LEVEL=1
+export NCCL_IB_DISABLE=1
+export NCCL_SOCKET_IFNAME=ens
+
+conda activate megatron_bigcode_a100
+
+echo "START TIME: $(date)"
+
+# File Path setup
+SCRIPT_REPO=/fsx/loubna/bigcode_2/code/pr/Megatron-LM
+pushd $SCRIPT_REPO
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+LOG_PATH=$SCRIPT_REPO/main_log.txt
+
+# Training setup
+GPUS_PER_NODE=8
+# so processes know who to talk to
+MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
+MASTER_PORT=6000
+NNODES=$SLURM_NNODES
+NODE_RANK=$SLURM_PROCID
+WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
+
+# File path setup
+CHECKPOINT_PATH=/fsx/bigcode/experiments/pretraining/starcoder2-1B/checkpoints # Adjust: Directory to store the checkpoints
+# Starcoder2 tokenizer and data paths in /fsx/bigcode
+TOKENIZER_FILE=/fsx/loubna/data/tokenizer/starcoder2-smol-internal-1/tokenizer.json
+WEIGHTS_TRAIN=/fsx/loubna/bigcode_2/code/bigcode-data-mix/data/train_data_paths.txt.tmp
+WEIGHTS_VALID=/fsx/loubna/bigcode_2/code/bigcode-data-mix/data/valid_data_paths.txt.tmp
+
+mkdir -p $CHECKPOINT_PATH/tensorboard
+
+GPT_ARGS="\
+    --tensor-model-parallel-size 1 \
+    --pipeline-model-parallel-size 1 \
+    --num-layers 24 \
+    --hidden-size 2048 \
+    --num-attention-heads 16 \
+    --attention-head-type multiquery \
+    --init-method-std 0.02209 \
+    --seq-length 8192 \
+    --max-position-embeddings 8192 \
+    --use-rotary-position-embeddings \
+    --attention-dropout 0.1 \
+    --hidden-dropout 0.1 \
+    --micro-batch-size 2 \
+    --global-batch-size 512 \
+    --lr 0.0004 \
+    --min-lr 0.00004 \
+    --train-iters 500000 \
+    --lr-decay-iters 500000 \
+    --lr-decay-style cosine \
+    --lr-warmup-iters 2000 \
+    --weight-decay .1 \
+    --adam-beta2 .95 \
+    --clip-grad 1.0 \
+    --bf16 \
+    --use-flash-attn \
+    --fim-rate 0.5 \
+    --fim-split-sample \"\" \
+    --fragment-fim-rate 0.5 \
+    --log-interval 10 \
+    --save-interval 10000 \
+    --eval-interval 10000 \
+    --eval-iters 2 \
+    --valid-num-workers 0 \
+"
+
+TENSORBOARD_ARGS="--tensorboard-dir /fsx/bigcode/experiments/pretraining/starcoder2-1B/tensorboard"
+
+CMD=" \
+    $SCRIPT_REPO/pretrain_gpt.py \
+    $GPT_ARGS \
+    --tokenizer-type TokenizerFromFile \
+    --tokenizer-file $TOKENIZER_FILE \
+    --save $CHECKPOINT_PATH \
+    --load $CHECKPOINT_PATH \
+    --train-weighted-split-paths-path $WEIGHTS_TRAIN \
+    --valid-weighted-split-paths-path $WEIGHTS_VALID \
+    --structured-logs \
+    --structured-logs-dir $CHECKPOINT_PATH/logs \
+    $TENSORBOARD_ARGS \
+    --wandb-entity-name loubnabnl \
+    --wandb-project-name starcoder2-1B \
+    "
+
+export LAUNCHER="python -u -m torch.distributed.run \
+    --nproc_per_node $GPUS_PER_NODE \
+    --nnodes $NNODES \
+    --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \
+    --rdzv_backend c10d \
+    --max_restarts 0 \
+    --tee 3 \
+    "
+
+echo $CMD
+
+# hide duplicated errors using this hack - will be properly fixed in pt-1.12
+# export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json
+
+# This is needed for torch1.12.1 otherwise it doesn't link correctly, not sure what the issue was.
+#export PATH="/usr/local/cuda-11.6/bin:$PATH"
+#export LD_LIBRARY_PATH="/usr/local/cuda-11.6/lib64:$LD_LIBRARY_PATH"
+#export LD_PRELOAD=$CUDA_HOME/lib/libnccl.so
+#export LD_LIBRARY_PATH=$CUDA_HOME/efa/lib:$CUDA_HOME/lib:$CUDA_HOME/lib64:$LD_LIBRARY_PATH
+
+# srun error handling:
+# --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks
+# --kill-on-bad-exit=1: terminate a step if any task exits with a non-zero exit code
+SRUN_ARGS=" \
+    --wait=60 \
+    --kill-on-bad-exit=1 \
+    "
+
+# py-spy top -s -i -n -- $LAUNCHER --node_rank $SLURM_PROCID --role $SLURMD_NODENAME: $CMD
+clear; srun $SRUN_ARGS --jobid $SLURM_JOB_ID bash -c "$LAUNCHER --node_rank \$SLURM_PROCID --role \$SLURMD_NODENAME: $CMD" 2>&1 | tee $LOG_PATH
+
+echo "END TIME: $(date)"

From 61ff9d9cb7b11e0e985e2869e92d24ded132005b Mon Sep 17 00:00:00 2001
From: Loubna Ben Allal <44069155+loubnabnl@users.noreply.github.com>
Date: Fri, 10 Nov 2023 18:06:20 +0100
Subject: [PATCH 2/4] Update pretrain_starcoder2_1b.slurm

---
 examples/pretrain_starcoder2_1b.slurm | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/examples/pretrain_starcoder2_1b.slurm b/examples/pretrain_starcoder2_1b.slurm
index 14bb4188f2..e7e0e7ecec 100644
--- a/examples/pretrain_starcoder2_1b.slurm
+++ b/examples/pretrain_starcoder2_1b.slurm
@@ -1,5 +1,5 @@
 #!/bin/bash
-#SBATCH --job-name=1b-starcoder2
+#SBATCH --job-name=1B_starcoder2
 #SBATCH --nodes=32
 #SBATCH --ntasks-per-node=1          # crucial - only 1 task per dist per node!
 #SBATCH --gres=gpu:8
@@ -63,13 +63,14 @@ GPT_ARGS="\
     --num-attention-heads 16 \
     --attention-head-type multiquery \
     --init-method-std 0.02209 \
-    --seq-length 8192 \
-    --max-position-embeddings 8192 \
+    --seq-length 4096 \
+    --max-position-embeddings 4096 \
     --use-rotary-position-embeddings \
+    --rotary-theta 100000 \
     --attention-dropout 0.1 \
     --hidden-dropout 0.1 \
-    --micro-batch-size 2 \
-    --global-batch-size 512 \
+    --micro-batch-size 4 \
+    --global-batch-size 1024 \
     --lr 0.0004 \
     --min-lr 0.00004 \
     --train-iters 500000 \

From aacf49308e1e260eea7ce7909c1dce78bb1e3aaa Mon Sep 17 00:00:00 2001
From: Loubna Ben Allal <44069155+loubnabnl@users.noreply.github.com>
Date: Tue, 14 Nov 2023 15:19:14 +0000
Subject: [PATCH 3/4] Create pretrain_starcoder2_1b_fix_rope.slurm

---
 .../pretrain_starcoder2_1b_fix_rope.slurm | 146 ++++++++++++++++++
 1 file changed, 146 insertions(+)
 create mode 100644 examples/pretrain_starcoder2_1b_fix_rope.slurm

diff --git a/examples/pretrain_starcoder2_1b_fix_rope.slurm b/examples/pretrain_starcoder2_1b_fix_rope.slurm
new file mode 100644
index 0000000000..6d6792e2e7
--- /dev/null
+++ b/examples/pretrain_starcoder2_1b_fix_rope.slurm
@@ -0,0 +1,146 @@
+#!/bin/bash
+#SBATCH --job-name=1B_test
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1          # crucial - only 1 task per dist per node!
+#SBATCH --gres=gpu:8
+#SBATCH --exclusive
+#SBATCH --partition=production-cluster
+#SBATCH --output=/fsx/loubna/logs/starcoder2_1b/%x-%j.out
+#SBATCH --qos=high
+#SBATCH --array 1-3%1
+
+set -x -e
+source /admin/home/loubna/.bashrc
+
+# a100
+export CUDA_HOME=/usr/local/cuda-11.7
+
+export NCCL_ASYNC_ERROR_HANDLING=1
+
+# AWS specific
+export NCCL_PROTO=simple
+export RDMAV_FORK_SAFE=1
+export FI_EFA_FORK_SAFE=1
+export FI_EFA_USE_DEVICE_RDMA=1
+export FI_PROVIDER=efa
+export FI_LOG_LEVEL=1
+export NCCL_IB_DISABLE=1
+export NCCL_SOCKET_IFNAME=ens
+
+conda activate megatron_bigcode_a100
+
+echo "START TIME: $(date)"
+
+# File Path setup
+SCRIPT_REPO=/fsx/loubna/bigcode_2/code/pr/Megatron-LM
+pushd $SCRIPT_REPO
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+LOG_PATH=$SCRIPT_REPO/main_log.txt
+
+# Training setup
+GPUS_PER_NODE=8
+# so processes know who to talk to
+MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
+MASTER_PORT=6000
+NNODES=$SLURM_NNODES
+NODE_RANK=$SLURM_PROCID
+WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
+
+# File path setup
+CHECKPOINT_PATH=/fsx/bigcode/experiments/pretraining/starcoder2-1B/checkpoints_fix # Adjust: Directory to store the checkpoints
+# Starcoder2 tokenizer and data paths in /fsx/bigcode
+TOKENIZER_FILE=/fsx/loubna/data/tokenizer/starcoder2-smol-internal-1/tokenizer.json
+WEIGHTS_TRAIN=/fsx/loubna/bigcode_2/code/bigcode-data-mix/data/train_data_paths.txt.tmp
+WEIGHTS_VALID=/fsx/loubna/bigcode_2/code/bigcode-data-mix/data/valid_data_paths.txt.tmp
+
+mkdir -p $CHECKPOINT_PATH/tensorboard
+
+GPT_ARGS="\
+    --tensor-model-parallel-size 1 \
+    --pipeline-model-parallel-size 1 \
+    --num-layers 24 \
+    --hidden-size 2048 \
+    --num-attention-heads 16 \
+    --attention-head-type multiquery \
+    --init-method-std 0.02209 \
+    --seq-length 4096 \
+    --max-position-embeddings 4096 \
+    --use-rotary-position-embeddings \
+    --rotary-theta 100000 \
+    --position-embedding-type rotary \
+    --no-position-embedding \
+    --attention-dropout 0.1 \
+    --hidden-dropout 0.1 \
+    --micro-batch-size 1 \
+    --global-batch-size 8 \
+    --lr 0.0004 \
+    --min-lr 0.00004 \
+    --train-iters 500000 \
+    --lr-decay-iters 500000 \
+    --lr-decay-style cosine \
+    --lr-warmup-iters 2000 \
+    --weight-decay .1 \
+    --adam-beta2 .95 \
+    --clip-grad 1.0 \
+    --bf16 \
+    --use-flash-attn \
+    --fim-rate 0.5 \
+    --fim-split-sample \"\" \
+    --fragment-fim-rate 0.5 \
+    --log-interval 10 \
+    --save-interval 10000 \
+    --eval-interval 10000 \
+    --eval-iters 2 \
+    --valid-num-workers 0 \
+"
+
+TENSORBOARD_ARGS="--tensorboard-dir /fsx/bigcode/experiments/pretraining/starcoder2-1B/tensorboard"
+
+CMD=" \
+    $SCRIPT_REPO/pretrain_gpt.py \
+    $GPT_ARGS \
+    --tokenizer-type TokenizerFromFile \
+    --tokenizer-file $TOKENIZER_FILE \
+    --save $CHECKPOINT_PATH \
+    --load $CHECKPOINT_PATH \
+    --train-weighted-split-paths-path $WEIGHTS_TRAIN \
+    --valid-weighted-split-paths-path $WEIGHTS_VALID \
+    --structured-logs \
+    --structured-logs-dir $CHECKPOINT_PATH/logs \
+    $TENSORBOARD_ARGS \
+    --wandb-entity-name loubnabnl \
+    --wandb-project-name starcoder2-1B \
+    "
+
+export LAUNCHER="python -u -m torch.distributed.run \
+    --nproc_per_node $GPUS_PER_NODE \
+    --nnodes $NNODES \
+    --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \
+    --rdzv_backend c10d \
+    --max_restarts 0 \
+    --tee 3 \
+    "
+
+echo $CMD
+
+# hide duplicated errors using this hack - will be properly fixed in pt-1.12
+# export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json
+
+# This is needed for torch1.12.1 otherwise it doesn't link correctly, not sure what the issue was.
+#export PATH="/usr/local/cuda-11.6/bin:$PATH"
+#export LD_LIBRARY_PATH="/usr/local/cuda-11.6/lib64:$LD_LIBRARY_PATH"
+#export LD_PRELOAD=$CUDA_HOME/lib/libnccl.so
+#export LD_LIBRARY_PATH=$CUDA_HOME/efa/lib:$CUDA_HOME/lib:$CUDA_HOME/lib64:$LD_LIBRARY_PATH
+
+# srun error handling:
+# --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks
+# --kill-on-bad-exit=1: terminate a step if any task exits with a non-zero exit code
+SRUN_ARGS=" \
+    --wait=60 \
+    --kill-on-bad-exit=1 \
+    "
+
+# py-spy top -s -i -n -- $LAUNCHER --node_rank $SLURM_PROCID --role $SLURMD_NODENAME: $CMD
+clear; srun $SRUN_ARGS --jobid $SLURM_JOB_ID bash -c "$LAUNCHER --node_rank \$SLURM_PROCID --role \$SLURMD_NODENAME: $CMD" 2>&1 | tee $LOG_PATH
+
+echo "END TIME: $(date)"

From b13e6dad7b8a4c54b355ee2476242634445fce96 Mon Sep 17 00:00:00 2001
From: Loubna Ben Allal <44069155+loubnabnl@users.noreply.github.com>
Date: Tue, 14 Nov 2023 15:43:04 +0000
Subject: [PATCH 4/4] Update pretrain_starcoder2_1b_fix_rope.slurm

---
 examples/pretrain_starcoder2_1b_fix_rope.slurm | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/pretrain_starcoder2_1b_fix_rope.slurm b/examples/pretrain_starcoder2_1b_fix_rope.slurm
index 6d6792e2e7..43358d78bb 100644
--- a/examples/pretrain_starcoder2_1b_fix_rope.slurm
+++ b/examples/pretrain_starcoder2_1b_fix_rope.slurm
@@ -1,6 +1,6 @@
 #!/bin/bash
 #SBATCH --job-name=1B_test
-#SBATCH --nodes=1
+#SBATCH --nodes=32
 #SBATCH --ntasks-per-node=1          # crucial - only 1 task per dist per node!
 #SBATCH --gres=gpu:8
 #SBATCH --exclusive
@@ -47,7 +47,7 @@ NODE_RANK=$SLURM_PROCID
 WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
 
 # File path setup
-CHECKPOINT_PATH=/fsx/bigcode/experiments/pretraining/starcoder2-1B/checkpoints_fix # Adjust: Directory to store the checkpoints
+CHECKPOINT_PATH=/fsx/bigcode/experiments/pretraining/starcoder2-1B/checkpoints_fix_rope # Adjust: Directory to store the checkpoints
 # Starcoder2 tokenizer and data paths in /fsx/bigcode
 TOKENIZER_FILE=/fsx/loubna/data/tokenizer/starcoder2-smol-internal-1/tokenizer.json
 WEIGHTS_TRAIN=/fsx/loubna/bigcode_2/code/bigcode-data-mix/data/train_data_paths.txt.tmp
@@ -71,8 +71,8 @@ GPT_ARGS="\
     --no-position-embedding \
     --attention-dropout 0.1 \
     --hidden-dropout 0.1 \
-    --micro-batch-size 1 \
-    --global-batch-size 8 \
+    --micro-batch-size 4 \
+    --global-batch-size 1024 \
     --lr 0.0004 \
     --min-lr 0.00004 \
     --train-iters 500000 \