Skip to content

Commit 33f0385

Browse files
lineick and matbun authored
Lumi scripts for MNIST (#378)
* update files * first working version * update scripts * added singularity bindings module * module * latest container * fix LD_LIBRARY_PATH? * fixed horovod: multinode ray hpo and distributed training * Update slurm.lumi.sh * Update slurm.lumi.sh * cleanup, oom fix * cleanup * cleanup * cleanup * small change * added comments --------- Co-authored-by: Matteo Bunino <48362942+matbun@users.noreply.github.com>
1 parent 3fee316 commit 33f0385

File tree

4 files changed

+403
-7
lines changed

4 files changed

+403
-7
lines changed

use-cases/mnist/torch/runall.sh renamed to use-cases/mnist/torch/runall.jsc.sh

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$T
2727
--job-name="$RUN_NAME-n$N" \
2828
--output="logs_slurm/job-$RUN_NAME-n$N.out" \
2929
--error="logs_slurm/job-$RUN_NAME-n$N.err" \
30-
slurm.sh
30+
slurm.jsc.sh
3131

3232
# DeepSpeed itwinai
3333
DIST_MODE="deepspeed"
@@ -37,7 +37,7 @@ sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$T
3737
--job-name="$RUN_NAME-n$N" \
3838
--output="logs_slurm/job-$RUN_NAME-n$N.out" \
3939
--error="logs_slurm/job-$RUN_NAME-n$N.err" \
40-
slurm.sh
40+
slurm.jsc.sh
4141

4242
# Horovod itwinai
4343
DIST_MODE="horovod"
@@ -47,7 +47,7 @@ sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$T
4747
--job-name="$RUN_NAME-n$N" \
4848
--output="logs_slurm/job-$RUN_NAME-n$N.out" \
4949
--error="logs_slurm/job-$RUN_NAME-n$N.err" \
50-
slurm.sh
50+
slurm.jsc.sh
5151

5252

5353
### GAN training ###
@@ -60,7 +60,7 @@ sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$T
6060
--job-name="$RUN_NAME-n$N" \
6161
--output="logs_slurm/job-$RUN_NAME-n$N.out" \
6262
--error="logs_slurm/job-$RUN_NAME-n$N.err" \
63-
slurm.sh
63+
slurm.jsc.sh
6464

6565
# DeepSpeed itwinai
6666
DIST_MODE="deepspeed"
@@ -70,7 +70,7 @@ sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$T
7070
--job-name="$RUN_NAME-n$N" \
7171
--output="logs_slurm/job-$RUN_NAME-n$N.out" \
7272
--error="logs_slurm/job-$RUN_NAME-n$N.err" \
73-
slurm.sh
73+
slurm.jsc.sh
7474

7575
# GAN with Horovod does not work
7676
# # Horovod itwinai
@@ -81,4 +81,4 @@ sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$T
8181
# --job-name="$RUN_NAME-n$N" \
8282
# --output="logs_slurm/job-$RUN_NAME-n$N.out" \
8383
# --error="logs_slurm/job-$RUN_NAME-n$N.err" \
84-
# slurm.sh
84+
# slurm.jsc.sh

use-cases/mnist/torch/runall.lumi.sh

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
#!/bin/bash
2+
3+
# --------------------------------------------------------------------------------------
4+
# Part of the interTwin Project: https://www.intertwin.eu/
5+
#
6+
# Created by: Matteo Bunino
7+
#
8+
# Credit:
9+
# - Matteo Bunino <matteo.bunino@cern.ch> - CERN
10+
# - Linus Eickhoff <linus.maximilian.eickhoff@cern.ch> - CERN
11+
# --------------------------------------------------------------------------------------
12+
13+
export CONTAINER_PATH="/project/project_465001592/itwinai-containers/container_test_5.sif"
14+
15+
# Clear SLURM logs (*.out and *.err files)
16+
rm -rf logs_slurm checkpoints* mllogs* ray_checkpoints logs_torchrun
17+
mkdir -p logs_slurm logs_torchrun
18+
19+
export HYDRA_FULL_ERROR=1
20+
21+
# DDP itwinai
22+
DIST_MODE="ddp"
23+
RUN_NAME="ddp-itwinai"
24+
TRAINING_CMD="itwinai exec-pipeline strategy=ddp checkpoints_location=checkpoints_ddp"
25+
sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" \
26+
--job-name="$RUN_NAME-n$N" \
27+
--output="logs_slurm/job-$RUN_NAME-n$N.out" \
28+
--error="logs_slurm/job-$RUN_NAME-n$N.err" \
29+
slurm.lumi.sh
30+
31+
# DeepSpeed itwinai
32+
DIST_MODE="deepspeed"
33+
RUN_NAME="deepspeed-itwinai"
34+
TRAINING_CMD="itwinai exec-pipeline strategy=deepspeed checkpoints_location=checkpoints_deepspeed"
35+
sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" \
36+
--job-name="$RUN_NAME-n$N" \
37+
--output="logs_slurm/job-$RUN_NAME-n$N.out" \
38+
--error="logs_slurm/job-$RUN_NAME-n$N.err" \
39+
slurm.lumi.sh
40+
41+
# Horovod itwinai
42+
DIST_MODE="horovod"
43+
RUN_NAME="horovod-itwinai"
44+
TRAINING_CMD="itwinai exec-pipeline strategy=horovod checkpoints_location=checkpoints_hvd"
45+
sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" \
46+
--job-name="$RUN_NAME-n$N" \
47+
--output="logs_slurm/job-$RUN_NAME-n$N.out" \
48+
--error="logs_slurm/job-$RUN_NAME-n$N.err" \
49+
slurm.lumi.sh
50+
51+
### GAN training ###
52+
53+
# DDP itwinai
54+
DIST_MODE="ddp"
55+
RUN_NAME="ddp-itwinai"
56+
TRAINING_CMD="itwinai exec-pipeline strategy=ddp checkpoints_location=checkpoints_ddp +pipe_key=training_pipeline_gan"
57+
sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" \
58+
--job-name="$RUN_NAME-n$N" \
59+
--output="logs_slurm/job-$RUN_NAME-n$N.out" \
60+
--error="logs_slurm/job-$RUN_NAME-n$N.err" \
61+
slurm.lumi.sh
62+
63+
# DeepSpeed itwinai
64+
DIST_MODE="deepspeed"
65+
RUN_NAME="deepspeed-itwinai"
66+
TRAINING_CMD="itwinai exec-pipeline strategy=deepspeed checkpoints_location=checkpoints_deepspeed +pipe_key=training_pipeline_gan"
67+
sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" \
68+
--job-name="$RUN_NAME-n$N" \
69+
--output="logs_slurm/job-$RUN_NAME-n$N.out" \
70+
--error="logs_slurm/job-$RUN_NAME-n$N.err" \
71+
slurm.lumi.sh
72+
73+
# GAN with Horovod does not work
74+
# Horovod itwinai
75+
# DIST_MODE="horovod"
76+
# RUN_NAME="horovod-itwinai"
77+
# TRAINING_CMD="itwinai exec-pipeline strategy=horovod checkpoints_location=checkpoints_hvd +pipe_key=training_pipeline_gan"
78+
# sbatch --export=ALL,DIST_MODE="$DIST_MODE",RUN_NAME="$RUN_NAME",TRAINING_CMD="$TRAINING_CMD",PYTHON_VENV="$PYTHON_VENV" \
79+
# --job-name="$RUN_NAME-n$N" \
80+
# --output="logs_slurm/job-$RUN_NAME-n$N.out" \
81+
# --error="logs_slurm/job-$RUN_NAME-n$N.err" \
82+
# slurm.lumi.sh

use-cases/mnist/torch/slurm.sh renamed to use-cases/mnist/torch/slurm.jsc.sh

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,9 @@ function ray-launcher(){
130130
echo All Ray workers started.
131131

132132
# Run command without srun
133-
$1 training_pipeline.steps.training_step.ray_scaling_config.num_workers=$(($SLURM_GPUS_PER_NODE * $SLURM_NNODES))
133+
# if you want the number of workers to be adaptive during distributed training append this:
134+
# training_pipeline.steps.training_step.ray_scaling_config.num_workers=$(($SLURM_GPUS_PER_NODE * $SLURM_NNODES))
135+
$1
134136
}
135137

136138
function torchrun-launcher(){

0 commit comments

Comments (0)