#!/bin/bash

# --------------------------------------------------------------------------------------
# Part of the interTwin Project: https://www.intertwin.eu/
#
# Created by: Matteo Bunino
#
# Credit:
# - Matteo Bunino <matteo.bunino@cern.ch> - CERN
# - Linus Eickhoff <linus.maximilian.eickhoff@cern.ch> - CERN
# --------------------------------------------------------------------------------------

# Submit one SLURM job per distributed strategy (DDP, DeepSpeed, Horovod) for the
# default pipeline, plus DDP and DeepSpeed jobs for the GAN pipeline, all through
# the slurm.lumi.sh job script.
#
# Usage: run from the directory that contains slurm.lumi.sh (all paths are relative).
#
# Optional environment variables (neither is set by this script):
#   N            suffix appended to job names and log file names as "-n$N"
#   PYTHON_VENV  forwarded unchanged to the job script
#
# Exit status: 0 if every submission succeeded, 1 otherwise.

export CONTAINER_PATH="/project/project_465001592/itwinai-containers/container_test_5.sif"
export HYDRA_FULL_ERROR=1

readonly JOB_SCRIPT="slurm.lumi.sh"

# Make the "may be unset" case explicit: both expand to an empty string by default,
# which is exactly what the unset variables expanded to before.
N="${N:-}"
PYTHON_VENV="${PYTHON_VENV:-}"

#######################################
# Submit a single training job with sbatch.
# Globals:   N, PYTHON_VENV, JOB_SCRIPT (read)
# Arguments: $1 - distributed mode, exported to the job as DIST_MODE
#            $2 - run name, exported as RUN_NAME and used for job/log names
#            $3 - training command, exported to the job as TRAINING_CMD
# Outputs:   whatever sbatch prints
# Returns:   exit status of sbatch
#######################################
submit_job() {
  local dist_mode=$1
  local run_name=$2
  local training_cmd=$3
  local job_tag="${run_name}-n${N}"

  # NOTE: sbatch splits the --export list on commas, so the exported values
  # must not contain commas.
  sbatch \
    --export=ALL,DIST_MODE="$dist_mode",RUN_NAME="$run_name",TRAINING_CMD="$training_cmd",PYTHON_VENV="$PYTHON_VENV" \
    --job-name="$job_tag" \
    --output="logs_slurm/job-${job_tag}.out" \
    --error="logs_slurm/job-${job_tag}.err" \
    "$JOB_SCRIPT"
}

# The cleanup below deletes relative paths, so refuse to run from a directory
# that does not look like the use-case directory.
if [[ ! -f "$JOB_SCRIPT" ]]; then
  printf 'error: %s not found in %s; run this script from its own directory\n' \
    "$JOB_SCRIPT" "$PWD" >&2
  exit 1
fi

# Remove logs, checkpoints and ML logs left over from previous runs, then
# recreate the log directories the jobs write into.
rm -rf logs_slurm checkpoints* mllogs* ray_checkpoints logs_torchrun
mkdir -p logs_slurm logs_torchrun || exit 1

failures=0

### Default pipeline ###

# DDP itwinai
submit_job "ddp" "ddp-itwinai" \
  "itwinai exec-pipeline strategy=ddp checkpoints_location=checkpoints_ddp" \
  || failures=$((failures + 1))

# DeepSpeed itwinai
submit_job "deepspeed" "deepspeed-itwinai" \
  "itwinai exec-pipeline strategy=deepspeed checkpoints_location=checkpoints_deepspeed" \
  || failures=$((failures + 1))

# Horovod itwinai
submit_job "horovod" "horovod-itwinai" \
  "itwinai exec-pipeline strategy=horovod checkpoints_location=checkpoints_hvd" \
  || failures=$((failures + 1))

### GAN training ###
# The GAN jobs use their own run names and checkpoint directories. Reusing the
# names of the jobs above would make both jobs of a pair write to the same
# log files and the same checkpoint directory.
# NOTE(review): RUN_NAME is also exported to slurm.lumi.sh; confirm nothing
# there depends on the previous (shared) names.

# DDP itwinai
submit_job "ddp" "ddp-itwinai-gan" \
  "itwinai exec-pipeline strategy=ddp checkpoints_location=checkpoints_ddp_gan +pipe_key=training_pipeline_gan" \
  || failures=$((failures + 1))

# DeepSpeed itwinai
submit_job "deepspeed" "deepspeed-itwinai-gan" \
  "itwinai exec-pipeline strategy=deepspeed checkpoints_location=checkpoints_deepspeed_gan +pipe_key=training_pipeline_gan" \
  || failures=$((failures + 1))

# GAN with Horovod does not work
# Horovod itwinai
# submit_job "horovod" "horovod-itwinai-gan" \
#   "itwinai exec-pipeline strategy=horovod checkpoints_location=checkpoints_hvd_gan +pipe_key=training_pipeline_gan" \
#   || failures=$((failures + 1))

if ((failures > 0)); then
  printf 'error: %d job submission(s) failed\n' "$failures" >&2
  exit 1
fi