From 034b0e71b0a969837e11b3d8ddb6711624cd51bf Mon Sep 17 00:00:00 2001
From: stdweird
Date: Wed, 4 Sep 2024 14:11:39 +0200
Subject: [PATCH 1/4] add intelmpi slurm container support

---
 lib/vsc/mympirun/mpi/intelmpi.py | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/lib/vsc/mympirun/mpi/intelmpi.py b/lib/vsc/mympirun/mpi/intelmpi.py
index 7aff74d..fe824a2 100644
--- a/lib/vsc/mympirun/mpi/intelmpi.py
+++ b/lib/vsc/mympirun/mpi/intelmpi.py
@@ -174,6 +174,30 @@ def mpirun_prepare_execution(self):
 
         return super().mpirun_prepare_execution()
 
+    def prepare(self):
+        super().prepare()
+        self.prepare_container_environment()
+
+    def prepare_container_environment(self):
+        slurm_container = os.environ.get('SLURM_CONTAINER', None)
+        if slurm_container is not None:
+            logging.debug(f"Found SLURM_CONTAINER {slurm_container}")
+
+            # Intel MPI has some support for running with Singularity, but not for SLURM container mode
+            for key in [k for k in os.environ.keys() if k.startswith('APPTAINER') or k.startswith('SINGULARITY')]:
+                logging.debug(f"Removing environment variable {key} (={os.environ[key]}) in slurm container mode")
+                del os.environ[key]
+
+            hydrmk = getattr(self, 'HYDRA_RMK', None)
+            if self.has_hydra and hydrmk and hydrmk[0] == 'slurm':
+                # need to pass the full environment to tasks with the SLURM container wrapper
+                if slurm_container.startswith('HPCWN'):
+                    slurm_container += ':fullenv'
+                # pass the container option to srun in the SLURM bootstrap
+                envname = 'I_MPI_HYDRA_BOOTSTRAP_EXEC_EXTRA_ARGS'
+                os.environ[envname] = f"--container {slurm_container}"
+                logging.debug(f"set extra {envname} to {os.environ[envname]}")
+
     def pinning_override(self):
         """ pinning """
 

From 2b51af131a1ff594da426b24853ec68458ba2b53 Mon Sep 17 00:00:00 2001
From: stdweird
Date: Wed, 4 Sep 2024 14:15:16 +0200
Subject: [PATCH 2/4] bump version

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index b4b2e5f..98ba909 100644
--- a/setup.py
+++ b/setup.py
@@ -46,7 +46,7 @@
     'tests_require': [
         'mock',
     ],
-    'version': '5.4.0',
+    'version': '5.4.1',
     'author': [sdw, kh],
     'maintainer': [sdw, kh],
     'zip_safe': False,

From 2095b7f3783e345ef768e5b508300416f92a4605 Mon Sep 17 00:00:00 2001
From: stdweird
Date: Thu, 5 Sep 2024 17:38:30 +0200
Subject: [PATCH 3/4] pass openmpi OMPI variables

---
 lib/vsc/mympirun/mpi/openmpi.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/vsc/mympirun/mpi/openmpi.py b/lib/vsc/mympirun/mpi/openmpi.py
index c0562d8..0a380dd 100644
--- a/lib/vsc/mympirun/mpi/openmpi.py
+++ b/lib/vsc/mympirun/mpi/openmpi.py
@@ -58,6 +58,8 @@ class OpenMPI(MPI):
 
     MPIEXEC_TEMPLATE_GLOBAL_OPTION = ['--mca', '%(name)s', "%(value)s"]
 
+    OPTS_FROM_ENV_FLAVOR_PREFIX = ['OMPI']
+
     REMOTE_OPTION_TEMPLATE = ['--mca', 'pls_rsh_agent', '%(rsh)s']
 
     def use_ucx_pml(self):

From 4a21500e3126412f11740416f84334d1288c0330 Mon Sep 17 00:00:00 2001
From: stdweird
Date: Thu, 5 Sep 2024 17:56:09 +0200
Subject: [PATCH 4/4] refactor slurm container mechanism

---
 lib/vsc/mympirun/mpi/intelmpi.py | 27 +++------------------------
 lib/vsc/mympirun/mpi/mpi.py      | 32 ++++++++++++++++++++++++++++----
 lib/vsc/mympirun/mpi/openmpi.py  |  2 ++
 3 files changed, 33 insertions(+), 28 deletions(-)

diff --git a/lib/vsc/mympirun/mpi/intelmpi.py b/lib/vsc/mympirun/mpi/intelmpi.py
index fe824a2..c3997ac 100644
--- a/lib/vsc/mympirun/mpi/intelmpi.py
+++ b/lib/vsc/mympirun/mpi/intelmpi.py
@@ -77,6 +77,9 @@ class IntelMPI(MPI):
 
     OPTS_FROM_ENV_TEMPLATE = ['-envlist', '%(commaseparated)s']
 
+    CONTAINER_CLEANUP_ENVVARS = ('SINGULARITY', 'APPTAINER')  # must be a tuple, it is passed to str.startswith
+    CONTAINER_HYDRA_ENVVAR = 'I_MPI_HYDRA_BOOTSTRAP_EXEC_EXTRA_ARGS'
+
     def _has_hydra(self):
         """Has HYDRA or not"""
         mgr = os.environ.get('I_MPI_PROCESS_MANAGER', None)
@@ -174,30 +177,6 @@ def mpirun_prepare_execution(self):
 
         return super().mpirun_prepare_execution()
 
-    def prepare(self):
-        super().prepare()
-        self.prepare_container_environment()
-
-    def prepare_container_environment(self):
-        slurm_container = os.environ.get('SLURM_CONTAINER', None)
-        if slurm_container is not None:
-            logging.debug(f"Found SLURM_CONTAINER {slurm_container}")
-
-            # Intel MPI has some support for running with Singularity, but not for SLURM container mode
-            for key in [k for k in os.environ.keys() if k.startswith('APPTAINER') or k.startswith('SINGULARITY')]:
-                logging.debug(f"Removing environment variable {key} (={os.environ[key]}) in slurm container mode")
-                del os.environ[key]
-
-            hydrmk = getattr(self, 'HYDRA_RMK', None)
-            if self.has_hydra and hydrmk and hydrmk[0] == 'slurm':
-                # need to pass the full environment to tasks with the SLURM container wrapper
-                if slurm_container.startswith('HPCWN'):
-                    slurm_container += ':fullenv'
-                # pass the container option to srun in the SLURM bootstrap
-                envname = 'I_MPI_HYDRA_BOOTSTRAP_EXEC_EXTRA_ARGS'
-                os.environ[envname] = f"--container {slurm_container}"
-                logging.debug(f"set extra {envname} to {os.environ[envname]}")
-
     def pinning_override(self):
         """ pinning """
 
diff --git a/lib/vsc/mympirun/mpi/mpi.py b/lib/vsc/mympirun/mpi/mpi.py
index 0c71de1..7dd810b 100644
--- a/lib/vsc/mympirun/mpi/mpi.py
+++ b/lib/vsc/mympirun/mpi/mpi.py
@@ -206,6 +206,9 @@ class MPI(MpiBase):
     ]
     OPTS_FROM_ENV_FLAVOR_PREFIX = []  # to be set per flavor
 
+    CONTAINER_CLEANUP_ENVVARS = ()  # must be a tuple, it is passed to str.startswith
+    CONTAINER_HYDRA_ENVVAR = None
+
     def __init__(self, options, cmdargs, **kwargs):
         self.options = options
         self.cmdargs = cmdargs
@@ -291,7 +294,6 @@ def main(self):
             msg = f"main: exitcode {exitcode} > 0; cmd {self.mpirun_cmd}"
             raise Exception(msg)
 
-    ### BEGIN prepare ###
     def prepare(self):
         """Collect information to create the commands."""
         self.check_usable_cpus()
@@ -306,6 +308,8 @@ def prepare(self):
 
         self.set_pinning()
 
+        self.container_environment()
+
     def check_usable_cpus(self):
         """Check and log if non-standard cpus (eg due to cpusets)."""
         if not self.cores_per_node == len(self.cpus):
@@ -569,7 +573,6 @@ def make_mympirundir(self):
         logging.debug("make_mympirun_dir: tmp mympirundir %s", destdir)
         self.mympirundir = destdir
 
-    ### BEGIN pinning ###
     def set_pinning(self):
         """
         set pinmpi to True or False depending on the command line options 'pinmpi' and 'overridepin'
@@ -587,7 +590,29 @@ def set_pinning(self):
         else:
             logging.debug("set_pinning: pinmpi %s", self.options.pinmpi)
 
-    ### BEGIN mpdboot ###
+    def container_environment(self):
+        slurm_container = os.environ.get('SLURM_CONTAINER', None)
+        if slurm_container is not None:
+            logging.debug(f"Found SLURM_CONTAINER {slurm_container}")
+
+            if self.CONTAINER_CLEANUP_ENVVARS:
+                # e.g. Intel MPI has some support for running with Singularity, but not for SLURM container mode
+                for key in [k for k in os.environ.keys() if k.startswith(self.CONTAINER_CLEANUP_ENVVARS)]:
+                    logging.debug(f"Removing environment variable {key} (={os.environ[key]}) in slurm container mode")
+                    del os.environ[key]
+
+            hydrmk = getattr(self, 'HYDRA_RMK', None)
+            if self.has_hydra and hydrmk and hydrmk[0] == 'slurm' and self.CONTAINER_HYDRA_ENVVAR:
+                # assumes the Hydra SLURM support uses srun
+
+                # need to pass the full environment to tasks with the SLURM container wrapper
+                if slurm_container.startswith('HPCWN'):
+                    slurm_container += ':fullenv'
+
+                # pass the container option to srun in the SLURM bootstrap
+                os.environ[self.CONTAINER_HYDRA_ENVVAR] = f"--container={slurm_container}"
+                logging.debug(f"set extra {self.CONTAINER_HYDRA_ENVVAR} to {os.environ[self.CONTAINER_HYDRA_ENVVAR]}")
+
     def make_mpdboot(self):
         """
         Make the mpdboot configuration.
@@ -693,7 +718,6 @@ def make_mpdboot_options(self):
         if not self.has_hydra:
             self.mpdboot_options.add(self.REMOTE_OPTION_TEMPLATE, tmpl_vals={'rsh': self.get_rsh()})
 
-    ### BEGIN mpiexec ###
     def set_mpiexec_global_options(self):
         """
         Set mpiexec_global_options.
diff --git a/lib/vsc/mympirun/mpi/openmpi.py b/lib/vsc/mympirun/mpi/openmpi.py
index 0a380dd..5398974 100644
--- a/lib/vsc/mympirun/mpi/openmpi.py
+++ b/lib/vsc/mympirun/mpi/openmpi.py
@@ -62,6 +62,8 @@ class OpenMPI(MPI):
 
     REMOTE_OPTION_TEMPLATE = ['--mca', 'pls_rsh_agent', '%(rsh)s']
 
+    CONTAINER_HYDRA_ENVVAR = 'OMPI_MCA_plm_slurm_args'
+
     def use_ucx_pml(self):
         """Determine whether or not to use the UCX Point-to-Point Messaging Layer (PML)."""
         # don't use UCX by default (mostly because of backwards-compatibility)
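
Note for reviewers: below is a minimal sketch of how the refactored container_environment() hook can be exercised in isolation, in the spirit of the existing mock-based tests. The StubIntelMPI class, the demo() driver, and the SLURM_CONTAINER value are illustrative assumptions, not part of the patch; in particular has_hydra = True and HYDRA_RMK = ['slurm'] are set by hand here, while a real IntelMPI instance derives them at runtime.

import os
from unittest import mock

from vsc.mympirun.mpi.intelmpi import IntelMPI


class StubIntelMPI:
    """Bare stand-in carrying only the attributes container_environment() reads."""
    CONTAINER_CLEANUP_ENVVARS = IntelMPI.CONTAINER_CLEANUP_ENVVARS  # ('SINGULARITY', 'APPTAINER')
    CONTAINER_HYDRA_ENVVAR = IntelMPI.CONTAINER_HYDRA_ENVVAR  # 'I_MPI_HYDRA_BOOTSTRAP_EXEC_EXTRA_ARGS'
    HYDRA_RMK = ['slurm']  # assumed for this demo; the real class may define it differently
    has_hydra = True  # assumed; normally derived from I_MPI_PROCESS_MANAGER


def demo():
    env = {
        'SLURM_CONTAINER': 'HPCWN:/path/to/image.sqsh',  # hypothetical container spec
        'SINGULARITY_BIND': '/tmp',  # leftover that should be removed in container mode
    }
    # mock.patch.dict restores the original environment on exit,
    # even though container_environment() deletes keys from os.environ
    with mock.patch.dict(os.environ, env):
        # call the inherited method as a plain function on the stub instance
        IntelMPI.container_environment(StubIntelMPI())

        # the Singularity/Apptainer leftovers are stripped in container mode
        assert 'SINGULARITY_BIND' not in os.environ
        # HPCWN containers get ':fullenv' appended before being handed to srun
        expected = '--container=HPCWN:/path/to/image.sqsh:fullenv'
        assert os.environ['I_MPI_HYDRA_BOOTSTRAP_EXEC_EXTRA_ARGS'] == expected


if __name__ == '__main__':
    demo()
    print('container_environment behaved as expected')

One related observation: since the hydra branch is guarded by has_hydra and HYDRA_RMK[0] == 'slurm', setting CONTAINER_HYDRA_ENVVAR on the OpenMPI class only takes effect if that flavor also reports those values, which may be worth double-checking in review.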