Skip to content

Commit d6ffdaf

Browse files
committed
Merge branch 'main' into feat/auto-gres
2 parents 0ed4fab + 335d88b commit d6ffdaf

File tree

17 files changed

+300
-81
lines changed

17 files changed

+300
-81
lines changed

.ansible-lint.yml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,6 @@ skip_list:
66
- jinja[spacing]
77
- galaxy[no-changelog]
88
- meta-runtime[unsupported-version]
9-
10-
warn_list:
119
- name[missing]
1210
- name[play]
1311
- var-naming

.github/workflows/stackhpc.yml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,12 @@ jobs:
107107
. venv/bin/activate
108108
. environments/.stackhpc/activate
109109
cd "$STACKHPC_TF_DIR"
110-
tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars"
110+
max_retries=3
111+
delay=30
112+
for i in $(seq 1 $max_retries); do
113+
tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" && break
114+
[ "$i" -lt "$max_retries" ] && sleep $delay || exit 1
115+
done
111116
112117
- name: Delete infrastructure if provisioning failed
113118
run: |

ansible/adhoc/sync-pulp.yml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,5 @@
55
name: pulp_site
66
tasks_from: sync.yml
77
vars:
8-
pulp_site_target_arch: "x86_64"
9-
pulp_site_target_distribution: "rocky"
108
# default distribution to *latest* specified for baseos repo:
119
pulp_site_target_distribution_version: "{{ dnf_repos_repos['baseos'].keys() | map('float') | sort | last }}"

ansible/fatimage.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,7 @@
206206
ansible.builtin.include_role:
207207
name: hpctests
208208
tasks_from: source-hpl.yml
209+
when: "'hpctests' in group_names"
209210

210211
- hosts: prometheus
211212
become: true

ansible/roles/eessi/defaults/main.yaml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
---
2+
cvmfs_release_version: "6-3"
3+
24
# Default to 10GB
35
cvmfs_quota_limit_mb: 10000
46

@@ -9,4 +11,4 @@ cvmfs_config_default:
911
cvmfs_config_overrides: {}
1012
cvmfs_config: "{{ cvmfs_config_default | combine(cvmfs_config_overrides) }}"
1113

12-
cvmfs_gpg_checksum: "sha256:4ac81adff957565277cfa6a4a330cdc2ce5a8fdd73b8760d1a5a32bef71c4bd6"
14+
cvmfs_gpg_checksum: "sha256:5c60679d307a96524204c127250e8ebdda66a459659faa1718bdf32dde1d7069"

ansible/roles/eessi/tasks/install.yml

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,18 +3,17 @@
33
- name: Download Cern GPG key
44
# checkov:skip=CKV2_ANSIBLE_2: "Ensure that HTTPS url is used with get_url"
55
ansible.builtin.get_url:
6-
url: http://cvmrepo.web.cern.ch/cvmrepo/yum/RPM-GPG-KEY-CernVM
6+
url: https://cvmrepo.web.cern.ch/cvmrepo/yum/RPM-GPG-KEY-CernVM-2048
77
dest: ./cvmfs-key.gpg
88
checksum: "{{ cvmfs_gpg_checksum }}"
99
mode: "0644"
1010

1111
- name: Import downloaded GPG key # noqa: no-changed-when
1212
ansible.builtin.command: rpm --import cvmfs-key.gpg # noqa: command-instead-of-module
13+
1314
- name: Add CVMFS repo
14-
# checkov:skip=CKV2_ANSIBLE_4: "Ensure that packages with untrusted or missing GPG signatures are not used by dnf"
1515
ansible.builtin.dnf:
16-
name: https://ecsft.cern.ch/dist/cvmfs/cvmfs-release/cvmfs-release-latest.noarch.rpm
17-
disable_gpg_check: true
16+
name: "https://ecsft.cern.ch/dist/cvmfs/cvmfs-release/cvmfs-release-{{ cvmfs_release_version }}.noarch.rpm"
1817

1918
- name: Install CVMFS
2019
ansible.builtin.dnf:

ansible/roles/nhc/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ When the `ansible/site.yml` playbook is run this will automatically:
3737

3838
- Filesystem mounts
3939
- Ethernet interfaces
40+
- InfiniBand interfaces
4041

4142
See `/etc/nhc/nhc.conf` on a compute node for the full configuration.
4243

ansible/roles/nhc/templates/nhc.conf.j2

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,18 @@
88
{# /efi is mounted both directly and via systemd1 autofs, which NHC can't cope with #}
99
{# use `awk '{print $5 " " $10 " " $4 " " $9}' /proc/self/mountinfo | sort -k1` to check that is the only case #}
1010
{% set mount_mode = 'rw' if 'rw' in mount.options.split(',') else 'ro' %}
11-
{{ ansible_fqdn }} || check_fs_mount_{{ mount_mode }} -t "{{ mount.fstype }}" -s "{{ mount.device }}" -f "{{ mount.mount }}"
11+
{{ ansible_fqdn }} || check_fs_mount_{{ mount_mode }} -t "{{ mount.fstype }}" -s "{{ mount.device | replace(' ', '\\040') }}" -f "{{ mount.mount }}"
1212
{% endfor %}
1313

1414
## Ethernet interface checks
1515
{% for iface in ansible_interfaces | select('match', 'eth') %}
1616
{{ ansible_fqdn }} || check_hw_eth {{ iface }}
1717
{% endfor %}
1818

19+
## InfiniBand interface checks
20+
{% for iface in ansible_interfaces | select('match', 'ib') %}
21+
{{ ansible_fqdn }} || check_hw_ib {{ (ansible_facts[iface]['speed'] / 1000) | int }} {{ iface }}
22+
{% endfor %}
23+
1924
## Site-specific checks
2025
{{ nhc_config_extra }}

dev/image-set-properties.sh

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
#!/usr/bin/bash
2+
# Set image properties correctly for Slurm Appliance images
3+
#
4+
# Usage:
5+
# dev/image-set-properties.sh $IMAGE_NAME_OR_ID
6+
7+
set -euo pipefail
8+
9+
image=${1?param missing - image name or ID}
10+
echo getting image format ...
11+
format=$(openstack image show -c disk_format -f value "${image}")
12+
13+
echo setting constant properties ...
14+
set -x
15+
openstack image set \
16+
--property hw_machine_type=q35 \
17+
--property hw_architecture=x86_64 \
18+
--property hw_vif_multiqueue_enabled=true \
19+
--property hw_firmware_type=uefi \
20+
--property os_distro=rocky \
21+
--property os_type=linux \
22+
--property os_admin_user=rocky \
23+
"$image"
24+
25+
set +x
26+
if [[ "$format" = raw ]]; then
27+
echo setting raw properties...
28+
set -x
29+
openstack image set \
30+
--property hw_scsi_model=virtio-scsi \
31+
--property hw_disk_bus=scsi \
32+
"$image"
33+
else
34+
echo setting qcow2 properties
35+
set -x
36+
openstack image set \
37+
--property hw_scsi_model=virtio-scsi \
38+
--property hw_disk_bus=scsi \
39+
--property hw_scsi_model=virtio \
40+
"$image"
41+
fi

docs/eessi.md

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
# EESSI Guide
2+
3+
## How to Load EESSI
4+
5+
The EESSI environment can be initialised by running:
6+
7+
```bash
8+
source /cvmfs/software.eessi.io/versions/2023.06/init/bash
9+
```
10+
11+
This is non-reversible because it:
12+
13+
- Changes your `$PATH`, `$MODULEPATH`, `$LD_LIBRARY_PATH`, and other critical environment variables.
14+
- Sets EESSI-specific variables such as `EESSI_ROOT`.
15+
16+
However, it is the recommended method because it:
17+
18+
- Detects system CPU architecture and OS.
19+
- Detects and configures GPU support.
20+
- Prepares the full EESSI software stack.
21+
- Sets up Lmod (environment module system).
22+
23+
The [EESSI docs](https://www.eessi.io/docs/using_eessi/setting_up_environment/) offer another method to load EESSI. This alternative method only initialises the Lmod module system and does not load a platform-specific setup. For this reason, it is recommended to use the method detailed above.
24+
25+
Successful environment setup will show `{EESSI 2023.06}` at the start of your shell prompt.
26+
27+
To deactivate your EESSI environment you can either restart your shell using `exec bash` or exit the shell by running `exit`.
28+
29+
## GPU Support with EESSI
30+
31+
To enable GPU support, the cluster must be running a site-specific image build that has CUDA enabled. For a guide on how to do this, please refer to [docs/image-build.md](./image-build.md).
32+
33+
### Using GPUs
34+
35+
All CUDA-enabled software in EESSI expects CUDA drivers in a specific `host_injections` directory.
36+
37+
#### To expose the NVIDIA GPU drivers
38+
39+
Use the `link_nvidia_host_libraries.sh` script, provided by EESSI, to symlink your GPU drivers into `host_injections`.
40+
41+
```bash
42+
/cvmfs/software.eessi.io/versions/2023.06/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh
43+
```
44+
45+
Rerun this script when your NVIDIA GPU drivers are updated. It is also safe to rerun at any time as the script will detect if the driver versions have already been symlinked.
46+
47+
### Building with GPUs
48+
49+
Run `which nvcc` to confirm that the CUDA compiler is found.
50+
51+
If `nvcc` is not found, add the CUDA path to your environment:
52+
53+
```bash
54+
export PATH=/usr/local/cuda/bin:$PATH
55+
```
56+
57+
`which nvcc` should now show the path to the CUDA compiler.
58+
59+
#### Loading EESSI module for the GCC compiler
60+
61+
Running `which gcc` with EESSI initialised should initially show a path `.../2023.06/compat...` which points to the compatibility compiler.
62+
63+
It is important to load a `gcc` version that is compatible with the host's CUDA version. This can be found by referring to the table below:
64+
65+
To find your CUDA version, run `nvcc --version`.
66+
67+
| **CUDA Version** | **Recommended GCC Module (EESSI 2023.06)** |
68+
| ---------------- | ------------------------------------------------------- |
69+
| CUDA 13.0 | GCC/12.3.0 |
70+
| CUDA 12.3 – 12.5 | GCC/12.3.0 or GCC/12.2.0 (officially supports GCC 12.x) |
71+
| CUDA 12.0 – 12.2 | GCC/12.2.0 (officially listed supported version) |
72+
| CUDA 11.4 – 11.8 | no matching GCC module in EESSI 2023.06 |
73+
| CUDA < 11.4 | not supported |
74+
75+
To load the correct EESSI module, run the following command, substituting x for the compatible GCC version:
76+
77+
```bash
78+
module load GCC/12.x.0
79+
```
80+
81+
Running `which gcc` will now give a path `.../2023.06/software...` which is the full compiler provided by EESSI. This is what we want for CUDA builds.
82+
83+
Now you can run `cmake` and `make` to compile CUDA using EESSI's `gcc`.
84+
85+
#### Test: Compile deviceQuery from CUDA-Samples
86+
87+
To test that your EESSI setup can compile CUDA, try compiling `deviceQuery` from CUDA-Samples with the following steps:
88+
89+
```bash
90+
git clone https://github.com/NVIDIA/cuda-samples.git
91+
cd cuda-samples/Samples/1_Utilities/deviceQuery
92+
mkdir -p build
93+
cd build
94+
cmake ..
95+
make
96+
./deviceQuery
97+
```

0 commit comments

Comments
 (0)