Skip to content

Commit fb73cdf

Browse files
authored
Merge pull request #2 from stackhpc/training/leafcloud-v2.3
Update for Slurm appliance v2.3
2 parents 9ac2bd6 + 6ad6dcc commit fb73cdf

File tree

203 files changed

+3258
-1837
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

203 files changed

+3258
-1837
lines changed

.github/bin/get-s3-image.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ if [ -n "$image_exists" ]; then
1717
else
1818
echo "Image $image_name not found in OpenStack. Getting it from S3."
1919

20-
wget https://object.arcus.openstack.hpc.cam.ac.uk/swift/v1/AUTH_3a06571936a0424bb40bc5c672c4ccb1/$bucket_name/$image_name --progress=dot:giga
20+
wget https://leafcloud.store/swift/v1/AUTH_f39848421b2747148400ad8eeae8d536/$bucket_name/$image_name --progress=dot:giga
2121

2222
echo "Uploading image $image_name to OpenStack..."
2323
openstack image create --file $image_name --disk-format qcow2 $image_name --progress

.github/workflows/extra.yml

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ on:
88
- 'environments/.stackhpc/tofu/cluster_image.auto.tfvars.json'
99
- 'ansible/roles/doca/**'
1010
- 'ansible/roles/cuda/**'
11+
- 'ansible/roles/slurm_recompile/**' # runs on cuda group
1112
- 'ansible/roles/lustre/**'
1213
- '.github/workflows/extra.yml'
1314
pull_request:
@@ -32,16 +33,17 @@ jobs:
3233
- image_name: openhpc-extra-RL8
3334
source_image_name_key: RL8 # key into environments/.stackhpc/tofu/cluster_image.auto.tfvars.json
3435
inventory_groups: doca,cuda,lustre
35-
volume_size: 30 # needed for cuda
36+
volume_size: 35 # needed for cuda
3637
- image_name: openhpc-extra-RL9
3738
source_image_name_key: RL9
3839
inventory_groups: doca,cuda,lustre
39-
volume_size: 30 # needed for cuda
40+
volume_size: 35 # needed for cuda
4041
env:
4142
ANSIBLE_FORCE_COLOR: True
4243
OS_CLOUD: openstack
4344
CI_CLOUD: ${{ vars.CI_CLOUD }} # default from repo settings
4445
ARK_PASSWORD: ${{ secrets.ARK_PASSWORD }}
46+
PACKER_GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }}
4547

4648
steps:
4749
- uses: actions/checkout@v2
@@ -94,7 +96,7 @@ jobs:
9496
. environments/.stackhpc/activate
9597
cd packer/
9698
packer init .
97-
99+
98100
PACKER_LOG=1 packer build \
99101
-on-error=${{ vars.PACKER_ON_ERROR }} \
100102
-var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \
@@ -103,7 +105,7 @@ jobs:
103105
-var "inventory_groups=${{ matrix.build.inventory_groups }}" \
104106
-var "volume_size=${{ matrix.build.volume_size }}" \
105107
openstack.pkr.hcl
106-
108+
107109
- name: Get created image names from manifest
108110
id: manifest
109111
run: |
@@ -121,7 +123,7 @@ jobs:
121123
- name: Make image usable for further builds
122124
run: |
123125
. venv/bin/activate
124-
openstack image unset --property signature_verified "${{ steps.manifest.outputs.image-id }}"
126+
openstack image unset --property signature_verified "${{ steps.manifest.outputs.image-id }}" || true
125127
126128
- name: Delete image for automatically-run workflows
127129
run: |

.github/workflows/fatimage.yml

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,11 @@ on:
1010
- LEAFCLOUD
1111
- SMS
1212
- ARCUS
13+
cleanup_on_failure:
14+
description: Cleanup Packer resources on failure
15+
type: boolean
16+
required: true
17+
default: true
1318

1419
jobs:
1520
openstack:
@@ -23,17 +28,18 @@ jobs:
2328
matrix: # build RL8, RL9
2429
build:
2530
- image_name: openhpc-RL8
26-
source_image_name: Rocky-8-GenericCloud-Base-8.10-20240528.0.x86_64.qcow2
31+
source_image_name: Rocky-8-GenericCloud-Base-8.10-20240528.0.x86_64.raw
2732
inventory_groups: control,compute,login,update
2833
- image_name: openhpc-RL9
29-
source_image_name: Rocky-9-GenericCloud-Base-9.5-20241118.0.x86_64.qcow2
34+
source_image_name: Rocky-9-GenericCloud-Base-9.6-20250531.0.x86_64.qcow2
3035
inventory_groups: control,compute,login,update
3136
env:
3237
ANSIBLE_FORCE_COLOR: True
3338
OS_CLOUD: openstack
3439
CI_CLOUD: ${{ github.event.inputs.ci_cloud }}
3540
ARK_PASSWORD: ${{ secrets.ARK_PASSWORD }}
3641
LEAFCLOUD_PULP_PASSWORD: ${{ secrets.LEAFCLOUD_PULP_PASSWORD }}
42+
PACKER_GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }}
3743

3844
steps:
3945
- uses: actions/checkout@v2
@@ -78,7 +84,7 @@ jobs:
7884
packer init .
7985
8086
PACKER_LOG=1 packer build \
81-
-on-error=${{ vars.PACKER_ON_ERROR }} \
87+
-on-error=${{ github.event.inputs.cleanup_on_failure && 'cleanup' || 'abort' }} \
8288
-var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \
8389
-var "source_image_name=${{ matrix.build.source_image_name }}" \
8490
-var "image_name=${{ matrix.build.image_name }}" \
@@ -102,7 +108,7 @@ jobs:
102108
- name: Make image usable for further builds
103109
run: |
104110
. venv/bin/activate
105-
openstack image unset --property signature_verified "${{ steps.manifest.outputs.image-id }}"
111+
openstack image unset --property signature_verified "${{ steps.manifest.outputs.image-id }}" || true
106112
107113
- name: Upload manifest artifact
108114
uses: actions/upload-artifact@v4

.github/workflows/nightly-cleanup.yml

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -59,10 +59,34 @@ jobs:
5959
- name: Delete CI clusters
6060
run: |
6161
. venv/bin/activate
62-
if [[ -z "${ci_clusters}" ]]; then
62+
if [[ -z ${ci_clusters} ]]; then
6363
echo "No clusters to delete."
6464
exit 0
6565
fi
66-
echo "Deleting clusters: ${ci_clusters}"
67-
./dev/delete-cluster.py ${ci_clusters} --force
66+
67+
for cluster_prefix in ${ci_clusters}
68+
do
69+
echo "Processing cluster: $cluster_prefix"
70+
71+
# Get all servers with the matching name for control node
72+
CONTROL_SERVERS=$(openstack server list --name ${cluster_prefix}-control --format json)
73+
74+
# Get unique server names to avoid duplicate cleanup
75+
UNIQUE_NAMES=$(echo "$CONTROL_SERVERS" | jq -r '.[].Name' | sort | uniq)
76+
for name in $UNIQUE_NAMES; do
77+
echo "Deleting cluster with control node: $name"
78+
79+
# Get the first matching server ID by name
80+
server=$(echo "$CONTROL_SERVERS" | jq -r '.[] | select(.Name=="'"$name"'") | .ID' | head -n1)
81+
82+
# Make sure server still exists (wasn't deleted earlier)
83+
if ! openstack server show "$server" &>/dev/null; then
84+
echo "Server $server no longer exists, skipping $name."
85+
continue
86+
fi
87+
88+
echo "Deleting cluster $cluster_prefix (server $server)..."
89+
./dev/delete-cluster.py $cluster_prefix --force
90+
done
91+
done
6892
shell: bash

.github/workflows/nightlybuild.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ jobs:
3636
CI_CLOUD: ${{ github.event.inputs.ci_cloud || vars.CI_CLOUD }}
3737
ARK_PASSWORD: ${{ secrets.ARK_PASSWORD }}
3838
LEAFCLOUD_PULP_PASSWORD: ${{ secrets.LEAFCLOUD_PULP_PASSWORD }}
39+
PACKER_GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }}
3940

4041
steps:
4142
- uses: actions/checkout@v2
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
# Release images workflow.
#
# For each entry of the build matrix (RL8, RL9), looks up the image name stored
# under `.cluster_image[<build>]` in IMAGE_PATH and copies the object of that
# name from the `openhpc-images-prerelease` bucket to the `openhpc-images`
# bucket using s3cmd. Runs on manual dispatch and when a release is published.
name: Release images
on:
  workflow_dispatch:
  release:
    types:
      - published # should work for both pre-releases and releases
env:
  IMAGE_PATH: environments/.stackhpc/tofu/cluster_image.auto.tfvars.json
jobs:
  ci-image-release:
    name: ci-image-release
    runs-on: ubuntu-22.04
    # Include the matrix leg in the group name so the RL8 and RL9 jobs do not
    # share a single concurrency group and block one another.
    concurrency: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build }}
    strategy:
      fail-fast: false
      matrix:
        build:
          - RL8
          - RL9
    steps:
      - uses: actions/checkout@v2

      - name: Write s3cmd configuration
        # Pass the secret through the environment instead of interpolating it
        # into the script text, so quotes, `$` or backticks inside the
        # configuration are written verbatim rather than parsed by the shell.
        env:
          S3_CFG: ${{ secrets.LEAFCLOUD_S3_CFG }}
        run: printf '%s\n' "$S3_CFG" > ~/.s3cfg
        shell: bash

      - name: Install s3cmd
        run: |
          sudo apt-get update
          sudo apt-get --yes install s3cmd

      - name: Retrieve image name
        run: |
          # `// empty` makes jq print nothing (instead of the string "null")
          # when the key is absent, so a missing entry is caught below.
          TARGET_IMAGE=$(jq --arg version "${{ matrix.build }}" -r '.cluster_image[$version] // empty' "${{ env.IMAGE_PATH }}")
          if [[ -z "${TARGET_IMAGE}" ]]; then
            echo "No cluster_image entry for ${{ matrix.build }} in ${{ env.IMAGE_PATH }}" >&2
            exit 1
          fi
          # Export for the following steps.
          echo "TARGET_IMAGE=${TARGET_IMAGE}" >> "$GITHUB_ENV"
        shell: bash

      - name: Copy image from pre-release to release bucket
        # TARGET_IMAGE is provided via GITHUB_ENV by the previous step.
        run: s3cmd cp "s3://openhpc-images-prerelease/${TARGET_IMAGE}" s3://openhpc-images
        shell: bash

.github/workflows/s3-image-sync.yml

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
name: Upload CI-tested images to Arcus S3 and sync clouds
1+
name: Upload CI-tested images to Leafcloud S3 and sync clouds
22
on:
33
workflow_dispatch:
44
push:
@@ -21,11 +21,12 @@ jobs:
2121

2222
- name: Write s3cmd configuration
2323
run: |
24-
echo "${{ secrets['ARCUS_S3_CFG'] }}" > ~/.s3cfg
24+
echo "${{ secrets['LEAFCLOUD_S3_CFG'] }}" > ~/.s3cfg
2525
shell: bash
2626

2727
- name: Install s3cmd
2828
run: |
29+
sudo apt-get update
2930
sudo apt-get --yes install s3cmd
3031
3132
- name: Cleanup S3 bucket
@@ -72,23 +73,41 @@ jobs:
7273

7374
- name: Write s3cmd configuration
7475
run: |
75-
echo "${{ secrets['ARCUS_S3_CFG'] }}" > ~/.s3cfg
76+
echo "${{ secrets['LEAFCLOUD_S3_CFG'] }}" > ~/.s3cfg
7677
shell: bash
7778

78-
- name: Install s3cmd
79+
- name: Install s3cmd and qemu-utils
7980
run: |
80-
sudo apt-get --yes install s3cmd
81+
sudo apt-get update
82+
sudo apt-get --yes install s3cmd qemu-utils
8183
8284
- name: Retrieve image name
8385
run: |
8486
TARGET_IMAGE=$(jq --arg version "${{ matrix.build }}" -r '.cluster_image[$version]' "${{ env.IMAGE_PATH }}")
8587
echo "TARGET_IMAGE=${TARGET_IMAGE}" >> "$GITHUB_ENV"
8688
shell: bash
8789

90+
- name: Clear up some space on runner
91+
run: |
92+
df -h
93+
sudo rm -rf /usr/share/dotnet
94+
sudo rm -rf /opt/ghc
95+
sudo rm -rf "/usr/local/share/boost"
96+
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
97+
sudo apt-get clean
98+
df -h
99+
88100
- name: Download image to runner
89101
run: |
90102
. venv/bin/activate
91-
openstack image save --file ${{ env.TARGET_IMAGE }} ${{ env.TARGET_IMAGE }}
103+
openstack image save --file "${{ env.TARGET_IMAGE }}.raw" "${{ env.TARGET_IMAGE }}"
104+
df -h
105+
shell: bash
106+
107+
- name: Convert image to QCOW2
108+
run: |
109+
. venv/bin/activate
110+
qemu-img convert -f raw -O qcow2 -c "${{ env.TARGET_IMAGE }}.raw" "${{ env.TARGET_IMAGE }}"
92111
shell: bash
93112

94113
- name: Upload Image to S3

.github/workflows/stackhpc.yml

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,7 @@ on:
99
- '**'
1010
- '!dev/**'
1111
- 'dev/setup-env.sh'
12-
- '!docs/**'
13-
- '!README.md'
12+
- '!**.md'
1413
- '!.gitignore'
1514
- '!.github/workflows/'
1615
- '.github/workflows/stackhpc'
@@ -19,8 +18,7 @@ on:
1918
- '**'
2019
- '!dev/**'
2120
- 'dev/setup-env.sh'
22-
- '!docs/**'
23-
- '!README.md'
21+
- '!**.md'
2422
- '!.gitignore'
2523
- '!.github/workflows/'
2624
- '.github/workflows/stackhpc'
@@ -91,9 +89,9 @@ jobs:
9189
run: dev/setup-env.sh
9290

9391
- name: Install OpenTofu
94-
uses: opentofu/setup-opentofu@v1
92+
uses: opentofu/setup-opentofu@v1.0.5
9593
with:
96-
tofu_version: 1.6.2
94+
tofu_version: 1.9.0
9795

9896
- name: Initialise tofu
9997
run: tofu init
@@ -230,6 +228,16 @@ jobs:
230228
env:
231229
DEMO_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
232230

231+
- name: Delete possible volume snapshot from slurm upgrade
232+
run: |
233+
. venv/bin/activate
234+
. environments/.stackhpc/activate
235+
if [ -n "$SNAPSHOT" ]
236+
then
237+
echo Deleting $SNAPSHOT
238+
openstack volume snapshot delete $SNAPSHOT
239+
fi
240+
233241
- name: Delete infrastructure
234242
run: |
235243
. venv/bin/activate

.github/workflows/upload-release-image.yml.sample

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ on:
1919
inputs:
2020
image_name:
2121
type: string
22-
description: Image name from: (https://object.arcus.openstack.hpc.cam.ac.uk/swift/v1/AUTH_3a06571936a0424bb40bc5c672c4ccb1/{BUCKET_NAME})
22+
description: Image name from: (https://leafcloud.store/swift/v1/AUTH_f39848421b2747148400ad8eeae8d536/{BUCKET_NAME})
2323
required: true
2424
bucket_name:
2525
type: choice

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,4 @@ venv
55
*.pyc
66
packer/openhpc2
77
.vscode
8+
requirements.yml.last

0 commit comments

Comments
 (0)