Skip to content

Commit 9f0d85a

Browse files
assumptionsandgKayobe AutomationAlex-Welshm-bull
authored
DOCA workflow support (#1469)
Add DOCA workflow support Co-authored-by: Kayobe Automation <kayobe-automation@example.org> Co-authored-by: Alex-Welsh <112560678+Alex-Welsh@users.noreply.github.com> Co-authored-by: Matt Anson <matta@stackhpc.com>
1 parent f6cd436 commit 9f0d85a

File tree

21 files changed

+329
-117
lines changed

21 files changed

+329
-117
lines changed

.github/workflows/package-build-ofed.yml

Lines changed: 24 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
---
2-
name: Build OFED packages
2+
name: Build OFED kernel modules
33
on:
44
workflow_dispatch:
55
inputs:
@@ -19,11 +19,11 @@ on:
1919

2020
env:
2121
ANSIBLE_FORCE_COLOR: True
22-
KAYOBE_ENVIRONMENT: ci-builder
22+
KAYOBE_ENVIRONMENT: ci-doca-builder
2323
KAYOBE_VAULT_PASSWORD: ${{ secrets.KAYOBE_VAULT_PASSWORD }}
2424
jobs:
2525
overcloud-ofed-packages:
26-
name: Build OFED packages
26+
name: Build OFED kernel modules
2727
if: github.repository == 'stackhpc/stackhpc-kayobe-config'
2828
runs-on: arc-skc-host-image-builder-runner
2929
permissions: {}
@@ -48,6 +48,11 @@ jobs:
4848
BRANCH=$(awk -F'=' '/defaultbranch/ {print $2}' src/kayobe-config/.gitreview)
4949
echo "openstack_release=${BRANCH}" | sed -E "s,(stable|unmaintained)/,," >> $GITHUB_OUTPUT
5050
51+
- name: Generate OFED tag
52+
id: ofed_tag
53+
run: |
54+
echo "ofed_tag=$(date +%Y%m%dT%H%M%S)" >> $GITHUB_OUTPUT
55+
5156
- name: Clone StackHPC Kayobe repository
5257
uses: actions/checkout@v4
5358
with:
@@ -86,6 +91,7 @@ jobs:
8691
id: image_tag
8792
run: |
8893
echo image_tag=$(grep stackhpc_rocky_9_overcloud_host_image_version: etc/kayobe/pulp-host-image-versions.yml | awk '{print $2}') >> $GITHUB_OUTPUT
94+
working-directory: ${{ github.workspace }}/src/kayobe-config
8995

9096
# Use the image override if set, otherwise use overcloud-os_distribution-os_release-tag
9197
- name: Output image name
@@ -145,13 +151,13 @@ jobs:
145151

146152
- name: Write Terraform outputs
147153
run: |
148-
cat << EOF > src/kayobe-config/etc/kayobe/environments/ci-builder/tf-outputs.yml
154+
cat << EOF > src/kayobe-config/etc/kayobe/environments/ci-doca-builder/tf-outputs.yml
149155
${{ steps.tf_outputs.outputs.stdout }}
150156
EOF
151157
152158
- name: Write Terraform network config
153159
run: |
154-
cat << EOF > src/kayobe-config/etc/kayobe/environments/ci-builder/tf-network-allocation.yml
160+
cat << EOF > src/kayobe-config/etc/kayobe/environments/ci-doca-builder/tf-network-allocation.yml
155161
---
156162
aio_ips:
157163
builder: "{{ access_ip_v4.value }}"
@@ -176,37 +182,37 @@ jobs:
176182
- name: Bootstrap the control host
177183
run: |
178184
source venvs/kayobe/bin/activate &&
179-
source src/kayobe-config/kayobe-env --environment ci-builder &&
185+
source src/kayobe-config/kayobe-env --environment ci-doca-builder &&
180186
kayobe control host bootstrap
181187
182188
- name: Run growroot playbook
183189
run: |
184190
source venvs/kayobe/bin/activate &&
185-
source src/kayobe-config/kayobe-env --environment ci-builder &&
191+
source src/kayobe-config/kayobe-env --environment ci-doca-builder &&
186192
kayobe playbook run src/kayobe-config/etc/kayobe/ansible/growroot.yml
187193
env:
188194
KAYOBE_VAULT_PASSWORD: ${{ secrets.KAYOBE_VAULT_PASSWORD }}
189195

190196
- name: Configure the seed host (Builder VM)
191197
run: |
192198
source venvs/kayobe/bin/activate &&
193-
source src/kayobe-config/kayobe-env --environment ci-builder &&
194-
kayobe seed host configure --skip-tags network,docker
199+
source src/kayobe-config/kayobe-env --environment ci-doca-builder &&
200+
kayobe seed host configure --skip-tags network,docker,docker-registry
195201
env:
196202
KAYOBE_VAULT_PASSWORD: ${{ secrets.KAYOBE_VAULT_PASSWORD }}
197203

198204
- name: Run a distro-sync
199205
run: |
200206
source venvs/kayobe/bin/activate &&
201-
source src/kayobe-config/kayobe-env --environment ci-builder &&
202-
kayobe seed host command run --become --command "dnf distro-sync --refresh"
207+
source src/kayobe-config/kayobe-env --environment ci-doca-builder &&
208+
kayobe seed host command run --become --command "dnf distro-sync --refresh --assumeyes"
203209
env:
204210
KAYOBE_VAULT_PASSWORD: ${{ secrets.KAYOBE_VAULT_PASSWORD }}
205211

206212
- name: Reset BLS entries on the seed host
207213
run: |
208214
source venvs/kayobe/bin/activate &&
209-
source src/kayobe-config/kayobe-env --environment ci-builder &&
215+
source src/kayobe-config/kayobe-env --environment ci-doca-builder &&
210216
kayobe playbook run src/kayobe-config/etc/kayobe/ansible/reset-bls-entries.yml \
211217
-e "reset_bls_host=ofed-builder"
212218
env:
@@ -215,32 +221,33 @@ jobs:
215221
- name: Disable noexec in /var/tmp
216222
run: |
217223
source venvs/kayobe/bin/activate &&
218-
source src/kayobe-config/kayobe-env --environment ci-builder &&
224+
source src/kayobe-config/kayobe-env --environment ci-doca-builder &&
219225
kayobe seed host command run --become --command "sed -i 's/noexec,//g' /etc/fstab"
220226
env:
221227
KAYOBE_VAULT_PASSWORD: ${{ secrets.KAYOBE_VAULT_PASSWORD }}
222228

223229
- name: Reboot to apply the kernel update
224230
run: |
225231
source venvs/kayobe/bin/activate &&
226-
source src/kayobe-config/kayobe-env --environment ci-builder &&
232+
source src/kayobe-config/kayobe-env --environment ci-doca-builder &&
227233
kayobe playbook run src/kayobe-config/etc/kayobe/ansible/reboot.yml
228234
env:
229235
KAYOBE_VAULT_PASSWORD: ${{ secrets.KAYOBE_VAULT_PASSWORD }}
230236

231237
- name: Run OFED builder playbook
232238
run: |
233239
source venvs/kayobe/bin/activate &&
234-
source src/kayobe-config/kayobe-env --environment ci-builder &&
240+
source src/kayobe-config/kayobe-env --environment ci-doca-builder &&
235241
kayobe playbook run src/kayobe-config/etc/kayobe/ansible/build-ofed-rocky.yml
236242
env:
237243
KAYOBE_VAULT_PASSWORD: ${{ secrets.KAYOBE_VAULT_PASSWORD }}
238244

239245
- name: Run OFED upload playbook
240246
run: |
241247
source venvs/kayobe/bin/activate &&
242-
source src/kayobe-config/kayobe-env --environment ci-builder &&
243-
kayobe playbook run src/kayobe-config/etc/kayobe/ansible/push-ofed.yml
248+
source src/kayobe-config/kayobe-env --environment ci-doca-builder &&
249+
kayobe playbook run src/kayobe-config/etc/kayobe/ansible/push-ofed.yml \
250+
-e "ofed_tag=${{ steps.ofed_tag.outputs.ofed_tag }}"
244251
env:
245252
KAYOBE_VAULT_PASSWORD: ${{ secrets.KAYOBE_VAULT_PASSWORD }}
246253

doc/source/contributor/ofed.rst

Lines changed: 72 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -4,19 +4,17 @@ OFED
44

55
Warning: Experimental workflow subject to change
66

7-
This section documents the workflow for building OFED packages for Release train integration.
8-
9-
The workflow builds the OFED kernel modules against the latest available kernel in Release train
10-
(as configured in SKC) and compiles them into RPM packages to be uploaded to Ark. Addtionally,
11-
this workflow downloads the userspace OFED packages from the Nvidia repository and uploads these
12-
to Ark.
7+
The Nvidia DOCA framework is distributed as part of StackHPC Release Train for OFED driver support.
8+
The DOCA repository is synced into Ark as part of the Release Train workflows; however, to ensure
9+
compatibility with Release Train packages, we are required to build OFED modules with support for
10+
the latest Release Train kernel.
1311

1412
Workflow
1513
========
1614

1715
The workflow uses workflow_dispatch to manually request an OFED build, which will deploy a builder
1816
VM, apply kayobe config to the builder, upgrade the kernel, reboot, then run two Ansible playbooks
19-
for building and uploading OFED to Ark.
17+
for building and uploading OFED modules to Ark.
2018

2119
Pre-requisites
2220
--------------
@@ -25,31 +23,82 @@ Before building OFED packages, the workflow will ensure that:
2523

2624
* A full distro-sync has taken place, ensuring the kernel is upgraded.
2725

28-
* The bootloader has been configured to use the latest kernel
26+
* The bootloader has been configured to use the latest kernel (reset-bls-entries.yml)
2927

3028
* noexec is disabled in the temporary logical volume.
3129

3230
build-ofed
3331
----------
3432

35-
Currently we only support building Rocky Linux 9 OFED packages.
36-
37-
In order to setup OFED, we're required to build kernel modules for the OFED drivers as
38-
the kernels we provide in release train are unsupported by OFED. To accomplish this we
39-
will need to use the doca-kernel-support from the doca-extra repository.
33+
Currently we only support building Rocky Linux 9 OFED kernel module packages.
4034

41-
We will need to instll dependencies in order to build the OFED kernel modules, and these
42-
are installed at the beginning of the build playbook. We also install base and appstream
43-
dependencies of userspace OFED packages here, this is intended to stop these dependencies
44-
being pulled in later when we download the OFED packages from the doca-host repository.
35+
The Build OFED module workflow will check that the filesystem is configured (noexec disabled)
36+
to allow the DOCA build script to run. The workflow will also install any necessary dependencies
37+
for the module build.
4538

46-
At the end of the playbook following the kernel module build, the OFED userspace packages
47-
are downloaded from the upstream repository in order to upload these to Ark.
39+
The build script will output a ``doca-kernel-repo`` RPM which contains all kernel modules built
40+
as part of the workflow. When this RPM is installed, the repofile is created pointing to the
41+
modules in ``/usr/share/doca-host-<doca-version>/Modules/<kernel-version>/`` on the host.
4842

4943
push-ofed
5044
---------
5145

52-
As we're not syncing OFED from any upstream source, and are instead creating our own
53-
repository of custom packages, we will be required to setup the Pulp distribution/publication
54-
and upload the content directly to Ark. This playbook uses the Pulp CLI to upload the RPMs
55-
to Ark.
46+
As mentioned above, the DOCA repository is synced into the ``doca`` repository in Ark. This workflow
47+
will upload the ``doca-kernel-repo`` RPM to a separate repository named ``doca-modules``. The version
48+
for this repository is set in ``pulp-repo-versions.yml`` and is disabled for local pulp syncs by
49+
default.
50+
51+
Install process
52+
===============
53+
54+
Release Train configuration
55+
---------------------------
56+
57+
DOCA repositories will need to be synced to the local Pulp service. Ensure the DOCA
58+
hosts are added to the ``mlnx`` group before running a package sync; if the group is not
59+
empty DOCA will be synced into the local Pulp. The local Pulp can be synced with Ark
60+
by running:
61+
62+
.. code-block:: console
63+
64+
kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/pulp-repo-sync.yml
65+
kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/pulp-repo-publish.yml
66+
67+
DOCA repositories can be templated to hosts by running Kayobe host configure.
68+
69+
.. code-block:: console
70+
71+
kayobe overcloud host configure -t dnf
72+
73+
StackHPC DOCA kernel modules will require the latest kernel version available in Ark for
74+
the current Rocky minor version. You should ensure that packages are up to date by running
75+
a package update, which can also be limited to hosts in the ``mlnx`` group.
76+
77+
.. code-block:: console
78+
79+
kayobe overcloud host package update --packages "*" --limit mlnx
80+
81+
To ensure the latest kernel is the default on boot, the bootloader entries will need
82+
to be reset before rebooting.
83+
84+
.. code-block:: console
85+
86+
kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/reset-bls-entries.yml -e reset_bls_host=mlnx
87+
88+
The hosts can now be rebooted to use the latest kernel. A rolling reboot may be applicable
89+
here to reduce disruptions. See the `package updates documentation <package-updates>`.
90+
91+
.. code-block:: console
92+
93+
kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/reboot.yml --limit mlnx
94+
95+
install-doca
96+
------------
97+
98+
A playbook is provided to install DOCA on hosts in the ``mlnx`` group. Ensure this group
99+
is configured to include the hosts you wish to install DOCA on. To run the install
100+
playbook:
101+
102+
.. code-block:: console
103+
104+
kayobe playbook run $KAYOBE_CONFIG_PATH/ansible/install-doca.yml
Lines changed: 17 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
---
2-
- name: Build OFED packages
2+
- name: Build OFED kernel modules
33
become: true
44
hosts: ofed-builder
55
gather_facts: false
@@ -17,57 +17,29 @@
1717
- name: Install package dependencies
1818
ansible.builtin.dnf:
1919
name:
20-
- kpartx
21-
- perl
22-
- rpm-build
20+
- autoconf
2321
- automake
24-
- patch
25-
- kernel
22+
- createrepo
23+
- cmake-filesystem
24+
- doca-extra
25+
- gcc-gfortran
2626
- kernel-devel
27-
- autoconf
28-
- pciutils
29-
- kernel-modules-extra
3027
- kernel-rpm-macros
31-
- lsof
28+
- kernel-modules-extra
29+
- kpartx
3230
- libtool
33-
- tk
34-
- gcc-gfortran
35-
- tcl
36-
- createrepo
37-
- cmake-filesystem
3831
- libnl3-devel
32+
- lsof
33+
- patch
34+
- pciutils
35+
- perl
3936
- python3-devel
37+
- rpm-build
38+
- tcl
39+
- tk
4040
state: latest
4141
update_cache: true
4242

43-
- name: Add DOCA host repository package
44-
ansible.builtin.dnf:
45-
name: "https://developer.nvidia.com/downloads/networking/secure/doca-sdk/DOCA_2.8/doca-host-2.8.0-204000_{{ stackhpc_pulp_doca_ofed_version }}_rhel9{{ stackhpc_pulp_repo_rocky_9_minor_version }}.x86_64.rpm"
46-
disable_gpg_check: true
47-
48-
- name: Install DOCA extra packages
49-
ansible.builtin.dnf:
50-
name: doca-extra
51-
52-
- name: Create build directory
53-
ansible.builtin.file:
54-
path: /home/cloud-user/ofed
55-
state: directory
56-
mode: "0777"
57-
58-
- name: Set build directory
59-
ansible.builtin.replace:
60-
path: /opt/mellanox/doca/tools/doca-kernel-support
61-
regexp: TMP_DIR=\$1
62-
replace: TMP_DIR=/home/cloud-user/ofed
63-
6443
- name: Build OFED kernel modules
65-
ansible.builtin.shell:
66-
cmd: |
67-
/opt/mellanox/doca/tools/doca-kernel-support
68-
69-
- name: Download OFED userspace packages
70-
ansible.builtin.dnf:
71-
name: doca-ofed-userspace
72-
download_only: true
73-
download_dir: /home/cloud-user/ofed
44+
ansible.builtin.command:
45+
cmd: /opt/mellanox/doca/tools/doca-kernel-support

etc/kayobe/ansible/install-doca.yml

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
---
2+
- name: Install DOCA
3+
become: true
4+
hosts: mlnx
5+
gather_facts: true
6+
tasks:
7+
- name: Get running kernel
8+
ansible.builtin.command:
9+
cmd: "uname -r"
10+
register: kernel
11+
12+
- name: Install kernel repo
13+
ansible.builtin.dnf:
14+
name: doca-kernel-repo
15+
state: latest
16+
update_cache: true
17+
18+
- name: Ensure correct priority for DOCA modules
19+
ansible.builtin.lineinfile:
20+
line: "priority=-2"
21+
insertafter: EOF
22+
path: "/etc/yum.repos.d/doca-kernel-{{ kernel.stdout }}.repo"
23+
24+
- name: Install DOCA OFED
25+
ansible.builtin.dnf:
26+
name: doca-ofed
27+
state: latest
28+
update_cache: true

0 commit comments

Comments
 (0)