From 1a33e846e1bea6b3c10e2f1dae9299ec950bbac7 Mon Sep 17 00:00:00 2001 From: Jake Hutchinson Date: Wed, 3 Jul 2024 14:59:37 +0100 Subject: [PATCH 01/28] OFED workflow --- .github/workflows/package-build-ofed.yml | 206 ++++++++++++++++++ etc/kayobe/ansible/build-ofed.yml | 75 +++++++ etc/kayobe/ansible/push-ofed.yml | 51 +++++ .../environments/ci-builder/inventory/hosts | 4 + etc/kayobe/pulp.yml | 13 ++ 5 files changed, 349 insertions(+) create mode 100644 .github/workflows/package-build-ofed.yml create mode 100644 etc/kayobe/ansible/build-ofed.yml create mode 100644 etc/kayobe/ansible/push-ofed.yml diff --git a/.github/workflows/package-build-ofed.yml b/.github/workflows/package-build-ofed.yml new file mode 100644 index 000000000..358c18c52 --- /dev/null +++ b/.github/workflows/package-build-ofed.yml @@ -0,0 +1,206 @@ +--- +name: Build OFED packages +on: + workflow_dispatch: + inputs: + rocky9: + description: Build Rocky Linux 9 + type: boolean + default: true + secrets: + KAYOBE_VAULT_PASSWORD: + required: true + CLOUDS_YAML: + required: true + OS_APPLICATION_CREDENTIAL_ID: + required: true + OS_APPLICATION_CREDENTIAL_SECRET: + required: true + +env: + ANSIBLE_FORCE_COLOR: True + KAYOBE_ENVIRONMENT: ci-builder + KAYOBE_VAULT_PASSWORD: ${{ secrets.KAYOBE_VAULT_PASSWORD }} +jobs: + overcloud-ofed-packages: + name: Build OFED packages + if: github.repository == 'stackhpc/stackhpc-kayobe-config' + runs-on: arc-skc-host-image-builder-runner + permissions: {} + steps: + - name: Install Package + uses: ConorMacBride/install-package@main + with: + apt: git unzip nodejs python3-pip python3-venv openssh-server openssh-client jq + + - name: Start the SSH service + run: | + sudo /etc/init.d/ssh start + + - name: Checkout + uses: actions/checkout@v4 + with: + path: src/kayobe-config + + - name: Determine OpenStack release + id: openstack_release + run: | + BRANCH=$(awk -F'=' '/defaultbranch/ {print $2}' src/kayobe-config/.gitreview) + echo "openstack_release=${BRANCH}" | sed 
-E "s,(stable|unmaintained)/,," >> $GITHUB_OUTPUT + + - name: Clone StackHPC Kayobe repository + uses: actions/checkout@v4 + with: + repository: stackhpc/kayobe + ref: refs/heads/stackhpc/${{ steps.openstack_release.outputs.openstack_release }} + path: src/kayobe + + - name: Install Kayobe + run: | + mkdir -p venvs && + pushd venvs && + python3 -m venv kayobe && + source kayobe/bin/activate && + pip install -U pip && + pip install ../src/kayobe + + - name: Install terraform + uses: hashicorp/setup-terraform@v2 + + - name: Initialise terraform + run: terraform init + working-directory: ${{ github.workspace }}/src/kayobe-config/terraform/aio + + - name: Generate SSH keypair + run: ssh-keygen -f id_rsa -N '' + working-directory: ${{ github.workspace }}/src/kayobe-config/terraform/aio + + - name: Generate clouds.yaml + run: | + cat << EOF > clouds.yaml + ${{ secrets.CLOUDS_YAML }} + EOF + working-directory: ${{ github.workspace }}/src/kayobe-config/terraform/aio + + - name: Generate terraform.tfvars + run: | + cat << EOF > terraform.tfvars + ssh_public_key = "id_rsa.pub" + ssh_username = "rocky" + aio_vm_name = "skc-host-image-builder" + aio_vm_image = "Rocky-9-GenericCloud-Base-9.3-20231113.0.x86_64.qcow2" + aio_vm_flavor = "en1.medium" + aio_vm_network = "stackhpc-ci" + aio_vm_subnet = "stackhpc-ci" + aio_vm_interface = "eth0" + EOF + working-directory: ${{ github.workspace }}/src/kayobe-config/terraform/aio + + - name: Terraform Plan + run: terraform plan + working-directory: ${{ github.workspace }}/src/kayobe-config/terraform/aio + env: + OS_CLOUD: "openstack" + OS_APPLICATION_CREDENTIAL_ID: ${{ secrets.OS_APPLICATION_CREDENTIAL_ID }} + OS_APPLICATION_CREDENTIAL_SECRET: ${{ secrets.OS_APPLICATION_CREDENTIAL_SECRET }} + + - name: Terraform Apply + run: | + for attempt in $(seq 5); do + if terraform apply -auto-approve; then + echo "Created infrastructure on attempt $attempt" + exit 0 + fi + echo "Failed to create infrastructure on attempt $attempt" + sleep 10 + 
terraform destroy -auto-approve + sleep 60 + done + echo "Failed to create infrastructure after $attempt attempts" + exit 1 + working-directory: ${{ github.workspace }}/src/kayobe-config/terraform/aio + env: + OS_CLOUD: "openstack" + OS_APPLICATION_CREDENTIAL_ID: ${{ secrets.OS_APPLICATION_CREDENTIAL_ID }} + OS_APPLICATION_CREDENTIAL_SECRET: ${{ secrets.OS_APPLICATION_CREDENTIAL_SECRET }} + + - name: Get Terraform outputs + id: tf_outputs + run: | + terraform output -json + working-directory: ${{ github.workspace }}/src/kayobe-config/terraform/aio + + - name: Write Terraform outputs + run: | + cat << EOF > src/kayobe-config/etc/kayobe/environments/ci-builder/tf-outputs.yml + ${{ steps.tf_outputs.outputs.stdout }} + EOF + + - name: Write Terraform network config + run: | + cat << EOF > src/kayobe-config/etc/kayobe/environments/ci-builder/tf-network-allocation.yml + --- + aio_ips: + builder: "{{ access_ip_v4.value }}" + EOF + + - name: Write Terraform network interface config + run: | + mkdir -p src/kayobe-config/etc/kayobe/environments/$KAYOBE_ENVIRONMENT/inventory/group_vars/seed + rm -f src/kayobe-config/etc/kayobe/environments/$KAYOBE_ENVIRONMENT/inventory/group_vars/seed/network-interfaces + cat << EOF > src/kayobe-config/etc/kayobe/environments/$KAYOBE_ENVIRONMENT/inventory/group_vars/seed/network-interfaces + admin_interface: "{{ access_interface.value }}" + aio_interface: "{{ access_interface.value }}" + EOF + + - name: Manage SSH keys + run: | + mkdir -p ~/.ssh + touch ~/.ssh/authorized_keys + cat src/kayobe-config/terraform/aio/id_rsa.pub >> ~/.ssh/authorized_keys + cp src/kayobe-config/terraform/aio/id_rsa* ~/.ssh/ + + - name: Bootstrap the control host + run: | + source venvs/kayobe/bin/activate && + source src/kayobe-config/kayobe-env --environment ci-builder && + kayobe control host bootstrap + + - name: Configure the seed host (Builder VM) + run: | + source venvs/kayobe/bin/activate && + source src/kayobe-config/kayobe-env --environment ci-builder && + 
kayobe seed host configure -e seed_bootstrap_user=rocky --skip-tags network + + - name: Run growroot playbook + run: | + source venvs/kayobe/bin/activate && + source src/kayobe-config/kayobe-env --environment ci-builder && + kayobe playbook run src/kayobe-config/etc/kayobe/ansible/growroot.yml + env: + KAYOBE_VAULT_PASSWORD: ${{ secrets.KAYOBE_VAULT_PASSWORD } + + - name: Run OFED builder playbook + run: | + source venvs/kayobe/bin/activate && + source src/kayobe-config/kayobe-env --environment ci-builder && + kayobe playbook run src/kayobe-config/etc/kayobe/ansible/build-ofed.yml + env: + KAYOBE_VAULT_PASSWORD: ${{ secrets.KAYOBE_VAULT_PASSWORD } + + - name: Run OFED publish playbook + run: | + source venvs/kayobe/bin/activate && + source src/kayobe-config/kayobe-env --environment ci-builder && + kayobe playbook run src/kayobe-config/etc/kayobe/ansible/push-ofed.yml + env: + KAYOBE_VAULT_PASSWORD: ${{ secrets.KAYOBE_VAULT_PASSWORD } + + - name: Destroy + run: terraform destroy -auto-approve + working-directory: ${{ github.workspace }}/src/kayobe-config/terraform/aio + env: + OS_CLOUD: openstack + OS_APPLICATION_CREDENTIAL_ID: ${{ secrets.OS_APPLICATION_CREDENTIAL_ID }} + OS_APPLICATION_CREDENTIAL_SECRET: ${{ secrets.OS_APPLICATION_CREDENTIAL_SECRET }} + if: always() diff --git a/etc/kayobe/ansible/build-ofed.yml b/etc/kayobe/ansible/build-ofed.yml new file mode 100644 index 000000000..ca2933161 --- /dev/null +++ b/etc/kayobe/ansible/build-ofed.yml @@ -0,0 +1,75 @@ +--- +- name: Build OFED packages + become: true + hosts: ofed-builder + gather_facts: false + vars: + stackhpc_mlnx_ofed_file_string: MLNX_OFED_LINUX-{{ stackhpc_pulp_mlnx_ofed_version }}-rhel9.{{ stackhpc_pulp_repo_rocky_9_minor_version }}-x86_64 + tasks: + - name: Extend the home logical volume + community.general.lvol: + vg: rootvg + lv: lv_home + size: +5G + + - name: Install package dependencies + ansible.builtin.dnf: + name: + - kpartx + - perl + - rpm-build + - automake + - patch + - kernel + - 
kernel-devel + - autoconf + - pciutils + - kernel-rpm-macros + - lsof + - libtool + - tk + - gcc-gfortran + - tcl + - createrepo + state: latest + update_cache: true + + - name: Update the default kernel entry + ansible.builtin.shell: + cmd: | + grubby --set-default /boot/$(rpm -qa kernel-devel | sed 's/kernel-devel/vmlinuz/g') + echo 'GRUB_DEFAULT=2' >> /etc/default/grub + grub2-mkconfig -o /boot/grub2/grub.cfg + + - name: Reboot builder to apply kernel update + ansible.builtin.reboot: + reboot_timeout: 600 + + - name: Create build directory + ansible.builtin.file: + path: /home/cloud-user/ofed + state: directory + mode: 0777 + + - name: Download MellanoxOFED archive + ansible.builtin.get_url: + url: https://content.mellanox.com/ofed/MLNX_OFED-{{ stackhpc_pulp_mlnx_ofed_version }}/{{ stackhpc_mlnx_ofed_file_string }}.tgz + dest: /home/cloud-user/ofed/ofed-archive.tgz + + - name: Extract MellanoxOFED archive + ansible.builtin.unarchive: + src: /home/cloud-user/ofed/ofed-archive.tgz + dest: /home/cloud-user/ofed + + - name: Ensure the current kernel is supported + ansible.builtin.shell: + cmd: | + /home/cloud-user/ofed/{{ stackhpc_mlnx_ofed_file_string }}/mlnx_add_kernel_support.sh \ + --mlnx_ofed /home/cloud-user/ofed/{{ stackhpc_mlnx_ofed_file_string }} \ + --tmpdir /home/cloud-user/ofed/ofed-build \ + --make-tgz -y \ + + - name: Extract the new archive + ansible.builtin.unarchive: + src: /home/cloud-user/ofed/ofed-build/{{ stackhpc_mlnx_ofed_file_string }}-ext.tgz + dest: /home/cloud-user/ofed/ diff --git a/etc/kayobe/ansible/push-ofed.yml b/etc/kayobe/ansible/push-ofed.yml new file mode 100644 index 000000000..3dbc31055 --- /dev/null +++ b/etc/kayobe/ansible/push-ofed.yml @@ -0,0 +1,51 @@ +--- +- name: Push OFED packages + hosts: ofed-builder + vars: + stackhpc_mlnx_ofed_file_string: MLNX_OFED_LINUX-{{ stackhpc_pulp_mlnx_ofed_version }}-rhel9.{{ stackhpc_pulp_repo_rocky_9_minor_version }}-x86_64 + tasks: + - name: Install python dependencies + 
ansible.builtin.pip: + name: pulp-cli + + - name: Create Pulp repository for OFED + pulp.squeezer.rpm_repository: + pulp_url: "{{ stackhpc_release_pulp_url }}" + username: "{{ stackhpc_release_pulp_username }}" + password: "{{ stackhpc_release_pulp_password }}" + name: "{{ stackhpc_pulp_repo_mlnx_ofed_rhel9.name }}" + state: present + retries: "{{ pulp_timeout_retries | default(3) }}" + + - name: Upload OFED RPMs to Pulp + ansible.builtin.shell: + cmd: | + pulp \ + --base-url '{{ stackhpc_release_pulp_url }}' \ + --username '{{ stackhpc_release_pulp_username }}' \ + --password '{{ stackhpc_release_pulp_password }}' \ + rpm content \ + --type package upload \ + --repository '{{ stackhpc_pulp_repo_mlnx_ofed_rhel9.name }}' \ + --file {{ item }} \ + with_fileglob: "/home/cloud-user/ofed/{{ stackhpc_mlnx_ofed_file_string }}-ext/RPMS/*.rpm" + no_log: true + + - name: Create Pulp publication for OFED + pulp.squeezer.rpm_publication: + pulp_url: "{{ stackhpc_release_pulp_url }}" + username: "{{ stackhpc_release_pulp_username }}" + password: "{{ stackhpc_release_pulp_password }}" + repository: "{{ stackhpc_pulp_repo_mlnx_ofed_rhel9.name }}" + state: present + register: publication + + - name: Create Pulp distribution for OFED + pulp.squeezer.rpm_distribution: + pulp_url: "{{ stackhpc_release_pulp_url }}" + username: "{{ stackhpc_release_pulp_username }}" + password: "{{ stackhpc_release_pulp_password }}" + name: "{{ stackhpc_pulp_repo_mlnx_ofed_rhel9.distribution_name }}" + publication: "{{ publication.publication.pulp_href }}" + base_path: "{{ stackhpc_pulp_repo_mlnx_ofed_rhel9.base_path }}" + state: present diff --git a/etc/kayobe/environments/ci-builder/inventory/hosts b/etc/kayobe/environments/ci-builder/inventory/hosts index 33fda8b73..759e41184 100644 --- a/etc/kayobe/environments/ci-builder/inventory/hosts +++ b/etc/kayobe/environments/ci-builder/inventory/hosts @@ -1,3 +1,7 @@ # A 'seed' host used for building images. 
+ +[ofed-builder:children] +seed + [seed] builder diff --git a/etc/kayobe/pulp.yml b/etc/kayobe/pulp.yml index f05a36a8f..9f6a26d9d 100644 --- a/etc/kayobe/pulp.yml +++ b/etc/kayobe/pulp.yml @@ -425,6 +425,19 @@ stackhpc_pulp_distribution_rpm_production: >- {%- endfor -%} {{ prod_dists }} +############################################################################### +# Mellanox OFED + +# Mellanox OFED version +stackhpc_pulp_mlnx_ofed_version: 24.04-0.7.0.0 + +# Mellanox OFED repositories +stackhpc_pulp_repo_mlnx_ofed_rhel9: + name: Mellanox Technologies mlnx_ofed {{ stackhpc_pulp_mlnx_ofed_version }} Rocky 9.{{ stackhpc_pulp_repo_rocky_9_minor_version }} + url: "{{ stackhpc_release_pulp_content_url }}/mlnx_ofed/{{ stackhpc_pulp_mlnx_ofed_version }}/rhel9.{{ stackhpc_pulp_repo_rocky_9_minor_version }}/x86_64" + distribution_name: "mlnx-ofed-{{ stackhpc_pulp_mlnx_ofed_version }}-rocky-9-{{ stackhpc_pulp_repo_rocky_9_minor_version }}-" + base_path: "mlnx_ofed/{{ stackhpc_pulp_mlnx_ofed_version }}/rhel9.{{ stackhpc_pulp_repo_rocky_9_minor_version }}/x86_64/" + ############################################################################### # Containers From 7d5337eed37785ddb8bfc2c25fac854047fbb6c0 Mon Sep 17 00:00:00 2001 From: Jake Hutchinson Date: Tue, 17 Sep 2024 11:23:58 +0100 Subject: [PATCH 02/28] Support DOCA OFED --- etc/kayobe/ansible/build-ofed.yml | 47 +++++++++++++++++-------------- etc/kayobe/ansible/push-ofed.yml | 2 +- etc/kayobe/pulp.yml | 10 +++---- 3 files changed, 32 insertions(+), 27 deletions(-) diff --git a/etc/kayobe/ansible/build-ofed.yml b/etc/kayobe/ansible/build-ofed.yml index ca2933161..5b864d15f 100644 --- a/etc/kayobe/ansible/build-ofed.yml +++ b/etc/kayobe/ansible/build-ofed.yml @@ -3,8 +3,6 @@ become: true hosts: ofed-builder gather_facts: false - vars: - stackhpc_mlnx_ofed_file_string: MLNX_OFED_LINUX-{{ stackhpc_pulp_mlnx_ofed_version }}-rhel9.{{ stackhpc_pulp_repo_rocky_9_minor_version }}-x86_64 tasks: - name: Extend the home logical 
volume community.general.lvol: @@ -12,6 +10,12 @@ lv: lv_home size: +5G + - name: Extend the temporary logical volume + community.general.lvol: + vg: rootvg + lv: lv_var_tmp + size: +5G + - name: Install package dependencies ansible.builtin.dnf: name: @@ -38,38 +42,39 @@ ansible.builtin.shell: cmd: | grubby --set-default /boot/$(rpm -qa kernel-devel | sed 's/kernel-devel/vmlinuz/g') - echo 'GRUB_DEFAULT=2' >> /etc/default/grub grub2-mkconfig -o /boot/grub2/grub.cfg + - name: Disable noexec in temporary file system + ansible.builtin.replace: + path: /etc/fstab + regexp: '^(exclude=.*)noexec,\*\s*' + replace: '\1' + - name: Reboot builder to apply kernel update ansible.builtin.reboot: reboot_timeout: 600 + - name: Add DOCA host repository + ansible.builtin.dnf: + name: https://developer.nvidia.com/downloads/networking/secure/doca-sdk/DOCA_2.8/doca-host-2.8.0-204000_{{ stackhpc_pulp_doca_ofed_version }}_rhel9{{ stackhpc_pulp_repo_rocky_9_minor_version }}.x86_64.rpm + + - name: Install DOCA extra packages + ansible.builtin.dnf: + name: doca-extra + - name: Create build directory ansible.builtin.file: path: /home/cloud-user/ofed state: directory mode: 0777 - - name: Download MellanoxOFED archive - ansible.builtin.get_url: - url: https://content.mellanox.com/ofed/MLNX_OFED-{{ stackhpc_pulp_mlnx_ofed_version }}/{{ stackhpc_mlnx_ofed_file_string }}.tgz - dest: /home/cloud-user/ofed/ofed-archive.tgz - - - name: Extract MellanoxOFED archive - ansible.builtin.unarchive: - src: /home/cloud-user/ofed/ofed-archive.tgz - dest: /home/cloud-user/ofed + - name: Set build directory + ansible.builtin.lineinfile: + path: /opt/mellanox/doca/tools/doca-kernel-support + search_string: 'TMP_DIR=$1' + line: ' TMP_DIR=/home/cloud-user/ofed' - name: Ensure the current kernel is supported ansible.builtin.shell: cmd: | - /home/cloud-user/ofed/{{ stackhpc_mlnx_ofed_file_string }}/mlnx_add_kernel_support.sh \ - --mlnx_ofed /home/cloud-user/ofed/{{ stackhpc_mlnx_ofed_file_string }} \ - --tmpdir 
/home/cloud-user/ofed/ofed-build \ - --make-tgz -y \ - - - name: Extract the new archive - ansible.builtin.unarchive: - src: /home/cloud-user/ofed/ofed-build/{{ stackhpc_mlnx_ofed_file_string }}-ext.tgz - dest: /home/cloud-user/ofed/ + /opt/mellanox/doca/tools/doca-kernel-support diff --git a/etc/kayobe/ansible/push-ofed.yml b/etc/kayobe/ansible/push-ofed.yml index 3dbc31055..c590fc3db 100644 --- a/etc/kayobe/ansible/push-ofed.yml +++ b/etc/kayobe/ansible/push-ofed.yml @@ -2,7 +2,7 @@ - name: Push OFED packages hosts: ofed-builder vars: - stackhpc_mlnx_ofed_file_string: MLNX_OFED_LINUX-{{ stackhpc_pulp_mlnx_ofed_version }}-rhel9.{{ stackhpc_pulp_repo_rocky_9_minor_version }}-x86_64 + stackhpc_mlnx_ofed_file_string: MLNX_OFED_LINUX-{{ stackhpc_pulp_doca_ofed_version }}-rhel9.{{ stackhpc_pulp_repo_rocky_9_minor_version }}-x86_64 tasks: - name: Install python dependencies ansible.builtin.pip: diff --git a/etc/kayobe/pulp.yml b/etc/kayobe/pulp.yml index 9f6a26d9d..9fb0534d7 100644 --- a/etc/kayobe/pulp.yml +++ b/etc/kayobe/pulp.yml @@ -429,14 +429,14 @@ stackhpc_pulp_distribution_rpm_production: >- # Mellanox OFED # Mellanox OFED version -stackhpc_pulp_mlnx_ofed_version: 24.04-0.7.0.0 +stackhpc_pulp_doca_ofed_version: 24.07 # Mellanox OFED repositories stackhpc_pulp_repo_mlnx_ofed_rhel9: - name: Mellanox Technologies mlnx_ofed {{ stackhpc_pulp_mlnx_ofed_version }} Rocky 9.{{ stackhpc_pulp_repo_rocky_9_minor_version }} - url: "{{ stackhpc_release_pulp_content_url }}/mlnx_ofed/{{ stackhpc_pulp_mlnx_ofed_version }}/rhel9.{{ stackhpc_pulp_repo_rocky_9_minor_version }}/x86_64" - distribution_name: "mlnx-ofed-{{ stackhpc_pulp_mlnx_ofed_version }}-rocky-9-{{ stackhpc_pulp_repo_rocky_9_minor_version }}-" - base_path: "mlnx_ofed/{{ stackhpc_pulp_mlnx_ofed_version }}/rhel9.{{ stackhpc_pulp_repo_rocky_9_minor_version }}/x86_64/" + name: Mellanox Technologies mlnx_ofed {{ stackhpc_pulp_doca_ofed_version }} Rocky 9.{{ stackhpc_pulp_repo_rocky_9_minor_version }} + url: "{{ 
stackhpc_release_pulp_content_url }}/doca_ofed/{{ stackhpc_pulp_doca_ofed_version }}/rhel9.{{ stackhpc_pulp_repo_rocky_9_minor_version }}/x86_64" + distribution_name: "mlnx-ofed-{{ stackhpc_pulp_doca_ofed_version }}-rocky-9-{{ stackhpc_pulp_repo_rocky_9_minor_version }}-" + base_path: "mlnx_ofed/{{ stackhpc_pulp_doca_ofed_version }}/rhel9.{{ stackhpc_pulp_repo_rocky_9_minor_version }}/x86_64/" ############################################################################### # Containers From ffcf2a0c29aa1c7de6901fdc95b486fcf762276c Mon Sep 17 00:00:00 2001 From: Jake Hutchinson Date: Tue, 17 Sep 2024 12:32:20 +0100 Subject: [PATCH 03/28] Push kernel modules --- etc/kayobe/ansible/build-ofed.yml | 2 ++ etc/kayobe/ansible/push-ofed.yml | 4 +--- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/etc/kayobe/ansible/build-ofed.yml b/etc/kayobe/ansible/build-ofed.yml index 5b864d15f..3fe4d8502 100644 --- a/etc/kayobe/ansible/build-ofed.yml +++ b/etc/kayobe/ansible/build-ofed.yml @@ -9,12 +9,14 @@ vg: rootvg lv: lv_home size: +5G + resize2fs: true - name: Extend the temporary logical volume community.general.lvol: vg: rootvg lv: lv_var_tmp size: +5G + resize2fs: true - name: Install package dependencies ansible.builtin.dnf: diff --git a/etc/kayobe/ansible/push-ofed.yml b/etc/kayobe/ansible/push-ofed.yml index c590fc3db..358d0c4ce 100644 --- a/etc/kayobe/ansible/push-ofed.yml +++ b/etc/kayobe/ansible/push-ofed.yml @@ -1,8 +1,6 @@ --- - name: Push OFED packages hosts: ofed-builder - vars: - stackhpc_mlnx_ofed_file_string: MLNX_OFED_LINUX-{{ stackhpc_pulp_doca_ofed_version }}-rhel9.{{ stackhpc_pulp_repo_rocky_9_minor_version }}-x86_64 tasks: - name: Install python dependencies ansible.builtin.pip: @@ -28,7 +26,7 @@ --type package upload \ --repository '{{ stackhpc_pulp_repo_mlnx_ofed_rhel9.name }}' \ --file {{ item }} \ - with_fileglob: "/home/cloud-user/ofed/{{ stackhpc_mlnx_ofed_file_string }}-ext/RPMS/*.rpm" + with_fileglob: "/home/cloud-user/ofed/*.rpm" no_log: 
true - name: Create Pulp publication for OFED From 764a1c2fd5bbc06ef6736f273b85321dc03c782f Mon Sep 17 00:00:00 2001 From: Jake Hutchinson Date: Tue, 17 Sep 2024 14:18:32 +0100 Subject: [PATCH 04/28] Push OFED userspace packages --- etc/kayobe/ansible/build-ofed.yml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/etc/kayobe/ansible/build-ofed.yml b/etc/kayobe/ansible/build-ofed.yml index 3fe4d8502..912d98f41 100644 --- a/etc/kayobe/ansible/build-ofed.yml +++ b/etc/kayobe/ansible/build-ofed.yml @@ -37,6 +37,8 @@ - gcc-gfortran - tcl - createrepo + - cmake-filesystem + - libnl3-devel state: latest update_cache: true @@ -76,7 +78,13 @@ search_string: 'TMP_DIR=$1' line: ' TMP_DIR=/home/cloud-user/ofed' - - name: Ensure the current kernel is supported + - name: Build OFED kernel modules ansible.builtin.shell: cmd: | /opt/mellanox/doca/tools/doca-kernel-support + + - name: Download OFED userspace packages + ansible.builtin.dnf: + name: doca-ofed-userspace + download_only: true + download_dir: /home/cloud-user/ofed From 78aed9961ba20674b542cf6d7346adbff1c93fa4 Mon Sep 17 00:00:00 2001 From: Jake Hutchinson Date: Tue, 17 Sep 2024 14:40:14 +0100 Subject: [PATCH 05/28] Fix build --- etc/kayobe/ansible/build-ofed.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/etc/kayobe/ansible/build-ofed.yml b/etc/kayobe/ansible/build-ofed.yml index 912d98f41..aa8c89925 100644 --- a/etc/kayobe/ansible/build-ofed.yml +++ b/etc/kayobe/ansible/build-ofed.yml @@ -8,15 +8,15 @@ community.general.lvol: vg: rootvg lv: lv_home - size: +5G - resize2fs: true + size: +2G + resizefs: true - name: Extend the temporary logical volume community.general.lvol: vg: rootvg lv: lv_var_tmp - size: +5G - resize2fs: true + size: +4G + resizefs: true - name: Install package dependencies ansible.builtin.dnf: From 95c4e9f4a6b8cdae23bba5d3b7d54e901e33eb51 Mon Sep 17 00:00:00 2001 From: Jake Hutchinson Date: Tue, 17 Sep 2024 16:11:54 +0100 Subject: [PATCH 
06/28] Fix kernel upgrade --- etc/kayobe/ansible/build-ofed.yml | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/etc/kayobe/ansible/build-ofed.yml b/etc/kayobe/ansible/build-ofed.yml index aa8c89925..49a9300df 100644 --- a/etc/kayobe/ansible/build-ofed.yml +++ b/etc/kayobe/ansible/build-ofed.yml @@ -39,9 +39,21 @@ - createrepo - cmake-filesystem - libnl3-devel + - python3-devel state: latest update_cache: true + - name: Set GRUB default index to saved + ansible.builtin.lineinfile: + path: /etc/default/grub + line: "GRUB_DEFAULT=saved" + + - name: Set update default index in kernel sysconfig + ansible.builtin.lineinfile: + path: /etc/sysconfig/kernel + line: "UPDATEDEFAULT=yes" + create: true + - name: Update the default kernel entry ansible.builtin.shell: cmd: | @@ -51,8 +63,7 @@ - name: Disable noexec in temporary file system ansible.builtin.replace: path: /etc/fstab - regexp: '^(exclude=.*)noexec,\*\s*' - replace: '\1' + regexp: 'noexec,' - name: Reboot builder to apply kernel update ansible.builtin.reboot: From 2b30dff450d5fc2c29ba44d2144f746eabd22c17 Mon Sep 17 00:00:00 2001 From: Jake Hutchinson Date: Tue, 17 Sep 2024 16:17:26 +0100 Subject: [PATCH 07/28] Replace MLNX with DOCA --- etc/kayobe/ansible/push-ofed.yml | 10 +++++----- etc/kayobe/pulp.yml | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/etc/kayobe/ansible/push-ofed.yml b/etc/kayobe/ansible/push-ofed.yml index 358d0c4ce..59f16cdbf 100644 --- a/etc/kayobe/ansible/push-ofed.yml +++ b/etc/kayobe/ansible/push-ofed.yml @@ -11,7 +11,7 @@ pulp_url: "{{ stackhpc_release_pulp_url }}" username: "{{ stackhpc_release_pulp_username }}" password: "{{ stackhpc_release_pulp_password }}" - name: "{{ stackhpc_pulp_repo_mlnx_ofed_rhel9.name }}" + name: "{{ stackhpc_pulp_repo_doca_ofed_rhel9.name }}" state: present retries: "{{ pulp_timeout_retries | default(3) }}" @@ -24,7 +24,7 @@ --password '{{ stackhpc_release_pulp_password }}' \ rpm content \ --type 
package upload \ - --repository '{{ stackhpc_pulp_repo_mlnx_ofed_rhel9.name }}' \ + --repository '{{ stackhpc_pulp_repo_doca_ofed_rhel9.name }}' \ --file {{ item }} \ with_fileglob: "/home/cloud-user/ofed/*.rpm" no_log: true @@ -34,7 +34,7 @@ pulp_url: "{{ stackhpc_release_pulp_url }}" username: "{{ stackhpc_release_pulp_username }}" password: "{{ stackhpc_release_pulp_password }}" - repository: "{{ stackhpc_pulp_repo_mlnx_ofed_rhel9.name }}" + repository: "{{ stackhpc_pulp_repo_doca_ofed_rhel9.name }}" state: present register: publication @@ -43,7 +43,7 @@ pulp_url: "{{ stackhpc_release_pulp_url }}" username: "{{ stackhpc_release_pulp_username }}" password: "{{ stackhpc_release_pulp_password }}" - name: "{{ stackhpc_pulp_repo_mlnx_ofed_rhel9.distribution_name }}" + name: "{{ stackhpc_pulp_repo_doca_ofed_rhel9.distribution_name }}" publication: "{{ publication.publication.pulp_href }}" - base_path: "{{ stackhpc_pulp_repo_mlnx_ofed_rhel9.base_path }}" + base_path: "{{ stackhpc_pulp_repo_doca_ofed_rhel9.base_path }}" state: present diff --git a/etc/kayobe/pulp.yml b/etc/kayobe/pulp.yml index 9fb0534d7..8360d1ddd 100644 --- a/etc/kayobe/pulp.yml +++ b/etc/kayobe/pulp.yml @@ -426,14 +426,14 @@ stackhpc_pulp_distribution_rpm_production: >- {{ prod_dists }} ############################################################################### -# Mellanox OFED +# DOCA OFED -# Mellanox OFED version +# DOCA OFED version stackhpc_pulp_doca_ofed_version: 24.07 -# Mellanox OFED repositories -stackhpc_pulp_repo_mlnx_ofed_rhel9: - name: Mellanox Technologies mlnx_ofed {{ stackhpc_pulp_doca_ofed_version }} Rocky 9.{{ stackhpc_pulp_repo_rocky_9_minor_version }} +# DOCA OFED repositories +stackhpc_pulp_repo_doca_ofed_rhel9: + name: Mellanox Technologies doca_ofed {{ stackhpc_pulp_doca_ofed_version }} Rocky 9.{{ stackhpc_pulp_repo_rocky_9_minor_version }} url: "{{ stackhpc_release_pulp_content_url }}/doca_ofed/{{ stackhpc_pulp_doca_ofed_version }}/rhel9.{{ 
stackhpc_pulp_repo_rocky_9_minor_version }}/x86_64" distribution_name: "mlnx-ofed-{{ stackhpc_pulp_doca_ofed_version }}-rocky-9-{{ stackhpc_pulp_repo_rocky_9_minor_version }}-" base_path: "mlnx_ofed/{{ stackhpc_pulp_doca_ofed_version }}/rhel9.{{ stackhpc_pulp_repo_rocky_9_minor_version }}/x86_64/" From 0ddfcac2e204cfdb75f375af4af8ee63d3b1e6a3 Mon Sep 17 00:00:00 2001 From: Jake Hutchinson Date: Tue, 17 Sep 2024 16:36:16 +0100 Subject: [PATCH 08/28] Adjust lv_var_tmp to 2G --- etc/kayobe/ansible/build-ofed.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etc/kayobe/ansible/build-ofed.yml b/etc/kayobe/ansible/build-ofed.yml index 49a9300df..e9412ad47 100644 --- a/etc/kayobe/ansible/build-ofed.yml +++ b/etc/kayobe/ansible/build-ofed.yml @@ -15,7 +15,7 @@ community.general.lvol: vg: rootvg lv: lv_var_tmp - size: +4G + size: +2G resizefs: true - name: Install package dependencies From 395e9cc4bc336f94fec1e0bc6dafd5a4a9cc2648 Mon Sep 17 00:00:00 2001 From: Jake Hutchinson Date: Wed, 18 Sep 2024 10:26:41 +0100 Subject: [PATCH 09/28] Fix workflow --- .github/workflows/package-build-ofed.yml | 37 ++++++++++++++++-------- 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/.github/workflows/package-build-ofed.yml b/.github/workflows/package-build-ofed.yml index 358c18c52..2dee9f654 100644 --- a/.github/workflows/package-build-ofed.yml +++ b/.github/workflows/package-build-ofed.yml @@ -82,19 +82,32 @@ jobs: EOF working-directory: ${{ github.workspace }}/src/kayobe-config/terraform/aio + - name: Output image tag + id: image_tag + run: | + echo image_tag=$(grep stackhpc_${{ inputs.os_distribution }}_$(sed s/-/_/ <(echo "${{ inputs.os_release }}"))_overcloud_host_image_version: etc/kayobe/pulp-host-image-versions.yml | awk '{print $2}') >> $GITHUB_OUTPUT + + # Use the image override if set, otherwise use overcloud-os_distribution-os_release-tag + - name: Output image name + id: image_name + run: | + echo image_name=overcloud-${{ 
inputs.os_distribution }}-${{ inputs.os_release }}-${{ steps.image_tag.outputs.image_tag }} >> $GITHUB_OUTPUT + - name: Generate terraform.tfvars run: | cat << EOF > terraform.tfvars ssh_public_key = "id_rsa.pub" - ssh_username = "rocky" - aio_vm_name = "skc-host-image-builder" - aio_vm_image = "Rocky-9-GenericCloud-Base-9.3-20231113.0.x86_64.qcow2" + ssh_username = "cloud-user" + aio_vm_name = "skc-ofed-builder" + aio_vm_image = "${{ env.VM_IMAGE }}" aio_vm_flavor = "en1.medium" aio_vm_network = "stackhpc-ci" aio_vm_subnet = "stackhpc-ci" - aio_vm_interface = "eth0" + aio_vm_interface = "ens3" EOF working-directory: ${{ github.workspace }}/src/kayobe-config/terraform/aio + env: + VM_IMAGE: ${{ steps.image_name.outputs.image_name }} - name: Terraform Plan run: terraform plan @@ -166,19 +179,19 @@ jobs: source src/kayobe-config/kayobe-env --environment ci-builder && kayobe control host bootstrap - - name: Configure the seed host (Builder VM) + - name: Run growroot playbook run: | source venvs/kayobe/bin/activate && source src/kayobe-config/kayobe-env --environment ci-builder && - kayobe seed host configure -e seed_bootstrap_user=rocky --skip-tags network + kayobe playbook run src/kayobe-config/etc/kayobe/ansible/growroot.yml + env: + KAYOBE_VAULT_PASSWORD: ${{ secrets.KAYOBE_VAULT_PASSWORD }} - - name: Run growroot playbook + - name: Configure the seed host (Builder VM) run: | source venvs/kayobe/bin/activate && source src/kayobe-config/kayobe-env --environment ci-builder && - kayobe playbook run src/kayobe-config/etc/kayobe/ansible/growroot.yml - env: - KAYOBE_VAULT_PASSWORD: ${{ secrets.KAYOBE_VAULT_PASSWORD } + kayobe seed host configure -e seed_bootstrap_user=cloud-user --skip-tags network,docker - name: Run OFED builder playbook run: | @@ -186,7 +199,7 @@ jobs: source src/kayobe-config/kayobe-env --environment ci-builder && kayobe playbook run src/kayobe-config/etc/kayobe/ansible/build-ofed.yml env: - KAYOBE_VAULT_PASSWORD: ${{ secrets.KAYOBE_VAULT_PASSWORD } + 
KAYOBE_VAULT_PASSWORD: ${{ secrets.KAYOBE_VAULT_PASSWORD }} - name: Run OFED publish playbook run: | @@ -194,7 +207,7 @@ jobs: source src/kayobe-config/kayobe-env --environment ci-builder && kayobe playbook run src/kayobe-config/etc/kayobe/ansible/push-ofed.yml env: - KAYOBE_VAULT_PASSWORD: ${{ secrets.KAYOBE_VAULT_PASSWORD } + KAYOBE_VAULT_PASSWORD: ${{ secrets.KAYOBE_VAULT_PASSWORD }} - name: Destroy run: terraform destroy -auto-approve From b9025901b0563a8a3f54d61f3a2be61f54731b1b Mon Sep 17 00:00:00 2001 From: Jake Hutchinson Date: Wed, 18 Sep 2024 13:22:42 +0100 Subject: [PATCH 10/28] Disable gpg check for doca host --- etc/kayobe/ansible/build-ofed.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/etc/kayobe/ansible/build-ofed.yml b/etc/kayobe/ansible/build-ofed.yml index e9412ad47..fd5f63862 100644 --- a/etc/kayobe/ansible/build-ofed.yml +++ b/etc/kayobe/ansible/build-ofed.yml @@ -72,6 +72,7 @@ - name: Add DOCA host repository ansible.builtin.dnf: name: https://developer.nvidia.com/downloads/networking/secure/doca-sdk/DOCA_2.8/doca-host-2.8.0-204000_{{ stackhpc_pulp_doca_ofed_version }}_rhel9{{ stackhpc_pulp_repo_rocky_9_minor_version }}.x86_64.rpm + disable_gpg_check: true - name: Install DOCA extra packages ansible.builtin.dnf: From 35d8345b1cb236525bfdef92aaa3711c0e60d914 Mon Sep 17 00:00:00 2001 From: Jake Hutchinson Date: Wed, 18 Sep 2024 13:33:53 +0100 Subject: [PATCH 11/28] Fix inputs in workflow --- .github/workflows/package-build-ofed.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/package-build-ofed.yml b/.github/workflows/package-build-ofed.yml index 2dee9f654..6cb84d444 100644 --- a/.github/workflows/package-build-ofed.yml +++ b/.github/workflows/package-build-ofed.yml @@ -85,13 +85,13 @@ jobs: - name: Output image tag id: image_tag run: | - echo image_tag=$(grep stackhpc_${{ inputs.os_distribution }}_$(sed s/-/_/ <(echo "${{ inputs.os_release }}"))_overcloud_host_image_version: 
etc/kayobe/pulp-host-image-versions.yml | awk '{print $2}') >> $GITHUB_OUTPUT + echo image_tag=$(grep stackhpc_rocky_$(sed s/-/_/ <(echo "9"))_overcloud_host_image_version: etc/kayobe/pulp-host-image-versions.yml | awk '{print $2}') >> $GITHUB_OUTPUT # Use the image override if set, otherwise use overcloud-os_distribution-os_release-tag - name: Output image name id: image_name run: | - echo image_name=overcloud-${{ inputs.os_distribution }}-${{ inputs.os_release }}-${{ steps.image_tag.outputs.image_tag }} >> $GITHUB_OUTPUT + echo image_name=overcloud-rocky-9-${{ steps.image_tag.outputs.image_tag }} >> $GITHUB_OUTPUT - name: Generate terraform.tfvars run: | From a135c1b97a851c028e5d627c2b7a19190ab0cd49 Mon Sep 17 00:00:00 2001 From: Jake Hutchinson Date: Wed, 18 Sep 2024 14:21:03 +0100 Subject: [PATCH 12/28] Replace with_fileglob --- etc/kayobe/ansible/push-ofed.yml | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/etc/kayobe/ansible/push-ofed.yml b/etc/kayobe/ansible/push-ofed.yml index 59f16cdbf..c0214a0b0 100644 --- a/etc/kayobe/ansible/push-ofed.yml +++ b/etc/kayobe/ansible/push-ofed.yml @@ -15,6 +15,11 @@ state: present retries: "{{ pulp_timeout_retries | default(3) }}" + - name: Lookup Pulp RPMs on builder + ansible.builtin.find: + paths: "/home/cloud-user/ofed" + register: rpm_dir + - name: Upload OFED RPMs to Pulp ansible.builtin.shell: cmd: | @@ -25,8 +30,8 @@ rpm content \ --type package upload \ --repository '{{ stackhpc_pulp_repo_doca_ofed_rhel9.name }}' \ - --file {{ item }} \ - with_fileglob: "/home/cloud-user/ofed/*.rpm" + --file {{ item.path }} \ + with_items: "{{ rpm_dir.files }}" no_log: true - name: Create Pulp publication for OFED From 9d628e824e14647bc09638de0ebcb626a37dcd43 Mon Sep 17 00:00:00 2001 From: Jake Hutchinson Date: Wed, 18 Sep 2024 14:27:57 +0100 Subject: [PATCH 13/28] Remove trailing slash in base_path --- etc/kayobe/pulp.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/etc/kayobe/pulp.yml b/etc/kayobe/pulp.yml index 8360d1ddd..75e4a3582 100644 --- a/etc/kayobe/pulp.yml +++ b/etc/kayobe/pulp.yml @@ -436,7 +436,7 @@ stackhpc_pulp_repo_doca_ofed_rhel9: name: Mellanox Technologies doca_ofed {{ stackhpc_pulp_doca_ofed_version }} Rocky 9.{{ stackhpc_pulp_repo_rocky_9_minor_version }} url: "{{ stackhpc_release_pulp_content_url }}/doca_ofed/{{ stackhpc_pulp_doca_ofed_version }}/rhel9.{{ stackhpc_pulp_repo_rocky_9_minor_version }}/x86_64" distribution_name: "mlnx-ofed-{{ stackhpc_pulp_doca_ofed_version }}-rocky-9-{{ stackhpc_pulp_repo_rocky_9_minor_version }}-" - base_path: "mlnx_ofed/{{ stackhpc_pulp_doca_ofed_version }}/rhel9.{{ stackhpc_pulp_repo_rocky_9_minor_version }}/x86_64/" + base_path: "mlnx_ofed/{{ stackhpc_pulp_doca_ofed_version }}/rhel9.{{ stackhpc_pulp_repo_rocky_9_minor_version }}/x86_64" ############################################################################### # Containers From cbb7a1bad9d2f4a019f301b41e6f68e6847bcc5a Mon Sep 17 00:00:00 2001 From: Jake Hutchinson Date: Wed, 18 Sep 2024 14:43:07 +0100 Subject: [PATCH 14/28] Install kernel modules --- etc/kayobe/ansible/build-ofed.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/etc/kayobe/ansible/build-ofed.yml b/etc/kayobe/ansible/build-ofed.yml index fd5f63862..e2f63b8c1 100644 --- a/etc/kayobe/ansible/build-ofed.yml +++ b/etc/kayobe/ansible/build-ofed.yml @@ -30,6 +30,7 @@ - kernel-devel - autoconf - pciutils + - kernel-modules-extra - kernel-rpm-macros - lsof - libtool From 1d14550cfba078e9cc98132d7765c9607c8f83a9 Mon Sep 17 00:00:00 2001 From: Jake Hutchinson Date: Wed, 18 Sep 2024 16:17:04 +0100 Subject: [PATCH 15/28] Re-add the vault password --- .github/workflows/package-build-ofed.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/package-build-ofed.yml b/.github/workflows/package-build-ofed.yml index 6cb84d444..f7f9cb581 100644 --- a/.github/workflows/package-build-ofed.yml +++ b/.github/workflows/package-build-ofed.yml @@ 
-192,6 +192,8 @@ jobs: source venvs/kayobe/bin/activate && source src/kayobe-config/kayobe-env --environment ci-builder && kayobe seed host configure -e seed_bootstrap_user=cloud-user --skip-tags network,docker + env: + KAYOBE_VAULT_PASSWORD: ${{ secrets.KAYOBE_VAULT_PASSWORD }} - name: Run OFED builder playbook run: | From c5e765775b4996a332dbaa59f498f9495f7c4bac Mon Sep 17 00:00:00 2001 From: Jake Hutchinson Date: Wed, 18 Sep 2024 16:26:15 +0100 Subject: [PATCH 16/28] Remove trailing dash from distribution --- etc/kayobe/pulp.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etc/kayobe/pulp.yml b/etc/kayobe/pulp.yml index 75e4a3582..c3120cec3 100644 --- a/etc/kayobe/pulp.yml +++ b/etc/kayobe/pulp.yml @@ -435,7 +435,7 @@ stackhpc_pulp_doca_ofed_version: 24.07 stackhpc_pulp_repo_doca_ofed_rhel9: name: Mellanox Technologies doca_ofed {{ stackhpc_pulp_doca_ofed_version }} Rocky 9.{{ stackhpc_pulp_repo_rocky_9_minor_version }} url: "{{ stackhpc_release_pulp_content_url }}/doca_ofed/{{ stackhpc_pulp_doca_ofed_version }}/rhel9.{{ stackhpc_pulp_repo_rocky_9_minor_version }}/x86_64" - distribution_name: "mlnx-ofed-{{ stackhpc_pulp_doca_ofed_version }}-rocky-9-{{ stackhpc_pulp_repo_rocky_9_minor_version }}-" + distribution_name: "mlnx-ofed-{{ stackhpc_pulp_doca_ofed_version }}-rocky-9-{{ stackhpc_pulp_repo_rocky_9_minor_version }}" base_path: "mlnx_ofed/{{ stackhpc_pulp_doca_ofed_version }}/rhel9.{{ stackhpc_pulp_repo_rocky_9_minor_version }}/x86_64" ############################################################################### From b62f57d1fd8e81efd37cb9da3b39c01301d1cb56 Mon Sep 17 00:00:00 2001 From: Jake Hutchinson Date: Sat, 21 Sep 2024 23:35:30 +0100 Subject: [PATCH 17/28] Remove LVM configuration LVM config is already applied for the seed in the ci-builder environment, and this should be enough for building OFED. 
--- etc/kayobe/ansible/build-ofed.yml | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/etc/kayobe/ansible/build-ofed.yml b/etc/kayobe/ansible/build-ofed.yml index e2f63b8c1..155d89dd1 100644 --- a/etc/kayobe/ansible/build-ofed.yml +++ b/etc/kayobe/ansible/build-ofed.yml @@ -4,20 +4,6 @@ hosts: ofed-builder gather_facts: false tasks: - - name: Extend the home logical volume - community.general.lvol: - vg: rootvg - lv: lv_home - size: +2G - resizefs: true - - - name: Extend the temporary logical volume - community.general.lvol: - vg: rootvg - lv: lv_var_tmp - size: +2G - resizefs: true - - name: Install package dependencies ansible.builtin.dnf: name: From 29bda8566e36edbbe1686bfb1a16715ef8243a8e Mon Sep 17 00:00:00 2001 From: Jake Hutchinson Date: Sun, 22 Sep 2024 00:23:16 +0100 Subject: [PATCH 18/28] Use reset-bls-entries playbook in OFED workflow --- .github/workflows/package-build-ofed.yml | 27 ++++++++++++++++++++++- etc/kayobe/ansible/build-ofed.yml | 28 +----------------------- etc/kayobe/ansible/reset-bls-entries.yml | 2 +- 3 files changed, 28 insertions(+), 29 deletions(-) diff --git a/.github/workflows/package-build-ofed.yml b/.github/workflows/package-build-ofed.yml index f7f9cb581..19f79a172 100644 --- a/.github/workflows/package-build-ofed.yml +++ b/.github/workflows/package-build-ofed.yml @@ -191,7 +191,32 @@ jobs: run: | source venvs/kayobe/bin/activate && source src/kayobe-config/kayobe-env --environment ci-builder && - kayobe seed host configure -e seed_bootstrap_user=cloud-user --skip-tags network,docker + kayobe seed host configure --skip-tags network,docker + env: + KAYOBE_VAULT_PASSWORD: ${{ secrets.KAYOBE_VAULT_PASSWORD }} + + - name: Reset BLS entries on the seed host + run: | + source venvs/kayobe/bin/activate && + source src/kayobe-config/kayobe-env --environment ci-builder && + kayobe playbook run src/kayobe-config/etc/kayobe/ansible/reset-bls-entries.yml \ + -e "reset_bls_host=ofed-builder" + env: + KAYOBE_VAULT_PASSWORD: ${{ 
secrets.KAYOBE_VAULT_PASSWORD }} + + - name: Disable noexec in /var/tmp + run: | + source venvs/kayobe/bin/activate && + source src/kayobe-config/kayobe-env --environment ci-builder && + kayobe seed host command run --become --command "sed -i 's/noexec,//g' /etc/fstab" + env: + KAYOBE_VAULT_PASSWORD: ${{ secrets.KAYOBE_VAULT_PASSWORD }} + + - name: Reboot to apply the kernel update + run: | + source venvs/kayobe/bin/activate && + source src/kayobe-config/kayobe-env --environment ci-builder && + kayobe playbook run src/kayobe-config/etc/kayobe/ansible/reboot.yml env: KAYOBE_VAULT_PASSWORD: ${{ secrets.KAYOBE_VAULT_PASSWORD }} diff --git a/etc/kayobe/ansible/build-ofed.yml b/etc/kayobe/ansible/build-ofed.yml index 155d89dd1..ca155be39 100644 --- a/etc/kayobe/ansible/build-ofed.yml +++ b/etc/kayobe/ansible/build-ofed.yml @@ -30,33 +30,7 @@ state: latest update_cache: true - - name: Set GRUB default index to saved - ansible.builtin.lineinfile: - path: /etc/default/grub - line: "GRUB_DEFAULT=saved" - - - name: Set update default index in kernel sysconfig - ansible.builtin.lineinfile: - path: /etc/sysconfig/kernel - line: "UPDATEDEFAULT=yes" - create: true - - - name: Update the default kernel entry - ansible.builtin.shell: - cmd: | - grubby --set-default /boot/$(rpm -qa kernel-devel | sed 's/kernel-devel/vmlinuz/g') - grub2-mkconfig -o /boot/grub2/grub.cfg - - - name: Disable noexec in temporary file system - ansible.builtin.replace: - path: /etc/fstab - regexp: 'noexec,' - - - name: Reboot builder to apply kernel update - ansible.builtin.reboot: - reboot_timeout: 600 - - - name: Add DOCA host repository + - name: Add DOCA host repository package ansible.builtin.dnf: name: https://developer.nvidia.com/downloads/networking/secure/doca-sdk/DOCA_2.8/doca-host-2.8.0-204000_{{ stackhpc_pulp_doca_ofed_version }}_rhel9{{ stackhpc_pulp_repo_rocky_9_minor_version }}.x86_64.rpm disable_gpg_check: true diff --git a/etc/kayobe/ansible/reset-bls-entries.yml 
b/etc/kayobe/ansible/reset-bls-entries.yml index 59e968cba..68989d1bb 100644 --- a/etc/kayobe/ansible/reset-bls-entries.yml +++ b/etc/kayobe/ansible/reset-bls-entries.yml @@ -5,7 +5,7 @@ # https://opendev.org/openstack/diskimage-builder/src/branch/master/diskimage_builder/elements/rhel/post-install.d/03-reset-bls-entries - name: Reset BLS entries - hosts: overcloud + hosts: "{{ reset_bls_host | default('overcloud') }}" become: true tags: - reset-bls-entries From 5a9126d6bf01fe6f60b1e6447798074984788e6c Mon Sep 17 00:00:00 2001 From: Jake Hutchinson Date: Sun, 22 Sep 2024 00:59:35 +0100 Subject: [PATCH 19/28] Use replace instead of lineinfile --- etc/kayobe/ansible/build-ofed.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/etc/kayobe/ansible/build-ofed.yml b/etc/kayobe/ansible/build-ofed.yml index ca155be39..feb924b50 100644 --- a/etc/kayobe/ansible/build-ofed.yml +++ b/etc/kayobe/ansible/build-ofed.yml @@ -46,10 +46,10 @@ mode: 0777 - name: Set build directory - ansible.builtin.lineinfile: + ansible.builtin.replace: path: /opt/mellanox/doca/tools/doca-kernel-support - search_string: 'TMP_DIR=$1' - line: ' TMP_DIR=/home/cloud-user/ofed' + regexp: 'TMP_DIR=\$1' + replace: 'TMP_DIR=/home/cloud-user/ofed' - name: Build OFED kernel modules ansible.builtin.shell: From 699f24492e6d6eccee79d5adf3a57c2770505773 Mon Sep 17 00:00:00 2001 From: Jake Hutchinson Date: Mon, 23 Sep 2024 17:16:40 +0100 Subject: [PATCH 20/28] Remove sed magic --- .github/workflows/package-build-ofed.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/package-build-ofed.yml b/.github/workflows/package-build-ofed.yml index 19f79a172..10d3ed644 100644 --- a/.github/workflows/package-build-ofed.yml +++ b/.github/workflows/package-build-ofed.yml @@ -85,7 +85,7 @@ jobs: - name: Output image tag id: image_tag run: | - echo image_tag=$(grep stackhpc_rocky_$(sed s/-/_/ <(echo "9"))_overcloud_host_image_version: 
etc/kayobe/pulp-host-image-versions.yml | awk '{print $2}') >> $GITHUB_OUTPUT + echo image_tag=$(grep stackhpc_rocky_9_overcloud_host_image_version: etc/kayobe/pulp-host-image-versions.yml | awk '{print $2}') >> $GITHUB_OUTPUT # Use the image override if set, otherwise use overcloud-os_distribution-os_release-tag - name: Output image name From 022a7ea46656ec2d9edec9397ca6aabaa7298f8c Mon Sep 17 00:00:00 2001 From: Jake Hutchinson Date: Tue, 24 Sep 2024 13:15:49 +0100 Subject: [PATCH 21/28] Move OFED repositories to ofed.yml --- etc/kayobe/ofed.yml | 12 ++++++++++++ etc/kayobe/pulp.yml | 13 ------------- 2 files changed, 12 insertions(+), 13 deletions(-) create mode 100644 etc/kayobe/ofed.yml diff --git a/etc/kayobe/ofed.yml b/etc/kayobe/ofed.yml new file mode 100644 index 000000000..6b4849019 --- /dev/null +++ b/etc/kayobe/ofed.yml @@ -0,0 +1,12 @@ +--- +# DOCA OFED configuration + +# DOCA OFED version +stackhpc_pulp_doca_ofed_version: 24.07 + +# DOCA OFED repositories +stackhpc_pulp_repo_doca_ofed_rhel9: + name: Mellanox Technologies doca_ofed {{ stackhpc_pulp_doca_ofed_version }} Rocky 9.{{ stackhpc_pulp_repo_rocky_9_minor_version }} + url: "{{ stackhpc_release_pulp_content_url }}/doca_ofed/{{ stackhpc_pulp_doca_ofed_version }}/rhel9.{{ stackhpc_pulp_repo_rocky_9_minor_version }}/x86_64" + distribution_name: "mlnx-ofed-{{ stackhpc_pulp_doca_ofed_version }}-rocky-9-{{ stackhpc_pulp_repo_rocky_9_minor_version }}" + base_path: "mlnx_ofed/{{ stackhpc_pulp_doca_ofed_version }}/rhel9.{{ stackhpc_pulp_repo_rocky_9_minor_version }}/x86_64" \ No newline at end of file diff --git a/etc/kayobe/pulp.yml b/etc/kayobe/pulp.yml index c3120cec3..f05a36a8f 100644 --- a/etc/kayobe/pulp.yml +++ b/etc/kayobe/pulp.yml @@ -425,19 +425,6 @@ stackhpc_pulp_distribution_rpm_production: >- {%- endfor -%} {{ prod_dists }} -############################################################################### -# DOCA OFED - -# DOCA OFED version -stackhpc_pulp_doca_ofed_version: 24.07 - -# DOCA OFED 
repositories -stackhpc_pulp_repo_doca_ofed_rhel9: - name: Mellanox Technologies doca_ofed {{ stackhpc_pulp_doca_ofed_version }} Rocky 9.{{ stackhpc_pulp_repo_rocky_9_minor_version }} - url: "{{ stackhpc_release_pulp_content_url }}/doca_ofed/{{ stackhpc_pulp_doca_ofed_version }}/rhel9.{{ stackhpc_pulp_repo_rocky_9_minor_version }}/x86_64" - distribution_name: "mlnx-ofed-{{ stackhpc_pulp_doca_ofed_version }}-rocky-9-{{ stackhpc_pulp_repo_rocky_9_minor_version }}" - base_path: "mlnx_ofed/{{ stackhpc_pulp_doca_ofed_version }}/rhel9.{{ stackhpc_pulp_repo_rocky_9_minor_version }}/x86_64" - ############################################################################### # Containers From 1840b058fca03dc7ff27497cf95e66b255613c5d Mon Sep 17 00:00:00 2001 From: Jake Hutchinson Date: Tue, 24 Sep 2024 13:18:25 +0100 Subject: [PATCH 22/28] Rename build-ofed to build-ofed-rocky --- .github/workflows/package-build-ofed.yml | 4 ++-- etc/kayobe/ansible/{build-ofed.yml => build-ofed-rocky.yml} | 0 2 files changed, 2 insertions(+), 2 deletions(-) rename etc/kayobe/ansible/{build-ofed.yml => build-ofed-rocky.yml} (100%) diff --git a/.github/workflows/package-build-ofed.yml b/.github/workflows/package-build-ofed.yml index 10d3ed644..97104b3b6 100644 --- a/.github/workflows/package-build-ofed.yml +++ b/.github/workflows/package-build-ofed.yml @@ -224,11 +224,11 @@ jobs: run: | source venvs/kayobe/bin/activate && source src/kayobe-config/kayobe-env --environment ci-builder && - kayobe playbook run src/kayobe-config/etc/kayobe/ansible/build-ofed.yml + kayobe playbook run src/kayobe-config/etc/kayobe/ansible/build-ofed-rl9.yml env: KAYOBE_VAULT_PASSWORD: ${{ secrets.KAYOBE_VAULT_PASSWORD }} - - name: Run OFED publish playbook + - name: Run OFED upload playbook run: | source venvs/kayobe/bin/activate && source src/kayobe-config/kayobe-env --environment ci-builder && diff --git a/etc/kayobe/ansible/build-ofed.yml b/etc/kayobe/ansible/build-ofed-rocky.yml similarity index 100% rename from 
etc/kayobe/ansible/build-ofed.yml rename to etc/kayobe/ansible/build-ofed-rocky.yml From c61deb2decc01ede1835205930a16dfdd10bb5cf Mon Sep 17 00:00:00 2001 From: Jake Hutchinson Date: Tue, 24 Sep 2024 13:38:12 +0100 Subject: [PATCH 23/28] Add precheck for noexec --- etc/kayobe/ansible/build-ofed-rocky.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/etc/kayobe/ansible/build-ofed-rocky.yml b/etc/kayobe/ansible/build-ofed-rocky.yml index feb924b50..4c5b74bba 100644 --- a/etc/kayobe/ansible/build-ofed-rocky.yml +++ b/etc/kayobe/ansible/build-ofed-rocky.yml @@ -4,6 +4,16 @@ hosts: ofed-builder gather_facts: false tasks: + - name: Check whether noexec is enabled for /var/tmp + ansible.builtin.lineinfile: + path: "/etc/fstab" + regexp: "noexec" + state: absent + changed_when: false + check_mode: true + register: result + failed_when: result.found + - name: Install package dependencies ansible.builtin.dnf: name: From d989478521c25c49eb958dc036d9a8c4a237d78a Mon Sep 17 00:00:00 2001 From: Jake Hutchinson Date: Tue, 24 Sep 2024 13:39:02 +0100 Subject: [PATCH 24/28] Update workflow --- .github/workflows/package-build-ofed.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/package-build-ofed.yml b/.github/workflows/package-build-ofed.yml index 97104b3b6..6fd338aff 100644 --- a/.github/workflows/package-build-ofed.yml +++ b/.github/workflows/package-build-ofed.yml @@ -224,7 +224,7 @@ jobs: run: | source venvs/kayobe/bin/activate && source src/kayobe-config/kayobe-env --environment ci-builder && - kayobe playbook run src/kayobe-config/etc/kayobe/ansible/build-ofed-rl9.yml + kayobe playbook run src/kayobe-config/etc/kayobe/ansible/build-ofed-rocky.yml env: KAYOBE_VAULT_PASSWORD: ${{ secrets.KAYOBE_VAULT_PASSWORD }} From 2847ad9bc484ecfdb3cc6da99e9052099666d6cc Mon Sep 17 00:00:00 2001 From: Jake Hutchinson Date: Tue, 24 Sep 2024 15:27:57 +0100 Subject: [PATCH 25/28] WIP: OFED documentation --- 
.github/workflows/package-build-ofed.yml | 8 ++++ doc/source/contributor/ofed.rst | 55 ++++++++++++++++++++++++ 2 files changed, 63 insertions(+) create mode 100644 doc/source/contributor/ofed.rst diff --git a/.github/workflows/package-build-ofed.yml b/.github/workflows/package-build-ofed.yml index 6fd338aff..798e0c4bf 100644 --- a/.github/workflows/package-build-ofed.yml +++ b/.github/workflows/package-build-ofed.yml @@ -195,6 +195,14 @@ jobs: env: KAYOBE_VAULT_PASSWORD: ${{ secrets.KAYOBE_VAULT_PASSWORD }} + - name: Run a distro-sync + run: | + source venvs/kayobe/bin/activate && + source src/kayobe-config/kayobe-env --environment ci-builder && + kayobe seed host command run --become --command "dnf distro-sync --refresh" + env: + KAYOBE_VAULT_PASSWORD: ${{ secrets.KAYOBE_VAULT_PASSWORD }} + - name: Reset BLS entries on the seed host run: | source venvs/kayobe/bin/activate && diff --git a/doc/source/contributor/ofed.rst b/doc/source/contributor/ofed.rst new file mode 100644 index 000000000..20c877188 --- /dev/null +++ b/doc/source/contributor/ofed.rst @@ -0,0 +1,55 @@ +==== +OFED +==== + +Warning: Experimental workflow subject to change + +This section documents the workflow for building OFED packages for Release train integration. + +The workflow builds the OFED kernel modules against the latest available kernel in Release train +(as configured in SKC) and compiles them into RPM packages to be uploaded to Ark. Addtionally, +this workflow downloads the userspace OFED packages from the Nvidia repository and uploads these +to Ark. + +Workflow +======== + +The workflow uses workflow_dispatch to manually request an OFED build, which will deploy a builder +VM, apply kayobe config to the builder, upgrade the kernel, reboot, then run two Ansible playbooks +for building and uploading OFED to Ark. + +Pre-requisites +-------------- + +Before building OFED packages, the workflow will ensure that: + +* A full distro-sync has taken place, ensuring the kernel is upgraded. 
+ +* The bootloader has been configured to use the latest kernel + +* noexec is disabled in the temporary logical volume. + +build-ofed +---------- + +Currently we only support building Rocky Linux 9 OFED packages. + +In order to setup OFED, we're required to build kernel modules for the OFED drivers as +the kernels we provide in release train are unsupported by OFED. To accomplish this we +will need to use the doca-kernel-support from the doca-extra repository. + +We will need to install dependencies in order to build the OFED kernel modules, and these +are installed at the beginning of the build playbook. We also install base and appstream +dependencies of userspace OFED packages here, this is intended to stop these dependencies +being pulled in later when we download the OFED packages from the doca-host repository. + +At the end of the playbook following the kernel module build, the OFED userspace packages +are downloaded from the upstream repository in order to upload these to Ark. + +push-ofed +--------- + +As we're not syncing OFED from any upstream source, and are instead creating our own +repository of custom packages, we will be required to setup the Pulp distribution/publication +and upload the content directly to Ark. This playbook uses the Pulp CLI to upload the RPMs +to Ark.
\ No newline at end of file From 1c44353e48dc9732a1a6d9f9d544e774abc83166 Mon Sep 17 00:00:00 2001 From: Jake Hutchinson Date: Fri, 27 Sep 2024 11:38:22 +0100 Subject: [PATCH 26/28] Fix no eol --- etc/kayobe/ofed.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etc/kayobe/ofed.yml b/etc/kayobe/ofed.yml index 6b4849019..696e3c93b 100644 --- a/etc/kayobe/ofed.yml +++ b/etc/kayobe/ofed.yml @@ -9,4 +9,4 @@ stackhpc_pulp_repo_doca_ofed_rhel9: name: Mellanox Technologies doca_ofed {{ stackhpc_pulp_doca_ofed_version }} Rocky 9.{{ stackhpc_pulp_repo_rocky_9_minor_version }} url: "{{ stackhpc_release_pulp_content_url }}/doca_ofed/{{ stackhpc_pulp_doca_ofed_version }}/rhel9.{{ stackhpc_pulp_repo_rocky_9_minor_version }}/x86_64" distribution_name: "mlnx-ofed-{{ stackhpc_pulp_doca_ofed_version }}-rocky-9-{{ stackhpc_pulp_repo_rocky_9_minor_version }}" - base_path: "mlnx_ofed/{{ stackhpc_pulp_doca_ofed_version }}/rhel9.{{ stackhpc_pulp_repo_rocky_9_minor_version }}/x86_64" \ No newline at end of file + base_path: "mlnx_ofed/{{ stackhpc_pulp_doca_ofed_version }}/rhel9.{{ stackhpc_pulp_repo_rocky_9_minor_version }}/x86_64" From 3490003acdb9ce025202252c264f72c4af701ee0 Mon Sep 17 00:00:00 2001 From: Jake Hutchinson Date: Fri, 27 Sep 2024 12:33:37 +0100 Subject: [PATCH 27/28] Add a release note --- doc/source/contributor/ofed.rst | 4 ++-- releasenotes/notes/ofed-workflow-119225f36a9265c3.yaml | 6 ++++++ 2 files changed, 8 insertions(+), 2 deletions(-) create mode 100644 releasenotes/notes/ofed-workflow-119225f36a9265c3.yaml diff --git a/doc/source/contributor/ofed.rst b/doc/source/contributor/ofed.rst index 20c877188..e53b0f125 100644 --- a/doc/source/contributor/ofed.rst +++ b/doc/source/contributor/ofed.rst @@ -4,7 +4,7 @@ OFED Warning: Experimental workflow subject to change -This section documents the workflow for building OFED packages for Release train integration. 
+This section documents the workflow for building OFED packages for Release train integration. The workflow builds the OFED kernel modules against the latest available kernel in Release train (as configured in SKC) and compiles them into RPM packages to be uploaded to Ark. Addtionally, @@ -52,4 +52,4 @@ push-ofed As we're not syncing OFED from any upstream source, and are instead creating our own repository of custom packages, we will be required to setup the Pulp distribution/publication and upload the content directly to Ark. This playbook uses the Pulp CLI to upload the RPMs -to Ark. \ No newline at end of file +to Ark. diff --git a/releasenotes/notes/ofed-workflow-119225f36a9265c3.yaml b/releasenotes/notes/ofed-workflow-119225f36a9265c3.yaml new file mode 100644 index 000000000..b371b1a17 --- /dev/null +++ b/releasenotes/notes/ofed-workflow-119225f36a9265c3.yaml @@ -0,0 +1,6 @@ +--- +features: + - | + Implement an OFED workflow that builds kernel modules + to support OFED drivers in release train kernels and + upload OFED kernel/userspace drivers to Ark. From 5eaf53c77e6ab6edfff2ee2ec66d19910556614b Mon Sep 17 00:00:00 2001 From: Jake Hutchinson Date: Fri, 27 Sep 2024 12:35:55 +0100 Subject: [PATCH 28/28] Add to docs tree --- doc/source/contributor/index.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/contributor/index.rst b/doc/source/contributor/index.rst index 988957541..0073c48a2 100644 --- a/doc/source/contributor/index.rst +++ b/doc/source/contributor/index.rst @@ -12,3 +12,4 @@ This guide is for contributors of the StackHPC Kayobe configuration project. environments/index package-updates pre-commit + ofed