From 7e6593d5fb604c9ef3467724fe9c9f93560d327f Mon Sep 17 00:00:00 2001 From: Fifo Phonics Date: Wed, 27 Mar 2024 10:54:42 -0400 Subject: [PATCH 1/8] Fedora: Bug fix koji download of kernel-modules-core-* --- fedora/nvidia-driver | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fedora/nvidia-driver b/fedora/nvidia-driver index a8468ad8..c78c19be 100755 --- a/fedora/nvidia-driver +++ b/fedora/nvidia-driver @@ -129,7 +129,7 @@ _install_prerequisites() ( rm ./*.rpm echo "Installing Linux kernel-modules-core files..." - if ! dnf -q -y download kernel-modules-core${KERNEL_VERSION} > /dev/null; then + if ! dnf -q -y download kernel-modules-core-${KERNEL_VERSION} > /dev/null; then echo "Failed to find kernel-modules-core-${KERNEL_VERSION} in repositories." echo "Trying to download kernel-modules-core from koji..." KOJI_KERNEL_CORE_RPM=$KOJI_BASE_URL/packages/kernel/$KERNEL_RPM_VERSION/$KERNEL_RPM_RELEASE/$KERNEL_RPM_ARCH/kernel-modules-core-$KERNEL_VERSION.rpm From f267633e61d3aa8525744097b9c818cba76380fa Mon Sep 17 00:00:00 2001 From: Fifo Phonics Date: Tue, 11 Jun 2024 11:26:58 -0400 Subject: [PATCH 2/8] Fedora: Reference arch via `uname -m` (remove hardcoding x86_64) --- fedora/nvidia-driver | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fedora/nvidia-driver b/fedora/nvidia-driver index c78c19be..fd30c2fb 100755 --- a/fedora/nvidia-driver +++ b/fedora/nvidia-driver @@ -240,9 +240,9 @@ _create_driver_package() ( # lrwxrwxrwx 1 root root 36 Dec 8 20:10 default -> /etc/alternatives/ofa_kernel_headers # drwxr-xr-x 4 root root 4096 Dec 8 20:14 x86_64 # lrwxrwxrwx 1 root root 44 Dec 9 19:05 5.4.0-90-generic -> /usr/src/ofa_kernel/x86_64/5.4.0-90-generic/ - if [[ -d /run/mellanox/drivers/usr/src/ofa_kernel/x86_64/`uname -r` ]]; then - if [[ ! 
-e /usr/src/ofa_kernel/`uname -r` ]]; then - ln -s /run/mellanox/drivers/usr/src/ofa_kernel/x86_64/`uname -r` /usr/src/ofa_kernel/ + if [[ -d "/run/mellanox/drivers/usr/src/ofa_kernel/$(uname -m)/$(uname -r)" ]]; then + if [[ ! -e "/usr/src/ofa_kernel/$(uname -r)" ]]; then + ln -s "/run/mellanox/drivers/usr/src/ofa_kernel/$(uname -m)/$(uname -r)" /usr/src/ofa_kernel/ fi fi fi From d6cb8c02bfc4a46a50367808ecd3dd7d6e663a38 Mon Sep 17 00:00:00 2001 From: Fifo Phonics Date: Tue, 13 Feb 2024 19:45:04 -0500 Subject: [PATCH 3/8] Fedora CICD Changes --- ci/fedora/.gitlab-ci-fcos.yml | 45 +++++++++++++++-------------- fedora/README.md | 53 ++++++++++++++++++----------------- 2 files changed, 51 insertions(+), 47 deletions(-) diff --git a/ci/fedora/.gitlab-ci-fcos.yml b/ci/fedora/.gitlab-ci-fcos.yml index a37feada..2297ffc4 100644 --- a/ci/fedora/.gitlab-ci-fcos.yml +++ b/ci/fedora/.gitlab-ci-fcos.yml @@ -35,9 +35,9 @@ # the commit sha). This release stage is purely to test out the CICD code that # would for the 'fedora' branch publish to a remote repository. # -# Branches == "fedora" and tags == .*fedora$ +# Tags == .*fedora$ # -# The protected branch 'fedora' will cause container image builds on all three +# Matching pipelines will cause container image builds on all three # fcos runner types and build ALL_DRIVER_VERSIONS. The images will then be scan- # ned and providing there are no detected vulnerabilities will be pushed to the # remote repository defined by RELEASE_REGISTRY_PROJECT. @@ -49,7 +49,7 @@ # # Branches == "fedora.+" # -# Any other protected branch with the word fedora in it will do the same - build +# Any protected branch with the word fedora in it will do the same - build # all the NVIDIA driver versions on all the fcos releases - and scan them, but # will not publish them to the remote registry. 
# @@ -99,7 +99,7 @@ variables: DRIVER_VERSION: "535.154.05" DRIVER_VERSIONS: 535.154.05 525.147.05 - CUDA_VERSION: 12.2.0 + CUDA_VERSION: 12.3.1 CVE_UPDATES: "curl libc6" @@ -115,9 +115,9 @@ variables: RELEASE_REGISTRY_TOKEN: "" default: - image: docker:20.10.10-git + image: docker:25.0.2-git services: - - name: docker:20.10.10-dind + - name: docker:25.0.2-dind stages: - build @@ -199,8 +199,9 @@ build-push-next-one-only: - for driver_version in ${DRIVER_VERSION}; do build_push_fn ${driver_version} $OVERWRITE_TAGS ${CI_COMMIT_SHORT_SHA}-; done tags: - fcos-next - except: - - /fedora/ + rules: + # Only run on branches (not tags) which do not start with fedora + - if: $CI_COMMIT_REF_NAME !~ /^fedora/ && $CI_COMMIT_TAG == null build-push: stage: build @@ -212,8 +213,8 @@ build-push: - STREAM: [next, testing, stable] tags: - fcos-${STREAM} - only: - - /fedora/ + rules: + - if: $CI_COMMIT_REF_NAME =~ /^fedora/ || $CI_COMMIT_TAG =~ /fedora$/ .common-scan: image: registry.gitlab.com/security-products/container-scanning:6 @@ -273,8 +274,9 @@ scan-next-one-only: - scan_fn ${DRIVER_VERSION} ${CI_COMMIT_SHORT_SHA}- tags: - fcos-next - except: - - /fedora/ + rules: + # Only run on branches (not tags) which do not start with fedora + - if: $CI_COMMIT_REF_NAME !~ /^fedora/ && $CI_COMMIT_TAG == null # Gitlab does not yet support matrix jobs with dynamic matrix-based dependencies. 
# https://forum.gitlab.com/t/ci-specifying-artifact-dependencies-when-using-parallel-matrix/45026/2 @@ -288,24 +290,24 @@ scan-next: needs: ["build-push: [next]"] tags: - fcos-next - only: - - /fedora/ + rules: + - if: $CI_COMMIT_REF_NAME =~ /^fedora/ || $CI_COMMIT_TAG =~ /fedora$/ scan-testing: extends: .common-scan needs: ["build-push: [testing]"] tags: - fcos-testing - only: - - /fedora/ + rules: + - if: $CI_COMMIT_REF_NAME =~ /^fedora/ || $CI_COMMIT_TAG =~ /fedora$/ scan-stable: extends: .common-scan needs: ["build-push: [stable]"] tags: - fcos-stable - only: - - /fedora/ + rules: + - if: $CI_COMMIT_REF_NAME =~ /^fedora/ || $CI_COMMIT_TAG =~ /fedora$/ .common-release-fn-script: &common-release-fn-script - | @@ -347,7 +349,7 @@ scan-stable: - docker login -u "${RELEASE_REGISTRY_USER}" -p "${RELEASE_REGISTRY_TOKEN}" "${RELEASE_REGISTRY}" - for driver_version in ${DRIVER_VERSIONS:-${DRIVER_VERSION}}; do release_fn ${driver_version};done rules: - - if: $CI_COMMIT_TAG =~ /fedora$/ || $CI_COMMIT_REF_NAME == 'fedora' + - if: $CI_COMMIT_TAG =~ /fedora$/ release-next-one-only: stage: release @@ -370,8 +372,9 @@ release-next-one-only: - for driver_version in ${DRIVER_VERSION}; do release_fn ${driver_version} ${OVERWRITE_REMOTE_TAGS} ${CI_COMMIT_SHORT_SHA}-; done tags: - fcos-next - except: - - /fedora/ + rules: + # Only run on branches (not tags) which do not start with fedora + - if: $CI_COMMIT_REF_NAME !~ /^fedora/ && $CI_COMMIT_TAG == null # Gitlab does not yet support matrix jobs with dynamic matrix-based dependencies. # https://forum.gitlab.com/t/ci-specifying-artifact-dependencies-when-using-parallel-matrix/45026/2 diff --git a/fedora/README.md b/fedora/README.md index 54174658..3c6fa8a7 100644 --- a/fedora/README.md +++ b/fedora/README.md @@ -27,7 +27,7 @@ Currently built driver versions are specified in `ci/fedora/.common-ci-fcos.yml` The driver container is privileged, and here we choose to launch via podman instead of docker although both work. 
```bash -$ DRIVER_VERSION=535.104.12 # Check ci/fedora/.common-ci-fcos.yml for latest +$ DRIVER_VERSION=535.154.05 # Check ci/fedora/.common-ci-fcos.yml for latest $ FEDORA_VERSION_ID=$(cat /etc/os-release | grep VERSION_ID | cut -d = -f2) $ podman run -d --privileged --pid=host \ -v /run/nvidia:/run/nvidia:shared \ @@ -58,7 +58,7 @@ storage: ExecStartPre=-setenforce 0 ExecStartPre=-/bin/mkdir -p /run/nvidia ExecStartPre=-/bin/sh -c 'KERNEL_VERSION=$(/bin/uname -r);FEDORA_VERSION_ID=$(cat /etc/os-release | grep VERSION_ID | cut -d = -f2); \ - /bin/podman pull registry.gitlab.com/container-toolkit-fcos/driver:535.104.12-$$KERNEL_VERSION-fedora$$FEDORA_VERSION_ID' + /bin/podman pull registry.gitlab.com/container-toolkit-fcos/driver:535.154.05-$$KERNEL_VERSION-fedora$$FEDORA_VERSION_ID' ExecStartPre=-/usr/sbin/modprobe video ExecStart=/bin/sh -c 'KERNEL_VERSION=$(/bin/uname -r);FEDORA_VERSION_ID=$(cat /etc/os-release | grep VERSION_ID | cut -d = -f2); \ /bin/podman run --name nvidia-driver \ @@ -67,7 +67,7 @@ storage: --privileged --pid=host \ # No need for network IF using container image with pre-built kernel headers \ --network=none \ - registry.gitlab.com/container-toolkit-fcos/driver:535.104.12-$$KERNEL_VERSION-fedora$$FEDORA_VERSION_ID \ + registry.gitlab.com/container-toolkit-fcos/driver:535.154.05-$$KERNEL_VERSION-fedora$$FEDORA_VERSION_ID \ --accept-license' ExecStop=/bin/podman stop nvidia-driver @@ -84,29 +84,30 @@ You should be able to step into the driver container and run the `nvidia-smi` to ```bash $ # Assumes you're running the driver container via podman and named nvidia-driver as above... 
-$ podman exec -it nvidia-driver bash -[root@8dc88dad905e nvidia-510.47.03]# nvidia-smi -Wed May 25 15:24:00 2022 -+-----------------------------------------------------------------------------+ -| NVIDIA-SMI 525.85.12 Driver Version: 525.85.12 CUDA Version: 12.0 | -|-------------------------------+----------------------+----------------------+ -| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | -| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | -| | | MIG M. | -|===============================+======================+======================| -| 0 NVIDIA A10G On | 00000000:00:1E.0 Off | 0 | -| 0% 39C P0 197W / 300W | 22022MiB / 23028MiB | 96% Default | -| | | N/A | -+-------------------------------+----------------------+----------------------+ - -+-----------------------------------------------------------------------------+ -| Processes: | -| GPU GI CI PID Type Process name GPU Memory | -| ID ID Usage | -|=============================================================================| -| No running processes found | -+-----------------------------------------------------------------------------+ -[root@8dc88dad905e]# +$ podman exec -it nvidia-driver sh +sh-5.2# nvidia-smi +Wed Feb 14 17:58:08 2024 ++---------------------------------------------------------------------------------------+ +| NVIDIA-SMI 535.154.05 Driver Version: 535.154.05 CUDA Version: 12.2 | +|-----------------------------------------+----------------------+----------------------+ +| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | +| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | +| | | MIG M. 
| +|=========================================+======================+======================| +| 0 NVIDIA A10G On | 00000000:00:1E.0 Off | 0 | +| 0% 26C P0 58W / 300W | 21216MiB / 23028MiB | 0% Default | +| | | N/A | ++-----------------------------------------+----------------------+----------------------+ + ++---------------------------------------------------------------------------------------+ +| Processes: | +| GPU GI CI PID Type Process name GPU Memory | +| ID ID Usage | +|=======================================================================================| +| 0 N/A N/A 11339 C tensorflow_model_server 21208MiB | ++---------------------------------------------------------------------------------------+ +| No running processes found | ++---------------------------------------------------------------------------------------+ ``` ### Install Container Runtime / Toolkit From e90baeee011bea2cd6c2ad5c6c621283e3288098 Mon Sep 17 00:00:00 2001 From: Fifo Phonics Date: Tue, 30 Apr 2024 15:47:25 -0400 Subject: [PATCH 4/8] Fedora: Add patch utility --- fedora/Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fedora/Dockerfile b/fedora/Dockerfile index 44841d34..1bd5683e 100644 --- a/fedora/Dockerfile +++ b/fedora/Dockerfile @@ -63,6 +63,8 @@ ENV NVIDIA_VISIBLE_DEVICES=void # getopt etc. 
RUN dnf install -y util-linux 'dnf-command(download)'
 
+RUN dnf install -y patch
+
 ADD install.sh /tmp/
 
 RUN NVIDIA_GPGKEY_SUM=d0664fbbdb8c32356d45de36c5984617217b2d0bef41b93ccecd326ba3b80c87 && \

From 3ccde50db6b89bcbfec788a12509bff6a45fc799 Mon Sep 17 00:00:00 2001
From: Fifo Phonics
Date: Tue, 30 Apr 2024 15:48:31 -0400
Subject: [PATCH 5/8] Fedora: Apply single patch on init command

Apply patch on init via installer to produce/use new patched installer
(-custom.run)
---
 fedora/nvidia-driver | 44 ++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 42 insertions(+), 2 deletions(-)

diff --git a/fedora/nvidia-driver b/fedora/nvidia-driver
index fd30c2fb..c18a3b21 100755
--- a/fedora/nvidia-driver
+++ b/fedora/nvidia-driver
@@ -595,14 +595,54 @@ _start_vgpu_topology_daemon() {
     nvidia-topologyd
 }
 
+# Apply a single installer patch, if one has been mounted at /patch.
+#
+# If /patch holds exactly one *.patch file, the NVIDIA .run installer is run
+# with --apply-patch to produce a patched installer, and PATCHED_SUFFIX is set
+# to "-custom" (the default file name ending of the patched installer).
+# If /patch is missing or holds no *.patch file, nothing is done. More than
+# one patch, or a patch that fails to apply, ends the script with exit 1.
+_apply_patch() {
+    local candidate
+    local patches=()
+
+    [ -d /patch ] || return 0
+
+    for candidate in /patch/*.patch; do
+        # Unless nullglob is set, an unmatched glob stays as the literal
+        # pattern, so keep only entries that really exist.
+        if [ -e "${candidate}" ]; then
+            patches+=("${candidate}")
+        fi
+    done
+
+    if [ "${#patches[@]}" -eq 0 ]; then
+        return 0
+    fi
+    if [ "${#patches[@]}" -gt 1 ]; then
+        echo "Multiple patches found, only one patch is supported"
+        exit 1
+    fi
+
+    if ! sh "NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run" --apply-patch "${patches[0]}" -m="${KERNEL_TYPE}"; then
+        echo "Failed to apply patch '${patches[0]}' to the NVIDIA installer"
+        exit 1
+    fi
+    # The patched installer has by default the file name ending '-custom.run'
+    PATCHED_SUFFIX="-custom"
+    echo -e "NVIDIA Software installer patched with '${patches[0]}'\n"
+}
+
 _prepare() {
     if [ "${DRIVER_TYPE}" = "vgpu" ]; then
         _find_vgpu_driver_version || exit 1
     fi
 
+    _apply_patch
+
     # Install the userspace components and copy the kernel module sources.
- sh NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run -x -m=${KERNEL_TYPE} && \ - cd NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION && \ + sh NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION${PATCHED_SUFFIX:-}.run -x -m=${KERNEL_TYPE} && \ + cd NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION${PATCHED_SUFFIX:-} && \ sh /tmp/install.sh nvinstall && \ mkdir -p /usr/src/nvidia-$DRIVER_VERSION && \ mv LICENSE mkprecompiled ${KERNEL_TYPE} /usr/src/nvidia-$DRIVER_VERSION && \ From 02792aa386326beec298962c1fa8acf974eccd33 Mon Sep 17 00:00:00 2001 From: Fifo Phonics Date: Wed, 27 Mar 2024 10:37:19 -0400 Subject: [PATCH 6/8] Fedora: Bump Golang Version --- fedora/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fedora/Dockerfile b/fedora/Dockerfile index 1bd5683e..42afd078 100644 --- a/fedora/Dockerfile +++ b/fedora/Dockerfile @@ -9,7 +9,7 @@ SHELL ["/bin/bash", "-c"] RUN dnf install -y git wget -ENV GOLANG_VERSION=1.21.5 +ENV GOLANG_VERSION=1.22.2 # download appropriate binary based on the target architecture for multi-arch builds RUN OS_ARCH=${TARGETARCH/x86_64/amd64} && OS_ARCH=${OS_ARCH/aarch64/arm64} && \ From fc608cf9c8756eeb2a97b9a2ab3ec032b469e9e5 Mon Sep 17 00:00:00 2001 From: Fifo Phonics Date: Wed, 27 Mar 2024 10:35:49 -0400 Subject: [PATCH 7/8] Fedora: Bump driver versions + CUDA version --- ci/fedora/.gitlab-ci-fcos.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ci/fedora/.gitlab-ci-fcos.yml b/ci/fedora/.gitlab-ci-fcos.yml index 2297ffc4..c4557d1a 100644 --- a/ci/fedora/.gitlab-ci-fcos.yml +++ b/ci/fedora/.gitlab-ci-fcos.yml @@ -96,10 +96,10 @@ variables: # To survey latest Data Center driver versions available: # https://www.nvidia.com/Download/Find.aspx # https://www.nvidia.com/en-us/drivers/unix/ - DRIVER_VERSION: "535.154.05" - DRIVER_VERSIONS: 535.154.05 525.147.05 + DRIVER_VERSION: "550.90.07" + DRIVER_VERSIONS: 550.90.07 535.183.01 - CUDA_VERSION: 12.3.1 + CUDA_VERSION: 12.4.1 CVE_UPDATES: "curl libc6" From 
0c75962506e06953f64d272e3170ec0f82e11875 Mon Sep 17 00:00:00 2001 From: Fifo Phonics Date: Tue, 11 Jun 2024 11:04:09 -0400 Subject: [PATCH 8/8] Fedora: Add how to apply patch in README.md --- fedora/README.md | 122 +++++++++++++++++++++++++++++------------------ 1 file changed, 76 insertions(+), 46 deletions(-) diff --git a/fedora/README.md b/fedora/README.md index 3c6fa8a7..acb0738d 100644 --- a/fedora/README.md +++ b/fedora/README.md @@ -27,7 +27,7 @@ Currently built driver versions are specified in `ci/fedora/.common-ci-fcos.yml` The driver container is privileged, and here we choose to launch via podman instead of docker although both work. ```bash -$ DRIVER_VERSION=535.154.05 # Check ci/fedora/.common-ci-fcos.yml for latest +$ DRIVER_VERSION=550.90.07 # Check ci/fedora/.common-ci-fcos.yml for latest driver versions $ FEDORA_VERSION_ID=$(cat /etc/os-release | grep VERSION_ID | cut -d = -f2) $ podman run -d --privileged --pid=host \ -v /run/nvidia:/run/nvidia:shared \ @@ -36,13 +36,14 @@ $ podman run -d --privileged --pid=host \ registry.gitlab.com/container-toolkit-fcos/driver:${DRIVER_VERSION}-fedora$$FEDORA_VERSION_ID ``` -Or, on FCOS registering as a systemd unit via an ignition snippet, and using an image with kernel headers pre-installed for faster start up: +Or, on FCOS registering as a systemd unit via an ignition snippet. In this unit we attempt to pull a driver image matching the running kernel version (with pre-compiled kernel headers), but fall back to a generic Fedora version if one does not exist. Furthermore, we +mount a single patch file from a host directory that, if detected, will be applied to the generic Fedora version. 
```yaml variant: fcos -version: 1.4.0 -storage: - files: +version: 1.5.0 +systemd: + units: - name: acme-nvidia-driver.service enabled: true contents: | @@ -57,18 +58,32 @@ storage: ExecStartPre=-/bin/podman rm nvidia-driver ExecStartPre=-setenforce 0 ExecStartPre=-/bin/mkdir -p /run/nvidia - ExecStartPre=-/bin/sh -c 'KERNEL_VERSION=$(/bin/uname -r);FEDORA_VERSION_ID=$(cat /etc/os-release | grep VERSION_ID | cut -d = -f2); \ - /bin/podman pull registry.gitlab.com/container-toolkit-fcos/driver:535.154.05-$$KERNEL_VERSION-fedora$$FEDORA_VERSION_ID' + # 5/17/24 - Without the following line the nvidia driver container will crash with no meaningful error message ExecStartPre=-/usr/sbin/modprobe video - ExecStart=/bin/sh -c 'KERNEL_VERSION=$(/bin/uname -r);FEDORA_VERSION_ID=$(cat /etc/os-release | grep VERSION_ID | cut -d = -f2); \ - /bin/podman run --name nvidia-driver \ - -v /run/nvidia:/run/nvidia:shared \ - -v /var/log:/var/log \ - --privileged --pid=host \ - # No need for network IF using container image with pre-built kernel headers \ - --network=none \ - registry.gitlab.com/container-toolkit-fcos/driver:535.154.05-$$KERNEL_VERSION-fedora$$FEDORA_VERSION_ID \ - --accept-license' + + # If there is a kernel-specific image (with pre-compiled kernel headers) then + # use it, otherwise fallback to the generic Fedora image mounting any patches that exist. 
+ # + # Replace registry.gitlab.com/container-toolkit-fcos/driver with the registry name + # of your built/published driver images, or perhaps, docker.io/fifofonix/driver + ExecStart=/bin/sh -c ' \ + FEDORA_VERSION_ID=$(cat /etc/os-release | grep VERSION_ID | cut -d = -f2); \ + KERNEL_VERSION=$(/bin/uname -r); \ + if /bin/podman manifest inspect registry.gitlab.com/container-toolkit-fcos/driver:550.90.07-$$KERNEL_VERSION-fedora$$FEDORA_VERSION_ID > /dev/null; then \ + IMAGE_NAME=registry.gitlab.com/container-toolkit-fcos/driver:550.90.07-$$KERNEL_VERSION-fedora$$FEDORA_VERSION_ID; \ + else \ + IMAGE_NAME=registry.gitlab.com/container-toolkit-fcos/driver:550.90.07-fedora$$FEDORA_VERSION_ID; \ + PATCH_MOUNT="-v /var/acme/nvidia-driver-patch:/patch"; \ + fi; \ + /bin/podman pull $$IMAGE_NAME; \ + /bin/podman run --name nvidia-driver \ + -v /run/nvidia:/run/nvidia:shared \ + -v /var/log:/var/log \ + $$PATCH_MOUNT \ + --privileged \ + --pid host \ + $$IMAGE_NAME \ + --accept-license' ExecStop=/bin/podman stop nvidia-driver Restart=on-failure @@ -86,46 +101,62 @@ You should be able to step into the driver container and run the `nvidia-smi` to $ # Assumes you're running the driver container via podman and named nvidia-driver as above... $ podman exec -it nvidia-driver sh sh-5.2# nvidia-smi -Wed Feb 14 17:58:08 2024 -+---------------------------------------------------------------------------------------+ -| NVIDIA-SMI 535.154.05 Driver Version: 535.154.05 CUDA Version: 12.2 | -|-----------------------------------------+----------------------+----------------------+ -| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | -| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | -| | | MIG M. 
| -|=========================================+======================+======================| -| 0 NVIDIA A10G On | 00000000:00:1E.0 Off | 0 | -| 0% 26C P0 58W / 300W | 21216MiB / 23028MiB | 0% Default | -| | | N/A | -+-----------------------------------------+----------------------+----------------------+ - -+---------------------------------------------------------------------------------------+ -| Processes: | -| GPU GI CI PID Type Process name GPU Memory | -| ID ID Usage | -|=======================================================================================| -| 0 N/A N/A 11339 C tensorflow_model_server 21208MiB | -+---------------------------------------------------------------------------------------+ -| No running processes found | -+---------------------------------------------------------------------------------------+ +Tue Jun 11 19:55:25 2024 ++-----------------------------------------------------------------------------------------+ +| NVIDIA-SMI 550.90.07 Driver Version: 550.90.07 CUDA Version: 12.4 | +|-----------------------------------------+------------------------+----------------------+ +| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | +| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | +| | | MIG M. 
| +|=========================================+========================+======================| +| 0 Tesla M60 On | 00000000:00:1E.0 Off | 0 | +| N/A 47C P0 46W / 150W | 7131MiB / 7680MiB | 0% Default | +| | | N/A | ++-----------------------------------------+------------------------+----------------------+ + ++-----------------------------------------------------------------------------------------+ +| Processes: | +| GPU GI CI PID Type Process name GPU Memory | +| ID ID Usage | +|=========================================================================================| +| No running processes found | ++-----------------------------------------------------------------------------------------+ ``` ### Install Container Runtime / Toolkit To run a CUDA container that leverages the NVIDIA driver container you now have running, install the separate NVIDIA container runtime and register it with your container runtime system (e.g. docker) following NVIDIA's instructions [here](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html). 
-On FedoraCoreOS you may choose to layer the container toolkit using `rpm-ostree`, and configure your runtime, with an ignition snippet like this (substitute your runtime, docker is shown, but containerd works too for example): +On FedoraCoreOS you may choose to layer the container toolkit using `rpm-ostree`, and configure your runtime, with an ignition snippet like this (substitute your runtime, containerd is shown, but docker works too for example): ```yaml variant: fcos -version: 1.4.0 +version: 1.5.0 storage: files: - - name: acme-layer-nvidia-container-runtime.service + - path: /etc/nvidia-container-runtime/config.toml + mode: 0644 + contents: + inline: | + [nvidia-container-cli] + #debug = "/var/log/nvidia-container-toolkit.log" + root = "/run/nvidia/driver" + path = "/usr/bin/nvidia-container-cli" + # Improvements made in NVIDIA container toolkit 1.15.0 do not yet seem to correctly + # support FCOS so we still need to explicitly add the driver path to ld.so.conf + - path: /etc/ld.so.conf.d/container-toolkit.conf + mode: 0644 + contents: + inline: | + /run/nvidia/driver/usr/lib64 +systemd: + units: + - name: acme-layer-nvidia-container-toolkit.service enabled: true # We run before `zincati.service` to avoid conflicting rpm-ostree transactions. contents: | [Unit] + Wants=network-online.target After=network-online.target Before=zincati.service ConditionPathExists=!/var/lib/%N.stamp @@ -138,13 +169,12 @@ storage: ExecStartPre=-/bin/sh -c 'curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo \ > /etc/yum.repos.d/nvidia-container-toolkit.repo' # Perhaps consider pinning the rpm version here depending on change aversion... - ExecStart=/usr/bin/rpm-ostree install --idempotent --allow-inactive --apply-live nvidia-container-toolkit - ExecStart=/bin/sh -c 'echo "/run/nvidia/driver/usr/lib64" > /etc/ld.so.conf.d/nv.conf; ldconfig' - # If we see that the nvidia-ctk is present, then we can configure docker... 
+ ExecStart=/usr/bin/rpm-ostree install -y --idempotent --allow-inactive nvidia-container-toolkit ExecStart=/bin/sh -c 'if [[ -f /usr/bin/nvidia-ctk ]]; then \ - /usr/bin/nvidia-ctk runtime configure --runtime=docker --nvidia-set-as-default; \ - systemctl restart docker; \ + /usr/bin/nvidia-ctk runtime configure --runtime=containerd --nvidia-set-as-default; \ + systemctl restart containerd; \ /bin/touch /var/lib/%N.stamp; fi' + ExecStart=/bin/systemctl --no-block reboot Restart=on-failure RestartSec=60