Skip to content

Commit 3cfc27b

Browse files
test e2e build with cuda 12.8
1 parent 9095091 commit 3cfc27b

File tree

3 files changed

+49
-28
lines changed

3 files changed

+49
-28
lines changed

.github/workflows/build-and-push-base.yaml

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -11,8 +11,7 @@ on:
1111
env:
1212
REGISTRY: ghcr.io
1313
IMAGE_NAME: ${{ github.repository }}-base
14-
CUDA_VERSION: 12_2
15-
NCCL_VERSION: 2_22_3_1
14+
CUDA_VERSION: 12_8
1615
jobs:
1716
docker:
1817
timeout-minutes: 40
@@ -42,7 +41,7 @@ jobs:
4241
context: .
4342
push: true
4443
tags: |
45-
${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:cuda${{ env.CUDA_VERSION }}-nccl${{ env.NCCL_VERSION }}
44+
${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:cuda${{ env.CUDA_VERSION }}
4645
platforms: linux/amd64
4746
cache-from: type=gha
4847
cache-to: type=gha,mode=max

.github/workflows/build-and-push.yaml

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,9 @@
11
name: Build and push docker image
22

33
on:
4+
push:
5+
branches:
6+
- "chore/bump-cuda-for-e2e"
47
release:
58
types:
69
- 'published'
@@ -44,7 +47,7 @@ jobs:
4447
tags: |
4548
${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.event.release.tag_name }},
4649
${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.sha }}
47-
${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:latest
50+
${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:e2e-latest
4851
platforms: linux/amd64
4952
cache-from: type=gha
5053
cache-to: type=gha,mode=max

Dockerfile.base

Lines changed: 43 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -1,48 +1,67 @@
11
FROM --platform=linux/amd64 ubuntu:22.04 as build-image
22

3-
RUN apt-get update && apt-get install -y pkg-config wget libssl-dev ca-certificates protobuf-compiler \
3+
ENV DEBIAN_FRONTEND=noninteractive
4+
5+
# Base build deps
6+
RUN apt-get update && apt-get install -y --no-install-recommends \
7+
pkg-config wget curl ca-certificates git build-essential debhelper \
8+
libssl-dev protobuf-compiler \
49
&& rm -rf /var/lib/apt/lists/*
10+
11+
# Add NVIDIA CUDA repo & install CUDA 12.8 + NCCL from that repo
512
RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb \
613
&& dpkg -i cuda-keyring_1.1-1_all.deb \
714
&& apt-get update \
8-
&& apt-get install -y cuda-toolkit-12-2 libnccl2=2.22.3-1+cuda12.2 libnccl-dev=2.22.3-1+cuda12.2 git curl build-essential debhelper
15+
&& apt-get install -y --no-install-recommends \
16+
cuda-toolkit-12-8 \
17+
libnccl2 libnccl-dev \
18+
&& rm -rf /var/lib/apt/lists/*
919

10-
ARG GDRCOPY_VERSION=v2.4.1
11-
ARG EFA_INSTALLER_VERSION=1.34.0
12-
ARG NCCL_INSTALL_VERSION=v2.22.3-1
13-
ARG AWS_OFI_NCCL_VERSION=aws
20+
# ---- Versions (override as needed) ----
21+
ARG GDRCOPY_VERSION=v2.5.1
22+
ARG EFA_INSTALLER_VERSION=1.43.1
23+
ARG AWS_OFI_NCCL_VERSION=v1.16.3
24+
# ---------------------------------------
1425

15-
ENV LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/nccl/build/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/lib:$LD_LIBRARY_PATH
16-
ENV PATH=/opt/amazon/openmpi/bin/:/opt/amazon/efa/bin:$PATH
26+
# Helpful envs (CUDA + CUPTI visible; prefer system NCCL in /usr)
27+
ENV LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib:/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH
28+
ENV PATH=/usr/local/cuda/bin:$PATH
1729

30+
# ----- GDRCopy (optional but common with EFA) -----
1831
RUN git clone -b ${GDRCOPY_VERSION} https://github.com/NVIDIA/gdrcopy.git /tmp/gdrcopy \
19-
&& cd /tmp/gdrcopy \
20-
&& make prefix=/opt/gdrcopy install
21-
22-
ENV LD_LIBRARY_PATH=/opt/gdrcopy/lib:/usr/local/cuda/compat:$LD_LIBRARY_PATH
23-
ENV LIBRARY_PATH=/opt/gdrcopy/lib:/usr/local/cuda/compat/:$LIBRARY_PATH
32+
&& make -C /tmp/gdrcopy prefix=/opt/gdrcopy install
33+
ENV LD_LIBRARY_PATH=/opt/gdrcopy/lib:$LD_LIBRARY_PATH
34+
ENV LIBRARY_PATH=/opt/gdrcopy/lib:$LIBRARY_PATH
2435
ENV CPATH=/opt/gdrcopy/include:$CPATH
2536
ENV PATH=/opt/gdrcopy/bin:$PATH
2637

27-
# Install EFA installer
38+
# ----- EFA installer (userspace libfabric, Open MPI, etc.) -----
2839
RUN cd /tmp \
2940
&& curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
30-
&& tar -xf /tmp/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
31-
&& cd /tmp/aws-efa-installer \
41+
&& tar -xf aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
42+
&& cd aws-efa-installer \
43+
# --skip-kmod because kernel drivers come from the host on EKS
3244
&& ./efa_installer.sh -y -d --skip-kmod --skip-limit-conf --no-verify | tee /tmp/aws-efa-installer/install.log
3345

34-
# Test EFA installation
46+
# Confirm libfabric/fi_info presence
3547
RUN /opt/amazon/efa/bin/fi_info --version
3648

37-
# Install AWS-OFI-NCCL plugin
49+
# EFA now installs Open MPI 5 under /opt/amazon/openmpi5 (update paths)
50+
ENV PATH=/opt/amazon/openmpi5/bin:/opt/amazon/efa/bin:$PATH
51+
ENV LD_LIBRARY_PATH=/opt/amazon/openmpi5/lib:/opt/amazon/efa/lib:$LD_LIBRARY_PATH
52+
53+
# ----- AWS-OFI-NCCL plugin build against system NCCL (/usr) -----
3854
RUN cd /tmp \
3955
&& git clone https://github.com/aws/aws-ofi-nccl.git \
40-
&& cd /tmp/aws-ofi-nccl \
56+
&& cd aws-ofi-nccl \
4157
&& git checkout ${AWS_OFI_NCCL_VERSION} \
4258
&& ./autogen.sh \
4359
&& ./configure --prefix=/opt/aws-ofi-nccl/install \
44-
--with-libfabric=/opt/amazon/efa/ \
45-
--with-cuda=/usr/local/cuda \
46-
--with-nccl=/tmp/nccl/build \
47-
--with-mpi=/opt/amazon/openmpi/ \
48-
&& make && make install
60+
--with-libfabric=/opt/amazon/efa \
61+
--with-cuda=/usr/local/cuda \
62+
--with-nccl=/usr \
63+
--with-mpi=/opt/amazon/openmpi5 \
64+
&& make -j && make install
65+
66+
# Final sanity: show NCCL and CUDA versions available to the linker
67+
RUN ldconfig -p | grep -E 'nccl|cuda' || true

0 commit comments

Comments
 (0)