# Build stage. Pinned to linux/amd64 because the NVIDIA CUDA apt repository
# registered later in this file is the ubuntu2204/x86_64 one.
FROM --platform=linux/amd64 ubuntu:22.04 AS build-image
# Build-time only: keeps apt/dpkg from prompting during package installs.
# Declared as ARG rather than ENV so the setting is visible to the RUN steps of
# this stage but is not persisted into the image's runtime environment.
# NOTE(review): a stage that builds FROM this one and runs apt-get must
# redeclare the ARG itself.
ARG DEBIAN_FRONTEND=noninteractive

# Base build dependencies, one per line and sorted so later diffs stay small.
# The apt package lists are removed in the same layer that created them.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        curl \
        debhelper \
        git \
        libssl-dev \
        pkg-config \
        protobuf-compiler \
        wget \
    && rm -rf /var/lib/apt/lists/*
# Optional exact apt version for the NCCL packages, passed with
# --build-arg NCCL_PACKAGE_VERSION=<version>. Empty (the default) installs
# whatever version the NVIDIA repo currently offers.
# NOTE(review): an unpinned libnccl2 may be built for a different CUDA release
# than the 12.8 toolkit installed below - confirm and pin for reproducibility.
ARG NCCL_PACKAGE_VERSION=""

# Register the NVIDIA CUDA apt repository through its keyring package, then
# install the CUDA 12.8 toolkit plus the NCCL runtime and headers from it.
# The keyring .deb and the apt package lists are deleted in the same layer.
RUN wget -O /tmp/cuda-keyring.deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb \
    && dpkg -i /tmp/cuda-keyring.deb \
    && rm -f /tmp/cuda-keyring.deb \
    && apt-get update \
    && apt-get install -y --no-install-recommends \
        cuda-toolkit-12-8 \
        "libnccl2${NCCL_PACKAGE_VERSION:+=${NCCL_PACKAGE_VERSION}}" \
        "libnccl-dev${NCCL_PACKAGE_VERSION:+=${NCCL_PACKAGE_VERSION}}" \
    && rm -rf /var/lib/apt/lists/*
# Component versions; override any of them with --build-arg.
#   AWS_OFI_NCCL_VERSION   git ref of aws-ofi-nccl that gets checked out
#   EFA_INSTALLER_VERSION  version in the AWS EFA installer tarball name
#   GDRCOPY_VERSION        branch/tag of gdrcopy that gets cloned
ARG AWS_OFI_NCCL_VERSION=v1.16.3
ARG EFA_INSTALLER_VERSION=1.43.1
ARG GDRCOPY_VERSION=v2.5.1
# Helpful envs: CUDA binaries on PATH; CUPTI, /usr/local/lib and the distro
# library directory visible to the loader (prefer system NCCL in /usr).
# LD_LIBRARY_PATH has no earlier value in this image, so it is set outright.
# Appending the undefined "$LD_LIBRARY_PATH" would leave a trailing ":" whose
# empty entry makes the dynamic loader search the current working directory.
ENV LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib:/usr/lib/x86_64-linux-gnu \
    PATH=/usr/local/cuda/bin:$PATH
# ----- GDRCopy (optional but common with EFA) -----
# Shallow-clone the requested branch/tag directly from github.com (not a
# third-party mirror host) and install it with prefix /opt/gdrcopy.
RUN git clone --depth 1 -b ${GDRCOPY_VERSION} https://github.com/NVIDIA/gdrcopy.git /tmp/gdrcopy \
    && make -C /tmp/gdrcopy prefix=/opt/gdrcopy install

# Expose the GDRCopy install to the loader, linker, compiler and shell.
# LIBRARY_PATH and CPATH have no earlier value in this file, so they are set
# outright: appending an undefined variable would leave an empty path entry,
# which the compiler treats as the current working directory.
ENV LD_LIBRARY_PATH=/opt/gdrcopy/lib:$LD_LIBRARY_PATH \
    LIBRARY_PATH=/opt/gdrcopy/lib \
    CPATH=/opt/gdrcopy/include \
    PATH=/opt/gdrcopy/bin:$PATH
# ----- EFA installer (userspace libfabric, Open MPI, etc.) -----
# Use bash with pipefail from here on so a failing installer is not masked by
# the `| tee` below; the default /bin/sh reports only tee's exit status.
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

# Download the pinned installer (-f makes curl fail on an HTTP error instead
# of saving the error page), unpack it and run it non-interactively.
# --skip-kmod because kernel drivers come from the host on EKS.
# Installer output is kept in /tmp/aws-efa-installer/install.log; the
# downloaded tarball is deleted in the same layer.
RUN cd /tmp \
    && curl -fsSLO https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
    && tar -xf aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
    && rm -f aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
    && cd aws-efa-installer \
    && ./efa_installer.sh -y -d --skip-kmod --skip-limit-conf --no-verify | tee /tmp/aws-efa-installer/install.log
# Smoke test: the build stops here unless a runnable fi_info exists under
# /opt/amazon/efa/bin.
RUN ["/opt/amazon/efa/bin/fi_info", "--version"]

# EFA now installs Open MPI 5 under /opt/amazon/openmpi5. Put it and the EFA
# tools/libraries ahead of what is already on PATH and LD_LIBRARY_PATH.
ENV PATH=/opt/amazon/openmpi5/bin:/opt/amazon/efa/bin:$PATH \
    LD_LIBRARY_PATH=/opt/amazon/openmpi5/lib:/opt/amazon/efa/lib:$LD_LIBRARY_PATH
# ----- AWS-OFI-NCCL plugin, built against system NCCL (/usr) -----
# Clone directly from github.com (not a third-party mirror host), check out
# the pinned ref, then configure against the EFA libfabric, CUDA, the NCCL
# under /usr and the Open MPI 5 under /opt/amazon/openmpi5.
# Parallelism is capped at the CPU count: a bare `make -j` places no limit on
# concurrent jobs and can exhaust memory on the build host.
RUN cd /tmp \
    && git clone https://github.com/aws/aws-ofi-nccl.git \
    && cd aws-ofi-nccl \
    && git checkout ${AWS_OFI_NCCL_VERSION} \
    && ./autogen.sh \
    && ./configure --prefix=/opt/aws-ofi-nccl/install \
        --with-libfabric=/opt/amazon/efa \
        --with-cuda=/usr/local/cuda \
        --with-nccl=/usr \
        --with-mpi=/opt/amazon/openmpi5 \
    && make -j"$(nproc)" \
    && make install

# Put the freshly built plugin on the loader path; no other instruction in
# this file exposes /opt/aws-ofi-nccl/install/lib.
# NOTE(review): NCCL is expected to locate its network plugin through the
# dynamic loader - confirm the plugin is picked up at runtime.
ENV LD_LIBRARY_PATH=/opt/aws-ofi-nccl/install/lib:$LD_LIBRARY_PATH
# Informational only: print the NCCL and CUDA entries found in the dynamic
# linker cache. `|| true` keeps the build going when nothing matches.
RUN ldconfig -p \
    | grep -E 'nccl|cuda' \
    || true