Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .github/workflows/cuda.yml
Original file line number Diff line number Diff line change
Expand Up @@ -185,15 +185,15 @@ jobs:

# Build 3D libamrex cuda build with configure
configure-3d-cuda:
name: CUDA@12.6 [configure 3D]
name: CUDA@13.0 [configure 3D]
runs-on: ubuntu-24.04
needs: check_changes
if: needs.check_changes.outputs.has_non_docs_changes == 'true'
steps:
- uses: actions/checkout@v5
- name: Dependencies
run: |
.github/workflows/dependencies/dependencies_nvcc.sh 12.6
.github/workflows/dependencies/dependencies_nvcc_2404.sh 13.0
.github/workflows/dependencies/dependencies_ccache.sh
- name: Set Up Cache
uses: actions/cache@v4
Expand All @@ -215,7 +215,7 @@ jobs:
# /home/runner/work/amrex/amrex/Src/Base/AMReX_GpuLaunchGlobal.H:16:41: error: unused parameter ‘f0’ [-Werror=unused-parameter]
# 16 | AMREX_GPU_GLOBAL void launch_global (L f0) { f0(); }
#
make -j4 WARN_ALL=TRUE WARN_ERROR=TRUE XTRA_CXXFLAGS="-fno-operator-names -Wno-unused-parameter" CCACHE=ccache CUDA_ARCH="7.0 7.2"
make -j4 WARN_ALL=TRUE WARN_ERROR=TRUE XTRA_CXXFLAGS="-fno-operator-names -Wno-unused-parameter" CCACHE=ccache CUDA_ARCH="8.0 9.0"
make install

ccache -s
Expand Down
31 changes: 31 additions & 0 deletions .github/workflows/dependencies/dependencies_cmake.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/usr/bin/env bash
#
# Install the latest CMake from Kitware's APT repository
# (the distro-shipped cmake can be too old for the build).

set -eu -o pipefail

# `man apt.conf`:
#   Number of retries to perform. If this is non-zero APT will retry
#   failed files the given number of times.
echo 'Acquire::Retries "3";' | sudo tee /etc/apt/apt.conf.d/80-retries

# Bootstrap Kitware's signing key manually unless the
# kitware-archive-keyring package is already installed (its copyright
# file marks its presence).
test -f /usr/share/doc/kitware-archive-keyring/copyright ||
wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | sudo tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null

# NOTE(review): this installs the apt.llvm.org key but no matching source
# entry is added here — presumably another dependency script adds the LLVM
# repository; confirm this key is actually needed in this script.
if [[ ! -f /etc/apt/trusted.gpg.d/apt.llvm.org.asc ]]; then
    wget -qO- https://apt.llvm.org/llvm-snapshot.gpg.key | sudo tee /etc/apt/trusted.gpg.d/apt.llvm.org.asc
fi

source /etc/os-release  # sets UBUNTU_CODENAME

echo "deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ ${UBUNTU_CODENAME} main" | sudo tee /etc/apt/sources.list.d/kitware.list >/dev/null

sudo apt-get update

# Once the repository is reachable, drop the manually bootstrapped key so
# the one shipped by the kitware-archive-keyring package takes over.
test -f /usr/share/doc/kitware-archive-keyring/copyright ||
sudo rm /usr/share/keyrings/kitware-archive-keyring.gpg

# -y: without it apt-get prompts for confirmation and aborts on a
# non-interactive CI runner.
sudo apt-get install -y kitware-archive-keyring

sudo apt-get install -y --no-install-recommends cmake

# Remove any pre-installed cmake that would shadow the APT one on PATH.
sudo rm -f /usr/local/bin/cmake
cmake --version
2 changes: 1 addition & 1 deletion .github/workflows/dependencies/dependencies_nvcc.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ sudo apt-get install -y \
wget

VERSION_DOTTED=${1-12.0} && VERSION_DASHED=$(sed 's/\./-/' <<< $VERSION_DOTTED)
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb
sudo dpkg -i cuda-keyring_1.0-1_all.deb
sudo apt-get update
sudo apt-get install -y \
Expand Down
44 changes: 44 additions & 0 deletions .github/workflows/dependencies/dependencies_nvcc_2404.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#!/usr/bin/env bash
#
# Copyright 2020-2022 Axel Huebl
#
# License: BSD-3-Clause-LBNL

# Install a minimal CUDA toolchain from NVIDIA's Ubuntu 24.04 repository.
# Usage: dependencies_nvcc_2404.sh [CUDA_VERSION]   (default: 12.0)

set -eu -o pipefail

# `man apt.conf`:
#   Number of retries to perform. If this is non-zero APT will retry
#   failed files the given number of times.
echo 'Acquire::Retries "3";' | sudo tee /etc/apt/apt.conf.d/80-retries

sudo apt-get -qqq update
# curl added: it is used below to fetch the keyring package but was not
# guaranteed to be installed.
sudo apt-get install -y \
    build-essential \
    ca-certificates \
    cmake \
    curl \
    g++ \
    gfortran \
    gnupg \
    libopenmpi-dev \
    openmpi-bin \
    pkg-config \
    wget

# e.g. "13.0" -> "13-0" (the dashed form appears in Ubuntu package names)
VERSION_DOTTED=${1-12.0} && VERSION_DASHED=$(sed 's/\./-/' <<< "$VERSION_DOTTED")
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt-get update
sudo apt-get install -y \
    cuda-command-line-tools-$VERSION_DASHED \
    cuda-compiler-$VERSION_DASHED \
    cuda-cupti-dev-$VERSION_DASHED \
    cuda-minimal-build-$VERSION_DASHED \
    cuda-nvml-dev-$VERSION_DASHED \
    cuda-nvtx-$VERSION_DASHED \
    libcufft-dev-$VERSION_DASHED \
    libcurand-dev-$VERSION_DASHED \
    libcusparse-dev-$VERSION_DASHED

# nvJitLink only exists for newer CUDA releases; best-effort install.
sudo apt-get install -y --no-install-recommends libnvjitlink-dev-$VERSION_DASHED || true

# -sfn: replace an existing /usr/local/cuda symlink instead of failing
# (plain -s errors out, and -sf without -n would create the link inside
# the directory the old symlink points to).
sudo ln -sfn cuda-$VERSION_DOTTED /usr/local/cuda
16 changes: 16 additions & 0 deletions Src/Base/AMReX_BaseFab.H
Original file line number Diff line number Diff line change
Expand Up @@ -1205,9 +1205,17 @@ BaseFab<T>::prefetchToHost () const noexcept
#elif defined(AMREX_USE_CUDA) && !defined(_WIN32)
if (Gpu::Device::devicePropMajor() >= 6) {
std::size_t s = sizeof(T)*this->nvar*this->domain.numPts();
#if defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ >= 13)
cudaMemLocation location = {};
location.type = cudaMemLocationTypeDevice;
location.id = cudaCpuDeviceId;
AMREX_CUDA_SAFE_CALL(cudaMemPrefetchAsync(this->dptr, s, location, 0,
Gpu::gpuStream()));
#else
AMREX_CUDA_SAFE_CALL(cudaMemPrefetchAsync(this->dptr, s,
cudaCpuDeviceId,
Gpu::gpuStream()));
#endif
}
#elif defined(AMREX_USE_HIP)
// xxxxx HIP FIX HERE after managed memory is supported
Expand All @@ -1229,9 +1237,17 @@ BaseFab<T>::prefetchToDevice () const noexcept
#elif defined(AMREX_USE_CUDA) && !defined(_WIN32)
if (Gpu::Device::devicePropMajor() >= 6) {
std::size_t s = sizeof(T)*this->nvar*this->domain.numPts();
#if defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ >= 13)
cudaMemLocation location = {};
location.type = cudaMemLocationTypeDevice;
location.id = Gpu::Device::deviceId();
AMREX_CUDA_SAFE_CALL(cudaMemPrefetchAsync(this->dptr, s, location, 0,
Gpu::gpuStream()));
#else
AMREX_CUDA_SAFE_CALL(cudaMemPrefetchAsync(this->dptr, s,
Gpu::Device::deviceId(),
Gpu::gpuStream()));
#endif
}
#elif defined(AMREX_USE_HIP)
// xxxxx HIP FIX HERE after managed memory is supported
Expand Down
20 changes: 20 additions & 0 deletions Src/Base/AMReX_GpuContainers.H
Original file line number Diff line number Diff line change
Expand Up @@ -344,10 +344,20 @@ namespace amrex::Gpu {
// Currently only implemented for CUDA.
#if defined(AMREX_USE_CUDA) && !defined(_WIN32)
if (Gpu::Device::devicePropMajor() >= 6) {
#if defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ >= 13)
cudaMemLocation location = {};
location.type = cudaMemLocationTypeDevice;
location.id = cudaCpuDeviceId;
AMREX_CUDA_SAFE_CALL(cudaMemPrefetchAsync(&(*begin),
size*sizeof(value_type),
location, 0,
Gpu::gpuStream()));
#else
AMREX_CUDA_SAFE_CALL(cudaMemPrefetchAsync(&(*begin),
size*sizeof(value_type),
cudaCpuDeviceId,
Gpu::gpuStream()));
#endif
}
#endif
#endif
Expand Down Expand Up @@ -375,10 +385,20 @@ namespace amrex::Gpu {
// Currently only implemented for CUDA.
#if defined(AMREX_USE_CUDA) && !defined(_WIN32)
if (Gpu::Device::devicePropMajor() >= 6) {
#if defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ >= 13)
cudaMemLocation location = {};
location.type = cudaMemLocationTypeDevice;
location.id = Gpu::Device::deviceId();
AMREX_CUDA_SAFE_CALL(cudaMemPrefetchAsync(&(*begin),
size*sizeof(value_type),
location, 0,
Gpu::gpuStream()));
#else
AMREX_CUDA_SAFE_CALL(cudaMemPrefetchAsync(&(*begin),
size*sizeof(value_type),
Gpu::Device::deviceId(),
Gpu::gpuStream()));
#endif
}
#endif
#endif
Expand Down
22 changes: 20 additions & 2 deletions Src/Base/AMReX_GpuDevice.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -828,11 +828,20 @@ Device::mem_advise_set_preferred (void* p, std::size_t sz, int device)
#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
if (device_prop.managedMemory == 1 && device_prop.concurrentManagedAccess == 1)
{
#if defined(AMREX_USE_CUDA)
#if defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ >= 13)
cudaMemLocation location = {};
location.type = cudaMemLocationTypeDevice;
location.id = device;
#else
auto location = device;
#endif
#endif
AMREX_HIP_OR_CUDA
(AMREX_HIP_SAFE_CALL(
hipMemAdvise(p, sz, hipMemAdviseSetPreferredLocation, device)),
AMREX_CUDA_SAFE_CALL(
cudaMemAdvise(p, sz, cudaMemAdviseSetPreferredLocation, device)));
cudaMemAdvise(p, sz, cudaMemAdviseSetPreferredLocation, location)));
}
#elif defined(AMREX_USE_SYCL)
// xxxxx SYCL todo: mem_advise
Expand All @@ -851,11 +860,20 @@ Device::mem_advise_set_readonly (void* p, std::size_t sz)
#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
if (device_prop.managedMemory == 1 && device_prop.concurrentManagedAccess == 1)
{
#if defined(AMREX_USE_CUDA)
#if defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ >= 13)
cudaMemLocation location = {};
location.type = cudaMemLocationTypeDevice;
location.id = cudaCpuDeviceId;
#else
auto location = cudaCpuDeviceId;
#endif
#endif
AMREX_HIP_OR_CUDA
(AMREX_HIP_SAFE_CALL(
hipMemAdvise(p, sz, hipMemAdviseSetReadMostly, hipCpuDeviceId)),
AMREX_CUDA_SAFE_CALL(
cudaMemAdvise(p, sz, cudaMemAdviseSetReadMostly, cudaCpuDeviceId)));
cudaMemAdvise(p, sz, cudaMemAdviseSetReadMostly, location)));
}
#elif defined(AMREX_USE_SYCL)
// xxxxx SYCL todo: mem_advise
Expand Down
10 changes: 5 additions & 5 deletions Src/Base/AMReX_GpuReduce.H
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,13 @@

#if !defined(AMREX_USE_CUB) && defined(AMREX_USE_CUDA) && defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ >= 11)
#define AMREX_USE_CUB 1
#if defined(CCCL_MAJOR_VERSION) && (CCCL_MAJOR_VERSION >= 3)
#define AMREX_CUDA_CCCL_VER_3 1
#endif
#endif

#if defined(AMREX_USE_CUB)
#include <cub/cub.cuh>
#if defined(CCCL_MAJOR_VERSION) && (CCCL_MAJOR_VERSION >= 3)
#define AMREX_CUDA_CCCL_VER_GE_3 1
#endif
#elif defined(AMREX_USE_HIP)
#include <rocprim/rocprim.hpp>
#endif
Expand Down Expand Up @@ -423,7 +423,7 @@ T blockReduceMin (T source) noexcept
// we do it always to be safe.
__syncthreads();

#ifdef AMREX_CUDA_CCCL_VER_3
#ifdef AMREX_CUDA_CCCL_VER_GE_3
return BlockReduce(temp_storage).Reduce(source, cuda::minimum<T>{});
#else
return BlockReduce(temp_storage).Reduce(source, cub::Min());
Expand Down Expand Up @@ -477,7 +477,7 @@ T blockReduceMax (T source) noexcept
// and since we don't know how many times the user is calling it,
// we do it always to be safe.
__syncthreads();
#ifdef AMREX_CUDA_CCCL_VER_3
#ifdef AMREX_CUDA_CCCL_VER_GE_3
return BlockReduce(temp_storage).Reduce(source, cuda::maximum<T>());
#else
return BlockReduce(temp_storage).Reduce(source, cub::Max());
Expand Down
10 changes: 9 additions & 1 deletion Src/Base/AMReX_Scan.H
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@

#if defined(AMREX_USE_CUDA) && defined(__CUDACC__) && (__CUDACC_VER_MAJOR__ >= 11)
# include <cub/cub.cuh>
# ifdef AMREX_CUDA_CCCL_VER_GE_3
# include <thrust/iterator/transform_iterator.h>
# endif
#elif defined(AMREX_USE_HIP)
# include <rocprim/rocprim.hpp>
#elif defined(AMREX_USE_SYCL) && defined(AMREX_USE_ONEDPL)
Expand Down Expand Up @@ -828,7 +831,7 @@ T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = ret
using BlockScan = cub::BlockScan<T, nthreads, cub::BLOCK_SCAN_WARP_SCANS>;
using BlockExchange = cub::BlockExchange<T, nthreads, nelms_per_thread>;

#ifdef AMREX_CUDA_CCCL_VER_3
#ifdef AMREX_CUDA_CCCL_VER_GE_3
using Sum = cuda::std::plus<T>;
#else
using Sum = cub::Sum;
Expand All @@ -855,8 +858,13 @@ T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = ret
N iend = amrex::min(static_cast<N>(ibegin+nelms_per_block), n);

auto input_lambda = [&] (N i) -> T { return fin(i+ibegin); };
#ifdef AMREX_CUDA_CCCL_VER_GE_3
thrust::transform_iterator<decltype(input_lambda),thrust::counting_iterator<N> >
input_begin(thrust::counting_iterator<N>(0), input_lambda);
#else
cub::TransformInputIterator<T,decltype(input_lambda),cub::CountingInputIterator<N> >
input_begin(cub::CountingInputIterator<N>(0), input_lambda);
#endif

T data[nelms_per_thread];
if (static_cast<int>(iend-ibegin) == nelms_per_block) {
Expand Down
Loading