Skip to content

Commit 9a6b4aa

Browse files
committed
Update memcpy util to use the batch API for CUDA 13.0 and later
1 parent b52c511 commit 9a6b4aa

File tree

4 files changed

+8
-13
lines changed

4 files changed

+8
-13
lines changed

include/cuco/detail/hyperloglog/hyperloglog_impl.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2024-2025, NVIDIA CORPORATION.
2+
* Copyright (c) 2024-2026, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.

include/cuco/detail/open_addressing/open_addressing_impl.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2023-2025, NVIDIA CORPORATION.
2+
* Copyright (c) 2023-2026, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.

include/cuco/detail/static_map.inl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2020-2025, NVIDIA CORPORATION.
2+
* Copyright (c) 2020-2026, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.

include/cuco/detail/utility/memcpy_async.hpp

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2025, NVIDIA CORPORATION.
2+
* Copyright (c) 2025-2026, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -27,7 +27,7 @@ namespace cuco::detail {
2727
/**
2828
* @brief Asynchronous memory copy utility using cudaMemcpyBatchAsync when possible
2929
*
30-
* Uses cudaMemcpyBatchAsync for CUDA 12.8+ with proper edge case handling.
30+
* Uses cudaMemcpyBatchAsync for CUDA 13.0+ to avoid driver-side locking overhead.
3131
* Falls back to cudaMemcpyAsync for older CUDA versions or edge cases.
3232
*
3333
* @param dst Destination memory address
@@ -42,7 +42,7 @@ namespace cuco::detail {
4242
{
4343
if (dst == nullptr || src == nullptr || count == 0) { return cudaSuccess; }
4444

45-
#if CUDART_VERSION >= 12080
45+
#if CUDART_VERSION >= 13000
4646
if (stream.get() == nullptr) { return cudaMemcpyAsync(dst, src, count, kind, stream.get()); }
4747

4848
void* dsts[1] = {dst};
@@ -54,16 +54,11 @@ namespace cuco::detail {
5454
attrs[0].srcAccessOrder = cudaMemcpySrcAccessOrderStream;
5555
attrs[0].flags = cudaMemcpyFlagPreferOverlapWithCompute;
5656

57-
#if CUDART_VERSION >= 13000
5857
return cudaMemcpyBatchAsync(dsts, srcs, sizes, 1, attrs, attrs_idxs, 1, stream.get());
5958
#else
60-
std::size_t fail_idx;
61-
return cudaMemcpyBatchAsync(dsts, srcs, sizes, 1, attrs, attrs_idxs, 1, &fail_idx, stream.get());
62-
#endif // CUDART_VERSION >= 13000
63-
#else
64-
// CUDA < 12.8 - use regular cudaMemcpyAsync
59+
// CUDA < 13.0 - use regular cudaMemcpyAsync
6560
return cudaMemcpyAsync(dst, src, count, kind, stream.get());
66-
#endif // CUDART_VERSION >= 12080
61+
#endif // CUDART_VERSION >= 13000
6762
}
6863

6964
} // namespace cuco::detail

0 commit comments

Comments
 (0)