11/*
2- * Copyright (c) 2025, NVIDIA CORPORATION.
2+ * Copyright (c) 2025-2026, NVIDIA CORPORATION.
33 *
44 * Licensed under the Apache License, Version 2.0 (the "License");
55 * you may not use this file except in compliance with the License.
@@ -27,7 +27,7 @@ namespace cuco::detail {
2727/**
2828 * @brief Asynchronous memory copy utility using cudaMemcpyBatchAsync when possible
2929 *
30- * Uses cudaMemcpyBatchAsync for CUDA 12.8+ with proper edge case handling .
30+ * Uses cudaMemcpyBatchAsync for CUDA 13.0+ to avoid driver-side locking overhead.
3131 * Falls back to cudaMemcpyAsync for older CUDA versions or edge cases.
3232 *
3333 * @param dst Destination memory address
@@ -42,7 +42,7 @@ namespace cuco::detail {
4242{
4343 if (dst == nullptr || src == nullptr || count == 0 ) { return cudaSuccess; }
4444
45- #if CUDART_VERSION >= 12080
45+ #if CUDART_VERSION >= 13000
4646 if (stream.get () == nullptr ) { return cudaMemcpyAsync (dst, src, count, kind, stream.get ()); }
4747
4848 void * dsts[1 ] = {dst};
@@ -54,16 +54,11 @@ namespace cuco::detail {
5454 attrs[0 ].srcAccessOrder = cudaMemcpySrcAccessOrderStream;
5555 attrs[0 ].flags = cudaMemcpyFlagPreferOverlapWithCompute;
5656
57- #if CUDART_VERSION >= 13000
5857 return cudaMemcpyBatchAsync (dsts, srcs, sizes, 1 , attrs, attrs_idxs, 1 , stream.get ());
5958#else
60- std::size_t fail_idx;
61- return cudaMemcpyBatchAsync (dsts, srcs, sizes, 1 , attrs, attrs_idxs, 1 , &fail_idx, stream.get ());
62- #endif // CUDART_VERSION >= 13000
63- #else
64- // CUDA < 12.8 - use regular cudaMemcpyAsync
59+ // CUDA < 13.0 - use regular cudaMemcpyAsync
6560 return cudaMemcpyAsync (dst, src, count, kind, stream.get ());
66- #endif // CUDART_VERSION >= 12080
61+ #endif // CUDART_VERSION >= 13000
6762}
6863
6964} // namespace cuco::detail
0 commit comments