Skip to content

Commit c40de5a

Browse files
committed
Fix storage alignment
1 parent 8bdf5b5 commit c40de5a

File tree

3 files changed

+138
-17
lines changed

3 files changed

+138
-17
lines changed

include/cuco/bucket_storage.cuh

Lines changed: 17 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2022-2025, NVIDIA CORPORATION.
2+
* Copyright (c) 2022-2026, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -21,12 +21,12 @@
2121
#include <cuco/utility/allocator.hpp>
2222

2323
#include <cuda/std/array>
24+
#include <cuda/std/bit>
2425
#include <cuda/std/functional>
2526
#include <cuda/stream_ref>
2627

2728
#include <cstddef>
2829
#include <cstdint>
29-
#include <iterator>
3030
#include <memory>
3131

3232
namespace cuco {
@@ -40,14 +40,14 @@ namespace cuco {
4040
template <typename T, int32_t BucketSize, typename Extent = cuco::extent<std::size_t>>
4141
class bucket_storage_ref {
4242
public:
43-
static constexpr int32_t bucket_size = BucketSize; ///< Number of elements processed per bucket
44-
static constexpr std::size_t alignment =
45-
cuda::std::min(sizeof(T) * bucket_size, std::size_t{16}); ///< Required alignment
43+
static constexpr int32_t bucket_size = BucketSize; ///< Number of elements processed per bucket
44+
using bucket_type = cuda::std::array<T, BucketSize>; ///< Slot bucket type
45+
static constexpr std::size_t alignment = cuda::std::min(cuda::std::bit_ceil(sizeof(bucket_type)),
46+
std::size_t{16}); ///< Required alignment
4647

4748
using extent_type = Extent; ///< Storage extent type
4849
using size_type = typename extent_type::value_type; ///< Storage size type
4950
using value_type = T; ///< Slot type
50-
using bucket_type = cuda::std::array<T, BucketSize>; ///< Slot bucket type
5151

5252
/**
5353
* @brief Constructor of slot storage ref.
@@ -150,7 +150,7 @@ class bucket_storage {
150150
using value_type = T; ///< Slot type
151151
using bucket_type = cuda::std::array<T, BucketSize>; ///< Slot bucket type
152152

153-
/// Type of the allocator to (de)allocate buckets
153+
/// Type of the allocator to (de)allocate slots
154154
using allocator_type =
155155
typename std::allocator_traits<Allocator>::template rebind_alloc<value_type>;
156156
using ref_type = bucket_storage_ref<value_type, bucket_size, extent_type>; ///< Storage ref type
@@ -244,13 +244,19 @@ class bucket_storage {
244244
[[nodiscard]] __host__ __device__ constexpr extent_type extent() const noexcept;
245245

246246
private:
247-
using slot_deleter_type =
248-
detail::custom_deleter<size_type, allocator_type>; ///< Type of slot deleter
247+
struct aligned_deleter {
248+
value_type* raw_ptr_;
249+
std::size_t size_;
250+
allocator_type& allocator_;
251+
cuda::stream_ref stream_;
252+
253+
void operator()(value_type*) const { allocator_.deallocate(raw_ptr_, size_, stream_); }
254+
};
249255

250256
extent_type extent_; ///< Storage extent
251257
allocator_type allocator_; ///< Allocator used to (de)allocate slots
252-
/// Pointer to the slot storage
253-
std::unique_ptr<value_type, slot_deleter_type> slots_;
258+
/// Pointer to the aligned slot storage
259+
std::unique_ptr<value_type, aligned_deleter> slots_;
254260
};
255261
} // namespace cuco
256262

include/cuco/detail/storage/bucket_storage.inl

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2022-2025, NVIDIA CORPORATION.
2+
* Copyright (c) 2022-2026, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -23,9 +23,11 @@
2323
#include <cub/device/device_for.cuh>
2424
#include <cuda/std/array>
2525
#include <cuda/std/bit>
26+
#include <cuda/std/cstdint>
2627
#include <cuda/stream_ref>
2728

2829
#include <cassert>
30+
#include <memory>
2931

3032
namespace cuco {
3133

@@ -98,10 +100,16 @@ bucket_storage_ref<T, BucketSize, Extent>::extent() const noexcept
98100
template <typename T, int BucketSize, typename Extent, typename Allocator>
99101
constexpr bucket_storage<T, BucketSize, Extent, Allocator>::bucket_storage(
100102
Extent size, Allocator const& allocator, cuda::stream_ref stream)
101-
: extent_{size},
102-
allocator_{allocator},
103-
slots_{allocator_.allocate(capacity(), stream),
104-
slot_deleter_type{capacity(), allocator_, stream}}
103+
: extent_{size}, allocator_{allocator}, slots_{[this, &stream]() {
104+
constexpr std::size_t align = ref_type::alignment;
105+
constexpr std::size_t extra = (align - 1) / sizeof(value_type) + 1;
106+
std::size_t const alloc_size = static_cast<std::size_t>(capacity()) + extra;
107+
auto* const raw_ptr = allocator_.allocate(alloc_size, stream);
108+
auto* const aligned_ptr = reinterpret_cast<value_type*>(
109+
(reinterpret_cast<cuda::std::uintptr_t>(raw_ptr) + align - 1) & ~(align - 1));
110+
return std::unique_ptr<value_type, aligned_deleter>{
111+
aligned_ptr, aligned_deleter{raw_ptr, alloc_size, allocator_, stream}};
112+
}()}
105113
{
106114
}
107115

tests/utility/storage_test.cu

Lines changed: 108 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2022-2025, NVIDIA CORPORATION.
2+
* Copyright (c) 2022-2026, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -21,8 +21,12 @@
2121
#include <cuco/pair.cuh>
2222
#include <cuco/utility/allocator.hpp>
2323

24+
#include <cuda/std/bit>
25+
2426
#include <catch2/catch_template_test_macros.hpp>
2527

28+
#include <cstdint>
29+
2630
TEMPLATE_TEST_CASE_SIG("utility storage tests",
2731
"",
2832
((typename Key, typename Value), Key, Value),
@@ -96,4 +100,107 @@ TEMPLATE_TEST_CASE_SIG("utility storage tests",
96100
STATIC_REQUIRE(num_buckets == size / bucket_size);
97101
STATIC_REQUIRE(capacity == gold_capacity);
98102
}
103+
104+
SECTION("Storage alignment constant is correct for pairs.")
105+
{
106+
using storage_ref_type =
107+
cuco::bucket_storage_ref<cuco::pair<Key, Value>, bucket_size, cuco::extent<std::size_t>>;
108+
using bucket_type = typename storage_ref_type::bucket_type;
109+
110+
constexpr auto alignment = storage_ref_type::alignment;
111+
constexpr auto expected_align =
112+
cuda::std::min(cuda::std::bit_ceil(sizeof(bucket_type)), std::size_t{16});
113+
114+
STATIC_REQUIRE(alignment == expected_align);
115+
STATIC_REQUIRE(cuda::std::has_single_bit(alignment));
116+
}
117+
118+
SECTION("Storage alignment constant is correct for keys.")
119+
{
120+
using storage_ref_type = cuco::bucket_storage_ref<Key, bucket_size, cuco::extent<std::size_t>>;
121+
using bucket_type = typename storage_ref_type::bucket_type;
122+
123+
constexpr auto alignment = storage_ref_type::alignment;
124+
constexpr auto expected_align =
125+
cuda::std::min(cuda::std::bit_ceil(sizeof(bucket_type)), std::size_t{16});
126+
127+
STATIC_REQUIRE(alignment == expected_align);
128+
STATIC_REQUIRE(cuda::std::has_single_bit(alignment));
129+
}
130+
131+
SECTION("Storage data pointer is aligned to bucket boundary for pairs.")
132+
{
133+
auto s = cuco::bucket_storage<cuco::pair<Key, Value>,
134+
bucket_size,
135+
cuco::extent<std::size_t>,
136+
allocator_type>(
137+
cuco::extent{size}, allocator, cuda::stream_ref{cudaStream_t{nullptr}});
138+
139+
auto const ptr = reinterpret_cast<std::uintptr_t>(s.data());
140+
auto const alignment = decltype(s)::ref_type::alignment;
141+
142+
REQUIRE((ptr % alignment) == 0);
143+
}
144+
145+
SECTION("Storage data pointer is aligned to bucket boundary for keys.")
146+
{
147+
auto s = cuco::bucket_storage<Key, bucket_size, cuco::extent<std::size_t>, allocator_type>(
148+
cuco::extent{size}, allocator, cuda::stream_ref{cudaStream_t{nullptr}});
149+
150+
auto const ptr = reinterpret_cast<std::uintptr_t>(s.data());
151+
auto const alignment = decltype(s)::ref_type::alignment;
152+
153+
REQUIRE((ptr % alignment) == 0);
154+
}
155+
}
156+
157+
TEMPLATE_TEST_CASE_SIG("bucket storage alignment with different bucket sizes",
158+
"",
159+
((typename T, int BucketSize), T, BucketSize),
160+
(int32_t, 1),
161+
(int32_t, 2),
162+
(int32_t, 4),
163+
(int64_t, 1),
164+
(int64_t, 2),
165+
(cuco::pair<int32_t, int32_t>, 1),
166+
(cuco::pair<int32_t, int32_t>, 2),
167+
(cuco::pair<int64_t, int64_t>, 1))
168+
{
169+
constexpr std::size_t size{1'000};
170+
171+
using allocator_type = cuco::cuda_allocator<char>;
172+
using storage_type =
173+
cuco::bucket_storage<T, BucketSize, cuco::extent<std::size_t>, allocator_type>;
174+
using storage_ref_type = typename storage_type::ref_type;
175+
using bucket_type = typename storage_ref_type::bucket_type;
176+
177+
auto allocator = allocator_type{};
178+
179+
SECTION("Alignment constant is power of 2 and capped at 16.")
180+
{
181+
constexpr auto alignment = storage_ref_type::alignment;
182+
183+
STATIC_REQUIRE(cuda::std::has_single_bit(alignment));
184+
STATIC_REQUIRE(alignment <= 16);
185+
STATIC_REQUIRE(alignment >= sizeof(T));
186+
}
187+
188+
SECTION("Alignment matches expected value.")
189+
{
190+
constexpr auto alignment = storage_ref_type::alignment;
191+
constexpr auto expected =
192+
cuda::std::min(cuda::std::bit_ceil(sizeof(bucket_type)), std::size_t{16});
193+
194+
STATIC_REQUIRE(alignment == expected);
195+
}
196+
197+
SECTION("Data pointer is aligned to bucket boundary.")
198+
{
199+
auto s = storage_type(cuco::extent{size}, allocator, cuda::stream_ref{cudaStream_t{nullptr}});
200+
201+
auto const ptr = reinterpret_cast<std::uintptr_t>(s.data());
202+
auto const alignment = storage_ref_type::alignment;
203+
204+
REQUIRE((ptr % alignment) == 0);
205+
}
99206
}

0 commit comments

Comments
 (0)