Skip to content

Commit ff7cff7

Browse files
committed
Add support for roaring bitmaps with run containers and fewer than four containers
1 parent d3701ae commit ff7cff7

File tree

3 files changed

+63
-20
lines changed

3 files changed

+63
-20
lines changed

include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -47,12 +47,10 @@ class roaring_bitmap_impl<cuda::std::uint32_t> {
4747

4848
__host__ __device__ roaring_bitmap_impl(storage_ref_type const& storage_ref)
4949
: storage_ref_{storage_ref},
50-
offsets_aligned_{(reinterpret_cast<cuda::std::uintptr_t>(
51-
storage_ref_.data() + storage_ref_.metadata().container_offsets)) %
50+
offsets_aligned_{(reinterpret_cast<cuda::std::uintptr_t>(storage_ref_.container_offsets())) %
5251
sizeof(cuda::std::uint32_t) ==
5352
0},
54-
aligned_16_{(reinterpret_cast<cuda::std::uintptr_t>(storage_ref_.data() +
55-
storage_ref_.metadata().key_cards)) %
53+
aligned_16_{(reinterpret_cast<cuda::std::uintptr_t>(storage_ref_.key_cards())) %
5654
sizeof(cuda::std::uint16_t) ==
5755
0} // if base address of key_cards is aligned, then all containers are aligned
5856
{

include/cuco/detail/roaring_bitmap/roaring_bitmap_storage.cuh

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,9 @@ class roaring_bitmap_storage_ref<cuda::std::uint32_t> {
6161
data_{bitmap},
6262
run_container_bitmap_{bitmap + metadata.run_container_bitmap},
6363
key_cards_{bitmap + metadata.key_cards},
64-
container_offsets_{bitmap + metadata.container_offsets}
64+
container_offsets_{metadata.offsets_in_serialized_data
65+
? (bitmap + metadata.container_offsets)
66+
: reinterpret_cast<cuda::std::byte const*>(metadata.computed_offsets)}
6567
{
6668
assert(metadata.valid);
6769
}

include/cuco/detail/roaring_bitmap/util.cuh

Lines changed: 58 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,10 @@ template <>
6666
struct roaring_bitmap_metadata<cuda::std::uint32_t> {
6767
/// Maximum number of elements in an array container before converting to bitmap
6868
static constexpr cuda::std::uint32_t max_array_container_card = 4096;
69+
/// Threshold for omitting container offsets in serialized format
70+
static constexpr cuda::std::int32_t no_offset_threshold = 4;
71+
/// Fixed size of a bitset container in bytes
72+
static constexpr cuda::std::uint32_t bitset_container_bytes = 8192;
6973

7074
/// Total size of the bitmap in bytes
7175
cuda::std::size_t size_bytes = 0;
@@ -75,14 +79,18 @@ struct roaring_bitmap_metadata<cuda::std::uint32_t> {
7579
cuda::std::uint32_t run_container_bitmap = 0;
7680
/// Offset to key cardinality data
7781
cuda::std::uint32_t key_cards = 0;
78-
/// Offset to container offset data
82+
/// Offset to container offset data (only valid when offsets_in_serialized_data is true)
7983
cuda::std::uint32_t container_offsets = 0;
84+
/// Computed container offsets (used when offsets are not in serialized data)
85+
cuda::std::uint32_t computed_offsets[4] = {0, 0, 0, 0};
8086
/// Number of containers in the bitmap
8187
cuda::std::int32_t num_containers = 0;
8288
/// Whether the bitmap contains run containers
8389
bool has_run = false;
8490
/// Whether the metadata is valid
8591
bool valid = false;
92+
/// Whether container offsets are stored in the serialized data
93+
bool offsets_in_serialized_data = true;
8694

8795
/**
8896
* @brief Constructs metadata from a serialized bitmap
@@ -94,11 +102,9 @@ struct roaring_bitmap_metadata<cuda::std::uint32_t> {
94102
constexpr cuda::std::uint32_t serial_cookie_no_runcontainer = 12346;
95103
constexpr cuda::std::uint32_t serial_cookie = 12347;
96104
// constexpr cuda::std::uint32_t frozen_cookie = 13766; // not implemented
97-
constexpr cuda::std::int32_t no_offset_threshold = 4;
98-
constexpr cuda::std::int32_t max_containers = 1 << 16;
99-
constexpr cuda::std::uint32_t cookie_mask = 0xFFFF;
100-
constexpr cuda::std::uint32_t cookie_shift = 16;
101-
constexpr cuda::std::uint32_t bitset_container_bytes = 8192;
105+
constexpr cuda::std::int32_t max_containers = 1 << 16;
106+
constexpr cuda::std::uint32_t cookie_mask = 0xFFFF;
107+
constexpr cuda::std::uint32_t cookie_shift = 16;
102108

103109
cuda::std::byte const* buf = bitmap;
104110

@@ -147,14 +153,45 @@ struct roaring_bitmap_metadata<cuda::std::uint32_t> {
147153
buf += num_containers * 2 * sizeof(cuda::std::uint16_t);
148154

149155
if ((!has_run) || (num_containers >= no_offset_threshold)) {
150-
container_offsets = cuda::std::distance(bitmap, buf);
156+
// Container offsets are stored in the serialized data
157+
offsets_in_serialized_data = true;
158+
container_offsets = cuda::std::distance(bitmap, buf);
151159
buf += num_containers * sizeof(cuda::std::uint32_t);
152160
} else {
153-
valid = false;
154-
NV_IF_TARGET(
155-
NV_IS_HOST,
156-
CUCO_FAIL("Invalid bitmap format: not implemented");) // TODO device error handling
157-
return;
161+
// Container offsets are NOT stored in the serialized data
162+
// We need to compute them by walking through the containers
163+
offsets_in_serialized_data = false;
164+
container_offsets = 0;
165+
166+
cuda::std::byte const* container_ptr = buf;
167+
for (cuda::std::int32_t i = 0; i < num_containers; ++i) {
168+
// Store the computed offset for this container
169+
computed_offsets[i] =
170+
static_cast<cuda::std::uint32_t>(cuda::std::distance(bitmap, container_ptr));
171+
172+
// Get cardinality for this container
173+
cuda::std::byte const* card_ptr =
174+
bitmap + key_cards + (i * 2 + 1) * sizeof(cuda::std::uint16_t);
175+
cuda::std::uint32_t card_i = 1u + misaligned_load<cuda::std::uint16_t>(card_ptr);
176+
177+
// Check if this is a run container
178+
bool is_run_container = check_bit(bitmap + run_container_bitmap, i);
179+
180+
// Compute container size and advance pointer
181+
if (is_run_container) {
182+
// Run container: first uint16_t is num_runs, followed by num_runs (start, length) pairs
183+
cuda::std::uint16_t num_runs = misaligned_load<cuda::std::uint16_t>(container_ptr);
184+
container_ptr += sizeof(cuda::std::uint16_t) + num_runs * 2 * sizeof(cuda::std::uint16_t);
185+
} else if (card_i <= max_array_container_card) {
186+
// Array container
187+
container_ptr += card_i * sizeof(cuda::std::uint16_t);
188+
} else {
189+
// Bitset container (fixed size)
190+
container_ptr += bitset_container_bytes;
191+
}
192+
}
193+
// buf now points past all containers
194+
buf = container_ptr;
158195
}
159196

160197
cuda::std::uint32_t card = 0;
@@ -170,9 +207,15 @@ struct roaring_bitmap_metadata<cuda::std::uint32_t> {
170207
}
171208

172209
// find end of roaring bitmap (re-use card from last container)
173-
cuda::std::byte const* end =
174-
bitmap + misaligned_load<cuda::std::uint32_t>(
175-
bitmap + container_offsets + (num_containers - 1) * sizeof(cuda::std::uint32_t));
210+
cuda::std::byte const* end;
211+
if (offsets_in_serialized_data) {
212+
end =
213+
bitmap + misaligned_load<cuda::std::uint32_t>(
214+
bitmap + container_offsets + (num_containers - 1) * sizeof(cuda::std::uint32_t));
215+
} else {
216+
end = bitmap + computed_offsets[num_containers - 1];
217+
}
218+
176219
if (has_run and check_bit(bitmap + run_container_bitmap, num_containers - 1)) {
177220
cuda::std::uint16_t const num_runs = misaligned_load<cuda::std::uint16_t>(end);
178221
end += sizeof(cuda::std::uint16_t) + num_runs * 2 * sizeof(cuda::std::uint16_t);

0 commit comments

Comments
 (0)