@@ -66,6 +66,10 @@ template <>
6666struct roaring_bitmap_metadata <cuda::std::uint32_t > {
6767 // / Maximum number of elements in an array container before converting to bitmap
6868 static constexpr cuda::std::uint32_t max_array_container_card = 4096 ;
69+ // / Threshold for omitting container offsets in serialized format
70+ static constexpr cuda::std::int32_t no_offset_threshold = 4 ;
71+ // / Fixed size of a bitset container in bytes
72+ static constexpr cuda::std::uint32_t bitset_container_bytes = 8192 ;
6973
7074 // / Total size of the bitmap in bytes
7175 cuda::std::size_t size_bytes = 0 ;
@@ -75,14 +79,18 @@ struct roaring_bitmap_metadata<cuda::std::uint32_t> {
7579 cuda::std::uint32_t run_container_bitmap = 0 ;
7680 // / Offset to key cardinality data
7781 cuda::std::uint32_t key_cards = 0 ;
78- // / Offset to container offset data
82+ // / Offset to container offset data (only valid when offsets_in_serialized_data is true)
7983 cuda::std::uint32_t container_offsets = 0 ;
84+ // / Computed container offsets (used when offsets are not in serialized data)
85+ cuda::std::uint32_t computed_offsets[4 ] = {0 , 0 , 0 , 0 };
8086 // / Number of containers in the bitmap
8187 cuda::std::int32_t num_containers = 0 ;
8288 // / Whether the bitmap contains run containers
8389 bool has_run = false ;
8490 // / Whether the metadata is valid
8591 bool valid = false ;
92+ // / Whether container offsets are stored in the serialized data
93+ bool offsets_in_serialized_data = true ;
8694
8795 /* *
8896 * @brief Constructs metadata from a serialized bitmap
@@ -94,11 +102,9 @@ struct roaring_bitmap_metadata<cuda::std::uint32_t> {
94102 constexpr cuda::std::uint32_t serial_cookie_no_runcontainer = 12346 ;
95103 constexpr cuda::std::uint32_t serial_cookie = 12347 ;
96104 // constexpr cuda::std::uint32_t frozen_cookie = 13766; // not implemented
97- constexpr cuda::std::int32_t no_offset_threshold = 4 ;
98- constexpr cuda::std::int32_t max_containers = 1 << 16 ;
99- constexpr cuda::std::uint32_t cookie_mask = 0xFFFF ;
100- constexpr cuda::std::uint32_t cookie_shift = 16 ;
101- constexpr cuda::std::uint32_t bitset_container_bytes = 8192 ;
105+ constexpr cuda::std::int32_t max_containers = 1 << 16 ;
106+ constexpr cuda::std::uint32_t cookie_mask = 0xFFFF ;
107+ constexpr cuda::std::uint32_t cookie_shift = 16 ;
102108
103109 cuda::std::byte const * buf = bitmap;
104110
@@ -147,14 +153,45 @@ struct roaring_bitmap_metadata<cuda::std::uint32_t> {
147153 buf += num_containers * 2 * sizeof (cuda::std::uint16_t );
148154
149155 if ((!has_run) || (num_containers >= no_offset_threshold)) {
150- container_offsets = cuda::std::distance (bitmap, buf);
156+ // Container offsets are stored in the serialized data
157+ offsets_in_serialized_data = true ;
158+ container_offsets = cuda::std::distance (bitmap, buf);
151159 buf += num_containers * sizeof (cuda::std::uint32_t );
152160 } else {
153- valid = false ;
154- NV_IF_TARGET (
155- NV_IS_HOST,
156- CUCO_FAIL (" Invalid bitmap format: not implemented" );) // TODO device error handling
157- return ;
161+ // Container offsets are NOT stored in the serialized data
162+ // We need to compute them by walking through the containers
163+ offsets_in_serialized_data = false ;
164+ container_offsets = 0 ;
165+
166+ cuda::std::byte const * container_ptr = buf;
167+ for (cuda::std::int32_t i = 0 ; i < num_containers; ++i) {
168+ // Store the computed offset for this container
169+ computed_offsets[i] =
170+ static_cast <cuda::std::uint32_t >(cuda::std::distance (bitmap, container_ptr));
171+
172+ // Get cardinality for this container
173+ cuda::std::byte const * card_ptr =
174+ bitmap + key_cards + (i * 2 + 1 ) * sizeof (cuda::std::uint16_t );
175+ cuda::std::uint32_t card_i = 1u + misaligned_load<cuda::std::uint16_t >(card_ptr);
176+
177+ // Check if this is a run container
178+ bool is_run_container = check_bit (bitmap + run_container_bitmap, i);
179+
180+ // Compute container size and advance pointer
181+ if (is_run_container) {
182+ // Run container: first uint16_t is num_runs, followed by num_runs (start, length) pairs
183+ cuda::std::uint16_t num_runs = misaligned_load<cuda::std::uint16_t >(container_ptr);
184+ container_ptr += sizeof (cuda::std::uint16_t ) + num_runs * 2 * sizeof (cuda::std::uint16_t );
185+ } else if (card_i <= max_array_container_card) {
186+ // Array container
187+ container_ptr += card_i * sizeof (cuda::std::uint16_t );
188+ } else {
189+ // Bitset container (fixed size)
190+ container_ptr += bitset_container_bytes;
191+ }
192+ }
193+ // buf now points past all containers
194+ buf = container_ptr;
158195 }
159196
160197 cuda::std::uint32_t card = 0 ;
@@ -170,9 +207,15 @@ struct roaring_bitmap_metadata<cuda::std::uint32_t> {
170207 }
171208
172209 // find end of roaring bitmap (re-use card from last container)
173- cuda::std::byte const * end =
174- bitmap + misaligned_load<cuda::std::uint32_t >(
175- bitmap + container_offsets + (num_containers - 1 ) * sizeof (cuda::std::uint32_t ));
210+ cuda::std::byte const * end;
211+ if (offsets_in_serialized_data) {
212+ end =
213+ bitmap + misaligned_load<cuda::std::uint32_t >(
214+ bitmap + container_offsets + (num_containers - 1 ) * sizeof (cuda::std::uint32_t ));
215+ } else {
216+ end = bitmap + computed_offsets[num_containers - 1 ];
217+ }
218+
176219 if (has_run and check_bit (bitmap + run_container_bitmap, num_containers - 1 )) {
177220 cuda::std::uint16_t const num_runs = misaligned_load<cuda::std::uint16_t >(end);
178221 end += sizeof (cuda::std::uint16_t ) + num_runs * 2 * sizeof (cuda::std::uint16_t );
0 commit comments