Merged

34 commits
2c75a01
Refactor of string length preprocessing that skips compute_page_sizes…
nvdbaranec Jul 25, 2025
b32b9b5
Move string length calculating code to happen after output chunk comp…
nvdbaranec Jul 25, 2025
8495eb5
Fixed an issue where page.str_bytes was being set at the wrong time i…
nvdbaranec Aug 15, 2025
ae246af
Fixed remaining bugs. Almost everything was the result of a large num…
nvdbaranec Aug 28, 2025
282d064
Merge branch 'branch-25.10' into pq_string_preprocess
nvdbaranec Aug 29, 2025
18de816
Merge branch 'branch-25.10' into pq_string_preprocess
nvdbaranec Aug 29, 2025
92714e4
Fix merge conflict issues.
nvdbaranec Sep 2, 2025
ed12844
Merge branch 'branch-25.10' into pq_string_preprocess
nvdbaranec Sep 15, 2025
afc8006
PR cleanup.
nvdbaranec Sep 16, 2025
7d52a04
More PR cleanup
nvdbaranec Sep 16, 2025
a4f97b1
Merge branch 'branch-25.10' into pq_string_preprocess
nvdbaranec Sep 18, 2025
313fbff
Formatting and a fix for hybrid scan string length computing.
nvdbaranec Sep 18, 2025
144aabd
PR review changes.
nvdbaranec Sep 22, 2025
84f280d
Merge branch 'branch-25.10' into pq_string_preprocess
nvdbaranec Sep 23, 2025
1b9c4dc
Reduce output buffer size for lists and list<list<..<str>>> and elimi…
mhaseeb123 Sep 23, 2025
8a64808
Minor comment update
mhaseeb123 Sep 23, 2025
f77e210
Style fix
mhaseeb123 Sep 23, 2025
f2739f5
Merge branch 'branch-25.12' into fea/reduce-output-buffer-sizes-for-p…
mhaseeb123 Sep 23, 2025
9fb5597
Minor update
mhaseeb123 Sep 23, 2025
83962a4
Merge branch 'fea/reduce-output-buffer-sizes-for-pruned-pages' of htt…
mhaseeb123 Sep 23, 2025
7082e32
Gtest to materialize lists of structs
mhaseeb123 Sep 23, 2025
1053d06
Minor update
mhaseeb123 Sep 24, 2025
5839627
Minor gtest update
mhaseeb123 Sep 24, 2025
2aae6f4
Minor gtest update
mhaseeb123 Sep 24, 2025
da09e82
Minor updates
mhaseeb123 Sep 26, 2025
bfde9e7
Merge branch 'branch-25.12' into fea/reduce-output-buffer-sizes-for-p…
mhaseeb123 Sep 29, 2025
1da789c
Merge changes
mhaseeb123 Sep 29, 2025
f58456b
Merge branch 'branch-25.12' into fea/reduce-output-buffer-sizes-for-p…
mhaseeb123 Sep 29, 2025
d6c31f2
Merge branch 'branch-25.12' into fea/reduce-output-buffer-sizes-for-p…
mhaseeb123 Sep 30, 2025
737df82
Merge branch 'branch-25.12' into fea/reduce-output-buffer-sizes-for-p…
mhaseeb123 Oct 6, 2025
11b46af
Merge branch 'branch-25.12' into fea/reduce-output-buffer-sizes-for-p…
mhaseeb123 Oct 7, 2025
1467af3
Apply suggestion from @mhaseeb123
mhaseeb123 Oct 8, 2025
b774c6a
Apply suggestion from @vuule
mhaseeb123 Oct 8, 2025
a471b6e
Merge branch 'branch-25.12' into fea/reduce-output-buffer-sizes-for-p…
mhaseeb123 Oct 8, 2025
2 changes: 1 addition & 1 deletion cpp/src/io/parquet/experimental/hybrid_scan_impl.cpp
@@ -737,7 +737,7 @@ table_with_metadata hybrid_scan_reader_impl::read_chunk_internal(
   preprocess_chunk_strings(mode, read_info, page_mask);

   // Allocate memory buffers for the output columns.
-  allocate_columns(mode, read_info.skip_rows, read_info.num_rows);
+  allocate_columns(mode, read_info.skip_rows, read_info.num_rows, page_mask);

   // Parse data into the output buffers.
   decode_page_data(mode, read_info.skip_rows, read_info.num_rows, page_mask);
6 changes: 2 additions & 4 deletions cpp/src/io/parquet/page_string_decode.cu
@@ -608,8 +608,8 @@ CUDF_KERNEL void __launch_bounds__(preprocess_block_size)
   if (t == 0) {
     // don't clobber these if they're already computed from the index
     if (!pp->has_page_index) {
-      s->page.num_nulls  = 0;
-      s->page.num_valids = 0;
+      pp->num_nulls  = 0;
+      pp->num_valids = 0;
     }
     // reset str_bytes to 0 in case it's already been calculated (esp needed for chunked reads).
     pp->str_bytes = 0;

Review comment (Member, Author): s is not initialized at this point, so pp (PageInfo) is updated directly; s will store a reference to it later, in the setup_local_page_info function.
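The ordering subtlety behind that comment may be easier to see in isolation. Below is a minimal CUDA sketch (hypothetical, simplified types and names — not the actual cuDF structures or kernel) of why the reset must go through pp: the block-local state only aliases the page after setup_local_page_info runs.

struct PageInfo {
  int num_nulls;
  int num_valids;
};

struct page_state_s {
  PageInfo page;  // uninitialized until setup_local_page_info() runs
};

__device__ void setup_local_page_info(page_state_s* s, PageInfo const* pp)
{
  s->page = *pp;  // s->page mirrors the (already reset) global page from here on
}

__global__ void preprocess_kernel(PageInfo* pages)
{
  __shared__ page_state_s state;
  PageInfo* pp = &pages[blockIdx.x];
  if (threadIdx.x == 0) {
    pp->num_nulls  = 0;  // safe: pp points at valid global memory
    pp->num_valids = 0;  // writing state.page.* here would touch uninitialized shared memory
  }
  __syncthreads();
  if (threadIdx.x == 0) { setup_local_page_info(&state, pp); }
  __syncthreads();
}

int main()
{
  PageInfo* d_pages;
  cudaMalloc(&d_pages, sizeof(PageInfo));
  preprocess_kernel<<<1, 32>>>(d_pages);
  cudaDeviceSynchronize();
  cudaFree(d_pages);
  return 0;
}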
@@ -676,7 +676,6 @@ CUDF_KERNEL void __launch_bounds__(preprocess_block_size)
  * @param page_mask Page mask indicating if this column needs to be decoded
  * @param min_rows crop all rows below min_row
  * @param num_rows Maximum number of rows to read
- * other settings and records the result in the PageInfo::str_bytes_all field
  */
 CUDF_KERNEL void __launch_bounds__(delta_preproc_block_size)
   compute_delta_page_string_sizes_kernel(PageInfo* pages,

Review comment (Member, Author): stale comments.
@@ -770,7 +769,6 @@ CUDF_KERNEL void __launch_bounds__(delta_preproc_block_size)
  * @param page_mask Page mask indicating if this column needs to be decoded
  * @param min_rows crop all rows below min_row
  * @param num_rows Maximum number of rows to read
- * other settings
  */
 CUDF_KERNEL void __launch_bounds__(delta_length_block_size)
   compute_delta_length_page_string_sizes_kernel(PageInfo* pages,
12 changes: 5 additions & 7 deletions cpp/src/io/parquet/page_string_utils.cuh
@@ -170,9 +170,11 @@ __device__ void update_string_offsets_for_pruned_pages(

   // Initial string offset
   auto const initial_value = page.str_offset;
-  // We must use the batch size from the nesting info (the size of the page for this batch)
-  auto value_count = page.nesting[state->col.max_nesting_depth - 1].batch_size;
-  auto const tid   = cg::this_thread_block().thread_rank();
+  // The value count is either the leaf-level batch size in case of lists or the number of
+  // effective rows being read by this page
+  auto const value_count =
+    has_lists ? page.nesting[state->col.max_nesting_depth - 1].batch_size : state->num_rows;
+  auto const tid = cg::this_thread_block().thread_rank();

   // Offsets pointer contains string sizes in case of large strings and actual offsets
   // otherwise
@@ -190,10 +192,6 @@
     auto const input_col_idx = page.chunk_idx % chunks_per_rowgroup;
     compute_initial_large_strings_offset<has_lists>(state, initial_str_offsets[input_col_idx]);
   } else {
-    // if no repetition we haven't calculated start/end bounds and instead just skipped
-    // values until we reach first_row. account for that here.
-    if constexpr (not has_lists) { value_count -= state->first_row; }
     // Write the initial offset at all positions to indicate zero sized strings
     for (int idx = tid; idx < value_count; idx += block_size) {
       offptr[idx] = initial_value;

Review comment (Member, Author): For the not has_lists case, the value count is now state->num_rows, which already accounts for state->first_row (see setup_local_page_info, where these fields are initialized).
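As a side note on the "write the initial offset at all positions" logic: a minimal host-side sketch (hypothetical, heavily simplified from update_string_offsets_for_pruned_pages) showing why repeating the page's initial offset makes every pruned entry decode as a zero-length string — each string's length is offsets[i + 1] - offsets[i].

#include <cassert>
#include <vector>

int main()
{
  int const initial_value = 128;  // page.str_offset: where this page's chars would begin
  int const value_count   = 4;    // rows (or leaf values, for lists) covered by the pruned page
  std::vector<int> offsets(value_count + 1, 0);

  // Block-strided in the kernel; a plain loop here
  for (int idx = 0; idx < value_count; ++idx) { offsets[idx] = initial_value; }
  offsets[value_count] = initial_value;  // terminating offset, written by the next page's data

  // Equal adjacent offsets => every entry in the pruned range is an empty string
  for (int i = 0; i < value_count; ++i) { assert(offsets[i + 1] - offsets[i] == 0); }
  return 0;
}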
2 changes: 1 addition & 1 deletion cpp/src/io/parquet/reader_impl.cpp
@@ -709,7 +709,7 @@ table_with_metadata reader_impl::read_chunk_internal(read_mode mode)
   preprocess_chunk_strings(mode, read_info, page_mask);

   // Allocate memory buffers for the output columns.
-  allocate_columns(mode, read_info.skip_rows, read_info.num_rows);
+  allocate_columns(mode, read_info.skip_rows, read_info.num_rows, page_mask);

   // Parse data into the output buffers.
   decode_page_data(mode, read_info.skip_rows, read_info.num_rows, page_mask);
8 changes: 6 additions & 2 deletions cpp/src/io/parquet/reader_impl.hpp
@@ -303,8 +303,12 @@ class reader_impl {
    * @param read_mode Value indicating if the data sources are read all at once or chunk by chunk
    * @param skip_rows Crop all rows below skip_rows
    * @param num_rows Number of rows to read
+   * @param page_mask Boolean device span indicating if a page needs to be decoded or is pruned
    */
-  void allocate_columns(read_mode mode, size_t skip_rows, size_t num_rows);
+  void allocate_columns(read_mode mode,
+                        size_t skip_rows,
+                        size_t num_rows,
+                        cudf::device_span<bool const> page_mask);

   /**
    * @brief Calculate per-page offsets for string data

@@ -319,7 +323,7 @@
    * @param read_mode Value indicating if the data sources are read all at once or chunk by chunk
    * @param skip_rows Number of rows to skip from the start
    * @param num_rows Number of rows to decode
-   * @param page_mask Boolean vector indicating if a page needs to be decoded or is pruned
+   * @param page_mask Boolean device span indicating if a page needs to be decoded or is pruned
   */
  void decode_page_data(read_mode mode,
                        size_t skip_rows,
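For context on the new parameter, here is a hedged sketch of a hypothetical call site (not code from this PR; it assumes access to the reader internals): it builds a device-side page mask, one bool per subpass page with false marking a page pruned by filter pushdown, and passes it through the new allocate_columns signature.

#include <cudf/utilities/span.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>
#include <thrust/fill.h>

void allocate_with_mask(reader_impl& reader,  // hypothetical wiring
                        read_mode mode,
                        size_t skip_rows,
                        size_t num_rows,
                        size_t num_pages,
                        rmm::cuda_stream_view stream)
{
  rmm::device_uvector<bool> mask(num_pages, stream);
  // Start with every page selected; a filter step would flip pruned pages to false.
  thrust::fill(rmm::exec_policy(stream), mask.begin(), mask.end(), true);

  reader.allocate_columns(mode, skip_rows, num_rows,
                          cudf::device_span<bool const>{mask.data(), mask.size()});
}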
22 changes: 14 additions & 8 deletions cpp/src/io/parquet/reader_impl_preprocess.cu
@@ -687,7 +687,10 @@ void reader_impl::preprocess_subpass_pages(read_mode mode, size_t chunk_read_lim
   compute_output_chunks_for_subpass();
 }

-void reader_impl::allocate_columns(read_mode mode, size_t skip_rows, size_t num_rows)
+void reader_impl::allocate_columns(read_mode mode,
+                                   size_t skip_rows,
+                                   size_t num_rows,
+                                   cudf::device_span<bool const> page_mask)
 {
   CUDF_FUNC_RANGE();

@@ -779,17 +782,20 @@ void reader_impl::allocate_columns(read_mode mode, size_t skip_rows, size_t num_

   // To keep track of the starting key of an iteration
   size_t key_start = 0;

   // Loop until all keys are processed
   while (key_start < num_keys) {
     // Number of keys processed in this iteration
     auto const num_keys_this_iter = std::min<size_t>(num_keys_per_iter, num_keys - key_start);
-    thrust::transform(
-      rmm::exec_policy_nosync(_stream),
-      thrust::make_counting_iterator<size_t>(key_start),
-      thrust::make_counting_iterator<size_t>(key_start + num_keys_this_iter),
-      size_input.begin(),
-      get_page_nesting_size{
-        d_cols_info.data(), max_depth, subpass.pages.size(), subpass.pages.device_begin()});
+    thrust::transform(rmm::exec_policy_nosync(_stream),
+                      thrust::make_counting_iterator<size_t>(key_start),
+                      thrust::make_counting_iterator<size_t>(key_start + num_keys_this_iter),
+                      size_input.begin(),
+                      get_page_nesting_size{d_cols_info.data(),
+                                            max_depth,
+                                            subpass.pages.size(),
+                                            subpass.pages.device_begin(),
+                                            page_mask.data()});

     // Manually create a size_t `key_start` compatible counting_transform_iterator.
     auto const reduction_keys =
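A standalone sketch of the sizing pattern used above (hypothetical names; keying is simplified to one key per (page, depth) pair, and the real functor's list-ancestor check is reduced to a depth test): counting iterators enumerate the keys for one iteration, and the functor maps each key to its output size, zeroing nested depths of pruned pages.

#include <thrust/execution_policy.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/transform.h>

#include <cstddef>
#include <vector>

struct nesting_size_of {
  int const* batch_sizes;  // flattened [page][depth] batch sizes
  bool const* page_mask;   // false marks a pruned page
  std::size_t max_depth;

  int operator()(std::size_t key) const
  {
    auto const page_idx  = key / max_depth;
    auto const depth_idx = key % max_depth;
    // simplified stand-in for the real functor's list-ancestor check
    if (!page_mask[page_idx] && depth_idx > 0) { return 0; }
    return batch_sizes[key];
  }
};

int main()
{
  std::size_t const max_depth = 2;
  std::vector<int> const batch_sizes{4, 4, 8, 8};  // two pages, two depths each
  bool const page_mask[]{true, false};             // page 1 is pruned
  std::vector<int> sizes(batch_sizes.size());

  thrust::transform(thrust::host,
                    thrust::counting_iterator<std::size_t>(0),
                    thrust::counting_iterator<std::size_t>(batch_sizes.size()),
                    sizes.begin(),
                    nesting_size_of{batch_sizes.data(), page_mask, max_depth});
  // sizes == {4, 4, 8, 0}: the pruned page's nested depth contributes nothing
  return 0;
}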
13 changes: 13 additions & 0 deletions cpp/src/io/parquet/reader_impl_preprocess_utils.cuh
@@ -23,6 +23,8 @@
 #include <rmm/cuda_stream_view.hpp>

 #include <cuda/functional>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/logical.h>

 #include <future>
 #include <vector>

@@ -345,6 +347,7 @@ struct get_page_nesting_size {
   size_type const max_depth;
   size_t const num_pages;
   PageInfo const* const pages;
+  bool const* const page_mask;

   __device__ inline size_type operator()(size_t index) const
   {

@@ -357,6 +360,16 @@
       return 0;
     }

+    // If this page is pruned and has a list parent, set the batch size for this depth to 0 to
+    // reduce the required output buffer size and eliminate any non-empty nulls
+    if (not page_mask[indices.page_idx] and indices.depth_idx > 0 and
+        thrust::any_of(thrust::seq,
+                       thrust::counting_iterator(0),
+                       thrust::counting_iterator(indices.depth_idx),
+                       [&](auto depth) { return page.nesting[depth].type == type_id::LIST; })) {
+      page.nesting[indices.depth_idx].batch_size = 0;
+    }
+
     return page.nesting[indices.depth_idx].batch_size;
   }
 };
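A minimal host-side analog (hypothetical, simplified types) of the branch added to get_page_nesting_size: a pruned page contributes a zero batch size at depth d only when some ancestor depth before d is a LIST, since only a list parent can otherwise force non-empty nulls into the output buffer.

#include <algorithm>
#include <cassert>
#include <vector>

enum class type_id { LIST, STRUCT, STRING };
struct nesting_info {
  type_id type;
  int batch_size;
};

// Mirrors the new branch: zero the batch size at `depth` for a pruned page
// only if some shallower depth is a LIST.
int effective_batch_size(std::vector<nesting_info>& nesting, int depth, bool page_kept)
{
  bool const has_list_ancestor =
    std::any_of(nesting.begin(), nesting.begin() + depth,
                [](nesting_info const& n) { return n.type == type_id::LIST; });
  if (!page_kept && depth > 0 && has_list_ancestor) { nesting[depth].batch_size = 0; }
  return nesting[depth].batch_size;
}

int main()
{
  // list<string>: depth 0 is the list level, depth 1 the string leaf
  std::vector<nesting_info> nesting{{type_id::LIST, 100}, {type_id::STRING, 250}};
  assert(effective_batch_size(nesting, 1, /*page_kept=*/false) == 0);    // pruned leaf under a list
  assert(effective_batch_size(nesting, 0, /*page_kept=*/false) == 100);  // depth 0 is untouched
  return 0;
}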