@@ -3,7 +3,7 @@ use std::sync::Arc;
33
44use datafusion:: arrow:: array:: {
55 downcast_array, AnyDictionaryArray , Array , ArrayAccessor , ArrayRef , AsArray , DictionaryArray , LargeStringArray ,
6- PrimitiveArray , RunArray , StringArray , StringViewArray ,
6+ PrimitiveArray , PrimitiveBuilder , RunArray , StringArray , StringViewArray ,
77} ;
88use datafusion:: arrow:: compute:: kernels:: cast;
99use datafusion:: arrow:: compute:: take;
@@ -245,6 +245,34 @@ fn invoke_array_array<R: InvokeResult>(
245245 }
246246}
247247
248+ /// Transform keys that may be pointing to values with nulls to nulls themselves.
249+ /// keys = `[0, 1, 2, 3]`, values = `[null, "a", null, "b"]`
250+ /// into
251+ /// keys = `[null, 0, null, 1]`, values = `["a", "b"]`
252+ ///
253+ /// Arrow / `DataFusion` assumes that dictionary values do not contain nulls, nulls are encoded by the keys.
254+ /// Not following this invariant causes invalid dictionary arrays to be built later on inside of `DataFusion`
255+ /// when arrays are concacted and such.
256+ fn remap_dictionary_key_nulls ( keys : PrimitiveArray < Int64Type > , values : ArrayRef ) -> DictionaryArray < Int64Type > {
257+ // fast path: no nulls in values
258+ if values. null_count ( ) == 0 {
259+ return DictionaryArray :: new ( keys, values) ;
260+ }
261+
262+ let mut new_keys_builder = PrimitiveBuilder :: < Int64Type > :: new ( ) ;
263+
264+ for key in & keys {
265+ match key {
266+ Some ( k) if values. is_null ( k. as_usize ( ) ) => new_keys_builder. append_null ( ) ,
267+ Some ( k) => new_keys_builder. append_value ( k) ,
268+ None => new_keys_builder. append_null ( ) ,
269+ }
270+ }
271+
272+ let new_keys = new_keys_builder. finish ( ) ;
273+ DictionaryArray :: new ( new_keys, values)
274+ }
275+
248276fn invoke_array_scalars < R : InvokeResult > (
249277 json_array : & ArrayRef ,
250278 path : & [ JsonPath ] ,
@@ -281,7 +309,7 @@ fn invoke_array_scalars<R: InvokeResult>(
281309 let type_ids = values. as_union ( ) . type_ids ( ) ;
282310 keys = mask_dictionary_keys ( & keys, type_ids) ;
283311 }
284- Ok ( Arc :: new ( DictionaryArray :: new ( keys, values) ) )
312+ Ok ( Arc :: new ( remap_dictionary_key_nulls ( keys, values) ) )
285313 } else {
286314 // this is what cast would do under the hood to unpack a dictionary into an array of its values
287315 Ok ( take ( & values, json_array. keys ( ) , None ) ?)
0 commit comments