@@ -10,7 +10,9 @@ use futures_util::future::LocalBoxFuture;
10
10
use futures_util:: FutureExt ;
11
11
12
12
use super :: { TimeUnit , Value } ;
13
- use crate :: record:: { decode_arrow_datatype, encode_arrow_datatype, ValueError , ValueRef } ;
13
+ use crate :: record:: {
14
+ decode_arrow_datatype, encode_arrow_datatype, size_of_arrow_datatype, ValueError , ValueRef ,
15
+ } ;
14
16
15
17
#[ cfg( not( target_arch = "wasm32" ) ) ]
16
18
type BoxedFuture < ' a , T > = BoxFuture < ' a , T > ;
@@ -198,38 +200,31 @@ impl Encode for Value {
198
200
}
199
201
200
202
fn size ( & self ) -> usize {
201
- match self {
202
- Value :: Null => 1 ,
203
- Value :: Boolean ( v) => 1 + v. size ( ) ,
204
- Value :: Int8 ( v) => 1 + v. size ( ) ,
205
- Value :: Int16 ( v) => 1 + v. size ( ) ,
206
- Value :: Int32 ( v) => 1 + v. size ( ) ,
207
- Value :: Int64 ( v) => 1 + v. size ( ) ,
208
- Value :: UInt8 ( v) => 1 + v. size ( ) ,
209
- Value :: UInt16 ( v) => 1 + v. size ( ) ,
210
- Value :: UInt32 ( v) => 1 + v. size ( ) ,
211
- Value :: UInt64 ( v) => 1 + v. size ( ) ,
212
- Value :: Float32 ( v) => 1 + v. size ( ) ,
213
- Value :: Float64 ( v) => 1 + v. size ( ) ,
214
- Value :: String ( v) => 1 + v. size ( ) ,
215
- Value :: Binary ( v) => 1 + v. size ( ) ,
216
- Value :: FixedSizeBinary ( v, _) => 1 + v. size ( ) ,
217
- Value :: Date32 ( v) => 1 + v. size ( ) ,
218
- Value :: Date64 ( v) => 1 + v. size ( ) ,
219
- Value :: Timestamp ( v, time_unit) => 1 + v. size ( ) + time_unit. size ( ) ,
220
- Value :: Time32 ( v, time_unit) => 1 + v. size ( ) + time_unit. size ( ) ,
221
- Value :: Time64 ( v, time_unit) => 1 + v. size ( ) + time_unit. size ( ) ,
222
- Value :: List ( data_type, vec) => {
223
- vec. iter ( ) . map ( |v| v. size ( ) ) . sum :: < usize > ( )
224
- + match data_type {
225
- DataType :: Timestamp ( _, _) => 2 ,
226
- DataType :: Time32 ( _) | DataType :: Time64 ( _) => 2 ,
227
- DataType :: List ( field) => 1 + field. size ( ) ,
228
- _ => 1 ,
229
- }
203
+ size_of_arrow_datatype ( & self . data_type ( ) )
204
+ + match self {
205
+ Value :: Null => 1 ,
206
+ Value :: Boolean ( v) => v. size ( ) ,
207
+ Value :: Int8 ( v) => v. size ( ) ,
208
+ Value :: Int16 ( v) => v. size ( ) ,
209
+ Value :: Int32 ( v) => v. size ( ) ,
210
+ Value :: Int64 ( v) => v. size ( ) ,
211
+ Value :: UInt8 ( v) => v. size ( ) ,
212
+ Value :: UInt16 ( v) => v. size ( ) ,
213
+ Value :: UInt32 ( v) => v. size ( ) ,
214
+ Value :: UInt64 ( v) => v. size ( ) ,
215
+ Value :: Float32 ( v) => v. size ( ) ,
216
+ Value :: Float64 ( v) => v. size ( ) ,
217
+ Value :: String ( v) => v. size ( ) ,
218
+ Value :: Binary ( v) => v. size ( ) ,
219
+ Value :: FixedSizeBinary ( v, _) => v. size ( ) ,
220
+ Value :: Date32 ( v) => v. size ( ) ,
221
+ Value :: Date64 ( v) => v. size ( ) ,
222
+ Value :: Timestamp ( v, _) => v. size ( ) ,
223
+ Value :: Time32 ( v, _) => v. size ( ) ,
224
+ Value :: Time64 ( v, _) => v. size ( ) ,
225
+ Value :: List ( _, vec) => vec. size ( ) ,
226
+ Value :: Dictionary ( _, value) => value. size ( ) ,
230
227
}
231
- Value :: Dictionary ( key_type, value) => 1 + key_type. size ( ) + value. size ( ) ,
232
- }
233
228
}
234
229
}
235
230
@@ -379,38 +374,31 @@ impl Encode for ValueRef<'_> {
379
374
}
380
375
381
376
fn size ( & self ) -> usize {
382
- match self {
383
- ValueRef :: Null => 1 ,
384
- ValueRef :: Boolean ( v) => 1 + v. size ( ) ,
385
- ValueRef :: Int8 ( v) => 1 + v. size ( ) ,
386
- ValueRef :: Int16 ( v) => 1 + v. size ( ) ,
387
- ValueRef :: Int32 ( v) => 1 + v. size ( ) ,
388
- ValueRef :: Int64 ( v) => 1 + v. size ( ) ,
389
- ValueRef :: UInt8 ( v) => 1 + v. size ( ) ,
390
- ValueRef :: UInt16 ( v) => 1 + v. size ( ) ,
391
- ValueRef :: UInt32 ( v) => 1 + v. size ( ) ,
392
- ValueRef :: UInt64 ( v) => 1 + v. size ( ) ,
393
- ValueRef :: Float32 ( v) => 1 + v. size ( ) ,
394
- ValueRef :: Float64 ( v) => 1 + v. size ( ) ,
395
- ValueRef :: String ( v) => 1 + v. size ( ) ,
396
- ValueRef :: Binary ( v) => 1 + v. size ( ) ,
397
- ValueRef :: FixedSizeBinary ( v, _) => 1 + v. size ( ) ,
398
- ValueRef :: Date32 ( v) => 1 + v. size ( ) ,
399
- ValueRef :: Date64 ( v) => 1 + v. size ( ) ,
400
- ValueRef :: Timestamp ( v, time_unit) => 1 + v. size ( ) + time_unit. size ( ) ,
401
- ValueRef :: Time32 ( v, time_unit) => 1 + v. size ( ) + time_unit. size ( ) ,
402
- ValueRef :: Time64 ( v, time_unit) => 1 + v. size ( ) + time_unit. size ( ) ,
403
- ValueRef :: List ( data_type, vec) => {
404
- vec. iter ( ) . map ( |v| v. size ( ) ) . sum :: < usize > ( )
405
- + match data_type {
406
- DataType :: Timestamp ( _, _) => 2 ,
407
- DataType :: Time32 ( _) | DataType :: Time64 ( _) => 2 ,
408
- DataType :: List ( field) => 1 + field. size ( ) ,
409
- _ => 1 ,
410
- }
377
+ size_of_arrow_datatype ( & self . data_type ( ) )
378
+ + match self {
379
+ ValueRef :: Null => 1 ,
380
+ ValueRef :: Boolean ( v) => v. size ( ) ,
381
+ ValueRef :: Int8 ( v) => v. size ( ) ,
382
+ ValueRef :: Int16 ( v) => v. size ( ) ,
383
+ ValueRef :: Int32 ( v) => v. size ( ) ,
384
+ ValueRef :: Int64 ( v) => v. size ( ) ,
385
+ ValueRef :: UInt8 ( v) => v. size ( ) ,
386
+ ValueRef :: UInt16 ( v) => v. size ( ) ,
387
+ ValueRef :: UInt32 ( v) => v. size ( ) ,
388
+ ValueRef :: UInt64 ( v) => v. size ( ) ,
389
+ ValueRef :: Float32 ( v) => v. size ( ) ,
390
+ ValueRef :: Float64 ( v) => v. size ( ) ,
391
+ ValueRef :: String ( v) => v. size ( ) ,
392
+ ValueRef :: Binary ( v) => ( v. len ( ) as u32 ) . size ( ) + v. size ( ) ,
393
+ ValueRef :: FixedSizeBinary ( v, _) => 4 + v. size ( ) ,
394
+ ValueRef :: Date32 ( v) => v. size ( ) ,
395
+ ValueRef :: Date64 ( v) => v. size ( ) ,
396
+ ValueRef :: Timestamp ( v, _) => v. size ( ) ,
397
+ ValueRef :: Time32 ( v, _) => v. size ( ) ,
398
+ ValueRef :: Time64 ( v, _) => v. size ( ) ,
399
+ ValueRef :: List ( _, vec) => vec. size ( ) ,
400
+ ValueRef :: Dictionary ( _, value_ref) => value_ref. size ( ) ,
411
401
}
412
- ValueRef :: Dictionary ( key_type, value_ref) => 1 + key_type. size ( ) + value_ref. size ( ) ,
413
- }
414
402
}
415
403
}
416
404
@@ -619,50 +607,137 @@ mod tests {
619
607
assert_eq ! ( value2. as_key_ref( ) , decoded. as_key_ref( ) ) ;
620
608
}
621
609
622
- // This test demonstrates a bug in size() for Dictionary values:
623
- // the encoded output includes the dictionary's value-type header,
624
- // but size() does not account for those bytes.
625
610
#[ tokio:: test]
626
- async fn test_dictionary_size_matches_encoded_len_value ( ) {
627
- let value = Value :: Dictionary (
628
- DictionaryKeyType :: UInt8 ,
629
- Box :: new ( Value :: String ( "a" . to_string ( ) ) ) ,
630
- ) ;
611
+ async fn test_size_matches_encoded_len_value ( ) {
612
+ let values = [
613
+ Value :: UInt8 ( 1 ) ,
614
+ Value :: UInt16 ( 1 ) ,
615
+ Value :: UInt32 ( 1 ) ,
616
+ Value :: UInt64 ( 1 ) ,
617
+ Value :: Int8 ( 1 ) ,
618
+ Value :: Int16 ( 1 ) ,
619
+ Value :: Int32 ( 1 ) ,
620
+ Value :: Int64 ( 1 ) ,
621
+ Value :: Float32 ( 1.0 ) ,
622
+ Value :: Float64 ( 1.0 ) ,
623
+ Value :: String ( "tonbo" . to_string ( ) ) ,
624
+ Value :: Binary ( b"tonbo" . to_vec ( ) ) ,
625
+ Value :: FixedSizeBinary ( b"tonbo" . to_vec ( ) , 5 ) ,
626
+ Value :: Date32 ( 1 ) ,
627
+ Value :: Date64 ( 1 ) ,
628
+ Value :: Timestamp ( 1 , TimeUnit :: Millisecond ) ,
629
+ Value :: Time32 ( 1 , TimeUnit :: Millisecond ) ,
630
+ Value :: Time64 ( 1 , TimeUnit :: Nanosecond ) ,
631
+ Value :: List (
632
+ DataType :: List ( Arc :: new ( Field :: new (
633
+ "timestamp" ,
634
+ DataType :: Timestamp ( arrow:: datatypes:: TimeUnit :: Millisecond , None ) ,
635
+ false ,
636
+ ) ) ) ,
637
+ vec ! [ Arc :: new( Value :: Timestamp (
638
+ 1732838400 ,
639
+ TimeUnit :: Millisecond ,
640
+ ) ) ] ,
641
+ ) ,
642
+ Value :: Dictionary ( DictionaryKeyType :: Int64 , Box :: new ( Value :: Date64 ( 114 ) ) ) ,
643
+ Value :: Dictionary (
644
+ DictionaryKeyType :: UInt8 ,
645
+ Box :: new ( Value :: String ( "fusio" . to_string ( ) ) ) ,
646
+ ) ,
647
+ Value :: Dictionary (
648
+ DictionaryKeyType :: Int32 ,
649
+ Box :: new ( Value :: Binary ( b"tonbo" . to_vec ( ) ) ) ,
650
+ ) ,
651
+ Value :: Dictionary (
652
+ DictionaryKeyType :: UInt8 ,
653
+ Box :: new ( Value :: FixedSizeBinary ( b"tonbo" . to_vec ( ) , 5 ) ) ,
654
+ ) ,
655
+ ] ;
656
+
631
657
let mut buf = Vec :: new ( ) ;
632
658
let mut cursor = Cursor :: new ( & mut buf) ;
633
659
634
- value. encode ( & mut cursor) . await . unwrap ( ) ;
635
-
636
- let encoded_len = buf. len ( ) ;
637
- let reported_size = value. size ( ) ;
638
- // This assertion is expected to fail with current implementation,
639
- // proving size() undercounts dictionary-encoded bytes.
640
- assert_eq ! (
641
- encoded_len, reported_size,
642
- "encoded length {} differs from size() {} for dictionary value" ,
643
- encoded_len, reported_size
644
- ) ;
660
+ for value in values. iter ( ) {
661
+ let before = cursor. position ( ) ;
662
+ value. encode ( & mut cursor) . await . unwrap ( ) ;
663
+ let after = cursor. position ( ) ;
664
+
665
+ assert_eq ! (
666
+ value. size( ) ,
667
+ ( after - before) as usize ,
668
+ "encoded length {} differs from size() {} for dictionary value. value: {:?}" ,
669
+ after - before,
670
+ value. size( ) ,
671
+ value
672
+ ) ;
673
+ }
645
674
}
646
675
647
- // Same mismatch for ValueRef::Dictionary
648
676
#[ tokio:: test]
649
- async fn test_dictionary_size_matches_encoded_len_value_ref ( ) {
650
- let value = Value :: Dictionary ( DictionaryKeyType :: Int64 , Box :: new ( Value :: Date64 ( 114 ) ) ) ;
651
- let value_ref = value. as_key_ref ( ) ;
677
+ async fn test_size_matches_encoded_len_value_ref ( ) {
678
+ let values = [
679
+ Value :: UInt8 ( 1 ) ,
680
+ Value :: UInt16 ( 1 ) ,
681
+ Value :: UInt32 ( 1 ) ,
682
+ Value :: UInt64 ( 1 ) ,
683
+ Value :: Int8 ( 1 ) ,
684
+ Value :: Int16 ( 1 ) ,
685
+ Value :: Int32 ( 1 ) ,
686
+ Value :: Int64 ( 1 ) ,
687
+ Value :: Float32 ( 1.0 ) ,
688
+ Value :: Float64 ( 1.0 ) ,
689
+ Value :: String ( "fusio" . to_string ( ) ) ,
690
+ Value :: Binary ( b"fusio" . to_vec ( ) ) ,
691
+ Value :: FixedSizeBinary ( b"fusio" . to_vec ( ) , 5 ) ,
692
+ Value :: Date32 ( 1 ) ,
693
+ Value :: Date64 ( 1 ) ,
694
+ Value :: Timestamp ( 1 , TimeUnit :: Millisecond ) ,
695
+ Value :: Time32 ( 1 , TimeUnit :: Millisecond ) ,
696
+ Value :: Time64 ( 1 , TimeUnit :: Nanosecond ) ,
697
+ Value :: List (
698
+ DataType :: List ( Arc :: new ( Field :: new (
699
+ "timestamp" ,
700
+ DataType :: Timestamp ( arrow:: datatypes:: TimeUnit :: Millisecond , None ) ,
701
+ false ,
702
+ ) ) ) ,
703
+ vec ! [ Arc :: new( Value :: Timestamp (
704
+ 1732838400 ,
705
+ TimeUnit :: Millisecond ,
706
+ ) ) ] ,
707
+ ) ,
708
+ Value :: Dictionary ( DictionaryKeyType :: Int64 , Box :: new ( Value :: Date64 ( 114 ) ) ) ,
709
+ Value :: Dictionary (
710
+ DictionaryKeyType :: UInt8 ,
711
+ Box :: new ( Value :: String ( "fusio" . to_string ( ) ) ) ,
712
+ ) ,
713
+ Value :: Dictionary (
714
+ DictionaryKeyType :: Int32 ,
715
+ Box :: new ( Value :: Binary ( b"tonbo" . to_vec ( ) ) ) ,
716
+ ) ,
717
+ Value :: Dictionary (
718
+ DictionaryKeyType :: UInt8 ,
719
+ Box :: new ( Value :: FixedSizeBinary ( b"tonbo" . to_vec ( ) , 5 ) ) ,
720
+ ) ,
721
+ ] ;
652
722
653
723
let mut buf = Vec :: new ( ) ;
654
724
let mut cursor = Cursor :: new ( & mut buf) ;
655
725
656
- value_ref. encode ( & mut cursor) . await . unwrap ( ) ;
657
-
658
- let encoded_len = buf. len ( ) ;
659
- let reported_size = value_ref. size ( ) ;
660
- // This assertion is expected to fail with current implementation,
661
- // proving size() undercounts dictionary-encoded bytes.
662
- assert_eq ! (
663
- encoded_len, reported_size,
664
- "encoded length {} differs from size() {} for dictionary value_ref" ,
665
- encoded_len, reported_size
666
- ) ;
726
+ for value in values. iter ( ) {
727
+ let before = cursor. position ( ) ;
728
+ let value_ref = value. as_key_ref ( ) ;
729
+ value_ref. encode ( & mut cursor) . await . unwrap ( ) ;
730
+ let after = cursor. position ( ) ;
731
+
732
+ assert_eq ! (
733
+ value_ref. size( ) ,
734
+ ( after - before) as usize ,
735
+ "encoded length {} differs from size() {} for dictionary value_ref. value_ref: \
736
+ {:?}",
737
+ after - before,
738
+ value_ref. size( ) ,
739
+ value_ref
740
+ ) ;
741
+ }
667
742
}
668
743
}
0 commit comments