Skip to content

Commit f7c53cf

Browse files
committed
fix: fix value size not matches with encoded len
1 parent a730066 commit f7c53cf

File tree

4 files changed

+278
-126
lines changed

4 files changed

+278
-126
lines changed

src/record/dynamic/value/dict.rs

Lines changed: 19 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,8 @@
11
use fusio::Write;
22
use fusio_log::{Decode, Encode};
33

4-
use crate::record::ValueError;
5-
64
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
5+
#[repr(u8)]
76
pub enum DictionaryKeyType {
87
Int8,
98
Int16,
@@ -15,6 +14,22 @@ pub enum DictionaryKeyType {
1514
UInt64,
1615
}
1716

17+
impl From<u8> for DictionaryKeyType {
18+
fn from(value: u8) -> Self {
19+
match value {
20+
0 => Self::Int8,
21+
1 => Self::Int16,
22+
2 => Self::Int32,
23+
3 => Self::Int64,
24+
4 => Self::UInt8,
25+
5 => Self::UInt16,
26+
6 => Self::UInt32,
27+
7 => Self::UInt64,
28+
_ => unreachable!(),
29+
}
30+
}
31+
}
32+
1833
impl From<DictionaryKeyType> for arrow::datatypes::DataType {
1934
fn from(value: DictionaryKeyType) -> Self {
2035
match value {
@@ -66,16 +81,7 @@ impl Encode for DictionaryKeyType {
6681
where
6782
W: Write,
6883
{
69-
match self {
70-
DictionaryKeyType::Int8 => 0u8.encode(writer).await,
71-
DictionaryKeyType::Int16 => 1u8.encode(writer).await,
72-
DictionaryKeyType::Int32 => 2u8.encode(writer).await,
73-
DictionaryKeyType::Int64 => 3u8.encode(writer).await,
74-
DictionaryKeyType::UInt8 => 4u8.encode(writer).await,
75-
DictionaryKeyType::UInt16 => 5u8.encode(writer).await,
76-
DictionaryKeyType::UInt32 => 6u8.encode(writer).await,
77-
DictionaryKeyType::UInt64 => 7u8.encode(writer).await,
78-
}
84+
(*self as u8).encode(writer).await
7985
}
8086

8187
fn size(&self) -> usize {
@@ -88,20 +94,7 @@ impl Decode for DictionaryKeyType {
8894
where
8995
R: fusio::SeqRead,
9096
{
91-
let value = u8::decode(reader).await?;
92-
match value {
93-
0 => Ok(DictionaryKeyType::Int8),
94-
1 => Ok(DictionaryKeyType::Int16),
95-
2 => Ok(DictionaryKeyType::Int32),
96-
3 => Ok(DictionaryKeyType::Int64),
97-
4 => Ok(DictionaryKeyType::UInt8),
98-
5 => Ok(DictionaryKeyType::UInt16),
99-
6 => Ok(DictionaryKeyType::UInt32),
100-
7 => Ok(DictionaryKeyType::UInt64),
101-
_ => Err(fusio::Error::Other(Box::new(ValueError::InvalidDataType(
102-
"invalid dictionary key type".to_string(),
103-
)))),
104-
}
97+
Ok(u8::decode(reader).await?.into())
10598
}
10699
}
107100

src/record/dynamic/value/encoding.rs

Lines changed: 172 additions & 97 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,9 @@ use futures_util::future::LocalBoxFuture;
1010
use futures_util::FutureExt;
1111

1212
use super::{TimeUnit, Value};
13-
use crate::record::{decode_arrow_datatype, encode_arrow_datatype, ValueError, ValueRef};
13+
use crate::record::{
14+
decode_arrow_datatype, encode_arrow_datatype, size_of_arrow_datatype, ValueError, ValueRef,
15+
};
1416

1517
#[cfg(not(target_arch = "wasm32"))]
1618
type BoxedFuture<'a, T> = BoxFuture<'a, T>;
@@ -198,38 +200,31 @@ impl Encode for Value {
198200
}
199201

200202
fn size(&self) -> usize {
201-
match self {
202-
Value::Null => 1,
203-
Value::Boolean(v) => 1 + v.size(),
204-
Value::Int8(v) => 1 + v.size(),
205-
Value::Int16(v) => 1 + v.size(),
206-
Value::Int32(v) => 1 + v.size(),
207-
Value::Int64(v) => 1 + v.size(),
208-
Value::UInt8(v) => 1 + v.size(),
209-
Value::UInt16(v) => 1 + v.size(),
210-
Value::UInt32(v) => 1 + v.size(),
211-
Value::UInt64(v) => 1 + v.size(),
212-
Value::Float32(v) => 1 + v.size(),
213-
Value::Float64(v) => 1 + v.size(),
214-
Value::String(v) => 1 + v.size(),
215-
Value::Binary(v) => 1 + v.size(),
216-
Value::FixedSizeBinary(v, _) => 1 + v.size(),
217-
Value::Date32(v) => 1 + v.size(),
218-
Value::Date64(v) => 1 + v.size(),
219-
Value::Timestamp(v, time_unit) => 1 + v.size() + time_unit.size(),
220-
Value::Time32(v, time_unit) => 1 + v.size() + time_unit.size(),
221-
Value::Time64(v, time_unit) => 1 + v.size() + time_unit.size(),
222-
Value::List(data_type, vec) => {
223-
vec.iter().map(|v| v.size()).sum::<usize>()
224-
+ match data_type {
225-
DataType::Timestamp(_, _) => 2,
226-
DataType::Time32(_) | DataType::Time64(_) => 2,
227-
DataType::List(field) => 1 + field.size(),
228-
_ => 1,
229-
}
203+
size_of_arrow_datatype(&self.data_type())
204+
+ match self {
205+
Value::Null => 1,
206+
Value::Boolean(v) => v.size(),
207+
Value::Int8(v) => v.size(),
208+
Value::Int16(v) => v.size(),
209+
Value::Int32(v) => v.size(),
210+
Value::Int64(v) => v.size(),
211+
Value::UInt8(v) => v.size(),
212+
Value::UInt16(v) => v.size(),
213+
Value::UInt32(v) => v.size(),
214+
Value::UInt64(v) => v.size(),
215+
Value::Float32(v) => v.size(),
216+
Value::Float64(v) => v.size(),
217+
Value::String(v) => v.size(),
218+
Value::Binary(v) => v.size(),
219+
Value::FixedSizeBinary(v, _) => v.size(),
220+
Value::Date32(v) => v.size(),
221+
Value::Date64(v) => v.size(),
222+
Value::Timestamp(v, _) => v.size(),
223+
Value::Time32(v, _) => v.size(),
224+
Value::Time64(v, _) => v.size(),
225+
Value::List(_, vec) => vec.size(),
226+
Value::Dictionary(_, value) => value.size(),
230227
}
231-
Value::Dictionary(key_type, value) => 1 + key_type.size() + value.size(),
232-
}
233228
}
234229
}
235230

@@ -379,38 +374,31 @@ impl Encode for ValueRef<'_> {
379374
}
380375

381376
fn size(&self) -> usize {
382-
match self {
383-
ValueRef::Null => 1,
384-
ValueRef::Boolean(v) => 1 + v.size(),
385-
ValueRef::Int8(v) => 1 + v.size(),
386-
ValueRef::Int16(v) => 1 + v.size(),
387-
ValueRef::Int32(v) => 1 + v.size(),
388-
ValueRef::Int64(v) => 1 + v.size(),
389-
ValueRef::UInt8(v) => 1 + v.size(),
390-
ValueRef::UInt16(v) => 1 + v.size(),
391-
ValueRef::UInt32(v) => 1 + v.size(),
392-
ValueRef::UInt64(v) => 1 + v.size(),
393-
ValueRef::Float32(v) => 1 + v.size(),
394-
ValueRef::Float64(v) => 1 + v.size(),
395-
ValueRef::String(v) => 1 + v.size(),
396-
ValueRef::Binary(v) => 1 + v.size(),
397-
ValueRef::FixedSizeBinary(v, _) => 1 + v.size(),
398-
ValueRef::Date32(v) => 1 + v.size(),
399-
ValueRef::Date64(v) => 1 + v.size(),
400-
ValueRef::Timestamp(v, time_unit) => 1 + v.size() + time_unit.size(),
401-
ValueRef::Time32(v, time_unit) => 1 + v.size() + time_unit.size(),
402-
ValueRef::Time64(v, time_unit) => 1 + v.size() + time_unit.size(),
403-
ValueRef::List(data_type, vec) => {
404-
vec.iter().map(|v| v.size()).sum::<usize>()
405-
+ match data_type {
406-
DataType::Timestamp(_, _) => 2,
407-
DataType::Time32(_) | DataType::Time64(_) => 2,
408-
DataType::List(field) => 1 + field.size(),
409-
_ => 1,
410-
}
377+
size_of_arrow_datatype(&self.data_type())
378+
+ match self {
379+
ValueRef::Null => 1,
380+
ValueRef::Boolean(v) => v.size(),
381+
ValueRef::Int8(v) => v.size(),
382+
ValueRef::Int16(v) => v.size(),
383+
ValueRef::Int32(v) => v.size(),
384+
ValueRef::Int64(v) => v.size(),
385+
ValueRef::UInt8(v) => v.size(),
386+
ValueRef::UInt16(v) => v.size(),
387+
ValueRef::UInt32(v) => v.size(),
388+
ValueRef::UInt64(v) => v.size(),
389+
ValueRef::Float32(v) => v.size(),
390+
ValueRef::Float64(v) => v.size(),
391+
ValueRef::String(v) => v.size(),
392+
ValueRef::Binary(v) => (v.len() as u32).size() + v.size(),
393+
ValueRef::FixedSizeBinary(v, _) => 4 + v.size(),
394+
ValueRef::Date32(v) => v.size(),
395+
ValueRef::Date64(v) => v.size(),
396+
ValueRef::Timestamp(v, _) => v.size(),
397+
ValueRef::Time32(v, _) => v.size(),
398+
ValueRef::Time64(v, _) => v.size(),
399+
ValueRef::List(_, vec) => vec.size(),
400+
ValueRef::Dictionary(_, value_ref) => value_ref.size(),
411401
}
412-
ValueRef::Dictionary(key_type, value_ref) => 1 + key_type.size() + value_ref.size(),
413-
}
414402
}
415403
}
416404

@@ -619,50 +607,137 @@ mod tests {
619607
assert_eq!(value2.as_key_ref(), decoded.as_key_ref());
620608
}
621609

622-
// This test demonstrates a bug in size() for Dictionary values:
623-
// the encoded output includes the dictionary's value-type header,
624-
// but size() does not account for those bytes.
625610
#[tokio::test]
626-
async fn test_dictionary_size_matches_encoded_len_value() {
627-
let value = Value::Dictionary(
628-
DictionaryKeyType::UInt8,
629-
Box::new(Value::String("a".to_string())),
630-
);
611+
async fn test_size_matches_encoded_len_value() {
612+
let values = [
613+
Value::UInt8(1),
614+
Value::UInt16(1),
615+
Value::UInt32(1),
616+
Value::UInt64(1),
617+
Value::Int8(1),
618+
Value::Int16(1),
619+
Value::Int32(1),
620+
Value::Int64(1),
621+
Value::Float32(1.0),
622+
Value::Float64(1.0),
623+
Value::String("tonbo".to_string()),
624+
Value::Binary(b"tonbo".to_vec()),
625+
Value::FixedSizeBinary(b"tonbo".to_vec(), 5),
626+
Value::Date32(1),
627+
Value::Date64(1),
628+
Value::Timestamp(1, TimeUnit::Millisecond),
629+
Value::Time32(1, TimeUnit::Millisecond),
630+
Value::Time64(1, TimeUnit::Nanosecond),
631+
Value::List(
632+
DataType::List(Arc::new(Field::new(
633+
"timestamp",
634+
DataType::Timestamp(arrow::datatypes::TimeUnit::Millisecond, None),
635+
false,
636+
))),
637+
vec![Arc::new(Value::Timestamp(
638+
1732838400,
639+
TimeUnit::Millisecond,
640+
))],
641+
),
642+
Value::Dictionary(DictionaryKeyType::Int64, Box::new(Value::Date64(114))),
643+
Value::Dictionary(
644+
DictionaryKeyType::UInt8,
645+
Box::new(Value::String("fusio".to_string())),
646+
),
647+
Value::Dictionary(
648+
DictionaryKeyType::Int32,
649+
Box::new(Value::Binary(b"tonbo".to_vec())),
650+
),
651+
Value::Dictionary(
652+
DictionaryKeyType::UInt8,
653+
Box::new(Value::FixedSizeBinary(b"tonbo".to_vec(), 5)),
654+
),
655+
];
656+
631657
let mut buf = Vec::new();
632658
let mut cursor = Cursor::new(&mut buf);
633659

634-
value.encode(&mut cursor).await.unwrap();
635-
636-
let encoded_len = buf.len();
637-
let reported_size = value.size();
638-
// This assertion is expected to fail with current implementation,
639-
// proving size() undercounts dictionary-encoded bytes.
640-
assert_eq!(
641-
encoded_len, reported_size,
642-
"encoded length {} differs from size() {} for dictionary value",
643-
encoded_len, reported_size
644-
);
660+
for value in values.iter() {
661+
let before = cursor.position();
662+
value.encode(&mut cursor).await.unwrap();
663+
let after = cursor.position();
664+
665+
assert_eq!(
666+
value.size(),
667+
(after - before) as usize,
668+
"encoded length {} differs from size() {} for dictionary value. value: {:?}",
669+
after - before,
670+
value.size(),
671+
value
672+
);
673+
}
645674
}
646675

647-
// Same mismatch for ValueRef::Dictionary
648676
#[tokio::test]
649-
async fn test_dictionary_size_matches_encoded_len_value_ref() {
650-
let value = Value::Dictionary(DictionaryKeyType::Int64, Box::new(Value::Date64(114)));
651-
let value_ref = value.as_key_ref();
677+
async fn test_size_matches_encoded_len_value_ref() {
678+
let values = [
679+
Value::UInt8(1),
680+
Value::UInt16(1),
681+
Value::UInt32(1),
682+
Value::UInt64(1),
683+
Value::Int8(1),
684+
Value::Int16(1),
685+
Value::Int32(1),
686+
Value::Int64(1),
687+
Value::Float32(1.0),
688+
Value::Float64(1.0),
689+
Value::String("fusio".to_string()),
690+
Value::Binary(b"fusio".to_vec()),
691+
Value::FixedSizeBinary(b"fusio".to_vec(), 5),
692+
Value::Date32(1),
693+
Value::Date64(1),
694+
Value::Timestamp(1, TimeUnit::Millisecond),
695+
Value::Time32(1, TimeUnit::Millisecond),
696+
Value::Time64(1, TimeUnit::Nanosecond),
697+
Value::List(
698+
DataType::List(Arc::new(Field::new(
699+
"timestamp",
700+
DataType::Timestamp(arrow::datatypes::TimeUnit::Millisecond, None),
701+
false,
702+
))),
703+
vec![Arc::new(Value::Timestamp(
704+
1732838400,
705+
TimeUnit::Millisecond,
706+
))],
707+
),
708+
Value::Dictionary(DictionaryKeyType::Int64, Box::new(Value::Date64(114))),
709+
Value::Dictionary(
710+
DictionaryKeyType::UInt8,
711+
Box::new(Value::String("fusio".to_string())),
712+
),
713+
Value::Dictionary(
714+
DictionaryKeyType::Int32,
715+
Box::new(Value::Binary(b"tonbo".to_vec())),
716+
),
717+
Value::Dictionary(
718+
DictionaryKeyType::UInt8,
719+
Box::new(Value::FixedSizeBinary(b"tonbo".to_vec(), 5)),
720+
),
721+
];
652722

653723
let mut buf = Vec::new();
654724
let mut cursor = Cursor::new(&mut buf);
655725

656-
value_ref.encode(&mut cursor).await.unwrap();
657-
658-
let encoded_len = buf.len();
659-
let reported_size = value_ref.size();
660-
// This assertion is expected to fail with current implementation,
661-
// proving size() undercounts dictionary-encoded bytes.
662-
assert_eq!(
663-
encoded_len, reported_size,
664-
"encoded length {} differs from size() {} for dictionary value_ref",
665-
encoded_len, reported_size
666-
);
726+
for value in values.iter() {
727+
let before = cursor.position();
728+
let value_ref = value.as_key_ref();
729+
value_ref.encode(&mut cursor).await.unwrap();
730+
let after = cursor.position();
731+
732+
assert_eq!(
733+
value_ref.size(),
734+
(after - before) as usize,
735+
"encoded length {} differs from size() {} for dictionary value_ref. value_ref: \
736+
{:?}",
737+
after - before,
738+
value_ref.size(),
739+
value_ref
740+
);
741+
}
667742
}
668743
}

0 commit comments

Comments
 (0)