Skip to content

Commit 1b18582

Browse files
authored
[Variant] Add try_value/value for VariantArray (#8719)
# Which issue does this PR close? - Closes #8672 . # What changes are included in this PR? - Add `try_value/value` function for `VariantArray` - Add test for `VariantArray::try_value` # Are these changes tested? Covered by existing tests and added new tests # Are there any user-facing changes? Yes, add a new function for `VariantArray::try_value`, and the `VariantArray::value` changed to panic from returning `Variant::Null` if there is some cast error.
1 parent 452dadb commit 1b18582

File tree

3 files changed

+200
-38
lines changed

3 files changed

+200
-38
lines changed

parquet-variant-compute/src/type_conversion.rs

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -194,10 +194,10 @@ macro_rules! non_generic_conversion_single_value {
194194
($array:expr, $cast_fn:expr, $index:expr) => {{
195195
let array = $array;
196196
if array.is_null($index) {
197-
Variant::Null
197+
Ok(Variant::Null)
198198
} else {
199199
let cast_value = $cast_fn(array.value($index));
200-
Variant::from(cast_value)
200+
Ok(Variant::from(cast_value))
201201
}
202202
}};
203203
}
@@ -217,6 +217,23 @@ macro_rules! generic_conversion_single_value {
217217
}
218218
pub(crate) use generic_conversion_single_value;
219219

220+
macro_rules! generic_conversion_single_value_with_result {
221+
($t:ty, $method:ident, $cast_fn:expr, $input:expr, $index:expr) => {{
222+
let arr = $input.$method::<$t>();
223+
let v = arr.value($index);
224+
match ($cast_fn)(v) {
225+
Ok(var) => Ok(Variant::from(var)),
226+
Err(e) => Err(ArrowError::CastError(format!(
227+
"Cast failed at index {idx} (array type: {ty}): {e}",
228+
idx = $index,
229+
ty = <$t as ::arrow::datatypes::ArrowPrimitiveType>::DATA_TYPE
230+
))),
231+
}
232+
}};
233+
}
234+
235+
pub(crate) use generic_conversion_single_value_with_result;
236+
220237
/// Convert the value at a specific index in the given array into a `Variant`.
221238
macro_rules! primitive_conversion_single_value {
222239
($t:ty, $input:expr, $index:expr) => {{

parquet-variant-compute/src/variant_array.rs

Lines changed: 122 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,10 @@
1818
//! [`VariantArray`] implementation
1919
2020
use crate::VariantArrayBuilder;
21-
use crate::type_conversion::{generic_conversion_single_value, primitive_conversion_single_value};
21+
use crate::type_conversion::{
22+
generic_conversion_single_value, generic_conversion_single_value_with_result,
23+
primitive_conversion_single_value,
24+
};
2225
use arrow::array::{Array, ArrayRef, AsArray, BinaryViewArray, StructArray};
2326
use arrow::buffer::NullBuffer;
2427
use arrow::compute::cast;
@@ -27,6 +30,7 @@ use arrow::datatypes::{
2730
Float64Type, Int8Type, Int16Type, Int32Type, Int64Type, Time64MicrosecondType,
2831
TimestampMicrosecondType, TimestampNanosecondType,
2932
};
33+
use arrow::error::Result;
3034
use arrow_schema::extension::ExtensionType;
3135
use arrow_schema::{ArrowError, DataType, Field, FieldRef, Fields, TimeUnit};
3236
use chrono::{DateTime, NaiveTime};
@@ -58,11 +62,11 @@ impl ExtensionType for VariantType {
5862
Some(String::new())
5963
}
6064

61-
fn deserialize_metadata(_metadata: Option<&str>) -> Result<Self::Metadata, ArrowError> {
65+
fn deserialize_metadata(_metadata: Option<&str>) -> Result<Self::Metadata> {
6266
Ok("")
6367
}
6468

65-
fn supports_data_type(&self, data_type: &DataType) -> Result<(), ArrowError> {
69+
fn supports_data_type(&self, data_type: &DataType) -> Result<()> {
6670
if matches!(data_type, DataType::Struct(_)) {
6771
Ok(())
6872
} else {
@@ -72,7 +76,7 @@ impl ExtensionType for VariantType {
7276
}
7377
}
7478

75-
fn try_new(data_type: &DataType, _metadata: Self::Metadata) -> Result<Self, ArrowError> {
79+
fn try_new(data_type: &DataType, _metadata: Self::Metadata) -> Result<Self> {
7680
Self.supports_data_type(data_type)?;
7781
Ok(Self)
7882
}
@@ -249,7 +253,7 @@ impl VariantArray {
249253
/// int8.
250254
///
251255
/// Currently, only [`BinaryViewArray`] are supported.
252-
pub fn try_new(inner: &dyn Array) -> Result<Self, ArrowError> {
256+
pub fn try_new(inner: &dyn Array) -> Result<Self> {
253257
// Workaround lack of support for Binary
254258
// https://github.com/apache/arrow-rs/issues/8387
255259
let inner = cast_to_binary_view_arrays(inner)?;
@@ -325,12 +329,32 @@ impl VariantArray {
325329

326330
/// Return the [`Variant`] instance stored at the given row
327331
///
328-
/// Note: This method does not check for nulls and the value is arbitrary
329-
/// (but still well-defined) if [`is_null`](Self::is_null) returns true for the index.
332+
/// This is a convenience wrapper that calls [`VariantArray::try_value`] and unwraps the `Result`.
333+
/// Use `try_value` if you need to handle conversion errors gracefully.
330334
///
331335
/// # Panics
332336
/// * if the index is out of bounds
333337
/// * if the array value is null
338+
/// * if `try_value` returns an error.
339+
pub fn value(&self, index: usize) -> Variant<'_, '_> {
340+
self.try_value(index).unwrap()
341+
}
342+
343+
/// Return the [`Variant`] instance stored at the given row
344+
///
345+
/// Note: This method does not check for nulls and the value is arbitrary
346+
/// (but still well-defined) if [`is_null`](Self::is_null) returns true for the index.
347+
///
348+
/// # Panics
349+
///
350+
/// Panics if
351+
/// * the index is out of bounds
352+
/// * the array value is null
353+
///
354+
/// # Errors
355+
///
356+
/// Errors if
357+
/// - the data in `typed_value` cannot be interpreted as a valid `Variant`
334358
///
335359
/// If this is a shredded variant but has no value at the shredded location, it
336360
/// will return [`Variant::Null`].
@@ -343,19 +367,19 @@ impl VariantArray {
343367
///
344368
/// Note: Does not do deep validation of the [`Variant`], so it is up to the
345369
/// caller to ensure that the metadata and value were constructed correctly.
346-
pub fn value(&self, index: usize) -> Variant<'_, '_> {
370+
pub fn try_value(&self, index: usize) -> Result<Variant<'_, '_>> {
347371
match (self.typed_value_field(), self.value_field()) {
348372
// Always prefer typed_value, if available
349373
(Some(typed_value), value) if typed_value.is_valid(index) => {
350374
typed_value_to_variant(typed_value, value, index)
351375
}
352376
// Otherwise fall back to value, if available
353377
(_, Some(value)) if value.is_valid(index) => {
354-
Variant::new(self.metadata.value(index), value.value(index))
378+
Ok(Variant::new(self.metadata.value(index), value.value(index)))
355379
}
356380
// It is technically invalid for neither value nor typed_value fields to be available,
357381
// but the spec specifically requires readers to return Variant::Null in this case.
358-
_ => Variant::Null,
382+
_ => Ok(Variant::Null),
359383
}
360384
}
361385

@@ -603,7 +627,7 @@ impl ShreddedVariantFieldArray {
603627
/// or be a list, large_list, list_view or struct
604628
///
605629
/// Currently, only `value` columns of type [`BinaryViewArray`] are supported.
606-
pub fn try_new(inner: &dyn Array) -> Result<Self, ArrowError> {
630+
pub fn try_new(inner: &dyn Array) -> Result<Self> {
607631
let Some(inner_struct) = inner.as_struct_opt() else {
608632
return Err(ArrowError::InvalidArgumentError(
609633
"Invalid ShreddedVariantFieldArray: requires StructArray as input".to_string(),
@@ -835,7 +859,7 @@ impl<'a> BorrowedShreddingState<'a> {
835859
impl<'a> TryFrom<&'a StructArray> for BorrowedShreddingState<'a> {
836860
type Error = ArrowError;
837861

838-
fn try_from(inner_struct: &'a StructArray) -> Result<Self, ArrowError> {
862+
fn try_from(inner_struct: &'a StructArray) -> Result<Self> {
839863
// The `value` column need not exist, but if it does it must be a binary view.
840864
let value = if let Some(value_col) = inner_struct.column_by_name("value") {
841865
let Some(binary_view) = value_col.as_binary_view_opt() else {
@@ -856,7 +880,7 @@ impl<'a> TryFrom<&'a StructArray> for BorrowedShreddingState<'a> {
856880
impl TryFrom<&StructArray> for ShreddingState {
857881
type Error = ArrowError;
858882

859-
fn try_from(inner_struct: &StructArray) -> Result<Self, ArrowError> {
883+
fn try_from(inner_struct: &StructArray) -> Result<Self> {
860884
Ok(BorrowedShreddingState::try_from(inner_struct)?.into())
861885
}
862886
}
@@ -914,34 +938,34 @@ fn typed_value_to_variant<'a>(
914938
typed_value: &'a ArrayRef,
915939
value: Option<&BinaryViewArray>,
916940
index: usize,
917-
) -> Variant<'a, 'a> {
941+
) -> Result<Variant<'a, 'a>> {
918942
let data_type = typed_value.data_type();
919943
if value.is_some_and(|v| !matches!(data_type, DataType::Struct(_)) && v.is_valid(index)) {
920944
// Only a partially shredded struct is allowed to have values for both columns
921945
panic!("Invalid variant, conflicting value and typed_value");
922946
}
923947
match data_type {
924-
DataType::Null => Variant::Null,
948+
DataType::Null => Ok(Variant::Null),
925949
DataType::Boolean => {
926950
let boolean_array = typed_value.as_boolean();
927951
let value = boolean_array.value(index);
928-
Variant::from(value)
952+
Ok(Variant::from(value))
929953
}
930954
// 16-byte FixedSizeBinary alway corresponds to a UUID; all other sizes are illegal.
931955
DataType::FixedSizeBinary(16) => {
932956
let array = typed_value.as_fixed_size_binary();
933957
let value = array.value(index);
934-
Uuid::from_slice(value).unwrap().into() // unwrap is safe: slice is always 16 bytes
958+
Ok(Uuid::from_slice(value).unwrap().into()) // unwrap is safe: slice is always 16 bytes
935959
}
936960
DataType::BinaryView => {
937961
let array = typed_value.as_binary_view();
938962
let value = array.value(index);
939-
Variant::from(value)
963+
Ok(Variant::from(value))
940964
}
941965
DataType::Utf8 => {
942966
let array = typed_value.as_string::<i32>();
943967
let value = array.value(index);
944-
Variant::from(value)
968+
Ok(Variant::from(value))
945969
}
946970
DataType::Int8 => {
947971
primitive_conversion_single_value!(Int8Type, typed_value, index)
@@ -965,28 +989,28 @@ fn typed_value_to_variant<'a>(
965989
primitive_conversion_single_value!(Float64Type, typed_value, index)
966990
}
967991
DataType::Decimal32(_, s) => {
968-
generic_conversion_single_value!(
992+
generic_conversion_single_value_with_result!(
969993
Decimal32Type,
970994
as_primitive,
971-
|v| VariantDecimal4::try_new(v, *s as u8).map_or(Variant::Null, Variant::from),
995+
|v| VariantDecimal4::try_new(v, *s as u8),
972996
typed_value,
973997
index
974998
)
975999
}
9761000
DataType::Decimal64(_, s) => {
977-
generic_conversion_single_value!(
1001+
generic_conversion_single_value_with_result!(
9781002
Decimal64Type,
9791003
as_primitive,
980-
|v| VariantDecimal8::try_new(v, *s as u8).map_or(Variant::Null, Variant::from),
1004+
|v| VariantDecimal8::try_new(v, *s as u8),
9811005
typed_value,
9821006
index
9831007
)
9841008
}
9851009
DataType::Decimal128(_, s) => {
986-
generic_conversion_single_value!(
1010+
generic_conversion_single_value_with_result!(
9871011
Decimal128Type,
9881012
as_primitive,
989-
|v| VariantDecimal16::try_new(v, *s as u8).map_or(Variant::Null, Variant::from),
1013+
|v| VariantDecimal16::try_new(v, *s as u8),
9901014
typed_value,
9911015
index
9921016
)
@@ -1001,14 +1025,14 @@ fn typed_value_to_variant<'a>(
10011025
)
10021026
}
10031027
DataType::Time64(TimeUnit::Microsecond) => {
1004-
generic_conversion_single_value!(
1028+
generic_conversion_single_value_with_result!(
10051029
Time64MicrosecondType,
10061030
as_primitive,
10071031
|v| NaiveTime::from_num_seconds_from_midnight_opt(
10081032
(v / 1_000_000) as u32,
10091033
(v % 1_000_000) as u32 * 1000
10101034
)
1011-
.map_or(Variant::Null, Variant::from),
1035+
.ok_or_else(|| format!("Invalid microsecond from midnight: {}", v)),
10121036
typed_value,
10131037
index
10141038
)
@@ -1060,7 +1084,7 @@ fn typed_value_to_variant<'a>(
10601084
"Unsupported typed_value type: {}",
10611085
typed_value.data_type()
10621086
);
1063-
Variant::Null
1087+
Ok(Variant::Null)
10641088
}
10651089
}
10661090
}
@@ -1075,7 +1099,7 @@ fn typed_value_to_variant<'a>(
10751099
/// * `StructArray<metadata: BinaryView, value: BinaryView>`
10761100
///
10771101
/// So cast them to get the right type.
1078-
fn cast_to_binary_view_arrays(array: &dyn Array) -> Result<ArrayRef, ArrowError> {
1102+
fn cast_to_binary_view_arrays(array: &dyn Array) -> Result<ArrayRef> {
10791103
let new_type = canonicalize_and_verify_data_type(array.data_type())?;
10801104
if let Cow::Borrowed(_) = new_type {
10811105
if let Some(array) = array.as_struct_opt() {
@@ -1088,9 +1112,7 @@ fn cast_to_binary_view_arrays(array: &dyn Array) -> Result<ArrayRef, ArrowError>
10881112
/// Recursively visits a data type, ensuring that it only contains data types that can legally
10891113
/// appear in a (possibly shredded) variant array. It also replaces Binary fields with BinaryView,
10901114
/// since that's what comes back from the parquet reader and what the variant code expects to find.
1091-
fn canonicalize_and_verify_data_type(
1092-
data_type: &DataType,
1093-
) -> Result<Cow<'_, DataType>, ArrowError> {
1115+
fn canonicalize_and_verify_data_type(data_type: &DataType) -> Result<Cow<'_, DataType>> {
10941116
use DataType::*;
10951117

10961118
// helper macros
@@ -1188,7 +1210,7 @@ fn canonicalize_and_verify_data_type(
11881210
Ok(new_data_type)
11891211
}
11901212

1191-
fn canonicalize_and_verify_field(field: &Arc<Field>) -> Result<Cow<'_, Arc<Field>>, ArrowError> {
1213+
fn canonicalize_and_verify_field(field: &Arc<Field>) -> Result<Cow<'_, Arc<Field>>> {
11921214
let Cow::Owned(new_data_type) = canonicalize_and_verify_data_type(field.data_type())? else {
11931215
return Ok(Cow::Borrowed(field));
11941216
};
@@ -1199,11 +1221,15 @@ fn canonicalize_and_verify_field(field: &Arc<Field>) -> Result<Cow<'_, Arc<Field
11991221
#[cfg(test)]
12001222
mod test {
12011223
use crate::VariantArrayBuilder;
1224+
use std::str::FromStr;
12021225

12031226
use super::*;
1204-
use arrow::array::{BinaryViewArray, Int32Array};
1227+
use arrow::array::{
1228+
BinaryViewArray, Decimal32Array, Decimal64Array, Decimal128Array, Int32Array,
1229+
Time64MicrosecondArray,
1230+
};
12051231
use arrow_schema::{Field, Fields};
1206-
use parquet_variant::ShortString;
1232+
use parquet_variant::{EMPTY_VARIANT_METADATA_BYTES, ShortString};
12071233

12081234
#[test]
12091235
fn invalid_not_a_struct_array() {
@@ -1535,4 +1561,65 @@ mod test {
15351561
assert_ne!(v, v_sliced);
15361562
}
15371563
}
1564+
1565+
macro_rules! invalid_variant_array_test {
1566+
($fn_name: ident, $invalid_typed_value: expr, $error_msg: literal) => {
1567+
#[test]
1568+
fn $fn_name() {
1569+
let metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n(
1570+
EMPTY_VARIANT_METADATA_BYTES,
1571+
1,
1572+
));
1573+
let invalid_typed_value = $invalid_typed_value;
1574+
1575+
let struct_array = StructArrayBuilder::new()
1576+
.with_field("metadata", Arc::new(metadata), false)
1577+
.with_field("typed_value", Arc::new(invalid_typed_value), true)
1578+
.build();
1579+
1580+
let array: VariantArray = VariantArray::try_new(&struct_array)
1581+
.expect("should create variant array")
1582+
.into();
1583+
1584+
let result = array.try_value(0);
1585+
assert!(result.is_err());
1586+
let error = result.unwrap_err();
1587+
assert!(matches!(error, ArrowError::CastError(_)));
1588+
1589+
let expected: &str = $error_msg;
1590+
assert!(
1591+
error.to_string().contains($error_msg),
1592+
"error `{}` did not contain `{}`",
1593+
error,
1594+
expected
1595+
)
1596+
}
1597+
};
1598+
}
1599+
1600+
invalid_variant_array_test!(
1601+
test_variant_array_invalide_time,
1602+
Time64MicrosecondArray::from(vec![Some(86401000000)]),
1603+
"Cast error: Cast failed at index 0 (array type: Time64(µs)): Invalid microsecond from midnight: 86401000000"
1604+
);
1605+
1606+
invalid_variant_array_test!(
1607+
test_variant_array_invalid_decimal32,
1608+
Decimal32Array::from(vec![Some(1234567890)]),
1609+
"Cast error: Cast failed at index 0 (array type: Decimal32(9, 2)): Invalid argument error: 1234567890 is wider than max precision 9"
1610+
);
1611+
1612+
invalid_variant_array_test!(
1613+
test_variant_array_invalid_decimal64,
1614+
Decimal64Array::from(vec![Some(1234567890123456789)]),
1615+
"Cast error: Cast failed at index 0 (array type: Decimal64(18, 6)): Invalid argument error: 1234567890123456789 is wider than max precision 18"
1616+
);
1617+
1618+
invalid_variant_array_test!(
1619+
test_variant_array_invalid_decimal128,
1620+
Decimal128Array::from(vec![Some(
1621+
i128::from_str("123456789012345678901234567890123456789").unwrap()
1622+
),]),
1623+
"Cast error: Cast failed at index 0 (array type: Decimal128(38, 10)): Invalid argument error: 123456789012345678901234567890123456789 is wider than max precision 38"
1624+
);
15381625
}

0 commit comments

Comments
 (0)