Skip to content

Commit 8667622

Browse files
maxitg authored and rtyler committed
feat(python, rust): add statistics_enabled to ColumnProperties
Signed-off-by: Max Piskunov <max.piskunov@plus.ai>
1 parent 797888f commit 8667622

File tree

3 files changed

+28
-2
lines changed

3 files changed

+28
-2
lines changed

python/deltalake/table.py

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -217,22 +217,28 @@ class ColumnProperties:
217217
def __init__(
218218
self,
219219
dictionary_enabled: Optional[bool] = None,
220+
statistics_enabled: Optional[Literal["NONE", "CHUNK", "PAGE"]] = None,
220221
max_statistics_size: Optional[int] = None,
221222
bloom_filter_properties: Optional[BloomFilterProperties] = None,
222223
):
223224
"""Create a Column Properties instance for the Rust parquet writer:
224225
225226
Args:
226227
dictionary_enabled: Enable dictionary encoding for the column.
228+
statistics_enabled: Statistics level for the column.
227229
max_statistics_size: Maximum size of statistics for the column.
228230
bloom_filter_properties: Bloom Filter Properties for the column.
229231
"""
230232
self.dictionary_enabled = dictionary_enabled
233+
self.statistics_enabled = statistics_enabled
231234
self.max_statistics_size = max_statistics_size
232235
self.bloom_filter_properties = bloom_filter_properties
233236

234237
def __str__(self) -> str:
235-
return f"dictionary_enabled: {self.dictionary_enabled}, max_statistics_size: {self.max_statistics_size}, bloom_filter_properties: {self.bloom_filter_properties}"
238+
return (
239+
f"dictionary_enabled: {self.dictionary_enabled}, statistics_enabled: {self.statistics_enabled}, "
240+
f"max_statistics_size: {self.max_statistics_size}, bloom_filter_properties: {self.bloom_filter_properties}"
241+
)
236242

237243

238244
@dataclass(init=True)

python/src/lib.rs

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ use deltalake::operations::vacuum::VacuumBuilder;
5252
use deltalake::operations::{collect_sendable_stream, CustomExecuteHandler};
5353
use deltalake::parquet::basic::Compression;
5454
use deltalake::parquet::errors::ParquetError;
55-
use deltalake::parquet::file::properties::WriterProperties;
55+
use deltalake::parquet::file::properties::{EnabledStatistics, WriterProperties};
5656
use deltalake::partitions::PartitionFilter;
5757
use deltalake::protocol::{DeltaOperation, SaveMode};
5858
use deltalake::storage::{IORuntime, ObjectStoreRef};
@@ -1566,6 +1566,13 @@ fn set_writer_properties(writer_properties: PyWriterProperties) -> DeltaResult<W
15661566
if let Some(dictionary_enabled) = default_column_properties.dictionary_enabled {
15671567
properties = properties.set_dictionary_enabled(dictionary_enabled);
15681568
}
1569+
if let Some(statistics_enabled) = default_column_properties.statistics_enabled {
1570+
let enabled_statistics: EnabledStatistics = statistics_enabled
1571+
.parse()
1572+
.map_err(|err: String| DeltaTableError::Generic(err))?;
1573+
1574+
properties = properties.set_statistics_enabled(enabled_statistics);
1575+
}
15691576
if let Some(max_statistics_size) = default_column_properties.max_statistics_size {
15701577
properties = properties.set_max_statistics_size(max_statistics_size);
15711578
}
@@ -1591,6 +1598,16 @@ fn set_writer_properties(writer_properties: PyWriterProperties) -> DeltaResult<W
15911598
dictionary_enabled,
15921599
);
15931600
}
1601+
if let Some(statistics_enabled) = column_prop.statistics_enabled {
1602+
let enabled_statistics: EnabledStatistics = statistics_enabled
1603+
.parse()
1604+
.map_err(|err: String| DeltaTableError::Generic(err))?;
1605+
1606+
properties = properties.set_column_statistics_enabled(
1607+
column_name.clone().into(),
1608+
enabled_statistics,
1609+
);
1610+
}
15941611
if let Some(bloom_filter_properties) = column_prop.bloom_filter_properties {
15951612
if let Some(set_bloom_filter_enabled) =
15961613
bloom_filter_properties.set_bloom_filter_enabled
@@ -1919,6 +1936,7 @@ pub struct BloomFilterProperties {
19191936
#[derive(FromPyObject)]
19201937
pub struct ColumnProperties {
19211938
pub dictionary_enabled: Option<bool>,
1939+
pub statistics_enabled: Option<String>,
19221940
pub max_statistics_size: Option<usize>,
19231941
pub bloom_filter_properties: Option<BloomFilterProperties>,
19241942
}

python/tests/test_writerproperties.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,13 +28,15 @@ def test_writer_properties_all_filled():
2828
column_properties={
2929
"a": ColumnProperties(
3030
dictionary_enabled=True,
31+
statistics_enabled="CHUNK",
3132
max_statistics_size=40,
3233
bloom_filter_properties=BloomFilterProperties(
3334
set_bloom_filter_enabled=True, fpp=0.2, ndv=30
3435
),
3536
),
3637
"b": ColumnProperties(
3738
dictionary_enabled=True,
39+
statistics_enabled="PAGE",
3840
max_statistics_size=400,
3941
bloom_filter_properties=BloomFilterProperties(
4042
set_bloom_filter_enabled=False, fpp=0.2, ndv=30

0 commit comments

Comments
 (0)