@@ -18,28 +18,27 @@ def count_records(dataset: dataiku.Dataset) -> int:

    Returns:
        Number of records
+
    """
    metric_id = "records:COUNT_RECORDS"
    partitions = dataset.read_partitions
    client = dataiku.api_client()
    project = client.get_project(dataset.project_key)
    record_count = 0
-    logging.info("Counting records of dataset: {}...".format(dataset.name))
+    logging.info(f"Counting records of dataset: {dataset.name}...")
    if partitions is None or len(partitions) == 0:
        project.get_dataset(dataset.short_name).compute_metrics(metric_ids=[metric_id])
        metric = dataset.get_last_metric_values()
        record_count = dataiku.ComputedMetrics.get_value_from_data(metric.get_global_data(metric_id=metric_id))
-        logging.info("Dataset {} contains {:d} records and is not partitioned".format(dataset.name, record_count))
+        logging.info(f"Dataset {dataset.name} contains {record_count:d} records and is not partitioned")
    else:
        for partition in partitions:
            project.get_dataset(dataset.short_name).compute_metrics(partition=partition, metric_ids=[metric_id])
            metric = dataset.get_last_metric_values()
            record_count += dataiku.ComputedMetrics.get_value_from_data(
                metric.get_partition_data(partition=partition, metric_id=metric_id)
            )
-        logging.info(
-            "Dataset {} contains {:d} records in partition(s) {}".format(dataset.name, record_count, partitions)
-        )
+        logging.info(f"Dataset {dataset.name} contains {record_count:d} records in partition(s) {partitions}")
    return record_count

@@ -48,8 +47,8 @@ def process_dataset_chunks(
) -> None:
    """Read a dataset by chunks, process each dataframe chunk with a function and write back to another dataset.

-    Passes keyword arguments to the function, adds a tqdm progress bar and generic logging.
-    Directly writes chunks to the output_dataset, so that only one chunk needs to be processed in-memory at a time.
+    Pass keyword arguments to the function, add a tqdm progress bar and generic logging.
+    Directly write chunks to the output_dataset, so that only one chunk needs to be processed in memory at a time.

    Args:
        input_dataset: Input dataiku.Dataset instance
@@ -59,45 +58,49 @@ def process_dataset_chunks(
            and output another pandas.DataFrame
        chunksize: Number of rows of each chunk of pandas.DataFrame fed to `func`
        **kwargs: Optional keyword arguments fed to `func`
+
+    Raises:
+        ValueError: If the input dataset is empty or if pandas cannot read it without type inference
+
    """
    input_count_records = count_records(input_dataset)
    if input_count_records == 0:
        raise ValueError("Input dataset has no records")
-    logging.info(
-        "Processing dataset {} of {:d} rows by chunks of {:d}...".format(
-            input_dataset.name, input_count_records, chunksize
-        )
-    )
+    logging.info(f"Processing dataset {input_dataset.name} of {input_count_records} rows by chunks of {chunksize}...")
    start = time()
+    # First, initialize output schema if not present. Required to show the real error if `iter_dataframes` fails.
+    if not output_dataset.read_schema(raise_if_empty=False):
+        df = input_dataset.get_dataframe(limit=5, infer_with_pandas=False)
+        output_df = func(df=df, **kwargs)
+        output_dataset.write_schema_from_dataframe(output_df)
    with output_dataset.get_writer() as writer:
        df_iterator = input_dataset.iter_dataframes(chunksize=chunksize, infer_with_pandas=False)
        len_iterator = math.ceil(input_count_records / chunksize)
-        for i, df in tqdm(enumerate(df_iterator), total=len_iterator):
+        for i, df in tqdm(enumerate(df_iterator), total=len_iterator, unit="chunk", mininterval=1.0):
            output_df = func(df=df, **kwargs)
            if i == 0:
                output_dataset.write_schema_from_dataframe(
                    output_df, dropAndCreate=bool(not output_dataset.writePartition)
                )
            writer.write_dataframe(output_df)
    logging.info(
-        "Processing dataset {} of {:d} rows: Done in {:.2f} seconds.".format(
-            input_dataset.name, input_count_records, time() - start
-        )
+        f"Processing dataset {input_dataset.name} of {input_count_records} rows: Done in {time() - start:.2f} seconds."
    )


-def set_column_description(
-    output_dataset: dataiku.Dataset, column_description_dict: Dict, input_dataset: dataiku.Dataset = None
+def set_column_descriptions(
+    output_dataset: dataiku.Dataset, column_descriptions: Dict, input_dataset: dataiku.Dataset = None
) -> None:
    """Set column descriptions of the output dataset based on a dictionary of column descriptions

-    Retains the column descriptions from the input dataset if the column name matches.
+    Retain the column descriptions from the input dataset if the column name matches.

    Args:
        output_dataset: Output dataiku.Dataset instance
-        column_description_dict: Dictionary holding column descriptions (value) by column name (key)
+        column_descriptions: Dictionary holding column descriptions (value) by column name (key)
        input_dataset: Optional input dataiku.Dataset instance
            in case you want to retain input column descriptions
+
    """
    output_dataset_schema = output_dataset.read_schema()
    input_dataset_schema = []
@@ -107,7 +110,7 @@ def set_column_description(
        input_columns_names = [col["name"] for col in input_dataset_schema]
    for output_col_info in output_dataset_schema:
        output_col_name = output_col_info.get("name", "")
-        output_col_info["comment"] = column_description_dict.get(output_col_name)
+        output_col_info["comment"] = column_descriptions.get(output_col_name)
        if output_col_name in input_columns_names:
            matched_comment = [
                input_col_info.get("comment", "")
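
For orientation, below is a minimal usage sketch of the two helpers touched by this diff, as they might be called from a Dataiku Python recipe. The dataset names, the `add_text_length` transform and the chosen `chunksize` are illustrative assumptions, not part of the change; only `process_dataset_chunks` and `set_column_descriptions` come from the code above.

import dataiku
import pandas as pd

# Hypothetical recipe datasets; the names are placeholders, not part of this change.
input_dataset = dataiku.Dataset("input_data")
output_dataset = dataiku.Dataset("output_data")


def add_text_length(df: pd.DataFrame, column: str = "text") -> pd.DataFrame:
    """Toy chunk transform: append the character length of a text column."""
    df["text_length"] = df[column].fillna("").astype(str).str.len()
    return df


# Process the input dataset chunk by chunk, writing each processed chunk directly
# to the output dataset; `column` is forwarded to the transform via **kwargs.
process_dataset_chunks(
    input_dataset=input_dataset,
    output_dataset=output_dataset,
    func=add_text_length,
    chunksize=10000,
    column="text",
)

# Document the new column while retaining descriptions of columns that also
# exist in the input dataset.
set_column_descriptions(
    output_dataset=output_dataset,
    column_descriptions={"text_length": "Number of characters in the text column"},
    input_dataset=input_dataset,
)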