74
74
75
75
from mtpy .processing .run_summary import RunSummary
76
76
from mtpy .processing import (
77
- ADDED_KERNEL_DATASET_COLUMNS ,
78
- KERNEL_DATASET_COLUMNS ,
77
+ KERNEL_DATASET_DTYPE ,
79
78
MINI_SUMMARY_COLUMNS ,
80
79
)
81
80
@@ -183,16 +182,7 @@ def df(self, value):
183
182
184
183
raise TypeError (msg )
185
184
186
- need_columns = []
187
- for col in KERNEL_DATASET_COLUMNS :
188
- if not col in value .columns :
189
- need_columns .append (col )
190
- if need_columns :
191
- msg = f"DataFrame needs columns { ', ' .join (need_columns )} "
192
- logger .error (msg )
193
- raise ValueError (msg )
194
-
195
- self ._df = self ._set_datetime_columns (value )
185
+ self ._df = self ._set_datetime_columns (self ._add_columns (value ))
196
186
197
187
def _set_datetime_columns (self , df ):
198
188
"""
@@ -216,6 +206,28 @@ def clone_dataframe(self) -> pd.DataFrame:
216
206
"""return a deep copy of dataframe"""
217
207
return copy .deepcopy (self .df )
218
208
209
def _add_columns(self, df):
    """
    Add any missing kernel dataset columns to a dataframe.

    Walks the ``(column, dtype)`` pairs in ``KERNEL_DATASET_DTYPE`` and
    creates every column that ``df`` does not already have.  The dataframe
    is modified in place and is also returned.

    Parameters
    ----------
    df: dataframe to check; missing columns are added to it in place.

    Returns
    -------
    df: the same dataframe object, now holding every column named in
        ``KERNEL_DATASET_DTYPE``.

    Raises
    ------
    ValueError
        If one of ``survey``, ``station``, ``run``, ``start`` or ``end``
        is missing; these cannot be given a default and must be supplied
        by the caller.
    """
    required_columns = ("survey", "station", "run", "start", "end")

    for col, dtype in KERNEL_DATASET_DTYPE:
        if col in df.columns:
            continue
        if col in required_columns:
            raise ValueError(
                f"{col} must be a filled column in the dataframe"
            )
        # The dtype must be compared by identity: ``isinstance(dtype, object)``
        # is True for every Python value, which would fill every new column
        # with None and never use the typed default below.
        # NOTE(review): entries that are not callable (e.g. a dtype given as
        # a string) cannot build a default value and also fall back to None;
        # confirm against the definition of KERNEL_DATASET_DTYPE.
        if dtype is object or not callable(dtype):
            df[col] = None
        else:
            # typed "zero" default, e.g. bool -> False, float -> 0.0
            df[col] = dtype(0)
        logger.warning(
            f"KernelDataset DataFrame needs column {col}, adding "
            f"and setting dtype to {dtype}."
        )
    return df
230
+
219
231
def from_run_summary (
220
232
self ,
221
233
run_summary : RunSummary ,
@@ -254,18 +266,14 @@ def from_run_summary(
254
266
raise ValueError (msg )
255
267
256
268
# add any missing kernel dataset columns
257
- for col in ADDED_KERNEL_DATASET_COLUMNS :
258
- df [col ] = None
259
-
260
- df ["fc" ] = False
269
+ df = self ._add_columns (df )
261
270
262
271
# set remote reference
263
- df ["remote" ] = False
264
272
if remote_station_id :
265
273
cond = df .station == remote_station_id
266
274
df .remote = cond
267
275
268
- # be sure to set date time columns
276
+ # be sure to set date time columns and restrict to simultaneous runs
269
277
df = self ._set_datetime_columns (df )
270
278
if remote_station_id :
271
279
df = self .restrict_run_intervals_to_simultaneous (df )
@@ -286,11 +294,6 @@ def mini_summary(self) -> pd.DataFrame:
286
294
"""return a dataframe that fits in terminal"""
287
295
return self .df [self ._mini_summary_columns ]
288
296
289
- @property
290
- def print_mini_summary (self ) -> None :
291
- """prints a dataframe that (hopefully) fits in terminal"""
292
- logger .info (self .mini_summary )
293
-
294
297
@property
295
298
def local_survey_id (self ) -> str :
296
299
"""return string label for local survey id"""
@@ -523,6 +526,8 @@ def get_station_metadata(self, local_station_id: str):
523
526
assert len (run_ids ) == len (sub_df )
524
527
525
528
# iterate over these runs, packing metadata into
529
+ # get run metadata from the group object instead of loading the runTS
530
+ # object, should be much faster.
526
531
station_metadata = None
527
532
for i , row in sub_df .iterrows ():
528
533
local_run_obj = self .get_run_object (row )
@@ -533,6 +538,34 @@ def get_station_metadata(self, local_station_id: str):
533
538
station_metadata .add_run (run_metadata )
534
539
return station_metadata
535
540
541
def get_run_object(
    self, index_or_row: Union[int, pd.Series]
) -> mt_metadata.timeseries.Run:
    """
    Gets the run object associated with a row of the df

    Development Notes:
    TODO: This appears to be unused except by get_station_metadata.
     Delete or integrate if desired.
     - This has likely been deprecated by direct calls to
     run_obj = row.mth5_obj.from_reference(row.run_reference) in pipelines.

    Parameters
    ----------
    index_or_row: integer label of a row in self.df (python or numpy
        integer), or the row itself as a pd.Series.  A row must expose
        the ``mth5_obj`` and ``run_reference`` entries.

    Returns
    -------
    run_obj: mt_metadata.timeseries.Run
        The run associated with the row of the df, i.e. whatever
        ``row.mth5_obj.from_reference(row.run_reference)`` returns.
    """
    # Function-scope import keeps this block self-contained.
    import numbers

    # numbers.Integral accepts numpy integers (e.g. the values obtained
    # from ``df.index[i]``) as well as python ints; a plain ``int`` check
    # would treat those labels as rows and fail on ``.mth5_obj``.
    if isinstance(index_or_row, numbers.Integral):
        row = self.df.loc[index_or_row]
    else:
        row = index_or_row
    return row.mth5_obj.from_reference(row.run_reference)
568
+
536
569
@property
537
570
def num_sample_rates (self ) -> int :
538
571
"""returns the number of unique sample rates in the dataframe"""
0 commit comments