Skip to content

Commit 1f74dfb

Browse files
author
Elye Bliss
committed
temporarily saving to csv instead of parquet to avoid losing progress
1 parent 6d724bb commit 1f74dfb

File tree

1 file changed

+9
-6
lines changed

1 file changed

+9
-6
lines changed

src/rra_climate_health/data_prep/run_training_data_prep.py

Lines changed: 9 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1377,7 +1377,7 @@ def run_training_data_prep_child_mortality(
13771377
df_exploded["age_year"] = df_exploded["age_year"].astype(int)
13781378
df_exploded["age_month_at_year_end"] = df_exploded["age_month_at_year_end"].astype(int)
13791379
# override int_year, used to get climate vars
1380-
df_exploded.rename(columns={"years_to_expand": "int_year"}, inplace=True)
1380+
df_exploded["int_year"] = df_exploded["years_to_expand"].astype(int)
13811381

13821382
# for rows with child_alive==0, replace with child_alive=1 if int_year < year_of_recorded_age
13831383
df_exploded["child_alive"] = df_exploded["child_alive"].astype(int)
@@ -1412,7 +1412,7 @@ def run_training_data_prep_child_mortality(
14121412
logging.info("Processing climate data...")
14131413
climate_vars = get_climate_vars_for_dataframe(df_exploded)
14141414
df_climate = merge_left_without_inflating(df_exploded, climate_vars, on=["int_year", "lat", "long"])
1415-
1415+
14161416
logging.info("Adding elevation data...")
14171417
df_climate = get_elevation_for_dataframe(df_climate)
14181418

@@ -1421,15 +1421,18 @@ def run_training_data_prep_child_mortality(
14211421
#Write to output
14221422
for measure in MEASURES_IN_SOURCE[data_source_type]:
14231423
measure_df = df_climate[df_climate[measure].notna()].copy()
1424-
measure_df["measure"] = measure
1424+
measure_df["measure"] = data_source_type
14251425
measure_df["value"] = measure_df[measure]
1426-
measure_root = Path(output_root) / measure
1426+
measure_root = Path(output_root) / data_source_type
1427+
os.makedirs(measure_root, exist_ok=True,mode=0o777)
1428+
os.makedirs(Path(measure_root) / "training_data", exist_ok=True,mode=0o777)
14271429
cm_data = ClimateMalnutritionData(measure_root)
1428-
logging.info(f"Saving data for {measure} to {measure_root} {len(measure_df)} rows")
1430+
logging.info(f"Saving data for {data_source_type} to {measure_root} {len(measure_df)} rows")
14291431
for ldi_col in ['ldipc_weighted_no_match']: #ldi_cols:
14301432
measure_df['ldi_pc_pd'] = measure_df[ldi_col] / 365
14311433
version = cm_data.new_training_version()
1432-
logging.info(f"Saving data for {measure} to version {version} with {ldi_col} as LDI")
1434+
os.makedirs(Path(measure_root) / "training_data" / version, exist_ok=True, mode=0o777)
1435+
logging.info(f"Saving data for {data_source_type} to version {version} with {ldi_col} as LDI")
14331436
cm_data.save_training_data(measure_df, version)
14341437
message = "Used " + ldi_col + " as LDI"
14351438
# Save a small file with a record of which ldi column was used for this version

0 commit comments

Comments
 (0)