@@ -1562,11 +1562,15 @@ def run_training_data_prep_child_mortality(
1562
1562
), "NAs introduced by cleaning"
1563
1563
df .drop (columns = ["old_hh_id" ], inplace = True )
1564
1564
1565
- df ["nid" ] = df ["nid" ].astype (int )
1566
- df ["psu" ] = df ["psu" ].astype (int )
1567
- df ["hh_id" ] = df ["hh_id" ].astype (int )
1568
- df ["strata" ] = df ["strata" ].astype (int )
1569
- df ["geospatial_id" ] = df ["geospatial_id" ].astype (int )
1565
+ # Cast the columns listed in int_cols to integer dtype
1566
+ int_cols = [
1567
+ "nid" ,
1568
+ "psu" ,
1569
+ "hh_id" ,
1570
+ "strata" ,
1571
+ "geospatial_id" ,
1572
+ ]
1573
+ df [int_cols ] = df [int_cols ].astype ("int" )
1570
1574
1571
1575
# Prepping wealth dataset
1572
1576
dhs_wealth_data_raw = get_DHS_wealth_dataset ()
@@ -1713,9 +1717,8 @@ def run_training_data_prep_child_mortality(
1713
1717
1714
1718
# NID 275090 is a very long survey in Peru, 2003-2008 that is coded as having
1715
1719
# multiple year_starts. Removing it.
1716
- # NID 411301 is a Zambia survey in which the prevalences end up being 0
1717
- # after removing data with invalid age columns, remove it.
1718
- problematic_nids = [275090 , 411301 ]
1720
+ # NID 411301 is no longer listed here because it is not in the BR data extractions.
1721
+ problematic_nids = [275090 ]
1719
1722
before_rows = len (df_exploded )
1720
1723
df_exploded = df_exploded .query ("nid not in @problematic_nids" )
1721
1724
dropped_problematic_nids = before_rows - len (df_exploded )
@@ -1748,16 +1751,13 @@ def run_training_data_prep_child_mortality(
1748
1751
measure_df = df_climate [df_climate [measure ].notna ()].copy ()
1749
1752
measure_df ["measure" ] = measure
1750
1753
measure_df ["value" ] = measure_df [measure ]
1751
- measure_root = Path (output_root ) / measure
1752
- cm_data = ClimateMalnutritionData (measure_root )
1753
1754
logging .info (
1754
- f"Saving data for { data_source_type } to { measure_root } { len (measure_df )} rows"
1755
+ f"Saving data for { measure } to { output_path_version } { len (measure_df )} rows"
1755
1756
)
1756
1757
for ldi_col in ["ldipc_weighted_no_match" ]: # ldi_cols:
1757
1758
measure_df ["ldi_pc_pd" ] = measure_df [ldi_col ] / 365
1758
- version = cm_data .new_training_version ()
1759
1759
logging .info (
1760
- f"Saving data for { data_source_type } to version { version } with { ldi_col } as LDI"
1760
+ f"Saving data for { measure } to version { version } with { ldi_col } as LDI"
1761
1761
)
1762
1762
cm_data .save_training_data (measure_df , version )
1763
1763
message = "Used " + ldi_col + " as LDI"
0 commit comments