Skip to content

Commit d18e822

Browse files
author
Elye Bliss
committed
removing NID 411301 filter
1 parent 194cd94 commit d18e822

File tree

1 file changed

+13
-13
lines changed

1 file changed

+13
-13
lines changed

src/rra_climate_health/data_prep/run_training_data_prep.py

Lines changed: 13 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -1562,11 +1562,15 @@ def run_training_data_prep_child_mortality(
15621562
), "NAs introduced by cleaning"
15631563
df.drop(columns=["old_hh_id"], inplace=True)
15641564

1565-
df["nid"] = df["nid"].astype(int)
1566-
df["psu"] = df["psu"].astype(int)
1567-
df["hh_id"] = df["hh_id"].astype(int)
1568-
df["strata"] = df["strata"].astype(int)
1569-
df["geospatial_id"] = df["geospatial_id"].astype(int)
1565+
# update variable data types
1566+
int_cols = [
1567+
"nid",
1568+
"psu",
1569+
"hh_id",
1570+
"strata",
1571+
"geospatial_id",
1572+
]
1573+
df[int_cols] = df[int_cols].astype("int")
15701574

15711575
# Prepping wealth dataset
15721576
dhs_wealth_data_raw = get_DHS_wealth_dataset()
@@ -1713,9 +1717,8 @@ def run_training_data_prep_child_mortality(
17131717

17141718
# NID 275090 is a very long survey in Peru, 2003-2008 that is coded as having
17151719
# multiple year_starts. Removing it.
1716-
# NID 411301 is a Zambia survey in which the prevalences end up being 0
1717-
# after removing data with invalid age columns, remove it.
1718-
problematic_nids = [275090, 411301]
1720+
# NID 411301 is no longer filtered out (updated): it is not in the BR data extractions.
1721+
problematic_nids = [275090]
17191722
before_rows = len(df_exploded)
17201723
df_exploded = df_exploded.query("nid not in @problematic_nids")
17211724
dropped_problematic_nids = before_rows - len(df_exploded)
@@ -1748,16 +1751,13 @@ def run_training_data_prep_child_mortality(
17481751
measure_df = df_climate[df_climate[measure].notna()].copy()
17491752
measure_df["measure"] = measure
17501753
measure_df["value"] = measure_df[measure]
1751-
measure_root = Path(output_root) / measure
1752-
cm_data = ClimateMalnutritionData(measure_root)
17531754
logging.info(
1754-
f"Saving data for {data_source_type} to {measure_root} {len(measure_df)} rows"
1755+
f"Saving data for {measure} to {output_path_version} {len(measure_df)} rows"
17551756
)
17561757
for ldi_col in ["ldipc_weighted_no_match"]: # ldi_cols:
17571758
measure_df["ldi_pc_pd"] = measure_df[ldi_col] / 365
1758-
version = cm_data.new_training_version()
17591759
logging.info(
1760-
f"Saving data for {data_source_type} to version {version} with {ldi_col} as LDI"
1760+
f"Saving data for {measure} to version {version} with {ldi_col} as LDI"
17611761
)
17621762
cm_data.save_training_data(measure_df, version)
17631763
message = "Used " + ldi_col + " as LDI"

0 commit comments

Comments
 (0)