Skip to content

Commit 194cd94

Browse files
author
Elye Bliss
committed
cleaning output path for anemia
1 parent 12465bd commit 194cd94

File tree

1 file changed

+46
-45
lines changed

1 file changed

+46
-45
lines changed

src/rra_climate_health/data_prep/run_training_data_prep.py

Lines changed: 46 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@
8080
MEASURES_IN_SOURCE = {
8181
"cgf": ["stunting", "wasting", "underweight"],
8282
"bmi": ["low_adult_bmi"],
83-
"anemia": ["anemia_anemic_brinda", "anemia_mod_sev_brinda"],
83+
"anemia": ["anemia"],
8484
"child_mortality": ["child_alive"],
8585
}
8686

@@ -1231,11 +1231,20 @@ def run_training_data_prep_anemia(
12311231
data_source_type: str,
12321232
) -> None:
12331233

1234-
# Set up logging
1235-
dataprep_log_path = (
1236-
Path(output_root) / data_source_type / "anemia" / "data_prep_log.txt"
1234+
# Set up logging and versioned output path
1235+
measure_root = Path(output_root) / data_source_type
1236+
os.makedirs(Path(measure_root) / "training_data", exist_ok=True, mode=0o777)
1237+
cm_data = ClimateMalnutritionData(measure_root)
1238+
version = cm_data.new_training_version()
1239+
output_path_version = Path(measure_root) / "training_data" / version
1240+
os.makedirs(
1241+
output_path_version,
1242+
exist_ok=True,
1243+
mode=0o777,
12371244
)
1238-
os.makedirs(dataprep_log_path.parent, exist_ok=True)
1245+
1246+
# Set up logging
1247+
dataprep_log_path = Path(output_path_version) / "data_prep_log.txt"
12391248

12401249
logging.basicConfig(
12411250
level=logging.INFO,
@@ -1279,6 +1288,9 @@ def run_training_data_prep_anemia(
12791288
]
12801289
]
12811290

1291+
# Using anemia_anemic_brinda as the outcome variable and renaming it to 'anemia'
1292+
anemia_data = anemia_data.rename(columns={"anemia_anemic_brinda": "anemia"})
1293+
12821294
anemia_data = anemia_data.rename(columns=COLUMN_NAME_TRANSLATOR)
12831295

12841296
anemia_data["old_hh_id"] = anemia_data["hh_id"]
@@ -1365,9 +1377,7 @@ def run_training_data_prep_anemia(
13651377
anemia_df = assign_age_group(anemia_df, indicator="anemia")
13661378
anemia_df = anemia_df.dropna(subset=["age_group_id"])
13671379
dropped_due_to_age = before_rows - len(anemia_df)
1368-
logging.info(
1369-
f"Dropped {dropped_due_to_age:,} rows due to age groups not found among 388, 389, 238, 34"
1370-
)
1380+
logging.info(f"Dropped {dropped_due_to_age:,} rows due to age groups not found")
13711381

13721382
# Take out data with invalid lat and long
13731383
before_rows = len(anemia_df)
@@ -1380,9 +1390,8 @@ def run_training_data_prep_anemia(
13801390

13811391
# NID 275090 is a very long survey in Peru, 2003-2008 that is coded as having
13821392
# multiple year_starts. Removing it.
1383-
# NID 411301 is a Zambia survey in which the prevalences end up being 0
1384-
# after removing data with invalid age columns, remove it.
1385-
problematic_nids = [275090, 411301]
1393+
# NID 411301 was previously excluded here as well, but it is not problematic for anemia, so it is kept.
1394+
problematic_nids = [275090]
13861395
before_rows = len(anemia_df)
13871396
anemia_df = anemia_df.query("nid not in @problematic_nids")
13881397
dropped_problematic_nids = before_rows - len(anemia_df)
@@ -1395,17 +1404,18 @@ def run_training_data_prep_anemia(
13951404

13961405
full_data_rows = len(anemia_df) - rows_with_na_outcomes
13971406
logging.info(
1398-
f"Data contains {full_data_rows:,} "
1399-
f"rows out of raw {len(anemia_data_raw):,}. "
1400-
f"Dropped data includes:\n"
1401-
f" - {len(missing_hh_rows):,} with missing hh_id in raw data\n"
1402-
f" - {dropped_too_missingness:,} that were dropped due to excessive missingness\n"
1403-
f' - {len(unmergable_rows):,} that further failed to merge on "nid", "ihme_loc_id", "hh_id", "psu", "year_start" variables\n'
1404-
f" - {dropped_due_to_age:,} due to age groups not found among 388, 389, 238, 34\n"
1405-
f" - {dropped_due_to_coords:,} due to invalid lat and long values\n"
1406-
f" - {dropped_problematic_nids:,} due to problematic NIDs\n"
1407-
f" - {rows_with_na_outcomes:,} with missing outcome variables ({measure_columns})\n"
1407+
f"Dropped {rows_with_na_outcomes:,} rows with missing outcome variables"
14081408
)
1409+
logging.info(f"Data contains {full_data_rows:,} rows after cleaning")
1410+
1411+
anemia_df = anemia_df.dropna(subset=measure_columns)
1412+
# update variable data types
1413+
int_cols = [
1414+
"anemia",
1415+
"anemia_mod_sev_brinda",
1416+
"age_group_id",
1417+
]
1418+
anemia_df[int_cols] = anemia_df[int_cols].astype("int")
14091419

14101420
# Merge with climate data
14111421
logging.info("Processing climate data...")
@@ -1424,21 +1434,11 @@ def run_training_data_prep_anemia(
14241434
measure_df = anemia_df[anemia_df[measure].notna()].copy()
14251435
measure_df["measure"] = measure
14261436
measure_df["value"] = measure_df[measure]
1427-
measure_root = Path(output_root) / measure
1428-
os.makedirs(measure_root, exist_ok=True, mode=0o777)
1429-
os.makedirs(Path(measure_root) / "training_data", exist_ok=True, mode=0o777)
1430-
cm_data = ClimateMalnutritionData(measure_root)
14311437
logging.info(
1432-
f"Saving data for {measure} to {measure_root} {len(measure_df)} rows"
1438+
f"Saving data for {measure} to {output_path_version} {len(measure_df)} rows"
14331439
)
14341440
for ldi_col in ["ldipc_weighted_no_match"]: # ldi_cols:
14351441
measure_df["ldi_pc_pd"] = measure_df[ldi_col] / 365
1436-
version = cm_data.new_training_version()
1437-
os.makedirs(
1438-
Path(measure_root) / "training_data" / version,
1439-
exist_ok=True,
1440-
mode=0o777,
1441-
)
14421442
logging.info(
14431443
f"Saving data for {measure} to version {version} with {ldi_col} as LDI"
14441444
)
@@ -1498,11 +1498,19 @@ def run_training_data_prep_child_mortality(
14981498
output_root: str | Path, data_source_type: str, module: str
14991499
) -> None:
15001500

1501-
# Set up logging
1502-
dataprep_log_path = (
1503-
Path(output_root) / data_source_type / module / "data_prep_log.txt"
1501+
# Set up logging and versioned output path
1502+
measure_root = Path(output_root) / data_source_type
1503+
os.makedirs(Path(measure_root) / "training_data", exist_ok=True, mode=0o777)
1504+
cm_data = ClimateMalnutritionData(measure_root)
1505+
version = cm_data.new_training_version()
1506+
output_path_version = Path(measure_root) / "training_data" / version
1507+
os.makedirs(
1508+
output_path_version,
1509+
exist_ok=True,
1510+
mode=0o777,
15041511
)
1505-
os.makedirs(dataprep_log_path.parent, exist_ok=True)
1512+
1513+
dataprep_log_path = Path(output_path_version) / "data_prep_log.txt"
15061514

15071515
logging.basicConfig(
15081516
level=logging.INFO,
@@ -1738,23 +1746,16 @@ def run_training_data_prep_child_mortality(
17381746
# Write to output
17391747
for measure in MEASURES_IN_SOURCE[data_source_type]:
17401748
measure_df = df_climate[df_climate[measure].notna()].copy()
1741-
measure_df["measure"] = data_source_type
1749+
measure_df["measure"] = measure
17421750
measure_df["value"] = measure_df[measure]
1743-
measure_root = Path(output_root) / data_source_type
1744-
os.makedirs(measure_root, exist_ok=True, mode=0o777)
1745-
os.makedirs(Path(measure_root) / "training_data", exist_ok=True, mode=0o777)
1751+
measure_root = Path(output_root) / measure
17461752
cm_data = ClimateMalnutritionData(measure_root)
17471753
logging.info(
17481754
f"Saving data for {data_source_type} to {measure_root} {len(measure_df)} rows"
17491755
)
17501756
for ldi_col in ["ldipc_weighted_no_match"]: # ldi_cols:
17511757
measure_df["ldi_pc_pd"] = measure_df[ldi_col] / 365
17521758
version = cm_data.new_training_version()
1753-
os.makedirs(
1754-
Path(measure_root) / "training_data" / version,
1755-
exist_ok=True,
1756-
mode=0o777,
1757-
)
17581759
logging.info(
17591760
f"Saving data for {data_source_type} to version {version} with {ldi_col} as LDI"
17601761
)

0 commit comments

Comments
 (0)