 MEASURES_IN_SOURCE = {
     "cgf": ["stunting", "wasting", "underweight"],
     "bmi": ["low_adult_bmi"],
-    "anemia": ["anemia_anemic_brinda", "anemia_mod_sev_brinda"],
+    "anemia": ["anemia"],
     "child_mortality": ["child_alive"],
 }

@@ -1231,11 +1231,20 @@ def run_training_data_prep_anemia(
     data_source_type: str,
 ) -> None:

-    # Set up logging
-    dataprep_log_path = (
-        Path(output_root) / data_source_type / "anemia" / "data_prep_log.txt"
+    # Set up logging and versioned output path
+    measure_root = Path(output_root) / data_source_type
+    os.makedirs(Path(measure_root) / "training_data", exist_ok=True, mode=0o777)
+    cm_data = ClimateMalnutritionData(measure_root)
+    version = cm_data.new_training_version()
+    output_path_version = Path(measure_root) / "training_data" / version
+    os.makedirs(
+        output_path_version,
+        exist_ok=True,
+        mode=0o777,
     )
-    os.makedirs(dataprep_log_path.parent, exist_ok=True)
+
+    # Set up logging
+    dataprep_log_path = Path(output_path_version) / "data_prep_log.txt"

     logging.basicConfig(
         level=logging.INFO,
@@ -1279,6 +1288,9 @@ def run_training_data_prep_anemia(
         ]
     ]

+    # Using anemia_anemic_brinda as outcome variable and renaming to 'anemia'
+    anemia_data = anemia_data.rename(columns={"anemia_anemic_brinda": "anemia"})
+
     anemia_data = anemia_data.rename(columns=COLUMN_NAME_TRANSLATOR)

     anemia_data["old_hh_id"] = anemia_data["hh_id"]
@@ -1365,9 +1377,7 @@ def run_training_data_prep_anemia(
     anemia_df = assign_age_group(anemia_df, indicator="anemia")
     anemia_df = anemia_df.dropna(subset=["age_group_id"])
     dropped_due_to_age = before_rows - len(anemia_df)
-    logging.info(
-        f"Dropped {dropped_due_to_age:,} rows due to age groups not found among 388, 389, 238, 34"
-    )
+    logging.info(f"Dropped {dropped_due_to_age:,} rows due to age groups not found")

     # Take out data with invalid lat and long
     before_rows = len(anemia_df)
@@ -1380,9 +1390,8 @@ def run_training_data_prep_anemia(

     # NID 275090 is a very long survey in Peru, 2003-2008 that is coded as having
     # multiple year_starts. Removing it.
-    # NID 411301 is a Zambia survey in which the prevalences end up being 0
-    # after removing data with invalid age columns, remove it.
-    problematic_nids = [275090, 411301]
+    # NID 411301 - update: not problematic for anemia
+    problematic_nids = [275090]
     before_rows = len(anemia_df)
     anemia_df = anemia_df.query("nid not in @problematic_nids")
     dropped_problematic_nids = before_rows - len(anemia_df)
@@ -1395,17 +1404,18 @@ def run_training_data_prep_anemia(

     full_data_rows = len(anemia_df) - rows_with_na_outcomes
     logging.info(
-        f"Data contains {full_data_rows:,} "
-        f"rows out of raw {len(anemia_data_raw):,}. "
-        f"Dropped data includes:\n"
-        f" - {len(missing_hh_rows):,} with missing hh_id in raw data\n"
-        f" - {dropped_too_missingness:,} that were dropped due to excessive missingness\n"
-        f' - {len(unmergable_rows):,} that further failed to merge on "nid", "ihme_loc_id", "hh_id", "psu", "year_start" variables\n'
-        f" - {dropped_due_to_age:,} due to age groups not found among 388, 389, 238, 34\n"
-        f" - {dropped_due_to_coords:,} due to invalid lat and long values\n"
-        f" - {dropped_problematic_nids:,} due to problematic NIDs\n"
-        f" - {rows_with_na_outcomes:,} with missing outcome variables ({measure_columns})\n"
+        f"Dropped {rows_with_na_outcomes:,} rows with missing outcome variables"
     )
+    logging.info(f"Data contains {full_data_rows:,} rows after cleaning")
+
+    anemia_df = anemia_df.dropna(subset=measure_columns)
+    # update variable data types
+    int_cols = [
+        "anemia",
+        "anemia_mod_sev_brinda",
+        "age_group_id",
+    ]
+    anemia_df[int_cols] = anemia_df[int_cols].astype("int")

     # Merge with climate data
     logging.info("Processing climate data...")
@@ -1424,21 +1434,11 @@ def run_training_data_prep_anemia(
         measure_df = anemia_df[anemia_df[measure].notna()].copy()
         measure_df["measure"] = measure
         measure_df["value"] = measure_df[measure]
-        measure_root = Path(output_root) / measure
-        os.makedirs(measure_root, exist_ok=True, mode=0o777)
-        os.makedirs(Path(measure_root) / "training_data", exist_ok=True, mode=0o777)
-        cm_data = ClimateMalnutritionData(measure_root)
         logging.info(
-            f"Saving data for {measure} to {measure_root} {len(measure_df)} rows"
+            f"Saving data for {measure} to {output_path_version} {len(measure_df)} rows"
         )
         for ldi_col in ["ldipc_weighted_no_match"]:  # ldi_cols:
             measure_df["ldi_pc_pd"] = measure_df[ldi_col] / 365
-            version = cm_data.new_training_version()
-            os.makedirs(
-                Path(measure_root) / "training_data" / version,
-                exist_ok=True,
-                mode=0o777,
-            )
             logging.info(
                 f"Saving data for {measure} to version {version} with {ldi_col} as LDI"
             )
@@ -1498,11 +1498,19 @@ def run_training_data_prep_child_mortality(
     output_root: str | Path, data_source_type: str, module: str
 ) -> None:

-    # Set up logging
-    dataprep_log_path = (
-        Path(output_root) / data_source_type / module / "data_prep_log.txt"
+    # Set up logging and versioned output path
+    measure_root = Path(output_root) / data_source_type
+    os.makedirs(Path(measure_root) / "training_data", exist_ok=True, mode=0o777)
+    cm_data = ClimateMalnutritionData(measure_root)
+    version = cm_data.new_training_version()
+    output_path_version = Path(measure_root) / "training_data" / version
+    os.makedirs(
+        output_path_version,
+        exist_ok=True,
+        mode=0o777,
     )
-    os.makedirs(dataprep_log_path.parent, exist_ok=True)
+
+    dataprep_log_path = Path(output_path_version) / "data_prep_log.txt"

     logging.basicConfig(
         level=logging.INFO,
@@ -1738,23 +1746,16 @@ def run_training_data_prep_child_mortality(
     # Write to output
     for measure in MEASURES_IN_SOURCE[data_source_type]:
         measure_df = df_climate[df_climate[measure].notna()].copy()
-        measure_df["measure"] = data_source_type
+        measure_df["measure"] = measure
         measure_df["value"] = measure_df[measure]
-        measure_root = Path(output_root) / data_source_type
-        os.makedirs(measure_root, exist_ok=True, mode=0o777)
-        os.makedirs(Path(measure_root) / "training_data", exist_ok=True, mode=0o777)
+        measure_root = Path(output_root) / measure
         cm_data = ClimateMalnutritionData(measure_root)
         logging.info(
             f"Saving data for {data_source_type} to {measure_root} {len(measure_df)} rows"
         )
         for ldi_col in ["ldipc_weighted_no_match"]:  # ldi_cols:
             measure_df["ldi_pc_pd"] = measure_df[ldi_col] / 365
             version = cm_data.new_training_version()
-            os.makedirs(
-                Path(measure_root) / "training_data" / version,
-                exist_ok=True,
-                mode=0o777,
-            )
             logging.info(
                 f"Saving data for {data_source_type} to version {version} with {ldi_col} as LDI"
             )