37
37
"/mnt/team/integrated_analytics/pub/goalkeepers/goalkeepers_2024/data"
38
38
)
39
39
40
- # ANEMIA_ROOT = Path(
41
- # "/mnt/team/rapidresponse/pub/population/modeling/climate_malnutrition/input/extractions/anemia/"
42
- # )
43
-
44
40
ANEMIA_ROOT = Path (
45
- "/mnt/team/integrated_analytics /pub/goalkeepers/goalkeepers_2024/data "
41
+ "/mnt/team/rapidresponse /pub/population/modeling/climate_malnutrition/input/extractions/anemia/ "
46
42
)
47
43
48
44
SDI_PATH = Path ("/mnt/share/forecasting/data/7/past/sdi/20240531_gk24/sdi.nc" )
59
55
"DHS" : WEALTH_DATA_ROOT / "DHS_wealth.parquet" ,
60
56
"MICS" : WEALTH_DATA_ROOT / "MICS_wealth.parquet" ,
61
57
},
62
- "anemia" : ANEMIA_ROOT / "anemia" / "anemia_combined_wealth_rex.csv" ,
63
- # "anemia": ANEMIA_ROOT / "anemia_extracts_compiled_09_02_2025.csv",
58
+ "anemia" : ANEMIA_ROOT / "anemia_extracts_compiled_09_02_2025.csv" ,
64
59
}
65
60
66
61
DATA_SOURCE_TYPE = {"stunting" : "cgf" , "wasting" : "cgf" , "underweight" :"cgf" , "low_adult_bmi" : "bmi" ,"anemia" :"anemia" }
@@ -414,9 +409,7 @@ def ldi_to_wealth(x):
414
409
"ldipc_weighted_match" ,
415
410
]
416
411
if asset_df_ldipc [new_cols ].isna ().sum ().sum () != 0 :
417
- # raise RuntimeError("Null LDI-PC values in one of the methods.") temporarily commenting
418
- print ("Null LDI-PC values in one of the methods." )
419
- asset_df_ldipc = asset_df_ldipc [~ asset_df_ldipc [new_cols ].isna ().any (axis = 1 )]
412
+ raise RuntimeError ("Null LDI-PC values in one of the methods." )
420
413
if len (asset_df_ldipc ) != len (asset_df ):
421
414
raise RuntimeError ("Mismatch in length of asset data and LDI-PC data." )
422
415
return asset_df_ldipc # type: ignore[no-any-return]
@@ -659,6 +652,8 @@ def run_training_data_prep(output_root: str, source_type: str) -> None:
659
652
run_training_data_prep_main (output_root , source_type )
660
653
661
654
def clean_hh_id (row ):
655
+ if pd .isna (row ['hh_id' ]):
656
+ return row ['hh_id' ]
662
657
hh_id_str = str (row ['hh_id' ])
663
658
geo_str = str (row ['geospatial_id' ])
664
659
# Match one or more leading zeros followed by the geospatial_id at the start
@@ -670,6 +665,7 @@ def clean_hh_id(row):
670
665
return cleaned
671
666
672
667
668
+
673
669
output_root = DEFAULT_ROOT
674
670
data_source_type = "cgf"
675
671
def run_training_data_prep_main ( # noqa: PLR0915
@@ -1017,46 +1013,139 @@ def run_training_data_prep_anemia(
1017
1013
loc_meta = pd .read_parquet (paths .FHS_LOCATION_METADATA_FILEPATH )
1018
1014
1019
1015
anemia_data_raw = pd .read_csv (
1020
- survey_data_path ,
1021
- dtype = {"hh_id" : str , "year_start" : int , "int_year" : int , "year_end" : int },
1016
+ survey_data_path
1022
1017
)
1023
1018
1024
- anemia_data_raw = anemia_data_raw .rename (columns = COLUMN_NAME_TRANSLATOR )
1025
-
1026
1019
anemia_data = anemia_data_raw [
1027
1020
[
1028
1021
"nid" ,
1029
1022
"ihme_loc_id" ,
1023
+ 'survey_name' ,
1030
1024
"year_start" ,
1031
1025
"year_end" ,
1032
- # "geospatial_id", # not found in 2024 version
1033
- "psu" ,
1026
+ 'urban' ,
1027
+ "geospatial_id" ,
1028
+ "psu_id" ,
1034
1029
"pweight" ,
1035
1030
"strata" ,
1036
1031
"hh_id" ,
1037
- "urban " ,
1032
+ "line_id " ,
1038
1033
"sex_id" ,
1039
1034
"age_year" ,
1040
- # "age_month", # not found in 2024 version
1035
+ "age_month" ,
1041
1036
"int_year" ,
1042
- "int_month" ,
1043
- "brinda_adj_hemog" , # likely to include more anemia vars
1044
- "who_adj_hemog" ,
1045
- "wealth_index_dhs" , # not found in extractions data
1046
- "lat" ,
1047
- "long" ,
1037
+ 'anemia_anemic_brinda' ,
1038
+ 'anemia_mod_sev_brinda' ,
1039
+ 'latnum' ,
1040
+ 'longnum' ,
1048
1041
]
1049
1042
]
1050
1043
1044
+ anemia_data = anemia_data .rename (columns = COLUMN_NAME_TRANSLATOR )
1045
+
1046
+ anemia_data ["old_hh_id" ] = anemia_data ["hh_id" ]
1047
+
1048
+
1049
def clean_hh_id(row):
    """Normalize a household id by stripping a zero-padded geospatial prefix.

    Missing ids (NaN) pass through unchanged. Otherwise a leading run of
    zeros followed by the row's geospatial_id is removed, then any remaining
    leading zeros; an id that becomes empty collapses to '0'.
    """
    raw = row['hh_id_v2']
    if pd.isna(raw):
        return raw
    hh_text = str(raw)
    geo_text = str(row['geospatial_id'])
    # Drop a zero-padded geospatial_id prefix, if one is present.
    without_prefix = re.sub(r'^0+' + re.escape(geo_text), '', hh_text)
    # Collapse any remaining zero padding; an all-zero id becomes '0'.
    return without_prefix.lstrip('0') or '0'
1061
+
1062
def clean_hh_id_anemia(row):
    """Normalize an anemia-survey household id to a float.

    Missing ids (NaN) pass through unchanged. Ids with no space or
    underscore separator are cast to float directly (NOTE(review): this
    path skips the geospatial-prefix strip — presumably clean ids carry no
    prefix; confirm against the extraction data). Otherwise only the final
    space-/underscore-delimited segment is kept, a zero-padded
    geospatial_id prefix and remaining leading zeros are removed, and the
    result is cast to float (float so NAs can coexist in the column).
    """
    raw = row['hh_id']
    geo_text = str(row['geospatial_id'])

    if pd.isna(raw):
        return raw

    # Normalize whitespace: trim the ends, squeeze runs of spaces to one.
    token = str(raw).strip()
    token = re.sub(r'\s{2,}', ' ', token)

    # Ids without separators are already clean -- cast and return.
    if len(re.split(r"[_ ]", token)) == 1:
        return float(token)

    # Keep only the last space- or underscore-delimited segment.
    if " " in token:
        token = token.split(" ")[-1]
    elif "_" in token:
        token = token.split("_")[-1]

    # Strip a zero-padded geospatial_id prefix, then leftover zeros;
    # an all-zero id collapses to '0'.
    token = re.sub(r'^0+' + re.escape(geo_text), '', token)
    token = token.lstrip('0') or '0'

    return float(token)  # Return as float to handle NAs
1093
+
1094
+ anemia_data ["hh_id" ] = anemia_data ["hh_id" ].apply (clean_hh_id_anemia )
1095
+
1096
+ assert len (anemia_data [anemia_data ["hh_id" ].isna ()]) == len (anemia_data [anemia_data ["old_hh_id" ].isna ()]), "NAs introduced by cleaning"
1097
+
1098
+ # anemia_data["hh_id_v2"] = anemia_data["old_hh_id"].str.split(r"[_ ]").str[-1]
1099
+
1100
+ # anemia_data[(anemia_data["hh_id"].isna()) & (anemia_data["hh_id_v2"].notna())]
1101
+ # anemia_data[(anemia_data["hh_id"].notna()) & (anemia_data["hh_id_v2"].isna())]
1102
+
1103
+ # anemia_data["psu"] = anemia_data["psu"].astype(int)
1104
+ # anemia_data["hh_id_v2"] = anemia_data.apply(clean_hh_id, axis=1)
1105
+
1106
+ # len(anemia_data[anemia_data["hh_id_v2"].isna()])
1107
+ # len(anemia_data[anemia_data["hh_id"].isna()])
1108
+ # len(anemia_data[anemia_data["old_hh_id"].isna()])
1109
+
1110
+ # anemia_data = anemia_data[~anemia_data["hh_id"].isna()]
1111
+ # anemia_data["hh_id"] = anemia_data["hh_id"].astype(int)
1112
+ # anemia_data["hh_id"] = anemia_data["hh_id"].astype(str)
1113
+ # anemia_data[(anemia_data["hh_id"]!=anemia_data["hh_id_v2"])][["psu","geospatial_id","hh_id","hh_id_v2","old_hh_id"]]
1114
+
1115
+ # anemia_data[(anemia_data["hh_id"].isna())&(~anemia_data["old_hh_id"].isna())]
1116
+
1051
1117
# Take away NIDs without wealth information, to see later if we can add it from second source
1052
1118
print (len (anemia_data ))
1053
1119
anemia_data_nids_without_wealth = anemia_data [
1054
1120
anemia_data .wealth_index_dhs .isna ()
1055
1121
].nid .unique ()
1056
1122
1057
- lsms_wealth_data = get_LSMS_wealth_dataset ()
1058
- dhs_wealth_data = get_DHS_wealth_dataset ()
1059
- mics_wealth_data = get_MICS_wealth_dataset ()
1123
+ # Prepping wealth dataset
1124
+ dhs_wealth_data_raw = get_DHS_wealth_dataset ()
1125
+ dhs_wealth_data = dhs_wealth_data_raw .copy ()
1126
+
1127
+ dhs_wealth_data ["old_hh_id" ] = dhs_wealth_data ["hh_id" ]
1128
+ dhs_wealth_data ["hh_id" ] = dhs_wealth_data .apply (clean_hh_id_anemia , axis = 1 )
1129
+
1130
+ assert len (dhs_wealth_data [dhs_wealth_data ["hh_id" ].isna ()]) == len (dhs_wealth_data [dhs_wealth_data ["old_hh_id" ].isna ()]), "NAs introduced by cleaning"
1131
+
1132
+
1133
+ dhs_wealth_data ["hh_id_v2" ] = dhs_wealth_data ["old_hh_id" ].str .split (r"[_ ]" ).str [- 1 ]
1134
+ dhs_wealth_data ["psu" ] = dhs_wealth_data ["psu" ].astype (int )
1135
+ dhs_wealth_data ["psu" ] = dhs_wealth_data ["psu" ].astype (str )
1136
+ dhs_wealth_data ["hh_id_v2" ] = dhs_wealth_data .apply (clean_hh_id , axis = 1 )
1137
+
1138
+ assert len (dhs_wealth_data [dhs_wealth_data ["hh_id_v2" ].isna ()]) == len (dhs_wealth_data [dhs_wealth_data ["old_hh_id" ].isna ()]), "NAs introduced by cleaning"
1139
+
1140
+ # dhs_wealth_data["hh_id"] = dhs_wealth_data["hh_id"].astype(int)
1141
+ # dhs_wealth_data["hh_id"] = dhs_wealth_data["hh_id"].astype(str)
1142
+
1143
+
1144
+ # dhs_wealth_data[(dhs_wealth_data["hh_id"]!=dhs_wealth_data["hh_id_v2"])&(dhs_wealth_data["psu"]!=dhs_wealth_data["geospatial_id"])]
1145
+
1146
+ dhs_wealth_data ["hh_id" ] = dhs_wealth_data ["hh_id" ].astype (int )
1147
+ dhs_wealth_data ["hh_id_v1" ] = dhs_wealth_data ["hh_id_v1" ].astype (int )
1148
+ dhs_wealth_data [dhs_wealth_data ["hh_id" ]!= dhs_wealth_data ["hh_id_v1" ]][["psu" ,"hh_id" ,"hh_id_v1" ,"old_hh_id" ]]
1060
1149
1061
1150
wealth_nids = set (dhs_wealth_data .nid .unique ()) | set (mics_wealth_data .nid .unique ()) | set (lsms_wealth_data .nid .unique ())
1062
1151
anemia_nids = set (anemia_data .nid .unique ())
@@ -1065,7 +1154,23 @@ def run_training_data_prep_anemia(
1065
1154
nid_with_wealth_pc = 100 * len (common_nids )/ len (anemia_nids )
1066
1155
print (f"{ nid_with_wealth_pc :.1f} % of anemia NIDs - { len (common_nids )} out of { len (anemia_nids )} in wealth data NIDs" )
1067
1156
1068
-
1157
+ # Find out percent of anemia nids and hh_ids that can be matched in wealth data
1158
+ dhs_wealth_data ['hh_id' ] = dhs_wealth_data ['hh_id' ].apply (clean_dhs_wealth_hh_id )
1159
+ dhs_wealth_data ['psu' ] = dhs_wealth_data ['psu' ].astype (int )
1160
+ anemia_data_hhs = anemia_data [["nid" ,"hh_id" ]].drop_duplicates ()
1161
+ dhs_wealth_data_hhs = dhs_wealth_data [["nid" ,"hh_id" ]].drop_duplicates ()
1162
+ dhs_wealth_data_hhs ["dhs_wealth_data" ] = True
1163
+ merged_hhs = anemia_data_hhs .merge (dhs_wealth_data_hhs , on = ["nid" ,"hh_id" ], how = "left" )
1164
+ merged_hhs ["dhs_wealth_data" ].fillna (False , inplace = True )
1165
+ merged_hhs ["dhs_wealth_data" ].value_counts ()
1166
+
1167
+
1168
+ # remove rows with missing hh_id
1169
+ anemia_data = anemia_data [anemia_data ["hh_id" ].notna ()]
1170
+
1171
+ anemia_data .drop (columns = ["old_hh_id" ],inplace = True )
1172
+ anemia_data ["hh_id" ] = anemia_data ["hh_id" ].astype (int )
1173
+ anemia_data ["psu" ] = anemia_data ["psu" ].astype (int )
1069
1174
1070
1175
# Subset to common nids
1071
1176
lsms_wealth_data = lsms_wealth_data .query ('nid in @common_nids' )
0 commit comments