@@ -664,7 +664,37 @@ def clean_hh_id(row):
664
664
cleaned = cleaned .lstrip ('0' ) or '0'
665
665
return cleaned
666
666
667
def clean_hh_id_anemia(row):
    """Normalize a household identifier for the anemia dataset.

    Trims and collapses whitespace, keeps only the last space- or
    underscore-separated token of composite ids, strips a leading-zeros +
    geospatial_id survey prefix, and drops residual leading zeros.

    NA inputs pass through unchanged; every other input is returned as a
    float so the resulting column can still hold NAs.
    """
    raw = row['hh_id']
    if pd.isna(raw):
        # Leave missing ids untouched; the caller audits NA counts afterwards.
        return raw

    geo_prefix = str(row['geospatial_id'])

    # Normalize whitespace: trim the ends, collapse internal runs to one space.
    token = re.sub(r'\s{2,}', ' ', str(raw).strip())

    # Already clean: a single token with no leading zero needs no surgery.
    if not token.startswith('0') and len(re.split(r"[_ ]", token)) == 1:
        return float(token)

    # Composite ids: keep only the final space- or underscore-delimited part.
    if " " in token:
        token = token.rsplit(" ", 1)[-1]
    elif "_" in token:
        token = token.rsplit("_", 1)[-1]

    # Remove a "zeros followed by geospatial_id" prefix, if present.
    token = re.sub(r'^0+' + re.escape(geo_prefix), '', token)

    # Strip any remaining leading zeros; an all-zero id collapses to '0'.
    token = token.lstrip('0') or '0'

    return float(token)  # Return as float to handle NAs
668
698
669
699
output_root = DEFAULT_ROOT
670
700
data_source_type = "cgf"
@@ -1045,168 +1075,62 @@ def run_training_data_prep_anemia(
1045
1075
1046
1076
anemia_data ["old_hh_id" ] = anemia_data ["hh_id" ]
1047
1077
1048
-
1049
- def clean_hh_id (row ):
1050
- if pd .isna (row ['hh_id_v2' ]):
1051
- return row ['hh_id_v2' ]
1052
- hh_id_str = str (row ['hh_id_v2' ])
1053
- geo_str = str (row ['geospatial_id' ])
1054
- # Match one or more leading zeros followed by the geospatial_id at the start
1055
- pattern = r'^0+' + re .escape (geo_str )
1056
- # Remove the matched pattern (if found)
1057
- cleaned = re .sub (pattern , '' , hh_id_str )
1058
- # Strip remaining leading zeros and handle empty results
1059
- cleaned = cleaned .lstrip ('0' ) or '0'
1060
- return cleaned
1061
-
1062
- def clean_hh_id_anemia (row ):
1063
- hh_id = row ['hh_id' ]
1064
- geo_str = str (row ['geospatial_id' ])
1065
-
1066
- if pd .isna (hh_id ):
1067
- return hh_id
1068
-
1069
- # Convert to string and trim leading/trailing whitespace
1070
- hh_id = str (hh_id ).strip ()
1071
-
1072
- # Replace multiple spaces with a single space
1073
- hh_id = re .sub (r'\s{2,}' , ' ' , hh_id )
1074
-
1075
- # If the hh_id is already clean (no spaces or underscores), return it
1076
- if len (re .split (r"[_ ]" , hh_id )) == 1 :
1077
- return float (hh_id )
1078
-
1079
- # Handle cases with spaces or underscores
1080
- if " " in hh_id :
1081
- hh_id = hh_id .split (" " )[- 1 ]
1082
- elif "_" in hh_id :
1083
- hh_id = hh_id .split ("_" )[- 1 ]
1084
-
1085
- # Match and remove leading zeros followed by the geospatial_id
1086
- pattern = r'^0+' + re .escape (geo_str )
1087
- hh_id = re .sub (pattern , '' , hh_id )
1088
-
1089
- # Strip remaining leading zeros and handle empty results
1090
- hh_id = hh_id .lstrip ('0' ) or '0'
1091
-
1092
- return float (hh_id ) # Return as float to handle NAs
1093
-
1094
- anemia_data ["hh_id" ] = anemia_data ["hh_id" ].apply (clean_hh_id_anemia )
1078
+ anemia_data ["hh_id" ] = anemia_data .apply (clean_hh_id_anemia , axis = 1 )
1095
1079
1096
1080
assert len (anemia_data [anemia_data ["hh_id" ].isna ()]) == len (anemia_data [anemia_data ["old_hh_id" ].isna ()]), "NAs introduced by cleaning"
1097
1081
1098
- # anemia_data["hh_id_v2"] = anemia_data["old_hh_id"].str.split(r"[_ ]").str[-1]
1099
-
1100
- # anemia_data[(anemia_data["hh_id"].isna()) & (anemia_data["hh_id_v2"].notna())]
1101
- # anemia_data[(anemia_data["hh_id"].notna()) & (anemia_data["hh_id_v2"].isna())]
1102
-
1103
- # anemia_data["psu"] = anemia_data["psu"].astype(int)
1104
- # anemia_data["hh_id_v2"] = anemia_data.apply(clean_hh_id, axis=1)
1105
-
1106
- # len(anemia_data[anemia_data["hh_id_v2"].isna()])
1107
- # len(anemia_data[anemia_data["hh_id"].isna()])
1108
- # len(anemia_data[anemia_data["old_hh_id"].isna()])
1109
-
1110
- # anemia_data = anemia_data[~anemia_data["hh_id"].isna()]
1111
- # anemia_data["hh_id"] = anemia_data["hh_id"].astype(int)
1112
- # anemia_data["hh_id"] = anemia_data["hh_id"].astype(str)
1113
- # anemia_data[(anemia_data["hh_id"]!=anemia_data["hh_id_v2"])][["psu","geospatial_id","hh_id","hh_id_v2","old_hh_id"]]
1114
-
1115
- # anemia_data[(anemia_data["hh_id"].isna())&(~anemia_data["old_hh_id"].isna())]
1116
-
1117
- # Take away NIDs without wealth information, to see later if we can add it from second source
1118
- print (len (anemia_data ))
1119
- anemia_data_nids_without_wealth = anemia_data [
1120
- anemia_data .wealth_index_dhs .isna ()
1121
- ].nid .unique ()
1122
-
1123
1082
# Prepping wealth dataset
1124
1083
dhs_wealth_data_raw = get_DHS_wealth_dataset ()
1125
1084
dhs_wealth_data = dhs_wealth_data_raw .copy ()
1126
1085
1086
+ cm_data = ClimateMalnutritionData (Path (DEFAULT_ROOT )/ 'anemia' )
1087
+ dhs_wealth_data = get_ldipc_from_asset_score (
1088
+ dhs_wealth_data , cm_data , asset_score_col = "wealth_index_dhs" , weights_col = "hhweight" ,
1089
+ plot_pdf_path = Path ("/mnt/team/integrated_analytics/pub/goalkeepers/goalkeepers_2025/plots/anemia" ) / "gbd_plots.pdf" , # temporary
1090
+ ldi_version = LDI_VERSION ,
1091
+ )
1092
+
1127
1093
dhs_wealth_data ["old_hh_id" ] = dhs_wealth_data ["hh_id" ]
1128
1094
dhs_wealth_data ["hh_id" ] = dhs_wealth_data .apply (clean_hh_id_anemia , axis = 1 )
1129
1095
1130
1096
assert len (dhs_wealth_data [dhs_wealth_data ["hh_id" ].isna ()]) == len (dhs_wealth_data [dhs_wealth_data ["old_hh_id" ].isna ()]), "NAs introduced by cleaning"
1131
1097
1098
+ missing_hh_rows = anemia_data [anemia_data ['hh_id' ].isna ()]
1099
+ print (f"Dropping { len (missing_hh_rows )} rows from anemia data with missing hh_id" )
1100
+ anemia_data = anemia_data [anemia_data ["hh_id" ].notna ()]
1132
1101
1133
- dhs_wealth_data ["hh_id_v2" ] = dhs_wealth_data ["old_hh_id" ].str .split (r"[_ ]" ).str [- 1 ]
1134
- dhs_wealth_data ["psu" ] = dhs_wealth_data ["psu" ].astype (int )
1135
- dhs_wealth_data ["psu" ] = dhs_wealth_data ["psu" ].astype (str )
1136
- dhs_wealth_data ["hh_id_v2" ] = dhs_wealth_data .apply (clean_hh_id , axis = 1 )
1137
-
1138
- assert len (dhs_wealth_data [dhs_wealth_data ["hh_id_v2" ].isna ()]) == len (dhs_wealth_data [dhs_wealth_data ["old_hh_id" ].isna ()]), "NAs introduced by cleaning"
1139
-
1140
- # dhs_wealth_data["hh_id"] = dhs_wealth_data["hh_id"].astype(int)
1141
- # dhs_wealth_data["hh_id"] = dhs_wealth_data["hh_id"].astype(str)
1142
-
1143
-
1144
- # dhs_wealth_data[(dhs_wealth_data["hh_id"]!=dhs_wealth_data["hh_id_v2"])&(dhs_wealth_data["psu"]!=dhs_wealth_data["geospatial_id"])]
1145
-
1146
- dhs_wealth_data ["hh_id" ] = dhs_wealth_data ["hh_id" ].astype (int )
1147
- dhs_wealth_data ["hh_id_v1" ] = dhs_wealth_data ["hh_id_v1" ].astype (int )
1148
- dhs_wealth_data [dhs_wealth_data ["hh_id" ]!= dhs_wealth_data ["hh_id_v1" ]][["psu" ,"hh_id" ,"hh_id_v1" ,"old_hh_id" ]]
1102
+ # Find out percent of anemia nids and hh_ids that can be matched in wealth data
1103
+ merge_cols = ["nid" , "ihme_loc_id" , "hh_id" , "psu" , "year_start" ]
1104
+ anemia_data ['hh_id' ] = anemia_data ['hh_id' ].astype (int )
1105
+ dhs_wealth_data ['hh_id' ] = dhs_wealth_data ['hh_id' ].astype (int )
1106
+ anemia_data ['psu' ] = anemia_data ['psu' ].astype (int )
1107
+ dhs_wealth_data ['psu' ] = dhs_wealth_data ['psu' ].astype (int )
1149
1108
1150
- wealth_nids = set (dhs_wealth_data .nid .unique ()) | set ( mics_wealth_data . nid . unique ()) | set ( lsms_wealth_data . nid . unique ())
1109
+ wealth_nids = set (dhs_wealth_data .nid .unique ())
1151
1110
anemia_nids = set (anemia_data .nid .unique ())
1152
1111
common_nids = wealth_nids .intersection (anemia_nids )
1153
1112
1154
1113
nid_with_wealth_pc = 100 * len (common_nids )/ len (anemia_nids )
1155
- print (f"{ nid_with_wealth_pc :.1f} % of anemia NIDs - { len (common_nids )} out of { len (anemia_nids )} in wealth data NIDs" )
1114
+ print (f"{ nid_with_wealth_pc :.1f} % of anemia NIDs - { len (common_nids )} out of "
1115
+ f"{ len (anemia_nids )} in wealth data NIDs" )
1156
1116
1157
- # Find out percent of anemia nids and hh_ids that can be matched in wealth data
1158
- dhs_wealth_data ['hh_id' ] = dhs_wealth_data ['hh_id' ].apply (clean_dhs_wealth_hh_id )
1159
- dhs_wealth_data ['psu' ] = dhs_wealth_data ['psu' ].astype (int )
1160
- anemia_data_hhs = anemia_data [["nid" ,"hh_id" ]].drop_duplicates ()
1161
- dhs_wealth_data_hhs = dhs_wealth_data [["nid" ,"hh_id" ]].drop_duplicates ()
1162
- dhs_wealth_data_hhs ["dhs_wealth_data" ] = True
1163
- merged_hhs = anemia_data_hhs .merge (dhs_wealth_data_hhs , on = ["nid" ,"hh_id" ], how = "left" )
1164
- merged_hhs ["dhs_wealth_data" ].fillna (False , inplace = True )
1165
- merged_hhs ["dhs_wealth_data" ].value_counts ()
1166
-
1167
-
1168
- # remove rows with missing hh_id
1169
- anemia_data = anemia_data [anemia_data ["hh_id" ].notna ()]
1117
+ dhs_wealth_data = dhs_wealth_data .query ("nid in @anemia_nids" )
1170
1118
1171
1119
anemia_data .drop (columns = ["old_hh_id" ],inplace = True )
1172
- anemia_data ["hh_id" ] = anemia_data ["hh_id" ].astype (int )
1173
- anemia_data ["psu" ] = anemia_data ["psu" ].astype (int )
1174
-
1175
- # Subset to common nids
1176
- lsms_wealth_data = lsms_wealth_data .query ('nid in @common_nids' )
1177
- dhs_wealth_data = dhs_wealth_data .query ('nid in @common_nids' )
1178
- mics_wealth_data = mics_wealth_data .query ('nid in @common_nids' )
1120
+ dhs_wealth_data .drop (columns = ["old_hh_id" ],inplace = True )
1179
1121
1122
+ # Merge data
1123
+ anemia_data_wealth = merge_left_without_inflating (anemia_data , dhs_wealth_data .drop (columns = ['geospatial_id' ,'lat' , 'long' ]), on = merge_cols )
1180
1124
1181
- # All of GBD's come from DHS. For GBD, we prefer the wealth data from the wealth team,
1182
- # so subset to the nids in the DHS wealth data
1183
- anemia_data_to_match = anemia_data .query ('nid in @dhs_wealth_data.nid' )
1125
+ unmergable_rows = anemia_data_wealth [anemia_data_wealth ['wealth_index_dhs' ].isna ()]
1126
+ merge_percent = 100 * len (anemia_data_wealth [anemia_data_wealth ['wealth_index_dhs' ].notna ()])/ len (anemia_data_raw )
1184
1127
1185
- # If there are any NIDs that aren't in the wealth data but don't have their own wealth, then they're missing wealth altogether
1186
- nids_without_wealth_in_any_dataset = set (anemia_data .nid .unique ()) - set (anemia_data_to_match .nid .unique ())
1187
- print ("NIDs without wealth in any dataset:" , nids_without_wealth_in_any_dataset )
1188
-
1189
- # below is same as anemia_data_to_match
1190
- anemia_data_own_wealth = anemia_data .query ('nid not in @anemia_data_nids_without_wealth' )
1191
-
1192
- # First the data with its own wealth data
1193
- # We get wealth data to be merged with anemoa data and merge
1194
- print ("Processing data with its own wealth data..." )
1195
- anemia_data_wealth_distribution = (
1196
- anemia_data_own_wealth .groupby (["nid" , "ihme_loc_id" , "year_start" , "psu" , "hh_id" ])
1197
- .agg (
1198
- wealth_index_dhs = ("wealth_index_dhs" , "first" ),
1199
- pweight = ("pweight" , "first" ),
1200
- check = ("wealth_index_dhs" , "nunique" ),
1201
- )
1202
- .reset_index ()
1203
- )
1204
-
1205
- if (anemia_data_wealth_distribution .check != 1 ).any ():
1206
- msg = "Multiple wealth index values for the same household."
1207
- raise RuntimeError (msg )
1128
+ print (f"{ len (anemia_data_wealth [anemia_data_wealth ['wealth_index_dhs' ].notna ()]):,} "
1129
+ f"rows out of { len (anemia_data_raw ):,} merged ({ merge_percent :.1f} %). "
1130
+ f"Unmergeable includes { len (missing_hh_rows ):,} with missing hh_id in raw "
1131
+ f"data, and { len (unmergable_rows ):,} that failed to merge on "
1132
+ '"nid", "ihme_loc_id", "hh_id", "psu", "year_start" variables' )
1208
1133
1209
- anemia_data_wealth_distribution = merge_left_without_inflating (anemia_data_wealth_distribution , loc_meta , on = "ihme_loc_id" )
1210
1134
1211
1135
cm_data = ClimateMalnutritionData (Path (DEFAULT_ROOT ) / MEASURES_IN_SOURCE [data_source_type ][0 ])
1212
1136
0 commit comments