Skip to content

Commit 8e5044b

Browse files
author
Elye Bliss
committed
merging with wealth data
1 parent e7bca13 commit 8e5044b

File tree

1 file changed

+61
-137
lines changed

1 file changed

+61
-137
lines changed

src/rra_climate_health/data_prep/run_training_data_prep.py

Lines changed: 61 additions & 137 deletions
Original file line numberDiff line numberDiff line change
@@ -664,7 +664,37 @@ def clean_hh_id(row):
664664
cleaned = cleaned.lstrip('0') or '0'
665665
return cleaned
666666

def clean_hh_id_anemia(row):
    """Normalize the household id of one anemia-survey row.

    Trims and collapses whitespace, keeps only the last space- or
    underscore-separated segment of composite ids, removes a zero-padded
    ``geospatial_id`` prefix, strips residual leading zeros, and returns
    the id as a float so missing values (NaN) survive the transformation.
    """
    raw = row['hh_id']
    geo_prefix = str(row['geospatial_id'])

    # Missing ids pass through untouched (NaN stays NaN).
    if pd.isna(raw):
        return raw

    # Normalize whitespace: trim the ends, collapse internal runs to one space.
    cleaned = re.sub(r'\s{2,}', ' ', str(raw).strip())

    # Fast path: a single token with no leading zero needs no further cleaning.
    if len(re.split(r"[_ ]", cleaned)) == 1 and not cleaned.startswith('0'):
        return float(cleaned)

    # Composite ids: keep only the segment after the last space/underscore.
    if " " in cleaned:
        cleaned = cleaned.rsplit(" ", 1)[-1]
    elif "_" in cleaned:
        cleaned = cleaned.rsplit("_", 1)[-1]

    # Drop a leading-zeros + geospatial_id prefix, e.g. "0010…" for geo "10".
    cleaned = re.sub(r'^0+' + re.escape(geo_prefix), '', cleaned)

    # Strip any remaining zero padding; an all-zero id collapses to "0".
    cleaned = cleaned.lstrip('0') or '0'

    return float(cleaned)  # Return as float to handle NAs
668698

669699
output_root = DEFAULT_ROOT
670700
data_source_type = "cgf"
@@ -1045,168 +1075,62 @@ def run_training_data_prep_anemia(
10451075

10461076
anemia_data["old_hh_id"] = anemia_data["hh_id"]
10471077

1048-
1049-
def clean_hh_id(row):
1050-
if pd.isna(row['hh_id_v2']):
1051-
return row['hh_id_v2']
1052-
hh_id_str = str(row['hh_id_v2'])
1053-
geo_str = str(row['geospatial_id'])
1054-
# Match one or more leading zeros followed by the geospatial_id at the start
1055-
pattern = r'^0+' + re.escape(geo_str)
1056-
# Remove the matched pattern (if found)
1057-
cleaned = re.sub(pattern, '', hh_id_str)
1058-
# Strip remaining leading zeros and handle empty results
1059-
cleaned = cleaned.lstrip('0') or '0'
1060-
return cleaned
1061-
1062-
def clean_hh_id_anemia(row):
1063-
hh_id = row['hh_id']
1064-
geo_str = str(row['geospatial_id'])
1065-
1066-
if pd.isna(hh_id):
1067-
return hh_id
1068-
1069-
# Convert to string and trim leading/trailing whitespace
1070-
hh_id = str(hh_id).strip()
1071-
1072-
# Replace multiple spaces with a single space
1073-
hh_id = re.sub(r'\s{2,}', ' ', hh_id)
1074-
1075-
# If the hh_id is already clean (no spaces or underscores), return it
1076-
if len(re.split(r"[_ ]", hh_id)) == 1:
1077-
return float(hh_id)
1078-
1079-
# Handle cases with spaces or underscores
1080-
if " " in hh_id:
1081-
hh_id = hh_id.split(" ")[-1]
1082-
elif "_" in hh_id:
1083-
hh_id = hh_id.split("_")[-1]
1084-
1085-
# Match and remove leading zeros followed by the geospatial_id
1086-
pattern = r'^0+' + re.escape(geo_str)
1087-
hh_id = re.sub(pattern, '', hh_id)
1088-
1089-
# Strip remaining leading zeros and handle empty results
1090-
hh_id = hh_id.lstrip('0') or '0'
1091-
1092-
return float(hh_id) # Return as float to handle NAs
1093-
1094-
anemia_data["hh_id"] = anemia_data["hh_id"].apply(clean_hh_id_anemia)
1078+
anemia_data["hh_id"] = anemia_data.apply(clean_hh_id_anemia, axis=1)
10951079

10961080
assert len(anemia_data[anemia_data["hh_id"].isna()]) == len(anemia_data[anemia_data["old_hh_id"].isna()]), "NAs introduced by cleaning"
10971081

1098-
# anemia_data["hh_id_v2"] = anemia_data["old_hh_id"].str.split(r"[_ ]").str[-1]
1099-
1100-
# anemia_data[(anemia_data["hh_id"].isna()) & (anemia_data["hh_id_v2"].notna())]
1101-
# anemia_data[(anemia_data["hh_id"].notna()) & (anemia_data["hh_id_v2"].isna())]
1102-
1103-
# anemia_data["psu"] = anemia_data["psu"].astype(int)
1104-
# anemia_data["hh_id_v2"] = anemia_data.apply(clean_hh_id, axis=1)
1105-
1106-
# len(anemia_data[anemia_data["hh_id_v2"].isna()])
1107-
# len(anemia_data[anemia_data["hh_id"].isna()])
1108-
# len(anemia_data[anemia_data["old_hh_id"].isna()])
1109-
1110-
# anemia_data = anemia_data[~anemia_data["hh_id"].isna()]
1111-
# anemia_data["hh_id"] = anemia_data["hh_id"].astype(int)
1112-
# anemia_data["hh_id"] = anemia_data["hh_id"].astype(str)
1113-
# anemia_data[(anemia_data["hh_id"]!=anemia_data["hh_id_v2"])][["psu","geospatial_id","hh_id","hh_id_v2","old_hh_id"]]
1114-
1115-
# anemia_data[(anemia_data["hh_id"].isna())&(~anemia_data["old_hh_id"].isna())]
1116-
1117-
# Take away NIDs without wealth information, to see later if we can add it from second source
1118-
print(len(anemia_data))
1119-
anemia_data_nids_without_wealth = anemia_data[
1120-
anemia_data.wealth_index_dhs.isna()
1121-
].nid.unique()
1122-
11231082
# Prepping wealth dataset
11241083
dhs_wealth_data_raw = get_DHS_wealth_dataset()
11251084
dhs_wealth_data = dhs_wealth_data_raw.copy()
11261085

1086+
cm_data = ClimateMalnutritionData(Path(DEFAULT_ROOT)/'anemia')
1087+
dhs_wealth_data = get_ldipc_from_asset_score(
1088+
dhs_wealth_data, cm_data, asset_score_col="wealth_index_dhs", weights_col="hhweight",
1089+
plot_pdf_path=Path("/mnt/team/integrated_analytics/pub/goalkeepers/goalkeepers_2025/plots/anemia") / "gbd_plots.pdf", # temporary
1090+
ldi_version = LDI_VERSION,
1091+
)
1092+
11271093
dhs_wealth_data["old_hh_id"] = dhs_wealth_data["hh_id"]
11281094
dhs_wealth_data["hh_id"] = dhs_wealth_data.apply(clean_hh_id_anemia, axis=1)
11291095

11301096
assert len(dhs_wealth_data[dhs_wealth_data["hh_id"].isna()]) == len(dhs_wealth_data[dhs_wealth_data["old_hh_id"].isna()]), "NAs introduced by cleaning"
11311097

1098+
missing_hh_rows = anemia_data[anemia_data['hh_id'].isna()]
1099+
print(f"Dropping {len(missing_hh_rows)} rows from anemia data with missing hh_id")
1100+
anemia_data = anemia_data[anemia_data["hh_id"].notna()]
11321101

1133-
dhs_wealth_data["hh_id_v2"] = dhs_wealth_data["old_hh_id"].str.split(r"[_ ]").str[-1]
1134-
dhs_wealth_data["psu"] = dhs_wealth_data["psu"].astype(int)
1135-
dhs_wealth_data["psu"] = dhs_wealth_data["psu"].astype(str)
1136-
dhs_wealth_data["hh_id_v2"] = dhs_wealth_data.apply(clean_hh_id, axis=1)
1137-
1138-
assert len(dhs_wealth_data[dhs_wealth_data["hh_id_v2"].isna()]) == len(dhs_wealth_data[dhs_wealth_data["old_hh_id"].isna()]), "NAs introduced by cleaning"
1139-
1140-
# dhs_wealth_data["hh_id"] = dhs_wealth_data["hh_id"].astype(int)
1141-
# dhs_wealth_data["hh_id"] = dhs_wealth_data["hh_id"].astype(str)
1142-
1143-
1144-
# dhs_wealth_data[(dhs_wealth_data["hh_id"]!=dhs_wealth_data["hh_id_v2"])&(dhs_wealth_data["psu"]!=dhs_wealth_data["geospatial_id"])]
1145-
1146-
dhs_wealth_data["hh_id"] = dhs_wealth_data["hh_id"].astype(int)
1147-
dhs_wealth_data["hh_id_v1"] = dhs_wealth_data["hh_id_v1"].astype(int)
1148-
dhs_wealth_data[dhs_wealth_data["hh_id"]!=dhs_wealth_data["hh_id_v1"]][["psu","hh_id","hh_id_v1","old_hh_id"]]
1102+
# Find out percent of anemia nids and hh_ids that can be matched in wealth data
1103+
merge_cols = ["nid", "ihme_loc_id", "hh_id", "psu", "year_start"]
1104+
anemia_data['hh_id'] = anemia_data['hh_id'].astype(int)
1105+
dhs_wealth_data['hh_id'] = dhs_wealth_data['hh_id'].astype(int)
1106+
anemia_data['psu'] = anemia_data['psu'].astype(int)
1107+
dhs_wealth_data['psu'] = dhs_wealth_data['psu'].astype(int)
11491108

1150-
wealth_nids = set(dhs_wealth_data.nid.unique()) | set(mics_wealth_data.nid.unique()) | set(lsms_wealth_data.nid.unique())
1109+
wealth_nids = set(dhs_wealth_data.nid.unique())
11511110
anemia_nids = set(anemia_data.nid.unique())
11521111
common_nids = wealth_nids.intersection(anemia_nids)
11531112

11541113
nid_with_wealth_pc = 100*len(common_nids)/len(anemia_nids)
1155-
print(f"{nid_with_wealth_pc:.1f}% of anemia NIDs - {len(common_nids)} out of {len(anemia_nids)} in wealth data NIDs")
1114+
print(f"{nid_with_wealth_pc:.1f}% of anemia NIDs - {len(common_nids)} out of "
1115+
f"{len(anemia_nids)} in wealth data NIDs")
11561116

1157-
# Find out percent of anemia nids and hh_ids that can be matched in wealth data
1158-
dhs_wealth_data['hh_id'] = dhs_wealth_data['hh_id'].apply(clean_dhs_wealth_hh_id)
1159-
dhs_wealth_data['psu'] = dhs_wealth_data['psu'].astype(int)
1160-
anemia_data_hhs = anemia_data[["nid","hh_id"]].drop_duplicates()
1161-
dhs_wealth_data_hhs = dhs_wealth_data[["nid","hh_id"]].drop_duplicates()
1162-
dhs_wealth_data_hhs["dhs_wealth_data"] = True
1163-
merged_hhs = anemia_data_hhs.merge(dhs_wealth_data_hhs, on=["nid","hh_id"], how="left")
1164-
merged_hhs["dhs_wealth_data"].fillna(False, inplace=True)
1165-
merged_hhs["dhs_wealth_data"].value_counts()
1166-
1167-
1168-
# remove rows with missing hh_id
1169-
anemia_data = anemia_data[anemia_data["hh_id"].notna()]
1117+
dhs_wealth_data = dhs_wealth_data.query("nid in @anemia_nids")
11701118

11711119
anemia_data.drop(columns=["old_hh_id"],inplace=True)
1172-
anemia_data["hh_id"] = anemia_data["hh_id"].astype(int)
1173-
anemia_data["psu"] = anemia_data["psu"].astype(int)
1174-
1175-
# Subset to common nids
1176-
lsms_wealth_data = lsms_wealth_data.query('nid in @common_nids')
1177-
dhs_wealth_data = dhs_wealth_data.query('nid in @common_nids')
1178-
mics_wealth_data = mics_wealth_data.query('nid in @common_nids')
1120+
dhs_wealth_data.drop(columns=["old_hh_id"],inplace=True)
11791121

1122+
# Merge data
1123+
anemia_data_wealth = merge_left_without_inflating(anemia_data, dhs_wealth_data.drop(columns=['geospatial_id','lat', 'long']), on=merge_cols)
11801124

1181-
# All of GBD's come from DHS. For GBD, we prefer the wealth data from the wealth team,
1182-
# so subset to the nids in the DHS wealth data
1183-
anemia_data_to_match = anemia_data.query('nid in @dhs_wealth_data.nid')
1125+
unmergable_rows = anemia_data_wealth[anemia_data_wealth['wealth_index_dhs'].isna()]
1126+
merge_percent = 100*len(anemia_data_wealth[anemia_data_wealth['wealth_index_dhs'].notna()])/len(anemia_data_raw)
11841127

1185-
# If there are any NIDs that aren't in the wealth data but don't have their own wealth, then they're missing wealth altogether
1186-
nids_without_wealth_in_any_dataset = set(anemia_data.nid.unique()) - set(anemia_data_to_match.nid.unique())
1187-
print("NIDs without wealth in any dataset:", nids_without_wealth_in_any_dataset)
1188-
1189-
# below is same as anemia_data_to_match
1190-
anemia_data_own_wealth = anemia_data.query('nid not in @anemia_data_nids_without_wealth')
1191-
1192-
# First the data with its own wealth data
1193-
# We get wealth data to be merged with anemoa data and merge
1194-
print("Processing data with its own wealth data...")
1195-
anemia_data_wealth_distribution = (
1196-
anemia_data_own_wealth.groupby(["nid", "ihme_loc_id", "year_start", "psu", "hh_id"])
1197-
.agg(
1198-
wealth_index_dhs=("wealth_index_dhs", "first"),
1199-
pweight=("pweight", "first"),
1200-
check=("wealth_index_dhs", "nunique"),
1201-
)
1202-
.reset_index()
1203-
)
1204-
1205-
if (anemia_data_wealth_distribution.check != 1).any():
1206-
msg = "Multiple wealth index values for the same household."
1207-
raise RuntimeError(msg)
1128+
print(f"{len(anemia_data_wealth[anemia_data_wealth['wealth_index_dhs'].notna()]):,} "
1129+
f"rows out of {len(anemia_data_raw):,} merged ({merge_percent:.1f}%). "
1130+
f"Unmergeable includes {len(missing_hh_rows):,} with missing hh_id in raw "
1131+
f"data, and {len(unmergable_rows):,} that failed to merge on "
1132+
'"nid", "ihme_loc_id", "hh_id", "psu", "year_start" variables')
12081133

1209-
anemia_data_wealth_distribution = merge_left_without_inflating(anemia_data_wealth_distribution, loc_meta, on="ihme_loc_id")
12101134

12111135
cm_data = ClimateMalnutritionData(Path(DEFAULT_ROOT) / MEASURES_IN_SOURCE[data_source_type][0])
12121136

0 commit comments

Comments
 (0)