Skip to content

Commit e7bca13

Browse files
author
Elye Bliss
committed
working on hh_id cleaning
1 parent 4731636 commit e7bca13

File tree

1 file changed

+133
-28
lines changed

1 file changed

+133
-28
lines changed

src/rra_climate_health/data_prep/run_training_data_prep.py

Lines changed: 133 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -37,12 +37,8 @@
3737
"/mnt/team/integrated_analytics/pub/goalkeepers/goalkeepers_2024/data"
3838
)
3939

40-
# ANEMIA_ROOT = Path(
41-
# "/mnt/team/rapidresponse/pub/population/modeling/climate_malnutrition/input/extractions/anemia/"
42-
# )
43-
4440
ANEMIA_ROOT = Path(
45-
"/mnt/team/integrated_analytics/pub/goalkeepers/goalkeepers_2024/data"
41+
"/mnt/team/rapidresponse/pub/population/modeling/climate_malnutrition/input/extractions/anemia/"
4642
)
4743

4844
SDI_PATH = Path("/mnt/share/forecasting/data/7/past/sdi/20240531_gk24/sdi.nc")
@@ -59,8 +55,7 @@
5955
"DHS": WEALTH_DATA_ROOT / "DHS_wealth.parquet",
6056
"MICS": WEALTH_DATA_ROOT / "MICS_wealth.parquet",
6157
},
62-
"anemia": ANEMIA_ROOT / "anemia" / "anemia_combined_wealth_rex.csv",
63-
# "anemia": ANEMIA_ROOT / "anemia_extracts_compiled_09_02_2025.csv",
58+
"anemia": ANEMIA_ROOT / "anemia_extracts_compiled_09_02_2025.csv",
6459
}
6560

6661
# Maps each modeled measure to the data-source family its prep pipeline uses.
DATA_SOURCE_TYPE = {
    "stunting": "cgf",
    "wasting": "cgf",
    "underweight": "cgf",
    "low_adult_bmi": "bmi",
    "anemia": "anemia",
}
@@ -414,9 +409,7 @@ def ldi_to_wealth(x):
414409
"ldipc_weighted_match",
415410
]
416411
if asset_df_ldipc[new_cols].isna().sum().sum() != 0:
417-
# raise RuntimeError("Null LDI-PC values in one of the methods.") temporarily commenting
418-
print("Null LDI-PC values in one of the methods.")
419-
asset_df_ldipc = asset_df_ldipc[~asset_df_ldipc[new_cols].isna().any(axis=1)]
412+
raise RuntimeError("Null LDI-PC values in one of the methods.")
420413
if len(asset_df_ldipc) != len(asset_df):
421414
raise RuntimeError("Mismatch in length of asset data and LDI-PC data.")
422415
return asset_df_ldipc # type: ignore[no-any-return]
@@ -659,6 +652,8 @@ def run_training_data_prep(output_root: str, source_type: str) -> None:
659652
run_training_data_prep_main(output_root, source_type)
660653

661654
def clean_hh_id(row):
655+
if pd.isna(row['hh_id']):
656+
return row['hh_id']
662657
hh_id_str = str(row['hh_id'])
663658
geo_str = str(row['geospatial_id'])
664659
# Match one or more leading zeros followed by the geospatial_id at the start
@@ -670,6 +665,7 @@ def clean_hh_id(row):
670665
return cleaned
671666

672667

668+
673669
output_root = DEFAULT_ROOT
674670
data_source_type = "cgf"
675671
def run_training_data_prep_main( # noqa: PLR0915
@@ -1017,46 +1013,139 @@ def run_training_data_prep_anemia(
10171013
loc_meta = pd.read_parquet(paths.FHS_LOCATION_METADATA_FILEPATH)
10181014

10191015
anemia_data_raw = pd.read_csv(
1020-
survey_data_path,
1021-
dtype={"hh_id": str, "year_start": int, "int_year": int, "year_end": int},
1016+
survey_data_path
10221017
)
10231018

1024-
anemia_data_raw = anemia_data_raw.rename(columns=COLUMN_NAME_TRANSLATOR)
1025-
10261019
anemia_data = anemia_data_raw[
10271020
[
10281021
"nid",
10291022
"ihme_loc_id",
1023+
'survey_name',
10301024
"year_start",
10311025
"year_end",
1032-
# "geospatial_id", # not found in 2024 version
1033-
"psu",
1026+
'urban',
1027+
"geospatial_id",
1028+
"psu_id",
10341029
"pweight",
10351030
"strata",
10361031
"hh_id",
1037-
"urban",
1032+
"line_id",
10381033
"sex_id",
10391034
"age_year",
1040-
# "age_month", # not found in 2024 version
1035+
"age_month",
10411036
"int_year",
1042-
"int_month",
1043-
"brinda_adj_hemog", # likely to include more anemia vars
1044-
"who_adj_hemog",
1045-
"wealth_index_dhs", # not found in extractions data
1046-
"lat",
1047-
"long",
1037+
'anemia_anemic_brinda',
1038+
'anemia_mod_sev_brinda',
1039+
'latnum',
1040+
'longnum',
10481041
]
10491042
]
10501043

1044+
anemia_data = anemia_data.rename(columns=COLUMN_NAME_TRANSLATOR)
1045+
1046+
anemia_data["old_hh_id"] = anemia_data["hh_id"]
1047+
1048+
1049+
def clean_hh_id(row):
    """Strip a zero-padded geospatial_id prefix from a row's ``hh_id_v2``.

    NA values pass through unchanged. Otherwise the value is stringified,
    any leading run of zeros immediately followed by the row's
    geospatial_id is removed, remaining leading zeros are dropped, and a
    fully-zero result collapses to the string ``'0'``.
    """
    raw = row['hh_id_v2']
    if pd.isna(raw):
        # Preserve missing values as-is so downstream NA counts are unchanged.
        return raw

    text = str(raw)
    geo = str(row['geospatial_id'])
    # Only a *zero-padded* prefix is removed: one or more '0's followed by
    # the geospatial_id, anchored at the start of the string.
    prefix = re.compile(r'^0+' + re.escape(geo))
    text = prefix.sub('', text)
    # Drop residual left padding; an all-zero id becomes '0', not ''.
    return text.lstrip('0') or '0'
1061+
1062+
def clean_hh_id_anemia(row):
    """Normalize an anemia-extract household id to a float.

    Handles three shapes of ``hh_id``:
    - NA: returned unchanged (float return type keeps NAs representable);
    - a plain numeric token (no spaces/underscores): converted directly;
    - a composite id ("... <sep> <number>" with space or underscore
      separators): the last token is kept, any zero-padded
      geospatial_id prefix is stripped, and remaining left zeros dropped.
    """
    value = row['hh_id']
    geo = str(row['geospatial_id'])

    if pd.isna(value):
        return value

    # Normalize whitespace: trim the ends and collapse runs of spaces.
    token = re.sub(r'\s{2,}', ' ', str(value).strip())

    if len(re.split(r"[_ ]", token)) == 1:
        # Already a single clean token — no prefix handling needed.
        return float(token)

    # Composite id: keep only the trailing component. Space wins over
    # underscore when both could apply, mirroring the extraction format.
    separator = " " if " " in token else "_"
    token = token.split(separator)[-1]

    # Remove a zero-padded geospatial_id prefix, then residual padding;
    # an all-zero remainder collapses to '0' rather than ''.
    token = re.sub(r'^0+' + re.escape(geo), '', token)
    token = token.lstrip('0') or '0'

    return float(token)  # Return as float to handle NAs
1093+
1094+
anemia_data["hh_id"] = anemia_data["hh_id"].apply(clean_hh_id_anemia)
1095+
1096+
assert len(anemia_data[anemia_data["hh_id"].isna()]) == len(anemia_data[anemia_data["old_hh_id"].isna()]), "NAs introduced by cleaning"
1097+
1098+
# anemia_data["hh_id_v2"] = anemia_data["old_hh_id"].str.split(r"[_ ]").str[-1]
1099+
1100+
# anemia_data[(anemia_data["hh_id"].isna()) & (anemia_data["hh_id_v2"].notna())]
1101+
# anemia_data[(anemia_data["hh_id"].notna()) & (anemia_data["hh_id_v2"].isna())]
1102+
1103+
# anemia_data["psu"] = anemia_data["psu"].astype(int)
1104+
# anemia_data["hh_id_v2"] = anemia_data.apply(clean_hh_id, axis=1)
1105+
1106+
# len(anemia_data[anemia_data["hh_id_v2"].isna()])
1107+
# len(anemia_data[anemia_data["hh_id"].isna()])
1108+
# len(anemia_data[anemia_data["old_hh_id"].isna()])
1109+
1110+
# anemia_data = anemia_data[~anemia_data["hh_id"].isna()]
1111+
# anemia_data["hh_id"] = anemia_data["hh_id"].astype(int)
1112+
# anemia_data["hh_id"] = anemia_data["hh_id"].astype(str)
1113+
# anemia_data[(anemia_data["hh_id"]!=anemia_data["hh_id_v2"])][["psu","geospatial_id","hh_id","hh_id_v2","old_hh_id"]]
1114+
1115+
# anemia_data[(anemia_data["hh_id"].isna())&(~anemia_data["old_hh_id"].isna())]
1116+
10511117
# Take away NIDs without wealth information, to see later if we can add it from second source
10521118
print(len(anemia_data))
10531119
anemia_data_nids_without_wealth = anemia_data[
10541120
anemia_data.wealth_index_dhs.isna()
10551121
].nid.unique()
10561122

1057-
lsms_wealth_data = get_LSMS_wealth_dataset()
1058-
dhs_wealth_data = get_DHS_wealth_dataset()
1059-
mics_wealth_data = get_MICS_wealth_dataset()
1123+
# Prepping wealth dataset
1124+
dhs_wealth_data_raw = get_DHS_wealth_dataset()
1125+
dhs_wealth_data = dhs_wealth_data_raw.copy()
1126+
1127+
dhs_wealth_data["old_hh_id"] = dhs_wealth_data["hh_id"]
1128+
dhs_wealth_data["hh_id"] = dhs_wealth_data.apply(clean_hh_id_anemia, axis=1)
1129+
1130+
assert len(dhs_wealth_data[dhs_wealth_data["hh_id"].isna()]) == len(dhs_wealth_data[dhs_wealth_data["old_hh_id"].isna()]), "NAs introduced by cleaning"
1131+
1132+
1133+
dhs_wealth_data["hh_id_v2"] = dhs_wealth_data["old_hh_id"].str.split(r"[_ ]").str[-1]
1134+
dhs_wealth_data["psu"] = dhs_wealth_data["psu"].astype(int)
1135+
dhs_wealth_data["psu"] = dhs_wealth_data["psu"].astype(str)
1136+
dhs_wealth_data["hh_id_v2"] = dhs_wealth_data.apply(clean_hh_id, axis=1)
1137+
1138+
assert len(dhs_wealth_data[dhs_wealth_data["hh_id_v2"].isna()]) == len(dhs_wealth_data[dhs_wealth_data["old_hh_id"].isna()]), "NAs introduced by cleaning"
1139+
1140+
# dhs_wealth_data["hh_id"] = dhs_wealth_data["hh_id"].astype(int)
1141+
# dhs_wealth_data["hh_id"] = dhs_wealth_data["hh_id"].astype(str)
1142+
1143+
1144+
# dhs_wealth_data[(dhs_wealth_data["hh_id"]!=dhs_wealth_data["hh_id_v2"])&(dhs_wealth_data["psu"]!=dhs_wealth_data["geospatial_id"])]
1145+
1146+
dhs_wealth_data["hh_id"] = dhs_wealth_data["hh_id"].astype(int)
1147+
dhs_wealth_data["hh_id_v1"] = dhs_wealth_data["hh_id_v1"].astype(int)
1148+
dhs_wealth_data[dhs_wealth_data["hh_id"]!=dhs_wealth_data["hh_id_v1"]][["psu","hh_id","hh_id_v1","old_hh_id"]]
10601149

10611150
wealth_nids = set(dhs_wealth_data.nid.unique()) | set(mics_wealth_data.nid.unique()) | set(lsms_wealth_data.nid.unique())
10621151
anemia_nids = set(anemia_data.nid.unique())
@@ -1065,7 +1154,23 @@ def run_training_data_prep_anemia(
10651154
nid_with_wealth_pc = 100*len(common_nids)/len(anemia_nids)
10661155
print(f"{nid_with_wealth_pc:.1f}% of anemia NIDs - {len(common_nids)} out of {len(anemia_nids)} in wealth data NIDs")
10671156

1068-
1157+
# Find out percent of anemia nids and hh_ids that can be matched in wealth data
1158+
dhs_wealth_data['hh_id'] = dhs_wealth_data['hh_id'].apply(clean_dhs_wealth_hh_id)
1159+
dhs_wealth_data['psu'] = dhs_wealth_data['psu'].astype(int)
1160+
anemia_data_hhs = anemia_data[["nid","hh_id"]].drop_duplicates()
1161+
dhs_wealth_data_hhs = dhs_wealth_data[["nid","hh_id"]].drop_duplicates()
1162+
dhs_wealth_data_hhs["dhs_wealth_data"] = True
1163+
merged_hhs = anemia_data_hhs.merge(dhs_wealth_data_hhs, on=["nid","hh_id"], how="left")
1164+
merged_hhs["dhs_wealth_data"].fillna(False, inplace=True)
1165+
merged_hhs["dhs_wealth_data"].value_counts()
1166+
1167+
1168+
# remove rows with missing hh_id
1169+
anemia_data = anemia_data[anemia_data["hh_id"].notna()]
1170+
1171+
anemia_data.drop(columns=["old_hh_id"],inplace=True)
1172+
anemia_data["hh_id"] = anemia_data["hh_id"].astype(int)
1173+
anemia_data["psu"] = anemia_data["psu"].astype(int)
10691174

10701175
# Subset to common nids
10711176
lsms_wealth_data = lsms_wealth_data.query('nid in @common_nids')

0 commit comments

Comments
 (0)