From a1dc36c7ca277ba5964f8c86e35ccf0e6e81ff07 Mon Sep 17 00:00:00 2001 From: Patrick Lenz Date: Mon, 5 Feb 2024 15:52:28 +0100 Subject: [PATCH 1/2] regiostar mapping --- .../memilio/epidata/defaultDict.py | 12 ++++- .../memilio/epidata/geoModificationGermany.py | 44 +++++++++++++++++++ 2 files changed, 54 insertions(+), 2 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/defaultDict.py b/pycode/memilio-epidata/memilio/epidata/defaultDict.py index c52be44395..b376c38bb8 100644 --- a/pycode/memilio-epidata/memilio/epidata/defaultDict.py +++ b/pycode/memilio-epidata/memilio/epidata/defaultDict.py @@ -106,7 +106,11 @@ 'region_name': 'County', 'region_id': 'ID_County', 'desc': 'Description', - 'incidence': 'Incidence' + 'incidence': 'Incidence', + 'values': 'Values', + 'description': 'Description', + 'variable': 'Variable', + 'district': 'District' } GerEng = { @@ -155,7 +159,11 @@ 'ags5': EngEng['idCounty'], 'm_code': EngEng['npiCode'], 'code': EngEng['npiCode'], - 'Bundesland_Id': EngEng['idState'] + 'Bundesland_Id': EngEng['idState'], + 'Werte': EngEng['values'], + 'Beschreibung': EngEng['description'], + 'Variable': EngEng['variable'], + 'gem_20': EngEng['district'] } EsEng = {'fecha': EngEng['date'], diff --git a/pycode/memilio-epidata/memilio/epidata/geoModificationGermany.py b/pycode/memilio-epidata/memilio/epidata/geoModificationGermany.py index 231a7a9c75..19bbb532bc 100644 --- a/pycode/memilio-epidata/memilio/epidata/geoModificationGermany.py +++ b/pycode/memilio-epidata/memilio/epidata/geoModificationGermany.py @@ -632,3 +632,47 @@ def merge_df_counties_all( df = merge_df_counties(df, key, val, sorting, columns, method) return df + + +def regiostar_mapping(RegioStaR='7'): + """! Mapping of RegioStaR categories to countyIDs + + @param RegioStar str One of [2,4,17,7(default),5,Gem7,Gem5] + @return Dict Mapped Regiostar + """ + if not isinstance(RegioStaR, str): + RegioStaR = str(RegioStaR) + RegioStaR = 'Regiostar'+RegioStaR + # read file + xlsx = pd.ExcelFile( + 'https://www.mcloud.de/downloads/mcloud/536149D1-2902-4975-9F7D-253191C0AD07/RegioStaR-Referenzdateien.xlsx', engine='openpyxl') + # save different sheets into different variables + codeplan = pd.read_excel(xlsx, sheet_name='Codeplan', header=4) + codeplan.rename(dd.GerEng, axis=1, inplace=True) + codelist = pd.read_excel(xlsx, sheet_name='ReferenzGebietsstand2020') + codelist.rename(dd.GerEng, axis=1, inplace=True) + # get start and end point of regiostar values + idx_begin = np.where(codeplan[dd.EngEng['variable']] == RegioStaR)[0][0] + idx_end = idx_begin+1 + while pd.isnull(codeplan[dd.EngEng['variable']][idx_end]): + idx_end += 1 + # cut codeplan + codeplan = codeplan.iloc[idx_begin:idx_end] + # save in array + regio_values = codeplan[dd.EngEng['values']].values + # save them as dict + RegioDict = dict() + # + new_cols = list(codelist.columns[:9]) + new_cols += [c.lower() for c in codelist.columns[9:]] + codelist.columns = new_cols + for v in regio_values: + subframe = codelist.iloc[np.where(codelist[RegioStaR.lower()] == v)] + # remove last 3 digits of gem_20 -> now represents CountyID + district_ids = subframe[dd.EngEng['district']].values + county_ids = {str(d_id)[:-3] for d_id in district_ids} + # write into dict + RegioDict[codeplan[codeplan[dd.EngEng['values']] == v] + [dd.EngEng['description']].values[0]] = county_ids + + return RegioDict From 52eb5302b737f9ca7acaf477362e9257ea71b1fb Mon Sep 17 00:00:00 2001 From: Patrick Lenz Date: Mon, 5 Feb 2024 16:16:49 +0100 Subject: [PATCH 2/2] add optional counting of multiple entries --- .../memilio/epidata/geoModificationGermany.py | 24 +++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/geoModificationGermany.py b/pycode/memilio-epidata/memilio/epidata/geoModificationGermany.py index 19bbb532bc..2f436531db 100644 --- a/pycode/memilio-epidata/memilio/epidata/geoModificationGermany.py +++ b/pycode/memilio-epidata/memilio/epidata/geoModificationGermany.py @@ -634,7 +634,7 @@ def merge_df_counties_all( return df -def regiostar_mapping(RegioStaR='7'): +def regiostar_mapping(RegioStaR='7', count_multiple_entries=False): """! Mapping of RegioStaR categories to countyIDs @param RegioStar str One of [2,4,17,7(default),5,Gem7,Gem5] @@ -654,6 +654,7 @@ def regiostar_mapping(RegioStaR='7'): # get start and end point of regiostar values idx_begin = np.where(codeplan[dd.EngEng['variable']] == RegioStaR)[0][0] idx_end = idx_begin+1 + while pd.isnull(codeplan[dd.EngEng['variable']][idx_end]): idx_end += 1 # cut codeplan @@ -666,13 +667,28 @@ def regiostar_mapping(RegioStaR='7'): new_cols = list(codelist.columns[:9]) new_cols += [c.lower() for c in codelist.columns[9:]] codelist.columns = new_cols + + if count_multiple_entries: + # create zero-filled dataframe with countyIDs as rows and RegioStaRIDs as columns + count_df = pd.DataFrame( + 0, columns=codeplan[dd.EngEng['description']], index=dd.County.keys()) + for v in regio_values: + regio_desc = codeplan[codeplan[dd.EngEng['values']] + == v][dd.EngEng['description']].values[0] subframe = codelist.iloc[np.where(codelist[RegioStaR.lower()] == v)] # remove last 3 digits of gem_20 -> now represents CountyID district_ids = subframe[dd.EngEng['district']].values - county_ids = {str(d_id)[:-3] for d_id in district_ids} + county_ids = [str(d_id)[:-3] for d_id in district_ids] + if count_multiple_entries: + for c_id in county_ids: + count_df.at[int(c_id), regio_desc] += 1 + unique_county_ids = set(county_ids) # write into dict - RegioDict[codeplan[codeplan[dd.EngEng['values']] == v] - [dd.EngEng['description']].values[0]] = county_ids + RegioDict[regio_desc] = unique_county_ids + + if count_multiple_entries: + gd.write_dataframe(count_df, os.path.join( + dd.defaultDict['out_folder'], 'Germany'), 'multiple_entries_'+RegioStaR.lower(), 'txt') return RegioDict