Skip to content

Commit 23b5d3b

Browse files
authored
Merge branch 'main' into feature/request-by-specific-year
2 parents df03a14 + 2bc4074 commit 23b5d3b

File tree

4 files changed

+135
-73
lines changed

4 files changed

+135
-73
lines changed

.vscode/tasks.json

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,9 @@
1818
"label": "Run Evidence",
1919
"type": "shell",
2020
"command": "if command -v npm > /dev/null 2>&1; then cd analytics/evidence && npm install && npm run sources && npm run dev -- --host 0.0.0.0; else echo 'NPM not installed'; fi",
21+
"windows": {
22+
"command": "cd analytics\\evidence; npm install; npm run sources; npm run dev -- --host 0.0.0.0"
23+
},
2124
"group": "none",
2225
"icon": {
2326
"id": "run"
@@ -29,4 +32,4 @@
2932
}
3033
}
3134
]
32-
}
35+
}

analytics/notebooks/exemple.ipynb

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
"cell_type": "markdown",
55
"metadata": {},
66
"source": [
7-
"# Exemple de notebook - premières analyses des données SISE-Eaux\n"
7+
"# Exemple de notebook - premières analyses des données EDC (Eau Distribuée par Commune)\n"
88
]
99
},
1010
{
@@ -197,7 +197,7 @@
197197
"source": [
198198
"# Affichons les tables\n",
199199
"\n",
200-
"con.table(\"sise_communes\").df()"
200+
"con.table(\"edc_communes\").df()"
201201
]
202202
},
203203
{
@@ -584,7 +584,7 @@
584584
}
585585
],
586586
"source": [
587-
"con.table(\"sise_prelevements\").df()"
587+
"con.table(\"edc_prelevements\").df()"
588588
]
589589
},
590590
{
@@ -1155,7 +1155,7 @@
11551155
}
11561156
],
11571157
"source": [
1158-
"con.table(\"sise_resultats\").df().head(20)"
1158+
"con.table(\"edc_resultats\").df().head(20)"
11591159
]
11601160
},
11611161
{
@@ -1172,9 +1172,9 @@
11721172
}
11731173
],
11741174
"source": [
1175-
"# Chargeons la table sise_communes dans un pandas dataframe, et calculons le nombre de communes\n",
1175+
"# Chargeons la table edc_communes dans un pandas dataframe, et calculons le nombre de communes\n",
11761176
"\n",
1177-
"communes = con.table(\"sise_communes\").to_df()\n",
1177+
"communes = con.table(\"edc_communes\").to_df()\n",
11781178
"nombre_de_communes = communes.nunique()[\"inseecommune\"]\n",
11791179
"print(f\"nombre_de_communes = {nombre_de_communes}\")"
11801180
]
@@ -1228,7 +1228,7 @@
12281228
"\n",
12291229
"con.sql(\"\"\"\n",
12301230
" SELECT libmajparametre, COUNT(*) as count\n",
1231-
" FROM sise_resultats\n",
1231+
" FROM edc_resultats\n",
12321232
" GROUP BY libmajparametre\n",
12331233
" ORDER BY count DESC\n",
12341234
"\"\"\").show()"
@@ -1361,7 +1361,7 @@
13611361
"\n",
13621362
"# ...et faisons la même requête SQL en utilisant l'extension SQL pour Jupyter\n",
13631363
"\n",
1364-
"%sql SELECT libmajparametre, COUNT(*) as count FROM sise_resultats GROUP BY libmajparametre ORDER BY count DESC;"
1364+
"%sql SELECT libmajparametre, COUNT(*) as count FROM edc_resultats GROUP BY libmajparametre ORDER BY count DESC;"
13651365
]
13661366
},
13671367
{
@@ -1414,7 +1414,7 @@
14141414
"\n",
14151415
"con.sql(f\"\"\"\n",
14161416
" SELECT *\n",
1417-
" FROM sise_prelevements\n",
1417+
" FROM edc_prelevements\n",
14181418
" WHERE nomcommuneprinc = '{nomcommune}'\n",
14191419
"\"\"\").show()"
14201420
]

pipelines/tasks/_config_edc.py

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
from typing import Dict
2+
3+
4+
def get_edc_config() -> Dict:
    """
    Return the configuration used to process the EDC (Eau distribuée par commune) datasets.

    The data comes from
    https://www.data.gouv.fr/fr/datasets/resultats-du-controle-sanitaire-de-leau-distribuee-commune-par-commune/
    For each year a dataset is downloadable on a URL like this (ex. 2024):
    https://www.data.gouv.fr/fr/datasets/r/84a67a3b-08a7-4001-98e6-231c74a98139

    :return: A dict with the config used for processing.
        The "source" part is related to the data.gouv datasource:
        ``base_url``, ``available_years`` and per-year ``yearly_files_infos``
        (data.gouv resource id and zip file name).
        The "files" part is related to the extracted files information and
        SQL table names.
    """
    # Single source of truth for the supported years: the per-year resource
    # infos. "available_years" is derived from these keys so that adding a new
    # year only requires one edit.
    yearly_files_infos = {
        "2024": {
            "id": "84a67a3b-08a7-4001-98e6-231c74a98139",
            "zipfile": "dis-2024.zip",
        },
        "2023": {
            "id": "c89dec4a-d985-447c-a102-75ba814c398e",
            "zipfile": "dis-2023.zip",
        },
        "2022": {
            "id": "a97b6074-c4dd-4ef2-8922-b0cf04dbff9a",
            "zipfile": "dis-2022.zip",
        },
        "2021": {
            "id": "d2b432cc-3761-44d3-8e66-48bc15300bb5",
            "zipfile": "dis-2021.zip",
        },
        "2020": {
            "id": "a6cb4fea-ef8c-47a5-acb3-14e49ccad801",
            "zipfile": "dis-2020.zip",
        },
        "2019": {
            "id": "861f2a7d-024c-4bf0-968b-9e3069d9de07",
            "zipfile": "dis-2019.zip",
        },
        "2018": {
            "id": "0513b3c0-dc18-468d-a969-b3508f079792",
            "zipfile": "dis-2018.zip",
        },
        "2017": {
            "id": "5785427b-3167-49fa-a581-aef835f0fb04",
            "zipfile": "dis-2017.zip",
        },
        "2016": {
            "id": "483c84dd-7912-483b-b96f-4fa5e1d8651f",
            "zipfile": "dis-2016.zip",
        },
    }

    edc_config = {
        "source": {
            "base_url": "https://www.data.gouv.fr/fr/datasets/r/",
            # Sorted ascending ("2016" ... "2024"), same order as before.
            "available_years": sorted(yearly_files_infos),
            "yearly_files_infos": yearly_files_infos,
        },
        "files": {
            "communes": {
                "file_name_prefix": "DIS_COM_UDI_",
                "file_extension": ".txt",
                "table_name": "edc_communes",
            },
            "prelevements": {
                "file_name_prefix": "DIS_PLV_",
                "file_extension": ".txt",
                "table_name": "edc_prelevements",
            },
            "resultats": {
                "file_name_prefix": "DIS_RESULT_",
                "file_extension": ".txt",
                "table_name": "edc_resultats",
            },
        },
    }

    return edc_config
88+
89+
90+
def create_edc_yearly_filename(
    file_name_prefix: str, file_extension: str, year: str
) -> str:
    """
    Build the yearly filename of an extracted EDC file.

    Intended for use with the edc_config["files"] entries: the prefix and
    extension come from the config, the year is inserted between them.
    For example, the 2024 communes file is named ``DIS_COM_UDI_2024.txt``.

    :param file_name_prefix: prefix of the filename (e.g. "DIS_COM_UDI_")
    :param file_extension: extension of the file, including the dot (e.g. ".txt")
    :param year: year of the needed file, as a string
    :return: the yearly filename as a string
    """
    return f"{file_name_prefix}{year}{file_extension}"

pipelines/tasks/build_database.py

Lines changed: 19 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,10 @@
1111
import requests
1212

1313
from ._common import CACHE_FOLDER, DUCKDB_FILE, clear_cache
14+
from ._config_edc import get_edc_config, create_edc_yearly_filename
1415

1516
logger = logging.getLogger(__name__)
17+
edc_config = get_edc_config()
1618

1719

1820
def check_table_existence(conn: duckdb.DuckDBPyConnection, table_name: str) -> bool:
@@ -31,30 +33,6 @@ def check_table_existence(conn: duckdb.DuckDBPyConnection, table_name: str) -> b
3133
return list(conn.fetchone())[0] == 1
3234

3335

34-
def get_yearly_edc_infos(year: str) -> Dict[str, str]:
35-
"""
36-
Returns information for yearly dataset extract of the EDC (Eau distribuée par commune) datasets.
37-
The data comes from https://www.data.gouv.fr/fr/datasets/resultats-du-controle-sanitaire-de-leau-distribuee-commune-par-commune/
38-
For each year a dataset is downloadable on a URL like this (ex. 2024):
39-
https://www.data.gouv.fr/fr/datasets/r/84a67a3b-08a7-4001-98e6-231c74a98139
40-
The id of the dataset is the last part of this URL
41-
The name of the dataset is dis-YEAR.zip (but the format could potentially change).
42-
:param year: The year from which we want to get the dataset information.
43-
:return: A dict with the id and name of the dataset.
44-
"""
45-
edc_dis_files_info_by_year = {
46-
"2024": {"id": "84a67a3b-08a7-4001-98e6-231c74a98139", "name": "dis-2024.zip"},
47-
"2023": {"id": "c89dec4a-d985-447c-a102-75ba814c398e", "name": "dis-2023.zip"},
48-
"2022": {"id": "a97b6074-c4dd-4ef2-8922-b0cf04dbff9a", "name": "dis-2022.zip"},
49-
"2021": {"id": "d2b432cc-3761-44d3-8e66-48bc15300bb5", "name": "dis-2021.zip"},
50-
"2020": {"id": "a6cb4fea-ef8c-47a5-acb3-14e49ccad801", "name": "dis-2020.zip"},
51-
"2019": {"id": "861f2a7d-024c-4bf0-968b-9e3069d9de07", "name": "dis-2019.zip"},
52-
"2018": {"id": "0513b3c0-dc18-468d-a969-b3508f079792", "name": "dis-2018.zip"},
53-
"2017": {"id": "5785427b-3167-49fa-a581-aef835f0fb04", "name": "dis-2017.zip"},
54-
"2016": {"id": "483c84dd-7912-483b-b96f-4fa5e1d8651f", "name": "dis-2016.zip"},
55-
}
56-
return edc_dis_files_info_by_year[year]
57-
5836

5937
def download_extract_insert_yearly_edc_data(year: str):
6038
"""
@@ -64,31 +42,16 @@ def download_extract_insert_yearly_edc_data(year: str):
6442
:return: Create or replace the associated tables in the duckdb database.
6543
It adds the column "de_partition" based on year as an integer.
6644
"""
67-
68-
yearly_dataset_info = get_yearly_edc_infos(year=year)
69-
7045
# Dataset specific constants
71-
DATA_URL = f"https://www.data.gouv.fr/fr/datasets/r/{yearly_dataset_info['id']}"
72-
ZIP_FILE = os.path.join(CACHE_FOLDER, yearly_dataset_info["name"])
46+
DATA_URL = (
47+
edc_config["source"]["base_url"]
48+
+ edc_config["source"]["yearly_files_infos"][year]["id"]
49+
)
50+
ZIP_FILE = os.path.join(
51+
CACHE_FOLDER, edc_config["source"]["yearly_files_infos"][year]["zipfile"]
52+
)
7353
EXTRACT_FOLDER = os.path.join(CACHE_FOLDER, f"raw_data_{year}")
74-
75-
FILES = {
76-
"communes": {
77-
"filename_prefix": f"DIS_COM_UDI_",
78-
"file_extension": ".txt",
79-
"table_name": f"edc_communes",
80-
},
81-
"prelevements": {
82-
"filename_prefix": f"DIS_PLV_",
83-
"file_extension": ".txt",
84-
"table_name": f"edc_prelevements",
85-
},
86-
"resultats": {
87-
"filename_prefix": f"DIS_RESULT_",
88-
"file_extension": ".txt",
89-
"table_name": f"edc_resultats",
90-
},
91-
}
54+
FILES = edc_config["files"]
9255

9356
logger.info(f"Processing EDC dataset for {year}...")
9457
response = requests.get(DATA_URL, stream=True)
@@ -106,7 +69,11 @@ def download_extract_insert_yearly_edc_data(year: str):
10669
for file_info in FILES.values():
10770
filepath = os.path.join(
10871
EXTRACT_FOLDER,
109-
f"{file_info['filename_prefix']}{year}{file_info['file_extension']}",
72+
create_edc_yearly_filename(
73+
file_name_prefix=file_info["file_name_prefix"],
74+
file_extension=file_info["file_extension"],
75+
year=year,
76+
),
11077
)
11178

11279
if check_table_existence(conn=conn, table_name=f"{file_info['table_name']}"):
@@ -124,7 +91,8 @@ def download_extract_insert_yearly_edc_data(year: str):
12491
query_select = f"""
12592
SELECT
12693
*,
127-
CAST({year} as INTEGER) AS de_partition
94+
CAST({year} AS INTEGER) AS de_partition,
95+
current_date AS de_ingestion_date
12896
FROM read_csv('{filepath}', header=true, delim=',');
12997
"""
13098

@@ -139,7 +107,7 @@ def download_extract_insert_yearly_edc_data(year: str):
139107

140108

141109
def process_edc_datasets(
142-
refresh_type: Literal["all", "last", "custom"] = "all",
110+
refresh_type: Literal["all", "last", "custom"] = "last",
143111
custom_years: List[str] = None,
144112
):
145113
"""
@@ -151,17 +119,7 @@ def process_edc_datasets(
151119
:param custom_years: years to update
152120
:return:
153121
"""
154-
available_years = [
155-
"2016",
156-
"2017",
157-
"2018",
158-
"2019",
159-
"2020",
160-
"2021",
161-
"2022",
162-
"2023",
163-
"2024",
164-
]
122+
available_years = edc_config["source"]["available_years"]
165123

166124
if refresh_type == "all":
167125
years_to_update = available_years
@@ -181,7 +139,6 @@ def process_edc_datasets(
181139
raise ValueError(
182140
""" custom_years parameter needs to be specified if refresh_type="custom" """
183141
)
184-
185142
else:
186143
raise ValueError(
187144
f""" refresh_type needs to be one of ["all", "last", "custom"], it can't be: {refresh_type}"""
@@ -196,7 +153,6 @@ def process_edc_datasets(
196153
clear_cache(recreate_folder=False)
197154
return True
198155

199-
200156
def execute(refresh_type: str = "all", custom_years: List[str] = None):
201157
"""
202158
Execute the EDC dataset processing with specified parameters.

0 commit comments

Comments
 (0)