Skip to content

Commit 23b5d3b

Browse files
authored
Merge branch 'main' into feature/request-by-specific-year
2 parents df03a14 + 2bc4074 commit 23b5d3b

File tree

4 files changed

+135
-73
lines changed

4 files changed

+135
-73
lines changed

.vscode/tasks.json

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,9 @@
1818
"label": "Run Evidence",
1919
"type": "shell",
2020
"command": "if command -v npm > /dev/null 2>&1; then cd analytics/evidence && npm install && npm run sources && npm run dev -- --host 0.0.0.0; else echo 'NPM not installed'; fi",
21+
"windows": {
22+
"command": "cd analytics\\evidence; npm install; npm run sources; npm run dev -- --host 0.0.0.0"
23+
},
2124
"group": "none",
2225
"icon": {
2326
"id": "run"
@@ -29,4 +32,4 @@
2932
}
3033
}
3134
]
32-
}
35+
}

analytics/notebooks/exemple.ipynb

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
"cell_type": "markdown",
55
"metadata": {},
66
"source": [
7-
"# Exemple de notebook - premières analyses des données SISE-Eaux\n"
7+
"# Exemple de notebook - premières analyses des données EDC (Eau Distribuée par Commune)\n"
88
]
99
},
1010
{
@@ -197,7 +197,7 @@
197197
"source": [
198198
"# Affichons les tables\n",
199199
"\n",
200-
"con.table(\"sise_communes\").df()"
200+
"con.table(\"edc_communes\").df()"
201201
]
202202
},
203203
{
@@ -584,7 +584,7 @@
584584
}
585585
],
586586
"source": [
587-
"con.table(\"sise_prelevements\").df()"
587+
"con.table(\"edc_prelevements\").df()"
588588
]
589589
},
590590
{
@@ -1155,7 +1155,7 @@
11551155
}
11561156
],
11571157
"source": [
1158-
"con.table(\"sise_resultats\").df().head(20)"
1158+
"con.table(\"edc_resultats\").df().head(20)"
11591159
]
11601160
},
11611161
{
@@ -1172,9 +1172,9 @@
11721172
}
11731173
],
11741174
"source": [
1175-
"# Chargeons la table sise_communes dans un pandas dataframe, et calculons le nombre de communes\n",
1175+
"# Chargeons la table edc_communes dans un pandas dataframe, et calculons le nombre de communes\n",
11761176
"\n",
1177-
"communes = con.table(\"sise_communes\").to_df()\n",
1177+
"communes = con.table(\"edc_communes\").to_df()\n",
11781178
"nombre_de_communes = communes.nunique()[\"inseecommune\"]\n",
11791179
"print(f\"nombre_de_communes = {nombre_de_communes}\")"
11801180
]
@@ -1228,7 +1228,7 @@
12281228
"\n",
12291229
"con.sql(\"\"\"\n",
12301230
" SELECT libmajparametre, COUNT(*) as count\n",
1231-
" FROM sise_resultats\n",
1231+
" FROM edc_resultats\n",
12321232
" GROUP BY libmajparametre\n",
12331233
" ORDER BY count DESC\n",
12341234
"\"\"\").show()"
@@ -1361,7 +1361,7 @@
13611361
"\n",
13621362
"# ...et faisons la même requête SQL en utilisant l'extension SQL pour Jupyter\n",
13631363
"\n",
1364-
"%sql SELECT libmajparametre, COUNT(*) as count FROM sise_resultats GROUP BY libmajparametre ORDER BY count DESC;"
1364+
"%sql SELECT libmajparametre, COUNT(*) as count FROM edc_resultats GROUP BY libmajparametre ORDER BY count DESC;"
13651365
]
13661366
},
13671367
{
@@ -1414,7 +1414,7 @@
14141414
"\n",
14151415
"con.sql(f\"\"\"\n",
14161416
" SELECT *\n",
1417-
" FROM sise_prelevements\n",
1417+
" FROM edc_prelevements\n",
14181418
" WHERE nomcommuneprinc = '{nomcommune}'\n",
14191419
"\"\"\").show()"
14201420
]

pipelines/tasks/_config_edc.py

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
from typing import Dict
2+
3+
4+
def get_edc_config() -> Dict:
    """
    Return the configuration used to process the EDC (Eau distribuée par commune) datasets.

    The data comes from
    https://www.data.gouv.fr/fr/datasets/resultats-du-controle-sanitaire-de-leau-distribuee-commune-par-commune/
    For each year a dataset is downloadable on a URL like this (ex. 2024):
    https://www.data.gouv.fr/fr/datasets/r/84a67a3b-08a7-4001-98e6-231c74a98139

    :return: A dict with the config used for processing.
        The "source" part is related to the data.gouv datasource:
        ``base_url``, ``available_years`` and per-year ``yearly_files_infos``
        (data.gouv resource id and zip file name).
        The "files" part is related to the extracted files information and
        SQL table names.
    """
    # Single source of truth for the supported years: the per-year resource
    # infos. "available_years" is derived from these keys so that adding a new
    # year only requires one edit.
    yearly_files_infos = {
        "2024": {
            "id": "84a67a3b-08a7-4001-98e6-231c74a98139",
            "zipfile": "dis-2024.zip",
        },
        "2023": {
            "id": "c89dec4a-d985-447c-a102-75ba814c398e",
            "zipfile": "dis-2023.zip",
        },
        "2022": {
            "id": "a97b6074-c4dd-4ef2-8922-b0cf04dbff9a",
            "zipfile": "dis-2022.zip",
        },
        "2021": {
            "id": "d2b432cc-3761-44d3-8e66-48bc15300bb5",
            "zipfile": "dis-2021.zip",
        },
        "2020": {
            "id": "a6cb4fea-ef8c-47a5-acb3-14e49ccad801",
            "zipfile": "dis-2020.zip",
        },
        "2019": {
            "id": "861f2a7d-024c-4bf0-968b-9e3069d9de07",
            "zipfile": "dis-2019.zip",
        },
        "2018": {
            "id": "0513b3c0-dc18-468d-a969-b3508f079792",
            "zipfile": "dis-2018.zip",
        },
        "2017": {
            "id": "5785427b-3167-49fa-a581-aef835f0fb04",
            "zipfile": "dis-2017.zip",
        },
        "2016": {
            "id": "483c84dd-7912-483b-b96f-4fa5e1d8651f",
            "zipfile": "dis-2016.zip",
        },
    }

    edc_config = {
        "source": {
            "base_url": "https://www.data.gouv.fr/fr/datasets/r/",
            # Sorted ascending ("2016" ... "2024"), same order as before.
            "available_years": sorted(yearly_files_infos),
            "yearly_files_infos": yearly_files_infos,
        },
        "files": {
            "communes": {
                "file_name_prefix": "DIS_COM_UDI_",
                "file_extension": ".txt",
                "table_name": "edc_communes",
            },
            "prelevements": {
                "file_name_prefix": "DIS_PLV_",
                "file_extension": ".txt",
                "table_name": "edc_prelevements",
            },
            "resultats": {
                "file_name_prefix": "DIS_RESULT_",
                "file_extension": ".txt",
                "table_name": "edc_resultats",
            },
        },
    }

    return edc_config
88+
89+
90+
def create_edc_yearly_filename(
    file_name_prefix: str, file_extension: str, year: str
) -> str:
    """
    Build the yearly filename of an extracted EDC file.

    Intended for use with the edc_config["files"] entries: the prefix and
    extension come from the config, the year is inserted between them.
    For example, the 2024 communes file is named ``DIS_COM_UDI_2024.txt``.

    :param file_name_prefix: prefix of the filename (e.g. "DIS_COM_UDI_")
    :param file_extension: extension of the file, including the dot (e.g. ".txt")
    :param year: year of the needed file, as a string
    :return: the yearly filename as a string
    """
    return f"{file_name_prefix}{year}{file_extension}"

pipelines/tasks/build_database.py

Lines changed: 19 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,10 @@
1111
import requests
1212

1313
from ._common import CACHE_FOLDER, DUCKDB_FILE, clear_cache
14+
from ._config_edc import get_edc_config, create_edc_yearly_filename
1415

1516
logger = logging.getLogger(__name__)
17+
edc_config = get_edc_config()
1618

1719

1820
def check_table_existence(conn: duckdb.DuckDBPyConnection, table_name: str) -> bool:
@@ -31,30 +33,6 @@ def check_table_existence(conn: duckdb.DuckDBPyConnection, table_name: str) -> b
3133
return list(conn.fetchone())[0] == 1
3234

3335

34-
def get_yearly_edc_infos(year: str) -> Dict[str, str]:
35-
"""
36-
Returns information for yearly dataset extract of the EDC (Eau distribuée par commune) datasets.
37-
The data comes from https://www.data.gouv.fr/fr/datasets/resultats-du-controle-sanitaire-de-leau-distribuee-commune-par-commune/
38-
For each year a dataset is downloadable on a URL like this (ex. 2024):
39-
https://www.data.gouv.fr/fr/datasets/r/84a67a3b-08a7-4001-98e6-231c74a98139
40-
The id of the dataset is the last part of this URL
41-
The name of the dataset is dis-YEAR.zip (but the format could potentially change).
42-
:param year: The year from which we want to get the dataset information.
43-
:return: A dict with the id and name of the dataset.
44-
"""
45-
edc_dis_files_info_by_year = {
46-
"2024": {"id": "84a67a3b-08a7-4001-98e6-231c74a98139", "name": "dis-2024.zip"},
47-
"2023": {"id": "c89dec4a-d985-447c-a102-75ba814c398e", "name": "dis-2023.zip"},
48-
"2022": {"id": "a97b6074-c4dd-4ef2-8922-b0cf04dbff9a", "name": "dis-2022.zip"},
49-
"2021": {"id": "d2b432cc-3761-44d3-8e66-48bc15300bb5", "name": "dis-2021.zip"},
50-
"2020": {"id": "a6cb4fea-ef8c-47a5-acb3-14e49ccad801", "name": "dis-2020.zip"},
51-
"2019": {"id": "861f2a7d-024c-4bf0-968b-9e3069d9de07", "name": "dis-2019.zip"},
52-
"2018": {"id": "0513b3c0-dc18-468d-a969-b3508f079792", "name": "dis-2018.zip"},
53-
"2017": {"id": "5785427b-3167-49fa-a581-aef835f0fb04", "name": "dis-2017.zip"},
54-
"2016": {"id": "483c84dd-7912-483b-b96f-4fa5e1d8651f", "name": "dis-2016.zip"},
55-
}
56-
return edc_dis_files_info_by_year[year]
57-
5836

5937
def download_extract_insert_yearly_edc_data(year: str):
6038
"""
@@ -64,31 +42,16 @@ def download_extract_insert_yearly_edc_data(year: str):
6442
:return: Create or replace the associated tables in the duckdb database.
6543
It adds the column "de_partition" based on year as an integer.
6644
"""
67-
68-
yearly_dataset_info = get_yearly_edc_infos(year=year)
69-
7045
# Dataset specific constants
71-
DATA_URL = f"https://www.data.gouv.fr/fr/datasets/r/{yearly_dataset_info['id']}"
72-
ZIP_FILE = os.path.join(CACHE_FOLDER, yearly_dataset_info["name"])
46+
DATA_URL = (
47+
edc_config["source"]["base_url"]
48+
+ edc_config["source"]["yearly_files_infos"][year]["id"]
49+
)
50+
ZIP_FILE = os.path.join(
51+
CACHE_FOLDER, edc_config["source"]["yearly_files_infos"][year]["zipfile"]
52+
)
7353
EXTRACT_FOLDER = os.path.join(CACHE_FOLDER, f"raw_data_{year}")
74-
75-
FILES = {
76-
"communes": {
77-
"filename_prefix": f"DIS_COM_UDI_",
78-
"file_extension": ".txt",
79-
"table_name": f"edc_communes",
80-
},
81-
"prelevements": {
82-
"filename_prefix": f"DIS_PLV_",
83-
"file_extension": ".txt",
84-
"table_name": f"edc_prelevements",
85-
},
86-
"resultats": {
87-
"filename_prefix": f"DIS_RESULT_",
88-
"file_extension": ".txt",
89-
"table_name": f"edc_resultats",
90-
},
91-
}
54+
FILES = edc_config["files"]
9255

9356
logger.info(f"Processing EDC dataset for {year}...")
9457
response = requests.get(DATA_URL, stream=True)
@@ -106,7 +69,11 @@ def download_extract_insert_yearly_edc_data(year: str):
10669
for file_info in FILES.values():
10770
filepath = os.path.join(
10871
EXTRACT_FOLDER,
109-
f"{file_info['filename_prefix']}{year}{file_info['file_extension']}",
72+
create_edc_yearly_filename(
73+
file_name_prefix=file_info["file_name_prefix"],
74+
file_extension=file_info["file_extension"],
75+
year=year,
76+
),
11077
)
11178

11279
if check_table_existence(conn=conn, table_name=f"{file_info['table_name']}"):
@@ -124,7 +91,8 @@ def download_extract_insert_yearly_edc_data(year: str):
12491
query_select = f"""
12592
SELECT
12693
*,
127-
CAST({year} as INTEGER) AS de_partition
94+
CAST({year} AS INTEGER) AS de_partition,
95+
current_date AS de_ingestion_date
12896
FROM read_csv('{filepath}', header=true, delim=',');
12997
"""
13098

@@ -139,7 +107,7 @@ def download_extract_insert_yearly_edc_data(year: str):
139107

140108

141109
def process_edc_datasets(
142-
refresh_type: Literal["all", "last", "custom"] = "all",
110+
refresh_type: Literal["all", "last", "custom"] = "last",
143111
custom_years: List[str] = None,
144112
):
145113
"""
@@ -151,17 +119,7 @@ def process_edc_datasets(
151119
:param custom_years: years to update
152120
:return:
153121
"""
154-
available_years = [
155-
"2016",
156-
"2017",
157-
"2018",
158-
"2019",
159-
"2020",
160-
"2021",
161-
"2022",
162-
"2023",
163-
"2024",
164-
]
122+
available_years = edc_config["source"]["available_years"]
165123

166124
if refresh_type == "all":
167125
years_to_update = available_years
@@ -181,7 +139,6 @@ def process_edc_datasets(
181139
raise ValueError(
182140
""" custom_years parameter needs to be specified if refresh_type="custom" """
183141
)
184-
185142
else:
186143
raise ValueError(
187144
f""" refresh_type needs to be one of ["all", "last", "custom"], it can't be: {refresh_type}"""
@@ -196,7 +153,6 @@ def process_edc_datasets(
196153
clear_cache(recreate_folder=False)
197154
return True
198155

199-
200156
def execute(refresh_type: str = "all", custom_years: List[str] = None):
201157
"""
202158
Execute the EDC dataset processing with specified parameters.

0 commit comments

Comments
 (0)