import logging
import os
from typing import List, Literal, Optional
from zipfile import ZipFile

import duckdb
import requests

from ._common import CACHE_FOLDER, DUCKDB_FILE, clear_cache
from ._config_edc import get_edc_config, create_edc_yearly_filename
13 | 15 |
|
14 | 16 | logger = logging.getLogger(__name__)
|
| 17 | +edc_config = get_edc_config() |
15 | 18 |
|
16 | 19 |
|
17 |
def check_table_existence(conn: duckdb.DuckDBPyConnection, table_name: str) -> bool:
    """
    Check if a table exists in the duckdb database.

    :param conn: The duckdb connection to use
    :param table_name: The table name to check existence
    :return: True if the table exists, False if not
    """
    # Bind the table name as a parameter instead of interpolating it into
    # the SQL string: safer quoting, and no injection risk if the name ever
    # comes from outside the config.
    query = """
        SELECT COUNT(*)
        FROM information_schema.tables
        WHERE table_name = ?
        """
    conn.execute(query, [table_name])
    # fetchone() returns a one-element tuple with the count. Use >= 1 so a
    # table present in several schemas still counts as "exists" (the old
    # `== 1` check would wrongly report False in that case).
    return conn.fetchone()[0] >= 1
def download_extract_insert_yearly_edc_data(year: str):
    """
    Downloads from www.data.gouv.fr the EDC (Eau distribuée par commune) dataset
    for one year, extracts the files and inserts the data into duckdb.

    :param year: The year from which we want to download the dataset
    :return: True on success. Creates or updates the associated tables in the
        duckdb database, adding a "de_partition" column (the year as an integer)
        and a "de_ingestion_date" column.
    """
    # Dataset specific constants (hoist the repeated config lookups).
    yearly_infos = edc_config["source"]["yearly_files_infos"][year]
    DATA_URL = edc_config["source"]["base_url"] + yearly_infos["id"]
    ZIP_FILE = os.path.join(CACHE_FOLDER, yearly_infos["zipfile"])
    EXTRACT_FOLDER = os.path.join(CACHE_FOLDER, f"raw_data_{year}")
    FILES = edc_config["files"]

    logger.info(f"Processing EDC dataset for {year}...")
    # timeout: don't hang forever on a stalled server;
    # raise_for_status: fail fast instead of writing an HTML error page
    # into the zip file.
    response = requests.get(DATA_URL, stream=True, timeout=60)
    response.raise_for_status()
    with open(ZIP_FILE, "wb") as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)

    logger.info(" Extracting files...")
    with ZipFile(ZIP_FILE, "r") as zip_ref:
        zip_ref.extractall(EXTRACT_FOLDER)

    logger.info(" Creating or updating tables in the database...")
    conn = duckdb.connect(DUCKDB_FILE)
    try:
        for file_info in FILES.values():
            filepath = os.path.join(
                EXTRACT_FOLDER,
                create_edc_yearly_filename(
                    file_name_prefix=file_info["file_name_prefix"],
                    file_extension=file_info["file_extension"],
                    year=year,
                ),
            )
            table_name = file_info["table_name"]

            if check_table_existence(conn=conn, table_name=table_name):
                # Idempotent re-run: drop this year's partition before
                # re-inserting, so processing the same year twice does not
                # duplicate rows.
                query = f"""
                    DELETE FROM {table_name}
                    WHERE de_partition = CAST({year} as INTEGER)
                    ;
                """
                conn.execute(query)
                query_start = f"INSERT INTO {table_name} "
            else:
                query_start = f"CREATE TABLE {table_name} AS "

            query_select = f"""
                SELECT
                    *,
                    CAST({year} AS INTEGER) AS de_partition,
                    current_date AS de_ingestion_date
                FROM read_csv('{filepath}', header=true, delim=',');
            """
            conn.execute(query_start + query_select)
    finally:
        # Always release the duckdb handle, even if a query fails.
        conn.close()

    logger.info(" Cleaning up cache...")
    clear_cache()

    return True
|
58 | 107 |
|
59 | 108 |
|
def process_edc_datasets(
    refresh_type: Literal["all", "last", "custom"] = "last",
    custom_years: Optional[List[str]] = None,
):
    """
    Process the EDC datasets.

    :param refresh_type: Refresh type to run
        - "all": Refresh the data for every possible year
        - "last": Refresh the data only for the last available year
        - "custom": Refresh the data for the years specified in the list custom_years
    :param custom_years: years to update (required when refresh_type="custom")
    :raises ValueError: if refresh_type is unknown, or refresh_type="custom"
        without custom_years.
    :return: True once every selected year has been processed.
    """
    available_years = edc_config["source"]["available_years"]

    if refresh_type == "all":
        years_to_update = available_years
    elif refresh_type == "last":
        years_to_update = available_years[-1:]
    elif refresh_type == "custom":
        if custom_years:
            # Keep the chronological order of available_years: a raw
            # set.intersection() would yield a nondeterministic processing
            # order. Membership is unchanged.
            wanted = set(custom_years)
            years_to_update = [year for year in available_years if year in wanted]
        else:
            raise ValueError(
                """ custom_years parameter needs to be specified if refresh_type="custom" """
            )
    else:
        raise ValueError(
            f""" refresh_type needs to be one of ["all", "last", "custom"], it can't be: {refresh_type}"""
        )

    logger.info(f"Launching processing of EDC datasets for years: {years_to_update}")

    for year in years_to_update:
        download_extract_insert_yearly_edc_data(year=year)

    logger.info("Cleaning up cache...")
    clear_cache(recreate_folder=False)
    return True
| 148 | + |
| 149 | + |
60 | 150 | def execute():
|
61 |
| - process_sise_eaux_dataset_2024() |
| 151 | + process_edc_datasets() |
0 commit comments