Skip to content

Commit e8198c8

Browse files
SprinTechmoreaupascal56jereze
authored
Feature/request by specific year (#11)
* add fct get_yearly_dataset_infos * parametrize process_sise_eaux_dataset_2024 * recreate cache folder by default in clear_cache() * add check_table_existence function * parametrize process_sise_eaux_dataset_2024() * format _common.py * add process_sise_eaux_dataset() controller and rename process_sise_eaux_dataset_2024 to download_extract_insert_yearly_SISE_data * upd docstrings, formatting * upd logs and add a clear_cache() in process_sise_eaux_dataset * reorganize file * add notebook to preview data * fix Incompatible types in assignment * rename SISE to EDC * rename annee_prelevement to de_partition * catch and raise error if refresh_type not in the allowed values * format * fix typo * Add cli argument to request by specific years * Improve documentation from CLI * Adapt run command to task --------- Co-authored-by: moreaupascal56 <moreaupascal56@gmail.com> Co-authored-by: Jeremy Greze <jereze@users.noreply.github.com>
1 parent 36c4887 commit e8198c8

File tree

6 files changed

+135
-27
lines changed

6 files changed

+135
-27
lines changed

README.md

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -80,10 +80,22 @@ Tout le code dans pipelines sera installé en tant que package python automatiqu
8080
### Comment construire la database
8181

8282
Une fois l'environnement python setup avec uv, vous pouvez lancer data_pipeline/run.py pour remplir la database
83-
Il suffit de lancer
8483

84+
Le téléchargement des données peut se faire de plusieurs manières :
85+
* 1. Téléchargement des données de la dernière année (par défaut)
8586
```bash
86-
uv run pipelines/run.py run build_database
87+
uv run pipelines/run.py run build_database --refresh-type last
88+
```
89+
90+
* 2. Téléchargement de toutes les données
91+
92+
```bash
93+
uv run pipelines/run.py run build_database --refresh-type all
94+
```
95+
96+
* 3. Téléchargement de données d'années spécifiques
97+
```bash
98+
uv run pipelines/run.py run build_database --refresh-type custom --custom-years 2018,2024,...
8799
```
88100

89101
### Comment télécharger la database depuis S3

pipelines/config/.env.example

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,2 @@
11
SCW_ACCESS_KEY=MyKey
2-
SCW_SECRET_KEY=MySecret
3-
ENV=dev
2+
SCW_SECRET_KEY=MySecret

pipelines/run.py

Lines changed: 71 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import importlib
22
import logging
33
import os
4-
import sys
54

65
import click
76

@@ -26,29 +25,82 @@ def cli():
2625
def list():
2726
"""List all available tasks."""
2827
tasks_dir = os.path.join(os.path.dirname(__file__), "tasks")
29-
click.echo("Available tasks:")
30-
for filename in os.listdir(tasks_dir):
28+
29+
for filename in sorted(os.listdir(tasks_dir)):
3130
if filename.endswith(".py") and not filename.startswith("_"):
3231
module_name = filename[:-3]
3332
module = importlib.import_module(f"tasks.{module_name}")
34-
description = module.__doc__ or "No description"
35-
description = description.strip().split("\n")[0]
36-
click.echo(f"- {module_name}: {description}")
3733

34+
doc = module.__doc__ or "No description"
35+
doc_lines = doc.strip().split("\n")
36+
while doc_lines and not doc_lines[0].strip():
37+
doc_lines.pop(0)
38+
while doc_lines and not doc_lines[-1].strip():
39+
doc_lines.pop()
3840

39-
@cli.command()
40-
@click.argument("task_name")
41-
def run(task_name):
42-
"""Run a specified task."""
43-
try:
44-
module = importlib.import_module(f"tasks.{task_name}")
45-
task_func = getattr(module, "execute")
46-
logging.info(f"Starting task {task_name}...")
47-
task_func()
48-
logging.info(f"Task {task_name} completed.")
49-
except (ModuleNotFoundError, AttributeError):
50-
logging.error(f"Task {task_name} not found.")
51-
sys.exit(1)
41+
click.echo(f"\n{module_name}:")
42+
for line in doc_lines:
43+
click.echo(f" {line}")
44+
45+
46+
@cli.group()
47+
def run():
48+
"""Run tasks."""
49+
pass
50+
51+
52+
@run.command("build_database")
53+
@click.option(
54+
"--refresh-type",
55+
type=click.Choice(["all", "last", "custom"]),
56+
default="all",
57+
help="Type of refresh to perform",
58+
)
59+
@click.option(
60+
"--custom-years",
61+
type=str,
62+
help="Comma-separated list of years to process (for custom refresh type)",
63+
)
64+
def run_build_database(refresh_type, custom_years):
65+
"""Run build_database task."""
66+
module = importlib.import_module("tasks.build_database")
67+
task_func = getattr(module, "execute")
68+
69+
custom_years_list = None
70+
if custom_years:
71+
custom_years_list = [year.strip() for year in custom_years.split(",")]
72+
73+
task_func(refresh_type=refresh_type, custom_years=custom_years_list)
74+
75+
76+
@run.command("download_database")
77+
@click.option(
78+
"--env",
79+
type=click.Choice(["dev", "prod"]),
80+
default="prod",
81+
help="Environment to download from",
82+
)
83+
def run_download_database(env):
84+
"""Download database from S3."""
85+
os.environ["ENVIRONMENT"] = env
86+
module = importlib.import_module("tasks.download_database")
87+
task_func = getattr(module, "execute")
88+
task_func()
89+
90+
91+
@run.command("upload_database")
92+
@click.option(
93+
"--env",
94+
type=click.Choice(["dev", "prod"]),
95+
default="dev",
96+
help="Environment to upload to",
97+
)
98+
def run_upload_database(env):
99+
"""Upload database to S3."""
100+
os.environ["ENVIRONMENT"] = env
101+
module = importlib.import_module("tasks.upload_database")
102+
task_func = getattr(module, "execute")
103+
task_func()
52104

53105

54106
if __name__ == "__main__":

pipelines/tasks/build_database.py

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,14 @@
11
"""
22
Consolidate data into the database.
3+
4+
Args:
5+
- refresh-type (str): Type of refresh to perform ("all", "last", or "custom")
6+
- custom-years (str): List of years to process when refresh_type is "custom"
7+
8+
Examples:
9+
- build_database --refresh-type all : Process all years
10+
- build_database --refresh-type last : Process last year only
11+
- build_database --refresh-type custom --custom-years 2018,2024 : Process only the years 2018 and 2024
312
"""
413

514
import logging
@@ -41,7 +50,6 @@ def download_extract_insert_yearly_edc_data(year: str):
4150
:return: Create or replace the associated tables in the duckdb database.
4251
It adds the column "de_partition" based on year as an integer.
4352
"""
44-
4553
# Dataset specific constants
4654
DATA_URL = (
4755
edc_config["source"]["base_url"]
@@ -127,7 +135,16 @@ def process_edc_datasets(
127135
years_to_update = available_years[-1:]
128136
elif refresh_type == "custom":
129137
if custom_years:
130-
years_to_update = list(set(custom_years).intersection(available_years))
138+
# Check that every year provided is available
139+
invalid_years = set(custom_years) - set(available_years)
140+
if invalid_years:
141+
raise ValueError(
142+
f"Invalid years provided: {sorted(invalid_years)}. Years must be among: {available_years}"
143+
)
144+
# Filtering and sorting of valid years
145+
years_to_update = sorted(
146+
list(set(custom_years).intersection(available_years))
147+
)
131148
else:
132149
raise ValueError(
133150
""" custom_years parameter needs to be specified if refresh_type="custom" """
@@ -147,5 +164,11 @@ def process_edc_datasets(
147164
return True
148165

149166

150-
def execute():
151-
process_edc_datasets()
167+
def execute(refresh_type: str = "all", custom_years: List[str] = None):
168+
"""
169+
Execute the EDC dataset processing with specified parameters.
170+
171+
:param refresh_type: Type of refresh to perform ("all", "last", or "custom")
172+
:param custom_years: List of years to process when refresh_type is "custom"
173+
"""
174+
process_edc_datasets(refresh_type=refresh_type, custom_years=custom_years)

pipelines/tasks/download_database.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,14 @@
1+
"""
2+
Download database from S3 storage.
3+
4+
Args:
5+
- env (str): Environment to download from ("dev" or "prod")
6+
7+
Examples:
8+
- download_database --env prod : Download database from production environment
9+
- download_database --env dev : Download database from development environment
10+
"""
11+
112
import logging
213

314
from pipelines.config.config import get_environment, get_s3_path

pipelines/tasks/upload_database.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,14 @@
1+
"""
2+
Upload database to S3 storage.
3+
4+
Args:
5+
- env (str): Environment to upload to ("dev" or "prod")
6+
7+
Examples:
8+
- upload_database --env dev : Upload database to development environment
9+
- upload_database --env prod : Upload database to production environment
10+
"""
11+
112
import logging
213

314
from pipelines.config.config import get_environment, get_s3_path

0 commit comments

Comments (0)