From 943ee94acd25d56556a904496c5aeb4510f20db8 Mon Sep 17 00:00:00 2001 From: Max Corbeau Date: Wed, 12 Mar 2025 13:23:35 +0100 Subject: [PATCH 01/50] RGPD: anonymiser noms acteurs: init commit --- dags/rgpd/config/__init__.py | 4 + dags/rgpd/config/columns.py | 26 +++++ dags/rgpd/config/paths.py | 6 ++ .../sql/read/rgpd_anonymize_people_read.sql | 32 ++++++ dags/rgpd/config/tasks.py | 10 ++ dags/rgpd/config/xcoms.py | 54 ++++++++++ dags/rgpd/dags/rgpd_anonymize_people.py | 73 +++++++++++++ .../rgpd_anonymize_people_match_task.py | 50 +++++++++ .../rgpd_anonymize_people_read_task.py | 49 +++++++++ .../rgpd_anonymize_people_suggest_task.py | 51 +++++++++ .../rgpd_anonymize_people_match.py | 79 ++++++++++++++ .../rgpd_anonymize_people_read.py | 30 ++++++ .../rgpd_anonymize_people_suggest.py | 101 ++++++++++++++++++ dags/sources/config/shared_constants.py | 3 + data/models/suggestion.py | 1 + 15 files changed, 569 insertions(+) create mode 100644 dags/rgpd/config/__init__.py create mode 100644 dags/rgpd/config/columns.py create mode 100644 dags/rgpd/config/paths.py create mode 100644 dags/rgpd/config/sql/read/rgpd_anonymize_people_read.sql create mode 100644 dags/rgpd/config/tasks.py create mode 100644 dags/rgpd/config/xcoms.py create mode 100644 dags/rgpd/dags/rgpd_anonymize_people.py create mode 100644 dags/rgpd/tasks/airflow_logic/rgpd_anonymize_people_match_task.py create mode 100644 dags/rgpd/tasks/airflow_logic/rgpd_anonymize_people_read_task.py create mode 100644 dags/rgpd/tasks/airflow_logic/rgpd_anonymize_people_suggest_task.py create mode 100644 dags/rgpd/tasks/business_logic/rgpd_anonymize_people_match.py create mode 100644 dags/rgpd/tasks/business_logic/rgpd_anonymize_people_read.py create mode 100644 dags/rgpd/tasks/business_logic/rgpd_anonymize_people_suggest.py diff --git a/dags/rgpd/config/__init__.py b/dags/rgpd/config/__init__.py new file mode 100644 index 000000000..af729ea54 --- /dev/null +++ b/dags/rgpd/config/__init__.py @@ -0,0 +1,4 @@ +from .columns import COLS # noqa: F401 +from .paths import DIR_SQL_READ # noqa: F401 +from .tasks import TASKS # noqa: F401 +from .xcoms import XCOMS, xcom_pull # noqa: F401 diff --git a/dags/rgpd/config/columns.py b/dags/rgpd/config/columns.py new file mode 100644 index 000000000..33567f3e3 --- /dev/null +++ b/dags/rgpd/config/columns.py @@ -0,0 +1,26 @@ +"""Column names for RGPD anonymize DAG. Columns +are used in conf, dataframes and SQL queries. 
These
+don't include Acteur fields (for this we stick to Acteur models)"""
+
+from dataclasses import dataclass
+
+
+@dataclass(frozen=True)
+class COLS:
+    # Dry run
+    DRY_RUN: str = "dry_run"
+
+    # QFDMO
+    QFDMO_ACTEUR_NOMS_ORIGIN: str = "qfdmo_acteur_noms_origine"
+    QFDMO_ACTEUR_NOMS_MATCH: str = "qfdmo_acteur_noms_match"
+    QFDMO_ACTEUR_NOMS_COMPARISON: str = "qfdmo_acteur_noms_comparaison"
+    QFDMO_ACTEUR_ID: str = "qfdmo_acteur_id"
+
+    # Annuaire Entreprise
+    AE_NOM_PREFIX: str = "ae_nom"
+    AE_PRENOM_PREFIX: str = "ae_prenom"
+
+    # Matching
+    MATCH_SCORE: str = "match_score"
+    MATCH_WORDS: str = "match_words"
+    MATCH_THRESHOLD: str = "match_threshold"
diff --git a/dags/rgpd/config/paths.py b/dags/rgpd/config/paths.py
new file mode 100644
index 000000000..2faa9eaac
--- /dev/null
+++ b/dags/rgpd/config/paths.py
@@ -0,0 +1,6 @@
+"""Constants for file paths"""
+
+from pathlib import Path
+
+DIR_CURRENT = Path(__file__).resolve()
+DIR_SQL_READ = DIR_CURRENT.parent / "sql" / "read"
diff --git a/dags/rgpd/config/sql/read/rgpd_anonymize_people_read.sql b/dags/rgpd/config/sql/read/rgpd_anonymize_people_read.sql
new file mode 100644
index 000000000..8701e7deb
--- /dev/null
+++ b/dags/rgpd/config/sql/read/rgpd_anonymize_people_read.sql
@@ -0,0 +1,32 @@
+WITH acteurs_with_siren AS (
+    SELECT
+    TRIM(CONCAT(nom || ' ' || nom_officiel || ' ' || nom_commercial)) AS noms,
+    LEFT(siret,9) AS siren,
+    identifiant_unique,
+    commentaires
+    FROM qfdmo_displayedacteur
+    WHERE siret IS NOT NULL AND siret != '' AND LENGTH(siret) = 14
+    AND commentaires LIKE '%{{filter_comments_contain}}%'
+)
+SELECT
+    unite.siren,
+    acteurs.noms AS qfdmo_acteur_noms_origine,
+    acteurs.noms AS qfdmo_acteur_noms_comparaison,
+    acteurs.identifiant_unique AS qfdmo_acteur_id,
+    acteurs.commentaires AS qfdmo_acteur_commentaires,
+    unite."prenom1UniteLegale" AS "ae_prenom1UniteLegale",
+    unite."prenom2UniteLegale" AS "ae_prenom2UniteLegale",
+    unite."prenom3UniteLegale" AS "ae_prenom3UniteLegale",
+    unite."prenom4UniteLegale" AS "ae_prenom4UniteLegale",
+    unite."nomUniteLegale" AS "ae_nomUniteLegale",
+    unite."nomUsageUniteLegale" AS "ae_nomUsageUniteLegale"
+FROM clone_ae_unite_legale_in_use AS unite
+JOIN acteurs_with_siren AS acteurs ON acteurs.siren = unite.siren
+WHERE(
+    ("prenom1UniteLegale" IS NOT NULL AND "prenom1UniteLegale" != '[ND]')
+    OR ("prenom2UniteLegale" IS NOT NULL AND "prenom2UniteLegale" != '[ND]')
+    OR ("prenom3UniteLegale" IS NOT NULL AND "prenom3UniteLegale" != '[ND]')
+    OR ("prenom4UniteLegale" IS NOT NULL AND "prenom4UniteLegale" != '[ND]')
+    OR ("nomUniteLegale" IS NOT NULL AND "nomUniteLegale" != '[ND]')
+    OR ("nomUsageUniteLegale" IS NOT NULL AND "nomUsageUniteLegale" != '[ND]')
+) \ No newline at end of file
diff --git a/dags/rgpd/config/tasks.py b/dags/rgpd/config/tasks.py
new file mode 100644
index 000000000..651cdb30c
--- /dev/null
+++ b/dags/rgpd/config/tasks.py
@@ -0,0 +1,10 @@
+"""Task IDs for RGPD anonymize people DAG"""
+
+from dataclasses import dataclass
+
+
+@dataclass(frozen=True)
+class TASKS:
+    READ: str = "rgpd_anonymize_people_read"
+    MATCH_SCORE: str = "rgpd_anonymize_people_match"
+    SUGGEST: str = "rgpd_anonymize_people_suggest"
diff --git a/dags/rgpd/config/xcoms.py b/dags/rgpd/config/xcoms.py
new file mode 100644
index 000000000..840a9e0fa
--- /dev/null
+++ b/dags/rgpd/config/xcoms.py
@@ -0,0 +1,54 @@
+"""Constants and helpers to configure XCom for the RGPD DAG,
+so we are more reliable & concise in our XCOM usage
+(so easy to typo a key or pull from wrong task and Airflow
+happily gives
None without complaining)"""
+
+from dataclasses import dataclass
+from typing import Any
+
+import pandas as pd
+from airflow.exceptions import AirflowSkipException
+from airflow.models.taskinstance import TaskInstance
+from rgpd.config.tasks import TASKS
+from utils import logging_utils as log
+
+
+@dataclass(frozen=True)
+class XCOMS:
+    DF_READ: str = "df_read"
+    DF_MATCH: str = "df_match"
+
+
+def xcom_pull(ti: TaskInstance, key: str, skip_if_empty: bool = False) -> Any:
+    """For pulls, we create a helper to constrain keys
+    to specific task ids to guarantee consistent pulls"""
+
+    # Init
+    value: Any = None  # type: ignore
+    msg = f"XCOM from {ti.task_id=} pulling {key=}:"  # For logging
+
+    # Reading values
+    if key == XCOMS.DF_READ:
+        value: pd.DataFrame = ti.xcom_pull(key=key, task_ids=TASKS.READ)
+    elif key == XCOMS.DF_MATCH:
+        value: pd.DataFrame = ti.xcom_pull(key=key, task_ids=TASKS.MATCH_SCORE)
+    else:
+        raise ValueError(f"{msg} key inconnue")
+
+    # Skip if empty
+    if skip_if_empty and (
+        value is None or (isinstance(value, pd.DataFrame) and value.empty)
+    ):
+        raise AirflowSkipException(f"✋ {msg} est vide, on s'arrête là")
+
+    # Logging
+    log.preview(f"{msg} value = ", value)
+
+    return value
+
+
+# We don't have a helper for xcom_push because
+# it can be done via the TaskInstance easily
+# as ti.xcom_push(key=..., value=...)
+# and we don't need to align keys with task ids
+# (task id is automatically that of the pushing task)
diff --git a/dags/rgpd/dags/rgpd_anonymize_people.py b/dags/rgpd/dags/rgpd_anonymize_people.py
new file mode 100644
index 000000000..3e3bcb33b
--- /dev/null
+++ b/dags/rgpd/dags/rgpd_anonymize_people.py
@@ -0,0 +1,73 @@
+"""
+DAG to anonymize QFDMO acteurs whose names
+contain names of people from Annuaire Entreprise (AE)
+"""
+
+from datetime import datetime
+
+from airflow import DAG
+from airflow.models.baseoperator import chain
+from airflow.models.param import Param
+from rgpd.config import COLS
+from rgpd.tasks.airflow_logic.rgpd_anonymize_people_match_task import (
+    rgpd_anonymize_people_match_task,
+)
+from rgpd.tasks.airflow_logic.rgpd_anonymize_people_read_task import (
+    rgpd_anonymize_people_read_task,
+)
+from rgpd.tasks.airflow_logic.rgpd_anonymize_people_suggest_task import (
+    rgpd_anonymize_people_suggest_task,
+)
+
+FILTER_COMMENTS_CONTAIN_DEFAULT = (
+    "source changee le 18-07-2024. Ancienne source CMA non-reparActeur. "
+    "Nouvelle source : LVAO"
+)
+
+with DAG(
+    dag_id="rgpd_anonymize_people",
+    dag_display_name="RGPD - Anonymiser les personnes acteurs",
+    default_args={
+        "owner": "airflow",
+        "depends_on_past": False,
+        "start_date": datetime(2025, 3, 5),
+        "catchup": False,
+        "email_on_failure": False,
+        "email_on_retry": False,
+        "retries": 0,
+    },
+    description=(
+        "Un DAG pour anonymiser les acteurs QFDMO dont "
+        "le nom contient des personnes de l'Annuaire Entreprise (AE)"
+    ),
+    tags=["rgpd", "annuaire", "entreprise", "siren", "ae", "acteurs"],
+    params={
+        COLS.DRY_RUN: Param(
+            True,
+            type="boolean",
+            description_md="🚱 Si coché, aucune tâche d'écriture ne sera effectuée",
+        ),
+        "filter_comments_contain": Param(
+            FILTER_COMMENTS_CONTAIN_DEFAULT,
+            type="string",
+            description_md="🔍 Filtre sur les commentaires pour la lecture des données",
+        ),
+        COLS.MATCH_THRESHOLD: Param(
+            1,
+            type="number",
+            description_md=r"""🎯 Seuil de match pour considérer un acteur
+            anonymisable.
+ - **match** = ratio du nombre de mots du nom de l'acteur qui correspondent + à des mots de nom/prénom des personnes de l'AE + - **threshold** = contrainte en dur de ==1 pour la v1 + """, + ), + }, + schedule=None, + catchup=False, +) as dag: + chain( + rgpd_anonymize_people_read_task(dag), + rgpd_anonymize_people_match_task(dag), + rgpd_anonymize_people_suggest_task(dag), + ) diff --git a/dags/rgpd/tasks/airflow_logic/rgpd_anonymize_people_match_task.py b/dags/rgpd/tasks/airflow_logic/rgpd_anonymize_people_match_task.py new file mode 100644 index 000000000..9323a48e0 --- /dev/null +++ b/dags/rgpd/tasks/airflow_logic/rgpd_anonymize_people_match_task.py @@ -0,0 +1,50 @@ +"""Match acteurs from QFDMO vs. AE based on people names""" + +import logging + +from airflow import DAG +from airflow.exceptions import AirflowSkipException +from airflow.operators.python import PythonOperator +from rgpd.config import COLS, TASKS, XCOMS +from rgpd.tasks.business_logic.rgpd_anonymize_people_match import ( + rgpd_anonymize_people_match, +) + +logger = logging.getLogger(__name__) + + +def task_info_get(): + return f""" + ============================================================ + Description de la tâche "{TASKS.MATCH_SCORE}" + ============================================================ + 💡 quoi: on cherche à déterminer quels acteurs QFDMO ont un + nom qui correspond à des noms de personnes dans l'AE + + 🎯 pourquoi: le but de ce DAG: pouvoir par la suite anonymiser + + 🏗️ comment: normalisation puis matching python sur la base + du ratio de mots dans le nom de l'acteur qui matchent avec des + noms/prénoms de personnes dans l'AE + """ + + +def rgpd_anonymize_people_match_wrapper(ti, params) -> None: + logger.info(task_info_get()) + + df = rgpd_anonymize_people_match( + df=ti.xcom_pull(key=XCOMS.DF_READ), + match_threshold=params[COLS.MATCH_THRESHOLD], + ) + if df.empty: + raise AirflowSkipException("Pas de matches, on s'arrête là") + + ti.xcom_push(key=XCOMS.DF_MATCH, value=df) + + +def rgpd_anonymize_people_match_task(dag: DAG) -> PythonOperator: + return PythonOperator( + task_id=TASKS.MATCH_SCORE, + python_callable=rgpd_anonymize_people_match_wrapper, + dag=dag, + ) diff --git a/dags/rgpd/tasks/airflow_logic/rgpd_anonymize_people_read_task.py b/dags/rgpd/tasks/airflow_logic/rgpd_anonymize_people_read_task.py new file mode 100644 index 000000000..9e95469b6 --- /dev/null +++ b/dags/rgpd/tasks/airflow_logic/rgpd_anonymize_people_read_task.py @@ -0,0 +1,49 @@ +"""Read data from DB needed for RGPD anonymization""" + +import logging + +from airflow import DAG +from airflow.exceptions import AirflowSkipException +from airflow.operators.python import PythonOperator +from rgpd.config import TASKS, XCOMS +from rgpd.tasks.business_logic.rgpd_anonymize_people_read import ( + rgpd_anonymize_people_read, +) + +logger = logging.getLogger(__name__) + + +def task_info_get(): + return f""" + ============================================================ + Description de la tâche "{TASKS.READ}" + ============================================================ + 💡 quoi: lecture des données de la base (QFDMO Acteurs + et Unité Légales de l'Annuaire Entreprise) + + 🎯 pourquoi: faire un pré-filtre sur les matches potentiels + (pas récupérer les ~27M de lignes de la table AE) + + 🏗️ comment: on récupère uniquement les matches SIREN avec + des infos de noms/prénoms dans l'AE + """ + + +def rgpd_anonymize_people_read_wrapper(ti, params) -> None: + logger.info(task_info_get()) + + df = rgpd_anonymize_people_read( + 
filter_comments_contain=params["filter_comments_contain"]
+    )
+    if df.empty:
+        raise AirflowSkipException("Pas de données DB, on s'arrête là")
+
+    ti.xcom_push(key=XCOMS.DF_READ, value=df)
+
+
+def rgpd_anonymize_people_read_task(dag: DAG) -> PythonOperator:
+    return PythonOperator(
+        task_id=TASKS.READ,
+        python_callable=rgpd_anonymize_people_read_wrapper,
+        dag=dag,
+    )
diff --git a/dags/rgpd/tasks/airflow_logic/rgpd_anonymize_people_suggest_task.py b/dags/rgpd/tasks/airflow_logic/rgpd_anonymize_people_suggest_task.py
new file mode 100644
index 000000000..a11a66994
--- /dev/null
+++ b/dags/rgpd/tasks/airflow_logic/rgpd_anonymize_people_suggest_task.py
@@ -0,0 +1,51 @@
+"""Suggest anonymization of QFDMO acteurs matched vs. AE people names"""
+
+import logging
+
+from airflow import DAG
+from airflow.exceptions import AirflowSkipException
+from airflow.operators.python import PythonOperator
+from rgpd.config import COLS, TASKS, XCOMS
+from rgpd.tasks.business_logic.rgpd_anonymize_people_suggest import (
+    rgpd_anonymize_people_suggest,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def task_info_get():
+    return f"""
+    ============================================================
+    Description de la tâche "{TASKS.SUGGEST}"
+    ============================================================
+    💡 quoi: générer des suggestions d'anonymisation pour les
+    acteurs QFDMO matchés avec des noms de personnes de l'AE
+
+    🎯 pourquoi: le but de ce DAG: pouvoir par la suite anonymiser
+
+    🏗️ comment: pour chaque match on prépare un changement
+    d'anonymisation, regroupé dans une cohorte de suggestions
+    à valider en base
+    """
+
+
+def rgpd_anonymize_people_suggest_wrapper(ti, params, dag, run_id) -> None:
+    logger.info(task_info_get())
+
+    rgpd_anonymize_people_suggest(
+        df=ti.xcom_pull(key=XCOMS.DF_MATCH),
+        identifiant_action=dag.dag_id,
+        identifiant_execution=run_id,
+        dry_run=params[COLS.DRY_RUN],
+    )
+    # Flagging as skipped at the end to help read status in Airflow UI
+    if params[COLS.DRY_RUN]:
+        raise AirflowSkipException("Pas de données DB, on s'arrête là")
+
+
+def rgpd_anonymize_people_suggest_task(dag: DAG) -> PythonOperator:
+    return PythonOperator(
+        task_id=TASKS.SUGGEST,
+        python_callable=rgpd_anonymize_people_suggest_wrapper,
+        dag=dag,
+    )
diff --git a/dags/rgpd/tasks/business_logic/rgpd_anonymize_people_match.py b/dags/rgpd/tasks/business_logic/rgpd_anonymize_people_match.py
new file mode 100644
index 000000000..bac4b57bd
--- /dev/null
+++ b/dags/rgpd/tasks/business_logic/rgpd_anonymize_people_match.py
@@ -0,0 +1,79 @@
+"""Match acteurs from QFDMO vs.
AE based on people names""" + +import pandas as pd +from rgpd.config import COLS +from shared.tasks.business_logic import normalize +from utils import logging_utils as log +from utils.raisers import raise_if + + +def word_overlap_ratio( + row: pd.Series, cols_a: list, cols_b: list +) -> tuple[list[str], float]: + # Gather words from columns in cols_a + words_a = set() + for col in cols_a: + if row[col] is not None: + words_a.update(str(row[col]).split()) + + # Gather words from columns in cols_b + words_b = set() + for col in cols_b: + if row[col] is not None: + words_b.update(str(row[col]).split()) + + # Avoid division by zero + if not words_a: + return ([], 0.0) + + words_matched = [word for word in words_a if word in words_b] + words_count = len(words_matched) + ratio = words_count / len(words_a) + if ratio > 1: + raise ValueError(f"ratio > 1 {ratio}: {words_a} - {words_b}") + return (words_matched, ratio) + + +def rgpd_anonymize_people_match( + df: pd.DataFrame, + match_threshold: float = 0.6, +) -> pd.DataFrame: + """Identify matches between QFDMO company names and AE's people names.""" + # TODO: remove first below once métier happy with trying thresholds < 1 + raise_if(match_threshold < 1, f"Seuil de match < 1: {match_threshold}") + raise_if(match_threshold <= 0, f"Seuil de match <= 0: {match_threshold}") + + df = df.copy() + + # Defining columns + cols_names_qfdmo = [COLS.QFDMO_ACTEUR_NOMS_COMPARISON] + cols_names_ae = [ + x + for x in df.columns + if x.startswith(COLS.AE_NOM_PREFIX) or x.startswith(COLS.AE_PRENOM_PREFIX) + ] + + # Normalization + cols_to_norm = cols_names_qfdmo + cols_names_ae + for col in cols_to_norm: + df[col] = df[col].map(normalize.string_basic) + + # Matching + df["temp"] = df.apply( + lambda x: word_overlap_ratio(x, cols_names_qfdmo, cols_names_ae), axis=1 + ) + df[COLS.MATCH_WORDS] = df["temp"].apply(lambda x: x[0]) + df[COLS.MATCH_SCORE] = df["temp"].apply(lambda x: x[1]) + df.drop(columns=["temp"], inplace=True) + + # Selecting & previewing matches + df_no_match = df[df[COLS.MATCH_SCORE] == 0] + df_partial = df[(df[COLS.MATCH_SCORE] > 0) & (df[COLS.MATCH_SCORE] < 1)] + df_perfect = df[df[COLS.MATCH_SCORE] == 1] + df_retained = df[df[COLS.MATCH_SCORE] >= match_threshold].copy() + log.preview_df_as_markdown("🔴 Matches non-existant (==0)", df_no_match) + log.preview_df_as_markdown("🟡 Matches partiel (>0 & <1)", df_partial) + log.preview_df_as_markdown("🟢 Matches parfait (==1)", df_perfect) + log.preview_df_as_markdown(f"💾 Matches retenus (>={match_threshold})", df_retained) + + return df_retained.sort_values(COLS.MATCH_SCORE, ascending=False) diff --git a/dags/rgpd/tasks/business_logic/rgpd_anonymize_people_read.py b/dags/rgpd/tasks/business_logic/rgpd_anonymize_people_read.py new file mode 100644 index 000000000..59a0ad42a --- /dev/null +++ b/dags/rgpd/tasks/business_logic/rgpd_anonymize_people_read.py @@ -0,0 +1,30 @@ +"""Read data from DB needed for RGPD anonymization""" + +import numpy as np +import pandas as pd +from rgpd.config import DIR_SQL_READ +from utils import logging_utils as log +from utils.django import django_setup_full + +django_setup_full() + + +def rgpd_anonymize_people_read(filter_comments_contain: str = "") -> pd.DataFrame: + """Reads necessary QFDMO acteurs and AE entries from DB""" + from django.db import connection + + # Get SQL query and set filter + sql = (DIR_SQL_READ / "rgpd_anonymize_people_read.sql").read_text() + sql = sql.replace("{{filter_comments_contain}}", filter_comments_contain) + + # Execute SQL query and get data + 
with connection.cursor() as cursor: + cursor.execute(sql) + columns = [col[0] for col in cursor.description] + data = cursor.fetchall() + + # Create DataFrame and preview + df = pd.DataFrame(data, columns=columns, dtype="object").replace({np.nan: None}) + log.preview_df_as_markdown("Acteurs & entrées Annuaire Entreprise", df) + + return df diff --git a/dags/rgpd/tasks/business_logic/rgpd_anonymize_people_suggest.py b/dags/rgpd/tasks/business_logic/rgpd_anonymize_people_suggest.py new file mode 100644 index 000000000..098de9c8c --- /dev/null +++ b/dags/rgpd/tasks/business_logic/rgpd_anonymize_people_suggest.py @@ -0,0 +1,101 @@ +"""Generate suggestions from matches""" + +import logging + +import pandas as pd +from rgpd.config import COLS +from sources.config.shared_constants import INFO_TO_HIDE +from utils import logging_utils as log +from utils.django import django_setup_full + +django_setup_full() + +logger = logging.getLogger(__name__) + + +def rgpd_anonymize_people_suggest( + df: pd.DataFrame, + identifiant_action: str, + identifiant_execution: str, + dry_run: bool = True, +) -> list[dict]: + """Generate suggestions from matches""" + from data.models import ( + Suggestion, + SuggestionAction, + SuggestionCohorte, + SuggestionStatut, + ) + from data.models.change import SuggestionChange + from data.models.changes import ChangeActeurUpdateData + from qfdmo.models import Acteur + + # Acteur fields + field_nom = Acteur._meta.get_field("nom").name + field_nom_officiel = Acteur._meta.get_field("nom_officiel").name + field_nom_commercial = Acteur._meta.get_field("nom_commercial").name + + # Prepare suggestions + suggestions = [] + for _, row in df.iterrows(): + changes = [] + + # Preparing & validating the change params + acteur_id = row[COLS.QFDMO_ACTEUR_ID] + model_params = { + "id": acteur_id, + "data": { + field_nom: INFO_TO_HIDE, + field_nom_officiel: INFO_TO_HIDE, + field_nom_commercial: INFO_TO_HIDE, + }, + } + ChangeActeurUpdateData(**model_params).validate() + + # Preparing suggestion with change and ensuring we can JSON serialize it + change = SuggestionChange( + order=1, + reason="Noms/prénoms détectés dans l'Annuaire Entreprise (AE)", + entity_type="acteur_displayed", + model_name=ChangeActeurUpdateData.name(), + model_params=model_params, + ).model_dump() + changes.append(change) + suggestion = { + "contexte": "Idem suggestion", + "suggestion": { + "title": "🕵️ RGPD: anonymiser les noms des acteurs", + "details": { + "noms_origine": row[COLS.QFDMO_ACTEUR_NOMS_ORIGIN], + "mots_de_match": row[COLS.MATCH_WORDS], + "score_de_match": row[COLS.MATCH_SCORE], + "changement": f"""{field_nom} & {field_nom_officiel} & + {field_nom_commercial} -> {INFO_TO_HIDE}""", + }, + "changes": changes, + }, + } + suggestions.append(suggestion) + log.preview(f"Suggestion pour acteur: {acteur_id}", suggestion) + + # Saving suggestions + logging.info(log.banner_string("✍️ Ecritures en DB")) + if dry_run: + logger.info("✋ Dry run: suggestions pas écrites en base") + else: + cohort = SuggestionCohorte( + identifiant_action=identifiant_action, + identifiant_execution=identifiant_execution, + type_action=SuggestionAction.RGPD_ANONYMIZE, + metadata={"Nombre de suggestions": len(suggestions)}, + ) + cohort.save() + for suggestion in suggestions: + Suggestion( + suggestion_cohorte=cohort, + statut=SuggestionStatut.AVALIDER, + contexte=suggestion["contexte"], + suggestion=suggestion["suggestion"], + ).save() + + return suggestions diff --git a/dags/sources/config/shared_constants.py 
b/dags/sources/config/shared_constants.py index dcf228027..52de5a3c3 100755 --- a/dags/sources/config/shared_constants.py +++ b/dags/sources/config/shared_constants.py @@ -32,3 +32,6 @@ # Special field values EMPTY_ACTEUR_FIELD = "__empty__" + +# To handle data we don't want to show on frontend +INFO_TO_HIDE = "[Information masquée]" diff --git a/data/models/suggestion.py b/data/models/suggestion.py index 5ee94a6b2..32f536f37 100644 --- a/data/models/suggestion.py +++ b/data/models/suggestion.py @@ -54,6 +54,7 @@ class SuggestionCohorteStatut(models.TextChoices): class SuggestionAction(models.TextChoices): CRAWL_URLS = SUGGESTION_CRAWL_URLS, "🔗 URLs scannées" + RGPD_ANONYMIZE = "RGPD_ANONYMISATION", "Anonymisation RGPD" CLUSTERING = SUGGESTION_CLUSTERING, "regroupement/déduplication des acteurs" SOURCE_AJOUT = ( SUGGESTION_SOURCE_AJOUT, From eac97dc556aed0d557c57590953b541b3a1cfd17 Mon Sep 17 00:00:00 2001 From: Max Corbeau Date: Wed, 12 Mar 2025 14:48:45 +0100 Subject: [PATCH 02/50] ajout template django data --- .../rgpd_anonymize_people_suggest.py | 10 ++++---- data/models/suggestion.py | 3 +++ .../_partials/generic_suggestion_details.html | 23 +++++++++++++++++++ 3 files changed, 31 insertions(+), 5 deletions(-) create mode 100644 templates/data/_partials/generic_suggestion_details.html diff --git a/dags/rgpd/tasks/business_logic/rgpd_anonymize_people_suggest.py b/dags/rgpd/tasks/business_logic/rgpd_anonymize_people_suggest.py index 098de9c8c..8420d7c4e 100644 --- a/dags/rgpd/tasks/business_logic/rgpd_anonymize_people_suggest.py +++ b/dags/rgpd/tasks/business_logic/rgpd_anonymize_people_suggest.py @@ -65,10 +65,10 @@ def rgpd_anonymize_people_suggest( "contexte": "Idem suggestion", "suggestion": { "title": "🕵️ RGPD: anonymiser les noms des acteurs", - "details": { - "noms_origine": row[COLS.QFDMO_ACTEUR_NOMS_ORIGIN], - "mots_de_match": row[COLS.MATCH_WORDS], - "score_de_match": row[COLS.MATCH_SCORE], + "summary": { + "noms d'origine": row[COLS.QFDMO_ACTEUR_NOMS_ORIGIN], + "mots de match": row[COLS.MATCH_WORDS], + "score de match": row[COLS.MATCH_SCORE], "changement": f"""{field_nom} & {field_nom_officiel} & {field_nom_commercial} -> {INFO_TO_HIDE}""", }, @@ -87,7 +87,7 @@ def rgpd_anonymize_people_suggest( identifiant_action=identifiant_action, identifiant_execution=identifiant_execution, type_action=SuggestionAction.RGPD_ANONYMIZE, - metadata={"Nombre de suggestions": len(suggestions)}, + metadata={"🔢 Nombre de suggestions": len(suggestions)}, ) cohort.save() for suggestion in suggestions: diff --git a/data/models/suggestion.py b/data/models/suggestion.py index 32f536f37..ddc01eecc 100644 --- a/data/models/suggestion.py +++ b/data/models/suggestion.py @@ -225,6 +225,8 @@ def display_suggestion_details(self): template_name = "data/_partials/ajout_suggestion_details.html" elif self.suggestion_cohorte.type_action == SuggestionAction.CRAWL_URLS: template_name = "data/_partials/crawl_urls_suggestion_details.html" + elif self.suggestion_cohorte.type_action == SuggestionAction.RGPD_ANONYMIZE: + template_name = "data/_partials/generic_suggestion_details.html" template_context = self.suggestion.copy() return render_to_string(template_name, template_context) @@ -311,6 +313,7 @@ def apply(self): if self.suggestion_cohorte.type_action in [ SuggestionAction.CLUSTERING, SuggestionAction.CRAWL_URLS, + SuggestionAction.RGPD_ANONYMIZE, ]: changes = self.suggestion["changes"] changes.sort(key=lambda x: x["order"]) diff --git a/templates/data/_partials/generic_suggestion_details.html 
b/templates/data/_partials/generic_suggestion_details.html new file mode 100644 index 000000000..5c3b2f796 --- /dev/null +++ b/templates/data/_partials/generic_suggestion_details.html @@ -0,0 +1,23 @@ +{% extends "data/_partials/suggestion_details.html" %} +{% load custom_filters %} + +{% block suggestion_title %} +{{ title }} +{% endblock suggestion_title %} + +{% block suggestion_details %} + +

+<p>
+💡 Résumé des changements:
+</p>
+<ul>
+{% for key, value in summary.items %}
+<li>{{ key }}: {{ value }}</li>
+{% endfor %}
+</ul>
+
+<p>
+🔢 {{ changes|length }} acteur(s) impacté(s):
+</p>
+ +{% endblock suggestion_details %} From 3a11ae55f463bd845367bd1459add6819a347bab Mon Sep 17 00:00:00 2001 From: Max Corbeau Date: Wed, 12 Mar 2025 15:04:09 +0100 Subject: [PATCH 03/50] migration Django data --- dags/sources/config/shared_constants.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dags/sources/config/shared_constants.py b/dags/sources/config/shared_constants.py index 52de5a3c3..0f25418a5 100755 --- a/dags/sources/config/shared_constants.py +++ b/dags/sources/config/shared_constants.py @@ -8,6 +8,7 @@ # SuggestionCohorte type_action SUGGESTION_CRAWL_URLS = "CRAWL_URLS" +SUGGESTION_RGPD_ANONYMIZE = "RGPD_ANONYMIZE" SUGGESTION_CLUSTERING = "CLUSTERING" SUGGESTION_SOURCE_AJOUT = "SOURCE_AJOUT" SUGGESTION_SOURCE_MODIFICATION = "SOURCE_MODIFICATION" From 03b13a69a693366dd2e40a02d3150672a24c2b9f Mon Sep 17 00:00:00 2001 From: Max Corbeau Date: Wed, 19 Mar 2025 13:01:25 +0100 Subject: [PATCH 04/50] refacto dags/rgpd -> dags/enrich, utilisation dbt --- dags/{rgpd => enrich}/config/__init__.py | 1 + dags/{rgpd => enrich}/config/columns.py | 14 ++++---- dags/enrich/config/dbt.py | 8 +++++ dags/{rgpd => enrich}/config/paths.py | 0 dags/enrich/config/tasks.py | 10 ++++++ dags/{rgpd => enrich}/config/xcoms.py | 2 +- .../dags/enrich_ae_rgpd.py} | 35 ++++++++----------- .../enrich_ae_rgpd_match_task.py} | 14 ++++---- .../enrich_ae_rgpd_read_task.py} | 26 +++++++------- .../enrich_ae_rgpd_suggest_task.py} | 14 ++++---- .../business_logic/enrich_ae_rgpd_match.py} | 24 ++++++------- .../business_logic/enrich_ae_rgpd_read.py} | 16 ++++----- .../business_logic/enrich_ae_rgpd_suggest.py} | 8 ++--- .../sql/read/rgpd_anonymize_people_read.sql | 32 ----------------- dags/rgpd/config/tasks.py | 10 ------ dags/utils/dbt.py | 33 +++++++++++++++++ .../enrich/tasks/test_enrich_ae_rgpd.py | 17 +++++++++ dags_unit_tests/utils/test_dbt.py | 23 ++++++++++++ ...h_ea_rgpd.sql => marts_enrich_ae_rgpd.sql} | 26 ++++++++------ dbt/models/marts/enrich/schema.yml | 14 +++++--- 20 files changed, 191 insertions(+), 136 deletions(-) rename dags/{rgpd => enrich}/config/__init__.py (83%) rename dags/{rgpd => enrich}/config/columns.py (59%) create mode 100644 dags/enrich/config/dbt.py rename dags/{rgpd => enrich}/config/paths.py (100%) create mode 100644 dags/enrich/config/tasks.py rename dags/{rgpd => enrich}/config/xcoms.py (97%) rename dags/{rgpd/dags/rgpd_anonymize_people.py => enrich/dags/enrich_ae_rgpd.py} (62%) rename dags/{rgpd/tasks/airflow_logic/rgpd_anonymize_people_match_task.py => enrich/tasks/airflow_logic/enrich_ae_rgpd_match_task.py} (76%) rename dags/{rgpd/tasks/airflow_logic/rgpd_anonymize_people_read_task.py => enrich/tasks/airflow_logic/enrich_ae_rgpd_read_task.py} (59%) rename dags/{rgpd/tasks/airflow_logic/rgpd_anonymize_people_suggest_task.py => enrich/tasks/airflow_logic/enrich_ae_rgpd_suggest_task.py} (77%) rename dags/{rgpd/tasks/business_logic/rgpd_anonymize_people_match.py => enrich/tasks/business_logic/enrich_ae_rgpd_match.py} (78%) rename dags/{rgpd/tasks/business_logic/rgpd_anonymize_people_read.py => enrich/tasks/business_logic/enrich_ae_rgpd_read.py} (59%) rename dags/{rgpd/tasks/business_logic/rgpd_anonymize_people_suggest.py => enrich/tasks/business_logic/enrich_ae_rgpd_suggest.py} (94%) delete mode 100644 dags/rgpd/config/sql/read/rgpd_anonymize_people_read.sql delete mode 100644 dags/rgpd/config/tasks.py create mode 100644 dags/utils/dbt.py create mode 100644 dags_unit_tests/enrich/tasks/test_enrich_ae_rgpd.py create mode 100644 dags_unit_tests/utils/test_dbt.py rename 
dbt/models/marts/enrich/{marts_enrich_ea_rgpd.sql => marts_enrich_ae_rgpd.sql} (69%) diff --git a/dags/rgpd/config/__init__.py b/dags/enrich/config/__init__.py similarity index 83% rename from dags/rgpd/config/__init__.py rename to dags/enrich/config/__init__.py index af729ea54..77e633a18 100644 --- a/dags/rgpd/config/__init__.py +++ b/dags/enrich/config/__init__.py @@ -1,4 +1,5 @@ from .columns import COLS # noqa: F401 +from .dbt import DBT # noqa: F401 from .paths import DIR_SQL_READ # noqa: F401 from .tasks import TASKS # noqa: F401 from .xcoms import XCOMS, xcom_pull # noqa: F401 diff --git a/dags/rgpd/config/columns.py b/dags/enrich/config/columns.py similarity index 59% rename from dags/rgpd/config/columns.py rename to dags/enrich/config/columns.py index 33567f3e3..c65aa6e63 100644 --- a/dags/rgpd/config/columns.py +++ b/dags/enrich/config/columns.py @@ -10,15 +10,17 @@ class COLS: # Dry run DRY_RUN: str = "dry_run" + # COMMON + SIREN: str = "siren" + # QFDMO - QFDMO_ACTEUR_NOMS_ORIGIN: str = "qfdmo_acteur_noms_origine" - QFDMO_ACTEUR_NOMS_MATCH: str = "qfdmo_acteur_noms_match" - QFDMO_ACTEUR_NOMS_COMPARISON: str = "qfdmo_acteur_noms_comparaison" - QFDMO_ACTEUR_ID: str = "qfdmo_acteur_id" + ACTEUR_ID: str = "acteur_id" + ACTEUR_NOMS_ORIGINE: str = "acteur_noms_origine" + ACTEUR_NOMS_NORMALISES: str = "acteur_noms_normalises" + ACTEUR_COMMENTAIRES: str = "acteur_commentaires" # Annuaire Entreprise - AE_NOM_PREFIX: str = "ae_nom" - AE_PRENOM_PREFIX: str = "ae_prenom" + AE_DIRIGEANTS_NOMS: str = "ae_dirigeants_noms_prenoms" # Matching MATCH_SCORE: str = "match_score" diff --git a/dags/enrich/config/dbt.py b/dags/enrich/config/dbt.py new file mode 100644 index 000000000..d3cfd43be --- /dev/null +++ b/dags/enrich/config/dbt.py @@ -0,0 +1,8 @@ +"""DBT models used in the enrich DAGs""" + +from dataclasses import dataclass + + +@dataclass(frozen=True) +class DBT: + MARTS_ENRICH_AE_RGPD: str = "marts_enrich_ae_rgpd" diff --git a/dags/rgpd/config/paths.py b/dags/enrich/config/paths.py similarity index 100% rename from dags/rgpd/config/paths.py rename to dags/enrich/config/paths.py diff --git a/dags/enrich/config/tasks.py b/dags/enrich/config/tasks.py new file mode 100644 index 000000000..b5889493a --- /dev/null +++ b/dags/enrich/config/tasks.py @@ -0,0 +1,10 @@ +"""Task IDs for RGPD anonymize people DAG""" + +from dataclasses import dataclass + + +@dataclass(frozen=True) +class TASKS: + READ: str = "enrich_ae_rgpd_read" + MATCH_SCORE: str = "enrich_ae_rgpd_match" + SUGGEST: str = "enrich_ae_rgpd_suggest" diff --git a/dags/rgpd/config/xcoms.py b/dags/enrich/config/xcoms.py similarity index 97% rename from dags/rgpd/config/xcoms.py rename to dags/enrich/config/xcoms.py index 840a9e0fa..4d3cebda1 100644 --- a/dags/rgpd/config/xcoms.py +++ b/dags/enrich/config/xcoms.py @@ -9,7 +9,7 @@ import pandas as pd from airflow.exceptions import AirflowSkipException from airflow.models.taskinstance import TaskInstance -from rgpd.config.tasks import TASKS +from enrich.config.tasks import TASKS from utils import logging_utils as log diff --git a/dags/rgpd/dags/rgpd_anonymize_people.py b/dags/enrich/dags/enrich_ae_rgpd.py similarity index 62% rename from dags/rgpd/dags/rgpd_anonymize_people.py rename to dags/enrich/dags/enrich_ae_rgpd.py index 3e3bcb33b..b8a98f48b 100644 --- a/dags/rgpd/dags/rgpd_anonymize_people.py +++ b/dags/enrich/dags/enrich_ae_rgpd.py @@ -8,25 +8,20 @@ from airflow import DAG from airflow.models.baseoperator import chain from airflow.models.param import Param -from rgpd.config import COLS 
-from rgpd.tasks.airflow_logic.rgpd_anonymize_people_match_task import (
-    rgpd_anonymize_people_match_task,
+from enrich.config import COLS
+from enrich.tasks.airflow_logic.enrich_ae_rgpd_match_task import (
+    enrich_ae_rgpd_match_task,
 )
-from rgpd.tasks.airflow_logic.rgpd_anonymize_people_read_task import (
-    rgpd_anonymize_people_read_task,
+from enrich.tasks.airflow_logic.enrich_ae_rgpd_read_task import (
+    enrich_ae_rgpd_read_task,
 )
-from rgpd.tasks.airflow_logic.rgpd_anonymize_people_suggest_task import (
-    rgpd_anonymize_people_suggest_task,
-)
-
-FILTER_COMMENTS_CONTAIN_DEFAULT = (
-    "source changee le 18-07-2024. Ancienne source CMA non-reparActeur. "
-    "Nouvelle source : LVAO"
+from enrich.tasks.airflow_logic.enrich_ae_rgpd_suggest_task import (
+    enrich_ae_rgpd_suggest_task,
 )
 
 with DAG(
-    dag_id="rgpd_anonymize_people",
-    dag_display_name="RGPD - Anonymiser les personnes acteurs",
+    dag_id="enrich_ae_rgpd",
+    dag_display_name="Enrichir - AE - RGPD",
     default_args={
         "owner": "airflow",
         "depends_on_past": False,
@@ -40,7 +35,7 @@
         "Un DAG pour anonymiser les acteurs QFDMO dont "
         "le nom contient des personnes de l'Annuaire Entreprise (AE)"
     ),
-    tags=["rgpd", "annuaire", "entreprise", "siren", "ae", "acteurs"],
+    tags=["enrich", "annuaire", "entreprise", "siren", "ae", "acteurs"],
     params={
         COLS.DRY_RUN: Param(
             True,
@@ -48,8 +43,8 @@
             description_md="🚱 Si coché, aucune tâche d'écriture ne sera effectuée",
         ),
         "filter_comments_contain": Param(
-            FILTER_COMMENTS_CONTAIN_DEFAULT,
-            type="string",
+            "",
+            type=["null", "string"],
             description_md="🔍 Filtre sur les commentaires pour la lecture des données",
         ),
         COLS.MATCH_THRESHOLD: Param(
@@ -67,7 +62,7 @@
     catchup=False,
 ) as dag:
     chain(
-        rgpd_anonymize_people_read_task(dag),
-        rgpd_anonymize_people_match_task(dag),
-        rgpd_anonymize_people_suggest_task(dag),
+        enrich_ae_rgpd_read_task(dag),
+        enrich_ae_rgpd_match_task(dag),
+        enrich_ae_rgpd_suggest_task(dag),
     )
diff --git a/dags/rgpd/tasks/airflow_logic/rgpd_anonymize_people_match_task.py b/dags/enrich/tasks/airflow_logic/enrich_ae_rgpd_match_task.py
similarity index 76%
rename from dags/rgpd/tasks/airflow_logic/rgpd_anonymize_people_match_task.py
rename to dags/enrich/tasks/airflow_logic/enrich_ae_rgpd_match_task.py
index 9323a48e0..27b94012b 100644
--- a/dags/rgpd/tasks/airflow_logic/rgpd_anonymize_people_match_task.py
+++ b/dags/enrich/tasks/airflow_logic/enrich_ae_rgpd_match_task.py
@@ -5,9 +5,9 @@
 from airflow import DAG
 from airflow.exceptions import AirflowSkipException
 from airflow.operators.python import PythonOperator
-from rgpd.config import COLS, TASKS, XCOMS
-from rgpd.tasks.business_logic.rgpd_anonymize_people_match import (
-    rgpd_anonymize_people_match,
+from enrich.config import COLS, TASKS, XCOMS
+from enrich.tasks.business_logic.enrich_ae_rgpd_match import (
+    enrich_ae_rgpd_match,
 )
 
 logger = logging.getLogger(__name__)
@@ -29,10 +29,10 @@ def task_info_get():
     """
 
 
-def rgpd_anonymize_people_match_wrapper(ti, params) -> None:
+def enrich_ae_rgpd_match_wrapper(ti, params) -> None:
     logger.info(task_info_get())
 
-    df = rgpd_anonymize_people_match(
+    df = enrich_ae_rgpd_match(
         df=ti.xcom_pull(key=XCOMS.DF_READ),
         match_threshold=params[COLS.MATCH_THRESHOLD],
     )
@@ -42,9 +42,9 @@ def rgpd_anonymize_people_match_wrapper(ti, params) -> None:
     ti.xcom_push(key=XCOMS.DF_MATCH, value=df)
 
 
-def rgpd_anonymize_people_match_task(dag: DAG) -> PythonOperator:
+def enrich_ae_rgpd_match_task(dag: DAG) -> PythonOperator:
     return PythonOperator(
         task_id=TASKS.MATCH_SCORE,
-
python_callable=rgpd_anonymize_people_match_wrapper, + python_callable=enrich_ae_rgpd_match_wrapper, dag=dag, ) diff --git a/dags/rgpd/tasks/airflow_logic/rgpd_anonymize_people_read_task.py b/dags/enrich/tasks/airflow_logic/enrich_ae_rgpd_read_task.py similarity index 59% rename from dags/rgpd/tasks/airflow_logic/rgpd_anonymize_people_read_task.py rename to dags/enrich/tasks/airflow_logic/enrich_ae_rgpd_read_task.py index 9e95469b6..1f11b35a3 100644 --- a/dags/rgpd/tasks/airflow_logic/rgpd_anonymize_people_read_task.py +++ b/dags/enrich/tasks/airflow_logic/enrich_ae_rgpd_read_task.py @@ -5,9 +5,9 @@ from airflow import DAG from airflow.exceptions import AirflowSkipException from airflow.operators.python import PythonOperator -from rgpd.config import TASKS, XCOMS -from rgpd.tasks.business_logic.rgpd_anonymize_people_read import ( - rgpd_anonymize_people_read, +from enrich.config import DBT, TASKS, XCOMS +from enrich.tasks.business_logic.enrich_ae_rgpd_read import ( + enrich_ae_rgpd_read, ) logger = logging.getLogger(__name__) @@ -18,22 +18,24 @@ def task_info_get(): ============================================================ Description de la tâche "{TASKS.READ}" ============================================================ - 💡 quoi: lecture des données de la base (QFDMO Acteurs - et Unité Légales de l'Annuaire Entreprise) + 💡 quoi: lecture des données via le modèle DBT + {DBT.MARTS_ENRICH_AE_RGPD} 🎯 pourquoi: faire un pré-filtre sur les matches potentiels - (pas récupérer les ~27M de lignes de la table AE) + (pas récupérer les ~27M de lignes de la table AE unite_legale) 🏗️ comment: on récupère uniquement les matches SIREN avec - des infos de noms/prénoms dans l'AE + des infos de noms/prénoms dans l'AE en passant par de la normalisation + de chaines de caractères """ -def rgpd_anonymize_people_read_wrapper(ti, params) -> None: +def enrich_ae_rgpd_read_wrapper(ti, params) -> None: logger.info(task_info_get()) - df = rgpd_anonymize_people_read( - filter_comments_contain=params["filter_comments_contain"] + df = enrich_ae_rgpd_read( + dbt_model_name=DBT.MARTS_ENRICH_AE_RGPD, + filter_comments_contain=params["filter_comments_contain"], ) if df.empty: raise AirflowSkipException("Pas de données DB, on s'arrête là") @@ -41,9 +43,9 @@ def rgpd_anonymize_people_read_wrapper(ti, params) -> None: ti.xcom_push(key=XCOMS.DF_READ, value=df) -def rgpd_anonymize_people_read_task(dag: DAG) -> PythonOperator: +def enrich_ae_rgpd_read_task(dag: DAG) -> PythonOperator: return PythonOperator( task_id=TASKS.READ, - python_callable=rgpd_anonymize_people_read_wrapper, + python_callable=enrich_ae_rgpd_read_wrapper, dag=dag, ) diff --git a/dags/rgpd/tasks/airflow_logic/rgpd_anonymize_people_suggest_task.py b/dags/enrich/tasks/airflow_logic/enrich_ae_rgpd_suggest_task.py similarity index 77% rename from dags/rgpd/tasks/airflow_logic/rgpd_anonymize_people_suggest_task.py rename to dags/enrich/tasks/airflow_logic/enrich_ae_rgpd_suggest_task.py index a11a66994..9b195b3b3 100644 --- a/dags/rgpd/tasks/airflow_logic/rgpd_anonymize_people_suggest_task.py +++ b/dags/enrich/tasks/airflow_logic/enrich_ae_rgpd_suggest_task.py @@ -5,9 +5,9 @@ from airflow import DAG from airflow.exceptions import AirflowSkipException from airflow.operators.python import PythonOperator -from rgpd.config import COLS, TASKS, XCOMS -from rgpd.tasks.business_logic.rgpd_anonymize_people_suggest import ( - rgpd_anonymize_people_suggest, +from enrich.config import COLS, TASKS, XCOMS +from enrich.tasks.business_logic.enrich_ae_rgpd_suggest import ( + 
enrich_ae_rgpd_suggest, ) logger = logging.getLogger(__name__) @@ -29,10 +29,10 @@ def task_info_get(): """ -def rgpd_anonymize_people_suggest_wrapper(ti, params, dag, run_id) -> None: +def enrich_ae_rgpd_suggest_wrapper(ti, params, dag, run_id) -> None: logger.info(task_info_get()) - rgpd_anonymize_people_suggest( + enrich_ae_rgpd_suggest( df=ti.xcom_pull(key=XCOMS.DF_MATCH), identifiant_action=dag.dag_id, identifiant_execution=run_id, @@ -43,9 +43,9 @@ def rgpd_anonymize_people_suggest_wrapper(ti, params, dag, run_id) -> None: raise AirflowSkipException("Pas de données DB, on s'arrête là") -def rgpd_anonymize_people_suggest_task(dag: DAG) -> PythonOperator: +def enrich_ae_rgpd_suggest_task(dag: DAG) -> PythonOperator: return PythonOperator( task_id=TASKS.SUGGEST, - python_callable=rgpd_anonymize_people_suggest_wrapper, + python_callable=enrich_ae_rgpd_suggest_wrapper, dag=dag, ) diff --git a/dags/rgpd/tasks/business_logic/rgpd_anonymize_people_match.py b/dags/enrich/tasks/business_logic/enrich_ae_rgpd_match.py similarity index 78% rename from dags/rgpd/tasks/business_logic/rgpd_anonymize_people_match.py rename to dags/enrich/tasks/business_logic/enrich_ae_rgpd_match.py index bac4b57bd..896563c93 100644 --- a/dags/rgpd/tasks/business_logic/rgpd_anonymize_people_match.py +++ b/dags/enrich/tasks/business_logic/enrich_ae_rgpd_match.py @@ -1,10 +1,9 @@ """Match acteurs from QFDMO vs. AE based on people names""" import pandas as pd -from rgpd.config import COLS +from enrich.config import COLS from shared.tasks.business_logic import normalize from utils import logging_utils as log -from utils.raisers import raise_if def word_overlap_ratio( @@ -34,24 +33,21 @@ def word_overlap_ratio( return (words_matched, ratio) -def rgpd_anonymize_people_match( +def enrich_ae_rgpd_match( df: pd.DataFrame, - match_threshold: float = 0.6, + match_threshold: float, ) -> pd.DataFrame: """Identify matches between QFDMO company names and AE's people names.""" - # TODO: remove first below once métier happy with trying thresholds < 1 - raise_if(match_threshold < 1, f"Seuil de match < 1: {match_threshold}") - raise_if(match_threshold <= 0, f"Seuil de match <= 0: {match_threshold}") + if df.empty: + raise ValueError("df vide, on devrait pas être là") + if match_threshold < 0 or match_threshold > 1: + raise ValueError(f"match_threshold invalide: {match_threshold}") df = df.copy() - # Defining columns - cols_names_qfdmo = [COLS.QFDMO_ACTEUR_NOMS_COMPARISON] - cols_names_ae = [ - x - for x in df.columns - if x.startswith(COLS.AE_NOM_PREFIX) or x.startswith(COLS.AE_PRENOM_PREFIX) - ] + # Matching columns + cols_names_qfdmo = [COLS.ACTEUR_NOMS_NORMALISES] + cols_names_ae = [COLS.AE_DIRIGEANTS_NOMS] # Normalization cols_to_norm = cols_names_qfdmo + cols_names_ae diff --git a/dags/rgpd/tasks/business_logic/rgpd_anonymize_people_read.py b/dags/enrich/tasks/business_logic/enrich_ae_rgpd_read.py similarity index 59% rename from dags/rgpd/tasks/business_logic/rgpd_anonymize_people_read.py rename to dags/enrich/tasks/business_logic/enrich_ae_rgpd_read.py index 59a0ad42a..b960c01ae 100644 --- a/dags/rgpd/tasks/business_logic/rgpd_anonymize_people_read.py +++ b/dags/enrich/tasks/business_logic/enrich_ae_rgpd_read.py @@ -2,29 +2,29 @@ import numpy as np import pandas as pd -from rgpd.config import DIR_SQL_READ from utils import logging_utils as log from utils.django import django_setup_full django_setup_full() -def rgpd_anonymize_people_read(filter_comments_contain: str = "") -> pd.DataFrame: +def enrich_ae_rgpd_read( + 
dbt_model_name: str, filter_comments_contain: str = "" +) -> pd.DataFrame: """Reads necessary QFDMO acteurs and AE entries from DB""" from django.db import connection - # Get SQL query and set filter - sql = (DIR_SQL_READ / "rgpd_anonymize_people_read.sql").read_text() - sql = sql.replace("{{filter_comments_contain}}", filter_comments_contain) - # Execute SQL query and get data with connection.cursor() as cursor: - cursor.execute(sql) + cursor.execute(f"SELECT * FROM {dbt_model_name}") columns = [col[0] for col in cursor.description] data = cursor.fetchall() # Create DataFrame and preview df = pd.DataFrame(data, columns=columns, dtype="object").replace({np.nan: None}) - log.preview_df_as_markdown("Acteurs & entrées Annuaire Entreprise", df) + log.preview_df_as_markdown("Matches AVANT filtre commentaires", df) + if not df.empty and filter_comments_contain: + df = df[df["acteur_commentaires"].str.contains(filter_comments_contain)].copy() + log.preview_df_as_markdown("Matches APRES filtre commentaires", df) return df diff --git a/dags/rgpd/tasks/business_logic/rgpd_anonymize_people_suggest.py b/dags/enrich/tasks/business_logic/enrich_ae_rgpd_suggest.py similarity index 94% rename from dags/rgpd/tasks/business_logic/rgpd_anonymize_people_suggest.py rename to dags/enrich/tasks/business_logic/enrich_ae_rgpd_suggest.py index 8420d7c4e..4ba53c414 100644 --- a/dags/rgpd/tasks/business_logic/rgpd_anonymize_people_suggest.py +++ b/dags/enrich/tasks/business_logic/enrich_ae_rgpd_suggest.py @@ -3,7 +3,7 @@ import logging import pandas as pd -from rgpd.config import COLS +from enrich.config import COLS from sources.config.shared_constants import INFO_TO_HIDE from utils import logging_utils as log from utils.django import django_setup_full @@ -13,7 +13,7 @@ logger = logging.getLogger(__name__) -def rgpd_anonymize_people_suggest( +def enrich_ae_rgpd_suggest( df: pd.DataFrame, identifiant_action: str, identifiant_execution: str, @@ -41,7 +41,7 @@ def rgpd_anonymize_people_suggest( changes = [] # Preparing & validating the change params - acteur_id = row[COLS.QFDMO_ACTEUR_ID] + acteur_id = row[COLS.ACTEUR_ID] model_params = { "id": acteur_id, "data": { @@ -66,7 +66,7 @@ def rgpd_anonymize_people_suggest( "suggestion": { "title": "🕵️ RGPD: anonymiser les noms des acteurs", "summary": { - "noms d'origine": row[COLS.QFDMO_ACTEUR_NOMS_ORIGIN], + "noms d'origine": row[COLS.ACTEUR_NOMS_ORIGINE], "mots de match": row[COLS.MATCH_WORDS], "score de match": row[COLS.MATCH_SCORE], "changement": f"""{field_nom} & {field_nom_officiel} & diff --git a/dags/rgpd/config/sql/read/rgpd_anonymize_people_read.sql b/dags/rgpd/config/sql/read/rgpd_anonymize_people_read.sql deleted file mode 100644 index 8701e7deb..000000000 --- a/dags/rgpd/config/sql/read/rgpd_anonymize_people_read.sql +++ /dev/null @@ -1,32 +0,0 @@ -WITH acteurs_with_siren AS ( - SELECT - TRIM(CONCAT(nom || ' ' || nom_officiel || ' ' || nom_commercial)) AS noms, - LEFT(siret,9) AS siren, - identifiant_unique, - commentaires - FROM qfdmo_displayedacteur - WHERE siret IS NOT NULL AND siret != '' AND LENGTH(siret) = 14 - AND commentaires LIKE '%{{filter_comments_contain}}%' -) -SELECT - unite.siren, - acteurs.noms AS qfdmo_acteur_noms_origine, - acteurs.noms AS qfdmo_acteur_noms_comparaison, - acteurs.identifiant_unique AS qfdmo_acteur_id, - acteurs.commentaires AS qfdmo_acteur_commentaires, - unite."prenom1UniteLegale" AS "ae_prenom1UniteLegale", - unite."prenom2UniteLegale" AS "ae_prenom2UniteLegale", - unite."prenom3UniteLegale" AS "ae_prenom3UniteLegale", - 
unite."prenom4UniteLegale" AS "ae_prenom4UniteLegale", - unite."nomUniteLegale" AS "ae_nomUniteLegale", - unite."nomUsageUniteLegale" AS "ae_nomUsageUniteLegale" -FROM clone_ae_unite_legale_in_use AS unite -JOIN acteurs_with_siren AS acteurs ON acteurs.siren = unite.siren -WHERE( - ("prenom1UniteLegale" IS NOT NULL AND "prenom1UniteLegale" != '[ND]') - OR ("prenom2UniteLegale" IS NOT NULL AND "prenom2UniteLegale" != '[ND]') - OR ("prenom3UniteLegale" IS NOT NULL AND "prenom3UniteLegale" != '[ND]') - OR ("prenom4UniteLegale" IS NOT NULL AND "prenom4UniteLegale" != '[ND]') - OR ("nomUniteLegale" IS NOT NULL AND "nomUniteLegale" != '[ND]') - OR ("nomUsageUniteLegale" IS NOT NULL AND "nomUsageUniteLegale" != '[ND]') -) \ No newline at end of file diff --git a/dags/rgpd/config/tasks.py b/dags/rgpd/config/tasks.py deleted file mode 100644 index 651cdb30c..000000000 --- a/dags/rgpd/config/tasks.py +++ /dev/null @@ -1,10 +0,0 @@ -"""Task IDs for RGPD anonymize people DAG""" - -from dataclasses import dataclass - - -@dataclass(frozen=True) -class TASKS: - READ: str = "rgpd_anonymize_people_read" - MATCH_SCORE: str = "rgpd_anonymize_people_match" - SUGGEST: str = "rgpd_anonymize_people_suggest" diff --git a/dags/utils/dbt.py b/dags/utils/dbt.py new file mode 100644 index 000000000..3f4138691 --- /dev/null +++ b/dags/utils/dbt.py @@ -0,0 +1,33 @@ +"""Utilities to help us integrate Airflow <-> DBT, e.g. to detect +schema issues during tests/config checks without wasting time on DAG runs""" + +import json +from functools import cache +from pathlib import Path + +DIR_CURRENT = Path(__file__).resolve() +DIR_DBT = DIR_CURRENT.parent.parent.parent / "dbt" + + +@cache +def dbt_manifest_read() -> dict: + """Get the dbt manifest data""" + return json.loads((DIR_DBT / "target" / "manifest.json").read_text()) + + +def dbt_assert_model_schema(model_name: str, columns: list[str]) -> None: + """Check if a model exists in a dbt schema with some columns""" + + # Get manifest + manifest = dbt_manifest_read() + + # Ensure model is present + model_key = f"model.qfdmo.{model_name}" + if model_key not in manifest["nodes"]: + raise ValueError(f"Model {model_name} not found in dbt manifest") + + # Ensure columns are present + model_columns = manifest["nodes"][model_key]["columns"] + diff = set(columns) - set(model_columns) + if diff: + raise ValueError(f"Columns {diff} not found in model {model_name}") diff --git a/dags_unit_tests/enrich/tasks/test_enrich_ae_rgpd.py b/dags_unit_tests/enrich/tasks/test_enrich_ae_rgpd.py new file mode 100644 index 000000000..332983b56 --- /dev/null +++ b/dags_unit_tests/enrich/tasks/test_enrich_ae_rgpd.py @@ -0,0 +1,17 @@ +from dags.enrich.config import COLS, DBT +from dags.utils.dbt import dbt_assert_model_schema + + +class TestEnrichAeRgpd: + + def test_dbt(self): + model_name = DBT.MARTS_ENRICH_AE_RGPD + columns = [ + COLS.SIREN, + COLS.ACTEUR_ID, + COLS.ACTEUR_NOMS_ORIGINE, + COLS.ACTEUR_NOMS_NORMALISES, + COLS.ACTEUR_COMMENTAIRES, + COLS.AE_DIRIGEANTS_NOMS, + ] + dbt_assert_model_schema(model_name, columns) diff --git a/dags_unit_tests/utils/test_dbt.py b/dags_unit_tests/utils/test_dbt.py new file mode 100644 index 000000000..17f24b9d7 --- /dev/null +++ b/dags_unit_tests/utils/test_dbt.py @@ -0,0 +1,23 @@ +import pytest + +from dags.utils.dbt import dbt_assert_model_schema + +MODEL_NAME_OK = "marts_enrich_ae_rgpd" +COLUMNS_OK = ["siren", "acteur_id"] + + +class TestDbtCheckModelSchema: + + def test_working(self): + dbt_assert_model_schema(MODEL_NAME_OK, COLUMNS_OK) + pass + + def 
test_raise_if_column_not_found(self): + with pytest.raises(ValueError): + dbt_assert_model_schema( + MODEL_NAME_OK, ["siren", "🔴 COLUMN DOES NOT EXIST"] + ) + + def test_raise_if_model_not_found(self): + with pytest.raises(ValueError): + dbt_assert_model_schema("🔴 MODEL DOES NOT EXIST", COLUMNS_OK) diff --git a/dbt/models/marts/enrich/marts_enrich_ea_rgpd.sql b/dbt/models/marts/enrich/marts_enrich_ae_rgpd.sql similarity index 69% rename from dbt/models/marts/enrich/marts_enrich_ea_rgpd.sql rename to dbt/models/marts/enrich/marts_enrich_ae_rgpd.sql index 6e36bf5f7..b57af9b3e 100644 --- a/dbt/models/marts/enrich/marts_enrich_ea_rgpd.sql +++ b/dbt/models/marts/enrich/marts_enrich_ae_rgpd.sql @@ -17,7 +17,12 @@ WITH acteurs_with_siren AS ( SELECT LEFT(siret,9) AS siren, identifiant_unique AS acteur_id, - udf_normalize_string_alpha_for_match(CONCAT(nom || ' ' || nom_officiel || ' ' || nom_commercial)) AS acteur_noms, + TRIM(REGEXP_REPLACE( + CONCAT(nom || ', ' || nom_officiel || ', ' || nom_commercial), + ', , ', + '') + ) AS acteur_noms_origine, + udf_normalize_string_alpha_for_match(CONCAT(nom || ' ' || nom_officiel || ' ' || nom_commercial)) AS acteur_noms_normalises, commentaires AS acteur_commentaires FROM {{ ref('marts_carte_acteur') }} /* @@ -33,7 +38,8 @@ SELECT -- Acteur fields acteur_id, - acteur_noms, + acteur_noms_origine, + acteur_noms_normalises, acteur_commentaires, -- Unite legale fields @@ -50,7 +56,7 @@ SELECT dirigeant_prenom2, dirigeant_prenom3, dirigeant_prenom4 - ) AS dirigeants_noms_prenoms + ) AS ae_dirigeants_noms_prenoms FROM {{ ref('int_ae_unite_legale') }} AS unite LEFT JOIN acteurs_with_siren AS acteurs ON acteurs.siren = unite.siren @@ -58,11 +64,11 @@ WHERE acteurs.siren IS NOT NULL -- i.e. we have a match AND a_dirigeant_noms_ou_prenoms_non_null -- we have some directors names AND ( -- Any of the directors names appear in the acteur names - position(dirigeant_nom IN acteur_noms) > 0 - OR position(dirigeant_nom_usage IN acteur_noms) > 0 - OR position(dirigeant_pseudonyme IN acteur_noms) > 0 - OR position(dirigeant_prenom1 IN acteur_noms) > 0 - OR position(dirigeant_prenom2 IN acteur_noms) > 0 - OR position(dirigeant_prenom3 IN acteur_noms) > 0 - OR position(dirigeant_prenom4 IN acteur_noms) > 0 + position(dirigeant_nom IN acteur_noms_normalises) > 0 + OR position(dirigeant_nom_usage IN acteur_noms_normalises) > 0 + OR position(dirigeant_pseudonyme IN acteur_noms_normalises) > 0 + OR position(dirigeant_prenom1 IN acteur_noms_normalises) > 0 + OR position(dirigeant_prenom2 IN acteur_noms_normalises) > 0 + OR position(dirigeant_prenom3 IN acteur_noms_normalises) > 0 + OR position(dirigeant_prenom4 IN acteur_noms_normalises) > 0 ) \ No newline at end of file diff --git a/dbt/models/marts/enrich/schema.yml b/dbt/models/marts/enrich/schema.yml index d26f7a6a3..deaf70901 100644 --- a/dbt/models/marts/enrich/schema.yml +++ b/dbt/models/marts/enrich/schema.yml @@ -1,7 +1,7 @@ version: 2 models: - - name: marts_enrich_ea_rgpd + - name: marts_enrich_ae_rgpd description: Unités légales de l'Annuaire Entreprises (AE) préfiltrés | et prématchés sur la base des noms/prénoms de dirigeants dont au | moins 1 apparait dans le nom de nos acteurs (le modèle sera ensuite @@ -22,9 +22,13 @@ models: description: Identifiant unique de l'acteur data_tests: - not_null - - name: acteur_noms - description: Nom, nom officiel et nom commercial de l'acteur - | regroupés & normalisés pour réduire la taille de la table, sachant + - name: acteur_noms_origine + description: Nom, nom officiel et nom 
commercial de l'acteur regroupés + data_tests: + - not_null + - name: acteur_noms_normalises + description: Nom, nom officiel et nom commercial de l'acteur regroupés + | & normalisés pour réduire la taille de la table, sachant | qu'on fait un matching plus poussés avec python par la suite # Ensuring we are not matching empty strings data_tests: @@ -32,7 +36,7 @@ models: - name: acteur_commentaires description: Commentaires de l'acteur pour debug ET si on veut faire | filtering avec des paramètres de DAG - - name: dirigeants_noms_prenoms + - name: ae_dirigeants_noms_prenoms description: Noms & prénoms de tous les dirigeants | regroupés & normalisés pour réduire la taille de la table, sachant | qu'on fait un matching plus poussés avec python par la suite From e6528f8d2c07b385558d2375aad4ecd3b50155f9 Mon Sep 17 00:00:00 2001 From: Max Corbeau Date: Wed, 19 Mar 2025 17:25:04 +0100 Subject: [PATCH 05/50] ajout nouvelle config & test rgpd --- dags/enrich/config/__init__.py | 1 + dags/enrich/config/rgpd.py | 19 ++++ .../business_logic/enrich_ae_rgpd_suggest.py | 24 ++--- dags/sources/config/shared_constants.py | 3 - .../enrich/tasks/test_enrich_ae_rgpd.py | 92 ++++++++++++++++++- 5 files changed, 117 insertions(+), 22 deletions(-) create mode 100644 dags/enrich/config/rgpd.py diff --git a/dags/enrich/config/__init__.py b/dags/enrich/config/__init__.py index 77e633a18..deb08f5fb 100644 --- a/dags/enrich/config/__init__.py +++ b/dags/enrich/config/__init__.py @@ -1,5 +1,6 @@ from .columns import COLS # noqa: F401 from .dbt import DBT # noqa: F401 from .paths import DIR_SQL_READ # noqa: F401 +from .rgpd import RGPD # noqa: F401 from .tasks import TASKS # noqa: F401 from .xcoms import XCOMS, xcom_pull # noqa: F401 diff --git a/dags/enrich/config/rgpd.py b/dags/enrich/config/rgpd.py new file mode 100644 index 000000000..2d1f980ec --- /dev/null +++ b/dags/enrich/config/rgpd.py @@ -0,0 +1,19 @@ +"""Config to handle RGPD anonymization""" + +from dataclasses import dataclass + + +@dataclass(frozen=True) +class RGPD: + ACTEUR_FIELD_ANONYMIZED = "ANONYMISE POUR RAISON RGPD" + ACTEUR_FIELDS_TO_ANONYMIZE = [ + "nom", + "nom_officiel", + "nom_commercial", + "description", + "email", + "telephone", + "adresse", + "adresse_complement", + ] + ACTEUR_STATUS = "INACTIF" diff --git a/dags/enrich/tasks/business_logic/enrich_ae_rgpd_suggest.py b/dags/enrich/tasks/business_logic/enrich_ae_rgpd_suggest.py index 4ba53c414..908a051b4 100644 --- a/dags/enrich/tasks/business_logic/enrich_ae_rgpd_suggest.py +++ b/dags/enrich/tasks/business_logic/enrich_ae_rgpd_suggest.py @@ -3,8 +3,7 @@ import logging import pandas as pd -from enrich.config import COLS -from sources.config.shared_constants import INFO_TO_HIDE +from enrich.config import COLS, RGPD from utils import logging_utils as log from utils.django import django_setup_full @@ -28,12 +27,6 @@ def enrich_ae_rgpd_suggest( ) from data.models.change import SuggestionChange from data.models.changes import ChangeActeurUpdateData - from qfdmo.models import Acteur - - # Acteur fields - field_nom = Acteur._meta.get_field("nom").name - field_nom_officiel = Acteur._meta.get_field("nom_officiel").name - field_nom_commercial = Acteur._meta.get_field("nom_commercial").name # Prepare suggestions suggestions = [] @@ -42,14 +35,11 @@ def enrich_ae_rgpd_suggest( # Preparing & validating the change params acteur_id = row[COLS.ACTEUR_ID] - model_params = { - "id": acteur_id, - "data": { - field_nom: INFO_TO_HIDE, - field_nom_officiel: INFO_TO_HIDE, - field_nom_commercial: INFO_TO_HIDE, - }, 
+ data = { + x: RGPD.ACTEUR_FIELD_ANONYMIZED for x in RGPD.ACTEUR_FIELDS_TO_ANONYMIZE } + data["statut"] = RGPD.ACTEUR_STATUS + model_params = {"id": acteur_id, "data": data} ChangeActeurUpdateData(**model_params).validate() # Preparing suggestion with change and ensuring we can JSON serialize it @@ -69,8 +59,8 @@ def enrich_ae_rgpd_suggest( "noms d'origine": row[COLS.ACTEUR_NOMS_ORIGINE], "mots de match": row[COLS.MATCH_WORDS], "score de match": row[COLS.MATCH_SCORE], - "changement": f"""{field_nom} & {field_nom_officiel} & - {field_nom_commercial} -> {INFO_TO_HIDE}""", + "changement": f"""{','.join(RGPD.ACTEUR_FIELDS_TO_ANONYMIZE)} + -> {RGPD.ACTEUR_FIELD_ANONYMIZED}""", }, "changes": changes, }, diff --git a/dags/sources/config/shared_constants.py b/dags/sources/config/shared_constants.py index 0f25418a5..85537beaa 100755 --- a/dags/sources/config/shared_constants.py +++ b/dags/sources/config/shared_constants.py @@ -33,6 +33,3 @@ # Special field values EMPTY_ACTEUR_FIELD = "__empty__" - -# To handle data we don't want to show on frontend -INFO_TO_HIDE = "[Information masquée]" diff --git a/dags_unit_tests/enrich/tasks/test_enrich_ae_rgpd.py b/dags_unit_tests/enrich/tasks/test_enrich_ae_rgpd.py index 332983b56..03b47e6e5 100644 --- a/dags_unit_tests/enrich/tasks/test_enrich_ae_rgpd.py +++ b/dags_unit_tests/enrich/tasks/test_enrich_ae_rgpd.py @@ -1,8 +1,14 @@ -from dags.enrich.config import COLS, DBT +import pandas as pd +import pytest + +from dags.enrich.config import COLS, DBT, RGPD +from dags.enrich.tasks.business_logic.enrich_ae_rgpd_suggest import ( + enrich_ae_rgpd_suggest, +) from dags.utils.dbt import dbt_assert_model_schema -class TestEnrichAeRgpd: +class TestEnrichAeRgpdConfig: def test_dbt(self): model_name = DBT.MARTS_ENRICH_AE_RGPD @@ -15,3 +21,85 @@ def test_dbt(self): COLS.AE_DIRIGEANTS_NOMS, ] dbt_assert_model_schema(model_name, columns) + + def test_rgpd_lock_fields_list(self): + # A test just to lock list of fields to anonymize + assert RGPD.ACTEUR_FIELDS_TO_ANONYMIZE == [ + "nom", + "nom_officiel", + "nom_commercial", + "description", + "email", + "telephone", + "adresse", + "adresse_complement", + ] + + def test_rgpd_lock_field_anonymized(self): + # A test just to lock field to anonymize + assert RGPD.ACTEUR_FIELD_ANONYMIZED == "ANONYMISE POUR RAISON RGPD" + + +@pytest.mark.django_db +class TestEnrichAeRgpdSuggest: + + @pytest.fixture + def df(self): + return pd.DataFrame( + { + COLS.ACTEUR_ID: ["id1", "id2"], + COLS.ACTEUR_NOMS_ORIGINE: ["acteur1", "acteur2"], + } + ) + + @pytest.fixture + def acteurs(self, df): + from qfdmo.models import Acteur + + for _, row in df.iterrows(): + Acteur.objects.create( + identifiant_unique=row[COLS.ACTEUR_ID], + nom=row[COLS.ACTEUR_NOMS_ORIGINE], + ) + + @pytest.fixture + def suggestions(self, df, acteurs): + return enrich_ae_rgpd_suggest( + df=df, + identifiant_action="my_action_id", + identifiant_execution="my_execution_id", + dry_run=True, + ) + + @pytest.fixture + def suggest(self, suggestions): + return suggestions[0] + + def test_one_suggestion_per_acteur(self, df, suggestions): + assert len(suggestions) == len(df) + + def test_one_change_per_suggestion(self, suggest): + # 1 change per acteur, we don't group acteurs together + # even if they have identical SIREN or SIRET + assert len(suggest["suggestion"]["changes"]) == 1 + + def test_suggestion_change_structure(self, suggest): + # The changes being sensitive, this test intentionnally + # hardcodes the structure of the suggestion so we need + # to udpate tests with intention 
when changing the DAG + change = suggest["suggestion"]["changes"][0] + assert change["model_name"] == "acteur_update_data" + assert change["model_params"] == { + "id": "id1", + "data": { + "nom": "ANONYMISE POUR RAISON RGPD", + "nom_officiel": "ANONYMISE POUR RAISON RGPD", + "nom_commercial": "ANONYMISE POUR RAISON RGPD", + "description": "ANONYMISE POUR RAISON RGPD", + "email": "ANONYMISE POUR RAISON RGPD", + "telephone": "ANONYMISE POUR RAISON RGPD", + "adresse": "ANONYMISE POUR RAISON RGPD", + "adresse_complement": "ANONYMISE POUR RAISON RGPD", + "statut": "INACTIF", + }, + } From 6b5b3df52f58db3add4c3e89034a9189e8731751 Mon Sep 17 00:00:00 2001 From: Max Corbeau Date: Thu, 20 Mar 2025 10:21:05 +0100 Subject: [PATCH 06/50] =?UTF-8?q?mod=C3=A8le=20RGPD,=20tests=20&=20migrati?= =?UTF-8?q?on?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- dags/enrich/config/__init__.py | 1 - dags/enrich/config/rgpd.py | 19 --- dags/enrich/dags/enrich_ae_rgpd.py | 7 +- .../business_logic/enrich_ae_rgpd_match.py | 2 +- .../business_logic/enrich_ae_rgpd_read.py | 24 +++- .../business_logic/enrich_ae_rgpd_suggest.py | 45 ++++--- dags/utils/dbt.py | 33 ----- .../enrich/tasks/test_enrich_ae_rgpd.py | 105 --------------- .../enrich/tasks/test_enrich_ae_rgpd_read.py | 49 +++++++ .../tasks/test_enrich_ae_rgpd_suggest.py | 115 +++++++++++++++++ dags_unit_tests/utils/test_dbt.py | 23 ---- ...009_alter_suggestioncohorte_type_action.py | 32 +++++ data/models/changes/__init__.py | 2 + data/models/changes/acteur_rgpd_anonymize.py | 62 +++++++++ data/models/changes/acteur_update_data.py | 7 +- data/models/suggestion.py | 22 ++-- qfdmo/models/acteur.py | 28 ++++ .../_partials/generic_suggestion_details.html | 7 +- .../changes/test_acteur_rgpd_anonymize.py | 121 ++++++++++++++++++ unit_tests/qfdmo/test_acteur_methods.py | 28 ++++ 20 files changed, 513 insertions(+), 219 deletions(-) delete mode 100644 dags/enrich/config/rgpd.py delete mode 100644 dags/utils/dbt.py delete mode 100644 dags_unit_tests/enrich/tasks/test_enrich_ae_rgpd.py create mode 100644 dags_unit_tests/enrich/tasks/test_enrich_ae_rgpd_read.py create mode 100644 dags_unit_tests/enrich/tasks/test_enrich_ae_rgpd_suggest.py delete mode 100644 dags_unit_tests/utils/test_dbt.py create mode 100644 data/migrations/0009_alter_suggestioncohorte_type_action.py create mode 100644 data/models/changes/acteur_rgpd_anonymize.py create mode 100644 unit_tests/data/models/changes/test_acteur_rgpd_anonymize.py create mode 100644 unit_tests/qfdmo/test_acteur_methods.py diff --git a/dags/enrich/config/__init__.py b/dags/enrich/config/__init__.py index deb08f5fb..77e633a18 100644 --- a/dags/enrich/config/__init__.py +++ b/dags/enrich/config/__init__.py @@ -1,6 +1,5 @@ from .columns import COLS # noqa: F401 from .dbt import DBT # noqa: F401 from .paths import DIR_SQL_READ # noqa: F401 -from .rgpd import RGPD # noqa: F401 from .tasks import TASKS # noqa: F401 from .xcoms import XCOMS, xcom_pull # noqa: F401 diff --git a/dags/enrich/config/rgpd.py b/dags/enrich/config/rgpd.py deleted file mode 100644 index 2d1f980ec..000000000 --- a/dags/enrich/config/rgpd.py +++ /dev/null @@ -1,19 +0,0 @@ -"""Config to handle RGPD anonymization""" - -from dataclasses import dataclass - - -@dataclass(frozen=True) -class RGPD: - ACTEUR_FIELD_ANONYMIZED = "ANONYMISE POUR RAISON RGPD" - ACTEUR_FIELDS_TO_ANONYMIZE = [ - "nom", - "nom_officiel", - "nom_commercial", - "description", - "email", - "telephone", - "adresse", - "adresse_complement", - ] - 
ACTEUR_STATUS = "INACTIF" diff --git a/dags/enrich/dags/enrich_ae_rgpd.py b/dags/enrich/dags/enrich_ae_rgpd.py index b8a98f48b..dd7b1d654 100644 --- a/dags/enrich/dags/enrich_ae_rgpd.py +++ b/dags/enrich/dags/enrich_ae_rgpd.py @@ -50,11 +50,14 @@ COLS.MATCH_THRESHOLD: Param( 1, type="number", + minimum=0.5, + maximum=1, description_md=r"""🎯 Seuil de match pour considérer un acteur - anonymisable. + anonymisable: - **match** = ratio du nombre de mots du nom de l'acteur qui correspondent à des mots de nom/prénom des personnes de l'AE - - **threshold** = contrainte en dur de ==1 pour la v1 + - **minimum** = 0.5 + - **maximum** = 1 """, ), }, diff --git a/dags/enrich/tasks/business_logic/enrich_ae_rgpd_match.py b/dags/enrich/tasks/business_logic/enrich_ae_rgpd_match.py index 896563c93..75d73045e 100644 --- a/dags/enrich/tasks/business_logic/enrich_ae_rgpd_match.py +++ b/dags/enrich/tasks/business_logic/enrich_ae_rgpd_match.py @@ -40,7 +40,7 @@ def enrich_ae_rgpd_match( """Identify matches between QFDMO company names and AE's people names.""" if df.empty: raise ValueError("df vide, on devrait pas être là") - if match_threshold < 0 or match_threshold > 1: + if match_threshold < 0.5 or match_threshold > 1: raise ValueError(f"match_threshold invalide: {match_threshold}") df = df.copy() diff --git a/dags/enrich/tasks/business_logic/enrich_ae_rgpd_read.py b/dags/enrich/tasks/business_logic/enrich_ae_rgpd_read.py index b960c01ae..3c9bfee4e 100644 --- a/dags/enrich/tasks/business_logic/enrich_ae_rgpd_read.py +++ b/dags/enrich/tasks/business_logic/enrich_ae_rgpd_read.py @@ -1,12 +1,17 @@ """Read data from DB needed for RGPD anonymization""" +import logging + import numpy as np import pandas as pd +from enrich.config import COLS from utils import logging_utils as log from utils.django import django_setup_full django_setup_full() +logger = logging.getLogger(__name__) + def enrich_ae_rgpd_read( dbt_model_name: str, filter_comments_contain: str = "" @@ -22,9 +27,20 @@ def enrich_ae_rgpd_read( # Create DataFrame and preview df = pd.DataFrame(data, columns=columns, dtype="object").replace({np.nan: None}) - log.preview_df_as_markdown("Matches AVANT filtre commentaires", df) - if not df.empty and filter_comments_contain: - df = df[df["acteur_commentaires"].str.contains(filter_comments_contain)].copy() - log.preview_df_as_markdown("Matches APRES filtre commentaires", df) + log.preview_df_as_markdown("Matches acteurs vs. 
Annuaire Entreprises", df) + + # Filtering if needed + filter = (filter_comments_contain or "").strip() + if not df.empty and filter: + logger.info(f"Filtre sur les commentaires: {filter}") + df = df[df[COLS.ACTEUR_COMMENTAIRES].notnull()].copy() + df = df[ + df[COLS.ACTEUR_COMMENTAIRES].str.contains( + filter, + regex=True, + case=False, + ) + ].copy() + log.preview_df_as_markdown("Matches APRES filtre commentaires", df) return df diff --git a/dags/enrich/tasks/business_logic/enrich_ae_rgpd_suggest.py b/dags/enrich/tasks/business_logic/enrich_ae_rgpd_suggest.py index 908a051b4..3ff3e5b9e 100644 --- a/dags/enrich/tasks/business_logic/enrich_ae_rgpd_suggest.py +++ b/dags/enrich/tasks/business_logic/enrich_ae_rgpd_suggest.py @@ -1,9 +1,10 @@ """Generate suggestions from matches""" import logging +from typing import Any import pandas as pd -from enrich.config import COLS, RGPD +from enrich.config import COLS from utils import logging_utils as log from utils.django import django_setup_full @@ -12,6 +13,12 @@ logger = logging.getLogger(__name__) +# TODO: create a utility + model which helps us generate +# structured & consistent details for generic_suggestion_details.html +def sumline(label: str, value: Any, value_type: str): + return locals() + + def enrich_ae_rgpd_suggest( df: pd.DataFrame, identifiant_action: str, @@ -26,7 +33,10 @@ def enrich_ae_rgpd_suggest( SuggestionStatut, ) from data.models.change import SuggestionChange - from data.models.changes import ChangeActeurUpdateData + from data.models.changes.acteur_rgpd_anonymize import ( + ACTEUR_FIELDS_TO_ANONYMIZE, + ChangeActeurRgpdAnonymize, + ) # Prepare suggestions suggestions = [] @@ -35,33 +45,32 @@ def enrich_ae_rgpd_suggest( # Preparing & validating the change params acteur_id = row[COLS.ACTEUR_ID] - data = { - x: RGPD.ACTEUR_FIELD_ANONYMIZED for x in RGPD.ACTEUR_FIELDS_TO_ANONYMIZE - } - data["statut"] = RGPD.ACTEUR_STATUS - model_params = {"id": acteur_id, "data": data} - ChangeActeurUpdateData(**model_params).validate() + model_params = {"id": acteur_id} + ChangeActeurRgpdAnonymize(**model_params).validate() # Preparing suggestion with change and ensuring we can JSON serialize it change = SuggestionChange( order=1, reason="Noms/prénoms détectés dans l'Annuaire Entreprise (AE)", entity_type="acteur_displayed", - model_name=ChangeActeurUpdateData.name(), + model_name=ChangeActeurRgpdAnonymize.name(), model_params=model_params, ).model_dump() changes.append(change) + contexte_changes = ACTEUR_FIELDS_TO_ANONYMIZE.copy() + contexte_changes["commentaires"] = "➕ Ajout mention avec 📆 date & ⏰ heure" suggestion = { - "contexte": "Idem suggestion", + "contexte": { + "changements": contexte_changes, + }, "suggestion": { - "title": "🕵️ RGPD: anonymiser les noms des acteurs", - "summary": { - "noms d'origine": row[COLS.ACTEUR_NOMS_ORIGINE], - "mots de match": row[COLS.MATCH_WORDS], - "score de match": row[COLS.MATCH_SCORE], - "changement": f"""{','.join(RGPD.ACTEUR_FIELDS_TO_ANONYMIZE)} - -> {RGPD.ACTEUR_FIELD_ANONYMIZED}""", - }, + "title": "🕵️ Anonymisation RGPD", + "summary": [ + sumline("noms d'origine", row[COLS.ACTEUR_NOMS_ORIGINE], "text"), + sumline("mots de match", row[COLS.MATCH_WORDS], "text_list"), + sumline("score de match", row[COLS.MATCH_SCORE], "score_0_to_1"), + sumline("changements", "voir contexte/détails", "text"), + ], "changes": changes, }, } diff --git a/dags/utils/dbt.py b/dags/utils/dbt.py deleted file mode 100644 index 3f4138691..000000000 --- a/dags/utils/dbt.py +++ /dev/null @@ -1,33 +0,0 @@ -"""Utilities to 
help us integrate Airflow <-> DBT, e.g. to detect -schema issues during tests/config checks without wasting time on DAG runs""" - -import json -from functools import cache -from pathlib import Path - -DIR_CURRENT = Path(__file__).resolve() -DIR_DBT = DIR_CURRENT.parent.parent.parent / "dbt" - - -@cache -def dbt_manifest_read() -> dict: - """Get the dbt manifest data""" - return json.loads((DIR_DBT / "target" / "manifest.json").read_text()) - - -def dbt_assert_model_schema(model_name: str, columns: list[str]) -> None: - """Check if a model exists in a dbt schema with some columns""" - - # Get manifest - manifest = dbt_manifest_read() - - # Ensure model is present - model_key = f"model.qfdmo.{model_name}" - if model_key not in manifest["nodes"]: - raise ValueError(f"Model {model_name} not found in dbt manifest") - - # Ensure columns are present - model_columns = manifest["nodes"][model_key]["columns"] - diff = set(columns) - set(model_columns) - if diff: - raise ValueError(f"Columns {diff} not found in model {model_name}") diff --git a/dags_unit_tests/enrich/tasks/test_enrich_ae_rgpd.py b/dags_unit_tests/enrich/tasks/test_enrich_ae_rgpd.py deleted file mode 100644 index 03b47e6e5..000000000 --- a/dags_unit_tests/enrich/tasks/test_enrich_ae_rgpd.py +++ /dev/null @@ -1,105 +0,0 @@ -import pandas as pd -import pytest - -from dags.enrich.config import COLS, DBT, RGPD -from dags.enrich.tasks.business_logic.enrich_ae_rgpd_suggest import ( - enrich_ae_rgpd_suggest, -) -from dags.utils.dbt import dbt_assert_model_schema - - -class TestEnrichAeRgpdConfig: - - def test_dbt(self): - model_name = DBT.MARTS_ENRICH_AE_RGPD - columns = [ - COLS.SIREN, - COLS.ACTEUR_ID, - COLS.ACTEUR_NOMS_ORIGINE, - COLS.ACTEUR_NOMS_NORMALISES, - COLS.ACTEUR_COMMENTAIRES, - COLS.AE_DIRIGEANTS_NOMS, - ] - dbt_assert_model_schema(model_name, columns) - - def test_rgpd_lock_fields_list(self): - # A test just to lock list of fields to anonymize - assert RGPD.ACTEUR_FIELDS_TO_ANONYMIZE == [ - "nom", - "nom_officiel", - "nom_commercial", - "description", - "email", - "telephone", - "adresse", - "adresse_complement", - ] - - def test_rgpd_lock_field_anonymized(self): - # A test just to lock field to anonymize - assert RGPD.ACTEUR_FIELD_ANONYMIZED == "ANONYMISE POUR RAISON RGPD" - - -@pytest.mark.django_db -class TestEnrichAeRgpdSuggest: - - @pytest.fixture - def df(self): - return pd.DataFrame( - { - COLS.ACTEUR_ID: ["id1", "id2"], - COLS.ACTEUR_NOMS_ORIGINE: ["acteur1", "acteur2"], - } - ) - - @pytest.fixture - def acteurs(self, df): - from qfdmo.models import Acteur - - for _, row in df.iterrows(): - Acteur.objects.create( - identifiant_unique=row[COLS.ACTEUR_ID], - nom=row[COLS.ACTEUR_NOMS_ORIGINE], - ) - - @pytest.fixture - def suggestions(self, df, acteurs): - return enrich_ae_rgpd_suggest( - df=df, - identifiant_action="my_action_id", - identifiant_execution="my_execution_id", - dry_run=True, - ) - - @pytest.fixture - def suggest(self, suggestions): - return suggestions[0] - - def test_one_suggestion_per_acteur(self, df, suggestions): - assert len(suggestions) == len(df) - - def test_one_change_per_suggestion(self, suggest): - # 1 change per acteur, we don't group acteurs together - # even if they have identical SIREN or SIRET - assert len(suggest["suggestion"]["changes"]) == 1 - - def test_suggestion_change_structure(self, suggest): - # The changes being sensitive, this test intentionnally - # hardcodes the structure of the suggestion so we need - # to udpate tests with intention when changing the DAG - change = 
suggest["suggestion"]["changes"][0] - assert change["model_name"] == "acteur_update_data" - assert change["model_params"] == { - "id": "id1", - "data": { - "nom": "ANONYMISE POUR RAISON RGPD", - "nom_officiel": "ANONYMISE POUR RAISON RGPD", - "nom_commercial": "ANONYMISE POUR RAISON RGPD", - "description": "ANONYMISE POUR RAISON RGPD", - "email": "ANONYMISE POUR RAISON RGPD", - "telephone": "ANONYMISE POUR RAISON RGPD", - "adresse": "ANONYMISE POUR RAISON RGPD", - "adresse_complement": "ANONYMISE POUR RAISON RGPD", - "statut": "INACTIF", - }, - } diff --git a/dags_unit_tests/enrich/tasks/test_enrich_ae_rgpd_read.py b/dags_unit_tests/enrich/tasks/test_enrich_ae_rgpd_read.py new file mode 100644 index 000000000..47320dea8 --- /dev/null +++ b/dags_unit_tests/enrich/tasks/test_enrich_ae_rgpd_read.py @@ -0,0 +1,49 @@ +import pytest + +from dags.enrich.config import COLS +from dags.enrich.tasks.business_logic.enrich_ae_rgpd_read import ( + enrich_ae_rgpd_read, +) + +DBT_MODEL_NAME = "my_dummy_dbt_model" + + +@pytest.mark.django_db +class TestEnrichAeRgpdRead: + + @pytest.fixture + def dbt_model(self): + from django.db import connection + + sql = f"""CREATE TABLE {DBT_MODEL_NAME} ( + {COLS.ACTEUR_COMMENTAIRES} TEXT + ); + + INSERT INTO {DBT_MODEL_NAME} ({COLS.ACTEUR_COMMENTAIRES}) VALUES + (NULL), + (' '), + ('This is the first comment.'), + ('Second comment here.'), + ('Another comment added.');""" + + with connection.cursor() as cursor: + cursor.execute(sql) + + def test_default(self, dbt_model): + df = enrich_ae_rgpd_read(DBT_MODEL_NAME) + assert df[COLS.ACTEUR_COMMENTAIRES].tolist() == [ + None, + " ", + "This is the first comment.", + "Second comment here.", + "Another comment added.", + ] + + def test_filter_supports_insensitive_regex(self, dbt_model): + df = enrich_ae_rgpd_read( + DBT_MODEL_NAME, filter_comments_contain="(second|another)" + ) + assert df[COLS.ACTEUR_COMMENTAIRES].tolist() == [ + "Second comment here.", + "Another comment added.", + ] diff --git a/dags_unit_tests/enrich/tasks/test_enrich_ae_rgpd_suggest.py b/dags_unit_tests/enrich/tasks/test_enrich_ae_rgpd_suggest.py new file mode 100644 index 000000000..ca2f4bdb3 --- /dev/null +++ b/dags_unit_tests/enrich/tasks/test_enrich_ae_rgpd_suggest.py @@ -0,0 +1,115 @@ +import json +import re + +import pandas as pd +import pytest +from django.contrib.gis.geos import Point +from rich import print + +from dags.enrich.config import COLS +from dags.enrich.tasks.business_logic.enrich_ae_rgpd_suggest import ( + enrich_ae_rgpd_suggest, +) + +CHANGE_ANON = "ANONYMISE POUR RAISON RGPD" +COMMENT_PATTERN = CHANGE_ANON + r" le \d{4}-\d{2}-\d{2} à \d{2}:\d{2}:\d{2} UTC" + + +@pytest.mark.django_db +class TestEnrichAeRgpdSuggest: + + @pytest.fixture + def df(self): + return pd.DataFrame( + { + COLS.ACTEUR_ID: ["id1", "id2"], + COLS.ACTEUR_NOMS_ORIGINE: ["acteur1", "acteur2"], + COLS.MATCH_WORDS: ["acteur1", "acteur2"], + COLS.MATCH_SCORE: [1.0, 1.0], + } + ) + + @pytest.fixture + def acteurs(self, df): + from qfdmo.models import Acteur, ActeurType + + at1 = ActeurType(code="at1") + at1.save() + + for _, row in df.iterrows(): + Acteur.objects.create( + # Required fields + identifiant_unique=row[COLS.ACTEUR_ID], + acteur_type=at1, + location=Point(1, 2), + # Fields to anonymize + nom=row[COLS.ACTEUR_NOMS_ORIGINE], + nom_officiel="🟠 not anonymized", + nom_commercial="🟠 not anonymized", + email="me@myself.com", + telephone="🟠 not anonymized", + adresse="🟠 not anonymized", + adresse_complement="🟠 not anonymized", + # Fields to keep as-is + 
description="🟠 not anonymized", + ) + + @pytest.fixture + def suggestions(self, df, acteurs): + return enrich_ae_rgpd_suggest( + df=df, + identifiant_action="my_action_id", + identifiant_execution="my_execution_id", + dry_run=True, + ) + + @pytest.fixture + def suggest(self, suggestions) -> dict: + suggest = suggestions[0] + print(f"{suggest=}") + return suggest + + def test_one_suggestion_per_acteur(self, df, suggestions): + assert len(suggestions) == len(df) + + def test_one_change_per_suggestion(self, suggest): + # 1 change per acteur, we don't group acteurs together + # even if they have identical SIREN or SIRET + assert len(suggest["suggestion"]["changes"]) == 1 + + def test_suggestion_change(self, suggest): + # The changes being sensitive, this test intentionnally + # hardcodes the structure of the suggestion so we need + # to udpate tests with intention when changing the DAG + from data.models.change import SuggestionChange + from qfdmo.models import Acteur, ActeurStatus + + change = suggest["suggestion"]["changes"][0] + assert change["model_name"] == "acteur_rgpd_anonymize" + assert change["model_params"] == {"id": "id1"} + + SuggestionChange(**change).apply() + + acteur = Acteur.objects.get(identifiant_unique="id1") + + # Fields anonymized + assert acteur.nom == "ANONYMISE POUR RAISON RGPD" + assert acteur.nom_officiel == "ANONYMISE POUR RAISON RGPD" + assert acteur.nom_commercial == "ANONYMISE POUR RAISON RGPD" + assert acteur.email is None + assert acteur.telephone == "ANONYMISE POUR RAISON RGPD" + assert acteur.adresse == "ANONYMISE POUR RAISON RGPD" + assert acteur.adresse_complement == "ANONYMISE POUR RAISON RGPD" + + # Status set to inactif + assert acteur.statut == ActeurStatus.INACTIF + + # Check comment + comments = json.loads(acteur.commentaires) + assert re.match(COMMENT_PATTERN, comments[0]["message"]) + + # Fields not changed + assert acteur.description == "🟠 not anonymized" + assert acteur.location.x == 1 + assert acteur.location.y == 2 + assert acteur.acteur_type.code == "at1" diff --git a/dags_unit_tests/utils/test_dbt.py b/dags_unit_tests/utils/test_dbt.py deleted file mode 100644 index 17f24b9d7..000000000 --- a/dags_unit_tests/utils/test_dbt.py +++ /dev/null @@ -1,23 +0,0 @@ -import pytest - -from dags.utils.dbt import dbt_assert_model_schema - -MODEL_NAME_OK = "marts_enrich_ae_rgpd" -COLUMNS_OK = ["siren", "acteur_id"] - - -class TestDbtCheckModelSchema: - - def test_working(self): - dbt_assert_model_schema(MODEL_NAME_OK, COLUMNS_OK) - pass - - def test_raise_if_column_not_found(self): - with pytest.raises(ValueError): - dbt_assert_model_schema( - MODEL_NAME_OK, ["siren", "🔴 COLUMN DOES NOT EXIST"] - ) - - def test_raise_if_model_not_found(self): - with pytest.raises(ValueError): - dbt_assert_model_schema("🔴 MODEL DOES NOT EXIST", COLUMNS_OK) diff --git a/data/migrations/0009_alter_suggestioncohorte_type_action.py b/data/migrations/0009_alter_suggestioncohorte_type_action.py new file mode 100644 index 000000000..17d58a020 --- /dev/null +++ b/data/migrations/0009_alter_suggestioncohorte_type_action.py @@ -0,0 +1,32 @@ +# Generated by Django 5.1.7 on 2025-03-20 09:08 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("data", "0008_alter_suggestioncohorte_type_action"), + ] + + operations = [ + migrations.AlterField( + model_name="suggestioncohorte", + name="type_action", + field=models.CharField( + blank=True, + choices=[ + ("CRAWL_URLS", "🔗 URLs scannées"), + ("RGPD_ANONYMISATION", "🕵️ Anonymisation 
RGPD"), + ("CLUSTERING", "regroupement/déduplication des acteurs"), + ("SOURCE_AJOUT", "ingestion de source de données - nouveau acteur"), + ( + "SOURCE_MODIFICATION", + "ingestion de source de données - modification d'acteur existant", + ), + ("SOURCE_SUPRESSION", "ingestion de source de données"), + ], + max_length=50, + ), + ), + ] diff --git a/data/models/changes/__init__.py b/data/models/changes/__init__.py index f3503fdb0..21166e57d 100644 --- a/data/models/changes/__init__.py +++ b/data/models/changes/__init__.py @@ -2,12 +2,14 @@ from .acteur_create_as_parent import ChangeActeurCreateAsParent from .acteur_delete_as_parent import ChangeActeurDeleteAsParent from .acteur_keep_as_parent import ChangeActeurKeepAsParent +from .acteur_rgpd_anonymize import ChangeActeurRgpdAnonymize from .acteur_update_data import ChangeActeurUpdateData from .acteur_update_parent_id import ChangeActeurUpdateParentId from .acteur_verify_in_revision import ChangeActeurVerifyRevision from .sample_model_do_nothing import SampleModelDoNothing CHANGE_MODELS = { + ChangeActeurRgpdAnonymize.name(): ChangeActeurRgpdAnonymize, ChangeActeurUpdateData.name(): ChangeActeurUpdateData, ChangeActeurCreateAsParent.name(): ChangeActeurCreateAsParent, ChangeActeurDeleteAsParent.name(): ChangeActeurDeleteAsParent, diff --git a/data/models/changes/acteur_rgpd_anonymize.py b/data/models/changes/acteur_rgpd_anonymize.py new file mode 100644 index 000000000..f4bc1c257 --- /dev/null +++ b/data/models/changes/acteur_rgpd_anonymize.py @@ -0,0 +1,62 @@ +"""Special change model dedicated to RGPD because: + +- NORMALLY we version data through RevisionActeur + + consequence: we create a Revision if it doesn't exist + +- HOWEVER WITH RGPD we don't do data versioning, we overwrite + the data so it disappears from our DB + = consequence: we don't create a Revision if it doesn't exist + (again we are not versioning, just overwriting) + +Since the approach to RGPD should be consistent, we don't +expect the model to take any other input than the ID of the acteur +we are changing, and the model takes care of the rest +""" + +from datetime import datetime, timezone + +from data.models.changes.acteur_abstract import ChangeActeurAbstract +from qfdmo.models import Acteur, ActeurStatus, RevisionActeur + +VALUE_ANONYMIZED = "ANONYMISE POUR RAISON RGPD" +ACTEUR_FIELDS_TO_ANONYMIZE = { + "nom": VALUE_ANONYMIZED, + "nom_officiel": VALUE_ANONYMIZED, + "nom_commercial": VALUE_ANONYMIZED, + "email": None, # due to email constraint + "telephone": VALUE_ANONYMIZED, + "adresse": VALUE_ANONYMIZED, + "adresse_complement": VALUE_ANONYMIZED, + "statut": ActeurStatus.INACTIF, +} + + +class ChangeActeurRgpdAnonymize(ChangeActeurAbstract): + @classmethod + def name(cls) -> str: + return "acteur_rgpd_anonymize" + + def validate(self) -> list[Acteur | RevisionActeur]: + if self.data: + raise ValueError("Pour RGPD ne pas fournir de data, le modèle efface") + # The parent should already exist in revision or base + # and we return all its instances to overwrite them all + instances = [] + rev = RevisionActeur.objects.filter(pk=self.id).first() + if rev: + instances.append(rev) + instances.append(Acteur.objects.get(pk=self.id)) + return instances + + def apply(self): + # For each instance found + instances = self.validate() + for instance in instances: + # We anonymize the fields + for key, value in ACTEUR_FIELDS_TO_ANONYMIZE.items(): + setattr(instance, key, value) + + # Special case for comments + now = datetime.now(timezone.utc).strftime("le %Y-%m-%d à %H:%M:%S UTC") 
+ instance.commentaires_ajouter(f"{VALUE_ANONYMIZED} {now}") + instance.save() diff --git a/data/models/changes/acteur_update_data.py b/data/models/changes/acteur_update_data.py index f740ebc3a..2c3104db7 100644 --- a/data/models/changes/acteur_update_data.py +++ b/data/models/changes/acteur_update_data.py @@ -12,17 +12,20 @@ class ChangeActeurUpdateData(ChangeActeurAbstract): def name(cls) -> str: return "acteur_update_data" - def validate(self): - # The parent should already exist + def validate(self) -> Acteur | RevisionActeur: if not self.data: raise ValueError("No data provided") + # The parent should already exist in revision or base + # We tolerate absence from revision result = RevisionActeur.objects.filter(pk=self.id).first() if not result: + # But if not in revision, must be in base result = Acteur.objects.get(pk=self.id) return result def apply(self): acteur = self.validate() + # If acteur is only in base, we need to create a revision if isinstance(acteur, Acteur): acteur = RevisionActeur(identifiant_unique=acteur.identifiant_unique) data = data_reconstruct(RevisionActeur, self.data) diff --git a/data/models/suggestion.py b/data/models/suggestion.py index ddc01eecc..f6e7d2f58 100644 --- a/data/models/suggestion.py +++ b/data/models/suggestion.py @@ -54,7 +54,7 @@ class SuggestionCohorteStatut(models.TextChoices): class SuggestionAction(models.TextChoices): CRAWL_URLS = SUGGESTION_CRAWL_URLS, "🔗 URLs scannées" - RGPD_ANONYMIZE = "RGPD_ANONYMISATION", "Anonymisation RGPD" + RGPD_ANONYMIZE = "RGPD_ANONYMISATION", "🕵️ Anonymisation RGPD" CLUSTERING = SUGGESTION_CLUSTERING, "regroupement/déduplication des acteurs" SOURCE_AJOUT = ( SUGGESTION_SOURCE_AJOUT, @@ -188,9 +188,18 @@ def display_contexte_details(self): def display_suggestion_details(self): template_name = "data/_partials/suggestion_details.html" template_context = {"suggestion": self.suggestion} + + # Suggestions leveraging the PYDANTIC SuggestionChange model if self.suggestion_cohorte.type_action == SuggestionAction.CLUSTERING: template_context = self.suggestion template_name = "data/_partials/clustering_suggestion_details.html" + elif self.suggestion_cohorte.type_action == SuggestionAction.CRAWL_URLS: + template_name = "data/_partials/crawl_urls_suggestion_details.html" + elif self.suggestion_cohorte.type_action == SuggestionAction.RGPD_ANONYMIZE: + template_name = "data/_partials/generic_suggestion_details.html" + template_context = self.suggestion + + # TODO: suggestions to migrate to PYDANTIC classes elif ( self.suggestion_cohorte.type_action == SuggestionAction.SOURCE_SUPPRESSION and isinstance(self.suggestion, dict) @@ -223,11 +232,6 @@ def display_suggestion_details(self): and isinstance(self.suggestion, dict) ): template_name = "data/_partials/ajout_suggestion_details.html" - elif self.suggestion_cohorte.type_action == SuggestionAction.CRAWL_URLS: - template_name = "data/_partials/crawl_urls_suggestion_details.html" - elif self.suggestion_cohorte.type_action == SuggestionAction.RGPD_ANONYMIZE: - template_name = "data/_partials/generic_suggestion_details.html" - template_context = self.suggestion.copy() return render_to_string(template_name, template_context) @@ -307,9 +311,9 @@ def _update_acteur(self): self._remove_acteur_linked_objects(acteur) self._create_acteur_linked_objects(acteur) - # FIXME: this acteur management will be reviewed with PYDANTIC classes which will - # be used to handle all specificities of suggestions def apply(self): + + # Suggestions leveraging the PYDANTIC SuggestionChange model if 
self.suggestion_cohorte.type_action in [ SuggestionAction.CLUSTERING, SuggestionAction.CRAWL_URLS, @@ -319,6 +323,8 @@ def apply(self): changes.sort(key=lambda x: x["order"]) for change in changes: SuggestionChange(**change).apply() + + # FIXME: this acteur management will be reviewed with PYDANTIC classes elif self.suggestion_cohorte.type_action == SuggestionAction.SOURCE_AJOUT: self._create_acteur() elif ( diff --git a/qfdmo/models/acteur.py b/qfdmo/models/acteur.py index fde50f37b..e2662c7cb 100644 --- a/qfdmo/models/acteur.py +++ b/qfdmo/models/acteur.py @@ -1,3 +1,4 @@ +import json import logging import random import re @@ -601,6 +602,33 @@ def get_fields_for_clone(cls): "labels", } + def commentaires_ajouter(self, added): + """Historically this field has been defined as TextField + but has contained a mix of free text and JSON data, hence + method to help append data in a JSON format""" + existing = self.commentaires + + # If empty we overwrite + if existing is None or existing.strip() == "": + self.commentaires = json.dumps([{"message": added}]) + else: + try: + # If not empty, trying to parse as JSON + existing_data = json.loads(existing) + if not isinstance(existing_data, list): + raise NotImplementedError( + "Cas de commentaires JSON non-liste pas prévu" + ) + except (json.JSONDecodeError, ValueError): + # If existing not JSON we turn it into a list + existing_data = [{"message": existing}] + + # Appending new data + existing_data.append({"message": added}) + self.commentaires = json.dumps(existing_data) + + self.save() + def clean_parent(parent): try: diff --git a/templates/data/_partials/generic_suggestion_details.html b/templates/data/_partials/generic_suggestion_details.html index 5c3b2f796..3a823cca5 100644 --- a/templates/data/_partials/generic_suggestion_details.html +++ b/templates/data/_partials/generic_suggestion_details.html @@ -7,10 +7,11 @@ {% block suggestion_details %} -

-💡 Résumé des changements:
-
-    {% for key,value in summary.items %}
-  • {{ key }}: {{ value }}
+💡 Résumé:
+
+    {% for entry in summary %}
+    {# TODO: we can use entry.value_type to customize rendering #}
+  • {{ entry.label }}: {{ entry.value }}
     {% endfor %}
diff --git a/unit_tests/data/models/changes/test_acteur_rgpd_anonymize.py b/unit_tests/data/models/changes/test_acteur_rgpd_anonymize.py new file mode 100644 index 000000000..74d7c02fe --- /dev/null +++ b/unit_tests/data/models/changes/test_acteur_rgpd_anonymize.py @@ -0,0 +1,121 @@ +""" +Test file for the ChangeActeurRgpdAnonymize model. + +""" + +import json +import re + +import pytest +from django.contrib.gis.geos import Point + +from data.models.changes.acteur_rgpd_anonymize import ( + ChangeActeurRgpdAnonymize, +) +from qfdmo.models.acteur import Acteur, ActeurStatus, ActeurType, RevisionActeur + +TEST_DATA = { + "location": Point(1, 2), + "nom": "🟠 not anonymized", + "nom_officiel": "🟠 not anonymized", + "nom_commercial": "🟠 not anonymized", + "description": "🟠 not anonymized", + "email": "me@myself.com", + "telephone": "🟠 not anonymized", + "adresse": "🟠 not anonymized", + "adresse_complement": "🟠 not anonymized", + "statut": ActeurStatus.ACTIF, + "commentaires": " ", +} + +# Intentionally replicating & hardcoding the expected +# changes to prevent accidental modification to model +# without updating the tests +CHANGE_ANON = "ANONYMISE POUR RAISON RGPD" +CHANGES_EXPECTED = { + "nom": CHANGE_ANON, + "nom_officiel": CHANGE_ANON, + "nom_commercial": CHANGE_ANON, + "email": None, # due to email constraint + "telephone": CHANGE_ANON, + "adresse": CHANGE_ANON, + "adresse_complement": CHANGE_ANON, + "statut": ActeurStatus.INACTIF, +} +COMMENT_PATTERN = CHANGE_ANON + r" le \d{4}-\d{2}-\d{2} à \d{2}:\d{2}:\d{2} UTC" + + +@pytest.mark.django_db +class TestChangeActeurRgpdAnonymize: + def test_name(self): + assert ChangeActeurRgpdAnonymize.name() == "acteur_rgpd_anonymize" + + def test_raise_if_data_provided(self): + change = ChangeActeurRgpdAnonymize(id="dummy", data={"nom": "dummy"}) + with pytest.raises(ValueError, match="Pour RGPD ne pas fournir de data"): + change.apply() + + def test_raise_if_acteur_does_not_exist(self): + change = ChangeActeurRgpdAnonymize(id="dummy") + with pytest.raises(Acteur.DoesNotExist): + change.apply() + + def test_working_only_in_base(self): + # We start by creating acteur only in base + at1 = ActeurType.objects.create(code="at1") + id1 = "id1" + data = TEST_DATA.copy() + data["acteur_type"] = at1 + data["identifiant_unique"] = id1 + Acteur.objects.create(**data) + + # We check that acteur isn't in revision yet + assert RevisionActeur.objects.filter(pk=id1).count() == 0 + + # Since RGPD changes are to owerwrite consistently, we don't + # pass any data to the model, only the ID of the acteur + # and the model takes care of the rest + ChangeActeurRgpdAnonymize(id=id1).apply() + + # We check that no revision was created because we overwrite + # hence don't want Revisions meants for versioning + assert not RevisionActeur.objects.filter(pk=id1).exists() + + # We check that acteur in base was anonymized + base = Acteur.objects.get(pk=id1) + for key, value in CHANGES_EXPECTED.items(): + assert getattr(base, key) == value + + # Comments + comments = json.loads(base.commentaires) + assert re.match(COMMENT_PATTERN, comments[0]["message"]) + + # We check that other fields were not modified + assert base.description == "🟠 not anonymized" + + def test_working_both_base_and_revision(self): + # We start by creating acteur BOTH in base and revision + at1 = ActeurType.objects.create(code="at1") + id2 = "id2" + data = TEST_DATA.copy() + data["acteur_type"] = at1 + data["identifiant_unique"] = id2 + Acteur.objects.create(**data) + RevisionActeur.objects.create(**data) + + # Same 
remark as previous test on not having to pass data + ChangeActeurRgpdAnonymize(id=id2).apply() + + # In this case we check that all instances were anonymized + instances = [ + Acteur.objects.get(pk=id2), + RevisionActeur.objects.get(pk=id2), + ] + for instance in instances: + for key, value in CHANGES_EXPECTED.items(): + assert getattr(instance, key) == value + assert instance.description == "🟠 not anonymized" + + # Comments + comments = json.loads(instance.commentaires) + assert re.match(COMMENT_PATTERN, comments[0]["message"]) diff --git a/unit_tests/qfdmo/test_acteur_methods.py b/unit_tests/qfdmo/test_acteur_methods.py new file mode 100644 index 000000000..d04b43291 --- /dev/null +++ b/unit_tests/qfdmo/test_acteur_methods.py @@ -0,0 +1,28 @@ +"""Test file dedicated to acteur methods""" + +import json + +import pytest + +from unit_tests.qfdmo.acteur_factory import ( + ActeurFactory, +) + + +@pytest.mark.django_db +class TestActeurMethods: + + @pytest.mark.parametrize( + "initial,expected", + [ + (None, [{"message": "test"}]), + (" ", [{"message": "test"}]), + ("foo", [{"message": "foo"}, {"message": "test"}]), + ('[{"message": "bar"}]', [{"message": "bar"}, {"message": "test"}]), + ], + ) + def test_commentaires_ajouter(self, initial, expected): + acteur = ActeurFactory(commentaires=initial) + acteur.commentaires_ajouter("test") + actual = json.loads(acteur.commentaires) + assert actual == expected From 61a2d7df0ce535c3c124ac9ff87d0384f2e3ab5b Mon Sep 17 00:00:00 2001 From: Max Corbeau Date: Mon, 24 Mar 2025 12:24:26 +0100 Subject: [PATCH 07/50] v1 qui fonctionne --- dbt/dbt_project.yml | 1 + .../udf_columns_concat_unique_non_empty.sql | 3 +- .../udf/udf_columns_words_in_common_count.sql | 23 +++++ .../udf_normalize_string_alpha_for_match.sql | 11 +-- .../base_ae_etablissement.sql | 3 + .../base/ae_annuaire_entreprises/schema.yml | 6 +- .../int_ae_etablissement.sql | 36 +++++-- .../ae_annuaire_entreprises/schema.yml | 6 +- .../marts/enrich/marts_enrich_ae_closed.sql | 97 +++++++++++++++++++ 9 files changed, 160 insertions(+), 26 deletions(-) create mode 100644 dbt/macros/udf/udf_columns_words_in_common_count.sql create mode 100644 dbt/models/marts/enrich/marts_enrich_ae_closed.sql diff --git a/dbt/dbt_project.yml b/dbt/dbt_project.yml index 04b0ca16a..656b2f801 100644 --- a/dbt/dbt_project.yml +++ b/dbt/dbt_project.yml @@ -25,6 +25,7 @@ on-run-start: - "{{ create_udf_uuid_to_int() }}" - "{{ create_udf_safe_divmod() }}" - "{{ create_udf_columns_concat_unique_non_empty() }}" + - "{{create_udf_columns_words_in_common_count()}}" - "{{ create_udf_normalize_string_alpha_for_match() }}" clean-targets: diff --git a/dbt/macros/udf/udf_columns_concat_unique_non_empty.sql b/dbt/macros/udf/udf_columns_concat_unique_non_empty.sql index 76b32544b..fc7423f5b 100644 --- a/dbt/macros/udf/udf_columns_concat_unique_non_empty.sql +++ b/dbt/macros/udf/udf_columns_concat_unique_non_empty.sql @@ -1,7 +1,6 @@ {% macro create_udf_columns_concat_unique_non_empty() %} /* - Function to concatenate strings from various - columns while only retaining non-empty values + Concatenate strings from various columns while only retaining non-empty values */ DROP FUNCTION IF EXISTS {{ target.schema }}.udf_columns_concat_unique_non_empty(VARIADIC input_columns TEXT[]); diff --git a/dbt/macros/udf/udf_columns_words_in_common_count.sql b/dbt/macros/udf/udf_columns_words_in_common_count.sql new file mode 100644 index 000000000..99bf1c676 --- /dev/null +++ b/dbt/macros/udf/udf_columns_words_in_common_count.sql @@ -0,0 +1,23 @@ +{% 
macro create_udf_columns_words_in_common_count() %} +/* + Count number of words in common between 2 columns +*/ +CREATE OR REPLACE FUNCTION {{ target.schema }}.columns_words_in_common_count(col1 text, col2 text) +RETURNS integer AS $$ +DECLARE + word text; + count integer := 0; +BEGIN + FOR word IN + SELECT unnest(string_to_array(col1, ' ')) + LOOP + -- TODO: accuracy could be improved with REGEXP boundaries to count whole words + IF position(word IN col2) > 0 THEN + count := count + 1; + END IF; + END LOOP; + + RETURN count; +END; +$$ LANGUAGE plpgsql; +{% endmacro %} \ No newline at end of file diff --git a/dbt/macros/udf/udf_normalize_string_alpha_for_match.sql b/dbt/macros/udf/udf_normalize_string_alpha_for_match.sql index 32d144ba8..affea32c5 100644 --- a/dbt/macros/udf/udf_normalize_string_alpha_for_match.sql +++ b/dbt/macros/udf/udf_normalize_string_alpha_for_match.sql @@ -1,6 +1,6 @@ {% macro create_udf_normalize_string_alpha_for_match() %} /* - Function to normalize strings for the purpose of matching. + Normalize strings for the purpose of matching. For instance for RGPD we want to identify acteurs which names are directors' names, but we cant just pull everything for processing in Python because as of 2025-03-17 there are 13M unite_legale rows @@ -15,19 +15,10 @@ CREATE FUNCTION {{ target.schema }}.udf_normalize_string_alpha_for_match(input_t DECLARE normalized TEXT; BEGIN - -- Step 1: Transliterate using unaccent normalized := unaccent(input_text); - - -- Step 2: Convert to lowercase normalized := lower(normalized); - - -- Step 3: Replace non-alpha characters with space normalized := regexp_replace(normalized, '[^a-z]', ' ', 'g'); - - -- Step 4: Replace multiple spaces with a single space normalized := regexp_replace(normalized, '\s+', ' ', 'g'); - - -- Step 5: Trim leading and trailing spaces normalized := trim(normalized); RETURN normalized; diff --git a/dbt/models/base/ae_annuaire_entreprises/base_ae_etablissement.sql b/dbt/models/base/ae_annuaire_entreprises/base_ae_etablissement.sql index 3e31ec0bb..af952b324 100644 --- a/dbt/models/base/ae_annuaire_entreprises/base_ae_etablissement.sql +++ b/dbt/models/base/ae_annuaire_entreprises/base_ae_etablissement.sql @@ -17,6 +17,9 @@ SELECT siret, activite_principale, +-- Names +denomination_usuelle, + -- Status etat_administratif, diff --git a/dbt/models/base/ae_annuaire_entreprises/schema.yml b/dbt/models/base/ae_annuaire_entreprises/schema.yml index 0ea0e6f85..15b9a0cd3 100644 --- a/dbt/models/base/ae_annuaire_entreprises/schema.yml +++ b/dbt/models/base/ae_annuaire_entreprises/schema.yml @@ -52,6 +52,10 @@ models: data_tests: - not_null - unique + - name: activite_principale + description: "Code NAF Rev2" + - name: denomination_usuelle + description: "Nom de l'établissement" - name: etat_administratif description: "A = Actif, F = Fermé" data_type: varchar(1) @@ -59,8 +63,6 @@ models: - not_null - accepted_values: values: ["A", "F"] - - name: activite_principale - description: "Code NAF Rev2" - name: numero_voie description: "Numéro de voie" - name: complement_adresse diff --git a/dbt/models/intermediate/ae_annuaire_entreprises/int_ae_etablissement.sql b/dbt/models/intermediate/ae_annuaire_entreprises/int_ae_etablissement.sql index 7b333a52c..677c6fe58 100644 --- a/dbt/models/intermediate/ae_annuaire_entreprises/int_ae_etablissement.sql +++ b/dbt/models/intermediate/ae_annuaire_entreprises/int_ae_etablissement.sql @@ -17,8 +17,15 @@ Notes: SELECT -- Codes - siret, - activite_principale AS naf, -- Making NAF explicit since it's a 
code + etab.siret, + etab.activite_principale AS naf, -- Making NAF explicit being a well-known code + + -- Names + CASE + WHEN etab.denomination_usuelle = '[ND]' AND unite.denomination IS NOT NULL THEN unite.denomination + WHEN etab.denomination_usuelle IS NULL AND unite.denomination IS NOT NULL THEN unite.denomination + ELSE etab.denomination_usuelle + END AS nom, /* Is active or not: converting this field to BOOLEAN to: @@ -27,19 +34,28 @@ SELECT using different flags - create more efficient data type and index */ - CASE etat_administratif + CASE etab.etat_administratif WHEN 'A' THEN TRUE ELSE FALSE END AS est_actif, + CASE unite.etat_administratif + WHEN 'A' THEN TRUE + ELSE FALSE + END AS unite_est_actif, -- Addresse udf_columns_concat_unique_non_empty( - numero_voie, - type_voie, - libelle_voie + etab.numero_voie, + etab.type_voie, + etab.libelle_voie ) AS adresse, - complement_adresse AS adresse_complement, - code_postal, - libelle_commune AS ville + etab.complement_adresse AS adresse_complement, + etab.code_postal, + etab.libelle_commune AS ville -FROM {{ ref('base_ae_etablissement') }} \ No newline at end of file +FROM {{ ref('base_ae_etablissement') }} AS etab +/* Joining with unite_legale to bring some essential +data from parent unite into each etablissement to save +us from making expensive JOINS in downstream models */ +JOIN {{ ref('base_ae_unite_legale') }} AS unite +ON unite.siren = LEFT(etab.siret,9) \ No newline at end of file diff --git a/dbt/models/intermediate/ae_annuaire_entreprises/schema.yml b/dbt/models/intermediate/ae_annuaire_entreprises/schema.yml index 5b9e15b70..b2cd1dd98 100644 --- a/dbt/models/intermediate/ae_annuaire_entreprises/schema.yml +++ b/dbt/models/intermediate/ae_annuaire_entreprises/schema.yml @@ -50,13 +50,15 @@ models: data_tests: - not_null - unique + - name: activite_principale + description: "Code NAF Rev2" + - name: nom + description: "Nom de l'établissement" - name: est_actif description: "OUI si A = Actif" data_type: boolean data_tests: - not_null - - name: activite_principale - description: "Code NAF Rev2" - name: numero_voie description: "Numéro de voie" - name: complement_adresse diff --git a/dbt/models/marts/enrich/marts_enrich_ae_closed.sql b/dbt/models/marts/enrich/marts_enrich_ae_closed.sql new file mode 100644 index 000000000..6fd23cee4 --- /dev/null +++ b/dbt/models/marts/enrich/marts_enrich_ae_closed.sql @@ -0,0 +1,97 @@ +/* +Model to find entries from AE's etablissement which are +potential replacements for closed acteurs. + +Code is repetitive (e.g. same logic in SELECT and ROW_NUMBER) and +could be made more concise with an intermediary CTE. However from experience, +intermediate CTEs lead to slower performance (we constraint the planner) +than just letting the query planner do its job. Thus for now, I focus +on performance given the 40M rows + +Notes: + - 🧹 Pre-matching/filtering at SQL level to reduce data size (40M rows) + - 👁️‍🗨️ Keeping as view to always re-evaluate vs. 
ever changing QFDMO data +*/ +{{ + config( + materialized = 'view', + tags=['marts', 'ae', 'annuaire_entreprises', 'etablissement', 'closed'], + ) +}} +-- Starting from our acteurs we can match via SIRET +WITH acteurs_with_siret AS ( + SELECT + LEFT(siret,9) AS siren, + siret, + nom AS acteur_nom, + udf_normalize_string_alpha_for_match(nom) AS acteur_nom_normalise, + identifiant_unique AS acteur_id, + commentaires AS acteur_commentaires, + statut AS statut + FROM {{ ref('marts_carte_acteur') }} + WHERE siret IS NOT NULL AND siret != '' AND LENGTH(siret) = 14 +), +/* Filtering on closed establishments +note: there is another + */ +closed AS ( +SELECT + etab.siret, + etab.est_actif AS etab_est_actif, + etab.code_postal AS etab_code_postal, + etab.adresse AS etab_adresse, + etab.naf AS etab_naf, + acteurs.acteur_id, + acteurs.acteur_nom, + acteurs.acteur_nom_normalise, + acteurs.acteur_commentaires +FROM acteurs_with_siret AS acteurs +JOIN {{ ref('int_ae_etablissement') }} AS etab ON acteurs.siret = etab.siret +/* + By NOT filtering on unite_est_actif we have an opportunite to get replacements + from the same unite (SIREN) +*/ +WHERE NOT etab.est_actif +), ae_potential_replacements AS ( + SELECT + closed.acteur_id AS acteur_id, + closed.siret AS acteur_siret, + replacements.siret AS remplacer_siret, + CASE + WHEN LEFT(closed.siret,9) = LEFT(replacements.siret,9) THEN 1 + ELSE 0 + END AS remplacer_meme_siren, + closed.acteur_nom, + replacements.nom AS remplacer_nom, + columns_words_in_common_count( + closed.acteur_nom_normalise, + udf_normalize_string_alpha_for_match(replacements.nom) + ) AS noms_nombre_mots_commun, + closed.acteur_commentaires AS acteur_commentaires, + replacements.naf AS naf, + replacements.ville AS ville, + replacements.code_postal AS code_postal, + replacements.adresse AS adresse, + ROW_NUMBER() OVER ( + PARTITION BY closed.siret + ORDER BY + -- Prioritize replacements from same company + CASE + WHEN LEFT(closed.siret,9) = LEFT(replacements.siret,9) THEN 1 + ELSE 0 + END DESC, + -- Then etablissements with more words in common + columns_words_in_common_count( + closed.acteur_nom_normalise, + udf_normalize_string_alpha_for_match(replacements.nom) + ) DESC + ) AS rn + FROM closed + INNER JOIN {{ ref('int_ae_etablissement') }} AS replacements + ON replacements.naf = closed.etab_naf + AND replacements.code_postal = closed.etab_code_postal + AND replacements.adresse = closed.etab_adresse + WHERE replacements.est_actif +) +SELECT * FROM ae_potential_replacements +WHERE rn=1 \ No newline at end of file From 94405bebc61936c90e4f66e8f3a6a7a95ca23bbd Mon Sep 17 00:00:00 2001 From: Max Corbeau Date: Mon, 31 Mar 2025 07:12:29 +0200 Subject: [PATCH 08/50] =?UTF-8?q?d=C3=A9but=20refacto=20pour=20factoriser?= =?UTF-8?q?=20RGPD=20+=20fermetures?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- dags/enrich/config/__init__.py | 1 + dags/enrich/config/columns.py | 2 +- dags/enrich/config/dbt.py | 2 + dags/enrich/config/models.py | 46 +++++++++ dags/enrich/config/tasks.py | 13 ++- dags/enrich/config/xcoms.py | 4 +- dags/enrich/dags/enrich_ae_closed.py | 44 +++++++++ .../enrich_ae_rgpd_match_task.py | 4 +- .../airflow_logic/enrich_ae_rgpd_read_task.py | 4 +- .../enrich_ae_rgpd_suggest_task.py | 4 +- .../enrich_read_ae_closed_candidates_task.py | 51 ++++++++++ .../enrich_read_ae_closed_replaced_task.py | 0 .../business_logic/enrich_ae_rgpd_match.py | 14 +-- .../business_logic/enrich_ae_rgpd_suggest.py | 4 +- .../tasks/business_logic/enrich_read.py | 
45 +++++++++ dags/enrich/test.py | 3 + .../config/test_enrich_closed_config.py | 28 ++++++ .../marts/enrich/marts_enrich_ae_closed.sql | 97 ------------------- .../marts_enrich_ae_closed_candidates.sql | 54 +++++++++++ .../marts_enrich_ae_closed_replaced.sql | 51 ++++++++++ 20 files changed, 355 insertions(+), 116 deletions(-) create mode 100644 dags/enrich/config/models.py create mode 100644 dags/enrich/dags/enrich_ae_closed.py create mode 100644 dags/enrich/tasks/airflow_logic/enrich_read_ae_closed_candidates_task.py create mode 100644 dags/enrich/tasks/airflow_logic/enrich_read_ae_closed_replaced_task.py create mode 100644 dags/enrich/tasks/business_logic/enrich_read.py create mode 100644 dags/enrich/test.py create mode 100644 dags_unit_tests/enrich/config/test_enrich_closed_config.py delete mode 100644 dbt/models/marts/enrich/marts_enrich_ae_closed.sql create mode 100644 dbt/models/marts/enrich/marts_enrich_ae_closed_candidates.sql create mode 100644 dbt/models/marts/enrich/marts_enrich_ae_closed_replaced.sql diff --git a/dags/enrich/config/__init__.py b/dags/enrich/config/__init__.py index 77e633a18..856f8b15f 100644 --- a/dags/enrich/config/__init__.py +++ b/dags/enrich/config/__init__.py @@ -1,5 +1,6 @@ from .columns import COLS # noqa: F401 from .dbt import DBT # noqa: F401 +from .models import EnrichClosedConfig # noqa: F401 from .paths import DIR_SQL_READ # noqa: F401 from .tasks import TASKS # noqa: F401 from .xcoms import XCOMS, xcom_pull # noqa: F401 diff --git a/dags/enrich/config/columns.py b/dags/enrich/config/columns.py index c65aa6e63..661bb2796 100644 --- a/dags/enrich/config/columns.py +++ b/dags/enrich/config/columns.py @@ -23,6 +23,6 @@ class COLS: AE_DIRIGEANTS_NOMS: str = "ae_dirigeants_noms_prenoms" # Matching - MATCH_SCORE: str = "match_score" + MATCH_SCORE_AE_RGPD: str = "match_score" MATCH_WORDS: str = "match_words" MATCH_THRESHOLD: str = "match_threshold" diff --git a/dags/enrich/config/dbt.py b/dags/enrich/config/dbt.py index d3cfd43be..f5bc9684d 100644 --- a/dags/enrich/config/dbt.py +++ b/dags/enrich/config/dbt.py @@ -6,3 +6,5 @@ @dataclass(frozen=True) class DBT: MARTS_ENRICH_AE_RGPD: str = "marts_enrich_ae_rgpd" + MARTS_ENRICH_AE_CLOSED_CANDIDATES: str = "marts_enrich_ae_closed_candidates" + MARTS_ENRICH_AE_CLOSED_REPLACED: str = "marts_enrich_ae_closed_replaced" diff --git a/dags/enrich/config/models.py b/dags/enrich/config/models.py new file mode 100644 index 000000000..13c6dcf5d --- /dev/null +++ b/dags/enrich/config/models.py @@ -0,0 +1,46 @@ +"""Configuration models enrich DAG""" + +import re +from typing import Optional + +from pydantic import BaseModel + +SEPARATOR_FILTER_FIELD = "__" + + +def filters_get(model: BaseModel, prefix: str) -> list[dict[str, str]]: + """Utility to get list of filters (field, value) to apply to the data, + used 2 ways: + - generate the Airflow params for the UI from field names only + - read Airflow params to generate filters with values + + Thus we have a dynamic Airflow UI controlled by and always aligned with + our config model by only maintaining the latter. 
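+
+    Minimal illustration (hypothetical config, not part of this module):
+
+        class DummyConfig(BaseModel):
+            filter_contains__nom: Optional[str] = None
+
+        filters_get(DummyConfig(filter_contains__nom="velo"), "filter_contains")
+        # -> [{"field": "nom", "value": "velo"}]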
+ """ + filters = [] + for field in model.model_fields: + if re.fullmatch(f"{prefix}{SEPARATOR_FILTER_FIELD}[a-z_]+", field): + filters.append( + { + "field": field.replace(f"{prefix}{SEPARATOR_FILTER_FIELD}", ""), + "value": getattr(model, field), + } + ) + return filters + + +class EnrichBaseConfig(BaseModel): + dry_run: bool + filter_contains__commentaires: Optional[str] = "test" + filter_contains__nom: Optional[str] + filter_equals__statut: Optional[str] + + def filters_contains(self) -> list[dict[str, str]]: + return filters_get(self, "filter_contains") + + def filters_equals(self) -> list[dict[str, str]]: + return filters_get(self, "filter_equals") + + +class EnrichClosedConfig(EnrichBaseConfig): + filter_contains__naf: Optional[str] diff --git a/dags/enrich/config/tasks.py b/dags/enrich/config/tasks.py index b5889493a..f914b2264 100644 --- a/dags/enrich/config/tasks.py +++ b/dags/enrich/config/tasks.py @@ -5,6 +5,13 @@ @dataclass(frozen=True) class TASKS: - READ: str = "enrich_ae_rgpd_read" - MATCH_SCORE: str = "enrich_ae_rgpd_match" - SUGGEST: str = "enrich_ae_rgpd_suggest" + # Read tasks + READ_AE_RGPD: str = "enrich_ae_rgpd_read" + READ_AE_CLOSED_CANDIDATES: str = "enrich_read_ae_closed_candidates" + READ_AE_CLOSED_REPLACED: str = "enrich_read_ae_closed_replaced" + + # Matching tasks + MATCH_SCORE_AE_RGPD: str = "enrich_ae_rgpd_match" + + # Suggestion tasks + SUGGEST_AE_RGPD: str = "enrich_ae_rgpd_suggest" diff --git a/dags/enrich/config/xcoms.py b/dags/enrich/config/xcoms.py index 4d3cebda1..694128e19 100644 --- a/dags/enrich/config/xcoms.py +++ b/dags/enrich/config/xcoms.py @@ -29,9 +29,9 @@ def xcom_pull(ti: TaskInstance, key: str, skip_if_empty: bool = False) -> Any: # Reading values if key == XCOMS.DF_READ: - value: pd.DataFrame = ti.xcom_pull(key=key, task_ids=TASKS.READ) + value: pd.DataFrame = ti.xcom_pull(key=key, task_ids=TASKS.READ_AE_RGPD) elif key == XCOMS.DF_MATCH: - value: pd.DataFrame = ti.xcom_pull(key=key, task_ids=TASKS.MATCH_SCORE) + value: pd.DataFrame = ti.xcom_pull(key=key, task_ids=TASKS.MATCH_SCORE_AE_RGPD) else: raise ValueError(f"{msg} key inconnue") diff --git a/dags/enrich/dags/enrich_ae_closed.py b/dags/enrich/dags/enrich_ae_closed.py new file mode 100644 index 000000000..2de827805 --- /dev/null +++ b/dags/enrich/dags/enrich_ae_closed.py @@ -0,0 +1,44 @@ +""" +DAG to anonymize QFDMO acteur which names +contains people from Annuaire Entreprise (AE) +""" + +from datetime import datetime + +from airflow import DAG +from airflow.models.baseoperator import chain +from airflow.models.param import Param +from enrich.tasks.airflow_logic.enrich_read_ae_closed_candidates_task import ( + enrich_read_ae_closed_candidates_task, +) + +with DAG( + dag_id="enrich_ae_closed", + dag_display_name="Enrichir - AE - Acteurs fermés", + default_args={ + "owner": "airflow", + "depends_on_past": False, + "start_date": datetime(2025, 3, 5), + "catchup": False, + "email_on_failure": False, + "email_on_retry": False, + "retries": 0, + }, + description=( + "Un DAG pour détécter et remplacer les acteurs fermés" + "dans l'Annuaire Entreprises (AE)" + ), + tags=["annuaire", "entreprise", "ae", "siren", "siret", "acteurs"], + schedule=None, + catchup=False, + params={ + "dry_run": Param( + True, + type="boolean", + description_md="🚱 Si coché, aucune tâche d'écriture ne sera effectuée", + ), + }, +) as dag: + chain( + enrich_read_ae_closed_candidates_task(dag), + ) diff --git a/dags/enrich/tasks/airflow_logic/enrich_ae_rgpd_match_task.py 
b/dags/enrich/tasks/airflow_logic/enrich_ae_rgpd_match_task.py index 27b94012b..bfc27bbbc 100644 --- a/dags/enrich/tasks/airflow_logic/enrich_ae_rgpd_match_task.py +++ b/dags/enrich/tasks/airflow_logic/enrich_ae_rgpd_match_task.py @@ -16,7 +16,7 @@ def task_info_get(): return f""" ============================================================ - Description de la tâche "{TASKS.MATCH_SCORE}" + Description de la tâche "{TASKS.MATCH_SCORE_AE_RGPD}" ============================================================ 💡 quoi: on cherche à déterminer quels acteurs QFDMO ont un nom qui correspond à des noms de personnes dans l'AE @@ -44,7 +44,7 @@ def enrich_ae_rgpd_match_wrapper(ti, params) -> None: def enrich_ae_rgpd_match_task(dag: DAG) -> PythonOperator: return PythonOperator( - task_id=TASKS.MATCH_SCORE, + task_id=TASKS.MATCH_SCORE_AE_RGPD, python_callable=enrich_ae_rgpd_match_wrapper, dag=dag, ) diff --git a/dags/enrich/tasks/airflow_logic/enrich_ae_rgpd_read_task.py b/dags/enrich/tasks/airflow_logic/enrich_ae_rgpd_read_task.py index 1f11b35a3..426317042 100644 --- a/dags/enrich/tasks/airflow_logic/enrich_ae_rgpd_read_task.py +++ b/dags/enrich/tasks/airflow_logic/enrich_ae_rgpd_read_task.py @@ -16,7 +16,7 @@ def task_info_get(): return f""" ============================================================ - Description de la tâche "{TASKS.READ}" + Description de la tâche "{TASKS.READ_AE_RGPD}" ============================================================ 💡 quoi: lecture des données via le modèle DBT {DBT.MARTS_ENRICH_AE_RGPD} @@ -45,7 +45,7 @@ def enrich_ae_rgpd_read_wrapper(ti, params) -> None: def enrich_ae_rgpd_read_task(dag: DAG) -> PythonOperator: return PythonOperator( - task_id=TASKS.READ, + task_id=TASKS.READ_AE_RGPD, python_callable=enrich_ae_rgpd_read_wrapper, dag=dag, ) diff --git a/dags/enrich/tasks/airflow_logic/enrich_ae_rgpd_suggest_task.py b/dags/enrich/tasks/airflow_logic/enrich_ae_rgpd_suggest_task.py index 9b195b3b3..ba3c47c29 100644 --- a/dags/enrich/tasks/airflow_logic/enrich_ae_rgpd_suggest_task.py +++ b/dags/enrich/tasks/airflow_logic/enrich_ae_rgpd_suggest_task.py @@ -16,7 +16,7 @@ def task_info_get(): return f""" ============================================================ - Description de la tâche "{TASKS.SUGGEST}" + Description de la tâche "{TASKS.SUGGEST_AE_RGPD}" ============================================================ 💡 quoi: on cherche à déterminer quels acteurs QFDMO ont un nom qui correspond à des noms de personnes dans l'AE @@ -45,7 +45,7 @@ def enrich_ae_rgpd_suggest_wrapper(ti, params, dag, run_id) -> None: def enrich_ae_rgpd_suggest_task(dag: DAG) -> PythonOperator: return PythonOperator( - task_id=TASKS.SUGGEST, + task_id=TASKS.SUGGEST_AE_RGPD, python_callable=enrich_ae_rgpd_suggest_wrapper, dag=dag, ) diff --git a/dags/enrich/tasks/airflow_logic/enrich_read_ae_closed_candidates_task.py b/dags/enrich/tasks/airflow_logic/enrich_read_ae_closed_candidates_task.py new file mode 100644 index 000000000..b46dadafe --- /dev/null +++ b/dags/enrich/tasks/airflow_logic/enrich_read_ae_closed_candidates_task.py @@ -0,0 +1,51 @@ +"""Read data from DB needed for RGPD anonymization""" + +import logging + +from airflow import DAG +from airflow.exceptions import AirflowSkipException +from airflow.operators.python import PythonOperator +from enrich.config import DBT, TASKS, XCOMS +from enrich.tasks.business_logic.enrich_read import ( + enrich_read, +) + +logger = logging.getLogger(__name__) + + +def task_info_get(): + return f""" + 
============================================================
+    Description de la tâche "{TASKS.READ_AE_CLOSED_CANDIDATES}"
+    ============================================================
+    💡 quoi: lecture des données via le modèle DBT
+    {DBT.MARTS_ENRICH_AE_CLOSED_CANDIDATES}
+
+    🎯 pourquoi: faire un pré-filtre sur les acteurs potentiellement fermés
+    (pas récupérer les ~40M de lignes de la table AE etablissement)
+
+    🏗️ comment: on récupère uniquement les acteurs dont l'établissement
+    est fermé dans l'AE, via la jointure SIRET faite
+    dans le modèle DBT
+    """
+
+
+def enrich_read_ae_closed_candidates_wrapper(ti, params) -> None:
+    logger.info(task_info_get())
+
+    df = enrich_read(
+        dbt_model_name=DBT.MARTS_ENRICH_AE_CLOSED_CANDIDATES,
+        filter_comments_contain=params["filter_comments_contain"],
+    )
+    if df.empty:
+        raise AirflowSkipException("Pas de données DB, on s'arrête là")
+
+    ti.xcom_push(key=XCOMS.DF_READ, value=df)
+
+
+def enrich_read_ae_closed_candidates_task(dag: DAG) -> PythonOperator:
+    return PythonOperator(
+        task_id=TASKS.READ_AE_RGPD,
+        python_callable=enrich_read_ae_closed_candidates_wrapper,
+        dag=dag,
+    )
diff --git a/dags/enrich/tasks/airflow_logic/enrich_read_ae_closed_replaced_task.py b/dags/enrich/tasks/airflow_logic/enrich_read_ae_closed_replaced_task.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/dags/enrich/tasks/business_logic/enrich_ae_rgpd_match.py b/dags/enrich/tasks/business_logic/enrich_ae_rgpd_match.py
index 75d73045e..c40dd2676 100644
--- a/dags/enrich/tasks/business_logic/enrich_ae_rgpd_match.py
+++ b/dags/enrich/tasks/business_logic/enrich_ae_rgpd_match.py
@@ -59,17 +59,19 @@ def enrich_ae_rgpd_match(
         lambda x: word_overlap_ratio(x, cols_names_qfdmo, cols_names_ae), axis=1
     )
     df[COLS.MATCH_WORDS] = df["temp"].apply(lambda x: x[0])
-    df[COLS.MATCH_SCORE] = df["temp"].apply(lambda x: x[1])
+    df[COLS.MATCH_SCORE_AE_RGPD] = df["temp"].apply(lambda x: x[1])
     df.drop(columns=["temp"], inplace=True)

     # Selecting & previewing matches
-    df_no_match = df[df[COLS.MATCH_SCORE] == 0]
-    df_partial = df[(df[COLS.MATCH_SCORE] > 0) & (df[COLS.MATCH_SCORE] < 1)]
-    df_perfect = df[df[COLS.MATCH_SCORE] == 1]
-    df_retained = df[df[COLS.MATCH_SCORE] >= match_threshold].copy()
+    df_no_match = df[df[COLS.MATCH_SCORE_AE_RGPD] == 0]
+    df_partial = df[
+        (df[COLS.MATCH_SCORE_AE_RGPD] > 0) & (df[COLS.MATCH_SCORE_AE_RGPD] < 1)
+    ]
+    df_perfect = df[df[COLS.MATCH_SCORE_AE_RGPD] == 1]
+    df_retained = df[df[COLS.MATCH_SCORE_AE_RGPD] >= match_threshold].copy()

     log.preview_df_as_markdown("🔴 Matches non-existant (==0)", df_no_match)
     log.preview_df_as_markdown("🟡 Matches partiel (>0 & <1)", df_partial)
     log.preview_df_as_markdown("🟢 Matches parfait (==1)", df_perfect)
     log.preview_df_as_markdown(f"💾 Matches retenus (>={match_threshold})", df_retained)

-    return df_retained.sort_values(COLS.MATCH_SCORE, ascending=False)
+    return df_retained.sort_values(COLS.MATCH_SCORE_AE_RGPD, ascending=False)
diff --git a/dags/enrich/tasks/business_logic/enrich_ae_rgpd_suggest.py b/dags/enrich/tasks/business_logic/enrich_ae_rgpd_suggest.py
index 3ff3e5b9e..5f774778c 100644
--- a/dags/enrich/tasks/business_logic/enrich_ae_rgpd_suggest.py
+++ b/dags/enrich/tasks/business_logic/enrich_ae_rgpd_suggest.py
@@ -68,7 +68,9 @@ def enrich_ae_rgpd_suggest(
             "summary": [
                 sumline("noms d'origine", row[COLS.ACTEUR_NOMS_ORIGINE], "text"),
                 sumline("mots de match", row[COLS.MATCH_WORDS], "text_list"),
-                sumline("score de match", row[COLS.MATCH_SCORE], "score_0_to_1"),
+                sumline(
+                    "score de match", row[COLS.MATCH_SCORE_AE_RGPD],
"score_0_to_1" + ), sumline("changements", "voir contexte/détails", "text"), ], "changes": changes, diff --git a/dags/enrich/tasks/business_logic/enrich_read.py b/dags/enrich/tasks/business_logic/enrich_read.py new file mode 100644 index 000000000..f3d893436 --- /dev/null +++ b/dags/enrich/tasks/business_logic/enrich_read.py @@ -0,0 +1,45 @@ +"""Read data from DB needed for RGPD anonymization""" + +import logging + +import numpy as np +import pandas as pd +from utils import logging_utils as log +from utils.django import django_setup_full + +django_setup_full() + +logger = logging.getLogger(__name__) + + +def enrich_read( + dbt_model_name: str, filters_contain: list[tuple[str, str]] = [] +) -> pd.DataFrame: + """Reads necessary QFDMO acteurs and AE entries from DB""" + from django.db import connection + + # Execute SQL query and get data + with connection.cursor() as cursor: + cursor.execute(f"SELECT * FROM {dbt_model_name}") + columns = [col[0] for col in cursor.description] + data = cursor.fetchall() + + # Create DataFrame and preview + df = pd.DataFrame(data, columns=columns, dtype="object").replace({np.nan: None}) + log.preview_df_as_markdown(f"Données de {dbt_model_name} SANS filtre", df) + + # Filtering if needed + if not df.empty: + for col_name, col_value in filters_contain: + col_value = (col_value or "").strip() + if filter: + logger.info(f"Filtre sur {col_name} CONTIENT {col_value}") + df = df[df[col_name].notnull()].copy() + df = df[ + df[col_name].str.contains(col_value, regex=True, case=False) + ].copy() + log.preview_df_as_markdown( + f"Données de {dbt_model_name} APRES filtre", df + ) + + return df diff --git a/dags/enrich/test.py b/dags/enrich/test.py new file mode 100644 index 000000000..bb9712b72 --- /dev/null +++ b/dags/enrich/test.py @@ -0,0 +1,3 @@ +azerty = "foo" + +print(azerty) diff --git a/dags_unit_tests/enrich/config/test_enrich_closed_config.py b/dags_unit_tests/enrich/config/test_enrich_closed_config.py new file mode 100644 index 000000000..82212ef3c --- /dev/null +++ b/dags_unit_tests/enrich/config/test_enrich_closed_config.py @@ -0,0 +1,28 @@ +import pytest + +from dags.enrich.config.models import EnrichClosedConfig + + +class TestEnrichClosedConfig: + + @pytest.fixture + def config(self): + return EnrichClosedConfig( + dry_run=True, + filter_contains__commentaires="commentaires", + filter_contains__nom="nom", + filter_contains__naf=None, + filter_equals__statut="ACTIF", + ) + + def test_filters_contain(self, config): + assert config.filters_contains() == [ + {"field": "commentaires", "value": "commentaires"}, + {"field": "nom", "value": "nom"}, + {"field": "naf", "value": None}, + ] + + def test_filters_equals(self, config): + assert config.filters_equals() == [ + {"field": "statut", "value": "ACTIF"}, + ] diff --git a/dbt/models/marts/enrich/marts_enrich_ae_closed.sql b/dbt/models/marts/enrich/marts_enrich_ae_closed.sql deleted file mode 100644 index 6fd23cee4..000000000 --- a/dbt/models/marts/enrich/marts_enrich_ae_closed.sql +++ /dev/null @@ -1,97 +0,0 @@ -/* -Model to find entries from AE's etablissement which are -potential replacements for closed acteurs. - -Code is repetitive (e.g. same logic in SELECT and ROW_NUMBER) and -could be made more concise with an intermediary CTE. However from experience, -intermediate CTEs lead to slower performance (we constraint the planner) -than just letting the query planner do its job. 
Thus for now, I focus -on performance given the 40M rows - -Notes: - - 🧹 Pre-matching/filtering at SQL level to reduce data size (40M rows) - - 👁️‍🗨️ Keeping as view to always re-evaluate vs. ever changing QFDMO data -*/ -{{ - config( - materialized = 'view', - tags=['marts', 'ae', 'annuaire_entreprises', 'etablissement', 'closed'], - ) -}} --- Starting from our acteurs we can match via SIRET -WITH acteurs_with_siret AS ( - SELECT - LEFT(siret,9) AS siren, - siret, - nom AS acteur_nom, - udf_normalize_string_alpha_for_match(nom) AS acteur_nom_normalise, - identifiant_unique AS acteur_id, - commentaires AS acteur_commentaires, - statut AS statut - FROM {{ ref('marts_carte_acteur') }} - WHERE siret IS NOT NULL AND siret != '' AND LENGTH(siret) = 14 -), -/* Filtering on closed establishments -note: there is another - */ -closed AS ( -SELECT - etab.siret, - etab.est_actif AS etab_est_actif, - etab.code_postal AS etab_code_postal, - etab.adresse AS etab_adresse, - etab.naf AS etab_naf, - acteurs.acteur_id, - acteurs.acteur_nom, - acteurs.acteur_nom_normalise, - acteurs.acteur_commentaires -FROM acteurs_with_siret AS acteurs -JOIN {{ ref('int_ae_etablissement') }} AS etab ON acteurs.siret = etab.siret -/* - By NOT filtering on unite_est_actif we have an opportunite to get replacements - from the same unite (SIREN) -*/ -WHERE NOT etab.est_actif -), ae_potential_replacements AS ( - SELECT - closed.acteur_id AS acteur_id, - closed.siret AS acteur_siret, - replacements.siret AS remplacer_siret, - CASE - WHEN LEFT(closed.siret,9) = LEFT(replacements.siret,9) THEN 1 - ELSE 0 - END AS remplacer_meme_siren, - closed.acteur_nom, - replacements.nom AS remplacer_nom, - columns_words_in_common_count( - closed.acteur_nom_normalise, - udf_normalize_string_alpha_for_match(replacements.nom) - ) AS noms_nombre_mots_commun, - closed.acteur_commentaires AS acteur_commentaires, - replacements.naf AS naf, - replacements.ville AS ville, - replacements.code_postal AS code_postal, - replacements.adresse AS adresse, - ROW_NUMBER() OVER ( - PARTITION BY closed.siret - ORDER BY - -- Prioritize replacements from same company - CASE - WHEN LEFT(closed.siret,9) = LEFT(replacements.siret,9) THEN 1 - ELSE 0 - END DESC, - -- Then etablissements with more words in common - columns_words_in_common_count( - closed.acteur_nom_normalise, - udf_normalize_string_alpha_for_match(replacements.nom) - ) DESC - ) AS rn - FROM closed - INNER JOIN {{ ref('int_ae_etablissement') }} AS replacements - ON replacements.naf = closed.etab_naf - AND replacements.code_postal = closed.etab_code_postal - AND replacements.adresse = closed.etab_adresse - WHERE replacements.est_actif -) -SELECT * FROM ae_potential_replacements -WHERE rn=1 \ No newline at end of file diff --git a/dbt/models/marts/enrich/marts_enrich_ae_closed_candidates.sql b/dbt/models/marts/enrich/marts_enrich_ae_closed_candidates.sql new file mode 100644 index 000000000..b143ce1ce --- /dev/null +++ b/dbt/models/marts/enrich/marts_enrich_ae_closed_candidates.sql @@ -0,0 +1,54 @@ +/* +Model to find entries from AE's etablissement which are +potential replacements for etab_closed acteurs. + +Code is repetitive (e.g. same logic in SELECT and ROW_NUMBER) and +could be made more concise with an intermediary CTE. However from experience, +intermediate CTEs lead to slower performance (we constraint the planner) +than just letting the query planner do its job. 
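+(A hypothetical sketch of the intermediate-CTE shape avoided here, e.g.
+computing the repeated expression once before ranking:
+  WITH scored AS (
+    SELECT joined.*, columns_words_in_common_count(nom_a, nom_b) AS nb_mots
+    FROM joined
+  )
+  SELECT *, ROW_NUMBER() OVER (ORDER BY nb_mots DESC) AS rn FROM scored
+where joined, nom_a and nom_b are placeholders, not columns of this model.)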
Thus for now, I focus +on performance given the 40M rows + +Notes: + - 🧹 Pre-matching/filtering at SQL level to reduce data size (40M rows) + - 👁️‍🗨️ Keeping as view to always re-evaluate vs. ever changing QFDMO data +*/ +{{ + config( + materialized = 'view', + tags=['marts', 'ae', 'annuaire_entreprises', 'etablissement'], + ) +}} +-- Starting from our acteurs we can match via SIRET +WITH acteurs_with_siret AS ( + SELECT + LEFT(siret,9) AS siren, + siret, + nom AS acteur_nom, + udf_normalize_string_alpha_for_match(nom) AS acteur_nom_normalise, + identifiant_unique AS acteur_id, + commentaires AS acteur_commentaires, + statut AS acteur_statut + FROM {{ ref('marts_carte_acteur') }} + WHERE siret IS NOT NULL AND siret != '' AND LENGTH(siret) = 14 +), +/* Filtering on etab closed (NOT etab.est_actif) BUT +not on unite closed (NOT unite_est_actif) because +open unite might bring potential replacements */ +etab_closed_candidates AS ( +SELECT + etab.siret, + etab.unite_est_actif AS unite_est_actif, + etab.est_actif AS etab_est_actif, + etab.code_postal AS etab_code_postal, + etab.adresse AS etab_adresse, + etab.naf AS etab_naf, + acteurs.acteur_id, + acteurs.acteur_statut, + acteurs.acteur_nom, + acteurs.acteur_nom_normalise, + acteurs.acteur_commentaires +FROM acteurs_with_siret AS acteurs +JOIN {{ ref('int_ae_etablissement') }} AS etab ON acteurs.siret = etab.siret +WHERE etab.est_actif = FALSE +) +SELECT * FROM etab_closed_candidates \ No newline at end of file diff --git a/dbt/models/marts/enrich/marts_enrich_ae_closed_replaced.sql b/dbt/models/marts/enrich/marts_enrich_ae_closed_replaced.sql new file mode 100644 index 000000000..a3cc01285 --- /dev/null +++ b/dbt/models/marts/enrich/marts_enrich_ae_closed_replaced.sql @@ -0,0 +1,51 @@ +{{ + config( + materialized = 'view', + tags=['marts', 'ae', 'annuaire_entreprises', 'etablissement'], + ) +}} + +WITH potential_replacements AS ( + SELECT + candidates.acteur_id AS acteur_id, + candidates.acteur_statut AS acteur_statut, + candidates.siret AS acteur_siret, + replacements.siret AS remplacer_siret, + CASE + WHEN LEFT(candidates.siret,9) = LEFT(replacements.siret,9) THEN 1 + ELSE 0 + END AS remplacer_meme_siren, + candidates.acteur_nom, + replacements.nom AS remplacer_nom, + columns_words_in_common_count( + candidates.acteur_nom_normalise, + udf_normalize_string_alpha_for_match(replacements.nom) + ) AS noms_nombre_mots_commun, + candidates.acteur_commentaires AS acteur_commentaires, + replacements.naf AS naf, + replacements.ville AS ville, + replacements.code_postal AS code_postal, + replacements.adresse AS adresse, + ROW_NUMBER() OVER ( + PARTITION BY candidates.siret + ORDER BY + -- Prioritize replacements from same company + CASE + WHEN LEFT(candidates.siret,9) = LEFT(replacements.siret,9) THEN 1 + ELSE 0 + END DESC, + -- Then etablissements with more words in common + columns_words_in_common_count( + candidates.acteur_nom_normalise, + udf_normalize_string_alpha_for_match(replacements.nom) + ) DESC + ) AS replacement_priority + FROM {{ ref('marts_enrich_ae_closed_candidates') }} AS candidates + INNER JOIN {{ ref('int_ae_etablissement') }} AS replacements + ON replacements.naf = candidates.etab_naf + AND replacements.code_postal = candidates.etab_code_postal + AND replacements.adresse = candidates.etab_adresse + WHERE replacements.est_actif +) +SELECT * FROM potential_replacements +WHERE replacement_priority=1 \ No newline at end of file From 92123d0788d2566929fa11475b876881c327ca5f Mon Sep 17 00:00:00 2001 From: Max Corbeau Date: Thu, 3 Apr 
2025 16:01:27 +0200 Subject: [PATCH 09/50] =?UTF-8?q?d=C3=A9but=20refacto=20et=20progr=C3=A8s?= =?UTF-8?q?=20vers=20d=C3=A9cision=20m=C3=A9tier?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- dags/enrich/config/__init__.py | 3 +- dags/enrich/config/cohorts.py | 12 +++ dags/enrich/config/models.py | 54 +++++++--- dags/enrich/config/tasks.py | 3 + dags/enrich/config/xcoms.py | 13 ++- dags/enrich/dags/enrich_acteurs_closed.py | 58 +++++++++++ ...rich_ae_rgpd.py => enrich_acteurs_rgpd.py} | 4 +- dags/enrich/dags/enrich_ae_closed.py | 44 --------- .../enrich_config_create_task.py | 40 ++++++++ ..._task.py => enrich_read_dbt_model_task.py} | 30 +++--- .../business_logic/enrich_ae_suggestions.py | 99 +++++++++++++++++++ .../tasks/business_logic/enrich_read.py | 45 --------- .../business_logic/enrich_read_dbt_model.py | 58 +++++++++++ dags/shared/config/models.py | 36 +++++++ .../test_enrich_acteurs_closed_config.py | 27 +++++ .../config/test_enrich_closed_config.py | 28 ------ .../config/test_shared_config_models.py | 56 +++++++++++ data/models/suggestion.py | 1 + qfdmo/models/acteur.py | 15 +++ 19 files changed, 479 insertions(+), 147 deletions(-) create mode 100644 dags/enrich/config/cohorts.py create mode 100644 dags/enrich/dags/enrich_acteurs_closed.py rename dags/enrich/dags/{enrich_ae_rgpd.py => enrich_acteurs_rgpd.py} (95%) delete mode 100644 dags/enrich/dags/enrich_ae_closed.py create mode 100644 dags/enrich/tasks/airflow_logic/enrich_config_create_task.py rename dags/enrich/tasks/airflow_logic/{enrich_read_ae_closed_candidates_task.py => enrich_read_dbt_model_task.py} (57%) create mode 100644 dags/enrich/tasks/business_logic/enrich_ae_suggestions.py delete mode 100644 dags/enrich/tasks/business_logic/enrich_read.py create mode 100644 dags/enrich/tasks/business_logic/enrich_read_dbt_model.py create mode 100644 dags/shared/config/models.py create mode 100644 dags_unit_tests/enrich/config/test_enrich_acteurs_closed_config.py delete mode 100644 dags_unit_tests/enrich/config/test_enrich_closed_config.py create mode 100644 dags_unit_tests/shared/config/test_shared_config_models.py diff --git a/dags/enrich/config/__init__.py b/dags/enrich/config/__init__.py index 856f8b15f..b6c1b63e6 100644 --- a/dags/enrich/config/__init__.py +++ b/dags/enrich/config/__init__.py @@ -1,6 +1,7 @@ +from .cohorts import COHORTS # noqa: F401 from .columns import COLS # noqa: F401 from .dbt import DBT # noqa: F401 -from .models import EnrichClosedConfig # noqa: F401 +from .models import DAG_ID_TO_CONFIG_MODEL, EnrichActeursClosedConfig # noqa: F401 from .paths import DIR_SQL_READ # noqa: F401 from .tasks import TASKS # noqa: F401 from .xcoms import XCOMS, xcom_pull # noqa: F401 diff --git a/dags/enrich/config/cohorts.py b/dags/enrich/config/cohorts.py new file mode 100644 index 000000000..5e9be42f1 --- /dev/null +++ b/dags/enrich/config/cohorts.py @@ -0,0 +1,12 @@ +"""Cohorts for enrich DAGs""" + +from dataclasses import dataclass + +INTRO = "🚪 Acteurs Fermés:" + + +@dataclass(frozen=True) +class COHORTS: + ACTEURS_CLOSED_NOT_REPLACED: str = f"{INTRO} 🔴 non remplacés" + ACTEURS_CLOSED_REP_DIFF_SIREN: str = f"{INTRO} 🟡 remplacés via SIREN diff" + ACTEURS_CLOSED_REP_SAME_SIREN: str = f"{INTRO} 🟢 remplacés via SIREN idem" diff --git a/dags/enrich/config/models.py b/dags/enrich/config/models.py index 13c6dcf5d..5254ffb6f 100644 --- a/dags/enrich/config/models.py +++ b/dags/enrich/config/models.py @@ -3,12 +3,12 @@ import re from typing import Optional -from pydantic import 
BaseModel
+from pydantic import BaseModel, Field, computed_field

 SEPARATOR_FILTER_FIELD = "__"


-def filters_get(model: BaseModel, prefix: str) -> list[dict[str, str]]:
+def filters_get(model: BaseModel, prefix: str, operator: str) -> list[dict[str, str]]:
     """Utility to get list of filters (field, value)
     to apply to the data, used 2 ways:
      - generate the Airflow params for the UI from field names only
@@ -19,28 +19,60 @@ def filters_get(model: BaseModel, prefix: str) -> list[dict[str, str]]:
     """
     filters = []
     for field in model.model_fields:
+        value = getattr(model, field)
         if re.fullmatch(f"{prefix}{SEPARATOR_FILTER_FIELD}[a-z_]+", field):
+
+            # Skip None values unless the operator is explicitly "is_null"
+            if value is None and operator != "is_null":
+                continue
+
             filters.append(
                 {
                     "field": field.replace(f"{prefix}{SEPARATOR_FILTER_FIELD}", ""),
-                    "value": getattr(model, field),
+                    "operator": operator,
+                    "value": value,
                 }
             )
     return filters


 class EnrichBaseConfig(BaseModel):
-    dry_run: bool
-    filter_contains__commentaires: Optional[str] = "test"
-    filter_contains__nom: Optional[str]
-    filter_equals__statut: Optional[str]
+    dry_run: bool = Field(
+        default=True,
+        description="🚱 Si coché, aucune tâche d'écriture ne sera effectuée",
+    )
+    filter_contains__acteur_commentaires: Optional[str] = Field(
+        default=None,
+        description="🔍 Filtre sur **acteur_commentaires**",
+    )
+    filter_contains__acteur_nom: Optional[str] = Field(
+        default=None,
+        description="🔍 Filtre sur **acteur_nom**",
+    )
+    filter_equals__acteur_statut: Optional[str] = Field(
+        default=None,
+        description="🔍 Filtre sur **acteur_statut**",
+    )

     def filters_contains(self) -> list[dict[str, str]]:
-        return filters_get(self, "filter_contains")
+        return filters_get(self, "filter_contains", "contains")

     def filters_equals(self) -> list[dict[str, str]]:
-        return filters_get(self, "filter_equals")
+        return filters_get(self, "filter_equals", "equals")
+
+    @computed_field
+    @property
+    def filters(self) -> list[dict[str, str]]:
+        return self.filters_contains() + self.filters_equals()
+
+
+class EnrichActeursClosedConfig(EnrichBaseConfig):
+    filter_contains__etab_naf: Optional[str] = Field(
+        default=None,
+        description="🔍 Filtre sur **NAF AE Etablissement**",
+    )


-class EnrichClosedConfig(EnrichBaseConfig):
-    filter_contains__naf: Optional[str]
+DAG_ID_TO_CONFIG_MODEL = {
+    "enrich_acteurs_closed": EnrichActeursClosedConfig,
+}
diff --git a/dags/enrich/config/tasks.py b/dags/enrich/config/tasks.py
index f914b2264..034a6a1b9 100644
--- a/dags/enrich/config/tasks.py
+++ b/dags/enrich/config/tasks.py
@@ -5,6 +5,9 @@

 @dataclass(frozen=True)
 class TASKS:
+    # Config
+    CONFIG_CREATE: str = "enrich_config_create"
+
     # Read tasks
     READ_AE_RGPD: str = "enrich_ae_rgpd_read"
     READ_AE_CLOSED_CANDIDATES: str = "enrich_read_ae_closed_candidates"
diff --git a/dags/enrich/config/xcoms.py b/dags/enrich/config/xcoms.py
index 694128e19..511b1670b 100644
--- a/dags/enrich/config/xcoms.py
+++ b/dags/enrich/config/xcoms.py
@@ -15,23 +15,28 @@

 @dataclass(frozen=True)
 class XCOMS:
+    CONFIG: str = "config"
     DF_READ: str = "df_read"
     DF_MATCH: str = "df_match"

+    DF_CLOSED_CANDIDATES: str = "df_acteurs_closed_candidates"
+    DF_CLOSED_REPLACED: str = "df_acteurs_closed_replaced"
+

 def xcom_pull(ti: TaskInstance, key: str, skip_if_empty: bool = False) -> Any:
     """For pulls, we create a helper to constrain keys
     to specific task ids to guarantee consistent pulls"""

     # Init
-    value: Any = None  # type: ignore
     msg = f"XCOM from {ti.task_id=} pulling {key=}:"  # For logging

     # Reading values
-    if
key == XCOMS.DF_READ: - value: pd.DataFrame = ti.xcom_pull(key=key, task_ids=TASKS.READ_AE_RGPD) + if key == XCOMS.CONFIG: + value = ti.xcom_pull(key=key, task_ids=TASKS.CONFIG_CREATE) + elif key == XCOMS.DF_READ: + value = ti.xcom_pull(key=key, task_ids=TASKS.READ_AE_RGPD) elif key == XCOMS.DF_MATCH: - value: pd.DataFrame = ti.xcom_pull(key=key, task_ids=TASKS.MATCH_SCORE_AE_RGPD) + value = ti.xcom_pull(key=key, task_ids=TASKS.MATCH_SCORE_AE_RGPD) else: raise ValueError(f"{msg} key inconnue") diff --git a/dags/enrich/dags/enrich_acteurs_closed.py b/dags/enrich/dags/enrich_acteurs_closed.py new file mode 100644 index 000000000..4188b1cfc --- /dev/null +++ b/dags/enrich/dags/enrich_acteurs_closed.py @@ -0,0 +1,58 @@ +""" +DAG to anonymize QFDMO acteur which names +contains people from Annuaire Entreprise (AE) +""" + +from datetime import datetime + +from airflow import DAG +from airflow.models.baseoperator import chain +from enrich.config import DBT, TASKS, XCOMS, EnrichActeursClosedConfig +from enrich.tasks.airflow_logic.enrich_config_create_task import ( + enrich_config_create_task, +) +from enrich.tasks.airflow_logic.enrich_read_dbt_model_task import ( + enrich_read_dbt_model_task, +) +from shared.config.models import config_to_airflow_params + +with DAG( + dag_id="enrich_acteurs_closed", + dag_display_name="🚪 Enrichir - Acteurs Fermés", + default_args={ + "owner": "airflow", + "depends_on_past": False, + "start_date": datetime(2025, 3, 5), + "catchup": False, + "email_on_failure": False, + "email_on_retry": False, + "retries": 0, + }, + description=( + "Un DAG pour détécter et remplacer les acteurs fermés" + "dans l'Annuaire Entreprises (AE)" + ), + tags=["annuaire", "entreprises", "ae", "siren", "siret", "acteurs", "fermés"], + schedule=None, + catchup=False, + params=config_to_airflow_params( + EnrichActeursClosedConfig( + filter_equals__acteur_statut="ACTIF", + ) + ), +) as dag: + chain( + enrich_config_create_task(dag), + enrich_read_dbt_model_task( + dag, + task_id=TASKS.READ_AE_CLOSED_CANDIDATES, + dbt_model_name=DBT.MARTS_ENRICH_AE_CLOSED_CANDIDATES, + xcom_push_key=XCOMS.DF_CLOSED_CANDIDATES, + ), + enrich_read_dbt_model_task( + dag, + task_id=TASKS.READ_AE_CLOSED_REPLACED, + dbt_model_name=DBT.MARTS_ENRICH_AE_CLOSED_REPLACED, + xcom_push_key=XCOMS.DF_CLOSED_REPLACED, + ), + ) diff --git a/dags/enrich/dags/enrich_ae_rgpd.py b/dags/enrich/dags/enrich_acteurs_rgpd.py similarity index 95% rename from dags/enrich/dags/enrich_ae_rgpd.py rename to dags/enrich/dags/enrich_acteurs_rgpd.py index dd7b1d654..c2c64ba64 100644 --- a/dags/enrich/dags/enrich_ae_rgpd.py +++ b/dags/enrich/dags/enrich_acteurs_rgpd.py @@ -20,8 +20,8 @@ ) with DAG( - dag_id="enrich_ae_rgpd", - dag_display_name="Enrichir - AE - RGPD", + dag_id="enrich_ae_acteurs_rgpd", + dag_display_name="Enrichir - AE - Acteurs RGPD", default_args={ "owner": "airflow", "depends_on_past": False, diff --git a/dags/enrich/dags/enrich_ae_closed.py b/dags/enrich/dags/enrich_ae_closed.py deleted file mode 100644 index 2de827805..000000000 --- a/dags/enrich/dags/enrich_ae_closed.py +++ /dev/null @@ -1,44 +0,0 @@ -""" -DAG to anonymize QFDMO acteur which names -contains people from Annuaire Entreprise (AE) -""" - -from datetime import datetime - -from airflow import DAG -from airflow.models.baseoperator import chain -from airflow.models.param import Param -from enrich.tasks.airflow_logic.enrich_read_ae_closed_candidates_task import ( - enrich_read_ae_closed_candidates_task, -) - -with DAG( - dag_id="enrich_ae_closed", - 
dag_display_name="Enrichir - AE - Acteurs fermés", - default_args={ - "owner": "airflow", - "depends_on_past": False, - "start_date": datetime(2025, 3, 5), - "catchup": False, - "email_on_failure": False, - "email_on_retry": False, - "retries": 0, - }, - description=( - "Un DAG pour détécter et remplacer les acteurs fermés" - "dans l'Annuaire Entreprises (AE)" - ), - tags=["annuaire", "entreprise", "ae", "siren", "siret", "acteurs"], - schedule=None, - catchup=False, - params={ - "dry_run": Param( - True, - type="boolean", - description_md="🚱 Si coché, aucune tâche d'écriture ne sera effectuée", - ), - }, -) as dag: - chain( - enrich_read_ae_closed_candidates_task(dag), - ) diff --git a/dags/enrich/tasks/airflow_logic/enrich_config_create_task.py b/dags/enrich/tasks/airflow_logic/enrich_config_create_task.py new file mode 100644 index 000000000..aea481f0f --- /dev/null +++ b/dags/enrich/tasks/airflow_logic/enrich_config_create_task.py @@ -0,0 +1,40 @@ +"""Read data from DB needed for RGPD anonymization""" + +import logging + +from airflow import DAG +from airflow.operators.python import PythonOperator +from enrich.config import DAG_ID_TO_CONFIG_MODEL, TASKS, XCOMS + +logger = logging.getLogger(__name__) + + +def task_info_get(): + return f""" + ============================================================ + Description de la tâche "{TASKS.CONFIG_CREATE}" + ============================================================ + 💡 quoi: création de la config + + 🎯 pourquoi: s'assurer qu'elle est OK avant de faire du travail, + réutiliser la config pour les autres tâches + + 🏗️ comment: on ingère les paramètres Airflow dans un modèle pydantic + """ + + +def enrich_config_create_wrapper(ti, dag, params) -> None: + logger.info(task_info_get()) + + config = DAG_ID_TO_CONFIG_MODEL[dag.dag_id](**params) + logger.info(f"📖 Configuration:\n{config.model_dump_json(indent=2)}") + + ti.xcom_push(key=XCOMS.CONFIG, value=config) + + +def enrich_config_create_task(dag: DAG) -> PythonOperator: + return PythonOperator( + task_id=TASKS.CONFIG_CREATE, + python_callable=enrich_config_create_wrapper, + dag=dag, + ) diff --git a/dags/enrich/tasks/airflow_logic/enrich_read_ae_closed_candidates_task.py b/dags/enrich/tasks/airflow_logic/enrich_read_dbt_model_task.py similarity index 57% rename from dags/enrich/tasks/airflow_logic/enrich_read_ae_closed_candidates_task.py rename to dags/enrich/tasks/airflow_logic/enrich_read_dbt_model_task.py index b46dadafe..22fdfb72b 100644 --- a/dags/enrich/tasks/airflow_logic/enrich_read_ae_closed_candidates_task.py +++ b/dags/enrich/tasks/airflow_logic/enrich_read_dbt_model_task.py @@ -5,9 +5,9 @@ from airflow import DAG from airflow.exceptions import AirflowSkipException from airflow.operators.python import PythonOperator -from enrich.config import DBT, TASKS, XCOMS -from enrich.tasks.business_logic.enrich_read import ( - enrich_read, +from enrich.config import DBT, TASKS, XCOMS, xcom_pull +from enrich.tasks.business_logic.enrich_read_dbt_model import ( + enrich_read_dbt_model, ) logger = logging.getLogger(__name__) @@ -30,22 +30,28 @@ def task_info_get(): """ -def enrich_read_ae_closed_candidates_wrapper(ti, params) -> None: +def enrich_read_dbt_model_wrapper(dbt_model_name, xcom_push_key, ti) -> None: logger.info(task_info_get()) - df = enrich_read( - dbt_model_name=DBT.MARTS_ENRICH_AE_CLOSED_CANDIDATES, - filter_comments_contain=params["filter_comments_contain"], - ) + # Config + config = xcom_pull(ti, XCOMS.CONFIG) + logger.info(f"📖 Configuration:\n{config.model_dump_json(indent=2)}") 
+ + # Processing + df = enrich_read_dbt_model(dbt_model_name=dbt_model_name, filters=config.filters) if df.empty: raise AirflowSkipException("Pas de données DB, on s'arrête là") - ti.xcom_push(key=XCOMS.DF_READ, value=df) + # Result + ti.xcom_push(key=xcom_push_key, value=df) -def enrich_read_ae_closed_candidates_task(dag: DAG) -> PythonOperator: +def enrich_read_dbt_model_task( + dag: DAG, task_id: str, dbt_model_name: str, xcom_push_key: str +) -> PythonOperator: return PythonOperator( - task_id=TASKS.READ_AE_RGPD, - python_callable=enrich_read_ae_closed_candidates_wrapper, + task_id=task_id, + python_callable=enrich_read_dbt_model_wrapper, + op_args=[dbt_model_name, xcom_push_key], dag=dag, ) diff --git a/dags/enrich/tasks/business_logic/enrich_ae_suggestions.py b/dags/enrich/tasks/business_logic/enrich_ae_suggestions.py new file mode 100644 index 000000000..1816742f6 --- /dev/null +++ b/dags/enrich/tasks/business_logic/enrich_ae_suggestions.py @@ -0,0 +1,99 @@ +import logging + +import pandas as pd +from enrich.config import COHORTS + +logger = logging.getLogger(__name__) + + +def enrich_ae_suggestions( + df: pd.DataFrame, + cohort_type: str, + identifiant_action: str, + identifiant_execution: str, + dry_run: bool = True, +) -> None: + from data.models import ( + Suggestion, + SuggestionAction, + SuggestionCohorte, + SuggestionStatut, + ) + from data.models.change import SuggestionChange + from data.models.changes import ChangeActeurUpdateData + from qfdmo.models import ActeurStatus + + if cohort_type not in [ + COHORTS.ACTEURS_CLOSED_NOT_REPLACED, + COHORTS.ACTEURS_CLOSED_REP_DIFF_SIREN, + COHORTS.ACTEURS_CLOSED_REP_SAME_SIREN, + ]: + raise ValueError(f"Mauvaise cohorte: {cohort_type=}") + + suggestions = [] + + for row in df.itertuples(index=False): + + # ----------------------------------------- + # CHANGES: PREPARE + # ----------------------------------------- + changes = [] + + model_params = { + "id": row.acteur_id, + "data": { + "statut": ActeurStatus.INACTIF, + }, + } + ChangeActeurUpdateData(**model_params).validate() + change = SuggestionChange( + order=1, + reason=cohort_type, + entity_type="acteur_displayed", + model_name=ChangeActeurUpdateData.name(), + model_params=model_params, + ).model_dump() + changes.append(change) + + # ----------------------------------------- + # SUGGESTION: PREPARE + # ----------------------------------------- + suggestions.append( + { + # TODO: free format thanks to recursive model + "contexte": {}, + "suggestion": { + "title": cohort_type, + "summary": [], + "changes": changes, + }, + } + ) + + # ----------------------------------------- + # DRY RUN: STOP HERE + # ----------------------------------------- + if dry_run: + logger.info("✋ Dry run: suggestions pas écrites en base") + return + + # ----------------------------------------- + # SUGGESTION: WRITE TO DB + # ----------------------------------------- + cohort = SuggestionCohorte( + identifiant_action=identifiant_action, + identifiant_execution=f"{cohort_type} {identifiant_execution}", + statut=SuggestionStatut.AVALIDER, + type_action=SuggestionAction.ACTEURS_CLOSED, + metadata={"🔢 Nombre de suggestions": len(suggestions)}, + ) + cohort.save() + for suggestion in suggestions: + + for suggestion in suggestions: + Suggestion( + suggestion_cohorte=cohort, + statut=SuggestionStatut.AVALIDER, + contexte=suggestion["contexte"], + suggestion=suggestion["suggestion"], + ).save() diff --git a/dags/enrich/tasks/business_logic/enrich_read.py b/dags/enrich/tasks/business_logic/enrich_read.py deleted 
file mode 100644 index f3d893436..000000000 --- a/dags/enrich/tasks/business_logic/enrich_read.py +++ /dev/null @@ -1,45 +0,0 @@ -"""Read data from DB needed for RGPD anonymization""" - -import logging - -import numpy as np -import pandas as pd -from utils import logging_utils as log -from utils.django import django_setup_full - -django_setup_full() - -logger = logging.getLogger(__name__) - - -def enrich_read( - dbt_model_name: str, filters_contain: list[tuple[str, str]] = [] -) -> pd.DataFrame: - """Reads necessary QFDMO acteurs and AE entries from DB""" - from django.db import connection - - # Execute SQL query and get data - with connection.cursor() as cursor: - cursor.execute(f"SELECT * FROM {dbt_model_name}") - columns = [col[0] for col in cursor.description] - data = cursor.fetchall() - - # Create DataFrame and preview - df = pd.DataFrame(data, columns=columns, dtype="object").replace({np.nan: None}) - log.preview_df_as_markdown(f"Données de {dbt_model_name} SANS filtre", df) - - # Filtering if needed - if not df.empty: - for col_name, col_value in filters_contain: - col_value = (col_value or "").strip() - if filter: - logger.info(f"Filtre sur {col_name} CONTIENT {col_value}") - df = df[df[col_name].notnull()].copy() - df = df[ - df[col_name].str.contains(col_value, regex=True, case=False) - ].copy() - log.preview_df_as_markdown( - f"Données de {dbt_model_name} APRES filtre", df - ) - - return df diff --git a/dags/enrich/tasks/business_logic/enrich_read_dbt_model.py b/dags/enrich/tasks/business_logic/enrich_read_dbt_model.py new file mode 100644 index 000000000..49fa0ec92 --- /dev/null +++ b/dags/enrich/tasks/business_logic/enrich_read_dbt_model.py @@ -0,0 +1,58 @@ +"""Read data from DB needed for RGPD anonymization""" + +import logging + +import numpy as np +import pandas as pd +from utils import logging_utils as log +from utils.django import django_setup_full + +django_setup_full() + +logger = logging.getLogger(__name__) + + +def enrich_read_dbt_model( + dbt_model_name: str, filters: list[dict] = [] +) -> pd.DataFrame: + """Reads necessary QFDMO acteurs and AE entries from DB""" + from django.db import connection + + # Execute SQL query and get data + with connection.cursor() as cursor: + cursor.execute(f"SELECT * FROM {dbt_model_name}") + columns = [col[0] for col in cursor.description] + data = cursor.fetchall() + + # Create DataFrame and preview + df = pd.DataFrame(data, columns=columns, dtype="object").replace({np.nan: None}) + log.preview_df_as_markdown(f"Données de {dbt_model_name} SANS filtre", df) + + # Filtering if needed + filter_applied = False + if not df.empty: + for filter in filters: + + # Assignment & info + filter_applied = True + field = filter["field"] + operator = filter["operator"] + value = filter["value"] + logger.info(f"\n🔽 Filtre sur {field=} {operator=} {value=}") + logger.info(f"Avant filtre : {df.shape[0]} lignes") + + # Filtering + if filter["operator"] == "equals": + logger.info(f"Filtre sur {field} EQUALS {value}") + df = df[df[field] == value].copy() + elif filter["operator"] == "contains": + df = df[df[field].str.contains(value, regex=True, case=False)].copy() + else: + raise NotImplementedError(f"{filter['operator']=} non implémenté") + + logger.info(f"Après filtre : {df.shape[0]} lignes") + + if filter_applied: + log.preview_df_as_markdown(f"Données de {dbt_model_name} APRES filtre(s)", df) + + return df diff --git a/dags/shared/config/models.py b/dags/shared/config/models.py new file mode 100644 index 000000000..89740aa14 --- /dev/null +++ 
b/dags/shared/config/models.py @@ -0,0 +1,36 @@ +import typing + +from airflow.models.param import Param +from pydantic import BaseModel + +PYDANTIC_TYPE_TO_AIRFLOW_TYPE = { + bool: "boolean", + str: "string", + typing.Optional[str]: ["null", "string"], +} + + +def config_to_airflow_params(model_instance: BaseModel) -> dict[str, Param]: + """Generate Airflow params from a pydantic config model instance: + + TODO: to implement recurring/complex types, we can use a mapping + with field_name as entry, and keep the generic fallback below + + if field_name == "complex_field": + params = PARAMS[field_name] + elif: + ... + else: + fallback to current logic + """ + params = {} + model_cls = model_instance.__class__ + for field_name, field_info in model_cls.model_fields.items(): + field_value = getattr(model_instance, field_name) # Get value from instance + + params[field_name] = Param( + field_value, + type=PYDANTIC_TYPE_TO_AIRFLOW_TYPE[field_info.annotation], + description_md=field_info.description, + ) + return params diff --git a/dags_unit_tests/enrich/config/test_enrich_acteurs_closed_config.py b/dags_unit_tests/enrich/config/test_enrich_acteurs_closed_config.py new file mode 100644 index 000000000..595536f35 --- /dev/null +++ b/dags_unit_tests/enrich/config/test_enrich_acteurs_closed_config.py @@ -0,0 +1,27 @@ +import pytest + +from dags.enrich.config.models import EnrichActeursClosedConfig + + +class TestEnrichClosedConfig: + + @pytest.fixture + def config(self): + return EnrichActeursClosedConfig( + dry_run=True, + filter_contains__acteur_commentaires="my comment", + filter_contains__acteur_nom=None, + filter_contains__etab_naf="test NAF", + filter_equals__acteur_statut="ACTIF", + ) + + def test_filters_get(self, config): + assert config.filters == [ + { + "field": "acteur_commentaires", + "operator": "contains", + "value": "my comment", + }, + {"field": "etab_naf", "operator": "contains", "value": "test NAF"}, + {"field": "acteur_statut", "operator": "equals", "value": "ACTIF"}, + ] diff --git a/dags_unit_tests/enrich/config/test_enrich_closed_config.py b/dags_unit_tests/enrich/config/test_enrich_closed_config.py deleted file mode 100644 index 82212ef3c..000000000 --- a/dags_unit_tests/enrich/config/test_enrich_closed_config.py +++ /dev/null @@ -1,28 +0,0 @@ -import pytest - -from dags.enrich.config.models import EnrichClosedConfig - - -class TestEnrichClosedConfig: - - @pytest.fixture - def config(self): - return EnrichClosedConfig( - dry_run=True, - filter_contains__commentaires="commentaires", - filter_contains__nom="nom", - filter_contains__naf=None, - filter_equals__statut="ACTIF", - ) - - def test_filters_contain(self, config): - assert config.filters_contains() == [ - {"field": "commentaires", "value": "commentaires"}, - {"field": "nom", "value": "nom"}, - {"field": "naf", "value": None}, - ] - - def test_filters_equals(self, config): - assert config.filters_equals() == [ - {"field": "statut", "value": "ACTIF"}, - ] diff --git a/dags_unit_tests/shared/config/test_shared_config_models.py b/dags_unit_tests/shared/config/test_shared_config_models.py new file mode 100644 index 000000000..194b6517f --- /dev/null +++ b/dags_unit_tests/shared/config/test_shared_config_models.py @@ -0,0 +1,56 @@ +from typing import Optional + +import pytest +from pydantic import BaseModel, Field + +from dags.shared.config.models import config_to_airflow_params + + +class MyModel(BaseModel): + dry_run: bool = Field( + default=True, + description="🚱 Si coché...", + ) + some_string: str = Field( + 
default="foo", + description="SOME STRING", + ) + opt_string_untouched: Optional[str] = Field( + default=None, + description="OPT STRING UNTOUCHED", + ) + opt_string_changed: Optional[str] = Field( + default=None, + description="OPT STRING CHANGED", + ) + + +class TestConfigModelToAirflowParams: + @pytest.fixture + def model_instance(self): + return MyModel(opt_string_changed="bar") + + @pytest.fixture + def params(self, model_instance): + return config_to_airflow_params(model_instance) + + def test_boolean(self, params): + param = params["dry_run"] + assert param.value is True + assert param.schema["type"] == "boolean" + assert param.schema["description_md"] == "🚱 Si coché..." + + def test_string(self, params): + param = params["some_string"] + assert param.value == "foo" + assert param.schema["type"] == "string" + + def test_opt_string_untouched(self, params): + param = params["opt_string_untouched"] + assert param.value is None + assert param.schema["type"] == ["null", "string"] + + def test_opt_string_changed(self, params): + param = params["opt_string_changed"] + assert param.value == "bar" + assert param.schema["type"] == ["null", "string"] diff --git a/data/models/suggestion.py b/data/models/suggestion.py index f6e7d2f58..d8ce1daac 100644 --- a/data/models/suggestion.py +++ b/data/models/suggestion.py @@ -55,6 +55,7 @@ class SuggestionCohorteStatut(models.TextChoices): class SuggestionAction(models.TextChoices): CRAWL_URLS = SUGGESTION_CRAWL_URLS, "🔗 URLs scannées" RGPD_ANONYMIZE = "RGPD_ANONYMISATION", "🕵️ Anonymisation RGPD" + ACTEURS_CLOSED = "ACTEURS_CLOSED", "🚪 Acteurs fermés" CLUSTERING = SUGGESTION_CLUSTERING, "regroupement/déduplication des acteurs" SOURCE_AJOUT = ( SUGGESTION_SOURCE_AJOUT, diff --git a/qfdmo/models/acteur.py b/qfdmo/models/acteur.py index e2662c7cb..905313221 100644 --- a/qfdmo/models/acteur.py +++ b/qfdmo/models/acteur.py @@ -355,6 +355,14 @@ class Meta: siret = models.CharField( max_length=14, blank=True, default="", db_default="", db_index=True ) + # To backfill SIRET status into our DB from AE and avoid having to evaluate + # AE's DB at runtime (which has 40M rows), also helping with Django admin info + siret_is_closed = models.BooleanField( + default=None, # by default we can't assume a SIRET is opened + blank=True, + verbose_name="SIRET fermé", + help_text="Indique si le SIRET est fermé ou non dans l'Annuaire Entreprises", + ) source = models.ForeignKey(Source, on_delete=models.CASCADE, blank=True, null=True) identifiant_externe = models.CharField( max_length=255, blank=True, default="", db_default="" @@ -730,6 +738,13 @@ class Meta: related_name="duplicats", validators=[clean_parent], ) + parent_reason = models.CharField( + max_length=255, + blank=True, + default="", + db_default="", + help_text="Raison du rattachement au parent", + ) @property def is_parent(self): From f1d4915fe31a46b7da44e84c8e44a608a1558e49 Mon Sep 17 00:00:00 2001 From: Max Corbeau Date: Mon, 7 Apr 2025 08:43:38 +0200 Subject: [PATCH 10/50] utilisation constantes de config dans DAG --- dags/enrich/dags/enrich_acteurs_closed.py | 11 ++++------- dags/shared/config/__init__.py | 1 + 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/dags/enrich/dags/enrich_acteurs_closed.py b/dags/enrich/dags/enrich_acteurs_closed.py index 4188b1cfc..fcf45cbf6 100644 --- a/dags/enrich/dags/enrich_acteurs_closed.py +++ b/dags/enrich/dags/enrich_acteurs_closed.py @@ -3,8 +3,6 @@ contains people from Annuaire Entreprise (AE) """ -from datetime import datetime - from airflow import DAG from 
airflow.models.baseoperator import chain from enrich.config import DBT, TASKS, XCOMS, EnrichActeursClosedConfig @@ -14,7 +12,7 @@ from enrich.tasks.airflow_logic.enrich_read_dbt_model_task import ( enrich_read_dbt_model_task, ) -from shared.config.models import config_to_airflow_params +from shared.config import CATCHUPS, SCHEDULES, START_DATES, config_to_airflow_params with DAG( dag_id="enrich_acteurs_closed", @@ -22,8 +20,6 @@ default_args={ "owner": "airflow", "depends_on_past": False, - "start_date": datetime(2025, 3, 5), - "catchup": False, "email_on_failure": False, "email_on_retry": False, "retries": 0, @@ -33,8 +29,9 @@ "dans l'Annuaire Entreprises (AE)" ), tags=["annuaire", "entreprises", "ae", "siren", "siret", "acteurs", "fermés"], - schedule=None, - catchup=False, + schedule=SCHEDULES.NONE, + catchup=CATCHUPS.AWLAYS_FALSE, + start_date=START_DATES.FOR_SCHEDULE_NONE, params=config_to_airflow_params( EnrichActeursClosedConfig( filter_equals__acteur_statut="ACTIF", diff --git a/dags/shared/config/__init__.py b/dags/shared/config/__init__.py index f8f8adb54..712d8484d 100644 --- a/dags/shared/config/__init__.py +++ b/dags/shared/config/__init__.py @@ -1,3 +1,4 @@ from .catchups import CATCHUPS # noqa: F401 +from .models import config_to_airflow_params # noqa: F401 from .schedules import SCHEDULES # noqa: F401 from .start_dates import START_DATES # noqa: F401 From a37de1487fb422e1e7bdecb017ab9b6883252228 Mon Sep 17 00:00:00 2001 From: Max Corbeau Date: Mon, 7 Apr 2025 11:00:01 +0200 Subject: [PATCH 11/50] =?UTF-8?q?suggestions:=20d=C3=A9but=20de=20cr=C3=A9?= =?UTF-8?q?ation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- dags/enrich/config/columns.py | 12 + dags/enrich/dags/enrich_acteurs_closed.py | 18 +- .../enrich_read_dbt_model_task.py | 2 + .../enrich_acteurs_closed_suggestions.py | 149 ++++++++++++ .../business_logic/enrich_ae_suggestions.py | 99 -------- dags/enrich/test.py | 3 - .../test_enrich_acteurs_closed_suggestions.py | 212 ++++++++++++++++++ dbt/macros/constants/value_unavailable.sql | 3 + .../int_ae_etablissement.sql | 12 +- .../ae_annuaire_entreprises/schema.yml | 2 + .../marts_enrich_ae_closed_replaced.sql | 10 +- dbt/models/marts/enrich/schema.yml | 87 ++++++- .../0140_acteur_siret_is_closed_and_more.py | 64 ++++++ 13 files changed, 561 insertions(+), 112 deletions(-) create mode 100644 dags/enrich/tasks/business_logic/enrich_acteurs_closed_suggestions.py delete mode 100644 dags/enrich/tasks/business_logic/enrich_ae_suggestions.py delete mode 100644 dags/enrich/test.py create mode 100644 dags_unit_tests/enrich/tasks/test_enrich_acteurs_closed_suggestions.py create mode 100644 dbt/macros/constants/value_unavailable.sql create mode 100644 qfdmo/migrations/0140_acteur_siret_is_closed_and_more.py diff --git a/dags/enrich/config/columns.py b/dags/enrich/config/columns.py index 661bb2796..2c0461810 100644 --- a/dags/enrich/config/columns.py +++ b/dags/enrich/config/columns.py @@ -15,12 +15,24 @@ class COLS: # QFDMO ACTEUR_ID: str = "acteur_id" + ACTEUR_SIRET: str = "acteur_siret" + ACTEUR_NOM: str = "acteur_nom" ACTEUR_NOMS_ORIGINE: str = "acteur_noms_origine" ACTEUR_NOMS_NORMALISES: str = "acteur_noms_normalises" ACTEUR_COMMENTAIRES: str = "acteur_commentaires" # Annuaire Entreprise AE_DIRIGEANTS_NOMS: str = "ae_dirigeants_noms_prenoms" + REMPLACER_SIRET: str = "remplacer_siret" + REMPLACER_NOM: str = "remplacer_nom" + REMPLACER_COHORTE: str = "remplacer_cohorte" + + # Fields identical between acteurs and remplacements + 
# hence replacer_ prefix not present on the model column names
+    REMPLACER_ADRESSE: str = "adresse"
+    REMPLACER_CODE_POSTAL: str = "code_postal"
+    REMPLACER_VILLE: str = "ville"
+    REMPLACER_NAF: str = "naf"

     # Matching
     MATCH_SCORE_AE_RGPD: str = "match_score"
diff --git a/dags/enrich/dags/enrich_acteurs_closed.py b/dags/enrich/dags/enrich_acteurs_closed.py
index fcf45cbf6..fab5ca318 100644
--- a/dags/enrich/dags/enrich_acteurs_closed.py
+++ b/dags/enrich/dags/enrich_acteurs_closed.py
@@ -4,7 +4,6 @@
 """

 from airflow import DAG
-from airflow.models.baseoperator import chain
 from enrich.config import DBT, TASKS, XCOMS, EnrichActeursClosedConfig
 from enrich.tasks.airflow_logic.enrich_config_create_task import (
     enrich_config_create_task,
@@ -38,6 +37,7 @@
         )
     ),
 ) as dag:
+    """
     chain(
         enrich_config_create_task(dag),
         enrich_read_dbt_model_task(
@@ -53,3 +53,19 @@
             xcom_push_key=XCOMS.DF_CLOSED_REPLACED,
         ),
     )
+    """
+    config = enrich_config_create_task(dag)
+    closed_candidates = enrich_read_dbt_model_task(
+        dag,
+        task_id=TASKS.READ_AE_CLOSED_CANDIDATES,
+        dbt_model_name=DBT.MARTS_ENRICH_AE_CLOSED_CANDIDATES,
+        xcom_push_key=XCOMS.DF_CLOSED_CANDIDATES,
+    )
+    closed_replaced = enrich_read_dbt_model_task(
+        dag,
+        task_id=TASKS.READ_AE_CLOSED_REPLACED,
+        dbt_model_name=DBT.MARTS_ENRICH_AE_CLOSED_REPLACED,
+        xcom_push_key=XCOMS.DF_CLOSED_REPLACED,
+    )
+    config >> closed_candidates
+    config >> closed_replaced
diff --git a/dags/enrich/tasks/airflow_logic/enrich_read_dbt_model_task.py b/dags/enrich/tasks/airflow_logic/enrich_read_dbt_model_task.py
index 22fdfb72b..dbca68c7c 100644
--- a/dags/enrich/tasks/airflow_logic/enrich_read_dbt_model_task.py
+++ b/dags/enrich/tasks/airflow_logic/enrich_read_dbt_model_task.py
@@ -54,4 +54,6 @@ def enrich_read_dbt_model_task(
         python_callable=enrich_read_dbt_model_wrapper,
         op_args=[dbt_model_name, xcom_push_key],
         dag=dag,
+        # pool="dbt_model_read",
+        # pool_slots=1,
     )
diff --git a/dags/enrich/tasks/business_logic/enrich_acteurs_closed_suggestions.py b/dags/enrich/tasks/business_logic/enrich_acteurs_closed_suggestions.py
new file mode 100644
index 000000000..760d1e45d
--- /dev/null
+++ b/dags/enrich/tasks/business_logic/enrich_acteurs_closed_suggestions.py
@@ -0,0 +1,149 @@
+import logging
+from datetime import datetime, timezone
+
+import pandas as pd
+from cluster.tasks.business_logic.cluster_acteurs_parents_choose_new import (
+    parent_id_generate,
+)
+from enrich.config import COHORTS, COLS
+
+logger = logging.getLogger(__name__)
+
+
+def enrich_acteurs_closed_suggestions(
+    df: pd.DataFrame,
+    cohort_type: str,
+    identifiant_action: str,
+    identifiant_execution: str,
+    dry_run: bool = True,
+) -> None:
+    from data.models import (
+        Suggestion,
+        SuggestionAction,
+        SuggestionCohorte,
+        SuggestionStatut,
+    )
+    from data.models.change import SuggestionChange
+    from data.models.changes import ChangeActeurCreateAsParent, ChangeActeurUpdateData
+    from qfdmo.models import ActeurStatus
+
+    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
+
+    if cohort_type not in [
+        COHORTS.ACTEURS_CLOSED_NOT_REPLACED,
+        COHORTS.ACTEURS_CLOSED_REP_DIFF_SIREN,
+        COHORTS.ACTEURS_CLOSED_REP_SAME_SIREN,
+    ]:
+        raise ValueError(f"Mauvaise cohorte: {cohort_type=}")
+
+    suggestions = []
+
+    for _, row in df.iterrows():
+        row = row.to_dict()
+
+        # -----------------------------------------
+        # NOT REPLACED
+        # -----------------------------------------
+        if cohort_type == COHORTS.ACTEURS_CLOSED_NOT_REPLACED:
+            raise NotImplementedError("Pas encore implémenté")
+
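+        # NB: the REPLACED branch below emits 2 ordered changes per row:
+        #   1. ChangeActeurCreateAsParent: new parent from the replacement SIRET
+        #   2. ChangeActeurUpdateData: close the child (statut=INACTIF) and
+        #      attach it to that parent via parent_id/parent_reason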
+        # -----------------------------------------
+        # REPLACED
+        # -----------------------------------------
+        elif cohort_type in [
+            COHORTS.ACTEURS_CLOSED_REP_DIFF_SIREN,
+            COHORTS.ACTEURS_CLOSED_REP_SAME_SIREN,
+        ]:
+            # Garde-fou: une seule cohorte à la fois dans le df
+            cohorts = df[COLS.REMPLACER_COHORTE].unique()
+            if len(cohorts) > 1:
+                raise ValueError(f"Une seule cohorte à la fois: {cohorts=}")
+
+            changes = []
+
+            # Parent
+            parent_id = parent_id_generate([row[COLS.REMPLACER_SIRET]])
+            model_params = {
+                "id": parent_id,
+                "data": {
+                    "nom": row[COLS.REMPLACER_NOM],
+                    "adresse": row[COLS.REMPLACER_ADRESSE],
+                    "code_postal": row[COLS.REMPLACER_CODE_POSTAL],
+                    "ville": row[COLS.REMPLACER_VILLE],
+                    "siren": row[COLS.REMPLACER_SIRET][:9],
+                    "siret": row[COLS.REMPLACER_SIRET],
+                    "naf": row[COLS.REMPLACER_NAF],
+                },
+            }
+            ChangeActeurCreateAsParent(**model_params).validate()
+            change = SuggestionChange(
+                order=1,
+                reason=cohort_type,
+                entity_type="acteur_displayed",
+                model_name=ChangeActeurCreateAsParent.name(),
+                model_params=model_params,
+            ).model_dump()
+            changes.append(change)
+
+            # Child
+            model_params = {
+                "id": row[COLS.ACTEUR_ID],
+                "data": {
+                    "statut": ActeurStatus.INACTIF,
+                    "parent_id": parent_id,
+                    "parent_reason": (
+                        f"SIRET {row[COLS.ACTEUR_SIRET]} détecté le {today} comme"
+                        f" fermé dans AE, remplacé par SIRET {row[COLS.REMPLACER_SIRET]}"
+                    ),
+                    "siret_is_closed": True,
+                },
+            }
+            ChangeActeurUpdateData(**model_params).validate()
+            change = SuggestionChange(
+                order=2,
+                reason=cohort_type,
+                entity_type="acteur_displayed",
+                model_name=ChangeActeurUpdateData.name(),
+                model_params=model_params,
+            ).model_dump()
+            changes.append(change)
+
+        # -----------------------------------------
+        # SUGGESTION: PREPARE
+        # -----------------------------------------
+        suggestions.append(
+            {
+                # TODO: free format thanks to recursive model
+                "contexte": {},
+                "suggestion": {
+                    "title": cohort_type,
+                    "summary": [],
+                    "changes": changes,
+                },
+            }
+        )
+
+    # -----------------------------------------
+    # DRY RUN: STOP HERE
+    # -----------------------------------------
+    if dry_run:
+        logger.info("✋ Dry run: suggestions pas écrites en base")
+        return
+
+    # -----------------------------------------
+    # SUGGESTION: WRITE TO DB
+    # -----------------------------------------
+    cohort = SuggestionCohorte(
+        identifiant_action=identifiant_action,
+        identifiant_execution=f"{cohort_type} {identifiant_execution}",
+        statut=SuggestionStatut.AVALIDER,
+        type_action=SuggestionAction.ACTEURS_CLOSED,
+        metadata={"🔢 Nombre de suggestions": len(suggestions)},
+    )
+    cohort.save()
+    for suggestion in suggestions:
+        Suggestion(
+            suggestion_cohorte=cohort,
+            statut=SuggestionStatut.AVALIDER,
+            contexte=suggestion["contexte"],
+            suggestion=suggestion["suggestion"],
+        ).save()
diff --git a/dags/enrich/tasks/business_logic/enrich_ae_suggestions.py b/dags/enrich/tasks/business_logic/enrich_ae_suggestions.py
deleted file mode 100644
index 1816742f6..000000000
--- a/dags/enrich/tasks/business_logic/enrich_ae_suggestions.py
+++ /dev/null
@@ -1,99 +0,0 @@
-import logging
-
-import pandas as pd
-from enrich.config import COHORTS
-
-logger = logging.getLogger(__name__)
-
-
-def enrich_ae_suggestions(
-    df: pd.DataFrame,
-    cohort_type: str,
-    identifiant_action: str,
-    identifiant_execution: str,
-    dry_run: bool = True,
-) -> None:
-    from data.models import (
-        Suggestion,
-        SuggestionAction,
-        SuggestionCohorte,
-        SuggestionStatut,
-    )
-    from data.models.change import SuggestionChange
-    from data.models.changes import ChangeActeurUpdateData
-    from
qfdmo.models import ActeurStatus - - if cohort_type not in [ - COHORTS.ACTEURS_CLOSED_NOT_REPLACED, - COHORTS.ACTEURS_CLOSED_REP_DIFF_SIREN, - COHORTS.ACTEURS_CLOSED_REP_SAME_SIREN, - ]: - raise ValueError(f"Mauvaise cohorte: {cohort_type=}") - - suggestions = [] - - for row in df.itertuples(index=False): - - # ----------------------------------------- - # CHANGES: PREPARE - # ----------------------------------------- - changes = [] - - model_params = { - "id": row.acteur_id, - "data": { - "statut": ActeurStatus.INACTIF, - }, - } - ChangeActeurUpdateData(**model_params).validate() - change = SuggestionChange( - order=1, - reason=cohort_type, - entity_type="acteur_displayed", - model_name=ChangeActeurUpdateData.name(), - model_params=model_params, - ).model_dump() - changes.append(change) - - # ----------------------------------------- - # SUGGESTION: PREPARE - # ----------------------------------------- - suggestions.append( - { - # TODO: free format thanks to recursive model - "contexte": {}, - "suggestion": { - "title": cohort_type, - "summary": [], - "changes": changes, - }, - } - ) - - # ----------------------------------------- - # DRY RUN: STOP HERE - # ----------------------------------------- - if dry_run: - logger.info("✋ Dry run: suggestions pas écrites en base") - return - - # ----------------------------------------- - # SUGGESTION: WRITE TO DB - # ----------------------------------------- - cohort = SuggestionCohorte( - identifiant_action=identifiant_action, - identifiant_execution=f"{cohort_type} {identifiant_execution}", - statut=SuggestionStatut.AVALIDER, - type_action=SuggestionAction.ACTEURS_CLOSED, - metadata={"🔢 Nombre de suggestions": len(suggestions)}, - ) - cohort.save() - for suggestion in suggestions: - - for suggestion in suggestions: - Suggestion( - suggestion_cohorte=cohort, - statut=SuggestionStatut.AVALIDER, - contexte=suggestion["contexte"], - suggestion=suggestion["suggestion"], - ).save() diff --git a/dags/enrich/test.py b/dags/enrich/test.py deleted file mode 100644 index bb9712b72..000000000 --- a/dags/enrich/test.py +++ /dev/null @@ -1,3 +0,0 @@ -azerty = "foo" - -print(azerty) diff --git a/dags_unit_tests/enrich/tasks/test_enrich_acteurs_closed_suggestions.py b/dags_unit_tests/enrich/tasks/test_enrich_acteurs_closed_suggestions.py new file mode 100644 index 000000000..1eccf7afe --- /dev/null +++ b/dags_unit_tests/enrich/tasks/test_enrich_acteurs_closed_suggestions.py @@ -0,0 +1,212 @@ +from datetime import datetime, timezone + +import pandas as pd +import pytest +from cluster.tasks.business_logic.cluster_acteurs_parents_choose_new import ( + parent_id_generate, +) + +from dags.enrich.config import COHORTS, COLS +from dags.enrich.tasks.business_logic.enrich_acteurs_closed_suggestions import ( + enrich_acteurs_closed_suggestions, +) + +TODAY = datetime.now(timezone.utc).strftime("%Y-%m-%d") + + +@pytest.mark.django_db +class TestEnrichActeursClosedSuggestions: + + @pytest.fixture + def df_not_replaced(self): + return pd.DataFrame( + { + # Acteurs data + COLS.ACTEUR_ID: ["a01", "a02"], + COLS.ACTEUR_SIRET: ["00000000000001", "00000000000002"], + COLS.ACTEUR_NOM: ["AVANT a01", "AVANT a02"], + } + ) + + @pytest.fixture + def df_replaced(self): + return pd.DataFrame( + { + # Acteurs data + COLS.ACTEUR_ID: ["a1", "a2"], + COLS.ACTEUR_SIRET: ["11111111100001", "22222222200001"], + # Replacement data + COLS.REMPLACER_SIRET: ["11111111100002", "33333333300001"], + COLS.REMPLACER_NOM: ["APRES a1", "APRES a2"], + COLS.REMPLACER_COHORTE: ["meme_siret", 
"autre_siret"], + COLS.REMPLACER_ADRESSE: ["Adresse1", "Adresse2"], + COLS.REMPLACER_CODE_POSTAL: ["12345", "67890"], + COLS.REMPLACER_VILLE: ["Ville1", "Ville2"], + COLS.REMPLACER_NAF: ["naf1", "naf2"], + } + ) + + @pytest.fixture + def df_replaced_meme_siret(self, df_replaced): + return df_replaced[df_replaced[COLS.REMPLACER_COHORTE] == "meme_siret"] + + @pytest.fixture + def df_replaced_autre_siret(self, df_replaced): + return df_replaced[df_replaced[COLS.REMPLACER_COHORTE] == "autre_siret"] + + def test_df_replaced(self, df_replaced): + assert sorted(df_replaced[COLS.REMPLACER_COHORTE].unique()) == sorted( + [ + "meme_siret", + "autre_siret", + ] + ) + + @pytest.fixture + def acteurs(self, df_not_replaced, df_replaced): + # Creating acteurs as presence required to apply changes + from qfdmo.models import Acteur, ActeurType, Source + + df_concat = pd.concat([df_not_replaced, df_replaced]) + acteur_ids = df_concat[COLS.ACTEUR_ID].tolist() + s1 = Source.objects.create(nom="Source1") + at1 = ActeurType.objects.create(nom="Acteur1") + for acteur_id in acteur_ids: + Acteur.objects.create( + identifiant_unique=acteur_id, + nom=f"AVANT {acteur_id}", + acteur_type=at1, + source=s1, + ) + + def test_cohorte_not_replaced(self, acteurs, df_not_replaced): + from data.models import Suggestion, SuggestionCohorte + from qfdmo.models import ActeurStatus, RevisionActeur + + # Write suggestions to DB + enrich_acteurs_closed_suggestions( + df=df_not_replaced, + cohort_type=COHORTS.ACTEURS_CLOSED_NOT_REPLACED, + identifiant_action="test_cohorte_not_replaced", + identifiant_execution="test_cohorte_not_replaced", + dry_run=False, + ) + + # Check suggestions have been written to DB + cohort = SuggestionCohorte.objects.get( + identifiant_unique="test_cohorte_not_replaced", + identifiant_execution="test_cohorte_not_replaced", + ) + suggestions = Suggestion.objects.filter(cohorte=cohort) + assert len(suggestions) == 2 + + # Apply suggestions + for suggestion in suggestions: + suggestion.apply() + + # Verify changes + # 2 revisions should be created but not parent + a01 = RevisionActeur.objects.get(pk="a01") + assert a01.statut == ActeurStatus.INACTIF + assert a01.parent is None + assert a01.parent_reason is None + assert a01.siret_is_closed is True + + a02 = RevisionActeur.objects.get(pk="a02") + assert a02.statut == ActeurStatus.INACTIF + assert a02.parent is None + assert a02.parent_reason is None + assert a02.siret_is_closed is True + + def test_cohorte_meme_siren(self, acteurs, df_replaced_meme_siret): + from data.models import Suggestion, SuggestionCohorte + from qfdmo.models import ActeurStatus, RevisionActeur + + # Write suggestions to DB + enrich_acteurs_closed_suggestions( + df=df_replaced_meme_siret, + cohort_type=COHORTS.ACTEURS_CLOSED_REP_SAME_SIREN, + identifiant_action="test_cohorte_meme_siren", + identifiant_execution="test_cohorte_meme_siren", + dry_run=False, + ) + + # Check suggestions have been written to DB + cohort = SuggestionCohorte.objects.get( + identifiant_unique="test_cohorte_meme_siren", + identifiant_execution="test_cohorte_meme_siren", + ) + suggestions = Suggestion.objects.filter(cohorte=cohort) + assert len(suggestions) == 1 + + # Apply suggestions + for suggestion in suggestions: + suggestion.apply() + + # Verify changes + # 1 parent should be created in revision with replacement data + # 1 child should be created in revision with status=INACT and parent_id pointing + parent_id = parent_id_generate(["11111111100002"]) + parent = RevisionActeur.objects.get(pk=parent_id) + assert 
parent.nom == "APRES a1" + assert parent.adresse == "Adresse1" + assert parent.code_postal == "12345" + assert parent.ville == "Ville1" + assert parent.naf_principal == "naf1" + + child = RevisionActeur.objects.get(pk="a1") + assert child.statut == ActeurStatus.INACTIF + assert child.parent == parent + assert ( + child.parent_reason + == f"""SIRET 11111111100001 détecté le {TODAY} comme fermé dans AE, + remplacé par SIRET 11111111100002""" + ) + assert child.siret_is_closed is True + + def test_cohorte_autre_siren(self, acteurs, df_replaced_autre_siret): + from data.models import Suggestion, SuggestionCohorte + from qfdmo.models import ActeurStatus, RevisionActeur + + # Write suggestions to DB + enrich_acteurs_closed_suggestions( + df=df_replaced_autre_siret, + cohort_type=COHORTS.ACTEURS_CLOSED_REP_DIFF_SIREN, + identifiant_action="test_cohorte_autre_siren", + identifiant_execution="test_cohorte_autre_siren", + dry_run=False, + ) + + # Check suggestions have been written to DB + cohort = SuggestionCohorte.objects.get( + identifiant_unique="test_cohorte_autre_siren", + identifiant_execution="test_cohorte_autre_siren", + ) + suggestions = Suggestion.objects.filter(cohorte=cohort) + assert len(suggestions) == 1 + + # Apply suggestions + for suggestion in suggestions: + suggestion.apply() + + # Verify changes + # 1 parent should be created in revision with replacement data + # 1 child should be created in revision with status=INACT and parent_id pointing + parent_id = parent_id_generate(["33333333300001"]) + parent = RevisionActeur.objects.get(pk=parent_id) + assert parent.nom == "APRES a2" + assert parent.adresse == "Adresse2" + assert parent.code_postal == "67890" + assert parent.ville == "Ville2" + assert parent.naf_principal == "naf2" + + child = RevisionActeur.objects.get(pk="a2") + assert child.nom == "AVANT a2" + assert child.statut == ActeurStatus.INACTIF + assert child.parent == parent + assert ( + child.parent_reason + == f"""SIRET 22222222200001 détecté le {TODAY} comme fermé dans AE, + remplacé par SIRET 33333333300001""" + ) + assert child.siret_is_closed is True diff --git a/dbt/macros/constants/value_unavailable.sql b/dbt/macros/constants/value_unavailable.sql new file mode 100644 index 000000000..7039b89f8 --- /dev/null +++ b/dbt/macros/constants/value_unavailable.sql @@ -0,0 +1,3 @@ +/* Constant used to flag that a value is unavailable +and detect, test, exclude it more easily */ +{% macro value_unavailable() %}'🔴 VALEUR INDISPONIBLE'{% endmacro %} \ No newline at end of file diff --git a/dbt/models/intermediate/ae_annuaire_entreprises/int_ae_etablissement.sql b/dbt/models/intermediate/ae_annuaire_entreprises/int_ae_etablissement.sql index 677c6fe58..77281bc05 100644 --- a/dbt/models/intermediate/ae_annuaire_entreprises/int_ae_etablissement.sql +++ b/dbt/models/intermediate/ae_annuaire_entreprises/int_ae_etablissement.sql @@ -22,9 +22,9 @@ SELECT -- Names CASE - WHEN etab.denomination_usuelle = '[ND]' AND unite.denomination IS NOT NULL THEN unite.denomination - WHEN etab.denomination_usuelle IS NULL AND unite.denomination IS NOT NULL THEN unite.denomination - ELSE etab.denomination_usuelle + WHEN TRIM(etab.denomination_usuelle) NOT IN ('', '[ND]', NULL) THEN TRIM(etab.denomination_usuelle) + WHEN TRIM(etab.denomination_usuelle) IN ('', '[ND]', NULL) AND TRIM(unite.denomination) NOT IN ('', '[ND]', NULL) THEN TRIM(unite.denomination) + ELSE {{ value_unavailable() }} END AS nom, /* @@ -58,4 +58,8 @@ FROM {{ ref('base_ae_etablissement') }} AS etab data from parent unite into 
each etablissement to save us from making expensive JOINS in downstream models */ JOIN {{ ref('base_ae_unite_legale') }} AS unite -ON unite.siren = LEFT(etab.siret,9) \ No newline at end of file +ON unite.siren = LEFT(etab.siret,9) +/* Here we keep unavailable names as int_ models aren't +responsible for business logic. Keeping allows investigating +AND nom != {{ value_unavailable() }} +*/ \ No newline at end of file diff --git a/dbt/models/intermediate/ae_annuaire_entreprises/schema.yml b/dbt/models/intermediate/ae_annuaire_entreprises/schema.yml index b2cd1dd98..5e6567c05 100644 --- a/dbt/models/intermediate/ae_annuaire_entreprises/schema.yml +++ b/dbt/models/intermediate/ae_annuaire_entreprises/schema.yml @@ -54,6 +54,8 @@ models: description: "Code NAF Rev2" - name: nom description: "Nom de l'établissement" + data_tests: + - not_null - name: est_actif description: "OUI si A = Actif" data_type: boolean diff --git a/dbt/models/marts/enrich/marts_enrich_ae_closed_replaced.sql b/dbt/models/marts/enrich/marts_enrich_ae_closed_replaced.sql index a3cc01285..3e2844d5c 100644 --- a/dbt/models/marts/enrich/marts_enrich_ae_closed_replaced.sql +++ b/dbt/models/marts/enrich/marts_enrich_ae_closed_replaced.sql @@ -12,9 +12,9 @@ WITH potential_replacements AS ( candidates.siret AS acteur_siret, replacements.siret AS remplacer_siret, CASE - WHEN LEFT(candidates.siret,9) = LEFT(replacements.siret,9) THEN 1 - ELSE 0 - END AS remplacer_meme_siren, + WHEN LEFT(candidates.siret,9) = LEFT(replacements.siret,9) THEN 'meme_siret' + ELSE 'autre_siret' + END AS remplacer_cohorte, candidates.acteur_nom, replacements.nom AS remplacer_nom, columns_words_in_common_count( @@ -48,4 +48,6 @@ WITH potential_replacements AS ( WHERE replacements.est_actif ) SELECT * FROM potential_replacements -WHERE replacement_priority=1 \ No newline at end of file +WHERE replacement_priority=1 +/* We don't want to propose replacements with unavailable names */ +AND remplacer_nom != {{ value_unavailable() }} \ No newline at end of file diff --git a/dbt/models/marts/enrich/schema.yml b/dbt/models/marts/enrich/schema.yml index deaf70901..233186877 100644 --- a/dbt/models/marts/enrich/schema.yml +++ b/dbt/models/marts/enrich/schema.yml @@ -42,4 +42,89 @@ models: | qu'on fait un matching plus poussés avec python par la suite # If we had a match then we must have at least one director's name data_tests: - - not_null \ No newline at end of file + - not_null + + - name: marts_enrich_ae_closed_replaced + description: Etablissements de l'Annuaire Entreprises (AE) qui ont été + | fermés et remplacés par un autre établissement + columns: + - name: acteur_id + description: Identifiant unique de l'acteur + data_tests: + - not_null + - name: acteur_statut + description: Statut de l'acteur dans QFDMO + - name: acteur_siret + description: SIRET de l'acteur fermé + data_tests: + - not_null + - name: remplacer_siret + description: SIRET de l'établissement qui remplace l'acteur fermé + data_tests: + - not_null + - name: remplacer_cohorte + description: "Si le SIRET de remplacement appartient à la même entreprise (meme_siret) ou non (autre_siret)" + data_tests: + - not_null + - accepted_values: + values: ['meme_siret', 'autre_siret'] + - name: acteur_nom + description: Nom de l'acteur fermé + - name: remplacer_nom + description: Nom de l'établissement qui remplace l'acteur fermé + - name: noms_nombre_mots_commun + description: Nombre de mots en commun entre le nom de l'acteur et celui du remplaçant + data_type: integer + - name: acteur_commentaires + 
description: Commentaires de l'acteur pour debug + - name: naf + description: Code NAF de l'établissement remplaçant + - name: ville + description: Ville de l'établissement remplaçant + - name: code_postal + description: Code postal de l'établissement remplaçant + - name: adresse + description: Adresse de l'établissement remplaçant + - name: replacement_priority + description: "Priorité du remplacement (1 = meilleur match)" + data_type: integer + data_tests: + - not_null + - accepted_values: + values: [1] + + - name: marts_enrich_ae_closed_candidates + description: Etablissements fermés de l'Annuaire Entreprises (AE) qui pourraient être remplacés + columns: + - name: siret + description: SIRET de l'établissement fermé + data_tests: + - not_null + - name: unite_est_actif + description: Si l'unité légale est toujours active + data_type: boolean + - name: etab_est_actif + description: Si l'établissement est toujours actif (toujours FALSE ici) + data_type: boolean + data_tests: + - accepted_values: + values: [false] + - name: etab_code_postal + description: Code postal de l'établissement fermé + - name: etab_adresse + description: Adresse de l'établissement fermé + - name: etab_naf + description: Code NAF de l'établissement fermé + - name: acteur_id + description: Identifiant unique de l'acteur dans QFDMO + data_tests: + - not_null + - name: acteur_statut + description: Statut de l'acteur dans QFDMO + - name: acteur_nom + description: Nom de l'acteur dans QFDMO + - name: acteur_nom_normalise + description: Nom de l'acteur normalisé pour faciliter les comparaisons + - name: acteur_commentaires + description: Commentaires de l'acteur pour debug + diff --git a/qfdmo/migrations/0140_acteur_siret_is_closed_and_more.py b/qfdmo/migrations/0140_acteur_siret_is_closed_and_more.py new file mode 100644 index 000000000..13d2b801f --- /dev/null +++ b/qfdmo/migrations/0140_acteur_siret_is_closed_and_more.py @@ -0,0 +1,64 @@ +# Generated by Django 5.1.6 on 2025-04-07 08:33 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("qfdmo", "0139_alter_acteur_not_nullable_char_fields_tel_url_ville"), + ] + + operations = [ + migrations.AddField( + model_name="acteur", + name="siret_is_closed", + field=models.BooleanField( + blank=True, + default=None, + help_text="Indique si le SIRET est fermé ou non dans l'Annuaire Entreprises", + verbose_name="SIRET fermé", + ), + ), + migrations.AddField( + model_name="displayedacteur", + name="siret_is_closed", + field=models.BooleanField( + blank=True, + default=None, + help_text="Indique si le SIRET est fermé ou non dans l'Annuaire Entreprises", + verbose_name="SIRET fermé", + ), + ), + migrations.AddField( + model_name="displayedacteurtemp", + name="siret_is_closed", + field=models.BooleanField( + blank=True, + default=None, + help_text="Indique si le SIRET est fermé ou non dans l'Annuaire Entreprises", + verbose_name="SIRET fermé", + ), + ), + migrations.AddField( + model_name="revisionacteur", + name="parent_reason", + field=models.CharField( + blank=True, + db_default="", + default="", + help_text="Raison du rattachement au parent", + max_length=255, + ), + ), + migrations.AddField( + model_name="revisionacteur", + name="siret_is_closed", + field=models.BooleanField( + blank=True, + default=None, + help_text="Indique si le SIRET est fermé ou non dans l'Annuaire Entreprises", + verbose_name="SIRET fermé", + ), + ), + ] From bc34343936e6692c24a1b920e3fd962d6c83773f Mon Sep 17 00:00:00 2001 From: Max Corbeau Date: 
Mon, 7 Apr 2025 12:33:32 +0200 Subject: [PATCH 12/50] suggestions: presque fonctionnel --- dags/enrich/config/columns.py | 2 + .../enrich_acteurs_closed_suggestions.py | 40 ++++++--- .../business_logic/enrich_ae_rgpd_suggest.py | 2 +- dags/sources/config/shared_constants.py | 2 +- .../test_enrich_acteurs_closed_suggestions.py | 86 +++++++++++-------- ...010_alter_suggestioncohorte_type_action.py | 33 +++++++ data/models/changes/acteur_abstract.py | 6 +- .../changes/acteur_change_nothing_in_base.py | 3 +- .../models/changes/acteur_create_as_parent.py | 7 +- .../models/changes/acteur_delete_as_parent.py | 6 +- data/models/changes/acteur_update_data.py | 9 +- .../models/changes/acteur_update_parent_id.py | 6 +- .../changes/acteur_verify_in_revision.py | 14 +-- .../models/changes/sample_model_do_nothing.py | 3 +- data/models/suggestion.py | 11 ++- .../marts_enrich_ae_closed_candidates.sql | 6 +- .../0140_acteur_siret_is_closed_and_more.py | 4 + qfdmo/models/acteur.py | 1 + 18 files changed, 142 insertions(+), 99 deletions(-) create mode 100644 data/migrations/0010_alter_suggestioncohorte_type_action.py diff --git a/dags/enrich/config/columns.py b/dags/enrich/config/columns.py index 2c0461810..a5a890999 100644 --- a/dags/enrich/config/columns.py +++ b/dags/enrich/config/columns.py @@ -15,6 +15,8 @@ class COLS: # QFDMO ACTEUR_ID: str = "acteur_id" + ACTEUR_TYPE: str = "acteur_type" + ACTEUR_SOURCE: str = "acteur_source" ACTEUR_SIRET: str = "acteur_siret" ACTEUR_NOM: str = "acteur_nom" ACTEUR_NOMS_ORIGINE: str = "acteur_noms_origine" diff --git a/dags/enrich/tasks/business_logic/enrich_acteurs_closed_suggestions.py b/dags/enrich/tasks/business_logic/enrich_acteurs_closed_suggestions.py index 760d1e45d..c630e1fdf 100644 --- a/dags/enrich/tasks/business_logic/enrich_acteurs_closed_suggestions.py +++ b/dags/enrich/tasks/business_logic/enrich_acteurs_closed_suggestions.py @@ -14,7 +14,6 @@ def enrich_acteurs_closed_suggestions( df: pd.DataFrame, cohort_type: str, identifiant_action: str, - identifiant_execution: str, dry_run: bool = True, ) -> None: from data.models import ( @@ -29,6 +28,10 @@ def enrich_acteurs_closed_suggestions( today = datetime.now(timezone.utc).strftime("%Y-%m-%d") + # Validation + if df is None or df.empty: + raise ValueError("df vide: on devrait pas être ici") + if cohort_type not in [ COHORTS.ACTEURS_CLOSED_NOT_REPLACED, COHORTS.ACTEURS_CLOSED_REP_DIFF_SIREN, @@ -36,10 +39,14 @@ def enrich_acteurs_closed_suggestions( ]: raise ValueError(f"Mauvaise cohorte: {cohort_type=}") - suggestions = [] + cohortes = df[COLS.REMPLACER_COHORTE].unique() + if len(cohortes) > 1: + raise ValueError(f"Une seule cohorte à la fois: {cohortes=}") + # Suggestions + suggestions = [] for _, row in df.iterrows(): - row = row._asdict() + row = dict(row) # ----------------------------------------- # NOT REPLACED @@ -50,18 +57,16 @@ def enrich_acteurs_closed_suggestions( # ----------------------------------------- # REPLACED # ----------------------------------------- - elif cohort_type not in [ + elif cohort_type in [ COHORTS.ACTEURS_CLOSED_REP_DIFF_SIREN, COHORTS.ACTEURS_CLOSED_REP_SAME_SIREN, ]: - cohorts = row[COLS.REMPLACER_COHORTE].unique() - if len(cohorts) > 1: - raise ValueError(f"Une seule cohorte à la fois: {cohorts=}") + logger.info(f"{cohort_type}: suggestion pour acteur {row[COLS.ACTEUR_ID]}") changes = [] # Parent - parent_id = parent_id_generate([row[COLS.REMPLACER_SIRET]]) + parent_id = parent_id_generate([str(row[COLS.REMPLACER_SIRET])]) model_params = { "id": parent_id, "data": { @@ 
-71,7 +76,9 @@ def enrich_acteurs_closed_suggestions( "ville": row[COLS.REMPLACER_VILLE], "siren": row[COLS.REMPLACER_SIRET][:9], "siret": row[COLS.REMPLACER_SIRET], - "naf": row[COLS.REMPLACER_NAF], + "naf_principal": row[COLS.REMPLACER_NAF], + "acteur_type": row[COLS.ACTEUR_TYPE], + "source": None, }, } ChangeActeurCreateAsParent(**model_params).validate() @@ -86,13 +93,16 @@ def enrich_acteurs_closed_suggestions( # Child model_params = { - "id": row.acteur_id, + "id": row[COLS.ACTEUR_ID], "data": { "statut": ActeurStatus.INACTIF, "parent_id": parent_id, - "parent_reason": f"""SIRET {row.acteur_siret} détecté le {today} - comme fermé dans AE, remplacé par SIRET {row.remplacer_siret}""", + "parent_reason": f"""SIRET {row[COLS.ACTEUR_SIRET]} + détecté le {today} comme fermé dans AE, + remplacé par SIRET {row[COLS.REMPLACER_SIRET]}""", "siret_is_closed": True, + "acteur_type": row[COLS.ACTEUR_TYPE], + "source": row[COLS.ACTEUR_SOURCE], }, } ChangeActeurUpdateData(**model_params).validate() @@ -120,6 +130,8 @@ def enrich_acteurs_closed_suggestions( } ) + else: + raise ValueError(f"Mauvaise cohorte: {cohort_type=}") # ----------------------------------------- # DRY RUN: STOP HERE # ----------------------------------------- @@ -132,9 +144,9 @@ def enrich_acteurs_closed_suggestions( # ----------------------------------------- cohort = SuggestionCohorte( identifiant_action=identifiant_action, - identifiant_execution=f"{cohort_type} {identifiant_execution}", + identifiant_execution=f"{cohort_type}", statut=SuggestionStatut.AVALIDER, - type_action=SuggestionAction.ACTEURS_CLOSED, + type_action=SuggestionAction.ENRICH_ACTEURS_CLOSED, metadata={"🔢 Nombre de suggestions": len(suggestions)}, ) cohort.save() diff --git a/dags/enrich/tasks/business_logic/enrich_ae_rgpd_suggest.py b/dags/enrich/tasks/business_logic/enrich_ae_rgpd_suggest.py index 5f774778c..a77dc0bc3 100644 --- a/dags/enrich/tasks/business_logic/enrich_ae_rgpd_suggest.py +++ b/dags/enrich/tasks/business_logic/enrich_ae_rgpd_suggest.py @@ -87,7 +87,7 @@ def enrich_ae_rgpd_suggest( cohort = SuggestionCohorte( identifiant_action=identifiant_action, identifiant_execution=identifiant_execution, - type_action=SuggestionAction.RGPD_ANONYMIZE, + type_action=SuggestionAction.ENRICH_ACTEURS_RGPD, metadata={"🔢 Nombre de suggestions": len(suggestions)}, ) cohort.save() diff --git a/dags/sources/config/shared_constants.py b/dags/sources/config/shared_constants.py index 85537beaa..90740f724 100755 --- a/dags/sources/config/shared_constants.py +++ b/dags/sources/config/shared_constants.py @@ -8,7 +8,7 @@ # SuggestionCohorte type_action SUGGESTION_CRAWL_URLS = "CRAWL_URLS" -SUGGESTION_RGPD_ANONYMIZE = "RGPD_ANONYMIZE" +SUGGESTION_RGPD_ANONYMIZE = "ENRICH_ACTEURS_RGPD" SUGGESTION_CLUSTERING = "CLUSTERING" SUGGESTION_SOURCE_AJOUT = "SOURCE_AJOUT" SUGGESTION_SOURCE_MODIFICATION = "SOURCE_MODIFICATION" diff --git a/dags_unit_tests/enrich/tasks/test_enrich_acteurs_closed_suggestions.py b/dags_unit_tests/enrich/tasks/test_enrich_acteurs_closed_suggestions.py index 1eccf7afe..5c631df26 100644 --- a/dags_unit_tests/enrich/tasks/test_enrich_acteurs_closed_suggestions.py +++ b/dags_unit_tests/enrich/tasks/test_enrich_acteurs_closed_suggestions.py @@ -5,6 +5,8 @@ from cluster.tasks.business_logic.cluster_acteurs_parents_choose_new import ( parent_id_generate, ) +from django.contrib.gis.geos import Point +from rich import print from dags.enrich.config import COHORTS, COLS from dags.enrich.tasks.business_logic.enrich_acteurs_closed_suggestions import ( @@ -13,28 
+15,46 @@ TODAY = datetime.now(timezone.utc).strftime("%Y-%m-%d") +COHORT_DEBUG_COLS = ["identifiant_action", "identifiant_execution", "type_action"] + @pytest.mark.django_db class TestEnrichActeursClosedSuggestions: @pytest.fixture - def df_not_replaced(self): + def acteur_source(self): + from data.models import Source + + return Source.objects.create(code="s1") + + @pytest.fixture + def acteur_type(self): + from qfdmo.models import ActeurType + + return ActeurType.objects.create(code="at1") + + @pytest.fixture + def df_not_replaced(self, acteur_type, acteur_source): return pd.DataFrame( { # Acteurs data COLS.ACTEUR_ID: ["a01", "a02"], COLS.ACTEUR_SIRET: ["00000000000001", "00000000000002"], COLS.ACTEUR_NOM: ["AVANT a01", "AVANT a02"], + COLS.ACTEUR_TYPE: [acteur_type.pk, acteur_type.pk], + COLS.ACTEUR_SOURCE: [acteur_source.pk, acteur_source.pk], } ) @pytest.fixture - def df_replaced(self): + def df_replaced(self, acteur_type, acteur_source): return pd.DataFrame( { # Acteurs data COLS.ACTEUR_ID: ["a1", "a2"], COLS.ACTEUR_SIRET: ["11111111100001", "22222222200001"], + COLS.ACTEUR_TYPE: [acteur_type.pk, acteur_type.pk], + COLS.ACTEUR_SOURCE: [acteur_source.pk, acteur_source.pk], # Replacement data COLS.REMPLACER_SIRET: ["11111111100002", "33333333300001"], COLS.REMPLACER_NOM: ["APRES a1", "APRES a2"], @@ -46,14 +66,6 @@ def df_replaced(self): } ) - @pytest.fixture - def df_replaced_meme_siret(self, df_replaced): - return df_replaced[df_replaced[COLS.REMPLACER_COHORTE] == "meme_siret"] - - @pytest.fixture - def df_replaced_autre_siret(self, df_replaced): - return df_replaced[df_replaced[COLS.REMPLACER_COHORTE] == "autre_siret"] - def test_df_replaced(self, df_replaced): assert sorted(df_replaced[COLS.REMPLACER_COHORTE].unique()) == sorted( [ @@ -63,23 +75,30 @@ def test_df_replaced(self, df_replaced): ) @pytest.fixture - def acteurs(self, df_not_replaced, df_replaced): + def df_replaced_meme_siret(self, df_replaced): + return df_replaced[df_replaced[COLS.REMPLACER_COHORTE] == "meme_siret"] + + @pytest.fixture + def df_replaced_autre_siret(self, df_replaced): + return df_replaced[df_replaced[COLS.REMPLACER_COHORTE] == "autre_siret"] + + @pytest.fixture + def acteurs(self, df_not_replaced, df_replaced, acteur_type, acteur_source): # Creating acteurs as presence required to apply changes - from qfdmo.models import Acteur, ActeurType, Source + from qfdmo.models import Acteur df_concat = pd.concat([df_not_replaced, df_replaced]) acteur_ids = df_concat[COLS.ACTEUR_ID].tolist() - s1 = Source.objects.create(nom="Source1") - at1 = ActeurType.objects.create(nom="Acteur1") for acteur_id in acteur_ids: Acteur.objects.create( identifiant_unique=acteur_id, nom=f"AVANT {acteur_id}", - acteur_type=at1, - source=s1, + acteur_type=acteur_type, + source=acteur_source, + location=Point(x=0, y=0), ) - def test_cohorte_not_replaced(self, acteurs, df_not_replaced): + def DISABLED_test_cohorte_not_replaced(self, acteurs, df_not_replaced): from data.models import Suggestion, SuggestionCohorte from qfdmo.models import ActeurStatus, RevisionActeur @@ -87,17 +106,14 @@ def test_cohorte_not_replaced(self, acteurs, df_not_replaced): enrich_acteurs_closed_suggestions( df=df_not_replaced, cohort_type=COHORTS.ACTEURS_CLOSED_NOT_REPLACED, - identifiant_action="test_cohorte_not_replaced", - identifiant_execution="test_cohorte_not_replaced", + identifiant_action="test_not_replaced", dry_run=False, ) # Check suggestions have been written to DB - cohort = SuggestionCohorte.objects.get( - 
identifiant_unique="test_cohorte_not_replaced", - identifiant_execution="test_cohorte_not_replaced", - ) - suggestions = Suggestion.objects.filter(cohorte=cohort) + print(list(SuggestionCohorte.objects.all().values(*COHORT_DEBUG_COLS))) + cohort = SuggestionCohorte.objects.get(identifiant_action="test_not_replaced") + suggestions = Suggestion.objects.filter(suggestion_cohorte=cohort) assert len(suggestions) == 2 # Apply suggestions @@ -126,17 +142,14 @@ def test_cohorte_meme_siren(self, acteurs, df_replaced_meme_siret): enrich_acteurs_closed_suggestions( df=df_replaced_meme_siret, cohort_type=COHORTS.ACTEURS_CLOSED_REP_SAME_SIREN, - identifiant_action="test_cohorte_meme_siren", - identifiant_execution="test_cohorte_meme_siren", + identifiant_action="test_meme_siren", dry_run=False, ) # Check suggestions have been written to DB - cohort = SuggestionCohorte.objects.get( - identifiant_unique="test_cohorte_meme_siren", - identifiant_execution="test_cohorte_meme_siren", - ) - suggestions = Suggestion.objects.filter(cohorte=cohort) + print(list(SuggestionCohorte.objects.all().values(*COHORT_DEBUG_COLS))) + cohort = SuggestionCohorte.objects.get(identifiant_action="test_meme_siren") + suggestions = Suggestion.objects.filter(suggestion_cohorte=cohort) assert len(suggestions) == 1 # Apply suggestions @@ -172,17 +185,14 @@ def test_cohorte_autre_siren(self, acteurs, df_replaced_autre_siret): enrich_acteurs_closed_suggestions( df=df_replaced_autre_siret, cohort_type=COHORTS.ACTEURS_CLOSED_REP_DIFF_SIREN, - identifiant_action="test_cohorte_autre_siren", - identifiant_execution="test_cohorte_autre_siren", + identifiant_action="test_autre_siren", dry_run=False, ) # Check suggestions have been written to DB - cohort = SuggestionCohorte.objects.get( - identifiant_unique="test_cohorte_autre_siren", - identifiant_execution="test_cohorte_autre_siren", - ) - suggestions = Suggestion.objects.filter(cohorte=cohort) + print(list(SuggestionCohorte.objects.all().values(*COHORT_DEBUG_COLS))) + cohort = SuggestionCohorte.objects.get(identifiant_action="test_autre_siren") + suggestions = Suggestion.objects.filter(suggestion_cohorte=cohort) assert len(suggestions) == 1 # Apply suggestions diff --git a/data/migrations/0010_alter_suggestioncohorte_type_action.py b/data/migrations/0010_alter_suggestioncohorte_type_action.py new file mode 100644 index 000000000..ac430a116 --- /dev/null +++ b/data/migrations/0010_alter_suggestioncohorte_type_action.py @@ -0,0 +1,33 @@ +# Generated by Django 5.1.6 on 2025-04-07 10:01 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("data", "0009_alter_suggestioncohorte_type_action"), + ] + + operations = [ + migrations.AlterField( + model_name="suggestioncohorte", + name="type_action", + field=models.CharField( + blank=True, + choices=[ + ("CRAWL_URLS", "🔗 URLs scannées"), + ("RGPD_ANONYMISATION", "🕵️ Anonymisation RGPD"), + ("ENRICH_ACTEURS_CLOSED", "🚪 Acteurs fermés"), + ("CLUSTERING", "regroupement/déduplication des acteurs"), + ("SOURCE_AJOUT", "ingestion de source de données - nouveau acteur"), + ( + "SOURCE_MODIFICATION", + "ingestion de source de données - modification d'acteur existant", + ), + ("SOURCE_SUPRESSION", "ingestion de source de données"), + ], + max_length=50, + ), + ), + ] diff --git a/data/models/changes/acteur_abstract.py b/data/models/changes/acteur_abstract.py index a72e66284..83a6fb71b 100644 --- a/data/models/changes/acteur_abstract.py +++ b/data/models/changes/acteur_abstract.py @@ -1,8 +1,4 @@ -""" 
-change_model to use as template for acteur changes - - -""" +"""change model to use as template for acteur changes""" from pydantic import BaseModel diff --git a/data/models/changes/acteur_change_nothing_in_base.py b/data/models/changes/acteur_change_nothing_in_base.py index f6c01fe36..816e77597 100644 --- a/data/models/changes/acteur_change_nothing_in_base.py +++ b/data/models/changes/acteur_change_nothing_in_base.py @@ -1,5 +1,4 @@ -""" -change_model to make no change to an acteur +"""change model to make no change to an acteur Reason for having such a model is that we can follow the same pattern to be consistent across the board. diff --git a/data/models/changes/acteur_create_as_parent.py b/data/models/changes/acteur_create_as_parent.py index f30c00daf..73baf552a 100644 --- a/data/models/changes/acteur_create_as_parent.py +++ b/data/models/changes/acteur_create_as_parent.py @@ -1,8 +1,6 @@ -""" -change_model to create a parent acteur +"""change model to create a parent acteur""" - -""" +from rich import print from data.models.changes.acteur_abstract import ChangeActeurAbstract from data.models.changes.utils import data_reconstruct @@ -19,6 +17,7 @@ def name(cls) -> str: def validate(self): """The parent shouldn't already exist""" + print(f"ChangeActeurCreateAsParent.validate: {self.id=} {self.data=}") rev = RevisionActeur.objects.filter(identifiant_unique=self.id) if rev.exists(): raise ValueError(f"Parent to create '{self.id}' already exists") diff --git a/data/models/changes/acteur_delete_as_parent.py b/data/models/changes/acteur_delete_as_parent.py index 5f51eaf26..886919054 100644 --- a/data/models/changes/acteur_delete_as_parent.py +++ b/data/models/changes/acteur_delete_as_parent.py @@ -1,8 +1,4 @@ -""" -change_model to delete a parent acteur - - -""" +"""change model to delete a parent""" from data.models.changes.acteur_abstract import ChangeActeurAbstract from qfdmo.models import RevisionActeur diff --git a/data/models/changes/acteur_update_data.py b/data/models/changes/acteur_update_data.py index 2c3104db7..dd097b87b 100644 --- a/data/models/changes/acteur_update_data.py +++ b/data/models/changes/acteur_update_data.py @@ -1,6 +1,8 @@ -"""Generic change model which should allow updating anything -about an acteur, taking care of handling Acteur vs. RevisionActeur -and data reconstruction.""" +"""Generic change model to update an acteur's data. If your use-case +is very specific (e.g. 
RGPD), use a dedicated model for more clarity/robustness,
+else you can use this model."""
+
+from rich import print
 
 from data.models.changes.acteur_abstract import ChangeActeurAbstract
 from data.models.changes.utils import data_reconstruct
@@ -13,6 +15,7 @@ def name(cls) -> str:
         return "acteur_update_data"
 
     def validate(self) -> Acteur | RevisionActeur:
+        print(f"ChangeActeurUpdateData.validate: {self.id=} {self.data=}")
         if not self.data:
             raise ValueError("No data provided")
         # The parent should already exist in revision or base
diff --git a/data/models/changes/acteur_update_parent_id.py b/data/models/changes/acteur_update_parent_id.py
index c0426a8d1..f19e95a2d 100644
--- a/data/models/changes/acteur_update_parent_id.py
+++ b/data/models/changes/acteur_update_parent_id.py
@@ -1,8 +1,4 @@
-"""
-change_model to update an acteur's parent
-
-
-"""
+"""change model to update an acteur's parent"""
 
 from data.models.changes.acteur_abstract import ChangeActeurAbstract
 from qfdmo.models import Acteur, RevisionActeur
diff --git a/data/models/changes/acteur_verify_in_revision.py b/data/models/changes/acteur_verify_in_revision.py
index e5a6f3753..3d6e50cb5 100644
--- a/data/models/changes/acteur_verify_in_revision.py
+++ b/data/models/changes/acteur_verify_in_revision.py
@@ -1,16 +1,4 @@
-"""
-change_model to make no change to an acteur
-
-Reason for having such a model is that we can
-follow the same pattern to be consistent across the board.
-
-For instance in the clustering pipeline, we might decide
-that some acteurs do not need to be changed as they already point
-to the chosen parent, yes we want to reflect all decisions made
-in the cluster summary, this model allows us to do just that
-without havint to create messy conditional code in pipelines
-
-"""
+"""change model to verify an acteur's presence in revision"""
 
 from data.models.changes.acteur_abstract import ChangeActeurAbstract
 from qfdmo.models import RevisionActeur
diff --git a/data/models/changes/sample_model_do_nothing.py b/data/models/changes/sample_model_do_nothing.py
index a81a692da..c7d64f094 100644
--- a/data/models/changes/sample_model_do_nothing.py
+++ b/data/models/changes/sample_model_do_nothing.py
@@ -1,5 +1,4 @@
-"""A sample model which does nothing BUT
-helps us create test cases for the
+"""A sample model which does nothing BUT helps us create test cases for the
 overall SuggestionChange model"""
 
 from pydantic import BaseModel
diff --git a/data/models/suggestion.py b/data/models/suggestion.py
index d8ce1daac..606ee8e4d 100644
--- a/data/models/suggestion.py
+++ b/data/models/suggestion.py
@@ -54,8 +54,8 @@ class SuggestionCohorteStatut(models.TextChoices):
 
 class SuggestionAction(models.TextChoices):
     CRAWL_URLS = SUGGESTION_CRAWL_URLS, "🔗 URLs scannées"
-    RGPD_ANONYMIZE = "RGPD_ANONYMISATION", "🕵️ Anonymisation RGPD"
-    ACTEURS_CLOSED = "ACTEURS_CLOSED", "🚪 Acteurs fermés"
+    ENRICH_ACTEURS_RGPD = "RGPD_ANONYMISATION", "🕵️ Anonymisation RGPD"
+    ENRICH_ACTEURS_CLOSED = "ENRICH_ACTEURS_CLOSED", "🚪 Acteurs fermés"
     CLUSTERING = SUGGESTION_CLUSTERING, "regroupement/déduplication des acteurs"
     SOURCE_AJOUT = (
         SUGGESTION_SOURCE_AJOUT,
@@ -196,7 +196,9 @@ def display_suggestion_details(self):
         template_name = "data/_partials/clustering_suggestion_details.html"
     elif self.suggestion_cohorte.type_action == SuggestionAction.CRAWL_URLS:
         template_name = "data/_partials/crawl_urls_suggestion_details.html"
-    elif self.suggestion_cohorte.type_action == SuggestionAction.RGPD_ANONYMIZE:
+    elif (
+        self.suggestion_cohorte.type_action == 
SuggestionAction.ENRICH_ACTEURS_RGPD + ): template_name = "data/_partials/generic_suggestion_details.html" template_context = self.suggestion @@ -318,7 +320,8 @@ def apply(self): if self.suggestion_cohorte.type_action in [ SuggestionAction.CLUSTERING, SuggestionAction.CRAWL_URLS, - SuggestionAction.RGPD_ANONYMIZE, + SuggestionAction.ENRICH_ACTEURS_RGPD, + SuggestionAction.ENRICH_ACTEURS_CLOSED, ]: changes = self.suggestion["changes"] changes.sort(key=lambda x: x["order"]) diff --git a/dbt/models/marts/enrich/marts_enrich_ae_closed_candidates.sql b/dbt/models/marts/enrich/marts_enrich_ae_closed_candidates.sql index b143ce1ce..4a33a195f 100644 --- a/dbt/models/marts/enrich/marts_enrich_ae_closed_candidates.sql +++ b/dbt/models/marts/enrich/marts_enrich_ae_closed_candidates.sql @@ -27,7 +27,8 @@ WITH acteurs_with_siret AS ( udf_normalize_string_alpha_for_match(nom) AS acteur_nom_normalise, identifiant_unique AS acteur_id, commentaires AS acteur_commentaires, - statut AS acteur_statut + statut AS acteur_statut, + acteur_type AS acteur_type FROM {{ ref('marts_carte_acteur') }} WHERE siret IS NOT NULL AND siret != '' AND LENGTH(siret) = 14 ), @@ -46,7 +47,8 @@ SELECT acteurs.acteur_statut, acteurs.acteur_nom, acteurs.acteur_nom_normalise, - acteurs.acteur_commentaires + acteurs.acteur_commentaires, + acteurs.acteur_type FROM acteurs_with_siret AS acteurs JOIN {{ ref('int_ae_etablissement') }} AS etab ON acteurs.siret = etab.siret WHERE etab.est_actif = FALSE diff --git a/qfdmo/migrations/0140_acteur_siret_is_closed_and_more.py b/qfdmo/migrations/0140_acteur_siret_is_closed_and_more.py index 13d2b801f..80e1a8079 100644 --- a/qfdmo/migrations/0140_acteur_siret_is_closed_and_more.py +++ b/qfdmo/migrations/0140_acteur_siret_is_closed_and_more.py @@ -15,6 +15,7 @@ class Migration(migrations.Migration): name="siret_is_closed", field=models.BooleanField( blank=True, + null=True, default=None, help_text="Indique si le SIRET est fermé ou non dans l'Annuaire Entreprises", verbose_name="SIRET fermé", @@ -25,6 +26,7 @@ class Migration(migrations.Migration): name="siret_is_closed", field=models.BooleanField( blank=True, + null=True, default=None, help_text="Indique si le SIRET est fermé ou non dans l'Annuaire Entreprises", verbose_name="SIRET fermé", @@ -35,6 +37,7 @@ class Migration(migrations.Migration): name="siret_is_closed", field=models.BooleanField( blank=True, + null=True, default=None, help_text="Indique si le SIRET est fermé ou non dans l'Annuaire Entreprises", verbose_name="SIRET fermé", @@ -56,6 +59,7 @@ class Migration(migrations.Migration): name="siret_is_closed", field=models.BooleanField( blank=True, + null=True, default=None, help_text="Indique si le SIRET est fermé ou non dans l'Annuaire Entreprises", verbose_name="SIRET fermé", diff --git a/qfdmo/models/acteur.py b/qfdmo/models/acteur.py index 905313221..12710f726 100644 --- a/qfdmo/models/acteur.py +++ b/qfdmo/models/acteur.py @@ -359,6 +359,7 @@ class Meta: # AE's DB at runtime (which has 40M rows), also helping with Django admin info siret_is_closed = models.BooleanField( default=None, # by default we can't assume a SIRET is opened + null=True, blank=True, verbose_name="SIRET fermé", help_text="Indique si le SIRET est fermé ou non dans l'Annuaire Entreprises", From 3622d3654cd78aa2d840df7b09a591b8caa43f7d Mon Sep 17 00:00:00 2001 From: Max Corbeau Date: Mon, 7 Apr 2025 14:59:37 +0200 Subject: [PATCH 13/50] suggestions: tests qui fonctionnent --- dags/enrich/config/cohorts.py | 4 +- .../enrich_acteurs_closed_suggestions.py | 83 
+++++++++------- .../test_enrich_acteurs_closed_suggestions.py | 98 +++++++++++-------- data/models/changes/acteur_update_data.py | 7 +- 4 files changed, 116 insertions(+), 76 deletions(-) diff --git a/dags/enrich/config/cohorts.py b/dags/enrich/config/cohorts.py index 5e9be42f1..76cdbe143 100644 --- a/dags/enrich/config/cohorts.py +++ b/dags/enrich/config/cohorts.py @@ -8,5 +8,5 @@ @dataclass(frozen=True) class COHORTS: ACTEURS_CLOSED_NOT_REPLACED: str = f"{INTRO} 🔴 non remplacés" - ACTEURS_CLOSED_REP_DIFF_SIREN: str = f"{INTRO} 🟡 remplacés via SIREN diff" - ACTEURS_CLOSED_REP_SAME_SIREN: str = f"{INTRO} 🟢 remplacés via SIREN idem" + ACTEURS_CLOSED_REP_DIFF_SIREN: str = f"{INTRO} 🟡 remplacés par SIRET autre SIREN" + ACTEURS_CLOSED_REP_SAME_SIREN: str = f"{INTRO} 🟢 remplacés par SIRET même SIREN" diff --git a/dags/enrich/tasks/business_logic/enrich_acteurs_closed_suggestions.py b/dags/enrich/tasks/business_logic/enrich_acteurs_closed_suggestions.py index c630e1fdf..adb960dc7 100644 --- a/dags/enrich/tasks/business_logic/enrich_acteurs_closed_suggestions.py +++ b/dags/enrich/tasks/business_logic/enrich_acteurs_closed_suggestions.py @@ -39,10 +39,6 @@ def enrich_acteurs_closed_suggestions( ]: raise ValueError(f"Mauvaise cohorte: {cohort_type=}") - cohortes = df[COLS.REMPLACER_COHORTE].unique() - if len(cohortes) > 1: - raise ValueError(f"Une seule cohorte à la fois: {cohortes=}") - # Suggestions suggestions = [] for _, row in df.iterrows(): @@ -52,7 +48,25 @@ def enrich_acteurs_closed_suggestions( # NOT REPLACED # ----------------------------------------- if cohort_type == COHORTS.ACTEURS_CLOSED_NOT_REPLACED: - raise NotImplementedError("Pas encore implémenté") + changes = [] + model_params = { + "id": row[COLS.ACTEUR_ID], + "data": { + "statut": ActeurStatus.INACTIF, + "siret_is_closed": True, + "acteur_type": row[COLS.ACTEUR_TYPE], + "source": row[COLS.ACTEUR_SOURCE], + }, + } + ChangeActeurUpdateData(**model_params).validate() + change = SuggestionChange( + order=1, + reason=cohort_type, + entity_type="acteur_displayed", + model_name=ChangeActeurUpdateData.name(), + model_params=model_params, + ).model_dump() + changes.append(change) # ----------------------------------------- # REPLACED @@ -61,7 +75,10 @@ def enrich_acteurs_closed_suggestions( COHORTS.ACTEURS_CLOSED_REP_DIFF_SIREN, COHORTS.ACTEURS_CLOSED_REP_SAME_SIREN, ]: - logger.info(f"{cohort_type}: suggestion pour acteur {row[COLS.ACTEUR_ID]}") + cohortes = df[COLS.REMPLACER_COHORTE].unique() + if len(cohortes) > 1: + raise ValueError(f"Une seule cohorte à la fois: {cohortes=}") + logger.info(f"{cohort_type}: suggestion acteur id={row[COLS.ACTEUR_ID]}") changes = [] @@ -70,6 +87,7 @@ def enrich_acteurs_closed_suggestions( model_params = { "id": parent_id, "data": { + "identifiant_unique": parent_id, "nom": row[COLS.REMPLACER_NOM], "adresse": row[COLS.REMPLACER_ADRESSE], "code_postal": row[COLS.REMPLACER_CODE_POSTAL], @@ -96,10 +114,12 @@ def enrich_acteurs_closed_suggestions( "id": row[COLS.ACTEUR_ID], "data": { "statut": ActeurStatus.INACTIF, - "parent_id": parent_id, - "parent_reason": f"""SIRET {row[COLS.ACTEUR_SIRET]} - détecté le {today} comme fermé dans AE, - remplacé par SIRET {row[COLS.REMPLACER_SIRET]}""", + "parent": parent_id, + "parent_reason": ( + f"SIRET {row[COLS.ACTEUR_SIRET]} " + f"détecté le {today} comme fermé dans AE, " + f"remplacé par SIRET {row[COLS.REMPLACER_SIRET]}" + ), "siret_is_closed": True, "acteur_type": row[COLS.ACTEUR_TYPE], "source": row[COLS.ACTEUR_SOURCE], @@ -115,23 +135,22 @@ def 
enrich_acteurs_closed_suggestions( ).model_dump() changes.append(change) - # ----------------------------------------- - # SUGGESTION: PREPARE - # ----------------------------------------- - suggestions.append( - { - # TODO: free format thanks to recursive model - "contexte": {}, - "suggestion": { - "title": cohort_type, - "summary": [], - "changes": changes, - }, - } - ) - else: raise ValueError(f"Mauvaise cohorte: {cohort_type=}") + + # Generic to all cohorts + suggestions.append( + { + # TODO: free format thanks to recursive model + "contexte": {}, + "suggestion": { + "title": cohort_type, + "summary": [], + "changes": changes, + }, + } + ) + # ----------------------------------------- # DRY RUN: STOP HERE # ----------------------------------------- @@ -151,11 +170,9 @@ def enrich_acteurs_closed_suggestions( ) cohort.save() for suggestion in suggestions: - - for suggestion in suggestions: - Suggestion( - suggestion_cohorte=cohort, - statut=SuggestionStatut.AVALIDER, - contexte=suggestion["contexte"], - suggestion=suggestion["suggestion"], - ).save() + Suggestion( + suggestion_cohorte=cohort, + statut=SuggestionStatut.AVALIDER, + contexte=suggestion["contexte"], + suggestion=suggestion["suggestion"], + ).save() diff --git a/dags_unit_tests/enrich/tasks/test_enrich_acteurs_closed_suggestions.py b/dags_unit_tests/enrich/tasks/test_enrich_acteurs_closed_suggestions.py index 5c631df26..1ab91fe63 100644 --- a/dags_unit_tests/enrich/tasks/test_enrich_acteurs_closed_suggestions.py +++ b/dags_unit_tests/enrich/tasks/test_enrich_acteurs_closed_suggestions.py @@ -6,7 +6,6 @@ parent_id_generate, ) from django.contrib.gis.geos import Point -from rich import print from dags.enrich.config import COHORTS, COLS from dags.enrich.tasks.business_logic.enrich_acteurs_closed_suggestions import ( @@ -15,54 +14,60 @@ TODAY = datetime.now(timezone.utc).strftime("%Y-%m-%d") -COHORT_DEBUG_COLS = ["identifiant_action", "identifiant_execution", "type_action"] - @pytest.mark.django_db class TestEnrichActeursClosedSuggestions: @pytest.fixture - def acteur_source(self): + def source(self): from data.models import Source return Source.objects.create(code="s1") @pytest.fixture - def acteur_type(self): + def atype(self): from qfdmo.models import ActeurType return ActeurType.objects.create(code="at1") @pytest.fixture - def df_not_replaced(self, acteur_type, acteur_source): + def df_not_replaced(self, atype, source): return pd.DataFrame( { # Acteurs data COLS.ACTEUR_ID: ["a01", "a02"], COLS.ACTEUR_SIRET: ["00000000000001", "00000000000002"], COLS.ACTEUR_NOM: ["AVANT a01", "AVANT a02"], - COLS.ACTEUR_TYPE: [acteur_type.pk, acteur_type.pk], - COLS.ACTEUR_SOURCE: [acteur_source.pk, acteur_source.pk], + COLS.ACTEUR_TYPE: [atype.pk, atype.pk], + COLS.ACTEUR_SOURCE: [source.pk, source.pk], } ) @pytest.fixture - def df_replaced(self, acteur_type, acteur_source): + def df_replaced(self, atype, source): return pd.DataFrame( { # Acteurs data - COLS.ACTEUR_ID: ["a1", "a2"], - COLS.ACTEUR_SIRET: ["11111111100001", "22222222200001"], - COLS.ACTEUR_TYPE: [acteur_type.pk, acteur_type.pk], - COLS.ACTEUR_SOURCE: [acteur_source.pk, acteur_source.pk], + COLS.ACTEUR_ID: ["a1", "a2", "a3"], + COLS.ACTEUR_SIRET: [ + "11111111100001", + "22222222200001", + "44444444400001", + ], + COLS.ACTEUR_TYPE: [atype.pk, atype.pk, atype.pk], + COLS.ACTEUR_SOURCE: [source.pk, source.pk, source.pk], # Replacement data - COLS.REMPLACER_SIRET: ["11111111100002", "33333333300001"], - COLS.REMPLACER_NOM: ["APRES a1", "APRES a2"], - COLS.REMPLACER_COHORTE: 
["meme_siret", "autre_siret"], - COLS.REMPLACER_ADRESSE: ["Adresse1", "Adresse2"], - COLS.REMPLACER_CODE_POSTAL: ["12345", "67890"], - COLS.REMPLACER_VILLE: ["Ville1", "Ville2"], - COLS.REMPLACER_NAF: ["naf1", "naf2"], + COLS.REMPLACER_SIRET: [ + "11111111100002", + "33333333300001", + "55555555500001", + ], + COLS.REMPLACER_NOM: ["APRES a1", "APRES a2", "APRES a3"], + COLS.REMPLACER_COHORTE: ["meme_siret", "autre_siret", "autre_siret"], + COLS.REMPLACER_ADRESSE: ["Adresse1", "Adresse2", "Adresse3"], + COLS.REMPLACER_CODE_POSTAL: ["12345", "67890", "12345"], + COLS.REMPLACER_VILLE: ["Ville1", "Ville2", "Ville3"], + COLS.REMPLACER_NAF: ["naf1", "naf2", "naf3"], } ) @@ -83,7 +88,7 @@ def df_replaced_autre_siret(self, df_replaced): return df_replaced[df_replaced[COLS.REMPLACER_COHORTE] == "autre_siret"] @pytest.fixture - def acteurs(self, df_not_replaced, df_replaced, acteur_type, acteur_source): + def acteurs(self, df_not_replaced, df_replaced, atype, source): # Creating acteurs as presence required to apply changes from qfdmo.models import Acteur @@ -93,12 +98,12 @@ def acteurs(self, df_not_replaced, df_replaced, acteur_type, acteur_source): Acteur.objects.create( identifiant_unique=acteur_id, nom=f"AVANT {acteur_id}", - acteur_type=acteur_type, - source=acteur_source, + acteur_type=atype, + source=source, location=Point(x=0, y=0), ) - def DISABLED_test_cohorte_not_replaced(self, acteurs, df_not_replaced): + def test_cohorte_not_replaced(self, acteurs, df_not_replaced): from data.models import Suggestion, SuggestionCohorte from qfdmo.models import ActeurStatus, RevisionActeur @@ -111,7 +116,6 @@ def DISABLED_test_cohorte_not_replaced(self, acteurs, df_not_replaced): ) # Check suggestions have been written to DB - print(list(SuggestionCohorte.objects.all().values(*COHORT_DEBUG_COLS))) cohort = SuggestionCohorte.objects.get(identifiant_action="test_not_replaced") suggestions = Suggestion.objects.filter(suggestion_cohorte=cohort) assert len(suggestions) == 2 @@ -125,16 +129,16 @@ def DISABLED_test_cohorte_not_replaced(self, acteurs, df_not_replaced): a01 = RevisionActeur.objects.get(pk="a01") assert a01.statut == ActeurStatus.INACTIF assert a01.parent is None - assert a01.parent_reason is None + assert a01.parent_reason == "" # consequence of empty strings in DB assert a01.siret_is_closed is True a02 = RevisionActeur.objects.get(pk="a02") assert a02.statut == ActeurStatus.INACTIF assert a02.parent is None - assert a02.parent_reason is None + assert a02.parent_reason == "" assert a02.siret_is_closed is True - def test_cohorte_meme_siren(self, acteurs, df_replaced_meme_siret): + def test_cohorte_meme_siren(self, acteurs, atype, source, df_replaced_meme_siret): from data.models import Suggestion, SuggestionCohorte from qfdmo.models import ActeurStatus, RevisionActeur @@ -147,7 +151,6 @@ def test_cohorte_meme_siren(self, acteurs, df_replaced_meme_siret): ) # Check suggestions have been written to DB - print(list(SuggestionCohorte.objects.all().values(*COHORT_DEBUG_COLS))) cohort = SuggestionCohorte.objects.get(identifiant_action="test_meme_siren") suggestions = Suggestion.objects.filter(suggestion_cohorte=cohort) assert len(suggestions) == 1 @@ -161,19 +164,21 @@ def test_cohorte_meme_siren(self, acteurs, df_replaced_meme_siret): # 1 child should be created in revision with status=INACT and parent_id pointing parent_id = parent_id_generate(["11111111100002"]) parent = RevisionActeur.objects.get(pk=parent_id) + assert parent.pk == parent_id assert parent.nom == "APRES a1" assert parent.adresse == 
"Adresse1" assert parent.code_postal == "12345" assert parent.ville == "Ville1" assert parent.naf_principal == "naf1" + assert parent.acteur_type == atype + assert parent.source is None child = RevisionActeur.objects.get(pk="a1") assert child.statut == ActeurStatus.INACTIF assert child.parent == parent - assert ( - child.parent_reason - == f"""SIRET 11111111100001 détecté le {TODAY} comme fermé dans AE, - remplacé par SIRET 11111111100002""" + assert child.parent_reason == ( + f"SIRET 11111111100001 détecté le {TODAY} comme fermé dans AE, " + f"remplacé par SIRET 11111111100002" ) assert child.siret_is_closed is True @@ -190,10 +195,9 @@ def test_cohorte_autre_siren(self, acteurs, df_replaced_autre_siret): ) # Check suggestions have been written to DB - print(list(SuggestionCohorte.objects.all().values(*COHORT_DEBUG_COLS))) cohort = SuggestionCohorte.objects.get(identifiant_action="test_autre_siren") suggestions = Suggestion.objects.filter(suggestion_cohorte=cohort) - assert len(suggestions) == 1 + assert len(suggestions) == 2 # (1 parent + 1 child) x 2 acteurs fermés # Apply suggestions for suggestion in suggestions: @@ -211,12 +215,26 @@ def test_cohorte_autre_siren(self, acteurs, df_replaced_autre_siret): assert parent.naf_principal == "naf2" child = RevisionActeur.objects.get(pk="a2") - assert child.nom == "AVANT a2" assert child.statut == ActeurStatus.INACTIF assert child.parent == parent - assert ( - child.parent_reason - == f"""SIRET 22222222200001 détecté le {TODAY} comme fermé dans AE, - remplacé par SIRET 33333333300001""" + assert child.parent_reason == ( + f"SIRET 22222222200001 détecté le {TODAY} comme fermé dans AE, " + f"remplacé par SIRET 33333333300001" ) assert child.siret_is_closed is True + + parent_id = parent_id_generate(["55555555500001"]) + parent = RevisionActeur.objects.get(pk=parent_id) + assert parent.nom == "APRES a3" + assert parent.adresse == "Adresse3" + assert parent.code_postal == "12345" + assert parent.ville == "Ville3" + assert parent.naf_principal == "naf3" + + child = RevisionActeur.objects.get(pk="a3") + assert child.statut == ActeurStatus.INACTIF + assert child.parent == parent + assert child.parent_reason == ( + f"SIRET 44444444400001 détecté le {TODAY} comme fermé dans AE, " + f"remplacé par SIRET 55555555500001" + ) diff --git a/data/models/changes/acteur_update_data.py b/data/models/changes/acteur_update_data.py index dd097b87b..1bc5a9eac 100644 --- a/data/models/changes/acteur_update_data.py +++ b/data/models/changes/acteur_update_data.py @@ -2,8 +2,14 @@ is very specific (e.g. 
RGPD), use a dedicated model for more clarity/robustness,
 else you can use this model."""
 
-from rich import print
-
 from data.models.changes.acteur_abstract import ChangeActeurAbstract
 from data.models.changes.utils import data_reconstruct
 from qfdmo.models import Acteur, RevisionActeur
@@ -15,7 +21,6 @@ def name(cls) -> str:
         return "acteur_update_data"
 
     def validate(self) -> Acteur | RevisionActeur:
-        print(f"ChangeActeurUpdateData.validate: {self.id=} {self.data=}")
         if not self.data:
             raise ValueError("No data provided")
         # The parent should already exist in revision or base

From 126498dde8b77435675e29f5b31d7518f3f5c140 Mon Sep 17 00:00:00 2001
From: Max Corbeau
Date: Wed, 9 Apr 2025 06:33:50 +0200
Subject: [PATCH 14/50] refacto du DAG avec dbt & suggestions

---
 dags/enrich/config/cohorts.py | 6 +-
 dags/enrich/config/dbt.py | 12 ++-
 dags/enrich/config/models.py | 9 +++
 dags/enrich/config/tasks.py | 17 ++++-
 dags/enrich/config/xcoms.py | 11 ++-
 dags/enrich/dags/enrich_acteurs_closed.py | 73 ++++++++++++-----
 .../enrich_acteurs_closed_suggestions.py | 57 +++++++++++++
 .../enrich_config_create_task.py | 1 +
 .../enrich_dbt_model_read_task.py | 60 +++++++++++++++
 .../enrich_dbt_models_refresh_task.py | 58 +++++++++++++++
 .../enrich_read_ae_closed_replaced_task.py | 0
 .../enrich_read_dbt_model_task.py | 15 ++--
 .../enrich_acteurs_closed_suggestions.py | 12 +--
 ..._dbt_model.py => enrich_dbt_model_read.py} | 2 +-
 .../test_enrich_acteurs_closed_suggestions.py | 6 +-
 ...arts_enrich_acteurs_closed_candidates.sql} | 23 ++----
 ...rts_enrich_acteurs_closed_not_replaced.sql | 21 ++++++
 ...
marts_enrich_acteurs_closed_replaced.sql} | 11 +-- ...ch_acteurs_closed_replaced_other_siren.sql | 9 +++ ...ich_acteurs_closed_replaced_same_siren.sql | 9 +++ dbt/models/marts/enrich/schema.yml | 12 +-- 21 files changed, 352 insertions(+), 72 deletions(-) create mode 100644 dags/enrich/tasks/airflow_logic/enrich_acteurs_closed_suggestions.py create mode 100644 dags/enrich/tasks/airflow_logic/enrich_dbt_model_read_task.py create mode 100644 dags/enrich/tasks/airflow_logic/enrich_dbt_models_refresh_task.py delete mode 100644 dags/enrich/tasks/airflow_logic/enrich_read_ae_closed_replaced_task.py rename dags/enrich/tasks/business_logic/{enrich_read_dbt_model.py => enrich_dbt_model_read.py} (98%) rename dbt/models/marts/enrich/{marts_enrich_ae_closed_candidates.sql => marts_enrich_acteurs_closed_candidates.sql} (60%) create mode 100644 dbt/models/marts/enrich/marts_enrich_acteurs_closed_not_replaced.sql rename dbt/models/marts/enrich/{marts_enrich_ae_closed_replaced.sql => marts_enrich_acteurs_closed_replaced.sql} (84%) create mode 100644 dbt/models/marts/enrich/marts_enrich_acteurs_closed_replaced_other_siren.sql create mode 100644 dbt/models/marts/enrich/marts_enrich_acteurs_closed_replaced_same_siren.sql diff --git a/dags/enrich/config/cohorts.py b/dags/enrich/config/cohorts.py index 76cdbe143..dbdee29b2 100644 --- a/dags/enrich/config/cohorts.py +++ b/dags/enrich/config/cohorts.py @@ -7,6 +7,6 @@ @dataclass(frozen=True) class COHORTS: - ACTEURS_CLOSED_NOT_REPLACED: str = f"{INTRO} 🔴 non remplacés" - ACTEURS_CLOSED_REP_DIFF_SIREN: str = f"{INTRO} 🟡 remplacés par SIRET autre SIREN" - ACTEURS_CLOSED_REP_SAME_SIREN: str = f"{INTRO} 🟢 remplacés par SIRET même SIREN" + CLOSED_NOT_REPLACED: str = f"{INTRO} 🔴 non remplacés" + CLOSED_REP_OTHER_SIREN: str = f"{INTRO} 🟡 remplacés par SIRET d'un autre SIREN" + CLOSED_REP_SAME_SIREN: str = f"{INTRO} 🟢 remplacés par SIRET du même SIREN" diff --git a/dags/enrich/config/dbt.py b/dags/enrich/config/dbt.py index f5bc9684d..dbfc0d1aa 100644 --- a/dags/enrich/config/dbt.py +++ b/dags/enrich/config/dbt.py @@ -6,5 +6,13 @@ @dataclass(frozen=True) class DBT: MARTS_ENRICH_AE_RGPD: str = "marts_enrich_ae_rgpd" - MARTS_ENRICH_AE_CLOSED_CANDIDATES: str = "marts_enrich_ae_closed_candidates" - MARTS_ENRICH_AE_CLOSED_REPLACED: str = "marts_enrich_ae_closed_replaced" + MARTS_ENRICH_AE_CLOSED_CANDIDATES: str = "marts_enrich_acteurs_closed_candidates" + MARTS_ENRICH_AE_CLOSED_REPLACED_SAME_SIREN: str = ( + "marts_enrich_acteurs_closed_replaced_same_siren" + ) + MARTS_ENRICH_AE_CLOSED_REPLACED_OTHER_SIREN: str = ( + "marts_enrich_acteurs_closed_replaced_other_siren" + ) + MARTS_ENRICH_AE_CLOSED_NOT_REPLACED: str = ( + "marts_enrich_acteurs_closed_not_replaced" + ) diff --git a/dags/enrich/config/models.py b/dags/enrich/config/models.py index 5254ffb6f..2005c0839 100644 --- a/dags/enrich/config/models.py +++ b/dags/enrich/config/models.py @@ -41,6 +41,15 @@ class EnrichBaseConfig(BaseModel): default=True, description="🚱 Si coché, aucune tâche d'écriture ne sera effectuée", ) + dbt_models_refresh: bool = Field( + default=True, + description="""🔄 Si coché, les modèles DBT seront rafraîchis. 
+ 🔴 Désactiver uniquement pour des tests.""", + ) + dbt_models_refresh_command: str = Field( + default="dbt build --select tag:marts,tag:enrich,tag:closed", + description="🔄 Commande DBT à exécuter pour rafraîchir les modèles", + ) filter_contains__acteur_commentaires: Optional[str] = Field( default=None, description="🔍 Filtre sur **acteur_commentaires**", diff --git a/dags/enrich/config/tasks.py b/dags/enrich/config/tasks.py index 034a6a1b9..0652eacc5 100644 --- a/dags/enrich/config/tasks.py +++ b/dags/enrich/config/tasks.py @@ -10,8 +10,21 @@ class TASKS: # Read tasks READ_AE_RGPD: str = "enrich_ae_rgpd_read" - READ_AE_CLOSED_CANDIDATES: str = "enrich_read_ae_closed_candidates" - READ_AE_CLOSED_REPLACED: str = "enrich_read_ae_closed_replaced" + ENRICH_CLOSED_REPLACED_SAME_SIREN: str = "enrich_acteurs_closed_replaced_same_siren" + ENRICH_CLOSED_REPLACED_OTHER_SIREN: str = ( + "enrich_acteurs_closed_replaced_other_siren" + ) + ENRICH_CLOSED_NOT_REPLACED: str = "enrich_acteurs_closed_not_replaced" + ENRICH_CLOSED_SUGGESTIONS_SAME_SIREN: str = ( + "enrich_acteurs_closed_suggestions_same_siren" + ) + ENRICH_CLOSED_SUGGESTIONS_OTHER_SIREN: str = ( + "enrich_acteurs_closed_suggestions_other_siren" + ) + ENRICH_CLOSED_SUGGESTIONS_NOT_REPLACED: str = ( + "enrich_acteurs_closed_suggestions_not_replaced" + ) + ENRICH_DBT_MODELS_REFRESH: str = "enrich_dbt_models_refresh" # Matching tasks MATCH_SCORE_AE_RGPD: str = "enrich_ae_rgpd_match" diff --git a/dags/enrich/config/xcoms.py b/dags/enrich/config/xcoms.py index 511b1670b..0d9451913 100644 --- a/dags/enrich/config/xcoms.py +++ b/dags/enrich/config/xcoms.py @@ -19,8 +19,9 @@ class XCOMS: DF_READ: str = "df_read" DF_MATCH: str = "df_match" - DF_CLOSED_CANDIDATES: str = "df_acteurs_closed_candidates" - DF_CLOSED_REPLACED: str = "df_acteurs_closed_replaced" + DF_CLOSED_REPLACED_SAME_SIREN: str = "df_acteurs_closed_replaced_same_siren" + DF_CLOSED_REPLACED_OTHER_SIREN: str = "df_acteurs_closed_replaced_other_siren" + DF_CLOSED_NOT_REPLACED: str = "df_acteurs_closed_not_replaced" def xcom_pull(ti: TaskInstance, key: str, skip_if_empty: bool = False) -> Any: @@ -37,6 +38,12 @@ def xcom_pull(ti: TaskInstance, key: str, skip_if_empty: bool = False) -> Any: value = ti.xcom_pull(key=key, task_ids=TASKS.READ_AE_RGPD) elif key == XCOMS.DF_MATCH: value = ti.xcom_pull(key=key, task_ids=TASKS.MATCH_SCORE_AE_RGPD) + elif key == XCOMS.DF_CLOSED_REPLACED_SAME_SIREN: + value = ti.xcom_pull(key=key, task_ids=TASKS.ENRICH_CLOSED_REPLACED_SAME_SIREN) + elif key == XCOMS.DF_CLOSED_REPLACED_OTHER_SIREN: + value = ti.xcom_pull(key=key, task_ids=TASKS.ENRICH_CLOSED_REPLACED_OTHER_SIREN) + elif key == XCOMS.DF_CLOSED_NOT_REPLACED: + value = ti.xcom_pull(key=key, task_ids=TASKS.ENRICH_CLOSED_NOT_REPLACED) else: raise ValueError(f"{msg} key inconnue") diff --git a/dags/enrich/dags/enrich_acteurs_closed.py b/dags/enrich/dags/enrich_acteurs_closed.py index fab5ca318..b6aebcff7 100644 --- a/dags/enrich/dags/enrich_acteurs_closed.py +++ b/dags/enrich/dags/enrich_acteurs_closed.py @@ -4,12 +4,24 @@ """ from airflow import DAG -from enrich.config import DBT, TASKS, XCOMS, EnrichActeursClosedConfig +from enrich.config import ( + COHORTS, + DBT, + TASKS, + XCOMS, + EnrichActeursClosedConfig, +) +from enrich.tasks.airflow_logic.enrich_acteurs_closed_suggestions import ( + enrich_acteurs_closed_suggestions_task, +) from enrich.tasks.airflow_logic.enrich_config_create_task import ( enrich_config_create_task, ) -from enrich.tasks.airflow_logic.enrich_read_dbt_model_task import ( - 
enrich_read_dbt_model_task,
+from enrich.tasks.airflow_logic.enrich_dbt_model_read_task import (
+    enrich_dbt_model_read_task,
+)
+from enrich.tasks.airflow_logic.enrich_dbt_models_refresh_task import (
+    enrich_dbt_models_refresh_task,
 )
 from shared.config import CATCHUPS, SCHEDULES, START_DATES, config_to_airflow_params
@@ -40,32 +52,59 @@
 """
 chain(
     enrich_config_create_task(dag),
-    enrich_read_dbt_model_task(
+    enrich_dbt_model_read_task(
         dag,
-        task_id=TASKS.READ_AE_CLOSED_CANDIDATES,
+        task_id=TASKS.ENRICH_CLOSED_CANDIDATES,
         dbt_model_name=DBT.MARTS_ENRICH_AE_CLOSED_CANDIDATES,
         xcom_push_key=XCOMS.DF_CLOSED_CANDIDATES,
     ),
     enrich_read_dbt_model_task(
         dag,
         task_id=TASKS.READ_AE_CLOSED_REPLACED,
         dbt_model_name=DBT.MARTS_ENRICH_AE_CLOSED_REPLACED,
         xcom_push_key=XCOMS.DF_CLOSED_REPLACED,
     ),
 )
 """
     config = enrich_config_create_task(dag)
-    closed_candidates = enrich_read_dbt_model_task(
+    refresh_dbt = enrich_dbt_models_refresh_task(dag)
+    replaced_same_siren = enrich_dbt_model_read_task(
+        dag,
+        task_id=TASKS.ENRICH_CLOSED_REPLACED_SAME_SIREN,
+        dbt_model_name=DBT.MARTS_ENRICH_AE_CLOSED_REPLACED_SAME_SIREN,
+        xcom_push_key=XCOMS.DF_CLOSED_REPLACED_SAME_SIREN,
+    )
+    replaced_other_siren = enrich_dbt_model_read_task(
+        dag,
+        task_id=TASKS.ENRICH_CLOSED_REPLACED_OTHER_SIREN,
+        dbt_model_name=DBT.MARTS_ENRICH_AE_CLOSED_REPLACED_OTHER_SIREN,
+        xcom_push_key=XCOMS.DF_CLOSED_REPLACED_OTHER_SIREN,
+    )
+    not_replaced = enrich_dbt_model_read_task(
+        dag,
+        task_id=TASKS.ENRICH_CLOSED_NOT_REPLACED,
+        dbt_model_name=DBT.MARTS_ENRICH_AE_CLOSED_NOT_REPLACED,
+        xcom_push_key=XCOMS.DF_CLOSED_NOT_REPLACED,
+    )
+    suggestions_same_siren = enrich_acteurs_closed_suggestions_task(
+        dag,
+        task_id=TASKS.ENRICH_CLOSED_SUGGESTIONS_SAME_SIREN,
+        cohort_type=COHORTS.CLOSED_REP_SAME_SIREN,
+        df_xcom_key=XCOMS.DF_CLOSED_REPLACED_SAME_SIREN,
+    )
+    suggestions_other_siren = enrich_acteurs_closed_suggestions_task(
         dag,
-        task_id=TASKS.READ_AE_CLOSED_CANDIDATES,
-        dbt_model_name=DBT.MARTS_ENRICH_AE_CLOSED_CANDIDATES,
-        xcom_push_key=XCOMS.DF_CLOSED_CANDIDATES,
+        task_id=TASKS.ENRICH_CLOSED_SUGGESTIONS_OTHER_SIREN,
+        cohort_type=COHORTS.CLOSED_REP_OTHER_SIREN,
+        df_xcom_key=XCOMS.DF_CLOSED_REPLACED_OTHER_SIREN,
     )
-    closed_replaced = enrich_read_dbt_model_task(
+    suggestions_not_replaced = enrich_acteurs_closed_suggestions_task(
         dag,
-        task_id=TASKS.READ_AE_CLOSED_REPLACED,
-        dbt_model_name=DBT.MARTS_ENRICH_AE_CLOSED_REPLACED,
-        xcom_push_key=XCOMS.DF_CLOSED_REPLACED,
+        task_id=TASKS.ENRICH_CLOSED_SUGGESTIONS_NOT_REPLACED,
+        cohort_type=COHORTS.CLOSED_NOT_REPLACED,
+        df_xcom_key=XCOMS.DF_CLOSED_NOT_REPLACED,
     )
-    config >> closed_candidates
-    config >> closed_replaced
+    config >> refresh_dbt  # type: ignore
+    refresh_dbt >> replaced_same_siren >> suggestions_same_siren  # type: ignore
+    refresh_dbt >> replaced_other_siren >> suggestions_other_siren  # type: ignore
+    refresh_dbt >> not_replaced >> suggestions_not_replaced  # type: ignore
diff --git a/dags/enrich/tasks/airflow_logic/enrich_acteurs_closed_suggestions.py b/dags/enrich/tasks/airflow_logic/enrich_acteurs_closed_suggestions.py
new file mode 100644
index 000000000..39a743b8e
--- /dev/null
+++ b/dags/enrich/tasks/airflow_logic/enrich_acteurs_closed_suggestions.py
@@ -0,0 +1,57 @@
+"""Generate suggestions for acteurs closed in the Annuaire Entreprises (AE)"""
+
+import logging
+
+from airflow import DAG
+from airflow.models.taskinstance import TaskInstance
+from airflow.operators.python import PythonOperator
enrich.config import XCOMS, xcom_pull +from enrich.tasks.business_logic.enrich_acteurs_closed_suggestions import ( + enrich_acteurs_closed_suggestions, +) + +logger = logging.getLogger(__name__) + + +def task_info_get(task_id, df_xcom_key): + return f""" + ============================================================ + Description de la tâche "{task_id}" + ============================================================ + 💡 quoi: on génère les suggestions à partir de la df + {df_xcom_key} + + 🎯 pourquoi: le but de ce DAG + + 🏗️ comment: pour chaque acteur fermé, on génère 1 suggestion + """ + + +def enrich_acteurs_closed_suggestions_wrapper( + cohort_type: str, df_xcom_key: str, task_id: str, ti: TaskInstance, dag: DAG +) -> None: + logger.info(task_info_get(task_id, df_xcom_key)) + + # Config + config = xcom_pull(ti, XCOMS.CONFIG) + logger.info(f"📖 Configuration:\n{config.model_dump_json(indent=2)}") + + # Processing + enrich_acteurs_closed_suggestions( + df=xcom_pull(ti, df_xcom_key), + cohort_type=cohort_type, + identifiant_action=dag.dag_id, + dry_run=config.dry_run, + ) + + +def enrich_acteurs_closed_suggestions_task( + dag: DAG, task_id: str, cohort_type: str, df_xcom_key: str +) -> PythonOperator: + return PythonOperator( + task_id=task_id, + python_callable=enrich_acteurs_closed_suggestions_wrapper, + op_args=[cohort_type, df_xcom_key, task_id], + dag=dag, + doc_md=f"**Suggestions** pour la cohorte: {cohort_type}**", + ) diff --git a/dags/enrich/tasks/airflow_logic/enrich_config_create_task.py b/dags/enrich/tasks/airflow_logic/enrich_config_create_task.py index aea481f0f..c38c252d5 100644 --- a/dags/enrich/tasks/airflow_logic/enrich_config_create_task.py +++ b/dags/enrich/tasks/airflow_logic/enrich_config_create_task.py @@ -37,4 +37,5 @@ def enrich_config_create_task(dag: DAG) -> PythonOperator: task_id=TASKS.CONFIG_CREATE, python_callable=enrich_config_create_wrapper, dag=dag, + doc_md="📖 **Création de la config**", ) diff --git a/dags/enrich/tasks/airflow_logic/enrich_dbt_model_read_task.py b/dags/enrich/tasks/airflow_logic/enrich_dbt_model_read_task.py new file mode 100644 index 000000000..325af36e8 --- /dev/null +++ b/dags/enrich/tasks/airflow_logic/enrich_dbt_model_read_task.py @@ -0,0 +1,60 @@ +"""Read data from DB needed for RGPD anonymization""" + +import logging + +from airflow import DAG +from airflow.exceptions import AirflowSkipException +from airflow.operators.python import PythonOperator +from airflow.utils.trigger_rule import TriggerRule +from enrich.config import DBT, TASKS, XCOMS, xcom_pull +from enrich.tasks.business_logic.enrich_dbt_model_read import ( + enrich_dbt_model_read, +) + +logger = logging.getLogger(__name__) + + +def task_info_get(): + return f""" + ============================================================ + Description de la tâche "{TASKS.READ_AE_RGPD}" + ============================================================ + 💡 quoi: lecture des données via le modèle DBT + {DBT.MARTS_ENRICH_AE_RGPD} + + 🎯 pourquoi: faire un pré-filtre sur les matches potentiels + (pas récupérer les ~27M de lignes de la table AE unite_legale) + + 🏗️ comment: on récupère uniquement les matches SIREN avec + des infos de noms/prénoms dans l'AE en passant par de la normalisation + de chaines de caractères + """ + + +def enrich_dbt_model_read_wrapper(dbt_model_name, xcom_push_key, ti) -> None: + logger.info(task_info_get()) + + # Config + config = xcom_pull(ti, XCOMS.CONFIG) + logger.info(f"📖 Configuration:\n{config.model_dump_json(indent=2)}") + + # Processing + df = 
enrich_dbt_model_read(dbt_model_name=dbt_model_name, filters=config.filters)
+    if df.empty:
+        raise AirflowSkipException("Pas de données DB, on s'arrête là")
+
+    # Result
+    ti.xcom_push(key=xcom_push_key, value=df)
+
+
+def enrich_dbt_model_read_task(
+    dag: DAG, task_id: str, dbt_model_name: str, xcom_push_key: str
+) -> PythonOperator:
+    return PythonOperator(
+        task_id=task_id,
+        python_callable=enrich_dbt_model_read_wrapper,
+        op_args=[dbt_model_name, xcom_push_key],
+        dag=dag,
+        doc_md=f"**Lecture du modèle DBT**: `{dbt_model_name}`",
+        trigger_rule=TriggerRule.ALL_DONE,
+    )
diff --git a/dags/enrich/tasks/airflow_logic/enrich_dbt_models_refresh_task.py b/dags/enrich/tasks/airflow_logic/enrich_dbt_models_refresh_task.py
new file mode 100644
index 000000000..b7c6070c4
--- /dev/null
+++ b/dags/enrich/tasks/airflow_logic/enrich_dbt_models_refresh_task.py
@@ -0,0 +1,58 @@
+"""Refresh the enrichment DBT models before they are read"""
+
+import logging
+
+from airflow import DAG
+from airflow.exceptions import AirflowSkipException
+from airflow.operators.bash import BashOperator
+from airflow.operators.python import PythonOperator
+from enrich.config import TASKS, XCOMS, xcom_pull
+
+logger = logging.getLogger(__name__)
+
+
+def task_info_get():
+    return f"""
+    ============================================================
+    Description de la tâche "{TASKS.ENRICH_DBT_MODELS_REFRESH}"
+    ============================================================
+    💡 quoi: rafraîchissement des modèles DBT d'enrichissement
+    avant leur lecture par les tâches suivantes
+
+    🎯 pourquoi: les modèles étant matérialisés en tables, il faut
+    les reconstruire pour travailler sur des données à jour
+
+    🏗️ comment: exécution de la commande DBT fournie dans la config
+    (dbt_models_refresh_command) via un BashOperator
+    """
+
+
+def enrich_dbt_models_refresh_wrapper(ti) -> None:
+    logger.info(task_info_get())
+
+    # Config
+    config = xcom_pull(ti, XCOMS.CONFIG)
+    logger.info(f"📖 Configuration:\n{config.model_dump_json(indent=2)}")
+
+    if not config.dbt_models_refresh:
+        raise AirflowSkipException("🚫 Rafraîchissement des modèles DBT désactivé")
+
+    logger.info(
+        f"🔄 Rafraîchissement des modèles DBT: {config.dbt_models_refresh_command}"
+    )
+    bash = BashOperator(
+        task_id=TASKS.ENRICH_DBT_MODELS_REFRESH + "_bash",
+        # Fix: the config field is dbt_models_refresh_command (defined in
+        # this patch's config model); dbt_build_command does not exist
+        bash_command=config.dbt_models_refresh_command,
+    )
+    bash.execute(context=ti.get_template_context())
+
+
+def enrich_dbt_models_refresh_task(
+    dag: DAG,
+) -> PythonOperator:
+    return PythonOperator(
+        task_id=TASKS.ENRICH_DBT_MODELS_REFRESH,
+        python_callable=enrich_dbt_models_refresh_wrapper,
+        dag=dag,
+    )
diff --git a/dags/enrich/tasks/airflow_logic/enrich_read_ae_closed_replaced_task.py b/dags/enrich/tasks/airflow_logic/enrich_read_ae_closed_replaced_task.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/dags/enrich/tasks/airflow_logic/enrich_read_dbt_model_task.py b/dags/enrich/tasks/airflow_logic/enrich_read_dbt_model_task.py
index dbca68c7c..72a54c040 100644
--- a/dags/enrich/tasks/airflow_logic/enrich_read_dbt_model_task.py
+++ b/dags/enrich/tasks/airflow_logic/enrich_read_dbt_model_task.py
@@ -6,8 +6,8 @@
 from airflow.exceptions import AirflowSkipException
 from airflow.operators.python import PythonOperator
 from enrich.config import DBT, TASKS, XCOMS, xcom_pull
-from enrich.tasks.business_logic.enrich_read_dbt_model import (
-    enrich_read_dbt_model,
+from enrich.tasks.business_logic.enrich_dbt_model_read import (
+    enrich_dbt_model_read,
 )
 
 logger = logging.getLogger(__name__)
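For reference, the refresh task above instantiates a BashOperator inside a PythonOperator so the dbt command can be gated on runtime config. A minimal sketch of that pattern, assuming an Airflow 2.x environment; `cfg` and the task id are illustrative, only the two config field names come from this patch:

import logging

from airflow.exceptions import AirflowSkipException
from airflow.operators.bash import BashOperator

logger = logging.getLogger(__name__)


def dbt_refresh_gated(ti, cfg) -> None:
    # Skips only this task; downstream reads still run because they
    # are declared with trigger_rule=TriggerRule.ALL_DONE
    if not cfg.dbt_models_refresh:
        raise AirflowSkipException("refresh disabled via config")
    logger.info(f"running: {cfg.dbt_models_refresh_command}")
    bash = BashOperator(
        task_id="dbt_refresh_bash",  # illustrative id
        bash_command=cfg.dbt_models_refresh_command,
    )
    # Runs synchronously in the current worker slot, not as a separate task
    bash.execute(context=ti.get_template_context())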
@@ -30,7 +30,7 @@ def task_info_get(): """ -def enrich_read_dbt_model_wrapper(dbt_model_name, xcom_push_key, ti) -> None: +def enrich_dbt_model_read_wrapper(dbt_model_name, xcom_push_key, ti) -> None: logger.info(task_info_get()) # Config @@ -38,7 +38,7 @@ def enrich_read_dbt_model_wrapper(dbt_model_name, xcom_push_key, ti) -> None: logger.info(f"📖 Configuration:\n{config.model_dump_json(indent=2)}") # Processing - df = enrich_read_dbt_model(dbt_model_name=dbt_model_name, filters=config.filters) + df = enrich_dbt_model_read(dbt_model_name=dbt_model_name, filters=config.filters) if df.empty: raise AirflowSkipException("Pas de données DB, on s'arrête là") @@ -46,14 +46,13 @@ def enrich_read_dbt_model_wrapper(dbt_model_name, xcom_push_key, ti) -> None: ti.xcom_push(key=xcom_push_key, value=df) -def enrich_read_dbt_model_task( +def enrich_dbt_model_read_task( dag: DAG, task_id: str, dbt_model_name: str, xcom_push_key: str ) -> PythonOperator: return PythonOperator( task_id=task_id, - python_callable=enrich_read_dbt_model_wrapper, + python_callable=enrich_dbt_model_read_wrapper, op_args=[dbt_model_name, xcom_push_key], dag=dag, - # pool="dbt_model_read", - # pool_slots=1, + doc_md=f"**Lecture du modèle DBT**: `{dbt_model_name}`", ) diff --git a/dags/enrich/tasks/business_logic/enrich_acteurs_closed_suggestions.py b/dags/enrich/tasks/business_logic/enrich_acteurs_closed_suggestions.py index adb960dc7..b00c34fd1 100644 --- a/dags/enrich/tasks/business_logic/enrich_acteurs_closed_suggestions.py +++ b/dags/enrich/tasks/business_logic/enrich_acteurs_closed_suggestions.py @@ -33,9 +33,9 @@ def enrich_acteurs_closed_suggestions( raise ValueError("df vide: on devrait pas être ici") if cohort_type not in [ - COHORTS.ACTEURS_CLOSED_NOT_REPLACED, - COHORTS.ACTEURS_CLOSED_REP_DIFF_SIREN, - COHORTS.ACTEURS_CLOSED_REP_SAME_SIREN, + COHORTS.CLOSED_NOT_REPLACED, + COHORTS.CLOSED_REP_OTHER_SIREN, + COHORTS.CLOSED_REP_SAME_SIREN, ]: raise ValueError(f"Mauvaise cohorte: {cohort_type=}") @@ -47,7 +47,7 @@ def enrich_acteurs_closed_suggestions( # ----------------------------------------- # NOT REPLACED # ----------------------------------------- - if cohort_type == COHORTS.ACTEURS_CLOSED_NOT_REPLACED: + if cohort_type == COHORTS.CLOSED_NOT_REPLACED: changes = [] model_params = { "id": row[COLS.ACTEUR_ID], @@ -72,8 +72,8 @@ def enrich_acteurs_closed_suggestions( # REPLACED # ----------------------------------------- elif cohort_type in [ - COHORTS.ACTEURS_CLOSED_REP_DIFF_SIREN, - COHORTS.ACTEURS_CLOSED_REP_SAME_SIREN, + COHORTS.CLOSED_REP_OTHER_SIREN, + COHORTS.CLOSED_REP_SAME_SIREN, ]: cohortes = df[COLS.REMPLACER_COHORTE].unique() if len(cohortes) > 1: diff --git a/dags/enrich/tasks/business_logic/enrich_read_dbt_model.py b/dags/enrich/tasks/business_logic/enrich_dbt_model_read.py similarity index 98% rename from dags/enrich/tasks/business_logic/enrich_read_dbt_model.py rename to dags/enrich/tasks/business_logic/enrich_dbt_model_read.py index 49fa0ec92..45d9644ab 100644 --- a/dags/enrich/tasks/business_logic/enrich_read_dbt_model.py +++ b/dags/enrich/tasks/business_logic/enrich_dbt_model_read.py @@ -12,7 +12,7 @@ logger = logging.getLogger(__name__) -def enrich_read_dbt_model( +def enrich_dbt_model_read( dbt_model_name: str, filters: list[dict] = [] ) -> pd.DataFrame: """Reads necessary QFDMO acteurs and AE entries from DB""" diff --git a/dags_unit_tests/enrich/tasks/test_enrich_acteurs_closed_suggestions.py b/dags_unit_tests/enrich/tasks/test_enrich_acteurs_closed_suggestions.py index 1ab91fe63..d71d9bd97 100644 
--- a/dags_unit_tests/enrich/tasks/test_enrich_acteurs_closed_suggestions.py +++ b/dags_unit_tests/enrich/tasks/test_enrich_acteurs_closed_suggestions.py @@ -110,7 +110,7 @@ def test_cohorte_not_replaced(self, acteurs, df_not_replaced): # Write suggestions to DB enrich_acteurs_closed_suggestions( df=df_not_replaced, - cohort_type=COHORTS.ACTEURS_CLOSED_NOT_REPLACED, + cohort_type=COHORTS.CLOSED_NOT_REPLACED, identifiant_action="test_not_replaced", dry_run=False, ) @@ -145,7 +145,7 @@ def test_cohorte_meme_siren(self, acteurs, atype, source, df_replaced_meme_siret # Write suggestions to DB enrich_acteurs_closed_suggestions( df=df_replaced_meme_siret, - cohort_type=COHORTS.ACTEURS_CLOSED_REP_SAME_SIREN, + cohort_type=COHORTS.CLOSED_REP_SAME_SIREN, identifiant_action="test_meme_siren", dry_run=False, ) @@ -189,7 +189,7 @@ def test_cohorte_autre_siren(self, acteurs, df_replaced_autre_siret): # Write suggestions to DB enrich_acteurs_closed_suggestions( df=df_replaced_autre_siret, - cohort_type=COHORTS.ACTEURS_CLOSED_REP_DIFF_SIREN, + cohort_type=COHORTS.CLOSED_REP_OTHER_SIREN, identifiant_action="test_autre_siren", dry_run=False, ) diff --git a/dbt/models/marts/enrich/marts_enrich_ae_closed_candidates.sql b/dbt/models/marts/enrich/marts_enrich_acteurs_closed_candidates.sql similarity index 60% rename from dbt/models/marts/enrich/marts_enrich_ae_closed_candidates.sql rename to dbt/models/marts/enrich/marts_enrich_acteurs_closed_candidates.sql index 4a33a195f..9e2fcce5f 100644 --- a/dbt/models/marts/enrich/marts_enrich_ae_closed_candidates.sql +++ b/dbt/models/marts/enrich/marts_enrich_acteurs_closed_candidates.sql @@ -1,21 +1,14 @@ /* -Model to find entries from AE's etablissement which are -potential replacements for etab_closed acteurs. - -Code is repetitive (e.g. same logic in SELECT and ROW_NUMBER) and -could be made more concise with an intermediary CTE. However from experience, -intermediate CTEs lead to slower performance (we constraint the planner) -than just letting the query planner do its job. Thus for now, I focus -on performance given the 40M rows +Acteurs which SIRET is closed in AE's etablissement Notes: - - 🧹 Pre-matching/filtering at SQL level to reduce data size (40M rows) - - 👁️‍🗨️ Keeping as view to always re-evaluate vs. 
ever changing QFDMO data
+ - 📦 Materialized as table but refreshed by DAG enrich_acteurs_closed
+   as many models/tests depending on each other = would take too long
 */
 {{
   config(
-    materialized = 'view',
-    tags=['marts', 'ae', 'annuaire_entreprises', 'etablissement'],
+    materialized = 'table',
+    tags=['marts', 'enrich', 'closed', 'ae', 'annuaire_entreprises', 'etablissement'],
   )
 }}
 
 -- Starting from our acteurs we can match via SIRET
@@ -28,7 +21,7 @@ WITH acteurs_with_siret AS (
         identifiant_unique AS acteur_id,
         commentaires AS acteur_commentaires,
         statut AS acteur_statut,
-        acteur_type AS acteur_type
+        acteur_type_id
     FROM {{ ref('marts_carte_acteur') }}
     WHERE siret IS NOT NULL AND siret != '' AND LENGTH(siret) = 14
 ),
@@ -48,9 +41,9 @@
     acteurs.acteur_nom,
     acteurs.acteur_nom_normalise,
     acteurs.acteur_commentaires,
-    acteurs.acteur_type
+    acteurs.acteur_type_id
 FROM acteurs_with_siret AS acteurs
 JOIN {{ ref('int_ae_etablissement') }} AS etab ON acteurs.siret = etab.siret
-WHERE etab.est_actif = FALSE
+WHERE etab.est_actif IS FALSE
 )
 SELECT * FROM etab_closed_candidates
\ No newline at end of file
diff --git a/dbt/models/marts/enrich/marts_enrich_acteurs_closed_not_replaced.sql b/dbt/models/marts/enrich/marts_enrich_acteurs_closed_not_replaced.sql
new file mode 100644
index 000000000..1f821794f
--- /dev/null
+++ b/dbt/models/marts/enrich/marts_enrich_acteurs_closed_not_replaced.sql
@@ -0,0 +1,21 @@
+/*
+Acteurs whose SIREN & SIRET are closed in AE's etablissement
+BUT for which we couldn't find replacements
+*/
+{{
+  config(
+    materialized = 'table',
+    tags=['marts', 'enrich', 'closed', 'ae', 'annuaire_entreprises', 'etablissement'],
+  )
+}}
+
+SELECT * FROM {{ ref('marts_enrich_acteurs_closed_candidates') }}
+WHERE
+    /* In candidates we don't filter on unite_est_actif IS FALSE
+    because it would prevent us from finding replacements for same unite,
+    however for acteurs we consider fully closed, we do apply that filter */
+    unite_est_actif IS FALSE
+    AND acteur_id NOT IN (
+        SELECT acteur_id FROM {{ ref('marts_enrich_acteurs_closed_replaced') }}
+    )
+
diff --git a/dbt/models/marts/enrich/marts_enrich_ae_closed_replaced.sql b/dbt/models/marts/enrich/marts_enrich_acteurs_closed_replaced.sql
similarity index 84%
rename from dbt/models/marts/enrich/marts_enrich_ae_closed_replaced.sql
rename to dbt/models/marts/enrich/marts_enrich_acteurs_closed_replaced.sql
index 3e2844d5c..49b4ff2e7 100644
--- a/dbt/models/marts/enrich/marts_enrich_ae_closed_replaced.sql
+++ b/dbt/models/marts/enrich/marts_enrich_acteurs_closed_replaced.sql
@@ -1,19 +1,20 @@
 {{
   config(
-    materialized = 'view',
-    tags=['marts', 'ae', 'annuaire_entreprises', 'etablissement'],
+    materialized = 'table',
+    tags=['marts', 'enrich', 'closed', 'ae', 'annuaire_entreprises', 'etablissement'],
   )
 }}
 
 WITH potential_replacements AS (
     SELECT
         candidates.acteur_id AS acteur_id,
+        candidates.acteur_type_id AS acteur_type_id,
         candidates.acteur_statut AS acteur_statut,
         candidates.siret AS acteur_siret,
         replacements.siret AS remplacer_siret,
         CASE
-            WHEN LEFT(candidates.siret,9) = LEFT(replacements.siret,9) THEN 'meme_siret'
-            ELSE 'autre_siret'
+            WHEN LEFT(candidates.siret,9) = LEFT(replacements.siret,9) THEN 'siret_du_meme_siren'
+            ELSE 'siret_dun_autre_siren'
         END AS remplacer_cohorte,
         candidates.acteur_nom,
         replacements.nom AS remplacer_nom,
@@ -40,7 +41,7 @@
             udf_normalize_string_alpha_for_match(replacements.nom)
         ) DESC
     ) AS replacement_priority
-    FROM {{ ref('marts_enrich_ae_closed_candidates') }} AS
candidates + FROM {{ ref('marts_enrich_acteurs_closed_candidates') }} AS candidates INNER JOIN {{ ref('int_ae_etablissement') }} AS replacements ON replacements.naf = candidates.etab_naf AND replacements.code_postal = candidates.etab_code_postal diff --git a/dbt/models/marts/enrich/marts_enrich_acteurs_closed_replaced_other_siren.sql b/dbt/models/marts/enrich/marts_enrich_acteurs_closed_replaced_other_siren.sql new file mode 100644 index 000000000..9f5458cbb --- /dev/null +++ b/dbt/models/marts/enrich/marts_enrich_acteurs_closed_replaced_other_siren.sql @@ -0,0 +1,9 @@ +{{ + config( + materialized = 'table', + tags=['marts', 'enrich', 'closed', 'ae', 'annuaire_entreprises', 'etablissement'], + ) +}} + +SELECT * FROM {{ ref('marts_enrich_acteurs_closed_replaced') }} +WHERE remplacer_cohorte = 'siret_dun_autre_siren' diff --git a/dbt/models/marts/enrich/marts_enrich_acteurs_closed_replaced_same_siren.sql b/dbt/models/marts/enrich/marts_enrich_acteurs_closed_replaced_same_siren.sql new file mode 100644 index 000000000..108927314 --- /dev/null +++ b/dbt/models/marts/enrich/marts_enrich_acteurs_closed_replaced_same_siren.sql @@ -0,0 +1,9 @@ +{{ + config( + materialized = 'table', + tags=['marts', 'enrich', 'closed', 'ae', 'annuaire_entreprises', 'etablissement'], + ) +}} + +SELECT * FROM {{ ref('marts_enrich_acteurs_closed_replaced') }} +WHERE remplacer_cohorte = 'siret_du_meme_siren' diff --git a/dbt/models/marts/enrich/schema.yml b/dbt/models/marts/enrich/schema.yml index 233186877..bf9eaef2b 100644 --- a/dbt/models/marts/enrich/schema.yml +++ b/dbt/models/marts/enrich/schema.yml @@ -44,7 +44,7 @@ models: data_tests: - not_null - - name: marts_enrich_ae_closed_replaced + - name: marts_enrich_acteurs_closed_replaced description: Etablissements de l'Annuaire Entreprises (AE) qui ont été | fermés et remplacés par un autre établissement columns: @@ -66,8 +66,8 @@ models: description: "Si le SIRET de remplacement appartient à la même entreprise (meme_siret) ou non (autre_siret)" data_tests: - not_null - - accepted_values: - values: ['meme_siret', 'autre_siret'] + # - accepted_values: + # values: ['siret_du_meme_siren', 'siret_dun_autre_siren'] - name: acteur_nom description: Nom de l'acteur fermé - name: remplacer_nom @@ -88,12 +88,8 @@ models: - name: replacement_priority description: "Priorité du remplacement (1 = meilleur match)" data_type: integer - data_tests: - - not_null - - accepted_values: - values: [1] - - name: marts_enrich_ae_closed_candidates + - name: marts_enrich_acteurs_closed_candidates description: Etablissements fermés de l'Annuaire Entreprises (AE) qui pourraient être remplacés columns: - name: siret From abc46b082e797146e938149acd415afab3b43f3b Mon Sep 17 00:00:00 2001 From: Max Corbeau Date: Wed, 9 Apr 2025 10:57:47 +0200 Subject: [PATCH 15/50] dbt: nettoyage & sampling --- dbt/dbt_project.yml | 3 +- dbt/macros/constants/value_unavailable.sql | 6 ++-- dbt/macros/udf/udf_ae_string_cleanup.sql | 19 ++++++++++ .../udf/udf_columns_words_in_common_count.sql | 2 +- .../base_ae_etablissement.sql | 36 +++++++++++++------ .../base_ae_unite_legale.sql | 36 ++++++++++++------- .../int_ae_etablissement.sql | 11 +++--- ...marts_enrich_acteurs_closed_candidates.sql | 1 + .../marts_enrich_acteurs_closed_replaced.sql | 4 +-- dbt/profiles.yml | 12 +++++++ 10 files changed, 95 insertions(+), 35 deletions(-) create mode 100644 dbt/macros/udf/udf_ae_string_cleanup.sql diff --git a/dbt/dbt_project.yml b/dbt/dbt_project.yml index 656b2f801..f4404fd9e 100644 --- a/dbt/dbt_project.yml +++ 
b/dbt/dbt_project.yml @@ -25,8 +25,9 @@ on-run-start: - "{{ create_udf_uuid_to_int() }}" - "{{ create_udf_safe_divmod() }}" - "{{ create_udf_columns_concat_unique_non_empty() }}" - - "{{create_udf_columns_words_in_common_count()}}" + - "{{ create_udf_columns_words_in_common_count() }}" - "{{ create_udf_normalize_string_alpha_for_match() }}" + - "{{ create_udf_ae_string_cleanup() }}" clean-targets: - "target" diff --git a/dbt/macros/constants/value_unavailable.sql b/dbt/macros/constants/value_unavailable.sql index 7039b89f8..5aaba79a0 100644 --- a/dbt/macros/constants/value_unavailable.sql +++ b/dbt/macros/constants/value_unavailable.sql @@ -1,3 +1,5 @@ -/* Constant used to flag that a value is unavailable -and detect, test, exclude it more easily */ +/* Constant used to flag a value as unavailable +and detect, test, exclude it more easily +vs. values such as "" or NULL which might raise +doubts about functionality of model */ {% macro value_unavailable() %}'🔴 VALEUR INDISPONIBLE'{% endmacro %} \ No newline at end of file diff --git a/dbt/macros/udf/udf_ae_string_cleanup.sql b/dbt/macros/udf/udf_ae_string_cleanup.sql new file mode 100644 index 000000000..977416058 --- /dev/null +++ b/dbt/macros/udf/udf_ae_string_cleanup.sql @@ -0,0 +1,19 @@ +{% macro create_udf_ae_string_cleanup() %} +/* + Converts string values from Annuaire Entreprises + to 1 consistent format, taking into account cases + such as '[ND]' = Non disponible, with conversion + to NULL for easier processing whenever we consider + it to be empty. +*/ +CREATE OR REPLACE FUNCTION {{ target.schema }}.udf_ae_string_cleanup(val TEXT) +RETURNS TEXT AS $$ +BEGIN + IF TRIM(val) = '' OR TRIM(val) = '[ND]' THEN + RETURN NULL; + ELSE + RETURN TRIM(val); + END IF; +END; +$$ LANGUAGE plpgsql STRICT; +{% endmacro %} \ No newline at end of file diff --git a/dbt/macros/udf/udf_columns_words_in_common_count.sql b/dbt/macros/udf/udf_columns_words_in_common_count.sql index 99bf1c676..616cfcbf7 100644 --- a/dbt/macros/udf/udf_columns_words_in_common_count.sql +++ b/dbt/macros/udf/udf_columns_words_in_common_count.sql @@ -2,7 +2,7 @@ /* Count number of words in common between 2 columns */ -CREATE OR REPLACE FUNCTION {{ target.schema }}.columns_words_in_common_count(col1 text, col2 text) +CREATE OR REPLACE FUNCTION {{ target.schema }}.udf_columns_words_in_common_count(col1 text, col2 text) RETURNS integer AS $$ DECLARE word text; diff --git a/dbt/models/base/ae_annuaire_entreprises/base_ae_etablissement.sql b/dbt/models/base/ae_annuaire_entreprises/base_ae_etablissement.sql index af952b324..22909b05b 100644 --- a/dbt/models/base/ae_annuaire_entreprises/base_ae_etablissement.sql +++ b/dbt/models/base/ae_annuaire_entreprises/base_ae_etablissement.sql @@ -13,25 +13,39 @@ Notes: SELECT + -- Codes -siret, -activite_principale, +{{ udf_ae_string_cleanup(siret) }} AS siret, +{{ udf_ae_string_cleanup(activite_principale) }} AS activite_principale, -- Names -denomination_usuelle, +{{ udf_ae_string_cleanup(denomination_usuelle) }} AS denomination_usuelle, -- Status -etat_administratif, +{{ udf_ae_string_cleanup(etat_administratif) }} AS etat_administratif, -- Address -numero_voie, -complement_adresse, -type_voie, -libelle_voie, -code_postal, -libelle_commune +{{ udf_ae_string_cleanup(numero_voie) }} AS numero_voie, +{{ udf_ae_string_cleanup(complement_adresse) }} AS complement_adresse, +{{ udf_ae_string_cleanup(type_voie) }} AS type_voie, +{{ udf_ae_string_cleanup(libelle_voie) }} AS libelle_voie, +{{ udf_ae_string_cleanup(code_postal) }} AS code_postal, +{{ 
udf_ae_string_cleanup(libelle_commune) }} AS libelle_commune FROM {{ source('ae', 'clone_ae_etablissement_in_use') }} -- Filtering out foreign establishments as our focus is France -- On 2025-03-17 this allows excluding ~316K rows -WHERE code_pays_etranger IS NULL \ No newline at end of file +WHERE code_pays_etranger IS NULL +{% if target.name == 'sampling' %} +/* We can't do random sampling else we risk having +no matching etablissement vs. unite legale. Can't +sample on location as not available in unite to match, +falling back to latest SIRET/SIREN as they will give +matches while representing recent data. +*/ +/* TODO: improve sampling by grabbing what we have +in acteurs + a little more if we can suggestion models +to have more data */ +ORDER BY siret DESC +LIMIT 1000000 +{% endif %} diff --git a/dbt/models/base/ae_annuaire_entreprises/base_ae_unite_legale.sql b/dbt/models/base/ae_annuaire_entreprises/base_ae_unite_legale.sql index 6df83ed8c..f248f640f 100644 --- a/dbt/models/base/ae_annuaire_entreprises/base_ae_unite_legale.sql +++ b/dbt/models/base/ae_annuaire_entreprises/base_ae_unite_legale.sql @@ -15,23 +15,33 @@ Notes: SELECT -- Codes -siren, -activite_principale, +{{ udf_ae_string_cleanup(siren) }} AS siren, +{{ udf_ae_string_cleanup(activite_principale) }} AS activite_principale, -- Status -etat_administratif, +{{ udf_ae_string_cleanup(etat_administratif) }} AS etat_administratif, -- Business names -denomination, +{{ udf_ae_string_cleanup(denomination) }} AS denomination, -- Director's names -CASE WHEN prenom1 = '[ND]' THEN NULL ELSE prenom1 END AS prenom1, -CASE WHEN prenom2 = '[ND]' THEN NULL ELSE prenom2 END AS prenom2, -CASE WHEN prenom3 = '[ND]' THEN NULL ELSE prenom3 END AS prenom3, -CASE WHEN prenom4 = '[ND]' THEN NULL ELSE prenom4 END AS prenom4, -CASE WHEN prenom_usuel = '[ND]' THEN NULL ELSE prenom_usuel END AS prenom_usuel, -CASE WHEN pseudonyme = '[ND]' THEN NULL ELSE pseudonyme END AS pseudonyme, -CASE WHEN nom = '[ND]' THEN NULL ELSE nom END AS nom, -CASE WHEN nom_usage = '[ND]' THEN NULL ELSE nom_usage END AS nom_usage +{{ udf_ae_string_cleanup(prenom1) }} AS prenom1, +{{ udf_ae_string_cleanup(prenom2) }} AS prenom2, +{{ udf_ae_string_cleanup(prenom3) }} AS prenom3, +{{ udf_ae_string_cleanup(prenom4) }} AS prenom4, +{{ udf_ae_string_cleanup(prenom_usuel) }} AS prenom_usuel, +{{ udf_ae_string_cleanup(pseudonyme) }} AS pseudonyme, +{{ udf_ae_string_cleanup(nom) }} AS nom, +{{ udf_ae_string_cleanup(nom_usage) }} AS nom_usage -FROM {{ source('ae', 'clone_ae_unite_legale_in_use') }} \ No newline at end of file +FROM {{ source('ae', 'clone_ae_unite_legale_in_use') }} +/* We can't do random sampling else we risk having +no matching etablissement vs. unite legale. Can't +sample on location as not available in unite to match, +falling back to latest SIRET/SIREN as they will give +matches while representing recent data. 
+*/
+{% if target.name == 'sampling' %}
+ORDER BY siren DESC
+LIMIT 500000 -- ~1 SIREN per 2 SIRET, pairs with the 1M etablissement sample
+{% endif %}
\ No newline at end of file
diff --git a/dbt/models/intermediate/ae_annuaire_entreprises/int_ae_etablissement.sql b/dbt/models/intermediate/ae_annuaire_entreprises/int_ae_etablissement.sql
index 77281bc05..2025cfd51 100644
--- a/dbt/models/intermediate/ae_annuaire_entreprises/int_ae_etablissement.sql
+++ b/dbt/models/intermediate/ae_annuaire_entreprises/int_ae_etablissement.sql
@@ -22,9 +22,9 @@ SELECT
 
     -- Names
     CASE
-        WHEN TRIM(etab.denomination_usuelle) NOT IN ('', '[ND]', NULL) THEN TRIM(etab.denomination_usuelle)
-        WHEN TRIM(etab.denomination_usuelle) IN ('', '[ND]', NULL) AND TRIM(unite.denomination) NOT IN ('', '[ND]', NULL) THEN TRIM(unite.denomination)
-        ELSE {{ value_unavailable() }}
+        WHEN etab.denomination_usuelle IS NOT NULL THEN etab.denomination_usuelle
+        WHEN etab.denomination_usuelle IS NULL AND unite.denomination IS NOT NULL THEN unite.denomination
+        ELSE {{ value_unavailable() }} -- To make this case explicit
     END AS nom,
 
     /*
@@ -49,14 +49,15 @@
         etab.type_voie,
         etab.libelle_voie
     ) AS adresse,
+    etab.numero_voie AS adresse_numero,
     etab.complement_adresse AS adresse_complement,
     etab.code_postal,
     etab.libelle_commune AS ville
 
 FROM {{ ref('base_ae_etablissement') }} AS etab
 /* Joining with unite_legale to bring some essential
-data from parent unite into each etablissement to save
-us from making expensive JOINS in downstream models */
+data from parent unite into each etablissement (saves
+us from making expensive JOINS in downstream models) */
 JOIN {{ ref('base_ae_unite_legale') }} AS unite ON unite.siren = LEFT(etab.siret,9)
 /* Here we keep unavailable names as int_ models aren't
diff --git a/dbt/models/marts/enrich/marts_enrich_acteurs_closed_candidates.sql b/dbt/models/marts/enrich/marts_enrich_acteurs_closed_candidates.sql
index 9e2fcce5f..62a66bd84 100644
--- a/dbt/models/marts/enrich/marts_enrich_acteurs_closed_candidates.sql
+++ b/dbt/models/marts/enrich/marts_enrich_acteurs_closed_candidates.sql
@@ -45,5 +45,6 @@ SELECT
 FROM acteurs_with_siret AS acteurs
 JOIN {{ ref('int_ae_etablissement') }} AS etab ON acteurs.siret = etab.siret
 WHERE etab.est_actif IS FALSE
+AND etab.numero_voie IS NOT NULL -- bare column completed into a predicate; intent assumed: keep candidates with a street number
 )
 SELECT * FROM etab_closed_candidates
\ No newline at end of file
diff --git a/dbt/models/marts/enrich/marts_enrich_acteurs_closed_replaced.sql b/dbt/models/marts/enrich/marts_enrich_acteurs_closed_replaced.sql
index 49b4ff2e7..7ce1b427f 100644
--- a/dbt/models/marts/enrich/marts_enrich_acteurs_closed_replaced.sql
+++ b/dbt/models/marts/enrich/marts_enrich_acteurs_closed_replaced.sql
@@ -18,7 +18,7 @@ WITH potential_replacements AS (
         END AS remplacer_cohorte,
         candidates.acteur_nom,
         replacements.nom AS remplacer_nom,
-        columns_words_in_common_count(
+        udf_columns_words_in_common_count(
            candidates.acteur_nom_normalise,
             udf_normalize_string_alpha_for_match(replacements.nom)
         ) AS noms_nombre_mots_commun,
@@ -36,7 +36,7 @@
             ELSE 0
         END DESC,
         -- Then etablissements with more words in common
-        columns_words_in_common_count(
+        udf_columns_words_in_common_count(
             candidates.acteur_nom_normalise,
             udf_normalize_string_alpha_for_match(replacements.nom)
         ) DESC
diff --git a/dbt/profiles.yml b/dbt/profiles.yml
index edebeb981..0d82812cb 100644
--- a/dbt/profiles.yml
+++ b/dbt/profiles.yml
@@ -9,3 +9,15 @@ default:
       password: "{{ env_var('POSTGRES_PASSWORD', 'qfdmo') }}"
       dbname: "{{ env_var('POSTGRES_DB', 'qfdmo') }}"
       schema: "{{ env_var('POSTGRES_SCHEMA', 
'public') }}" + # To implement sampling logic for large tables + # (e.g. Annuaire Entreprises Etablissements = 40M rows) + # Each model is free to implement its own sampling strategy + # using {% if target.name == 'sampling' %}...{% endif %} + sampling: + type: postgres + host: "{{ env_var('POSTGRES_HOST', 'localhost') }}" + port: "{{ env_var('POSTGRES_PORT', 6543) | as_number }}" + user: "{{ env_var('POSTGRES_USER', 'qfdmo') }}" + password: "{{ env_var('POSTGRES_PASSWORD', 'qfdmo') }}" + dbname: "{{ env_var('POSTGRES_DB', 'qfdmo') }}" + schema: "{{ env_var('POSTGRES_SCHEMA', 'public') }}" \ No newline at end of file From be74405b7601ec39bb0b4e95121b3788e0393134 Mon Sep 17 00:00:00 2001 From: Max Corbeau Date: Thu, 10 Apr 2025 09:54:26 +0200 Subject: [PATCH 16/50] DAG & Admin UI fonctionnels --- .../misc/data_serialize_reconstruct.py | 100 ++++++++++++++++++ dags/enrich/config/__init__.py | 2 +- dags/enrich/config/cohorts.py | 21 +++- dags/enrich/config/columns.py | 12 ++- dags/enrich/config/dbt.py | 6 +- dags/enrich/dags/enrich_acteurs_closed.py | 77 ++++---------- .../enrich_acteurs_closed_suggestions.py | 57 ---------- .../enrich_dbt_model_read_task.py | 60 ----------- .../enrich_dbt_model_suggest_task.py | 67 ++++++++++++ .../enrich_read_dbt_model_task.py | 58 ---------- .../enrich_dbt_model_suggest.py | 29 +++++ ....py => enrich_dbt_model_to_suggestions.py} | 75 +++++++------ .../test_enrich_acteurs_closed_config.py | 3 +- .../test_enrich_acteurs_closed_suggestions.py | 39 +++---- .../utils/test_data_serialize_reconstruct.py | 18 +++- data/models/suggestion.py | 9 +- dbt/README.md | 18 ++++ dbt/macros/table/macro_acteur.sql | 2 + dbt/macros/udf/udf_ae_string_cleanup.sql | 3 +- .../udf_columns_concat_unique_non_empty.sql | 3 +- .../udf/udf_columns_words_in_common_count.sql | 3 +- dbt/macros/udf/udf_encode_base57.sql | 4 +- .../udf_normalize_string_alpha_for_match.sql | 17 ++- dbt/macros/udf/udf_safe_divmod.sql | 3 +- dbt/macros/udf/udf_uuid_to_int.sql | 3 +- dbt/models/base/acteurs/base_acteur.sql | 5 +- dbt/models/base/acteurs/base_acteur_type.sql | 1 + dbt/models/base/acteurs/schema.yml | 14 +++ .../base_ae_etablissement.sql | 23 ++-- .../base_ae_unite_legale.sql | 28 ++--- .../intermediate/acteurs/int_acteur.sql | 14 +++ .../int_ae_etablissement.sql | 6 +- .../int_ae_unite_legale.sql | 16 +-- ...marts_enrich_acteurs_closed_candidates.sql | 51 +++++++-- .../marts_enrich_acteurs_closed_replaced.sql | 43 +++++--- ...ch_acteurs_closed_replaced_other_siren.sql | 9 -- ...ich_acteurs_closed_replaced_same_siren.sql | 9 -- ...h_acteurs_closed_suggest_not_replaced.sql} | 13 ++- ...rs_closed_suggest_replaced_other_siren.sql | 13 +++ ...urs_closed_suggest_replaced_same_siren.sql | 13 +++ .../marts/enrich/marts_enrich_ae_rgpd.sql | 2 +- dbt/models/source/source_acteur.yml | 1 + dbt/profiles.yml | 12 --- templates/data/_partials/dicts_to_table.html | 33 ++++++ .../_partials/generic_suggestion_details.html | 24 ----- .../_partials/suggestion_details_changes.html | 13 +++ templates/data/_partials/value_details.html | 41 ++++++- 47 files changed, 640 insertions(+), 433 deletions(-) create mode 100644 dags/cluster/tasks/business_logic/misc/data_serialize_reconstruct.py delete mode 100644 dags/enrich/tasks/airflow_logic/enrich_acteurs_closed_suggestions.py delete mode 100644 dags/enrich/tasks/airflow_logic/enrich_dbt_model_read_task.py create mode 100644 dags/enrich/tasks/airflow_logic/enrich_dbt_model_suggest_task.py delete mode 100644 dags/enrich/tasks/airflow_logic/enrich_read_dbt_model_task.py 
create mode 100644 dags/enrich/tasks/business_logic/enrich_dbt_model_suggest.py rename dags/enrich/tasks/business_logic/{enrich_acteurs_closed_suggestions.py => enrich_dbt_model_to_suggestions.py} (70%) create mode 100644 dbt/models/base/acteurs/base_acteur_type.sql delete mode 100644 dbt/models/marts/enrich/marts_enrich_acteurs_closed_replaced_other_siren.sql delete mode 100644 dbt/models/marts/enrich/marts_enrich_acteurs_closed_replaced_same_siren.sql rename dbt/models/marts/enrich/{marts_enrich_acteurs_closed_not_replaced.sql => marts_enrich_acteurs_closed_suggest_not_replaced.sql} (50%) create mode 100644 dbt/models/marts/enrich/marts_enrich_acteurs_closed_suggest_replaced_other_siren.sql create mode 100644 dbt/models/marts/enrich/marts_enrich_acteurs_closed_suggest_replaced_same_siren.sql create mode 100644 templates/data/_partials/dicts_to_table.html delete mode 100644 templates/data/_partials/generic_suggestion_details.html create mode 100644 templates/data/_partials/suggestion_details_changes.html diff --git a/dags/cluster/tasks/business_logic/misc/data_serialize_reconstruct.py b/dags/cluster/tasks/business_logic/misc/data_serialize_reconstruct.py new file mode 100644 index 000000000..aab3757da --- /dev/null +++ b/dags/cluster/tasks/business_logic/misc/data_serialize_reconstruct.py @@ -0,0 +1,100 @@ +"""Functions to serialize and reconstruct the Django +data as we need to pass it over wire/JSON and thus have +to lose Python/Django objects""" + +from datetime import datetime + +from django.contrib.gis.geos import Point +from django.db import models + + +def data_serialize(model: type[models.Model], data: dict) -> dict: + """ + Serialize a dictionary to match the Django model structure. + + Args: + - model_class: The Django model class. + - data: The dictionary containing the data to serialize. + + Returns: + - A dictionary with values adjusted to match the model's requirements. + """ + result = {} + + for key, value in data.items(): + field = model._meta.get_field(key) + + # We don't try to be fancy with None, it's None + if value is None: + # Due to clean_location check on Acteur model + # which prevents None if acteur is non-digital + # AND the fact that we can't know for sure whether + # acteur is digital or not, we just skip None locations + # TODO: we need to revamp the validation architecture + # as those if-elses all over the code are not maintainable + if key == "location": + continue + else: + result[key] = value + elif isinstance(field, models.ForeignKey): + if isinstance(value, (str, int)): + result[key] = value + else: + result[key] = value.pk + elif key == "location": + result["longitude"] = data["location"].x + result["latitude"] = data["location"].y + elif isinstance(value, datetime): + result[key] = value.isoformat() + else: + result[key] = value + + return result + + +def data_reconstruct(model: type[models.Model], data_src: dict) -> dict: + """ + Reconstruct data ready to use in Django model. + + Args: + - model_class: The Django model class. + - data: The dictionary containing the data to reconstruct. + + Returns: + - An instance of the model with the data populated. 
+ """ + result = {} + data = data_src.copy() + + if "longitude" in data and "latitude" in data: + result["location"] = Point(data.pop("longitude"), data.pop("latitude")) + + for key, value in data.items(): + field = model._meta.get_field(key) + + # We don't try to be fancy with None, it's None + if value is None: + # Same explanation as in data_serialize + if key == "location": + continue + else: + result[key] = value + elif isinstance(field, models.ForeignKey): + # Django seems to handle both {field}_id and {field} transparently + # but since we reconstruct for Django, we favor the {field} flavour, + # this prevents having inconsistent representations when we work with + # Django vs. DBT models + if key.endswith("_id"): + try: + key_no_id = key.rstrip("_id") + field = model._meta.get_field(key_no_id) + key = key_no_id + except Exception: + pass + related_instance = field.related_model.objects.get(pk=value) # type: ignore + result[key] = related_instance + + else: + result[key] = value + + return result diff --git a/dags/enrich/config/__init__.py b/dags/enrich/config/__init__.py index b6c1b63e6..26fceffac 100644 --- a/dags/enrich/config/__init__.py +++ b/dags/enrich/config/__init__.py @@ -1,4 +1,4 @@ -from .cohorts import COHORTS # noqa: F401 +from .cohorts import COHORTS, Cohort # noqa: F401 from .columns import COLS # noqa: F401 from .dbt import DBT # noqa: F401 from .models import DAG_ID_TO_CONFIG_MODEL, EnrichActeursClosedConfig # noqa: F401 diff --git a/dags/enrich/config/cohorts.py b/dags/enrich/config/cohorts.py index dbdee29b2..c601baaca 100644 --- a/dags/enrich/config/cohorts.py +++ b/dags/enrich/config/cohorts.py @@ -5,8 +5,23 @@ INTRO = "🚪 Acteurs Fermés:" +@dataclass(frozen=True) +class Cohort: + code: str + label: str + + @dataclass(frozen=True) class COHORTS: - CLOSED_NOT_REPLACED: str = f"{INTRO} 🔴 non remplacés" - CLOSED_REP_OTHER_SIREN: str = f"{INTRO} 🟡 remplacés par SIRET d'un autre SIREN" - CLOSED_REP_SAME_SIREN: str = f"{INTRO} 🟢 remplacés par SIRET du même SIREN" + CLOSED_NOT_REPLACED: Cohort = Cohort( + code="acteurs_closed_not_replaced", + label=f"{INTRO} 🔴 non remplacés", + ) + CLOSED_REP_OTHER_SIREN: Cohort = Cohort( + code="acteurs_closed_replaced_other_siren", + label=f"{INTRO} 🟡 remplacés par SIRET d'un autre SIREN", + ) + CLOSED_REP_SAME_SIREN: Cohort = Cohort( + code="acteurs_closed_replaced_same_siren", + label=f"{INTRO} 🟢 remplacés par SIRET du même SIREN", + ) diff --git a/dags/enrich/config/columns.py b/dags/enrich/config/columns.py index a5a890999..697682522 100644 --- a/dags/enrich/config/columns.py +++ b/dags/enrich/config/columns.py @@ -10,13 +10,20 @@ class COLS: # Dry run DRY_RUN: str = "dry_run" + # Suggestions + SUGGEST_COHORT_CODE: str = "suggestion_cohorte_code" + SUGGEST_COHORT_LABEL: str = "suggestion_cohorte_label" + # COMMON SIREN: str = "siren" + SIRET: str = "siret" # QFDMO ACTEUR_ID: str = "acteur_id" - ACTEUR_TYPE: str = "acteur_type" - ACTEUR_SOURCE: str = "acteur_source" + ACTEUR_TYPE_ID: str = "acteur_type_id" + ACTEUR_TYPE_CODE: str = "acteur_type_code" + ACTEUR_SOURCE_ID: str = "acteur_source_id" + ACTEUR_SOURCE_CODE: str = "acteur_source_code" ACTEUR_SIRET: str = "acteur_siret" ACTEUR_NOM: str = "acteur_nom" ACTEUR_NOMS_ORIGINE: str = "acteur_noms_origine" @@ -27,7 +34,6 @@ class COLS: AE_DIRIGEANTS_NOMS: str = "ae_dirigeants_noms_prenoms" REMPLACER_SIRET: str = "remplacer_siret" REMPLACER_NOM: str = "remplacer_nom" - REMPLACER_COHORTE: str = "remplacer_cohorte" # Fields identical between acteurs and remplacements # hence replacer_ 
prefix not present on the model column names diff --git a/dags/enrich/config/dbt.py b/dags/enrich/config/dbt.py index dbfc0d1aa..1ae960ae9 100644 --- a/dags/enrich/config/dbt.py +++ b/dags/enrich/config/dbt.py @@ -8,11 +8,11 @@ class DBT: MARTS_ENRICH_AE_RGPD: str = "marts_enrich_ae_rgpd" MARTS_ENRICH_AE_CLOSED_CANDIDATES: str = "marts_enrich_acteurs_closed_candidates" MARTS_ENRICH_AE_CLOSED_REPLACED_SAME_SIREN: str = ( - "marts_enrich_acteurs_closed_replaced_same_siren" + "marts_enrich_acteurs_closed_suggest_replaced_same_siren" ) MARTS_ENRICH_AE_CLOSED_REPLACED_OTHER_SIREN: str = ( - "marts_enrich_acteurs_closed_replaced_other_siren" + "marts_enrich_acteurs_closed_suggest_replaced_other_siren" ) MARTS_ENRICH_AE_CLOSED_NOT_REPLACED: str = ( - "marts_enrich_acteurs_closed_not_replaced" + "marts_enrich_acteurs_closed_suggest_not_replaced" ) diff --git a/dags/enrich/dags/enrich_acteurs_closed.py b/dags/enrich/dags/enrich_acteurs_closed.py index b6aebcff7..fa3752321 100644 --- a/dags/enrich/dags/enrich_acteurs_closed.py +++ b/dags/enrich/dags/enrich_acteurs_closed.py @@ -8,17 +8,13 @@ COHORTS, DBT, TASKS, - XCOMS, EnrichActeursClosedConfig, ) -from enrich.tasks.airflow_logic.enrich_acteurs_closed_suggestions import ( - enrich_acteurs_closed_suggestions_task, -) from enrich.tasks.airflow_logic.enrich_config_create_task import ( enrich_config_create_task, ) -from enrich.tasks.airflow_logic.enrich_dbt_model_read_task import ( - enrich_dbt_model_read_task, +from enrich.tasks.airflow_logic.enrich_dbt_model_suggest_task import ( + enrich_dbt_model_suggest_task, ) from enrich.tasks.airflow_logic.enrich_dbt_models_refresh_task import ( enrich_dbt_models_refresh_task, @@ -45,66 +41,35 @@ start_date=START_DATES.FOR_SCHEDULE_NONE, params=config_to_airflow_params( EnrichActeursClosedConfig( + dbt_models_refresh=False, filter_equals__acteur_statut="ACTIF", ) ), ) as dag: - """ - chain( - enrich_config_create_task(dag), - enrich_dbt_model_read_task( - dag, - task_id=TASKS.ENRICH_CLOSED_CANDIDATES, - dbt_model_name=DBT.MARTS_ENRICH_AE_CLOSED_CANDIDATES, - xcom_push_key=XCOMS.DF_CLOSED_CANDIDATES, - ), - enrich_dbt_model_read_task( - dag, - task_id=TASKS.ENRICH_CLOSED_REPLACED, - dbt_model_name=DBT.MARTS_ENRICH_AE_CLOSED_REPLACED, - xcom_push_key=XCOMS.DF_CLOSED_REPLACED, - ), - ) - """ + # Instantiation config = enrich_config_create_task(dag) - refresh_dbt = enrich_dbt_models_refresh_task(dag) - replaced_same_siren = enrich_dbt_model_read_task( - dag, - task_id=TASKS.ENRICH_CLOSED_REPLACED_SAME_SIREN, - dbt_model_name=DBT.MARTS_ENRICH_AE_CLOSED_REPLACED_SAME_SIREN, - xcom_push_key=XCOMS.DF_CLOSED_REPLACED_SAME_SIREN, - ) - replaced_other_siren = enrich_dbt_model_read_task( + dbt_refresh = enrich_dbt_models_refresh_task(dag) + suggest_not_replaced = enrich_dbt_model_suggest_task( dag, - task_id=TASKS.ENRICH_CLOSED_REPLACED_OTHER_SIREN, - dbt_model_name=DBT.MARTS_ENRICH_AE_CLOSED_REPLACED_OTHER_SIREN, - xcom_push_key=XCOMS.DF_CLOSED_REPLACED_OTHER_SIREN, - ) - not_replaced = enrich_dbt_model_read_task( - dag, - task_id=TASKS.ENRICH_CLOSED_NOT_REPLACED, + task_id=TASKS.ENRICH_CLOSED_SUGGESTIONS_NOT_REPLACED, + cohort=COHORTS.CLOSED_NOT_REPLACED, dbt_model_name=DBT.MARTS_ENRICH_AE_CLOSED_NOT_REPLACED, - xcom_push_key=XCOMS.DF_CLOSED_NOT_REPLACED, ) - suggestions_same_siren = enrich_acteurs_closed_suggestions_task( - dag, - task_id=TASKS.ENRICH_CLOSED_SUGGESTIONS_SAME_SIREN, - cohort_type=COHORTS.CLOSED_REP_SAME_SIREN, - df_xcom_key=XCOMS.DF_CLOSED_REPLACED_SAME_SIREN, - ) - suggestions_other_siren = 
enrich_acteurs_closed_suggestions_task( + suggest_other_siren = enrich_dbt_model_suggest_task( dag, task_id=TASKS.ENRICH_CLOSED_SUGGESTIONS_OTHER_SIREN, - cohort_type=COHORTS.CLOSED_REP_OTHER_SIREN, - df_xcom_key=XCOMS.DF_CLOSED_REPLACED_OTHER_SIREN, + cohort=COHORTS.CLOSED_REP_OTHER_SIREN, + dbt_model_name=DBT.MARTS_ENRICH_AE_CLOSED_REPLACED_OTHER_SIREN, ) - suggestions_not_replaced = enrich_acteurs_closed_suggestions_task( + suggest_same_siren = enrich_dbt_model_suggest_task( dag, - task_id=TASKS.ENRICH_CLOSED_SUGGESTIONS_NOT_REPLACED, - cohort_type=COHORTS.CLOSED_NOT_REPLACED, - df_xcom_key=XCOMS.DF_CLOSED_NOT_REPLACED, + task_id=TASKS.ENRICH_CLOSED_SUGGESTIONS_SAME_SIREN, + cohort=COHORTS.CLOSED_REP_SAME_SIREN, + dbt_model_name=DBT.MARTS_ENRICH_AE_CLOSED_REPLACED_SAME_SIREN, ) - config >> refresh_dbt # type: ignore - refresh_dbt >> replaced_same_siren >> suggestions_same_siren # type: ignore - refresh_dbt >> replaced_other_siren >> suggestions_other_siren # type: ignore - refresh_dbt >> not_replaced >> suggestions_not_replaced # type: ignore + + # Graph + config >> dbt_refresh # type: ignore + dbt_refresh >> suggest_not_replaced # type: ignore + dbt_refresh >> suggest_other_siren # type: ignore + dbt_refresh >> suggest_same_siren # type: ignore diff --git a/dags/enrich/tasks/airflow_logic/enrich_acteurs_closed_suggestions.py b/dags/enrich/tasks/airflow_logic/enrich_acteurs_closed_suggestions.py deleted file mode 100644 index 39a743b8e..000000000 --- a/dags/enrich/tasks/airflow_logic/enrich_acteurs_closed_suggestions.py +++ /dev/null @@ -1,57 +0,0 @@ -"""Read data from DB needed for RGPD anonymization""" - -import logging - -from airflow import DAG -from airflow.models.taskinstance import TaskInstance -from airflow.operators.python import PythonOperator -from enrich.config import XCOMS, xcom_pull -from enrich.tasks.business_logic.enrich_acteurs_closed_suggestions import ( - enrich_acteurs_closed_suggestions, -) - -logger = logging.getLogger(__name__) - - -def task_info_get(task_id, df_xcom_key): - return f""" - ============================================================ - Description de la tâche "{task_id}" - ============================================================ - 💡 quoi: on génère les suggestions à partir de la df - {df_xcom_key} - - 🎯 pourquoi: le but de ce DAG - - 🏗️ comment: pour chaque acteur fermé, on génère 1 suggestion - """ - - -def enrich_acteurs_closed_suggestions_wrapper( - cohort_type: str, df_xcom_key: str, task_id: str, ti: TaskInstance, dag: DAG -) -> None: - logger.info(task_info_get(task_id, df_xcom_key)) - - # Config - config = xcom_pull(ti, XCOMS.CONFIG) - logger.info(f"📖 Configuration:\n{config.model_dump_json(indent=2)}") - - # Processing - enrich_acteurs_closed_suggestions( - df=xcom_pull(ti, df_xcom_key), - cohort_type=cohort_type, - identifiant_action=dag.dag_id, - dry_run=config.dry_run, - ) - - -def enrich_acteurs_closed_suggestions_task( - dag: DAG, task_id: str, cohort_type: str, df_xcom_key: str -) -> PythonOperator: - return PythonOperator( - task_id=task_id, - python_callable=enrich_acteurs_closed_suggestions_wrapper, - op_args=[cohort_type, df_xcom_key, task_id], - dag=dag, - doc_md=f"**Suggestions** pour la cohorte: {cohort_type}**", - ) diff --git a/dags/enrich/tasks/airflow_logic/enrich_dbt_model_read_task.py b/dags/enrich/tasks/airflow_logic/enrich_dbt_model_read_task.py deleted file mode 100644 index 325af36e8..000000000 --- a/dags/enrich/tasks/airflow_logic/enrich_dbt_model_read_task.py +++ /dev/null @@ -1,60 +0,0 @@ -"""Read data from DB 
needed for RGPD anonymization""" - -import logging - -from airflow import DAG -from airflow.exceptions import AirflowSkipException -from airflow.operators.python import PythonOperator -from airflow.utils.trigger_rule import TriggerRule -from enrich.config import DBT, TASKS, XCOMS, xcom_pull -from enrich.tasks.business_logic.enrich_dbt_model_read import ( - enrich_dbt_model_read, -) - -logger = logging.getLogger(__name__) - - -def task_info_get(): - return f""" - ============================================================ - Description de la tâche "{TASKS.READ_AE_RGPD}" - ============================================================ - 💡 quoi: lecture des données via le modèle DBT - {DBT.MARTS_ENRICH_AE_RGPD} - - 🎯 pourquoi: faire un pré-filtre sur les matches potentiels - (pas récupérer les ~27M de lignes de la table AE unite_legale) - - 🏗️ comment: on récupère uniquement les matches SIREN avec - des infos de noms/prénoms dans l'AE en passant par de la normalisation - de chaines de caractères - """ - - -def enrich_dbt_model_read_wrapper(dbt_model_name, xcom_push_key, ti) -> None: - logger.info(task_info_get()) - - # Config - config = xcom_pull(ti, XCOMS.CONFIG) - logger.info(f"📖 Configuration:\n{config.model_dump_json(indent=2)}") - - # Processing - df = enrich_dbt_model_read(dbt_model_name=dbt_model_name, filters=config.filters) - if df.empty: - raise AirflowSkipException("Pas de données DB, on s'arrête là") - - # Result - ti.xcom_push(key=xcom_push_key, value=df) - - -def enrich_dbt_model_read_task( - dag: DAG, task_id: str, dbt_model_name: str, xcom_push_key: str -) -> PythonOperator: - return PythonOperator( - task_id=task_id, - python_callable=enrich_dbt_model_read_wrapper, - op_args=[dbt_model_name, xcom_push_key], - dag=dag, - doc_md=f"**Lecture du modèle DBT**: `{dbt_model_name}`", - trigger_rule=TriggerRule.ALL_DONE, - ) diff --git a/dags/enrich/tasks/airflow_logic/enrich_dbt_model_suggest_task.py b/dags/enrich/tasks/airflow_logic/enrich_dbt_model_suggest_task.py new file mode 100644 index 000000000..d27b1cf12 --- /dev/null +++ b/dags/enrich/tasks/airflow_logic/enrich_dbt_model_suggest_task.py @@ -0,0 +1,67 @@ +"""Read data from DB needed for RGPD anonymization""" + +import logging + +from airflow import DAG +from airflow.exceptions import AirflowSkipException +from airflow.models.taskinstance import TaskInstance +from airflow.operators.python import PythonOperator +from airflow.utils.trigger_rule import TriggerRule +from enrich.config import XCOMS, Cohort, xcom_pull +from enrich.tasks.business_logic.enrich_dbt_model_suggest import ( + enrich_dbt_model_suggest, +) + +logger = logging.getLogger(__name__) + + +def task_info_get(task_id, df_xcom_key): + return f""" + ============================================================ + Description de la tâche "{task_id}" + ============================================================ + 💡 quoi: on génère les suggestions à partir de la df + {df_xcom_key} + + 🎯 pourquoi: le but de ce DAG + + 🏗️ comment: pour chaque acteur fermé, on génère 1 suggestion + """ + + +def enrich_dbt_model_suggest_wrapper( + task_id: str, + cohort: Cohort, + dbt_model_name: str, + ti: TaskInstance, + dag: DAG, +) -> None: + logger.info(task_info_get(task_id, dbt_model_name)) + + # Config + config = xcom_pull(ti, XCOMS.CONFIG) + logger.info(f"📖 Configuration:\n{config.model_dump_json(indent=2)}") + + # Processing + suggestions_written = enrich_dbt_model_suggest( + dbt_model_name=dbt_model_name, + filters=config.filters, + cohort=cohort, + identifiant_action=dag.dag_id, 
+ dry_run=config.dry_run, + ) + if not suggestions_written: + raise AirflowSkipException("Pas de suggestions écrites") + + +def enrich_dbt_model_suggest_task( + dag: DAG, task_id: str, cohort: Cohort, dbt_model_name: str +) -> PythonOperator: + return PythonOperator( + task_id=task_id, + python_callable=enrich_dbt_model_suggest_wrapper, + op_args=[task_id, cohort, dbt_model_name], + dag=dag, + doc_md=f"**Suggestions** pour la cohorte: {cohort.label}**", + trigger_rule=TriggerRule.ALL_DONE, + ) diff --git a/dags/enrich/tasks/airflow_logic/enrich_read_dbt_model_task.py b/dags/enrich/tasks/airflow_logic/enrich_read_dbt_model_task.py deleted file mode 100644 index 72a54c040..000000000 --- a/dags/enrich/tasks/airflow_logic/enrich_read_dbt_model_task.py +++ /dev/null @@ -1,58 +0,0 @@ -"""Read data from DB needed for RGPD anonymization""" - -import logging - -from airflow import DAG -from airflow.exceptions import AirflowSkipException -from airflow.operators.python import PythonOperator -from enrich.config import DBT, TASKS, XCOMS, xcom_pull -from enrich.tasks.business_logic.enrich_dbt_model_read import ( - enrich_dbt_model_read, -) - -logger = logging.getLogger(__name__) - - -def task_info_get(): - return f""" - ============================================================ - Description de la tâche "{TASKS.READ_AE_RGPD}" - ============================================================ - 💡 quoi: lecture des données via le modèle DBT - {DBT.MARTS_ENRICH_AE_RGPD} - - 🎯 pourquoi: faire un pré-filtre sur les matches potentiels - (pas récupérer les ~27M de lignes de la table AE unite_legale) - - 🏗️ comment: on récupère uniquement les matches SIREN avec - des infos de noms/prénoms dans l'AE en passant par de la normalisation - de chaines de caractères - """ - - -def enrich_dbt_model_read_wrapper(dbt_model_name, xcom_push_key, ti) -> None: - logger.info(task_info_get()) - - # Config - config = xcom_pull(ti, XCOMS.CONFIG) - logger.info(f"📖 Configuration:\n{config.model_dump_json(indent=2)}") - - # Processing - df = enrich_dbt_model_read(dbt_model_name=dbt_model_name, filters=config.filters) - if df.empty: - raise AirflowSkipException("Pas de données DB, on s'arrête là") - - # Result - ti.xcom_push(key=xcom_push_key, value=df) - - -def enrich_dbt_model_read_task( - dag: DAG, task_id: str, dbt_model_name: str, xcom_push_key: str -) -> PythonOperator: - return PythonOperator( - task_id=task_id, - python_callable=enrich_dbt_model_read_wrapper, - op_args=[dbt_model_name, xcom_push_key], - dag=dag, - doc_md=f"**Lecture du modèle DBT**: `{dbt_model_name}`", - ) diff --git a/dags/enrich/tasks/business_logic/enrich_dbt_model_suggest.py b/dags/enrich/tasks/business_logic/enrich_dbt_model_suggest.py new file mode 100644 index 000000000..e748fd4c6 --- /dev/null +++ b/dags/enrich/tasks/business_logic/enrich_dbt_model_suggest.py @@ -0,0 +1,29 @@ +import logging + +from enrich.config import Cohort +from enrich.tasks.business_logic.enrich_dbt_model_read import enrich_dbt_model_read +from enrich.tasks.business_logic.enrich_dbt_model_to_suggestions import ( + enrich_dbt_model_to_suggestions, +) + +logger = logging.getLogger(__name__) + + +def enrich_dbt_model_suggest( + dbt_model_name: str, + filters: list[dict], + cohort: Cohort, + identifiant_action: str, + dry_run: bool = True, +) -> bool: + """Reads a DBT model and generates suggestions for it""" + df = enrich_dbt_model_read(dbt_model_name, filters) + + if df.empty: + logger.info(f"0 donnée pour {dbt_model_name=} avec filtres {filters}") + return False + + 
suggestions_written = enrich_dbt_model_to_suggestions( + df, cohort, identifiant_action, dry_run + ) + return suggestions_written diff --git a/dags/enrich/tasks/business_logic/enrich_acteurs_closed_suggestions.py b/dags/enrich/tasks/business_logic/enrich_dbt_model_to_suggestions.py similarity index 70% rename from dags/enrich/tasks/business_logic/enrich_acteurs_closed_suggestions.py rename to dags/enrich/tasks/business_logic/enrich_dbt_model_to_suggestions.py index b00c34fd1..c6025d434 100644 --- a/dags/enrich/tasks/business_logic/enrich_acteurs_closed_suggestions.py +++ b/dags/enrich/tasks/business_logic/enrich_dbt_model_to_suggestions.py @@ -5,17 +5,17 @@ from cluster.tasks.business_logic.cluster_acteurs_parents_choose_new import ( parent_id_generate, ) -from enrich.config import COHORTS, COLS +from enrich.config import COHORTS, COLS, Cohort logger = logging.getLogger(__name__) -def enrich_acteurs_closed_suggestions( +def enrich_dbt_model_to_suggestions( df: pd.DataFrame, - cohort_type: str, + cohort: Cohort, identifiant_action: str, dry_run: bool = True, -) -> None: +) -> bool: from data.models import ( Suggestion, SuggestionAction, @@ -32,12 +32,12 @@ def enrich_acteurs_closed_suggestions( if df is None or df.empty: raise ValueError("df vide: on devrait pas être ici") - if cohort_type not in [ - COHORTS.CLOSED_NOT_REPLACED, - COHORTS.CLOSED_REP_OTHER_SIREN, - COHORTS.CLOSED_REP_SAME_SIREN, + if cohort.code not in [ + COHORTS.CLOSED_NOT_REPLACED.code, + COHORTS.CLOSED_REP_OTHER_SIREN.code, + COHORTS.CLOSED_REP_SAME_SIREN.code, ]: - raise ValueError(f"Mauvaise cohorte: {cohort_type=}") + raise ValueError(f"Mauvaise cohorte: {cohort=}") # Suggestions suggestions = [] @@ -47,21 +47,25 @@ def enrich_acteurs_closed_suggestions( # ----------------------------------------- # NOT REPLACED # ----------------------------------------- - if cohort_type == COHORTS.CLOSED_NOT_REPLACED: + if cohort == COHORTS.CLOSED_NOT_REPLACED: changes = [] model_params = { "id": row[COLS.ACTEUR_ID], "data": { + "identifiant_unique": row[COLS.ACTEUR_ID], "statut": ActeurStatus.INACTIF, + # TODO: fix inconsistency between acteur_siret and siret + # in non-replaced model + "siret": row[COLS.SIRET], "siret_is_closed": True, - "acteur_type": row[COLS.ACTEUR_TYPE], - "source": row[COLS.ACTEUR_SOURCE], + "acteur_type": row[COLS.ACTEUR_TYPE_ID], + "source": row[COLS.ACTEUR_SOURCE_ID], }, } ChangeActeurUpdateData(**model_params).validate() change = SuggestionChange( order=1, - reason=cohort_type, + reason="SIRET & SIREN fermés, 0 remplacement trouvé", entity_type="acteur_displayed", model_name=ChangeActeurUpdateData.name(), model_params=model_params, @@ -71,14 +75,16 @@ def enrich_acteurs_closed_suggestions( # ----------------------------------------- # REPLACED # ----------------------------------------- - elif cohort_type in [ + elif cohort in [ COHORTS.CLOSED_REP_OTHER_SIREN, COHORTS.CLOSED_REP_SAME_SIREN, ]: - cohortes = df[COLS.REMPLACER_COHORTE].unique() - if len(cohortes) > 1: - raise ValueError(f"Une seule cohorte à la fois: {cohortes=}") - logger.info(f"{cohort_type}: suggestion acteur id={row[COLS.ACTEUR_ID]}") + cohorts = df[COLS.SUGGEST_COHORT_CODE].unique() + if len(cohorts) > 1: + raise ValueError(f"Une seule cohorte à la fois: {cohorts=}") + if cohorts[0] != cohort.code: + raise ValueError(f"Mauvaise cohorte: {cohorts=} != {cohort=}") + logger.info(f"{cohort.label}: suggestion acteur id={row[COLS.ACTEUR_ID]}") changes = [] @@ -95,14 +101,15 @@ def enrich_acteurs_closed_suggestions( "siren": 
row[COLS.REMPLACER_SIRET][:9], "siret": row[COLS.REMPLACER_SIRET], "naf_principal": row[COLS.REMPLACER_NAF], - "acteur_type": row[COLS.ACTEUR_TYPE], + "acteur_type": row[COLS.ACTEUR_TYPE_ID], "source": None, + "statut": ActeurStatus.ACTIF, }, } ChangeActeurCreateAsParent(**model_params).validate() change = SuggestionChange( order=1, - reason=cohort_type, + reason="besoin d'un parent pour nouvel acteur", entity_type="acteur_displayed", model_name=ChangeActeurCreateAsParent.name(), model_params=model_params, @@ -113,22 +120,25 @@ def enrich_acteurs_closed_suggestions( model_params = { "id": row[COLS.ACTEUR_ID], "data": { - "statut": ActeurStatus.INACTIF, + "identifiant_unique": row[COLS.ACTEUR_ID], "parent": parent_id, "parent_reason": ( f"SIRET {row[COLS.ACTEUR_SIRET]} " f"détecté le {today} comme fermé dans AE, " f"remplacé par SIRET {row[COLS.REMPLACER_SIRET]}" ), + "siren": row[COLS.ACTEUR_SIRET][:9], + "siret": row[COLS.ACTEUR_SIRET], "siret_is_closed": True, - "acteur_type": row[COLS.ACTEUR_TYPE], - "source": row[COLS.ACTEUR_SOURCE], + "acteur_type": row[COLS.ACTEUR_TYPE_ID], + "source": row[COLS.ACTEUR_SOURCE_ID], + "statut": ActeurStatus.INACTIF, }, } ChangeActeurUpdateData(**model_params).validate() change = SuggestionChange( order=2, - reason=cohort_type, + reason="rattaché au parent", entity_type="acteur_displayed", model_name=ChangeActeurUpdateData.name(), model_params=model_params, @@ -136,7 +146,7 @@ def enrich_acteurs_closed_suggestions( changes.append(change) else: - raise ValueError(f"Mauvaise cohorte: {cohort_type=}") + raise ValueError(f"Mauvaise cohorte: {cohort=}") # Generic to all cohorts suggestions.append( @@ -144,7 +154,7 @@ def enrich_acteurs_closed_suggestions( # TODO: free format thanks to recursive model "contexte": {}, "suggestion": { - "title": cohort_type, + "title": cohort.label, "summary": [], "changes": changes, }, @@ -156,23 +166,26 @@ def enrich_acteurs_closed_suggestions( # ----------------------------------------- if dry_run: logger.info("✋ Dry run: suggestions pas écrites en base") - return + suggestions_written = False + return suggestions_written # ----------------------------------------- # SUGGESTION: WRITE TO DB # ----------------------------------------- - cohort = SuggestionCohorte( + db_cohort = SuggestionCohorte( identifiant_action=identifiant_action, - identifiant_execution=f"{cohort_type}", + identifiant_execution=f"{cohort.label}", statut=SuggestionStatut.AVALIDER, type_action=SuggestionAction.ENRICH_ACTEURS_CLOSED, metadata={"🔢 Nombre de suggestions": len(suggestions)}, ) - cohort.save() + db_cohort.save() for suggestion in suggestions: Suggestion( - suggestion_cohorte=cohort, + suggestion_cohorte=db_cohort, statut=SuggestionStatut.AVALIDER, contexte=suggestion["contexte"], suggestion=suggestion["suggestion"], ).save() + suggestions_written = True + return suggestions_written diff --git a/dags_unit_tests/enrich/config/test_enrich_acteurs_closed_config.py b/dags_unit_tests/enrich/config/test_enrich_acteurs_closed_config.py index 595536f35..cb51d00b2 100644 --- a/dags_unit_tests/enrich/config/test_enrich_acteurs_closed_config.py +++ b/dags_unit_tests/enrich/config/test_enrich_acteurs_closed_config.py @@ -1,6 +1,5 @@ import pytest - -from dags.enrich.config.models import EnrichActeursClosedConfig +from enrich.config.models import EnrichActeursClosedConfig class TestEnrichClosedConfig: diff --git a/dags_unit_tests/enrich/tasks/test_enrich_acteurs_closed_suggestions.py b/dags_unit_tests/enrich/tasks/test_enrich_acteurs_closed_suggestions.py 
b/dags_unit_tests/enrich/tasks/test_enrich_acteurs_closed_suggestions.py
index d71d9bd97..792a45df2 100644
--- a/dags_unit_tests/enrich/tasks/test_enrich_acteurs_closed_suggestions.py
+++ b/dags_unit_tests/enrich/tasks/test_enrich_acteurs_closed_suggestions.py
@@ -6,10 +6,9 @@
     parent_id_generate,
 )
 from django.contrib.gis.geos import Point
-
-from dags.enrich.config import COHORTS, COLS
-from dags.enrich.tasks.business_logic.enrich_acteurs_closed_suggestions import (
-    enrich_acteurs_closed_suggestions,
+from enrich.config import COHORTS, COLS
+from enrich.tasks.business_logic.enrich_dbt_model_to_suggestions import (
+    enrich_dbt_model_to_suggestions,
 )
 
 TODAY = datetime.now(timezone.utc).strftime("%Y-%m-%d")
@@ -38,8 +37,8 @@ def df_not_replaced(self, atype, source):
                 COLS.ACTEUR_ID: ["a01", "a02"],
                 COLS.ACTEUR_SIRET: ["00000000000001", "00000000000002"],
                 COLS.ACTEUR_NOM: ["AVANT a01", "AVANT a02"],
-                COLS.ACTEUR_TYPE: [atype.pk, atype.pk],
-                COLS.ACTEUR_SOURCE: [source.pk, source.pk],
+                COLS.ACTEUR_TYPE_ID: [atype.pk, atype.pk],
+                COLS.ACTEUR_SOURCE_ID: [source.pk, source.pk],
             }
         )
 
@@ -54,8 +53,8 @@ def df_replaced(self, atype, source):
                     "22222222200001",
                     "44444444400001",
                 ],
-                COLS.ACTEUR_TYPE: [atype.pk, atype.pk, atype.pk],
-                COLS.ACTEUR_SOURCE: [source.pk, source.pk, source.pk],
+                COLS.ACTEUR_TYPE_ID: [atype.pk, atype.pk, atype.pk],
+                COLS.ACTEUR_SOURCE_ID: [source.pk, source.pk, source.pk],
                 # Replacement data
                 COLS.REMPLACER_SIRET: [
                     "11111111100002",
@@ -63,7 +62,11 @@
                     "55555555500001",
                 ],
                 COLS.REMPLACER_NOM: ["APRES a1", "APRES a2", "APRES a3"],
-                COLS.REMPLACER_COHORTE: ["meme_siret", "autre_siret", "autre_siret"],
+                COLS.SUGGEST_COHORT_LABEL: [
+                    "meme_siret",
+                    "autre_siret",
+                    "autre_siret",
+                ],
                 COLS.REMPLACER_ADRESSE: ["Adresse1", "Adresse2", "Adresse3"],
                 COLS.REMPLACER_CODE_POSTAL: ["12345", "67890", "12345"],
                 COLS.REMPLACER_VILLE: ["Ville1", "Ville2", "Ville3"],
@@ -72,7 +75,7 @@
         )
 
     def test_df_replaced(self, df_replaced):
-        assert sorted(df_replaced[COLS.REMPLACER_COHORTE].unique()) == sorted(
+        assert sorted(df_replaced[COLS.SUGGEST_COHORT_LABEL].unique()) == sorted(
             [
                 "meme_siret",
                 "autre_siret",
@@ -81,11 +84,11 @@
 
     @pytest.fixture
     def df_replaced_meme_siret(self, df_replaced):
-        return df_replaced[df_replaced[COLS.REMPLACER_COHORTE] == "meme_siret"]
+        return df_replaced[df_replaced[COLS.SUGGEST_COHORT_LABEL] == "meme_siret"]
 
     @pytest.fixture
     def df_replaced_autre_siret(self, df_replaced):
-        return df_replaced[df_replaced[COLS.REMPLACER_COHORTE] == "autre_siret"]
+        return df_replaced[df_replaced[COLS.SUGGEST_COHORT_LABEL] == "autre_siret"]
 
     @pytest.fixture
     def acteurs(self, df_not_replaced, df_replaced, atype, source):
@@ -108,9 +111,9 @@
         from qfdmo.models import ActeurStatus, RevisionActeur
 
         # Write suggestions to DB
-        enrich_acteurs_closed_suggestions(
+        enrich_dbt_model_to_suggestions(
             df=df_not_replaced,
-            cohort_type=COHORTS.CLOSED_NOT_REPLACED,
+            cohort=COHORTS.CLOSED_NOT_REPLACED,
             identifiant_action="test_not_replaced",
             dry_run=False,
         )
@@ -143,9 +146,9 @@ def test_cohorte_meme_siren(self, acteurs, atype, source, df_replaced_meme_siret
         from qfdmo.models import ActeurStatus, RevisionActeur
 
         # Write suggestions to DB
-        enrich_acteurs_closed_suggestions(
+        enrich_dbt_model_to_suggestions(
             df=df_replaced_meme_siret,
-            cohort_type=COHORTS.CLOSED_REP_SAME_SIREN,
+            cohort=COHORTS.CLOSED_REP_SAME_SIREN,
             identifiant_action="test_meme_siren",
             dry_run=False,
         )
@@ -187,9 +190,9 @@ def test_cohorte_autre_siren(self, acteurs, df_replaced_autre_siret):
         from qfdmo.models import ActeurStatus, RevisionActeur
 
         # Write suggestions to DB
-        enrich_acteurs_closed_suggestions(
+        enrich_dbt_model_to_suggestions(
             df=df_replaced_autre_siret,
-            cohort_type=COHORTS.CLOSED_REP_OTHER_SIREN,
+            cohort=COHORTS.CLOSED_REP_OTHER_SIREN,
             identifiant_action="test_autre_siren",
             dry_run=False,
         )
diff --git a/dags_unit_tests/utils/test_data_serialize_reconstruct.py b/dags_unit_tests/utils/test_data_serialize_reconstruct.py
index df2f49eb9..dc3af503a 100644
--- a/dags_unit_tests/utils/test_data_serialize_reconstruct.py
+++ b/dags_unit_tests/utils/test_data_serialize_reconstruct.py
@@ -34,19 +34,16 @@ def data_init(self) -> dict:
             "location": POINT,
             "cree_le": DATETIME,
         }
-        print("data_init", f"{data=}")
         return data
 
     @pytest.fixture
     def data_serialized(self, data_init) -> dict:
         data = data_serialize(RevisionActeur, data_init)
-        print("data_serialized", f"{data=}")
         return data
 
     @pytest.fixture
     def data_reconstructed(self, data_serialized) -> dict:
         data = data_reconstruct(RevisionActeur, data_serialized)
-        print("data_reconstructed", f"{data=}")
         return data
 
     def test_data_reconstructed(self, data_reconstructed):
@@ -94,3 +91,18 @@ def test_none_cases(self, data_init):
         data = {"location": None}
         data = data_reconstruct(RevisionActeur, data)
         assert data == {}
+
+    def test_working_with_id_fields(self, data_init):
+        # When working with DBT, we have foreign keys being
+        # expressed as {field}_id fields (and not {field} like
+        # in Django models), and we test that data_reconstruct
+        # handles this transparently and forces {field} representation
+        data = data_init.copy()
+        # We switch from the Django representation (source) to the
+        # DBT representation (source_id)
+        data["source_id"] = data["source"].id
+        del data["source"]
+        ser = data_serialize(RevisionActeur, data)
+        rec = data_reconstruct(RevisionActeur, ser)
+        # The reconstruction should be in {field} format
+        assert rec["source"].id == data_init["source"].id
diff --git a/data/models/suggestion.py b/data/models/suggestion.py
index 606ee8e4d..c27182147 100644
--- a/data/models/suggestion.py
+++ b/data/models/suggestion.py
@@ -196,10 +196,11 @@ def display_suggestion_details(self):
             template_name = "data/_partials/clustering_suggestion_details.html"
         elif self.suggestion_cohorte.type_action == SuggestionAction.CRAWL_URLS:
             template_name = "data/_partials/crawl_urls_suggestion_details.html"
-        elif (
-            self.suggestion_cohorte.type_action == SuggestionAction.ENRICH_ACTEURS_RGPD
-        ):
-            template_name = "data/_partials/generic_suggestion_details.html"
+        elif self.suggestion_cohorte.type_action in [
+            SuggestionAction.ENRICH_ACTEURS_RGPD,
+            SuggestionAction.ENRICH_ACTEURS_CLOSED,
+        ]:
+            template_name = "data/_partials/suggestion_details_changes.html"
 
         template_context = self.suggestion
         # TODO: suggestions to migrate to PYDANTIC classes
diff --git a/dbt/README.md b/dbt/README.md
index 13faf5657..c56288ee9 100644
--- a/dbt/README.md
+++ b/dbt/README.md
@@ -30,6 +30,24 @@ Lancer les tests
 dbt run --select qfdmo.exhaustive_acteurs
 ```
 
+## Sampling
+
+ - 💡 **quoi**: utiliser une sous-partie de la donnée
+ - 🎯 **pourquoi**: itérer plus rapidement
+ - 🤔 **comment**:
+   - **Variable d'environnement** `DBT_SAMPLING` à mettre à `true`
+   - **Liberté par modèle**: d'implémenter du sampling ou pas, ex: `base_ae_etablissement.sql`
+    ```sql
+    {% if env_var('DBT_SAMPLING', 'false') == 'true' %}
+    ORDER BY siret DESC
+    LIMIT 1000000
+    {% endif %}
+    ```
+  - **Appliquer
le sampling**: en préfixant la commande dbt + ```bash + export DBT_SAMPLING='true' && dbt ... + ``` + ### Resources: - Learn more about dbt [in the docs](https://docs.getdbt.com/docs/introduction) - Check out [Discourse](https://discourse.getdbt.com/) for commonly asked questions and answers diff --git a/dbt/macros/table/macro_acteur.sql b/dbt/macros/table/macro_acteur.sql index 6657954e3..417c25625 100644 --- a/dbt/macros/table/macro_acteur.sql +++ b/dbt/macros/table/macro_acteur.sql @@ -5,6 +5,7 @@ SELECT DISTINCT efa.uuid, efa.nom, {{ field_empty('efa.description') }} AS description, efa.acteur_type_id, + efa.acteur_type_code, {{ field_empty('efa.adresse') }} AS adresse, {{ field_empty('efa.adresse_complement') }} AS adresse_complement, {{ field_empty('efa.code_postal') }} AS code_postal, @@ -18,6 +19,7 @@ SELECT DISTINCT efa.uuid, {{ field_empty('efa.siren') }} AS siren, {{ field_empty('efa.siret') }} AS siret, efa.source_id, + efa.source_code, efa.identifiant_externe, efa.naf_principal, {{ field_empty('efa.commentaires') }} AS commentaires, diff --git a/dbt/macros/udf/udf_ae_string_cleanup.sql b/dbt/macros/udf/udf_ae_string_cleanup.sql index 977416058..2608580f9 100644 --- a/dbt/macros/udf/udf_ae_string_cleanup.sql +++ b/dbt/macros/udf/udf_ae_string_cleanup.sql @@ -6,7 +6,8 @@ to NULL for easier processing whenever we consider it to be empty. */ -CREATE OR REPLACE FUNCTION {{ target.schema }}.udf_ae_string_cleanup(val TEXT) +DROP FUNCTION IF EXISTS {{ target.schema }}.udf_ae_string_cleanup(TEXT) CASCADE; +CREATE FUNCTION {{ target.schema }}.udf_ae_string_cleanup(val TEXT) RETURNS TEXT AS $$ BEGIN IF TRIM(val) = '' OR TRIM(val) = '[ND]' THEN diff --git a/dbt/macros/udf/udf_columns_concat_unique_non_empty.sql b/dbt/macros/udf/udf_columns_concat_unique_non_empty.sql index fc7423f5b..6bc2b34b2 100644 --- a/dbt/macros/udf/udf_columns_concat_unique_non_empty.sql +++ b/dbt/macros/udf/udf_columns_concat_unique_non_empty.sql @@ -2,8 +2,7 @@ /* Concatenate strings from various columns while only retaining non-empty values */ - -DROP FUNCTION IF EXISTS {{ target.schema }}.udf_columns_concat_unique_non_empty(VARIADIC input_columns TEXT[]); +DROP FUNCTION IF EXISTS {{ target.schema }}.udf_columns_concat_unique_non_empty(VARIADIC input_columns TEXT[]) CASCADE; CREATE FUNCTION {{ target.schema }}.udf_columns_concat_unique_non_empty(VARIADIC input_columns TEXT[]) RETURNS TEXT AS $$ DECLARE diff --git a/dbt/macros/udf/udf_columns_words_in_common_count.sql b/dbt/macros/udf/udf_columns_words_in_common_count.sql index 616cfcbf7..e2f1e6afc 100644 --- a/dbt/macros/udf/udf_columns_words_in_common_count.sql +++ b/dbt/macros/udf/udf_columns_words_in_common_count.sql @@ -2,7 +2,8 @@ /* Count number of words in common between 2 columns */ -CREATE OR REPLACE FUNCTION {{ target.schema }}.udf_columns_words_in_common_count(col1 text, col2 text) +DROP FUNCTION IF EXISTS {{ target.schema }}.udf_columns_words_in_common_count(text, text) CASCADE; +CREATE FUNCTION {{ target.schema }}.udf_columns_words_in_common_count(col1 text, col2 text) RETURNS integer AS $$ DECLARE word text; diff --git a/dbt/macros/udf/udf_encode_base57.sql b/dbt/macros/udf/udf_encode_base57.sql index b251e509d..a6756eafc 100644 --- a/dbt/macros/udf/udf_encode_base57.sql +++ b/dbt/macros/udf/udf_encode_base57.sql @@ -1,8 +1,8 @@ {% macro create_udf_encode_base57() %} -DROP FUNCTION IF EXISTS {{ target.schema }}.encode_base57(uuid); +DROP FUNCTION IF EXISTS {{ target.schema }}.encode_base57(uuid) CASCADE; CREATE FUNCTION {{ target.schema 
}}.encode_base57(uuid UUID) -RETURNS varchar(22) AS $$ +RETURNS text AS $$ DECLARE alphabet text := '23456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz'; -- pragma: allowlist secret result text := ''; diff --git a/dbt/macros/udf/udf_normalize_string_alpha_for_match.sql b/dbt/macros/udf/udf_normalize_string_alpha_for_match.sql index affea32c5..0bd1c3c05 100644 --- a/dbt/macros/udf/udf_normalize_string_alpha_for_match.sql +++ b/dbt/macros/udf/udf_normalize_string_alpha_for_match.sql @@ -7,20 +7,29 @@ with directors' names, hence normalization for a pre-filtering in SQL E.g. to test this function: - SELECT udf_normalize_string_alpha_for_match(' Héllo-Wørld! Ça va? 123 '); + SELECT udf_normalize_string_for_match(' Héllo-Wørld! Ça va? 123 '); */ - -DROP FUNCTION IF EXISTS {{ target.schema }}.udf_normalize_string_alpha_for_match(input_text TEXT); -CREATE FUNCTION {{ target.schema }}.udf_normalize_string_alpha_for_match(input_text TEXT) RETURNS TEXT AS $$ +DROP FUNCTION IF EXISTS {{ target.schema }}.udf_normalize_string_for_match(TEXT) CASCADE; +CREATE FUNCTION {{ target.schema }}.udf_normalize_string_for_match(input_text TEXT) RETURNS TEXT AS $$ DECLARE normalized TEXT; + words TEXT[]; BEGIN + -- Step 1: Normalize the string normalized := unaccent(input_text); normalized := lower(normalized); normalized := regexp_replace(normalized, '[^a-z]', ' ', 'g'); normalized := regexp_replace(normalized, '\s+', ' ', 'g'); normalized := trim(normalized); + -- Step 2: Split into words, sort alphabetically, and rejoin + words := string_to_array(normalized, ' '); + SELECT string_agg(word, ' ') INTO normalized + FROM ( + SELECT unnest(words) AS word + ORDER BY word + ) AS words_sorted; + RETURN normalized; END; $$ LANGUAGE plpgsql; diff --git a/dbt/macros/udf/udf_safe_divmod.sql b/dbt/macros/udf/udf_safe_divmod.sql index 14dd02195..3c36d20f5 100644 --- a/dbt/macros/udf/udf_safe_divmod.sql +++ b/dbt/macros/udf/udf_safe_divmod.sql @@ -1,6 +1,5 @@ {% macro create_udf_safe_divmod() %} - -DROP FUNCTION IF EXISTS {{ target.schema }}.safe_divmod(n numeric, d numeric); +DROP FUNCTION IF EXISTS {{ target.schema }}.safe_divmod(numeric, numeric) CASCADE; CREATE FUNCTION {{ target.schema }}.safe_divmod(n numeric, d numeric) RETURNS TABLE(quotient numeric, remainder numeric) AS $$ DECLARE diff --git a/dbt/macros/udf/udf_uuid_to_int.sql b/dbt/macros/udf/udf_uuid_to_int.sql index 87a5ec7cb..45a437172 100644 --- a/dbt/macros/udf/udf_uuid_to_int.sql +++ b/dbt/macros/udf/udf_uuid_to_int.sql @@ -1,6 +1,5 @@ {% macro create_udf_uuid_to_int() %} - -DROP FUNCTION IF EXISTS {{ target.schema }}.uuid_to_int(uuid UUID); +DROP FUNCTION IF EXISTS {{ target.schema }}.uuid_to_int(uuid) CASCADE; CREATE FUNCTION {{ target.schema }}.uuid_to_int(uuid UUID) RETURNS numeric AS $$ DECLARE diff --git a/dbt/models/base/acteurs/base_acteur.sql b/dbt/models/base/acteurs/base_acteur.sql index 209b43f2f..321da9544 100644 --- a/dbt/models/base/acteurs/base_acteur.sql +++ b/dbt/models/base/acteurs/base_acteur.sql @@ -1 +1,4 @@ -select * from {{ source('qfdmo', 'qfdmo_acteur') }} \ No newline at end of file +select * from {{ source('qfdmo', 'qfdmo_acteur') }} +{% if env_var('DBT_SAMPLING', 'false') == 'true' %} +TABLESAMPLE SYSTEM (10) +{% endif %} diff --git a/dbt/models/base/acteurs/base_acteur_type.sql b/dbt/models/base/acteurs/base_acteur_type.sql new file mode 100644 index 000000000..cf1d4a247 --- /dev/null +++ b/dbt/models/base/acteurs/base_acteur_type.sql @@ -0,0 +1 @@ +select * from {{ source('qfdmo', 'qfdmo_acteurtype') }} \ No newline at 
end of file diff --git a/dbt/models/base/acteurs/schema.yml b/dbt/models/base/acteurs/schema.yml index 157b6e544..1e3c5b3b9 100644 --- a/dbt/models/base/acteurs/schema.yml +++ b/dbt/models/base/acteurs/schema.yml @@ -338,3 +338,17 @@ models: description: "The logo_file for this table" - name: licence description: "The licence for this table" + - name: base_acteur_type + description: "Types d'acteurs" + columns: + - name: id + description: "clef primaire" + data_tests: + - not_null + - name: libelle + description: "Nom/description du type d'acteur (ex: Association, entreprise de l'ESS)" + - name: code + description: "Code du type d'acteur (ex: ess)" + data_tests: + - not_null + - unique diff --git a/dbt/models/base/ae_annuaire_entreprises/base_ae_etablissement.sql b/dbt/models/base/ae_annuaire_entreprises/base_ae_etablissement.sql index 22909b05b..2cb1da23f 100644 --- a/dbt/models/base/ae_annuaire_entreprises/base_ae_etablissement.sql +++ b/dbt/models/base/ae_annuaire_entreprises/base_ae_etablissement.sql @@ -13,30 +13,29 @@ Notes: SELECT - -- Codes -{{ udf_ae_string_cleanup(siret) }} AS siret, -{{ udf_ae_string_cleanup(activite_principale) }} AS activite_principale, +udf_ae_string_cleanup(siret) AS siret, +udf_ae_string_cleanup(activite_principale) AS activite_principale, -- Names -{{ udf_ae_string_cleanup(denomination_usuelle) }} AS denomination_usuelle, +udf_ae_string_cleanup(denomination_usuelle) AS denomination_usuelle, -- Status -{{ udf_ae_string_cleanup(etat_administratif) }} AS etat_administratif, +udf_ae_string_cleanup(etat_administratif) AS etat_administratif, -- Address -{{ udf_ae_string_cleanup(numero_voie) }} AS numero_voie, -{{ udf_ae_string_cleanup(complement_adresse) }} AS complement_adresse, -{{ udf_ae_string_cleanup(type_voie) }} AS type_voie, -{{ udf_ae_string_cleanup(libelle_voie) }} AS libelle_voie, -{{ udf_ae_string_cleanup(code_postal) }} AS code_postal, -{{ udf_ae_string_cleanup(libelle_commune) }} AS libelle_commune +udf_ae_string_cleanup(numero_voie) AS numero_voie, +udf_ae_string_cleanup(complement_adresse) AS complement_adresse, +udf_ae_string_cleanup(type_voie) AS type_voie, +udf_ae_string_cleanup(libelle_voie) AS libelle_voie, +udf_ae_string_cleanup(code_postal) AS code_postal, +udf_ae_string_cleanup(libelle_commune) AS libelle_commune FROM {{ source('ae', 'clone_ae_etablissement_in_use') }} -- Filtering out foreign establishments as our focus is France -- On 2025-03-17 this allows excluding ~316K rows WHERE code_pays_etranger IS NULL -{% if target.name == 'sampling' %} +{% if env_var('DBT_SAMPLING', 'false') == 'true' %} /* We can't do random sampling else we risk having no matching etablissement vs. unite legale. 
Can't sample on location as not available in unite to match, diff --git a/dbt/models/base/ae_annuaire_entreprises/base_ae_unite_legale.sql b/dbt/models/base/ae_annuaire_entreprises/base_ae_unite_legale.sql index f248f640f..49c353ae3 100644 --- a/dbt/models/base/ae_annuaire_entreprises/base_ae_unite_legale.sql +++ b/dbt/models/base/ae_annuaire_entreprises/base_ae_unite_legale.sql @@ -15,24 +15,24 @@ Notes: SELECT -- Codes -{{ udf_ae_string_cleanup(siren) }} AS siren, -{{ udf_ae_string_cleanup(activite_principale) }} AS activite_principale, +udf_ae_string_cleanup(siren) AS siren, +udf_ae_string_cleanup(activite_principale) AS activite_principale, -- Status -{{ udf_ae_string_cleanup(etat_administratif) }} AS etat_administratif, +udf_ae_string_cleanup(etat_administratif) AS etat_administratif, -- Business names -{{ udf_ae_string_cleanup(denomination) }} AS denomination, +udf_ae_string_cleanup(denomination) AS denomination, -- Director's names -{{ udf_ae_string_cleanup(prenom1) }} AS prenom1, -{{ udf_ae_string_cleanup(prenom2) }} AS prenom2, -{{ udf_ae_string_cleanup(prenom3) }} AS prenom3, -{{ udf_ae_string_cleanup(prenom4) }} AS prenom4, -{{ udf_ae_string_cleanup(prenom_usuel) }} AS prenom_usuel, -{{ udf_ae_string_cleanup(pseudonyme) }} AS pseudonyme, -{{ udf_ae_string_cleanup(nom) }} AS nom, -{{ udf_ae_string_cleanup(nom_usage) }} AS nom_usage +udf_ae_string_cleanup(prenom1) AS prenom1, +udf_ae_string_cleanup(prenom2) AS prenom2, +udf_ae_string_cleanup(prenom3) AS prenom3, +udf_ae_string_cleanup(prenom4) AS prenom4, +udf_ae_string_cleanup(prenom_usuel) AS prenom_usuel, +udf_ae_string_cleanup(pseudonyme) AS pseudonyme, +udf_ae_string_cleanup(nom) AS nom, +udf_ae_string_cleanup(nom_usage) AS nom_usage FROM {{ source('ae', 'clone_ae_unite_legale_in_use') }} /* We can't do random sampling else we risk having @@ -41,7 +41,7 @@ sample on location as not available in unite to match, falling back to latest SIRET/SIREN as they will give matches while representing recent data. 
*/ -{% if target.name == 'sampling' %} +{% if env_var('DBT_SAMPLING', 'false') == 'true' %} ORDER BY siren DESC -LIMIT 500000 -- 1 SIREN for 2 SIRET +LIMIT 500000 {% endif %} \ No newline at end of file diff --git a/dbt/models/intermediate/acteurs/int_acteur.sql b/dbt/models/intermediate/acteurs/int_acteur.sql index 45dac5061..836e5e026 100644 --- a/dbt/models/intermediate/acteurs/int_acteur.sql +++ b/dbt/models/intermediate/acteurs/int_acteur.sql @@ -1,9 +1,22 @@ +WITH acteur_type_id_to_code AS ( + SELECT + id, + code + FROM {{ ref('base_acteur_type') }} +), source_id_to_code AS ( + SELECT + id, + code + FROM {{ ref('base_source') }} +) + SELECT CAST({{ target.schema }}.encode_base57(uuid_generate_v5('6ba7b810-9dad-11d1-80b4-00c04fd430c8'::uuid, COALESCE(ra.identifiant_unique, a.identifiant_unique)::text)) AS varchar(22)) AS uuid, {{ coalesce_empty('ra.identifiant_unique', 'a.identifiant_unique') }} AS identifiant_unique, {{ coalesce_empty('ra.nom', 'a.nom') }} AS nom, {{ coalesce_empty('ra.description', 'a.description') }} AS description, COALESCE(ra.acteur_type_id, a.acteur_type_id) AS acteur_type_id, + (SELECT code FROM acteur_type_id_to_code WHERE id = COALESCE(ra.acteur_type_id, a.acteur_type_id)) AS acteur_type_code, {{ coalesce_empty('ra.adresse', 'a.adresse') }} AS adresse, {{ coalesce_empty('ra.adresse_complement', 'a.adresse_complement') }} AS adresse_complement, {{ coalesce_empty('ra.code_postal', 'a.code_postal') }} AS code_postal, @@ -17,6 +30,7 @@ SELECT {{ coalesce_empty('ra.siren', 'a.siren') }} AS siren, {{ coalesce_empty('ra.siret', 'a.siret') }} AS siret, COALESCE(ra.source_id, a.source_id) AS source_id, + (SELECT code FROM source_id_to_code WHERE id = COALESCE(ra.source_id, a.source_id)) AS source_code, {{ coalesce_empty('ra.identifiant_externe', 'a.identifiant_externe') }} AS identifiant_externe, {{ coalesce_empty('ra.statut', 'a.statut') }} AS statut, {{ coalesce_empty('ra.naf_principal', 'a.naf_principal') }} AS naf_principal, diff --git a/dbt/models/intermediate/ae_annuaire_entreprises/int_ae_etablissement.sql b/dbt/models/intermediate/ae_annuaire_entreprises/int_ae_etablissement.sql index 2025cfd51..b313e9eac 100644 --- a/dbt/models/intermediate/ae_annuaire_entreprises/int_ae_etablissement.sql +++ b/dbt/models/intermediate/ae_annuaire_entreprises/int_ae_etablissement.sql @@ -11,6 +11,10 @@ Notes: indexes=[ {'columns': ['siret'], 'unique': True}, {'columns': ['est_actif']}, + {'columns': ['code_postal']}, + ], + post_hook=[ + "CREATE INDEX ON {{ this }}(adresse_numero) WHERE adresse_numero IS NOT NULL" ] ) }} @@ -24,7 +28,7 @@ SELECT CASE WHEN etab.denomination_usuelle IS NOT NULL THEN etab.denomination_usuelle WHEN etab.denomination_usuelle IS NULL AND unite.denomination IS NOT NULL THEN unite.denomination - ELSE {{ value_unavailable() }} -- To make this case explicit + ELSE {{ value_unavailable() }} END AS nom, /* diff --git a/dbt/models/intermediate/ae_annuaire_entreprises/int_ae_unite_legale.sql b/dbt/models/intermediate/ae_annuaire_entreprises/int_ae_unite_legale.sql index 6a7c8ef18..162f3b84e 100644 --- a/dbt/models/intermediate/ae_annuaire_entreprises/int_ae_unite_legale.sql +++ b/dbt/models/intermediate/ae_annuaire_entreprises/int_ae_unite_legale.sql @@ -45,14 +45,14 @@ SELECT - normalize to increase chances of matching - keep each column separate for a potential substring match */ - udf_normalize_string_alpha_for_match(nom) AS dirigeant_nom, - udf_normalize_string_alpha_for_match(nom_usage) AS dirigeant_nom_usage, - 
udf_normalize_string_alpha_for_match(pseudonyme) AS dirigeant_pseudonyme, - udf_normalize_string_alpha_for_match(prenom1) AS dirigeant_prenom1, - udf_normalize_string_alpha_for_match(prenom2) AS dirigeant_prenom2, - udf_normalize_string_alpha_for_match(prenom3) AS dirigeant_prenom3, - udf_normalize_string_alpha_for_match(prenom4) AS dirigeant_prenom4, - udf_normalize_string_alpha_for_match(prenom_usuel) AS dirigeant_prenom_usuel, + nom AS dirigeant_nom, + nom_usage AS dirigeant_nom_usage, + pseudonyme AS dirigeant_pseudonyme, + prenom1 AS dirigeant_prenom1, + prenom2 AS dirigeant_prenom2, + prenom3 AS dirigeant_prenom3, + prenom4 AS dirigeant_prenom4, + prenom_usuel AS dirigeant_prenom_usuel, -- TRUE if ANY names NOT NULL for more efficient pre-filtering COALESCE( nom, diff --git a/dbt/models/marts/enrich/marts_enrich_acteurs_closed_candidates.sql b/dbt/models/marts/enrich/marts_enrich_acteurs_closed_candidates.sql index 62a66bd84..bf4733a21 100644 --- a/dbt/models/marts/enrich/marts_enrich_acteurs_closed_candidates.sql +++ b/dbt/models/marts/enrich/marts_enrich_acteurs_closed_candidates.sql @@ -14,14 +14,24 @@ Notes: -- Starting from our acteurs we can match via SIRET WITH acteurs_with_siret AS ( SELECT + -- Common columns LEFT(siret,9) AS siren, siret, + + -- Acteur columns nom AS acteur_nom, - udf_normalize_string_alpha_for_match(nom) AS acteur_nom_normalise, + udf_normalize_string_for_match(nom) AS acteur_nom_normalise, identifiant_unique AS acteur_id, commentaires AS acteur_commentaires, statut AS acteur_statut, - acteur_type_id + acteur_type_id, + acteur_type_code, + source_id AS acteur_source_id, + source_code AS acteur_source_code, + adresse AS acteur_adresse, + code_postal AS acteur_code_postal, + ville AS acteur_ville + FROM {{ ref('marts_carte_acteur') }} WHERE siret IS NOT NULL AND siret != '' AND LENGTH(siret) = 14 ), @@ -30,21 +40,42 @@ not on unite closed (NOT unite_est_actif) because open unite might bring potential replacements */ etab_closed_candidates AS ( SELECT - etab.siret, - etab.unite_est_actif AS unite_est_actif, - etab.est_actif AS etab_est_actif, - etab.code_postal AS etab_code_postal, - etab.adresse AS etab_adresse, - etab.naf AS etab_naf, + -- Common columns (need to specify to avoid ambiguity) + acteurs.siren, + acteurs.siret, + + -- acteurs acteurs.acteur_id, + acteurs.acteur_type_id, + acteurs.acteur_type_code, + acteurs.acteur_source_id, + acteurs.acteur_source_code, acteurs.acteur_statut, acteurs.acteur_nom, acteurs.acteur_nom_normalise, acteurs.acteur_commentaires, - acteurs.acteur_type_id + acteurs.acteur_adresse, + acteurs.acteur_code_postal, + acteurs.acteur_ville, + + -- etablissement + etab.unite_est_actif AS unite_est_actif, + etab.est_actif AS etab_est_actif, + etab.code_postal AS etab_code_postal, + etab.adresse_numero AS etab_adresse_numero, + etab.adresse AS etab_adresse, + etab.adresse_complement AS etab_adresse_complement, + etab.naf AS etab_naf + FROM acteurs_with_siret AS acteurs JOIN {{ ref('int_ae_etablissement') }} AS etab ON acteurs.siret = etab.siret WHERE etab.est_actif IS FALSE -AND etab.numero_voie +/* To reduce false positives with generic addresses +such as ZA, ZI containing multiple instances of similar +stores (e.g. 
supermarkets), we force presence +of street number, which later will be used +as condition for matching */ +AND etab.adresse_numero IS NOT NULL ) + SELECT * FROM etab_closed_candidates \ No newline at end of file diff --git a/dbt/models/marts/enrich/marts_enrich_acteurs_closed_replaced.sql b/dbt/models/marts/enrich/marts_enrich_acteurs_closed_replaced.sql index 7ce1b427f..5cfba9be9 100644 --- a/dbt/models/marts/enrich/marts_enrich_acteurs_closed_replaced.sql +++ b/dbt/models/marts/enrich/marts_enrich_acteurs_closed_replaced.sql @@ -7,26 +7,35 @@ WITH potential_replacements AS ( SELECT + + -- Candidates candidates.acteur_id AS acteur_id, candidates.acteur_type_id AS acteur_type_id, + candidates.acteur_type_code AS acteur_type_code, + candidates.acteur_source_id AS acteur_source_id, + candidates.acteur_source_code AS acteur_source_code, candidates.acteur_statut AS acteur_statut, candidates.siret AS acteur_siret, - replacements.siret AS remplacer_siret, - CASE - WHEN LEFT(candidates.siret,9) = LEFT(replacements.siret,9) THEN 'siret_du_meme_siren' - ELSE 'siret_dun_autre_siren' - END AS remplacer_cohorte, candidates.acteur_nom, - replacements.nom AS remplacer_nom, - udf_columns_words_in_common_count( - candidates.acteur_nom_normalise, - udf_normalize_string_alpha_for_match(replacements.nom) - ) AS noms_nombre_mots_commun, candidates.acteur_commentaires AS acteur_commentaires, + candidates.acteur_adresse AS acteur_adresse, + candidates.acteur_code_postal AS acteur_code_postal, + candidates.acteur_ville AS acteur_ville, + + -- Replacements + replacements.siret AS remplacer_siret, + LEFT(candidates.siret,9) = LEFT(replacements.siret,9) AS remplacer_siret_is_from_same_siren, + replacements.nom AS remplacer_nom, replacements.naf AS naf, replacements.ville AS ville, replacements.code_postal AS code_postal, replacements.adresse AS adresse, + + -- Matching + udf_columns_words_in_common_count( + candidates.acteur_nom_normalise, + udf_normalize_string_for_match(replacements.nom) + ) AS noms_nombre_mots_commun, ROW_NUMBER() OVER ( PARTITION BY candidates.siret ORDER BY @@ -38,15 +47,25 @@ WITH potential_replacements AS ( -- Then etablissements with more words in common udf_columns_words_in_common_count( candidates.acteur_nom_normalise, - udf_normalize_string_alpha_for_match(replacements.nom) + udf_normalize_string_for_match(replacements.nom) ) DESC ) AS replacement_priority + /* + JOINS: candidates are our acteurs, replacements are etablissements + with a matching naf, code_postal, adresse and adresse_numero + */ FROM {{ ref('marts_enrich_acteurs_closed_candidates') }} AS candidates INNER JOIN {{ ref('int_ae_etablissement') }} AS replacements ON replacements.naf = candidates.etab_naf AND replacements.code_postal = candidates.etab_code_postal - AND replacements.adresse = candidates.etab_adresse + AND replacements.adresse_numero = candidates.etab_adresse_numero + AND udf_normalize_string_for_match(replacements.adresse) = udf_normalize_string_for_match(candidates.etab_adresse) WHERE replacements.est_actif + -- Fields which must be non-NULL for a replacement to be considered + AND replacements.code_postal IS NOT NULL + AND replacements.adresse IS NOT NULL + -- Number is crucial to avoid mismatches on generic addresses (e.g. ZA, ZI...) 
+ AND replacements.adresse_numero IS NOT NULL ) SELECT * FROM potential_replacements WHERE replacement_priority=1 diff --git a/dbt/models/marts/enrich/marts_enrich_acteurs_closed_replaced_other_siren.sql b/dbt/models/marts/enrich/marts_enrich_acteurs_closed_replaced_other_siren.sql deleted file mode 100644 index 9f5458cbb..000000000 --- a/dbt/models/marts/enrich/marts_enrich_acteurs_closed_replaced_other_siren.sql +++ /dev/null @@ -1,9 +0,0 @@ -{{ - config( - materialized = 'table', - tags=['marts', 'enrich', 'closed', 'ae', 'annuaire_entreprises', 'etablissement'], - ) -}} - -SELECT * FROM {{ ref('marts_enrich_acteurs_closed_replaced') }} -WHERE remplacer_cohorte = 'siret_dun_autre_siren' diff --git a/dbt/models/marts/enrich/marts_enrich_acteurs_closed_replaced_same_siren.sql b/dbt/models/marts/enrich/marts_enrich_acteurs_closed_replaced_same_siren.sql deleted file mode 100644 index 108927314..000000000 --- a/dbt/models/marts/enrich/marts_enrich_acteurs_closed_replaced_same_siren.sql +++ /dev/null @@ -1,9 +0,0 @@ -{{ - config( - materialized = 'table', - tags=['marts', 'enrich', 'closed', 'ae', 'annuaire_entreprises', 'etablissement'], - ) -}} - -SELECT * FROM {{ ref('marts_enrich_acteurs_closed_replaced') }} -WHERE remplacer_cohorte = 'siret_du_meme_siren' diff --git a/dbt/models/marts/enrich/marts_enrich_acteurs_closed_not_replaced.sql b/dbt/models/marts/enrich/marts_enrich_acteurs_closed_suggest_not_replaced.sql similarity index 50% rename from dbt/models/marts/enrich/marts_enrich_acteurs_closed_not_replaced.sql rename to dbt/models/marts/enrich/marts_enrich_acteurs_closed_suggest_not_replaced.sql index 1f821794f..49cffb4ea 100644 --- a/dbt/models/marts/enrich/marts_enrich_acteurs_closed_not_replaced.sql +++ b/dbt/models/marts/enrich/marts_enrich_acteurs_closed_suggest_not_replaced.sql @@ -1,6 +1,6 @@ /* -Acteurs which SIRENT & SIRET is closed in AE's etablissement -BUT for which we couldn't find replacements +Acteurs which SIREN & SIRET are closed in AE's etablissement +AND for which we couldn't find replacements */ {{ config( @@ -9,9 +9,14 @@ BUT for which we couldn't find replacements ) }} -SELECT * FROM {{ ref('marts_enrich_acteurs_closed_candidates') }} +SELECT + 'acteurs_closed_not_replaced' AS suggestion_cohorte_code, + '🚪 Acteurs Fermés: 🔴 non remplacés' AS suggestion_cohorte_label, + * +FROM {{ ref('marts_enrich_acteurs_closed_candidates') }} WHERE - /* In candidates we don't filter on unite_est_actif IS FALSE + /* In candidates we already filter on etab_est_actif IS FALSE + but we don't filter on unite_est_actif IS FALSE because it would prevent us from finding replacements for same unite, however for acteurs we consider fully closed we do apply that filter */ unite_est_actif is FALSE diff --git a/dbt/models/marts/enrich/marts_enrich_acteurs_closed_suggest_replaced_other_siren.sql b/dbt/models/marts/enrich/marts_enrich_acteurs_closed_suggest_replaced_other_siren.sql new file mode 100644 index 000000000..d973f2ca5 --- /dev/null +++ b/dbt/models/marts/enrich/marts_enrich_acteurs_closed_suggest_replaced_other_siren.sql @@ -0,0 +1,13 @@ +{{ + config( + materialized = 'table', + tags=['marts', 'enrich', 'closed', 'ae', 'annuaire_entreprises', 'etablissement'], + ) +}} + +SELECT + 'acteurs_closed_replaced_other_siren' AS suggestion_cohorte_code, + '🚪 Acteurs Fermés: 🟡 remplacés par SIRET d''un autre SIREN' AS suggestion_cohorte_label, + * +FROM {{ ref('marts_enrich_acteurs_closed_replaced') }} +WHERE remplacer_siret_is_from_same_siren IS FALSE diff --git 
a/dbt/models/marts/enrich/marts_enrich_acteurs_closed_suggest_replaced_same_siren.sql b/dbt/models/marts/enrich/marts_enrich_acteurs_closed_suggest_replaced_same_siren.sql new file mode 100644 index 000000000..c098377c1 --- /dev/null +++ b/dbt/models/marts/enrich/marts_enrich_acteurs_closed_suggest_replaced_same_siren.sql @@ -0,0 +1,13 @@ +{{ + config( + materialized = 'table', + tags=['marts', 'enrich', 'closed', 'ae', 'annuaire_entreprises', 'etablissement'], + ) +}} + +SELECT + 'acteurs_closed_replaced_same_siren' AS suggestion_cohorte_code, + '🚪 Acteurs Fermés: 🟢 remplacés par SIRET du même SIREN' AS suggestion_cohorte_label, + * +FROM {{ ref('marts_enrich_acteurs_closed_replaced') }} +WHERE remplacer_siret_is_from_same_siren IS TRUE diff --git a/dbt/models/marts/enrich/marts_enrich_ae_rgpd.sql b/dbt/models/marts/enrich/marts_enrich_ae_rgpd.sql index b57af9b3e..4922beaaa 100644 --- a/dbt/models/marts/enrich/marts_enrich_ae_rgpd.sql +++ b/dbt/models/marts/enrich/marts_enrich_ae_rgpd.sql @@ -22,7 +22,7 @@ WITH acteurs_with_siren AS ( ', , ', '') ) AS acteur_noms_origine, - udf_normalize_string_alpha_for_match(CONCAT(nom || ' ' || nom_officiel || ' ' || nom_commercial)) AS acteur_noms_normalises, + udf_normalize_string_for_match(CONCAT(nom || ' ' || nom_officiel || ' ' || nom_commercial)) AS acteur_noms_normalises, commentaires AS acteur_commentaires FROM {{ ref('marts_carte_acteur') }} /* diff --git a/dbt/models/source/source_acteur.yml b/dbt/models/source/source_acteur.yml index b5d6f20b6..3c8d7fdec 100644 --- a/dbt/models/source/source_acteur.yml +++ b/dbt/models/source/source_acteur.yml @@ -7,6 +7,7 @@ sources: - name : qfdmo_acteur - name : qfdmo_acteur_acteur_services - name : qfdmo_acteur_labels + - name : qfdmo_acteurtype - name : qfdmo_propositionservice - name : qfdmo_propositionservice_sous_categories - name : qfdmo_revisionacteur diff --git a/dbt/profiles.yml b/dbt/profiles.yml index 0d82812cb..ca26c20c1 100644 --- a/dbt/profiles.yml +++ b/dbt/profiles.yml @@ -2,18 +2,6 @@ default: target: qfdmo outputs: qfdmo: - type: postgres - host: "{{ env_var('POSTGRES_HOST', 'localhost') }}" - port: "{{ env_var('POSTGRES_PORT', 6543) | as_number }}" - user: "{{ env_var('POSTGRES_USER', 'qfdmo') }}" - password: "{{ env_var('POSTGRES_PASSWORD', 'qfdmo') }}" - dbname: "{{ env_var('POSTGRES_DB', 'qfdmo') }}" - schema: "{{ env_var('POSTGRES_SCHEMA', 'public') }}" - # To implement sampling logic for large tables - # (e.g. Annuaire Entreprises Etablissements = 40M rows) - # Each model is free to implement its own sampling strategy - # using {% if target.name == 'sampling' %}...{% endif %} - sampling: type: postgres host: "{{ env_var('POSTGRES_HOST', 'localhost') }}" port: "{{ env_var('POSTGRES_PORT', 6543) | as_number }}" diff --git a/templates/data/_partials/dicts_to_table.html b/templates/data/_partials/dicts_to_table.html new file mode 100644 index 000000000..83405d35d --- /dev/null +++ b/templates/data/_partials/dicts_to_table.html @@ -0,0 +1,33 @@ +{# Turns data:list[dict] into a table #} + + + + {% for key in data.0.keys %} + + {% endfor %} + + + + {% for row in data %} + + {% for value in row.values %} + + {% endfor %} + + {% endfor %} + +
{{ key|capfirst }}
+ {% if value|valuetype == "dict" %} + {% include "data/_partials/value_details.html" with value=value %}

+ {% if 'data' in value %} + {% if "nom" in value.data and "adresse" in value.data and "ville" in value.data and "code_postal" in value.data %} +
+ + 🗺️ Voir sur Google Maps + + {% endif %} + {% endif %} + {% else %} + {{ value }} + {% endif %} +
diff --git a/templates/data/_partials/generic_suggestion_details.html b/templates/data/_partials/generic_suggestion_details.html deleted file mode 100644 index 3a823cca5..000000000 --- a/templates/data/_partials/generic_suggestion_details.html +++ /dev/null @@ -1,24 +0,0 @@ -{% extends "data/_partials/suggestion_details.html" %} -{% load custom_filters %} - -{% block suggestion_title %} -{{ title }} -{% endblock suggestion_title %} - -{% block suggestion_details %} - -

💡 Résumé:

-
    - {% for entry in summary %} - {# TODO: we can use entry.value_type to customize rendering #} -
  • {{ entry.label }}: {{ entry.value }}
  • - {% endfor %} -
- -

🔢 {{ changes|length }} acteur(s) impacté(s):

- -{% endblock suggestion_details %} diff --git a/templates/data/_partials/suggestion_details_changes.html b/templates/data/_partials/suggestion_details_changes.html new file mode 100644 index 000000000..e91094586 --- /dev/null +++ b/templates/data/_partials/suggestion_details_changes.html @@ -0,0 +1,13 @@ +{# Generic template for suggestions following suggestion.changes:list[SuggestionChange] #} +{% extends "data/_partials/suggestion_details.html" %} +{% load custom_filters %} + +{% block suggestion_title %} +{{ title }} +{% endblock suggestion_title %} + +{% block suggestion_details %} + +

🔢 {{ changes|length }} acteur(s) impacté(s):

+{% include "data/_partials/dicts_to_table.html" with data=changes %} +{% endblock suggestion_details %} diff --git a/templates/data/_partials/value_details.html b/templates/data/_partials/value_details.html index db29adb23..11e94c8c5 100644 --- a/templates/data/_partials/value_details.html +++ b/templates/data/_partials/value_details.html @@ -1,15 +1,50 @@ {% load custom_filters %} {% if value|valuetype != "list" and value|valuetype != "dict" %} - {{ value }} + + {# Generic case to highlight specific values #} + {% if value is None %} + NONE + {% elif value == "" %} + EMPTY STRING + + {# Source #} + {% elif key == "source" %} + {{ value }} + + {# Acteur Type #} + {% elif key == "acteur_type" %} + {{ value }} + + {# Acteur #} + {% elif key == "identifiant_unique" %} + {{ value }} + (base, + rev, + disp) + {% elif key == "statut" %} + {{ value }} + {% elif key == "siret_is_closed" %} + {{ value }} + + {# Annuaire Entreprises links #} + {% elif key == "siren" %} + {{ value }} + {% elif key == "siret" %} + {{ value }} + + {# Fallback #} + {% else %} + {{ value }} + {% endif %} {% elif value|valuetype == "list" and value %}
    {% for item in value %} -
  • {% include "data/_partials/value_details.html" with value=item %}
  • +
  • {% include "data/_partials/value_details.html" with key=key value=item %}
  • {% endfor %}
{% elif value|valuetype == "dict" %} {% for key, item in value.items %} -

{{ key }} : {% include "data/_partials/value_details.html" with value=item %}

+

{{ key }} : {% include "data/_partials/value_details.html" with key=key value=item %}

{% endfor %} {% endif %} From 82de0f619c3a42ade07a82505a1b7499c3b103c7 Mon Sep 17 00:00:00 2001 From: maxcorbeau Date: Thu, 10 Apr 2025 14:33:44 +0200 Subject: [PATCH 17/50] =?UTF-8?q?create=5Fas=5Fchild,=20num=C3=A9ro=20rue,?= =?UTF-8?q?=20UI?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../enrich_dbt_model_suggest_task.py | 2 +- .../enrich_dbt_model_to_suggestions.py | 52 +++++++++++--- data/models/changes/__init__.py | 2 + data/models/changes/acteur_create_as_child.py | 58 ++++++++++++++++ .../models/changes/acteur_update_parent_id.py | 1 + ...marts_enrich_acteurs_closed_candidates.sql | 6 -- .../marts_enrich_acteurs_closed_replaced.sql | 6 +- qfdmo/models/acteur.py | 1 + scripts/restore_prod_locally.sh | 2 +- templates/data/_partials/value_details.html | 3 + .../changes/test_acteur_create_as_child.py | 69 +++++++++++++++++++ 11 files changed, 182 insertions(+), 20 deletions(-) create mode 100644 data/models/changes/acteur_create_as_child.py create mode 100644 unit_tests/data/models/changes/test_acteur_create_as_child.py diff --git a/dags/enrich/tasks/airflow_logic/enrich_dbt_model_suggest_task.py b/dags/enrich/tasks/airflow_logic/enrich_dbt_model_suggest_task.py index d27b1cf12..f1e854ddb 100644 --- a/dags/enrich/tasks/airflow_logic/enrich_dbt_model_suggest_task.py +++ b/dags/enrich/tasks/airflow_logic/enrich_dbt_model_suggest_task.py @@ -62,6 +62,6 @@ def enrich_dbt_model_suggest_task( python_callable=enrich_dbt_model_suggest_wrapper, op_args=[task_id, cohort, dbt_model_name], dag=dag, - doc_md=f"**Suggestions** pour la cohorte: {cohort.label}**", + doc_md=f"**Suggestions** pour la cohorte: **{cohort.label}**", trigger_rule=TriggerRule.ALL_DONE, ) diff --git a/dags/enrich/tasks/business_logic/enrich_dbt_model_to_suggestions.py b/dags/enrich/tasks/business_logic/enrich_dbt_model_to_suggestions.py index c6025d434..a5a92be20 100644 --- a/dags/enrich/tasks/business_logic/enrich_dbt_model_to_suggestions.py +++ b/dags/enrich/tasks/business_logic/enrich_dbt_model_to_suggestions.py @@ -23,7 +23,11 @@ def enrich_dbt_model_to_suggestions( SuggestionStatut, ) from data.models.change import SuggestionChange - from data.models.changes import ChangeActeurCreateAsParent, ChangeActeurUpdateData + from data.models.changes import ( + ChangeActeurCreateAsChild, + ChangeActeurCreateAsParent, + ChangeActeurUpdateData, + ) from qfdmo.models import ActeurStatus today = datetime.now(timezone.utc).strftime("%Y-%m-%d") @@ -90,7 +94,7 @@ def enrich_dbt_model_to_suggestions( # Parent parent_id = parent_id_generate([str(row[COLS.REMPLACER_SIRET])]) - model_params = { + model_params_parent = { "id": parent_id, "data": { "identifiant_unique": parent_id, @@ -106,18 +110,44 @@ def enrich_dbt_model_to_suggestions( "statut": ActeurStatus.ACTIF, }, } - ChangeActeurCreateAsParent(**model_params).validate() + ChangeActeurCreateAsParent(**model_params_parent).validate() change = SuggestionChange( order=1, - reason="besoin d'un parent pour nouvel acteur", + reason="besoin d'un parent pour rattaché acteur fermé", entity_type="acteur_displayed", model_name=ChangeActeurCreateAsParent.name(), - model_params=model_params, + model_params=model_params_parent, ).model_dump() changes.append(change) - # Child - model_params = { + # New child to hold the reference data as standalone + # as parents are surrogates (e.g. 
they can be deleted + # during clustering) + now = datetime.now(timezone.utc).strftime("%Y%m%d%H%M%S") + child_new_id = f"{row[COLS.ACTEUR_ID]}_{row[COLS.ACTEUR_SIRET]}_{now}" + model_params_child_new = model_params_parent.copy() + model_params_child_new["id"] = child_new_id + model_params_child_new["data"]["identifiant_unique"] = child_new_id + model_params_child_new["data"]["source"] = row[COLS.ACTEUR_SOURCE_ID] + model_params_child_new["data"]["parent"] = parent_id + model_params_child_new["data"]["parent_reason"] = ( + f"Nouvel enfant pour conserver les données suite à: " + f"SIRET {row[COLS.ACTEUR_SIRET]} " + f"détecté le {today} comme fermé dans AE, " + f"remplacé par SIRET {row[COLS.REMPLACER_SIRET]}" + ) + ChangeActeurCreateAsChild(**model_params_child_new).validate() + change = SuggestionChange( + order=2, + reason="besoin nouvel enfant pour conserver les données", + entity_type="acteur_displayed", + model_name=ChangeActeurCreateAsChild.name(), + model_params=model_params_child_new, + ).model_dump() + changes.append(change) + + # Existing Child + model_params_child_old = { "id": row[COLS.ACTEUR_ID], "data": { "identifiant_unique": row[COLS.ACTEUR_ID], @@ -135,13 +165,13 @@ def enrich_dbt_model_to_suggestions( "statut": ActeurStatus.INACTIF, }, } - ChangeActeurUpdateData(**model_params).validate() + ChangeActeurUpdateData(**model_params_child_old).validate() change = SuggestionChange( - order=2, - reason="rattaché au parent", + order=3, + reason="rattacher enfant fermé à un parent", entity_type="acteur_displayed", model_name=ChangeActeurUpdateData.name(), - model_params=model_params, + model_params=model_params_child_old, ).model_dump() changes.append(change) diff --git a/data/models/changes/__init__.py b/data/models/changes/__init__.py index 21166e57d..ab172c0c0 100644 --- a/data/models/changes/__init__.py +++ b/data/models/changes/__init__.py @@ -1,4 +1,5 @@ from .acteur_change_nothing_in_base import ChangeActeurNothingBase +from .acteur_create_as_child import ChangeActeurCreateAsChild from .acteur_create_as_parent import ChangeActeurCreateAsParent from .acteur_delete_as_parent import ChangeActeurDeleteAsParent from .acteur_keep_as_parent import ChangeActeurKeepAsParent @@ -11,6 +12,7 @@ CHANGE_MODELS = { ChangeActeurRgpdAnonymize.name(): ChangeActeurRgpdAnonymize, ChangeActeurUpdateData.name(): ChangeActeurUpdateData, + ChangeActeurCreateAsChild.name(): ChangeActeurCreateAsChild, ChangeActeurCreateAsParent.name(): ChangeActeurCreateAsParent, ChangeActeurDeleteAsParent.name(): ChangeActeurDeleteAsParent, ChangeActeurUpdateParentId.name(): ChangeActeurUpdateParentId, diff --git a/data/models/changes/acteur_create_as_child.py b/data/models/changes/acteur_create_as_child.py new file mode 100644 index 000000000..4e6f60028 --- /dev/null +++ b/data/models/changes/acteur_create_as_child.py @@ -0,0 +1,58 @@ +from pydantic import BaseModel + + +class ChangeActeurCreateAsChild(BaseModel): + id: str + data: dict = {} + + @classmethod + def name(cls) -> str: + return "acteur_create_as_child" + + def validate(self): + from qfdmo.models import Acteur, DisplayedActeur, RevisionActeur + + # Parent field must be SET (but we can't check if parent exists yet + # as it could be a new parent to be created) + for field in ["parent", "parent_reason"]: + if not self.data.get(field): + msg = f"Création d'enfant: champ '{field}' à renseigner {self.data}" + raise ValueError(msg) + + # Ensure child exists nowhere + for model in [Acteur, RevisionActeur, DisplayedActeur]: + obj = 
model.objects.filter(pk=self.id) + if obj.exists(): + msg = ( + f"Création d'enfant: '{self.id}' existe déjà dans {model.__name__}" + ) + raise ValueError(msg) + + def apply(self): + self.validate() + from qfdmo.models import Acteur, RevisionActeur + + # Ensure parent exists in RevisionActeur + parent = RevisionActeur.objects.get(pk=self.data["parent"]) + + # Create child in Acteur + data_base = self.data.copy() + del data_base["parent"] + del data_base["parent_reason"] + Acteur.objects.create( + identifiant_unique=self.id, + **data_base, + ) + + # In Revision we only store what is different, i.e. parent + # FIXME: should we use get_or_create_revision? + # I tried, it was failing, started to look at the code and went + # down a rabbit hole + RevisionActeur.objects.create( + identifiant_unique=self.id, + parent=parent, + parent_reason=self.data["parent_reason"], + statut="ACTIF", + source=self.data["source"], + acteur_type=self.data["acteur_type"], + ) diff --git a/data/models/changes/acteur_update_parent_id.py b/data/models/changes/acteur_update_parent_id.py index f19e95a2d..b12421375 100644 --- a/data/models/changes/acteur_update_parent_id.py +++ b/data/models/changes/acteur_update_parent_id.py @@ -16,6 +16,7 @@ def validate(self): # - Can't test if parent exists as maybe it's to be created def apply(self): + self.validate() # By the time we apply changes to update parent_ids, the # corresponding parents must exist parent = RevisionActeur.objects.get(pk=self.data["parent_id"]) diff --git a/dbt/models/marts/enrich/marts_enrich_acteurs_closed_candidates.sql b/dbt/models/marts/enrich/marts_enrich_acteurs_closed_candidates.sql index bf4733a21..a10b50d16 100644 --- a/dbt/models/marts/enrich/marts_enrich_acteurs_closed_candidates.sql +++ b/dbt/models/marts/enrich/marts_enrich_acteurs_closed_candidates.sql @@ -70,12 +70,6 @@ SELECT FROM acteurs_with_siret AS acteurs JOIN {{ ref('int_ae_etablissement') }} AS etab ON acteurs.siret = etab.siret WHERE etab.est_actif IS FALSE -/* To reduce false positives with generic addresses -such as ZA, ZI containing multiple instances of similar -stores (e.g. supermarkets), we force presence -of street number, which later will be used -as condition for matching */ -AND etab.adresse_numero IS NOT NULL ) SELECT * FROM etab_closed_candidates \ No newline at end of file diff --git a/dbt/models/marts/enrich/marts_enrich_acteurs_closed_replaced.sql b/dbt/models/marts/enrich/marts_enrich_acteurs_closed_replaced.sql index 5cfba9be9..3784becfd 100644 --- a/dbt/models/marts/enrich/marts_enrich_acteurs_closed_replaced.sql +++ b/dbt/models/marts/enrich/marts_enrich_acteurs_closed_replaced.sql @@ -64,7 +64,11 @@ WITH potential_replacements AS ( -- Fields which must be non-NULL for a replacement to be considered AND replacements.code_postal IS NOT NULL AND replacements.adresse IS NOT NULL - -- Number is crucial to avoid mismatches on generic addresses (e.g. ZA, ZI...) + /* To reduce false positives with generic addresses + such as ZA, ZI containing multiple instances of similar + stores (e.g. 
supermarkets), we force presence
+   of street number, which later will be used
+   as condition for matching */
    AND replacements.adresse_numero IS NOT NULL
 )
 SELECT * FROM potential_replacements
diff --git a/qfdmo/models/acteur.py b/qfdmo/models/acteur.py
index 12710f726..1595affcb 100644
--- a/qfdmo/models/acteur.py
+++ b/qfdmo/models/acteur.py
@@ -838,6 +838,7 @@ def set_default_fields_and_objects_before_save(self) -> Acteur | None:
                 "acteur_services",
                 "labels",
                 "parent",
+                "parent_reason",
                 "is_parent",
             ],
         )
diff --git a/scripts/restore_prod_locally.sh b/scripts/restore_prod_locally.sh
index 1c9b36cc3..7d8eb959e 100755
--- a/scripts/restore_prod_locally.sh
+++ b/scripts/restore_prod_locally.sh
@@ -1,4 +1,4 @@
-DUMP_FILE=backup.pgsql
+DUMP_FILE=/home/me/Downloads/20250410002559_quefairedem_5084/20250410002559_quefairedem_5084.pgsql
 DATABASE_URL=postgres://qfdmo:qfdmo@localhost:6543/qfdmo # pragma: allowlist secret
 
 for table in $(psql "${DATABASE_URL}" -t -c "SELECT \"tablename\" FROM pg_tables WHERE schemaname='public'"); do
diff --git a/templates/data/_partials/value_details.html b/templates/data/_partials/value_details.html
index 11e94c8c5..3f30804c4 100644
--- a/templates/data/_partials/value_details.html
+++ b/templates/data/_partials/value_details.html
@@ -26,6 +26,9 @@
     {{ value }}
     {% elif key == "siret_is_closed" %}
     {{ value }}
+    {% elif key == "parent" %}
+    {{ value }} (futur parent)
+
 
     {# Annuaire Entreprises links #}
     {% elif key == "siren" %}
diff --git a/unit_tests/data/models/changes/test_acteur_create_as_child.py b/unit_tests/data/models/changes/test_acteur_create_as_child.py
new file mode 100644
index 000000000..aadb6ad79
--- /dev/null
+++ b/unit_tests/data/models/changes/test_acteur_create_as_child.py
@@ -0,0 +1,69 @@
+import pytest
+from django.contrib.gis.geos import Point
+
+from data.models import Acteur, RevisionActeur
+from data.models.changes.acteur_create_as_child import ChangeActeurCreateAsChild
+from unit_tests.qfdmo.acteur_factory import (
+    ActeurFactory,
+    ActeurTypeFactory,
+    SourceFactory,
+)
+
+
+@pytest.mark.django_db
+class TestChangeActeurCreateAsChild:
+    @pytest.mark.parametrize(
+        "data,missing",
+        [({"parent": "456"}, "parent_reason"), ({"parent_reason": "test"}, "parent")],
+    )
+    def test_raise_if_missing_params(self, data, missing):
+        change = ChangeActeurCreateAsChild(id="123", data=data)
+        with pytest.raises(ValueError, match=f"champ '{missing}' à renseigner"):
+            change.apply()
+
+    def test_raise_if_acteur_exists(self):
+        ActeurFactory(identifiant_unique="123")
+        change = ChangeActeurCreateAsChild(
+            id="123", data={"parent": "456", "parent_reason": "test"}
+        )
+        with pytest.raises(ValueError, match="existe déjà"):
+            change.apply()
+
+    def test_working(self):
+        # Create parent
+        source = SourceFactory(code="source1")
+        atype = ActeurTypeFactory(code="atype1")
+        parent = RevisionActeur.objects.create(
+            identifiant_unique="parent1",
+            source=source,
+            acteur_type=atype,
+            statut="ACTIF",
+            location=Point(1, 1),
+        )
+        # Create child
+        change = ChangeActeurCreateAsChild(
+            id="child1",
+            data={
+                "nom": "my child1",
+                "source": source,
+                "acteur_type": atype,
+                "statut": "ACTIF",
+                "location": Point(1, 1),
+                "parent": parent,
+                "parent_reason": "test",
+            },
+        )
+        change.apply()
+
+        base = Acteur.objects.get(pk="child1")
+        assert base.identifiant_unique == "child1"
+        assert base.nom == "my child1"
+        assert base.source.pk == source.pk
+        assert base.acteur_type.pk == atype.pk
+        assert base.statut == "ACTIF"
+        assert base.location.x == 1
+        assert 
base.location.y == 1 + revision = RevisionActeur.objects.get(pk="child1") + assert revision.parent.pk == parent.pk + assert revision.parent_reason == "test" + assert not revision.nom From 3a1184f37073372d23431fa4c0bca27f3f1e54f0 Mon Sep 17 00:00:00 2001 From: maxcorbeau Date: Mon, 14 Apr 2025 14:05:08 +0200 Subject: [PATCH 18/50] =?UTF-8?q?refacto=20mod=C3=A8les=20&=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cluster_acteurs_parents_choose_data.py | 7 +- .../misc/data_serialize_reconstruct.py | 14 +- dags/enrich/config/columns.py | 38 ++- dags/enrich/config/tasks.py | 2 +- dags/enrich/config/xcoms.py | 2 +- dags/enrich/dags/enrich_acteurs_rgpd.py | 4 +- .../enrich_ae_rgpd_match_task.py | 6 +- .../business_logic/enrich_ae_rgpd_match.py | 14 +- .../business_logic/enrich_ae_rgpd_suggest.py | 4 +- .../enrich_dbt_model_to_suggestions.py | 287 ++++++++++-------- .../test_enrich_acteurs_closed_suggestions.py | 35 ++- .../tasks/test_enrich_ae_rgpd_suggest.py | 3 +- data/models/changes/acteur_create_as_child.py | 28 +- .../models/changes/acteur_create_as_parent.py | 3 - data/models/changes/acteur_rgpd_anonymize.py | 2 +- data/models/changes/acteur_update_data.py | 5 +- .../models/changes/acteur_update_parent_id.py | 8 +- ...marts_enrich_acteurs_closed_candidates.sql | 21 +- .../marts_enrich_acteurs_closed_replaced.sql | 29 +- .../changes/test_acteur_create_as_child.py | 3 + .../changes/test_acteur_rgpd_anonymize.py | 2 +- 21 files changed, 280 insertions(+), 237 deletions(-) diff --git a/dags/cluster/tasks/business_logic/cluster_acteurs_parents_choose_data.py b/dags/cluster/tasks/business_logic/cluster_acteurs_parents_choose_data.py index cc75f4aa7..430b29b27 100644 --- a/dags/cluster/tasks/business_logic/cluster_acteurs_parents_choose_data.py +++ b/dags/cluster/tasks/business_logic/cluster_acteurs_parents_choose_data.py @@ -1,12 +1,13 @@ +import logging from typing import Any import pandas as pd from cluster.config.constants import COL_PARENT_DATA_NEW, FIELDS_PARENT_DATA_EXCLUDED from django.forms.models import model_to_dict -from rich import print from utils.django import django_setup_full django_setup_full() + from data.models.change import COL_CHANGE_MODEL_NAME # noqa: E402 from data.models.changes import ( # noqa: E402 ChangeActeurCreateAsParent, @@ -14,6 +15,8 @@ ) from qfdmo.models.acteur import Acteur, DisplayedActeur, RevisionActeur # noqa: E402 +logger = logging.getLogger(__name__) + def fields_to_include_clean( fields_to_include: list[str], @@ -63,7 +66,7 @@ def field_pick_value( """ return value except Exception as e: - print(f"Invalid value for field {field}: {value}: {e}") + logging.error(f"Invalid value for field {field}: {value}: {e}") pass return None diff --git a/dags/cluster/tasks/business_logic/misc/data_serialize_reconstruct.py b/dags/cluster/tasks/business_logic/misc/data_serialize_reconstruct.py index aab3757da..4584d9589 100644 --- a/dags/cluster/tasks/business_logic/misc/data_serialize_reconstruct.py +++ b/dags/cluster/tasks/business_logic/misc/data_serialize_reconstruct.py @@ -80,10 +80,8 @@ def data_reconstruct(model: type[models.Model], data_src: dict) -> dict: else: result[key] = value elif isinstance(field, models.ForeignKey): - # Django seems to handle both {field}_id and {field} transparently - # but since we reconstruct for Django, we favor the {field} flavour, - # this prevents having inconsistent representations when we work with - # Django vs. 
DBT models + # Normalizing to {field} from {field}_id so all fields are + # represented in their Django flavour if key.endswith("_id"): try: key_no_id = key.rstrip("_id") @@ -91,8 +89,12 @@ def data_reconstruct(model: type[models.Model], data_src: dict) -> dict: key = key_no_id except Exception: pass - related_instance = field.related_model.objects.get(pk=value) # type: ignore - result[key] = related_instance + + # Retrieving the related instance if it's not already an instance + if not isinstance(value, field.related_model): # type: ignore + value = field.related_model.objects.get(pk=value) # type: ignore + + result[key] = value else: result[key] = value diff --git a/dags/enrich/config/columns.py b/dags/enrich/config/columns.py index 697682522..9d9a8073b 100644 --- a/dags/enrich/config/columns.py +++ b/dags/enrich/config/columns.py @@ -7,18 +7,8 @@ @dataclass(frozen=True) class COLS: - # Dry run - DRY_RUN: str = "dry_run" - # Suggestions - SUGGEST_COHORT_CODE: str = "suggestion_cohorte_code" - SUGGEST_COHORT_LABEL: str = "suggestion_cohorte_label" - - # COMMON - SIREN: str = "siren" - SIRET: str = "siret" - - # QFDMO + # Acteurs ACTEUR_ID: str = "acteur_id" ACTEUR_TYPE_ID: str = "acteur_type_id" ACTEUR_TYPE_CODE: str = "acteur_type_code" @@ -29,20 +19,28 @@ class COLS: ACTEUR_NOMS_ORIGINE: str = "acteur_noms_origine" ACTEUR_NOMS_NORMALISES: str = "acteur_noms_normalises" ACTEUR_COMMENTAIRES: str = "acteur_commentaires" + ACTEUR_ADRESSE: str = "acteur_adresse" + ACTEUR_CODE_POSTAL: str = "acteur_code_postal" + ACTEUR_VILLE: str = "acteur_ville" + ACTEUR_NAF: str = "acteur_naf" + ACTEUR_LONGITUDE: str = "acteur_longitude" + ACTEUR_LATITUDE: str = "acteur_latitude" # Annuaire Entreprise AE_DIRIGEANTS_NOMS: str = "ae_dirigeants_noms_prenoms" + + # Suggestions + SUGGEST_COHORT_CODE: str = "suggestion_cohorte_code" + SUGGEST_COHORT_LABEL: str = "suggestion_cohorte_label" + + # Replacements REMPLACER_SIRET: str = "remplacer_siret" REMPLACER_NOM: str = "remplacer_nom" - - # Fields identical between acteurs and remplacements - # hence replacer_ prefix not present on the model column names - REMPLACER_ADRESSE: str = "adresse" - REMPLACER_CODE_POSTAL: str = "code_postal" - REMPLACER_VILLE: str = "ville" - REMPLACER_NAF: str = "naf" + REMPLACER_ADRESSE: str = "remplacer_adresse" + REMPLACER_CODE_POSTAL: str = "remplacer_code_postal" + REMPLACER_VILLE: str = "remplacer_ville" + REMPLACER_NAF: str = "remplacer_naf" # Matching - MATCH_SCORE_AE_RGPD: str = "match_score" MATCH_WORDS: str = "match_words" - MATCH_THRESHOLD: str = "match_threshold" + MATCH_SCORE: str = "match_score" diff --git a/dags/enrich/config/tasks.py b/dags/enrich/config/tasks.py index 0652eacc5..dd152e865 100644 --- a/dags/enrich/config/tasks.py +++ b/dags/enrich/config/tasks.py @@ -27,7 +27,7 @@ class TASKS: ENRICH_DBT_MODELS_REFRESH: str = "enrich_dbt_models_refresh" # Matching tasks - MATCH_SCORE_AE_RGPD: str = "enrich_ae_rgpd_match" + MATCH_SCORE: str = "enrich_ae_rgpd_match" # Suggestion tasks SUGGEST_AE_RGPD: str = "enrich_ae_rgpd_suggest" diff --git a/dags/enrich/config/xcoms.py b/dags/enrich/config/xcoms.py index 0d9451913..9b2b0ea6e 100644 --- a/dags/enrich/config/xcoms.py +++ b/dags/enrich/config/xcoms.py @@ -37,7 +37,7 @@ def xcom_pull(ti: TaskInstance, key: str, skip_if_empty: bool = False) -> Any: elif key == XCOMS.DF_READ: value = ti.xcom_pull(key=key, task_ids=TASKS.READ_AE_RGPD) elif key == XCOMS.DF_MATCH: - value = ti.xcom_pull(key=key, task_ids=TASKS.MATCH_SCORE_AE_RGPD) + value = ti.xcom_pull(key=key, 
task_ids=TASKS.MATCH_SCORE) elif key == XCOMS.DF_CLOSED_REPLACED_SAME_SIREN: value = ti.xcom_pull(key=key, task_ids=TASKS.ENRICH_CLOSED_REPLACED_SAME_SIREN) elif key == XCOMS.DF_CLOSED_REPLACED_OTHER_SIREN: diff --git a/dags/enrich/dags/enrich_acteurs_rgpd.py b/dags/enrich/dags/enrich_acteurs_rgpd.py index c2c64ba64..4265305fa 100644 --- a/dags/enrich/dags/enrich_acteurs_rgpd.py +++ b/dags/enrich/dags/enrich_acteurs_rgpd.py @@ -37,7 +37,7 @@ ), tags=["enrich", "annuaire", "entreprise", "siren", "ae", "acteurs"], params={ - COLS.DRY_RUN: Param( + "dry_run": Param( True, type="boolean", description_md="🚱 Si coché, aucune tâche d'écriture ne sera effectuée", @@ -47,7 +47,7 @@ type=["null", "string"], description_md="🔍 Filtre sur les commentaires pour la lecture des données", ), - COLS.MATCH_THRESHOLD: Param( + COLS.MATCH_SCORE: Param( 1, type="number", minimum=0.5, diff --git a/dags/enrich/tasks/airflow_logic/enrich_ae_rgpd_match_task.py b/dags/enrich/tasks/airflow_logic/enrich_ae_rgpd_match_task.py index bfc27bbbc..9eaa636f4 100644 --- a/dags/enrich/tasks/airflow_logic/enrich_ae_rgpd_match_task.py +++ b/dags/enrich/tasks/airflow_logic/enrich_ae_rgpd_match_task.py @@ -16,7 +16,7 @@ def task_info_get(): return f""" ============================================================ - Description de la tâche "{TASKS.MATCH_SCORE_AE_RGPD}" + Description de la tâche "{TASKS.MATCH_SCORE}" ============================================================ 💡 quoi: on cherche à déterminer quels acteurs QFDMO ont un nom qui correspond à des noms de personnes dans l'AE @@ -34,7 +34,7 @@ def enrich_ae_rgpd_match_wrapper(ti, params) -> None: df = enrich_ae_rgpd_match( df=ti.xcom_pull(key=XCOMS.DF_READ), - match_threshold=params[COLS.MATCH_THRESHOLD], + match_threshold=params[COLS.MATCH_SCORE], ) if df.empty: raise AirflowSkipException("Pas de matches, on s'arrête là") @@ -44,7 +44,7 @@ def enrich_ae_rgpd_match_wrapper(ti, params) -> None: def enrich_ae_rgpd_match_task(dag: DAG) -> PythonOperator: return PythonOperator( - task_id=TASKS.MATCH_SCORE_AE_RGPD, + task_id=TASKS.MATCH_SCORE, python_callable=enrich_ae_rgpd_match_wrapper, dag=dag, ) diff --git a/dags/enrich/tasks/business_logic/enrich_ae_rgpd_match.py b/dags/enrich/tasks/business_logic/enrich_ae_rgpd_match.py index c40dd2676..75d73045e 100644 --- a/dags/enrich/tasks/business_logic/enrich_ae_rgpd_match.py +++ b/dags/enrich/tasks/business_logic/enrich_ae_rgpd_match.py @@ -59,19 +59,17 @@ def enrich_ae_rgpd_match( lambda x: word_overlap_ratio(x, cols_names_qfdmo, cols_names_ae), axis=1 ) df[COLS.MATCH_WORDS] = df["temp"].apply(lambda x: x[0]) - df[COLS.MATCH_SCORE_AE_RGPD] = df["temp"].apply(lambda x: x[1]) + df[COLS.MATCH_SCORE] = df["temp"].apply(lambda x: x[1]) df.drop(columns=["temp"], inplace=True) # Selecting & previewing matches - df_no_match = df[df[COLS.MATCH_SCORE_AE_RGPD] == 0] - df_partial = df[ - (df[COLS.MATCH_SCORE_AE_RGPD] > 0) & (df[COLS.MATCH_SCORE_AE_RGPD] < 1) - ] - df_perfect = df[df[COLS.MATCH_SCORE_AE_RGPD] == 1] - df_retained = df[df[COLS.MATCH_SCORE_AE_RGPD] >= match_threshold].copy() + df_no_match = df[df[COLS.MATCH_SCORE] == 0] + df_partial = df[(df[COLS.MATCH_SCORE] > 0) & (df[COLS.MATCH_SCORE] < 1)] + df_perfect = df[df[COLS.MATCH_SCORE] == 1] + df_retained = df[df[COLS.MATCH_SCORE] >= match_threshold].copy() log.preview_df_as_markdown("🔴 Matches non-existant (==0)", df_no_match) log.preview_df_as_markdown("🟡 Matches partiel (>0 & <1)", df_partial) log.preview_df_as_markdown("🟢 Matches parfait (==1)", df_perfect) 
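    # Illustration, with hypothetical values, of how the buckets above are
    # formed: MATCH_SCORE comes from word_overlap_ratio, i.e. the share of
    # the acteur's normalized name words found among the AE dirigeants'
    # names. E.g. for an acteur "jean dupont plomberie" matched against a
    # dirigeant "jean dupont":
    #   words_acteur = {"jean", "dupont", "plomberie"}
    #   words_ae     = {"jean", "dupont"}
    #   -> MATCH_WORDS = ["jean", "dupont"], MATCH_SCORE = 2/3 ≈ 0.67
    # a partial match (>0 & <1), retained only when match_threshold <= 0.67.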
log.preview_df_as_markdown(f"💾 Matches retenus (>={match_threshold})", df_retained) - return df_retained.sort_values(COLS.MATCH_SCORE_AE_RGPD, ascending=False) + return df_retained.sort_values(COLS.MATCH_SCORE, ascending=False) diff --git a/dags/enrich/tasks/business_logic/enrich_ae_rgpd_suggest.py b/dags/enrich/tasks/business_logic/enrich_ae_rgpd_suggest.py index a77dc0bc3..612dac088 100644 --- a/dags/enrich/tasks/business_logic/enrich_ae_rgpd_suggest.py +++ b/dags/enrich/tasks/business_logic/enrich_ae_rgpd_suggest.py @@ -68,9 +68,7 @@ def enrich_ae_rgpd_suggest( "summary": [ sumline("noms d'origine", row[COLS.ACTEUR_NOMS_ORIGINE], "text"), sumline("mots de match", row[COLS.MATCH_WORDS], "text_list"), - sumline( - "score de match", row[COLS.MATCH_SCORE_AE_RGPD], "score_0_to_1" - ), + sumline("score de match", row[COLS.MATCH_SCORE], "score_0_to_1"), sumline("changements", "voir contexte/détails", "text"), ], "changes": changes, diff --git a/dags/enrich/tasks/business_logic/enrich_dbt_model_to_suggestions.py b/dags/enrich/tasks/business_logic/enrich_dbt_model_to_suggestions.py index a5a92be20..0c30b0c11 100644 --- a/dags/enrich/tasks/business_logic/enrich_dbt_model_to_suggestions.py +++ b/dags/enrich/tasks/business_logic/enrich_dbt_model_to_suggestions.py @@ -10,6 +10,154 @@ logger = logging.getLogger(__name__) +def suggestion_change_prepare( + model, + model_params: dict, + order: int, + reason: str, + entity_type: str, +) -> dict: + """Generic utility to prepare, validate and + serialize 1 suggestion change for all suggestion types""" + from data.models.change import SuggestionChange + + model(**model_params).validate() + return SuggestionChange( + order=order, + reason=reason, + entity_type=entity_type, + model_name=model.name(), + model_params=model_params, + ).model_dump() + + +def suggestion_change_prepare_closed_not_replaced( + row: dict, +) -> list[dict]: + """Prepare suggestions for closed not replaced cohorts""" + from data.models.changes import ChangeActeurUpdateData + from qfdmo.models import ActeurStatus + + changes = [] + model_params = { + "id": row[COLS.ACTEUR_ID], + "data": { + "identifiant_unique": row[COLS.ACTEUR_ID], + "statut": ActeurStatus.INACTIF, + # TODO: fix inconsistency between acteur_siret and siret + # in non-replaced model + "siret": row[COLS.ACTEUR_SIRET], + "siret_is_closed": True, + "acteur_type": row[COLS.ACTEUR_TYPE_ID], + "source": row[COLS.ACTEUR_SOURCE_ID], + }, + } + changes.append( + suggestion_change_prepare( + model=ChangeActeurUpdateData, + model_params=model_params, + order=1, + reason="SIRET & SIREN fermés, 0 remplacement trouvé", + entity_type="acteur_displayed", + ) + ) + return changes + + +def suggestion_change_prepare_closed_replaced( + row: dict, +) -> list[dict]: + """Prepare suggestions for closed replaced cohorts""" + from data.models.changes import ( + ChangeActeurCreateAsChild, + ChangeActeurCreateAsParent, + ChangeActeurUpdateData, + ) + from qfdmo.models import ActeurStatus + + changes = [] + today = datetime.now(timezone.utc).strftime("%Y-%m-%d") + # Parent + parent_id = parent_id_generate([str(row[COLS.REMPLACER_SIRET])]) + params_parent = { + "id": parent_id, + "data": { + "identifiant_unique": parent_id, + "nom": row[COLS.REMPLACER_NOM], + "adresse": row[COLS.REMPLACER_ADRESSE], + "code_postal": row[COLS.REMPLACER_CODE_POSTAL], + "ville": row[COLS.REMPLACER_VILLE], + "siren": row[COLS.REMPLACER_SIRET][:9], + "siret": row[COLS.REMPLACER_SIRET], + "naf_principal": row[COLS.REMPLACER_NAF], + "acteur_type": row[COLS.ACTEUR_TYPE_ID], 
+ "source": None, + "statut": ActeurStatus.ACTIF, + }, + } + changes.append( + suggestion_change_prepare( + model=ChangeActeurCreateAsParent, + model_params=params_parent, + order=1, + reason="besoin d'un parent pour rattaché acteur fermé", + entity_type="acteur_displayed", + ) + ) + + # New child to hold the reference data as standalone + # as parents are surrogates (e.g. they can be deleted + # during clustering) + now = datetime.now(timezone.utc).strftime("%Y%m%d%H%M%S") + child_new_id = f"{row[COLS.ACTEUR_ID]}_{row[COLS.ACTEUR_SIRET]}_{now}" + params_child_new = params_parent.copy() + params_child_new["id"] = child_new_id + params_child_new["data"]["identifiant_unique"] = child_new_id + params_child_new["data"]["source"] = row[COLS.ACTEUR_SOURCE_ID] + params_child_new["data"]["parent"] = parent_id + params_child_new["data"]["parent_reason"] = ( + f"Nouvel enfant pour conserver les données suite à: " + f"SIRET {row[COLS.ACTEUR_SIRET]} " + f"détecté le {today} comme fermé dans AE, " + f"remplacé par SIRET {row[COLS.REMPLACER_SIRET]}" + ) + if row[COLS.ACTEUR_LONGITUDE] is not None and row[COLS.ACTEUR_LATITUDE] is not None: + params_child_new["data"]["longitude"] = row[COLS.ACTEUR_LONGITUDE] + params_child_new["data"]["latitude"] = row[COLS.ACTEUR_LATITUDE] + changes.append( + suggestion_change_prepare( + model=ChangeActeurCreateAsChild, + model_params=params_child_new, + order=2, + reason="besoin nouvel enfant pour conserver les données", + entity_type="acteur_displayed", + ) + ) + + # Existing Child + params_child_old = params_child_new.copy() + params_child_old["id"] = row[COLS.ACTEUR_ID] + params_child_old["data"]["identifiant_unique"] = row[COLS.ACTEUR_ID] + params_child_old["data"]["parent"] = parent_id + params_child_old["data"]["parent_reason"] = ( + f"SIRET {row[COLS.ACTEUR_SIRET]} " + f"détecté le {today} comme fermé dans AE, " + f"remplacé par SIRET {row[COLS.REMPLACER_SIRET]}" + ) + params_child_old["data"]["siret_is_closed"] = True + params_child_old["data"]["statut"] = ActeurStatus.INACTIF + changes.append( + suggestion_change_prepare( + model=ChangeActeurUpdateData, + model_params=params_child_old, + order=3, + reason="rattacher enfant fermé à un parent", + entity_type="acteur_displayed", + ) + ) + return changes + + def enrich_dbt_model_to_suggestions( df: pd.DataFrame, cohort: Cohort, @@ -22,26 +170,15 @@ def enrich_dbt_model_to_suggestions( SuggestionCohorte, SuggestionStatut, ) - from data.models.change import SuggestionChange - from data.models.changes import ( - ChangeActeurCreateAsChild, - ChangeActeurCreateAsParent, - ChangeActeurUpdateData, - ) - from qfdmo.models import ActeurStatus - - today = datetime.now(timezone.utc).strftime("%Y-%m-%d") # Validation if df is None or df.empty: raise ValueError("df vide: on devrait pas être ici") - if cohort.code not in [ - COHORTS.CLOSED_NOT_REPLACED.code, - COHORTS.CLOSED_REP_OTHER_SIREN.code, - COHORTS.CLOSED_REP_SAME_SIREN.code, - ]: - raise ValueError(f"Mauvaise cohorte: {cohort=}") + cohort_codes = list(df[COLS.SUGGEST_COHORT_CODE].unique()) + if len(cohort_codes) != 1 or cohort_codes[0] != cohort.code: + msg = f"Problème cohorte: obtenu {cohort_codes=} vs. 
attendu {cohort.code=}" + raise ValueError(msg) # Suggestions suggestions = [] @@ -52,29 +189,7 @@ def enrich_dbt_model_to_suggestions( # NOT REPLACED # ----------------------------------------- if cohort == COHORTS.CLOSED_NOT_REPLACED: - changes = [] - model_params = { - "id": row[COLS.ACTEUR_ID], - "data": { - "identifiant_unique": row[COLS.ACTEUR_ID], - "statut": ActeurStatus.INACTIF, - # TODO: fix inconsistency between acteur_siret and siret - # in non-replaced model - "siret": row[COLS.SIRET], - "siret_is_closed": True, - "acteur_type": row[COLS.ACTEUR_TYPE_ID], - "source": row[COLS.ACTEUR_SOURCE_ID], - }, - } - ChangeActeurUpdateData(**model_params).validate() - change = SuggestionChange( - order=1, - reason="SIRET & SIREN fermés, 0 remplacement trouvé", - entity_type="acteur_displayed", - model_name=ChangeActeurUpdateData.name(), - model_params=model_params, - ).model_dump() - changes.append(change) + changes = suggestion_change_prepare_closed_not_replaced(row) # ----------------------------------------- # REPLACED @@ -83,102 +198,12 @@ def enrich_dbt_model_to_suggestions( COHORTS.CLOSED_REP_OTHER_SIREN, COHORTS.CLOSED_REP_SAME_SIREN, ]: - cohorts = df[COLS.SUGGEST_COHORT_CODE].unique() - if len(cohorts) > 1: - raise ValueError(f"Une seule cohorte à la fois: {cohorts=}") - if cohorts[0] != cohort.code: - raise ValueError(f"Mauvaise cohorte: {cohorts=} != {cohort=}") - logger.info(f"{cohort.label}: suggestion acteur id={row[COLS.ACTEUR_ID]}") - - changes = [] - - # Parent - parent_id = parent_id_generate([str(row[COLS.REMPLACER_SIRET])]) - model_params_parent = { - "id": parent_id, - "data": { - "identifiant_unique": parent_id, - "nom": row[COLS.REMPLACER_NOM], - "adresse": row[COLS.REMPLACER_ADRESSE], - "code_postal": row[COLS.REMPLACER_CODE_POSTAL], - "ville": row[COLS.REMPLACER_VILLE], - "siren": row[COLS.REMPLACER_SIRET][:9], - "siret": row[COLS.REMPLACER_SIRET], - "naf_principal": row[COLS.REMPLACER_NAF], - "acteur_type": row[COLS.ACTEUR_TYPE_ID], - "source": None, - "statut": ActeurStatus.ACTIF, - }, - } - ChangeActeurCreateAsParent(**model_params_parent).validate() - change = SuggestionChange( - order=1, - reason="besoin d'un parent pour rattaché acteur fermé", - entity_type="acteur_displayed", - model_name=ChangeActeurCreateAsParent.name(), - model_params=model_params_parent, - ).model_dump() - changes.append(change) - - # New child to hold the reference data as standalone - # as parents are surrogates (e.g. 
they can be deleted - # during clustering) - now = datetime.now(timezone.utc).strftime("%Y%m%d%H%M%S") - child_new_id = f"{row[COLS.ACTEUR_ID]}_{row[COLS.ACTEUR_SIRET]}_{now}" - model_params_child_new = model_params_parent.copy() - model_params_child_new["id"] = child_new_id - model_params_child_new["data"]["identifiant_unique"] = child_new_id - model_params_child_new["data"]["source"] = row[COLS.ACTEUR_SOURCE_ID] - model_params_child_new["data"]["parent"] = parent_id - model_params_child_new["data"]["parent_reason"] = ( - f"Nouvel enfant pour conserver les données suite à: " - f"SIRET {row[COLS.ACTEUR_SIRET]} " - f"détecté le {today} comme fermé dans AE, " - f"remplacé par SIRET {row[COLS.REMPLACER_SIRET]}" - ) - ChangeActeurCreateAsChild(**model_params_child_new).validate() - change = SuggestionChange( - order=2, - reason="besoin nouvel enfant pour conserver les données", - entity_type="acteur_displayed", - model_name=ChangeActeurCreateAsChild.name(), - model_params=model_params_child_new, - ).model_dump() - changes.append(change) - - # Existing Child - model_params_child_old = { - "id": row[COLS.ACTEUR_ID], - "data": { - "identifiant_unique": row[COLS.ACTEUR_ID], - "parent": parent_id, - "parent_reason": ( - f"SIRET {row[COLS.ACTEUR_SIRET]} " - f"détecté le {today} comme fermé dans AE, " - f"remplacé par SIRET {row[COLS.REMPLACER_SIRET]}" - ), - "siren": row[COLS.ACTEUR_SIRET][:9], - "siret": row[COLS.ACTEUR_SIRET], - "siret_is_closed": True, - "acteur_type": row[COLS.ACTEUR_TYPE_ID], - "source": row[COLS.ACTEUR_SOURCE_ID], - "statut": ActeurStatus.INACTIF, - }, - } - ChangeActeurUpdateData(**model_params_child_old).validate() - change = SuggestionChange( - order=3, - reason="rattacher enfant fermé à un parent", - entity_type="acteur_displayed", - model_name=ChangeActeurUpdateData.name(), - model_params=model_params_child_old, - ).model_dump() - changes.append(change) + changes = suggestion_change_prepare_closed_replaced(row) else: - raise ValueError(f"Mauvaise cohorte: {cohort=}") + raise NotImplementedError(f"Cohorte non implémentée: {cohort=}") - # Generic to all cohorts + # Creating a suggestion with the given changes suggestions.append( { # TODO: free format thanks to recursive model diff --git a/dags_unit_tests/enrich/tasks/test_enrich_acteurs_closed_suggestions.py b/dags_unit_tests/enrich/tasks/test_enrich_acteurs_closed_suggestions.py index 792a45df2..fc7189d9b 100644 --- a/dags_unit_tests/enrich/tasks/test_enrich_acteurs_closed_suggestions.py +++ b/dags_unit_tests/enrich/tasks/test_enrich_acteurs_closed_suggestions.py @@ -39,6 +39,8 @@ def df_not_replaced(self, atype, source): COLS.ACTEUR_NOM: ["AVANT a01", "AVANT a02"], COLS.ACTEUR_TYPE_ID: [atype.pk, atype.pk], COLS.ACTEUR_SOURCE_ID: [source.pk, source.pk], + COLS.SUGGEST_COHORT_CODE: [COHORTS.CLOSED_NOT_REPLACED.code] * 2, + COLS.SUGGEST_COHORT_LABEL: [COHORTS.CLOSED_NOT_REPLACED.label] * 2, } ) @@ -55,6 +57,8 @@ def df_replaced(self, atype, source): ], COLS.ACTEUR_TYPE_ID: [atype.pk, atype.pk, atype.pk], COLS.ACTEUR_SOURCE_ID: [source.pk, source.pk, source.pk], + COLS.ACTEUR_LONGITUDE: [1, 1, 1], + COLS.ACTEUR_LATITUDE: [2, 2, 2], # Replacement data COLS.REMPLACER_SIRET: [ "11111111100002", @@ -62,10 +66,15 @@ def df_replaced(self, atype, source): "55555555500001", ], COLS.REMPLACER_NOM: ["APRES a1", "APRES a2", "APRES a3"], + COLS.SUGGEST_COHORT_CODE: [ + COHORTS.CLOSED_REP_SAME_SIREN.code, + COHORTS.CLOSED_REP_OTHER_SIREN.code, + COHORTS.CLOSED_REP_OTHER_SIREN.code, + ], COLS.SUGGEST_COHORT_LABEL: [ - "meme_siret", - 
"autre_siret", - "autre_siret", + COHORTS.CLOSED_REP_SAME_SIREN.label, + COHORTS.CLOSED_REP_OTHER_SIREN.label, + COHORTS.CLOSED_REP_OTHER_SIREN.label, ], COLS.REMPLACER_ADRESSE: ["Adresse1", "Adresse2", "Adresse3"], COLS.REMPLACER_CODE_POSTAL: ["12345", "67890", "12345"], @@ -77,18 +86,24 @@ def df_replaced(self, atype, source): def test_df_replaced(self, df_replaced): assert sorted(df_replaced[COLS.SUGGEST_COHORT_LABEL].unique()) == sorted( [ - "meme_siret", - "autre_siret", + COHORTS.CLOSED_REP_SAME_SIREN.label, + COHORTS.CLOSED_REP_OTHER_SIREN.label, ] ) @pytest.fixture def df_replaced_meme_siret(self, df_replaced): - return df_replaced[df_replaced[COLS.SUGGEST_COHORT_LABEL] == "meme_siret"] + return df_replaced[ + df_replaced[COLS.SUGGEST_COHORT_LABEL] + == COHORTS.CLOSED_REP_SAME_SIREN.label + ] @pytest.fixture def df_replaced_autre_siret(self, df_replaced): - return df_replaced[df_replaced[COLS.SUGGEST_COHORT_LABEL] == "autre_siret"] + return df_replaced[ + df_replaced[COLS.SUGGEST_COHORT_LABEL] + == COHORTS.CLOSED_REP_OTHER_SIREN.label + ] @pytest.fixture def acteurs(self, df_not_replaced, df_replaced, atype, source): @@ -113,7 +128,7 @@ def test_cohorte_not_replaced(self, acteurs, df_not_replaced): # Write suggestions to DB enrich_dbt_model_to_suggestions( df=df_not_replaced, - cohort_code=COHORTS.CLOSED_NOT_REPLACED, + cohort=COHORTS.CLOSED_NOT_REPLACED, identifiant_action="test_not_replaced", dry_run=False, ) @@ -148,7 +163,7 @@ def test_cohorte_meme_siren(self, acteurs, atype, source, df_replaced_meme_siret # Write suggestions to DB enrich_dbt_model_to_suggestions( df=df_replaced_meme_siret, - cohort_code=COHORTS.CLOSED_REP_SAME_SIREN, + cohort=COHORTS.CLOSED_REP_SAME_SIREN, identifiant_action="test_meme_siren", dry_run=False, ) @@ -192,7 +207,7 @@ def test_cohorte_autre_siren(self, acteurs, df_replaced_autre_siret): # Write suggestions to DB enrich_dbt_model_to_suggestions( df=df_replaced_autre_siret, - cohort_code=COHORTS.CLOSED_REP_OTHER_SIREN, + cohort=COHORTS.CLOSED_REP_OTHER_SIREN, identifiant_action="test_autre_siren", dry_run=False, ) diff --git a/dags_unit_tests/enrich/tasks/test_enrich_ae_rgpd_suggest.py b/dags_unit_tests/enrich/tasks/test_enrich_ae_rgpd_suggest.py index ca2f4bdb3..51e7a86e1 100644 --- a/dags_unit_tests/enrich/tasks/test_enrich_ae_rgpd_suggest.py +++ b/dags_unit_tests/enrich/tasks/test_enrich_ae_rgpd_suggest.py @@ -26,6 +26,7 @@ def df(self): COLS.ACTEUR_NOMS_ORIGINE: ["acteur1", "acteur2"], COLS.MATCH_WORDS: ["acteur1", "acteur2"], COLS.MATCH_SCORE: [1.0, 1.0], + COLS.ACTEUR_SIRET: ["11111111100001", "22222222200001"], } ) @@ -96,7 +97,7 @@ def test_suggestion_change(self, suggest): assert acteur.nom == "ANONYMISE POUR RAISON RGPD" assert acteur.nom_officiel == "ANONYMISE POUR RAISON RGPD" assert acteur.nom_commercial == "ANONYMISE POUR RAISON RGPD" - assert acteur.email is None + assert acteur.email == "" assert acteur.telephone == "ANONYMISE POUR RAISON RGPD" assert acteur.adresse == "ANONYMISE POUR RAISON RGPD" assert acteur.adresse_complement == "ANONYMISE POUR RAISON RGPD" diff --git a/data/models/changes/acteur_create_as_child.py b/data/models/changes/acteur_create_as_child.py index 4e6f60028..516ae94d6 100644 --- a/data/models/changes/acteur_create_as_child.py +++ b/data/models/changes/acteur_create_as_child.py @@ -1,4 +1,9 @@ from pydantic import BaseModel +from rich import print + +from dags.cluster.tasks.business_logic.misc.data_serialize_reconstruct import ( + data_reconstruct, +) class ChangeActeurCreateAsChild(BaseModel): @@ -35,24 +40,29 
@@ def apply(self): # Ensure parent exists in RevisionActeur parent = RevisionActeur.objects.get(pk=self.data["parent"]) - # Create child in Acteur - data_base = self.data.copy() + # Reconstruct data from RevisionActeur + print(f"data before reconstruct: {self.data}") + data = data_reconstruct(RevisionActeur, self.data) + print(f"data after reconstruct: {data}") + + # Create child in Acteur to hold data + data_base = data.copy() del data_base["parent"] del data_base["parent_reason"] + # TODO: if we flatten our pydantic models, then we wouldn't + if "identifiant_unique" in data_base: + del data_base["identifiant_unique"] Acteur.objects.create( identifiant_unique=self.id, **data_base, ) - # In Revision we only store what is different, i.e. parent - # FIXME: should we use get_or_create_revision? - # I tried, it was failing, started to look at the code and went - # down a rabbit hole + # Create child in RevisionActeur to hold reference to parent RevisionActeur.objects.create( identifiant_unique=self.id, + parent_reason=data["parent_reason"], parent=parent, - parent_reason=self.data["parent_reason"], statut="ACTIF", - source=self.data["source"], - acteur_type=self.data["acteur_type"], + source=data["source"], + acteur_type=data["acteur_type"], ) diff --git a/data/models/changes/acteur_create_as_parent.py b/data/models/changes/acteur_create_as_parent.py index 73baf552a..6ba6bff61 100644 --- a/data/models/changes/acteur_create_as_parent.py +++ b/data/models/changes/acteur_create_as_parent.py @@ -1,7 +1,5 @@ """change model to create a parent acteur""" -from rich import print - from data.models.changes.acteur_abstract import ChangeActeurAbstract from data.models.changes.utils import data_reconstruct from qfdmo.models import ActeurStatus, RevisionActeur @@ -17,7 +15,6 @@ def name(cls) -> str: def validate(self): """The parent shouldn't already exist""" - print(f"ChangeActeurCreateAsParent.validate: {self.id=} {self.data=}") rev = RevisionActeur.objects.filter(identifiant_unique=self.id) if rev.exists(): raise ValueError(f"Parent to create '{self.id}' already exists") diff --git a/data/models/changes/acteur_rgpd_anonymize.py b/data/models/changes/acteur_rgpd_anonymize.py index f4bc1c257..32663b592 100644 --- a/data/models/changes/acteur_rgpd_anonymize.py +++ b/data/models/changes/acteur_rgpd_anonymize.py @@ -23,7 +23,7 @@ "nom": VALUE_ANONYMIZED, "nom_officiel": VALUE_ANONYMIZED, "nom_commercial": VALUE_ANONYMIZED, - "email": None, # due to email constraint + "email": "", # Consequence of allowing empty strings in DB "telephone": VALUE_ANONYMIZED, "adresse": VALUE_ANONYMIZED, "adresse_complement": VALUE_ANONYMIZED, diff --git a/data/models/changes/acteur_update_data.py b/data/models/changes/acteur_update_data.py index 1bc5a9eac..39837b2e4 100644 --- a/data/models/changes/acteur_update_data.py +++ b/data/models/changes/acteur_update_data.py @@ -35,7 +35,10 @@ def apply(self): acteur = self.validate() # If acteur is only in base, we need to create a revision if isinstance(acteur, Acteur): - acteur = RevisionActeur(identifiant_unique=acteur.identifiant_unique) + acteur = RevisionActeur( + identifiant_unique=acteur.identifiant_unique, + acteur_type=acteur.acteur_type, + ) data = data_reconstruct(RevisionActeur, self.data) for key, value in data.items(): setattr(acteur, key, value) diff --git a/data/models/changes/acteur_update_parent_id.py b/data/models/changes/acteur_update_parent_id.py index b12421375..cfdfd4dcc 100644 --- a/data/models/changes/acteur_update_parent_id.py +++ 
b/data/models/changes/acteur_update_parent_id.py @@ -11,18 +11,20 @@ def name(cls) -> str: def validate(self): # - The acteur MUST exist in base - Acteur.objects.get(pk=self.id) + return Acteur.objects.get(pk=self.id) # - It's OK for acteur to not be in revision # - Can't test if parent exists as maybe it's to be created def apply(self): - self.validate() + base = self.validate() # By the time we apply changes to update parent_ids, the # corresponding parents must exist parent = RevisionActeur.objects.get(pk=self.data["parent_id"]) rev = RevisionActeur.objects.filter(pk=self.id) if not rev.exists(): - rev = RevisionActeur(identifiant_unique=self.id) + rev = RevisionActeur( + identifiant_unique=self.id, acteur_type=base.acteur_type + ) else: rev = rev.first() rev.parent = parent # type: ignore diff --git a/dbt/models/marts/enrich/marts_enrich_acteurs_closed_candidates.sql b/dbt/models/marts/enrich/marts_enrich_acteurs_closed_candidates.sql index a10b50d16..8745ebd55 100644 --- a/dbt/models/marts/enrich/marts_enrich_acteurs_closed_candidates.sql +++ b/dbt/models/marts/enrich/marts_enrich_acteurs_closed_candidates.sql @@ -14,14 +14,12 @@ Notes: -- Starting from our acteurs we can match via SIRET WITH acteurs_with_siret AS ( SELECT - -- Common columns - LEFT(siret,9) AS siren, - siret, - -- Acteur columns + identifiant_unique AS acteur_id, + siret AS acteur_siret, + LEFT(siret,9) AS acteur_siren, nom AS acteur_nom, udf_normalize_string_for_match(nom) AS acteur_nom_normalise, - identifiant_unique AS acteur_id, commentaires AS acteur_commentaires, statut AS acteur_statut, acteur_type_id, @@ -30,7 +28,8 @@ WITH acteurs_with_siret AS ( source_code AS acteur_source_code, adresse AS acteur_adresse, code_postal AS acteur_code_postal, - ville AS acteur_ville + ville AS acteur_ville, + location AS acteur_location FROM {{ ref('marts_carte_acteur') }} WHERE siret IS NOT NULL AND siret != '' AND LENGTH(siret) = 14 @@ -40,12 +39,10 @@ not on unite closed (NOT unite_est_actif) because open unite might bring potential replacements */ etab_closed_candidates AS ( SELECT - -- Common columns (need to specify to avoid ambiguity) - acteurs.siren, - acteurs.siret, - -- acteurs acteurs.acteur_id, + acteurs.acteur_siret, + acteurs.acteur_siren, acteurs.acteur_type_id, acteurs.acteur_type_code, acteurs.acteur_source_id, @@ -57,6 +54,8 @@ SELECT acteurs.acteur_adresse, acteurs.acteur_code_postal, acteurs.acteur_ville, + CASE WHEN acteurs.acteur_location IS NULL THEN NULL ELSE ST_X(acteurs.acteur_location) END AS acteur_longitude, + CASE WHEN acteurs.acteur_location IS NULL THEN NULL ELSE ST_Y(acteurs.acteur_location) END AS acteur_latitude, -- etablissement etab.unite_est_actif AS unite_est_actif, @@ -68,7 +67,7 @@ SELECT etab.naf AS etab_naf FROM acteurs_with_siret AS acteurs -JOIN {{ ref('int_ae_etablissement') }} AS etab ON acteurs.siret = etab.siret +JOIN {{ ref('int_ae_etablissement') }} AS etab ON acteurs.acteur_siret = etab.siret WHERE etab.est_actif IS FALSE ) diff --git a/dbt/models/marts/enrich/marts_enrich_acteurs_closed_replaced.sql b/dbt/models/marts/enrich/marts_enrich_acteurs_closed_replaced.sql index 3784becfd..050842253 100644 --- a/dbt/models/marts/enrich/marts_enrich_acteurs_closed_replaced.sql +++ b/dbt/models/marts/enrich/marts_enrich_acteurs_closed_replaced.sql @@ -8,28 +8,17 @@ WITH potential_replacements AS ( SELECT - -- Candidates - candidates.acteur_id AS acteur_id, - candidates.acteur_type_id AS acteur_type_id, - candidates.acteur_type_code AS acteur_type_code, - candidates.acteur_source_id 
AS acteur_source_id, - candidates.acteur_source_code AS acteur_source_code, - candidates.acteur_statut AS acteur_statut, - candidates.siret AS acteur_siret, - candidates.acteur_nom, - candidates.acteur_commentaires AS acteur_commentaires, - candidates.acteur_adresse AS acteur_adresse, - candidates.acteur_code_postal AS acteur_code_postal, - candidates.acteur_ville AS acteur_ville, + -- Candidates acteur data + candidates.*, -- Replacements replacements.siret AS remplacer_siret, - LEFT(candidates.siret,9) = LEFT(replacements.siret,9) AS remplacer_siret_is_from_same_siren, + LEFT(candidates.acteur_siret,9) = LEFT(replacements.siret,9) AS remplacer_siret_is_from_same_siren, replacements.nom AS remplacer_nom, - replacements.naf AS naf, - replacements.ville AS ville, - replacements.code_postal AS code_postal, - replacements.adresse AS adresse, + replacements.naf AS remplacer_naf, + replacements.ville AS remplacer_ville, + replacements.code_postal AS remplacer_code_postal, + replacements.adresse AS remplacer_adresse, -- Matching udf_columns_words_in_common_count( @@ -37,11 +26,11 @@ WITH potential_replacements AS ( udf_normalize_string_for_match(replacements.nom) ) AS noms_nombre_mots_commun, ROW_NUMBER() OVER ( - PARTITION BY candidates.siret + PARTITION BY candidates.acteur_siret ORDER BY -- Prioritize replacements from same company CASE - WHEN LEFT(candidates.siret,9) = LEFT(replacements.siret,9) THEN 1 + WHEN LEFT(candidates.acteur_siret,9) = LEFT(replacements.siret,9) THEN 1 ELSE 0 END DESC, -- Then etablissements with more words in common diff --git a/unit_tests/data/models/changes/test_acteur_create_as_child.py b/unit_tests/data/models/changes/test_acteur_create_as_child.py index aadb6ad79..77799f901 100644 --- a/unit_tests/data/models/changes/test_acteur_create_as_child.py +++ b/unit_tests/data/models/changes/test_acteur_create_as_child.py @@ -55,6 +55,7 @@ def test_working(self): ) change.apply() + # Acteur created in base to hold the core data base = Acteur.objects.get(pk="child1") assert base.identifiant_unique == "child1" assert base.nom == "my child1" @@ -63,6 +64,8 @@ def test_working(self): assert base.statut == "ACFIF" assert base.location.x == 1 assert base.location.y == 1 + + # Acteur created in revision to hold the parent reference revision = RevisionActeur.objects.get(pk="child1") assert revision.parent.pk == parent.pk assert revision.parent_reason == "test" diff --git a/unit_tests/data/models/changes/test_acteur_rgpd_anonymize.py b/unit_tests/data/models/changes/test_acteur_rgpd_anonymize.py index 74d7c02fe..0fbf2f91a 100644 --- a/unit_tests/data/models/changes/test_acteur_rgpd_anonymize.py +++ b/unit_tests/data/models/changes/test_acteur_rgpd_anonymize.py @@ -36,7 +36,7 @@ "nom": CHANGE_ANON, "nom_officiel": CHANGE_ANON, "nom_commercial": CHANGE_ANON, - "email": None, # due to email constraint + "email": "", # Consequence of allowing empty strings in DB "telephone": CHANGE_ANON, "adresse": CHANGE_ANON, "adresse_complement": CHANGE_ANON, From 4731a3c61568778c52c75f5275a2fedfed15d554 Mon Sep 17 00:00:00 2001 From: maxcorbeau Date: Mon, 14 Apr 2025 14:39:53 +0200 Subject: [PATCH 19/50] =?UTF-8?q?ajout=20tol=C3=A9rance=20=C3=A9chec?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../enrich_dbt_model_to_suggestions.py | 42 +++++++++++-------- 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/dags/enrich/tasks/business_logic/enrich_dbt_model_to_suggestions.py 
b/dags/enrich/tasks/business_logic/enrich_dbt_model_to_suggestions.py index 0c30b0c11..c4b40df59 100644 --- a/dags/enrich/tasks/business_logic/enrich_dbt_model_to_suggestions.py +++ b/dags/enrich/tasks/business_logic/enrich_dbt_model_to_suggestions.py @@ -6,6 +6,7 @@ parent_id_generate, ) from enrich.config import COHORTS, COLS, Cohort +from utils import logging_utils as log logger = logging.getLogger(__name__) @@ -185,28 +186,30 @@ def enrich_dbt_model_to_suggestions( for _, row in df.iterrows(): row = dict(row) - # ----------------------------------------- - # NOT REPLACED - # ----------------------------------------- - if cohort == COHORTS.CLOSED_NOT_REPLACED: - changes = suggestion_change_prepare_closed_not_replaced(row) - - # ----------------------------------------- - # REPLACED - # ----------------------------------------- - elif cohort in [ - COHORTS.CLOSED_REP_OTHER_SIREN, - COHORTS.CLOSED_REP_SAME_SIREN, - ]: - changes = suggestion_change_prepare_closed_replaced(row) - - else: - raise NotImplementedError(f"Cohorte non implémentée: {cohort=}") + try: + # ----------------------------------------- + # NOT REPLACED + # ----------------------------------------- + if cohort == COHORTS.CLOSED_NOT_REPLACED: + changes = suggestion_change_prepare_closed_not_replaced(row) + + # ----------------------------------------- + # REPLACED + # ----------------------------------------- + elif cohort in [ + COHORTS.CLOSED_REP_OTHER_SIREN, + COHORTS.CLOSED_REP_SAME_SIREN, + ]: + changes = suggestion_change_prepare_closed_replaced(row) + + except Exception as e: + log.preview("🔴 Suggestion problématique", row) + logger.error(f"Erreur de préparation des changements: {e}") + continue # Creating a suggestion with the given changes suggestions.append( { - # TODO: free format thanks to recursive model "contexte": {}, "suggestion": { "title": cohort.label, @@ -216,6 +219,9 @@ def enrich_dbt_model_to_suggestions( } ) + if not suggestions: + raise ValueError("Aucune suggestion à écrire, pas normal") + # ----------------------------------------- # DRY RUN: STOP HERE # ----------------------------------------- From d8921121fac4f36732ba5dce7c03412a693d86ca Mon Sep 17 00:00:00 2001 From: maxcorbeau Date: Mon, 14 Apr 2025 16:17:27 +0200 Subject: [PATCH 20/50] udfs: norma exclusion petits mots, cleanup --- dbt/macros/udf/udf_ae_string_cleanup.sql | 2 +- dbt/macros/udf/udf_columns_concat_unique_non_empty.sql | 2 +- dbt/macros/udf/udf_columns_words_in_common_count.sql | 2 +- dbt/macros/udf/udf_encode_base57.sql | 2 +- dbt/macros/udf/udf_normalize_string_alpha_for_match.sql | 7 ++++--- dbt/macros/udf/udf_safe_divmod.sql | 2 +- dbt/macros/udf/udf_uuid_to_int.sql | 2 +- 7 files changed, 10 insertions(+), 9 deletions(-) diff --git a/dbt/macros/udf/udf_ae_string_cleanup.sql b/dbt/macros/udf/udf_ae_string_cleanup.sql index 2608580f9..ddd65e003 100644 --- a/dbt/macros/udf/udf_ae_string_cleanup.sql +++ b/dbt/macros/udf/udf_ae_string_cleanup.sql @@ -6,7 +6,7 @@ to NULL for easier processing whenever we consider it to be empty. 
*/ -DROP FUNCTION IF EXISTS {{ target.schema }}.udf_ae_string_cleanup(TEXT) CASCADE; +DROP FUNCTION IF EXISTS {{ target.schema }}.udf_ae_string_cleanup CASCADE; CREATE FUNCTION {{ target.schema }}.udf_ae_string_cleanup(val TEXT) RETURNS TEXT AS $$ BEGIN diff --git a/dbt/macros/udf/udf_columns_concat_unique_non_empty.sql b/dbt/macros/udf/udf_columns_concat_unique_non_empty.sql index 6bc2b34b2..a13f7acee 100644 --- a/dbt/macros/udf/udf_columns_concat_unique_non_empty.sql +++ b/dbt/macros/udf/udf_columns_concat_unique_non_empty.sql @@ -2,7 +2,7 @@ /* Concatenate strings from various columns while only retaining non-empty values */ -DROP FUNCTION IF EXISTS {{ target.schema }}.udf_columns_concat_unique_non_empty(VARIADIC input_columns TEXT[]) CASCADE; +DROP FUNCTION IF EXISTS {{ target.schema }}.udf_columns_concat_unique_non_empty CASCADE; CREATE FUNCTION {{ target.schema }}.udf_columns_concat_unique_non_empty(VARIADIC input_columns TEXT[]) RETURNS TEXT AS $$ DECLARE diff --git a/dbt/macros/udf/udf_columns_words_in_common_count.sql b/dbt/macros/udf/udf_columns_words_in_common_count.sql index e2f1e6afc..07534a8ef 100644 --- a/dbt/macros/udf/udf_columns_words_in_common_count.sql +++ b/dbt/macros/udf/udf_columns_words_in_common_count.sql @@ -2,7 +2,7 @@ /* Count number of words in common between 2 columns */ -DROP FUNCTION IF EXISTS {{ target.schema }}.udf_columns_words_in_common_count(text, text) CASCADE; +DROP FUNCTION IF EXISTS {{ target.schema }}.udf_columns_words_in_common_count CASCADE; CREATE FUNCTION {{ target.schema }}.udf_columns_words_in_common_count(col1 text, col2 text) RETURNS integer AS $$ DECLARE diff --git a/dbt/macros/udf/udf_encode_base57.sql b/dbt/macros/udf/udf_encode_base57.sql index a6756eafc..58163b64f 100644 --- a/dbt/macros/udf/udf_encode_base57.sql +++ b/dbt/macros/udf/udf_encode_base57.sql @@ -1,6 +1,6 @@ {% macro create_udf_encode_base57() %} -DROP FUNCTION IF EXISTS {{ target.schema }}.encode_base57(uuid) CASCADE; +DROP FUNCTION IF EXISTS {{ target.schema }}.encode_base57 CASCADE; CREATE FUNCTION {{ target.schema }}.encode_base57(uuid UUID) RETURNS text AS $$ DECLARE diff --git a/dbt/macros/udf/udf_normalize_string_alpha_for_match.sql b/dbt/macros/udf/udf_normalize_string_alpha_for_match.sql index 0bd1c3c05..44a6d10ed 100644 --- a/dbt/macros/udf/udf_normalize_string_alpha_for_match.sql +++ b/dbt/macros/udf/udf_normalize_string_alpha_for_match.sql @@ -9,8 +9,8 @@ E.g. to test this function: SELECT udf_normalize_string_for_match(' Héllo-Wørld! Ça va? 
123 '); */ -DROP FUNCTION IF EXISTS {{ target.schema }}.udf_normalize_string_for_match(TEXT) CASCADE; -CREATE FUNCTION {{ target.schema }}.udf_normalize_string_for_match(input_text TEXT) RETURNS TEXT AS $$ +DROP FUNCTION IF EXISTS {{ target.schema }}.udf_normalize_string_for_match CASCADE; +CREATE FUNCTION {{ target.schema }}.udf_normalize_string_for_match(input_text TEXT, remove_words_smaller_size INTEGER DEFAULT 2) RETURNS TEXT AS $$ DECLARE normalized TEXT; words TEXT[]; @@ -28,7 +28,8 @@ BEGIN FROM ( SELECT unnest(words) AS word ORDER BY word - ) AS words_sorted; + ) AS words_sorted + WHERE length(word) >= remove_words_smaller_size; RETURN normalized; END; diff --git a/dbt/macros/udf/udf_safe_divmod.sql b/dbt/macros/udf/udf_safe_divmod.sql index 3c36d20f5..a782562fd 100644 --- a/dbt/macros/udf/udf_safe_divmod.sql +++ b/dbt/macros/udf/udf_safe_divmod.sql @@ -1,5 +1,5 @@ {% macro create_udf_safe_divmod() %} -DROP FUNCTION IF EXISTS {{ target.schema }}.safe_divmod(numeric, numeric) CASCADE; +DROP FUNCTION IF EXISTS {{ target.schema }}.safe_divmod CASCADE; CREATE FUNCTION {{ target.schema }}.safe_divmod(n numeric, d numeric) RETURNS TABLE(quotient numeric, remainder numeric) AS $$ DECLARE diff --git a/dbt/macros/udf/udf_uuid_to_int.sql b/dbt/macros/udf/udf_uuid_to_int.sql index 45a437172..6bff1d6ba 100644 --- a/dbt/macros/udf/udf_uuid_to_int.sql +++ b/dbt/macros/udf/udf_uuid_to_int.sql @@ -1,5 +1,5 @@ {% macro create_udf_uuid_to_int() %} -DROP FUNCTION IF EXISTS {{ target.schema }}.uuid_to_int(uuid) CASCADE; +DROP FUNCTION IF EXISTS {{ target.schema }}.uuid_to_int CASCADE; CREATE FUNCTION {{ target.schema }}.uuid_to_int(uuid UUID) RETURNS numeric AS $$ DECLARE From 658f31fea2261968a72b6dc2eda537744a2b00a1 Mon Sep 17 00:00:00 2001 From: maxcorbeau Date: Wed, 16 Apr 2025 10:15:55 +0200 Subject: [PATCH 21/50] =?UTF-8?q?fin=20de=20r=C3=A9solution=20de=20conflit?= =?UTF-8?q?s?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- dags/enrich/dags/enrich_acteurs_closed.py | 2 +- data/models/changes/acteur_update_data.py | 8 -------- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/dags/enrich/dags/enrich_acteurs_closed.py b/dags/enrich/dags/enrich_acteurs_closed.py index fa3752321..24397914e 100644 --- a/dags/enrich/dags/enrich_acteurs_closed.py +++ b/dags/enrich/dags/enrich_acteurs_closed.py @@ -38,7 +38,7 @@ tags=["annuaire", "entreprises", "ae", "siren", "siret", "acteurs", "fermés"], schedule=SCHEDULES.NONE, catchup=CATCHUPS.AWLAYS_FALSE, - start_date=START_DATES.FOR_SCHEDULE_NONE, + start_date=START_DATES.YESTERDAY, params=config_to_airflow_params( EnrichActeursClosedConfig( dbt_models_refresh=False, diff --git a/data/models/changes/acteur_update_data.py b/data/models/changes/acteur_update_data.py index 39837b2e4..785bfe4e3 100644 --- a/data/models/changes/acteur_update_data.py +++ b/data/models/changes/acteur_update_data.py @@ -2,14 +2,6 @@ is very specific (e.g. 
RGPD), use a dedicated model fore more clarity/robustness, else you can use this model.""" -<<<<<<< HEAD -from rich import print - -======= -from dags.cluster.tasks.business_logic.misc.data_serialize_reconstruct import ( - data_reconstruct, -) ->>>>>>> b8c92032 (suggestions: tests qui fonctionnent) from data.models.changes.acteur_abstract import ChangeActeurAbstract from data.models.changes.utils import data_reconstruct from qfdmo.models import Acteur, RevisionActeur From df8a074fd4385472c81a4a0810d32a27eb1cc904 Mon Sep 17 00:00:00 2001 From: maxcorbeau Date: Wed, 16 Apr 2025 10:30:57 +0200 Subject: [PATCH 22/50] suppression code RGPD --- dags/enrich/config/dbt.py | 1 - dags/enrich/config/tasks.py | 7 - dags/enrich/dags/enrich_acteurs_rgpd.py | 71 ---------- .../enrich_ae_rgpd_match_task.py | 50 -------- .../airflow_logic/enrich_ae_rgpd_read_task.py | 51 -------- .../enrich_ae_rgpd_suggest_task.py | 51 -------- .../business_logic/enrich_ae_rgpd_match.py | 75 ----------- .../business_logic/enrich_ae_rgpd_read.py | 46 ------- .../business_logic/enrich_ae_rgpd_suggest.py | 100 --------------- .../enrich/tasks/test_enrich_ae_rgpd_read.py | 49 ------- .../tasks/test_enrich_ae_rgpd_suggest.py | 116 ----------------- data/models/changes/acteur_rgpd_anonymize.py | 62 --------- .../marts/enrich/marts_enrich_ae_rgpd.sql | 74 ----------- dbt/models/marts/enrich/schema.yml | 43 ------- .../changes/test_acteur_rgpd_anonymize.py | 121 ------------------ 15 files changed, 917 deletions(-) delete mode 100644 dags/enrich/dags/enrich_acteurs_rgpd.py delete mode 100644 dags/enrich/tasks/airflow_logic/enrich_ae_rgpd_match_task.py delete mode 100644 dags/enrich/tasks/airflow_logic/enrich_ae_rgpd_read_task.py delete mode 100644 dags/enrich/tasks/airflow_logic/enrich_ae_rgpd_suggest_task.py delete mode 100644 dags/enrich/tasks/business_logic/enrich_ae_rgpd_match.py delete mode 100644 dags/enrich/tasks/business_logic/enrich_ae_rgpd_read.py delete mode 100644 dags/enrich/tasks/business_logic/enrich_ae_rgpd_suggest.py delete mode 100644 dags_unit_tests/enrich/tasks/test_enrich_ae_rgpd_read.py delete mode 100644 dags_unit_tests/enrich/tasks/test_enrich_ae_rgpd_suggest.py delete mode 100644 data/models/changes/acteur_rgpd_anonymize.py delete mode 100644 dbt/models/marts/enrich/marts_enrich_ae_rgpd.sql delete mode 100644 unit_tests/data/models/changes/test_acteur_rgpd_anonymize.py diff --git a/dags/enrich/config/dbt.py b/dags/enrich/config/dbt.py index 1ae960ae9..44e3af97e 100644 --- a/dags/enrich/config/dbt.py +++ b/dags/enrich/config/dbt.py @@ -5,7 +5,6 @@ @dataclass(frozen=True) class DBT: - MARTS_ENRICH_AE_RGPD: str = "marts_enrich_ae_rgpd" MARTS_ENRICH_AE_CLOSED_CANDIDATES: str = "marts_enrich_acteurs_closed_candidates" MARTS_ENRICH_AE_CLOSED_REPLACED_SAME_SIREN: str = ( "marts_enrich_acteurs_closed_suggest_replaced_same_siren" diff --git a/dags/enrich/config/tasks.py b/dags/enrich/config/tasks.py index dd152e865..0b1266be6 100644 --- a/dags/enrich/config/tasks.py +++ b/dags/enrich/config/tasks.py @@ -9,7 +9,6 @@ class TASKS: CONFIG_CREATE: str = "enrich_config_create" # Read tasks - READ_AE_RGPD: str = "enrich_ae_rgpd_read" ENRICH_CLOSED_REPLACED_SAME_SIREN: str = "enrich_acteurs_closed_replaced_same_siren" ENRICH_CLOSED_REPLACED_OTHER_SIREN: str = ( "enrich_acteurs_closed_replaced_other_siren" @@ -25,9 +24,3 @@ class TASKS: "enrich_acteurs_closed_suggestions_not_replaced" ) ENRICH_DBT_MODELS_REFRESH: str = "enrich_dbt_models_refresh" - - # Matching tasks - MATCH_SCORE: str = "enrich_ae_rgpd_match" - - # 
Suggestion tasks - SUGGEST_AE_RGPD: str = "enrich_ae_rgpd_suggest" diff --git a/dags/enrich/dags/enrich_acteurs_rgpd.py b/dags/enrich/dags/enrich_acteurs_rgpd.py deleted file mode 100644 index 4265305fa..000000000 --- a/dags/enrich/dags/enrich_acteurs_rgpd.py +++ /dev/null @@ -1,71 +0,0 @@ -""" -DAG to anonymize QFDMO acteur which names -contains people from Annuaire Entreprise (AE) -""" - -from datetime import datetime - -from airflow import DAG -from airflow.models.baseoperator import chain -from airflow.models.param import Param -from enrich.config import COLS -from enrich.tasks.airflow_logic.enrich_ae_rgpd_match_task import ( - enrich_ae_rgpd_match_task, -) -from enrich.tasks.airflow_logic.enrich_ae_rgpd_read_task import ( - enrich_ae_rgpd_read_task, -) -from enrich.tasks.airflow_logic.enrich_ae_rgpd_suggest_task import ( - enrich_ae_rgpd_suggest_task, -) - -with DAG( - dag_id="enrich_ae_acteurs_rgpd", - dag_display_name="Enrichir - AE - Acteurs RGPD", - default_args={ - "owner": "airflow", - "depends_on_past": False, - "start_date": datetime(2025, 3, 5), - "catchup": False, - "email_on_failure": False, - "email_on_retry": False, - "retries": 0, - }, - description=( - "Un DAG pour anonymiser les acteurs QFDMO dont" - "le nom contient des personnes de l'Annuaire Entreprise (AE)" - ), - tags=["enrich", "annuaire", "entreprise", "siren", "ae", "acteurs"], - params={ - "dry_run": Param( - True, - type="boolean", - description_md="🚱 Si coché, aucune tâche d'écriture ne sera effectuée", - ), - "filter_comments_contain": Param( - "", - type=["null", "string"], - description_md="🔍 Filtre sur les commentaires pour la lecture des données", - ), - COLS.MATCH_SCORE: Param( - 1, - type="number", - minimum=0.5, - maximum=1, - description_md=r"""🎯 Seuil de match pour considérer un acteur - anonymisable: - - **match** = ratio du nombre de mots du nom de l'acteur qui correspondent - à des mots de nom/prénom des personnes de l'AE - - **minimum** = 0.5 - - **maximum** = 1 - """, - ), - }, - schedule=None, - catchup=False, -) as dag: - chain( - enrich_ae_rgpd_read_task(dag), - enrich_ae_rgpd_match_task(dag), - enrich_ae_rgpd_suggest_task(dag), - ) diff --git a/dags/enrich/tasks/airflow_logic/enrich_ae_rgpd_match_task.py b/dags/enrich/tasks/airflow_logic/enrich_ae_rgpd_match_task.py deleted file mode 100644 index 9eaa636f4..000000000 --- a/dags/enrich/tasks/airflow_logic/enrich_ae_rgpd_match_task.py +++ /dev/null @@ -1,50 +0,0 @@ -"""Match acteurs from QFDMO vs. 
AE based on people names""" - -import logging - -from airflow import DAG -from airflow.exceptions import AirflowSkipException -from airflow.operators.python import PythonOperator -from enrich.config import COLS, TASKS, XCOMS -from enrich.tasks.business_logic.enrich_ae_rgpd_match import ( - enrich_ae_rgpd_match, -) - -logger = logging.getLogger(__name__) - - -def task_info_get(): - return f""" - ============================================================ - Description de la tâche "{TASKS.MATCH_SCORE}" - ============================================================ - 💡 quoi: on cherche à déterminer quels acteurs QFDMO ont un - nom qui correspond à des noms de personnes dans l'AE - - 🎯 pourquoi: le but de ce DAG: pouvoir par la suite anonymiser - - 🏗️ comment: normalisation puis matching python sur la base - du ratio de mots dans le nom de l'acteur qui matchent avec des - noms/prénoms de personnes dans l'AE - """ - - -def enrich_ae_rgpd_match_wrapper(ti, params) -> None: - logger.info(task_info_get()) - - df = enrich_ae_rgpd_match( - df=ti.xcom_pull(key=XCOMS.DF_READ), - match_threshold=params[COLS.MATCH_SCORE], - ) - if df.empty: - raise AirflowSkipException("Pas de matches, on s'arrête là") - - ti.xcom_push(key=XCOMS.DF_MATCH, value=df) - - -def enrich_ae_rgpd_match_task(dag: DAG) -> PythonOperator: - return PythonOperator( - task_id=TASKS.MATCH_SCORE, - python_callable=enrich_ae_rgpd_match_wrapper, - dag=dag, - ) diff --git a/dags/enrich/tasks/airflow_logic/enrich_ae_rgpd_read_task.py b/dags/enrich/tasks/airflow_logic/enrich_ae_rgpd_read_task.py deleted file mode 100644 index 426317042..000000000 --- a/dags/enrich/tasks/airflow_logic/enrich_ae_rgpd_read_task.py +++ /dev/null @@ -1,51 +0,0 @@ -"""Read data from DB needed for RGPD anonymization""" - -import logging - -from airflow import DAG -from airflow.exceptions import AirflowSkipException -from airflow.operators.python import PythonOperator -from enrich.config import DBT, TASKS, XCOMS -from enrich.tasks.business_logic.enrich_ae_rgpd_read import ( - enrich_ae_rgpd_read, -) - -logger = logging.getLogger(__name__) - - -def task_info_get(): - return f""" - ============================================================ - Description de la tâche "{TASKS.READ_AE_RGPD}" - ============================================================ - 💡 quoi: lecture des données via le modèle DBT - {DBT.MARTS_ENRICH_AE_RGPD} - - 🎯 pourquoi: faire un pré-filtre sur les matches potentiels - (pas récupérer les ~27M de lignes de la table AE unite_legale) - - 🏗️ comment: on récupère uniquement les matches SIREN avec - des infos de noms/prénoms dans l'AE en passant par de la normalisation - de chaines de caractères - """ - - -def enrich_ae_rgpd_read_wrapper(ti, params) -> None: - logger.info(task_info_get()) - - df = enrich_ae_rgpd_read( - dbt_model_name=DBT.MARTS_ENRICH_AE_RGPD, - filter_comments_contain=params["filter_comments_contain"], - ) - if df.empty: - raise AirflowSkipException("Pas de données DB, on s'arrête là") - - ti.xcom_push(key=XCOMS.DF_READ, value=df) - - -def enrich_ae_rgpd_read_task(dag: DAG) -> PythonOperator: - return PythonOperator( - task_id=TASKS.READ_AE_RGPD, - python_callable=enrich_ae_rgpd_read_wrapper, - dag=dag, - ) diff --git a/dags/enrich/tasks/airflow_logic/enrich_ae_rgpd_suggest_task.py b/dags/enrich/tasks/airflow_logic/enrich_ae_rgpd_suggest_task.py deleted file mode 100644 index ba3c47c29..000000000 --- a/dags/enrich/tasks/airflow_logic/enrich_ae_rgpd_suggest_task.py +++ /dev/null @@ -1,51 +0,0 @@ -"""Match acteurs from QFDMO vs. 
AE based on people names""" - -import logging - -from airflow import DAG -from airflow.exceptions import AirflowSkipException -from airflow.operators.python import PythonOperator -from enrich.config import COLS, TASKS, XCOMS -from enrich.tasks.business_logic.enrich_ae_rgpd_suggest import ( - enrich_ae_rgpd_suggest, -) - -logger = logging.getLogger(__name__) - - -def task_info_get(): - return f""" - ============================================================ - Description de la tâche "{TASKS.SUGGEST_AE_RGPD}" - ============================================================ - 💡 quoi: on cherche à déterminer quels acteurs QFDMO ont un - nom qui correspond à des noms de personnes dans l'AE - - 🎯 pourquoi: le but de ce DAG: pouvoir par la suite anonymiser - - 🏗️ comment: normalisation puis suggesting python sur la base - du ratio de mots dans le nom de l'acteur qui suggestent avec des - noms/prénoms de personnes dans l'AE - """ - - -def enrich_ae_rgpd_suggest_wrapper(ti, params, dag, run_id) -> None: - logger.info(task_info_get()) - - enrich_ae_rgpd_suggest( - df=ti.xcom_pull(key=XCOMS.DF_MATCH), - identifiant_action=dag.dag_id, - identifiant_execution=run_id, - dry_run=params[COLS.DRY_RUN], - ) - # Flagging as skipped at the end to help read status in Airflow UI - if params[COLS.DRY_RUN]: - raise AirflowSkipException("Pas de données DB, on s'arrête là") - - -def enrich_ae_rgpd_suggest_task(dag: DAG) -> PythonOperator: - return PythonOperator( - task_id=TASKS.SUGGEST_AE_RGPD, - python_callable=enrich_ae_rgpd_suggest_wrapper, - dag=dag, - ) diff --git a/dags/enrich/tasks/business_logic/enrich_ae_rgpd_match.py b/dags/enrich/tasks/business_logic/enrich_ae_rgpd_match.py deleted file mode 100644 index 75d73045e..000000000 --- a/dags/enrich/tasks/business_logic/enrich_ae_rgpd_match.py +++ /dev/null @@ -1,75 +0,0 @@ -"""Match acteurs from QFDMO vs. 
AE based on people names""" - -import pandas as pd -from enrich.config import COLS -from shared.tasks.business_logic import normalize -from utils import logging_utils as log - - -def word_overlap_ratio( - row: pd.Series, cols_a: list, cols_b: list -) -> tuple[list[str], float]: - # Gather words from columns in cols_a - words_a = set() - for col in cols_a: - if row[col] is not None: - words_a.update(str(row[col]).split()) - - # Gather words from columns in cols_b - words_b = set() - for col in cols_b: - if row[col] is not None: - words_b.update(str(row[col]).split()) - - # Avoid division by zero - if not words_a: - return ([], 0.0) - - words_matched = [word for word in words_a if word in words_b] - words_count = len(words_matched) - ratio = words_count / len(words_a) - if ratio > 1: - raise ValueError(f"ratio > 1 {ratio}: {words_a} - {words_b}") - return (words_matched, ratio) - - -def enrich_ae_rgpd_match( - df: pd.DataFrame, - match_threshold: float, -) -> pd.DataFrame: - """Identify matches between QFDMO company names and AE's people names.""" - if df.empty: - raise ValueError("df vide, on devrait pas être là") - if match_threshold < 0.5 or match_threshold > 1: - raise ValueError(f"match_threshold invalide: {match_threshold}") - - df = df.copy() - - # Matching columns - cols_names_qfdmo = [COLS.ACTEUR_NOMS_NORMALISES] - cols_names_ae = [COLS.AE_DIRIGEANTS_NOMS] - - # Normalization - cols_to_norm = cols_names_qfdmo + cols_names_ae - for col in cols_to_norm: - df[col] = df[col].map(normalize.string_basic) - - # Matching - df["temp"] = df.apply( - lambda x: word_overlap_ratio(x, cols_names_qfdmo, cols_names_ae), axis=1 - ) - df[COLS.MATCH_WORDS] = df["temp"].apply(lambda x: x[0]) - df[COLS.MATCH_SCORE] = df["temp"].apply(lambda x: x[1]) - df.drop(columns=["temp"], inplace=True) - - # Selecting & previewing matches - df_no_match = df[df[COLS.MATCH_SCORE] == 0] - df_partial = df[(df[COLS.MATCH_SCORE] > 0) & (df[COLS.MATCH_SCORE] < 1)] - df_perfect = df[df[COLS.MATCH_SCORE] == 1] - df_retained = df[df[COLS.MATCH_SCORE] >= match_threshold].copy() - log.preview_df_as_markdown("🔴 Matches non-existant (==0)", df_no_match) - log.preview_df_as_markdown("🟡 Matches partiel (>0 & <1)", df_partial) - log.preview_df_as_markdown("🟢 Matches parfait (==1)", df_perfect) - log.preview_df_as_markdown(f"💾 Matches retenus (>={match_threshold})", df_retained) - - return df_retained.sort_values(COLS.MATCH_SCORE, ascending=False) diff --git a/dags/enrich/tasks/business_logic/enrich_ae_rgpd_read.py b/dags/enrich/tasks/business_logic/enrich_ae_rgpd_read.py deleted file mode 100644 index 3c9bfee4e..000000000 --- a/dags/enrich/tasks/business_logic/enrich_ae_rgpd_read.py +++ /dev/null @@ -1,46 +0,0 @@ -"""Read data from DB needed for RGPD anonymization""" - -import logging - -import numpy as np -import pandas as pd -from enrich.config import COLS -from utils import logging_utils as log -from utils.django import django_setup_full - -django_setup_full() - -logger = logging.getLogger(__name__) - - -def enrich_ae_rgpd_read( - dbt_model_name: str, filter_comments_contain: str = "" -) -> pd.DataFrame: - """Reads necessary QFDMO acteurs and AE entries from DB""" - from django.db import connection - - # Execute SQL query and get data - with connection.cursor() as cursor: - cursor.execute(f"SELECT * FROM {dbt_model_name}") - columns = [col[0] for col in cursor.description] - data = cursor.fetchall() - - # Create DataFrame and preview - df = pd.DataFrame(data, columns=columns, dtype="object").replace({np.nan: None}) - 
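    # Worked example of the filter below, reusing the fixture data from the
    # (also deleted) test_enrich_ae_rgpd_read.py: rows with NULL commentaires
    # are dropped via .notnull(), then the case-insensitive regex keeps
    # matching rows, e.g. with filter_comments_contain="(second|another)":
    #   [None, " ", "This is the first comment.",
    #    "Second comment here.", "Another comment added."]
    #   -> ["Second comment here.", "Another comment added."]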
log.preview_df_as_markdown("Matches acteurs vs. Annuaire Entreprises", df) - - # Filtering if needed - filter = (filter_comments_contain or "").strip() - if not df.empty and filter: - logger.info(f"Filtre sur les commentaires: {filter}") - df = df[df[COLS.ACTEUR_COMMENTAIRES].notnull()].copy() - df = df[ - df[COLS.ACTEUR_COMMENTAIRES].str.contains( - filter, - regex=True, - case=False, - ) - ].copy() - log.preview_df_as_markdown("Matches APRES filtre commentaires", df) - - return df diff --git a/dags/enrich/tasks/business_logic/enrich_ae_rgpd_suggest.py b/dags/enrich/tasks/business_logic/enrich_ae_rgpd_suggest.py deleted file mode 100644 index 612dac088..000000000 --- a/dags/enrich/tasks/business_logic/enrich_ae_rgpd_suggest.py +++ /dev/null @@ -1,100 +0,0 @@ -"""Generate suggestions from matches""" - -import logging -from typing import Any - -import pandas as pd -from enrich.config import COLS -from utils import logging_utils as log -from utils.django import django_setup_full - -django_setup_full() - -logger = logging.getLogger(__name__) - - -# TODO: create a utility + model which helps us generate -# structured & consistent details for generic_suggestion_details.html -def sumline(label: str, value: Any, value_type: str): - return locals() - - -def enrich_ae_rgpd_suggest( - df: pd.DataFrame, - identifiant_action: str, - identifiant_execution: str, - dry_run: bool = True, -) -> list[dict]: - """Generate suggestions from matches""" - from data.models import ( - Suggestion, - SuggestionAction, - SuggestionCohorte, - SuggestionStatut, - ) - from data.models.change import SuggestionChange - from data.models.changes.acteur_rgpd_anonymize import ( - ACTEUR_FIELDS_TO_ANONYMIZE, - ChangeActeurRgpdAnonymize, - ) - - # Prepare suggestions - suggestions = [] - for _, row in df.iterrows(): - changes = [] - - # Preparing & validating the change params - acteur_id = row[COLS.ACTEUR_ID] - model_params = {"id": acteur_id} - ChangeActeurRgpdAnonymize(**model_params).validate() - - # Preparing suggestion with change and ensuring we can JSON serialize it - change = SuggestionChange( - order=1, - reason="Noms/prénoms détectés dans l'Annuaire Entreprise (AE)", - entity_type="acteur_displayed", - model_name=ChangeActeurRgpdAnonymize.name(), - model_params=model_params, - ).model_dump() - changes.append(change) - contexte_changes = ACTEUR_FIELDS_TO_ANONYMIZE.copy() - contexte_changes["commentaires"] = "➕ Ajout mention avec 📆 date & ⏰ heure" - suggestion = { - "contexte": { - "changements": contexte_changes, - }, - "suggestion": { - "title": "🕵️ Anonymisation RGPD", - "summary": [ - sumline("noms d'origine", row[COLS.ACTEUR_NOMS_ORIGINE], "text"), - sumline("mots de match", row[COLS.MATCH_WORDS], "text_list"), - sumline("score de match", row[COLS.MATCH_SCORE], "score_0_to_1"), - sumline("changements", "voir contexte/détails", "text"), - ], - "changes": changes, - }, - } - suggestions.append(suggestion) - log.preview(f"Suggestion pour acteur: {acteur_id}", suggestion) - - # Saving suggestions - logging.info(log.banner_string("✍️ Ecritures en DB")) - if dry_run: - logger.info("✋ Dry run: suggestions pas écrites en base") - else: - cohort = SuggestionCohorte( - identifiant_action=identifiant_action, - identifiant_execution=identifiant_execution, - type_action=SuggestionAction.ENRICH_ACTEURS_RGPD, - metadata={"🔢 Nombre de suggestions": len(suggestions)}, - ) - cohort.save() - for suggestion in suggestions: - Suggestion( - suggestion_cohorte=cohort, - statut=SuggestionStatut.AVALIDER, - 
contexte=suggestion["contexte"], - suggestion=suggestion["suggestion"], - ).save() - - return suggestions diff --git a/dags_unit_tests/enrich/tasks/test_enrich_ae_rgpd_read.py b/dags_unit_tests/enrich/tasks/test_enrich_ae_rgpd_read.py deleted file mode 100644 index 47320dea8..000000000 --- a/dags_unit_tests/enrich/tasks/test_enrich_ae_rgpd_read.py +++ /dev/null @@ -1,49 +0,0 @@ -import pytest - -from dags.enrich.config import COLS -from dags.enrich.tasks.business_logic.enrich_ae_rgpd_read import ( - enrich_ae_rgpd_read, -) - -DBT_MODEL_NAME = "my_dummy_dbt_model" - - -@pytest.mark.django_db -class TestEnrichAeRgpdRead: - - @pytest.fixture - def dbt_model(self): - from django.db import connection - - sql = f"""CREATE TABLE {DBT_MODEL_NAME} ( - {COLS.ACTEUR_COMMENTAIRES} TEXT - ); - - INSERT INTO {DBT_MODEL_NAME} ({COLS.ACTEUR_COMMENTAIRES}) VALUES - (NULL), - (' '), - ('This is the first comment.'), - ('Second comment here.'), - ('Another comment added.');""" - - with connection.cursor() as cursor: - cursor.execute(sql) - - def test_default(self, dbt_model): - df = enrich_ae_rgpd_read(DBT_MODEL_NAME) - assert df[COLS.ACTEUR_COMMENTAIRES].tolist() == [ - None, - " ", - "This is the first comment.", - "Second comment here.", - "Another comment added.", - ] - - def test_filter_supports_insensitive_regex(self, dbt_model): - df = enrich_ae_rgpd_read( - DBT_MODEL_NAME, filter_comments_contain="(second|another)" - ) - assert df[COLS.ACTEUR_COMMENTAIRES].tolist() == [ - "Second comment here.", - "Another comment added.", - ] diff --git a/dags_unit_tests/enrich/tasks/test_enrich_ae_rgpd_suggest.py b/dags_unit_tests/enrich/tasks/test_enrich_ae_rgpd_suggest.py deleted file mode 100644 index 51e7a86e1..000000000 --- a/dags_unit_tests/enrich/tasks/test_enrich_ae_rgpd_suggest.py +++ /dev/null @@ -1,116 +0,0 @@ -import json -import re - -import pandas as pd -import pytest -from django.contrib.gis.geos import Point -from rich import print - -from dags.enrich.config import COLS -from dags.enrich.tasks.business_logic.enrich_ae_rgpd_suggest import ( - enrich_ae_rgpd_suggest, -) - -CHANGE_ANON = "ANONYMISE POUR RAISON RGPD" -COMMENT_PATTERN = CHANGE_ANON + r" le \d{4}-\d{2}-\d{2} à \d{2}:\d{2}:\d{2} UTC" - - -@pytest.mark.django_db -class TestEnrichAeRgpdSuggest: - - @pytest.fixture - def df(self): - return pd.DataFrame( - { - COLS.ACTEUR_ID: ["id1", "id2"], - COLS.ACTEUR_NOMS_ORIGINE: ["acteur1", "acteur2"], - COLS.MATCH_WORDS: ["acteur1", "acteur2"], - COLS.MATCH_SCORE: [1.0, 1.0], - COLS.ACTEUR_SIRET: ["11111111100001", "22222222200001"], - } - ) - - @pytest.fixture - def acteurs(self, df): - from qfdmo.models import Acteur, ActeurType - - at1 = ActeurType(code="at1") - at1.save() - - for _, row in df.iterrows(): - Acteur.objects.create( - # Required fields - identifiant_unique=row[COLS.ACTEUR_ID], - acteur_type=at1, - location=Point(1, 2), - # Fields to anonymize - nom=row[COLS.ACTEUR_NOMS_ORIGINE], - nom_officiel="🟠 not anonymized", - nom_commercial="🟠 not anonymized", - email="me@myself.com", - telephone="🟠 not anonymized", - adresse="🟠 not anonymized", - adresse_complement="🟠 not anonymized", - # Fields to keep as-is - description="🟠 not anonymized", - ) - - @pytest.fixture - def suggestions(self, df, acteurs): - return enrich_ae_rgpd_suggest( - df=df, - identifiant_action="my_action_id", - identifiant_execution="my_execution_id", - dry_run=True, - ) - - @pytest.fixture - def suggest(self, suggestions) -> dict: - suggest = suggestions[0] - print(f"{suggest=}") - return suggest - - def 
test_one_suggestion_per_acteur(self, df, suggestions): - assert len(suggestions) == len(df) - - def test_one_change_per_suggestion(self, suggest): - # 1 change per acteur, we don't group acteurs together - # even if they have identical SIREN or SIRET - assert len(suggest["suggestion"]["changes"]) == 1 - - def test_suggestion_change(self, suggest): - # The changes being sensitive, this test intentionnally - # hardcodes the structure of the suggestion so we need - # to udpate tests with intention when changing the DAG - from data.models.change import SuggestionChange - from qfdmo.models import Acteur, ActeurStatus - - change = suggest["suggestion"]["changes"][0] - assert change["model_name"] == "acteur_rgpd_anonymize" - assert change["model_params"] == {"id": "id1"} - - SuggestionChange(**change).apply() - - acteur = Acteur.objects.get(identifiant_unique="id1") - - # Fields anonymized - assert acteur.nom == "ANONYMISE POUR RAISON RGPD" - assert acteur.nom_officiel == "ANONYMISE POUR RAISON RGPD" - assert acteur.nom_commercial == "ANONYMISE POUR RAISON RGPD" - assert acteur.email == "" - assert acteur.telephone == "ANONYMISE POUR RAISON RGPD" - assert acteur.adresse == "ANONYMISE POUR RAISON RGPD" - assert acteur.adresse_complement == "ANONYMISE POUR RAISON RGPD" - - # Status set to inactif - assert acteur.statut == ActeurStatus.INACTIF - - # Check comment - comments = json.loads(acteur.commentaires) - assert re.match(COMMENT_PATTERN, comments[0]["message"]) - - # Fields not changed - assert acteur.description == "🟠 not anonymized" - assert acteur.location.x == 1 - assert acteur.location.y == 2 - assert acteur.acteur_type.code == "at1" diff --git a/data/models/changes/acteur_rgpd_anonymize.py b/data/models/changes/acteur_rgpd_anonymize.py deleted file mode 100644 index 32663b592..000000000 --- a/data/models/changes/acteur_rgpd_anonymize.py +++ /dev/null @@ -1,62 +0,0 @@ -"""Special change model dedicated to RGPD because: - -- NORMALLY we version data through RevisionActeur - + consequence: we create a Revision if it doesn't exist - -- HOWEVER WITH RGPD we don't do data versioning, we overwrite - the data so it disappears from our DB - = consequence: we don't create a Revision if it doesn't exist - (again we are not versioning, just overwriting) - -Since the approach to RGPD should be consistent, we don't -expect the model to take any other input than the ID of the acteur -we are changing, and the model takes care of the rest -""" - -from datetime import datetime, timezone - -from data.models.changes.acteur_abstract import ChangeActeurAbstract -from qfdmo.models import Acteur, ActeurStatus, RevisionActeur - -VALUE_ANONYMIZED = "ANONYMISE POUR RAISON RGPD" -ACTEUR_FIELDS_TO_ANONYMIZE = { - "nom": VALUE_ANONYMIZED, - "nom_officiel": VALUE_ANONYMIZED, - "nom_commercial": VALUE_ANONYMIZED, - "email": "", # Consequence of allowing empty strings in DB - "telephone": VALUE_ANONYMIZED, - "adresse": VALUE_ANONYMIZED, - "adresse_complement": VALUE_ANONYMIZED, - "statut": ActeurStatus.INACTIF, -} - - -class ChangeActeurRgpdAnonymize(ChangeActeurAbstract): - @classmethod - def name(cls) -> str: - return "acteur_rgpd_anonymize" - - def validate(self) -> list[Acteur | RevisionActeur]: - if self.data: - raise ValueError("Pour RGPD ne pas fournir de data, le modèle efface") - # The parent should already exist in revision or base - # and we return all its instances to overwrite them all - instances = [] - rev = RevisionActeur.objects.filter(pk=self.id).first() - if rev: - instances.append(rev) - 
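# (When a RevisionActeur exists it is appended first; the base Acteur
#  fetched just below is always appended, so apply() overwrites both.)
#
# Hedged usage sketch with a hypothetical identifiant_unique:
#   ChangeActeurRgpdAnonymize(id="acteur-123").apply()
#   # -> anonymizes in place; no RevisionActeur is created for versioning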
instances.append(Acteur.objects.get(pk=self.id)) - return instances - - def apply(self): - # For each instance found - instances = self.validate() - for instance in instances: - # We anonymize the fields - for key, value in ACTEUR_FIELDS_TO_ANONYMIZE.items(): - setattr(instance, key, value) - - # Special case for comments - now = datetime.now(timezone.utc).strftime("le %Y-%m-%d à %H:%M:%S UTC") - instance.commentaires_ajouter(f"{VALUE_ANONYMIZED} {now}") - instance.save() diff --git a/dbt/models/marts/enrich/marts_enrich_ae_rgpd.sql b/dbt/models/marts/enrich/marts_enrich_ae_rgpd.sql deleted file mode 100644 index 4922beaaa..000000000 --- a/dbt/models/marts/enrich/marts_enrich_ae_rgpd.sql +++ /dev/null @@ -1,74 +0,0 @@ -/* -Model to find entries from AE's unite legal which directors names -around found inside our acteurs names. - -Notes: - - 🧹 Pre-matching/filtering at SQL level to reduce data size (13M rows) - - 👁️‍🗨️ Keeping as view to always re-evaluate vs. ever changing QFDMO data -*/ -{{ - config( - materialized = 'view', - tags=['marts', 'ae', 'annuaire_entreprises', 'unite_legale', 'rgpd'], - ) -}} - -WITH acteurs_with_siren AS ( - SELECT - LEFT(siret,9) AS siren, - identifiant_unique AS acteur_id, - TRIM(REGEXP_REPLACE( - CONCAT(nom || ', ' || nom_officiel || ', ' || nom_commercial), - ', , ', - '') - ) AS acteur_noms_origine, - udf_normalize_string_for_match(CONCAT(nom || ' ' || nom_officiel || ' ' || nom_commercial)) AS acteur_noms_normalises, - commentaires AS acteur_commentaires - FROM {{ ref('marts_carte_acteur') }} - /* - We have normalization issues with our SIREN field in our DB - and we obtain better matching by reconstructing SIREN via SIRET - */ - WHERE siret IS NOT NULL AND siret != '' AND LENGTH(siret) = 14 - AND {{ acteur_status_is_active() }} -) -SELECT - -- Common fields - acteurs.siren, - - -- Acteur fields - acteur_id, - acteur_noms_origine, - acteur_noms_normalises, - acteur_commentaires, - - -- Unite legale fields - /* - We don't care which one is which, we aggregate to - reduce data size and we will perform a more precise - post-match in Python - */ - udf_columns_concat_unique_non_empty( - dirigeant_nom, - dirigeant_nom_usage, - dirigeant_pseudonyme, - dirigeant_prenom1, - dirigeant_prenom2, - dirigeant_prenom3, - dirigeant_prenom4 - ) AS ae_dirigeants_noms_prenoms - -FROM {{ ref('int_ae_unite_legale') }} AS unite -LEFT JOIN acteurs_with_siren AS acteurs ON acteurs.siren = unite.siren -WHERE - acteurs.siren IS NOT NULL -- i.e. 
we have a match - AND a_dirigeant_noms_ou_prenoms_non_null -- we have some directors names - AND ( -- Any of the directors names appear in the acteur names - position(dirigeant_nom IN acteur_noms_normalises) > 0 - OR position(dirigeant_nom_usage IN acteur_noms_normalises) > 0 - OR position(dirigeant_pseudonyme IN acteur_noms_normalises) > 0 - OR position(dirigeant_prenom1 IN acteur_noms_normalises) > 0 - OR position(dirigeant_prenom2 IN acteur_noms_normalises) > 0 - OR position(dirigeant_prenom3 IN acteur_noms_normalises) > 0 - OR position(dirigeant_prenom4 IN acteur_noms_normalises) > 0 - ) \ No newline at end of file diff --git a/dbt/models/marts/enrich/schema.yml b/dbt/models/marts/enrich/schema.yml index bf9eaef2b..f076f21ea 100644 --- a/dbt/models/marts/enrich/schema.yml +++ b/dbt/models/marts/enrich/schema.yml @@ -1,49 +1,6 @@ version: 2 models: - - name: marts_enrich_ae_rgpd - description: Unités légales de l'Annuaire Entreprises (AE) préfiltrés - | et prématchés sur la base des noms/prénoms de dirigeants dont au - | moins 1 apparait dans le nom de nos acteurs (le modèle sera ensuite - | utilisé par un DAG Airflow pour faire du matching plus poussé via - | python et soumettre des suggestions) - columns: - - name: siren - description: "Numéro SIREN" - data_type: varchar(9) - data_tests: - - not_null - # Our model is at unite_legale level (no repetition per establishment) - # hence SIREN should be unique. However test failing as of 2025-03-19 - # due to duplicate SIREN in our DB which is potentially OK (i.e. multiple - # acteur locations belonging to the same parent Acteur SIREN) - # - unique - - name: acteur_id - description: Identifiant unique de l'acteur - data_tests: - - not_null - - name: acteur_noms_origine - description: Nom, nom officiel et nom commercial de l'acteur regroupés - data_tests: - - not_null - - name: acteur_noms_normalises - description: Nom, nom officiel et nom commercial de l'acteur regroupés - | & normalisés pour réduire la taille de la table, sachant - | qu'on fait un matching plus poussés avec python par la suite - # Ensuring we are not matching empty strings - data_tests: - - not_null - - name: acteur_commentaires - description: Commentaires de l'acteur pour debug ET si on veut faire - | filtering avec des paramètres de DAG - - name: ae_dirigeants_noms_prenoms - description: Noms & prénoms de tous les dirigeants - | regroupés & normalisés pour réduire la taille de la table, sachant - | qu'on fait un matching plus poussés avec python par la suite - # If we had a match then we must have at least one director's name - data_tests: - - not_null - - name: marts_enrich_acteurs_closed_replaced description: Etablissements de l'Annuaire Entreprises (AE) qui ont été | fermés et remplacés par un autre établissement diff --git a/unit_tests/data/models/changes/test_acteur_rgpd_anonymize.py b/unit_tests/data/models/changes/test_acteur_rgpd_anonymize.py deleted file mode 100644 index 0fbf2f91a..000000000 --- a/unit_tests/data/models/changes/test_acteur_rgpd_anonymize.py +++ /dev/null @@ -1,121 +0,0 @@ -""" -Test file for the ChangeActeurRgpdAnonymize model. 
- -""" - -import json -import re - -import pytest -from django.contrib.gis.geos import Point - -from data.models.changes.acteur_rgpd_anonymize import ( - ChangeActeurRgpdAnonymize, -) -from qfdmo.models.acteur import Acteur, ActeurStatus, ActeurType, RevisionActeur - -TEST_DATA = { - "location": Point(1, 2), - "nom": "🟠 not anonymized", - "nom_officiel": "🟠 not anonymized", - "nom_commercial": "🟠 not anonymized", - "description": "🟠 not anonymized", - "email": "me@myself.com", - "telephone": "🟠 not anonymized", - "adresse": "🟠 not anonymized", - "adresse_complement": "🟠 not anonymized", - "statut": ActeurStatus.ACTIF, - "commentaires": " ", -} - -# Intentionally replicating & hardcoding the expected -# changes to prevent accidental modification to model -# without updating the tests -CHANGE_ANON = "ANONYMISE POUR RAISON RGPD" -CHANGES_EXPECTED = { - "nom": CHANGE_ANON, - "nom_officiel": CHANGE_ANON, - "nom_commercial": CHANGE_ANON, - "email": "", # Consequence of allowing empty strings in DB - "telephone": CHANGE_ANON, - "adresse": CHANGE_ANON, - "adresse_complement": CHANGE_ANON, - "statut": ActeurStatus.INACTIF, -} -COMMENT_PATTERN = CHANGE_ANON + r" le \d{4}-\d{2}-\d{2} à \d{2}:\d{2}:\d{2} UTC" - - -@pytest.mark.django_db -class TestChangeActeurRgpdAnonymize: - def test_name(self): - assert ChangeActeurRgpdAnonymize.name() == "acteur_rgpd_anonymize" - - def test_raise_if_data_provided(self): - change = ChangeActeurRgpdAnonymize(id="dummy", data={"nom": "dummy"}) - with pytest.raises(ValueError, match="Pour RGPD ne pas fournir de data"): - change.apply() - - def test_raise_if_acteur_does_not_exist(self): - change = ChangeActeurRgpdAnonymize(id="dummy") - with pytest.raises(Acteur.DoesNotExist): - change.apply() - - def test_working_only_in_base(self): - # We start by creating acteur only in base - at1 = ActeurType.objects.create(code="at1") - id1 = "id1" - data = TEST_DATA.copy() - data["acteur_type"] = at1 - data["identifiant_unique"] = id1 - Acteur.objects.create(**data) - - # We check that acteur isn't in revision yet - assert RevisionActeur.objects.filter(pk=id1).count() == 0 - - # Since RGPD changes are to owerwrite consistently, we don't - # pass any data to the model, only the ID of the acteur - # and the model takes care of the rest - ChangeActeurRgpdAnonymize(id=id1).apply() - - # We check that no revision was created because we overwrite - # hence don't want Revisions meants for versioning - assert not RevisionActeur.objects.filter(pk=id1).exists() - - # We check that acteur in base was anonymized - base = Acteur.objects.get(pk=id1) - for key, value in CHANGES_EXPECTED.items(): - assert getattr(base, key) == value - - # Comments - comments = json.loads(base.commentaires) - assert re.match(COMMENT_PATTERN, comments[0]["message"]) - - # We check that other fields were not modified - assert base.description == "🟠 not anonymized" - - def test_working_both_base_and_revision(self): - # We start by creating acteur BOTH in base and revision - at1 = ActeurType.objects.create(code="at1") - id2 = "id2" - data = TEST_DATA.copy() - data["acteur_type"] = at1 - data["identifiant_unique"] = id2 - Acteur.objects.create(**data) - RevisionActeur.objects.create(**data) - - # Same remark as previous test on not having to pass data - ChangeActeurRgpdAnonymize(id=id2).apply() - - # In this case we check that all instances were anonymized - instances = [ - Acteur.objects.get(pk=id2), - RevisionActeur.objects.get(pk=id2), - ] - for instance in instances: - for key, value in CHANGES_EXPECTED.items(): - 
assert getattr(instance, key) == value - assert instance.description == "🟠 not anonymized" - - # Comments - comments = json.loads(instance.commentaires) - assert re.match(COMMENT_PATTERN, comments[0]["message"]) From 38036ebf9c0edbc1a325c3044ffd6ba1946785f3 Mon Sep 17 00:00:00 2001 From: maxcorbeau Date: Wed, 16 Apr 2025 10:35:40 +0200 Subject: [PATCH 23/50] cont. suppression RGPD & move data_reconstruct --- .../misc/data_serialize_reconstruct.py | 51 ----------------- data/models/changes/__init__.py | 2 - data/models/changes/acteur_create_as_child.py | 4 +- data/models/changes/utils.py | 55 +++++++++++-------- 4 files changed, 32 insertions(+), 80 deletions(-) diff --git a/dags/cluster/tasks/business_logic/misc/data_serialize_reconstruct.py b/dags/cluster/tasks/business_logic/misc/data_serialize_reconstruct.py index 4584d9589..d3094fe20 100644 --- a/dags/cluster/tasks/business_logic/misc/data_serialize_reconstruct.py +++ b/dags/cluster/tasks/business_logic/misc/data_serialize_reconstruct.py @@ -4,7 +4,6 @@ from datetime import datetime -from django.contrib.gis.geos import Point from django.db import models @@ -50,53 +49,3 @@ def data_serialize(model: type[models.Model], data: dict) -> dict: result[key] = value return result - - -def data_reconstruct(model: type[models.Model], data_src: dict) -> dict: - """ - Reconstruct data ready to use in Django model. - - Args: - - model_class: The Django model class. - - data: The dictionary containing the data to reconstruct. - - Returns: - - An instance of the model with the data populated. - """ - result = {} - data = data_src.copy() - - if "longitude" in data and "latitude" in data: - result["location"] = Point(data.pop("longitude"), data.pop("latitude")) - - for key, value in data.items(): - field = model._meta.get_field(key) - - # We don't try to be fancy with None, it's None - if value is None: - # Same explanation as in data_serialize - if key == "location": - continue - else: - result[key] = value - elif isinstance(field, models.ForeignKey): - # Normalizing to {field} from {field}_id so all fields are - # represented in their Django flavour - if key.endswith("_id"): - try: - key_no_id = key.rstrip("_id") - field = model._meta.get_field(key_no_id) - key = key_no_id - except Exception: - pass - - # Retrieving the related instance if it's not already an instance - if not isinstance(value, field.related_model): # type: ignore - value = field.related_model.objects.get(pk=value) # type: ignore - - result[key] = value - - else: - result[key] = value - - return result diff --git a/data/models/changes/__init__.py b/data/models/changes/__init__.py index ab172c0c0..e4fdc890c 100644 --- a/data/models/changes/__init__.py +++ b/data/models/changes/__init__.py @@ -3,14 +3,12 @@ from .acteur_create_as_parent import ChangeActeurCreateAsParent from .acteur_delete_as_parent import ChangeActeurDeleteAsParent from .acteur_keep_as_parent import ChangeActeurKeepAsParent -from .acteur_rgpd_anonymize import ChangeActeurRgpdAnonymize from .acteur_update_data import ChangeActeurUpdateData from .acteur_update_parent_id import ChangeActeurUpdateParentId from .acteur_verify_in_revision import ChangeActeurVerifyRevision from .sample_model_do_nothing import SampleModelDoNothing CHANGE_MODELS = { - ChangeActeurRgpdAnonymize.name(): ChangeActeurRgpdAnonymize, ChangeActeurUpdateData.name(): ChangeActeurUpdateData, ChangeActeurCreateAsChild.name(): ChangeActeurCreateAsChild, ChangeActeurCreateAsParent.name(): ChangeActeurCreateAsParent, diff --git 
a/data/models/changes/acteur_create_as_child.py b/data/models/changes/acteur_create_as_child.py index 516ae94d6..550946a2a 100644 --- a/data/models/changes/acteur_create_as_child.py +++ b/data/models/changes/acteur_create_as_child.py @@ -1,9 +1,7 @@ from pydantic import BaseModel from rich import print -from dags.cluster.tasks.business_logic.misc.data_serialize_reconstruct import ( - data_reconstruct, -) +from data.models.changes.utils import data_reconstruct class ChangeActeurCreateAsChild(BaseModel): diff --git a/data/models/changes/utils.py b/data/models/changes/utils.py index a593407f1..44c329fb3 100644 --- a/data/models/changes/utils.py +++ b/data/models/changes/utils.py @@ -36,30 +36,37 @@ def data_reconstruct(model: type[models.Model], data_src: dict) -> dict: result = {} data = data_src.copy() - try: - if "longitude" in data and "latitude" in data: - result["location"] = Point(data["longitude"], data["latitude"]) - # so we don't evaluate in below loop - del data["longitude"] - del data["latitude"] - - for key, value in data.items(): - field = model._meta.get_field(key) - - # We don't try to be fancy with None, it's None - if value is None: - # Same explanation as in data_serialize - if key == "location": - continue - else: - result[key] = value - elif isinstance(field, models.ForeignKey): - # If it's a foreign key, fetch the related entity - related_instance = field.related_model.objects.get(pk=value) # type: ignore - result[key] = related_instance + if "longitude" in data and "latitude" in data: + result["location"] = Point(data.pop("longitude"), data.pop("latitude")) + + for key, value in data.items(): + field = model._meta.get_field(key) + + # We don't try to be fancy with None, it's None + if value is None: + # Same explanation as in data_serialize + if key == "location": + continue else: result[key] = value - except Exception as e: - logger.error(f"Error reconstructing for {model.__name__}, {data=}, {e=}") - raise e + elif isinstance(field, models.ForeignKey): + # Normalizing to {field} from {field}_id so all fields are + # represented in their Django flavour + if key.endswith("_id"): + try: + key_no_id = key.rstrip("_id") + field = model._meta.get_field(key_no_id) + key = key_no_id + except Exception: + pass + + # Retrieving the related instance if it's not already an instance + if not isinstance(value, field.related_model): # type: ignore + value = field.related_model.objects.get(pk=value) # type: ignore + + result[key] = value + + else: + result[key] = value + return result From 524821e4fd64d1324e6650664d2f4d58576e0929 Mon Sep 17 00:00:00 2001 From: maxcorbeau Date: Wed, 16 Apr 2025 10:51:48 +0200 Subject: [PATCH 24/50] =?UTF-8?q?recr=C3=A9er=20migration=20django=20+=20f?= =?UTF-8?q?ix=20imports=20cass=C3=A9s=20via=20rebase?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../tasks/test_enrich_acteurs_closed_suggestions.py | 4 ++-- ...re.py => 0141_acteur_siret_is_closed_and_more.py} | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) rename qfdmo/migrations/{0140_acteur_siret_is_closed_and_more.py => 0141_acteur_siret_is_closed_and_more.py} (94%) diff --git a/dags_unit_tests/enrich/tasks/test_enrich_acteurs_closed_suggestions.py b/dags_unit_tests/enrich/tasks/test_enrich_acteurs_closed_suggestions.py index fc7189d9b..4ec8f30be 100644 --- a/dags_unit_tests/enrich/tasks/test_enrich_acteurs_closed_suggestions.py +++ b/dags_unit_tests/enrich/tasks/test_enrich_acteurs_closed_suggestions.py @@ -19,7 +19,7 @@ class 
TestEnrichActeursClosedSuggestions: @pytest.fixture def source(self): - from data.models import Source + from qfdmo.models import Source return Source.objects.create(code="s1") @@ -122,7 +122,7 @@ def acteurs(self, df_not_replaced, df_replaced, atype, source): ) def test_cohorte_not_replaced(self, acteurs, df_not_replaced): - from data.models import Suggestion, SuggestionCohorte + from data.models.suggestion import Suggestion, SuggestionCohorte from qfdmo.models import ActeurStatus, RevisionActeur # Write suggestions to DB diff --git a/qfdmo/migrations/0140_acteur_siret_is_closed_and_more.py b/qfdmo/migrations/0141_acteur_siret_is_closed_and_more.py similarity index 94% rename from qfdmo/migrations/0140_acteur_siret_is_closed_and_more.py rename to qfdmo/migrations/0141_acteur_siret_is_closed_and_more.py index 80e1a8079..0f6bba9bf 100644 --- a/qfdmo/migrations/0140_acteur_siret_is_closed_and_more.py +++ b/qfdmo/migrations/0141_acteur_siret_is_closed_and_more.py @@ -1,4 +1,4 @@ -# Generated by Django 5.1.6 on 2025-04-07 08:33 +# Generated by Django 5.1.6 on 2025-04-16 08:39 from django.db import migrations, models @@ -6,7 +6,7 @@ class Migration(migrations.Migration): dependencies = [ - ("qfdmo", "0139_alter_acteur_not_nullable_char_fields_tel_url_ville"), + ("qfdmo", "0140_drop_views"), ] operations = [ @@ -15,9 +15,9 @@ class Migration(migrations.Migration): name="siret_is_closed", field=models.BooleanField( blank=True, - null=True, default=None, help_text="Indique si le SIRET est fermé ou non dans l'Annuaire Entreprises", + null=True, verbose_name="SIRET fermé", ), ), @@ -26,9 +26,9 @@ class Migration(migrations.Migration): name="siret_is_closed", field=models.BooleanField( blank=True, - null=True, default=None, help_text="Indique si le SIRET est fermé ou non dans l'Annuaire Entreprises", + null=True, verbose_name="SIRET fermé", ), ), @@ -37,9 +37,9 @@ class Migration(migrations.Migration): name="siret_is_closed", field=models.BooleanField( blank=True, - null=True, default=None, help_text="Indique si le SIRET est fermé ou non dans l'Annuaire Entreprises", + null=True, verbose_name="SIRET fermé", ), ), @@ -59,9 +59,9 @@ class Migration(migrations.Migration): name="siret_is_closed", field=models.BooleanField( blank=True, - null=True, default=None, help_text="Indique si le SIRET est fermé ou non dans l'Annuaire Entreprises", + null=True, verbose_name="SIRET fermé", ), ), From 135305e1b97ae2d791e3a1fe80692a87a935a272 Mon Sep 17 00:00:00 2001 From: maxcorbeau Date: Wed, 16 Apr 2025 10:56:59 +0200 Subject: [PATCH 25/50] fix imports + data_serialize en doublon --- .../misc/data_serialize_reconstruct.py | 51 ------------------- .../enrich_dbt_model_to_suggestions.py | 2 +- .../test_enrich_acteurs_closed_suggestions.py | 4 +- 3 files changed, 3 insertions(+), 54 deletions(-) delete mode 100644 dags/cluster/tasks/business_logic/misc/data_serialize_reconstruct.py diff --git a/dags/cluster/tasks/business_logic/misc/data_serialize_reconstruct.py b/dags/cluster/tasks/business_logic/misc/data_serialize_reconstruct.py deleted file mode 100644 index d3094fe20..000000000 --- a/dags/cluster/tasks/business_logic/misc/data_serialize_reconstruct.py +++ /dev/null @@ -1,51 +0,0 @@ -"""Functions to serialize and reconstruct the Django -data as we need to pass it over wire/JSON and thus have -to lose Python/Django objects""" - -from datetime import datetime - -from django.db import models - - -def data_serialize(model: type[models.Model], data: dict) -> dict: - """ - Serialize a dictionary to match the Django 
model structure. - - Args: - - model_class: The Django model class. - - data: The dictionary containing the data to serialize. - - Returns: - - A dictionary with values adjusted to match the model's requirements. - """ - result = {} - - for key, value in data.items(): - field = model._meta.get_field(key) - - # We don't try to be fancy with None, it's None - if value is None: - # Due to clean_location check on Acteur model - # which prevents None if acteur is non-digital - # AND the fact that we can't know for sure whether - # acteur is digital or not, we just skip None locations - # TODO: we need to revamp the validation architecture - # as those if-elses all over the code are not maintainable - if key == "location": - continue - else: - result[key] = value - elif isinstance(field, models.ForeignKey): - if isinstance(value, (str, int)): - result[key] = value - else: - result[key] = value.pk - elif key == "location": - result["longitude"] = data["location"].x - result["latitude"] = data["location"].y - elif isinstance(value, datetime): - result[key] = value.isoformat() - else: - result[key] = value - - return result diff --git a/dags/enrich/tasks/business_logic/enrich_dbt_model_to_suggestions.py b/dags/enrich/tasks/business_logic/enrich_dbt_model_to_suggestions.py index c4b40df59..276225699 100644 --- a/dags/enrich/tasks/business_logic/enrich_dbt_model_to_suggestions.py +++ b/dags/enrich/tasks/business_logic/enrich_dbt_model_to_suggestions.py @@ -165,7 +165,7 @@ def enrich_dbt_model_to_suggestions( identifiant_action: str, dry_run: bool = True, ) -> bool: - from data.models import ( + from data.models.suggestion import ( Suggestion, SuggestionAction, SuggestionCohorte, diff --git a/dags_unit_tests/enrich/tasks/test_enrich_acteurs_closed_suggestions.py b/dags_unit_tests/enrich/tasks/test_enrich_acteurs_closed_suggestions.py index 4ec8f30be..645b1eefd 100644 --- a/dags_unit_tests/enrich/tasks/test_enrich_acteurs_closed_suggestions.py +++ b/dags_unit_tests/enrich/tasks/test_enrich_acteurs_closed_suggestions.py @@ -157,7 +157,7 @@ def test_cohorte_not_replaced(self, acteurs, df_not_replaced): assert a02.siret_is_closed is True def test_cohorte_meme_siren(self, acteurs, atype, source, df_replaced_meme_siret): - from data.models import Suggestion, SuggestionCohorte + from data.models.suggestion import Suggestion, SuggestionCohorte from qfdmo.models import ActeurStatus, RevisionActeur # Write suggestions to DB @@ -201,7 +201,7 @@ def test_cohorte_meme_siren(self, acteurs, atype, source, df_replaced_meme_siret assert child.siret_is_closed is True def test_cohorte_autre_siren(self, acteurs, df_replaced_autre_siret): - from data.models import Suggestion, SuggestionCohorte + from data.models.suggestion import Suggestion, SuggestionCohorte from qfdmo.models import ActeurStatus, RevisionActeur # Write suggestions to DB From 6c3f7547fe875fc224ce663b17b49d86661530fa Mon Sep 17 00:00:00 2001 From: maxcorbeau Date: Wed, 16 Apr 2025 11:11:22 +0200 Subject: [PATCH 26/50] cont. 
del rgpd, fix acteurs model & tests --- dags/enrich/config/columns.py | 4 +--- dags/enrich/config/tasks.py | 2 +- dags/enrich/config/xcoms.py | 4 ---- .../enrich_config_create_task.py | 2 +- .../enrich_dbt_model_suggest_task.py | 2 +- .../enrich_dbt_models_refresh_task.py | 19 ++++++++----------- .../business_logic/enrich_dbt_model_read.py | 2 +- dags/sources/config/shared_constants.py | 1 - data/models/changes/acteur_update_data.py | 3 +-- data/models/suggestion.py | 1 - qfdmo/models/acteur.py | 1 + .../changes/test_acteur_create_as_child.py | 2 +- unit_tests/qfdmo/test_acteur_methods.py | 2 +- 13 files changed, 17 insertions(+), 28 deletions(-) diff --git a/dags/enrich/config/columns.py b/dags/enrich/config/columns.py index 9d9a8073b..9af12db7c 100644 --- a/dags/enrich/config/columns.py +++ b/dags/enrich/config/columns.py @@ -1,6 +1,4 @@ -"""Column names for RGPD anonymize DAG. Columns -are used in conf, dataframes and SQL queries. These -don't include Acteur fields (for this we stick to Acteur models)""" +"""Column names for enrichment DAGs""" from dataclasses import dataclass diff --git a/dags/enrich/config/tasks.py b/dags/enrich/config/tasks.py index 0b1266be6..a56767fa6 100644 --- a/dags/enrich/config/tasks.py +++ b/dags/enrich/config/tasks.py @@ -1,4 +1,4 @@ -"""Task IDs for RGPD anonymize people DAG""" +"""Task IDs for enrichment DAGs""" from dataclasses import dataclass diff --git a/dags/enrich/config/xcoms.py b/dags/enrich/config/xcoms.py index 9b2b0ea6e..eb07c88c2 100644 --- a/dags/enrich/config/xcoms.py +++ b/dags/enrich/config/xcoms.py @@ -34,10 +34,6 @@ def xcom_pull(ti: TaskInstance, key: str, skip_if_empty: bool = False) -> Any: # Reading values if key == XCOMS.CONFIG: value = ti.xcom_pull(key=key, task_ids=TASKS.CONFIG_CREATE) - elif key == XCOMS.DF_READ: - value = ti.xcom_pull(key=key, task_ids=TASKS.READ_AE_RGPD) - elif key == XCOMS.DF_MATCH: - value = ti.xcom_pull(key=key, task_ids=TASKS.MATCH_SCORE) elif key == XCOMS.DF_CLOSED_REPLACED_SAME_SIREN: value = ti.xcom_pull(key=key, task_ids=TASKS.ENRICH_CLOSED_REPLACED_SAME_SIREN) elif key == XCOMS.DF_CLOSED_REPLACED_OTHER_SIREN: diff --git a/dags/enrich/tasks/airflow_logic/enrich_config_create_task.py b/dags/enrich/tasks/airflow_logic/enrich_config_create_task.py index c38c252d5..6e3d6452e 100644 --- a/dags/enrich/tasks/airflow_logic/enrich_config_create_task.py +++ b/dags/enrich/tasks/airflow_logic/enrich_config_create_task.py @@ -1,4 +1,4 @@ -"""Read data from DB needed for RGPD anonymization""" +"""Generic task to create configuration""" import logging diff --git a/dags/enrich/tasks/airflow_logic/enrich_dbt_model_suggest_task.py b/dags/enrich/tasks/airflow_logic/enrich_dbt_model_suggest_task.py index f1e854ddb..04aa2f81f 100644 --- a/dags/enrich/tasks/airflow_logic/enrich_dbt_model_suggest_task.py +++ b/dags/enrich/tasks/airflow_logic/enrich_dbt_model_suggest_task.py @@ -1,4 +1,4 @@ -"""Read data from DB needed for RGPD anonymization""" +"""Generate suggestions for enrichment DAGs""" import logging diff --git a/dags/enrich/tasks/airflow_logic/enrich_dbt_models_refresh_task.py b/dags/enrich/tasks/airflow_logic/enrich_dbt_models_refresh_task.py index b7c6070c4..ff1844695 100644 --- a/dags/enrich/tasks/airflow_logic/enrich_dbt_models_refresh_task.py +++ b/dags/enrich/tasks/airflow_logic/enrich_dbt_models_refresh_task.py @@ -1,4 +1,4 @@ -"""Read data from DB needed for RGPD anonymization""" +"""Refresh DBT models for enrichment DAGs""" import logging @@ -6,33 +6,30 @@ from airflow.exceptions import AirflowSkipException from 
airflow.operators.bash import BashOperator from airflow.operators.python import PythonOperator -from enrich.config import DBT, TASKS, XCOMS, xcom_pull +from enrich.config import TASKS, XCOMS, xcom_pull logger = logging.getLogger(__name__) -def task_info_get(): +def task_info_get(dbt_model_refresh_command: str): return f""" ============================================================ Description de la tâche "{TASKS.ENRICH_DBT_MODELS_REFRESH}" ============================================================ - 💡 quoi: lecture des données via le modèle DBT - {DBT.MARTS_ENRICH_AE_RGPD} + 💡 quoi: rafraichissement des modèles DBT - 🎯 pourquoi: faire un pré-filtre sur les matches potentiels - (pas récupérer les ~27M de lignes de la table AE unite_legale) + 🎯 pourquoi: avoir des suggestions fraiches - 🏗️ comment: on récupère uniquement les matches SIREN avec - des infos de noms/prénoms dans l'AE en passant par de la normalisation - de chaines de caractères + 🏗️ comment: via commande: {dbt_model_refresh_command} """ def enrich_dbt_models_refresh_wrapper(ti) -> None: - logger.info(task_info_get()) # Config config = xcom_pull(ti, XCOMS.CONFIG) + + logger.info(task_info_get(config.dbt_models_refresh_command)) logger.info(f"📖 Configuration:\n{config.model_dump_json(indent=2)}") if not config.dbt_models_refresh: diff --git a/dags/enrich/tasks/business_logic/enrich_dbt_model_read.py b/dags/enrich/tasks/business_logic/enrich_dbt_model_read.py index 45d9644ab..090e45625 100644 --- a/dags/enrich/tasks/business_logic/enrich_dbt_model_read.py +++ b/dags/enrich/tasks/business_logic/enrich_dbt_model_read.py @@ -1,4 +1,4 @@ -"""Read data from DB needed for RGPD anonymization""" +"""Read data from DBT models""" import logging diff --git a/dags/sources/config/shared_constants.py b/dags/sources/config/shared_constants.py index 90740f724..dcf228027 100755 --- a/dags/sources/config/shared_constants.py +++ b/dags/sources/config/shared_constants.py @@ -8,7 +8,6 @@ # SuggestionCohorte type_action SUGGESTION_CRAWL_URLS = "CRAWL_URLS" -SUGGESTION_RGPD_ANONYMIZE = "ENRICH_ACTEURS_RGPD" SUGGESTION_CLUSTERING = "CLUSTERING" SUGGESTION_SOURCE_AJOUT = "SOURCE_AJOUT" SUGGESTION_SOURCE_MODIFICATION = "SOURCE_MODIFICATION" diff --git a/data/models/changes/acteur_update_data.py b/data/models/changes/acteur_update_data.py index 785bfe4e3..d629e3603 100644 --- a/data/models/changes/acteur_update_data.py +++ b/data/models/changes/acteur_update_data.py @@ -1,6 +1,5 @@ """Generic change model to update an acteur's data. If your use-case -is very specific (e.g. RGPD), use a dedicated model fore more clarity/robustness, -else you can use this model.""" +is very specific (e.g. 
RGPD), create a dedicated model for more clarity/reliability.""" from data.models.changes.acteur_abstract import ChangeActeurAbstract from data.models.changes.utils import data_reconstruct diff --git a/data/models/suggestion.py b/data/models/suggestion.py index c27182147..854dd0001 100644 --- a/data/models/suggestion.py +++ b/data/models/suggestion.py @@ -197,7 +197,6 @@ def display_suggestion_details(self): elif self.suggestion_cohorte.type_action == SuggestionAction.CRAWL_URLS: template_name = "data/_partials/crawl_urls_suggestion_details.html" elif self.suggestion_cohorte.type_action in [ - SuggestionAction.ENRICH_ACTEURS_RGPD, SuggestionAction.ENRICH_ACTEURS_CLOSED, ]: template_name = "data/_partials/suggestion_details_changes.html" diff --git a/qfdmo/models/acteur.py b/qfdmo/models/acteur.py index 1595affcb..ded587031 100644 --- a/qfdmo/models/acteur.py +++ b/qfdmo/models/acteur.py @@ -905,6 +905,7 @@ def duplicate(self): "acteur_services", "proposition_services", "parent", + "parent_reason", ] for field in fields_to_reset: diff --git a/unit_tests/data/models/changes/test_acteur_create_as_child.py b/unit_tests/data/models/changes/test_acteur_create_as_child.py index 77799f901..325b5bcdf 100644 --- a/unit_tests/data/models/changes/test_acteur_create_as_child.py +++ b/unit_tests/data/models/changes/test_acteur_create_as_child.py @@ -1,8 +1,8 @@ import pytest from django.contrib.gis.geos import Point -from data.models import Acteur, RevisionActeur from data.models.changes.acteur_create_as_child import ChangeActeurCreateAsChild +from qfdmo.models import Acteur, RevisionActeur from unit_tests.qfdmo.acteur_factory import ( ActeurFactory, ActeurTypeFactory, diff --git a/unit_tests/qfdmo/test_acteur_methods.py b/unit_tests/qfdmo/test_acteur_methods.py index d04b43291..7e0e7e759 100644 --- a/unit_tests/qfdmo/test_acteur_methods.py +++ b/unit_tests/qfdmo/test_acteur_methods.py @@ -15,7 +15,7 @@ class TestActeurMethods: @pytest.mark.parametrize( "initial,expected", [ - (None, [{"message": "test"}]), + ("", [{"message": "test"}]), (" ", [{"message": "test"}]), ("foo", [{"message": "foo"}, {"message": "test"}]), ('[{"message": "bar"}]', [{"message": "bar"}, {"message": "test"}]), From ff9f58b2eb0f11a99522530da643f097306beb21 Mon Sep 17 00:00:00 2001 From: maxcorbeau Date: Wed, 16 Apr 2025 11:14:57 +0200 Subject: [PATCH 27/50] suppression migration RGPD --- data/models/suggestion.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/data/models/suggestion.py b/data/models/suggestion.py index 854dd0001..383a5eee0 100644 --- a/data/models/suggestion.py +++ b/data/models/suggestion.py @@ -54,7 +54,6 @@ class SuggestionCohorteStatut(models.TextChoices): class SuggestionAction(models.TextChoices): CRAWL_URLS = SUGGESTION_CRAWL_URLS, "🔗 URLs scannées" - ENRICH_ACTEURS_RGPD = "RGPD_ANONYMISATION", "🕵️ Anonymisation RGPD" ENRICH_ACTEURS_CLOSED = "ENRICH_ACTEURS_CLOSED", "🚪 Acteurs fermés" CLUSTERING = SUGGESTION_CLUSTERING, "regroupement/déduplication des acteurs" SOURCE_AJOUT = ( @@ -320,7 +319,6 @@ def apply(self): if self.suggestion_cohorte.type_action in [ SuggestionAction.CLUSTERING, SuggestionAction.CRAWL_URLS, - SuggestionAction.ENRICH_ACTEURS_RGPD, SuggestionAction.ENRICH_ACTEURS_CLOSED, ]: changes = self.suggestion["changes"] From 390b48041e0f36a9b68d9aadc2ed00da861ca7a2 Mon Sep 17 00:00:00 2001 From: maxcorbeau Date: Wed, 16 Apr 2025 11:17:09 +0200 Subject: [PATCH 28/50] suppression des prints --- data/models/changes/acteur_create_as_child.py | 3 --- 1 file changed, 3 deletions(-) diff --git 
a/data/models/changes/acteur_create_as_child.py b/data/models/changes/acteur_create_as_child.py index 550946a2a..74282eb89 100644 --- a/data/models/changes/acteur_create_as_child.py +++ b/data/models/changes/acteur_create_as_child.py @@ -1,5 +1,4 @@ from pydantic import BaseModel -from rich import print from data.models.changes.utils import data_reconstruct @@ -39,9 +38,7 @@ def apply(self): parent = RevisionActeur.objects.get(pk=self.data["parent"]) # Reconstruct data from RevisionActeur - print(f"data before reconstruct: {self.data}") data = data_reconstruct(RevisionActeur, self.data) - print(f"data after reconstruct: {data}") # Create child in Acteur to hold data data_base = data.copy() From fef7876939e1942dc18ff76bbc59bde06a5ca963 Mon Sep 17 00:00:00 2001 From: maxcorbeau Date: Wed, 16 Apr 2025 11:18:36 +0200 Subject: [PATCH 29/50] drop changes in restore script --- scripts/restore_prod_locally.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/restore_prod_locally.sh b/scripts/restore_prod_locally.sh index 7d8eb959e..27d76a71c 100755 --- a/scripts/restore_prod_locally.sh +++ b/scripts/restore_prod_locally.sh @@ -1,4 +1,3 @@ -DUMP_FILE=/home/me/Downloads/20250410002559_quefairedem_5084/20250410002559_quefairedem_5084.pgsql DATABASE_URL=postgres://qfdmo:qfdmo@localhost:6543/qfdmo # pragma: allowlist secret for table in $(psql "${DATABASE_URL}" -t -c "SELECT \"tablename\" FROM pg_tables WHERE schemaname='public'"); do From 8c0cbaf492e208087cbd18ee164d2831931c54dc Mon Sep 17 00:00:00 2001 From: maxcorbeau Date: Wed, 16 Apr 2025 11:24:35 +0200 Subject: [PATCH 30/50] cont. fix script import --- 1 | 0 scripts/restore_prod_locally.sh | 1 + 2 files changed, 1 insertion(+) create mode 100644 1 diff --git a/1 b/1 new file mode 100644 index 000000000..e69de29bb diff --git a/scripts/restore_prod_locally.sh b/scripts/restore_prod_locally.sh index 27d76a71c..1c9b36cc3 100755 --- a/scripts/restore_prod_locally.sh +++ b/scripts/restore_prod_locally.sh @@ -1,3 +1,4 @@ +DUMP_FILE=backup.pgsql DATABASE_URL=postgres://qfdmo:qfdmo@localhost:6543/qfdmo # pragma: allowlist secret for table in $(psql "${DATABASE_URL}" -t -c "SELECT \"tablename\" FROM pg_tables WHERE schemaname='public'"); do From 3139b7a501c958a090cfefcb50101b8de0edc36c Mon Sep 17 00:00:00 2001 From: maxcorbeau Date: Wed, 16 Apr 2025 14:59:52 +0200 Subject: [PATCH 31/50] fix typo sur dbt_models_refresh_command --- .../tasks/airflow_logic/enrich_dbt_models_refresh_task.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dags/enrich/tasks/airflow_logic/enrich_dbt_models_refresh_task.py b/dags/enrich/tasks/airflow_logic/enrich_dbt_models_refresh_task.py index ff1844695..baccb7d2e 100644 --- a/dags/enrich/tasks/airflow_logic/enrich_dbt_models_refresh_task.py +++ b/dags/enrich/tasks/airflow_logic/enrich_dbt_models_refresh_task.py @@ -7,6 +7,7 @@ from airflow.operators.bash import BashOperator from airflow.operators.python import PythonOperator from enrich.config import TASKS, XCOMS, xcom_pull +from enrich.config.models import EnrichBaseConfig logger = logging.getLogger(__name__) @@ -27,7 +28,7 @@ def task_info_get(dbt_model_refresh_command: str): def enrich_dbt_models_refresh_wrapper(ti) -> None: # Config - config = xcom_pull(ti, XCOMS.CONFIG) + config: EnrichBaseConfig = xcom_pull(ti, XCOMS.CONFIG) logger.info(task_info_get(config.dbt_models_refresh_command)) logger.info(f"📖 Configuration:\n{config.model_dump_json(indent=2)}") @@ -40,7 +41,7 @@ def enrich_dbt_models_refresh_wrapper(ti) -> None: ) bash = 
BashOperator( task_id=TASKS.ENRICH_DBT_MODELS_REFRESH + "_bash", - bash_command=config.dbt_build_command, + bash_command=config.dbt_models_refresh_command, ) bash.execute(context=ti.get_template_context()) From a5f3627a0e41ab164c9c8492228563e24d84794e Mon Sep 17 00:00:00 2001 From: maxcorbeau Date: Mon, 21 Apr 2025 09:41:54 +0200 Subject: [PATCH 32/50] renommage replace -> suggest --- dags/enrich/config/columns.py | 12 ++++---- .../enrich_dbt_model_to_suggestions.py | 20 ++++++------- .../test_enrich_acteurs_closed_suggestions.py | 12 ++++---- .../marts_enrich_acteurs_closed_replaced.sql | 16 +++++----- ...rs_closed_suggest_replaced_other_siren.sql | 2 +- ...urs_closed_suggest_replaced_same_siren.sql | 2 +- .../marts_enrich_acteurs_villes_suggest.sql | 29 +++++++++++++++++++ ...arts_enrich_acteurs_villes_suggest_new.sql | 15 ++++++++++ ...ts_enrich_acteurs_villes_suggest_other.sql | 15 ++++++++++ dbt/models/marts/enrich/schema.yml | 6 ++-- 10 files changed, 94 insertions(+), 35 deletions(-) create mode 100644 dbt/models/marts/enrich/marts_enrich_acteurs_villes_suggest.sql create mode 100644 dbt/models/marts/enrich/marts_enrich_acteurs_villes_suggest_new.sql create mode 100644 dbt/models/marts/enrich/marts_enrich_acteurs_villes_suggest_other.sql diff --git a/dags/enrich/config/columns.py b/dags/enrich/config/columns.py index 9af12db7c..618eb0c37 100644 --- a/dags/enrich/config/columns.py +++ b/dags/enrich/config/columns.py @@ -32,12 +32,12 @@ class COLS: SUGGEST_COHORT_LABEL: str = "suggestion_cohorte_label" # Replacements - REMPLACER_SIRET: str = "remplacer_siret" - REMPLACER_NOM: str = "remplacer_nom" - REMPLACER_ADRESSE: str = "remplacer_adresse" - REMPLACER_CODE_POSTAL: str = "remplacer_code_postal" - REMPLACER_VILLE: str = "remplacer_ville" - REMPLACER_NAF: str = "remplacer_naf" + SUGGEST_SIRET: str = "suggest_siret" + SUGGEST_NOM: str = "suggest_nom" + SUGGEST_ADRESSE: str = "suggest_adresse" + SUGGEST_CODE_POSTAL: str = "suggest_code_postal" + SUGGEST_VILLE: str = "suggest_ville" + SUGGEST_NAF: str = "suggest_naf" # Matching MATCH_WORDS: str = "match_words" diff --git a/dags/enrich/tasks/business_logic/enrich_dbt_model_to_suggestions.py b/dags/enrich/tasks/business_logic/enrich_dbt_model_to_suggestions.py index 276225699..60e24b7ee 100644 --- a/dags/enrich/tasks/business_logic/enrich_dbt_model_to_suggestions.py +++ b/dags/enrich/tasks/business_logic/enrich_dbt_model_to_suggestions.py @@ -79,18 +79,18 @@ def suggestion_change_prepare_closed_replaced( changes = [] today = datetime.now(timezone.utc).strftime("%Y-%m-%d") # Parent - parent_id = parent_id_generate([str(row[COLS.REMPLACER_SIRET])]) + parent_id = parent_id_generate([str(row[COLS.SUGGEST_SIRET])]) params_parent = { "id": parent_id, "data": { "identifiant_unique": parent_id, - "nom": row[COLS.REMPLACER_NOM], - "adresse": row[COLS.REMPLACER_ADRESSE], - "code_postal": row[COLS.REMPLACER_CODE_POSTAL], - "ville": row[COLS.REMPLACER_VILLE], - "siren": row[COLS.REMPLACER_SIRET][:9], - "siret": row[COLS.REMPLACER_SIRET], - "naf_principal": row[COLS.REMPLACER_NAF], + "nom": row[COLS.SUGGEST_NOM], + "adresse": row[COLS.SUGGEST_ADRESSE], + "code_postal": row[COLS.SUGGEST_CODE_POSTAL], + "ville": row[COLS.SUGGEST_VILLE], + "siren": row[COLS.SUGGEST_SIRET][:9], + "siret": row[COLS.SUGGEST_SIRET], + "naf_principal": row[COLS.SUGGEST_NAF], "acteur_type": row[COLS.ACTEUR_TYPE_ID], "source": None, "statut": ActeurStatus.ACTIF, @@ -120,7 +120,7 @@ def suggestion_change_prepare_closed_replaced( f"Nouvel enfant pour conserver les données suite 
à: " f"SIRET {row[COLS.ACTEUR_SIRET]} " f"détecté le {today} comme fermé dans AE, " - f"remplacé par SIRET {row[COLS.REMPLACER_SIRET]}" + f"remplacé par SIRET {row[COLS.SUGGEST_SIRET]}" ) if row[COLS.ACTEUR_LONGITUDE] is not None and row[COLS.ACTEUR_LATITUDE] is not None: params_child_new["data"]["longitude"] = row[COLS.ACTEUR_LONGITUDE] @@ -143,7 +143,7 @@ def suggestion_change_prepare_closed_replaced( params_child_old["data"]["parent_reason"] = ( f"SIRET {row[COLS.ACTEUR_SIRET]} " f"détecté le {today} comme fermé dans AE, " - f"remplacé par SIRET {row[COLS.REMPLACER_SIRET]}" + f"remplacé par SIRET {row[COLS.SUGGEST_SIRET]}" ) params_child_old["data"]["siret_is_closed"] = True params_child_old["data"]["statut"] = ActeurStatus.INACTIF diff --git a/dags_unit_tests/enrich/tasks/test_enrich_acteurs_closed_suggestions.py b/dags_unit_tests/enrich/tasks/test_enrich_acteurs_closed_suggestions.py index 645b1eefd..d7d35cb5f 100644 --- a/dags_unit_tests/enrich/tasks/test_enrich_acteurs_closed_suggestions.py +++ b/dags_unit_tests/enrich/tasks/test_enrich_acteurs_closed_suggestions.py @@ -60,12 +60,12 @@ def df_replaced(self, atype, source): COLS.ACTEUR_LONGITUDE: [1, 1, 1], COLS.ACTEUR_LATITUDE: [2, 2, 2], # Replacement data - COLS.REMPLACER_SIRET: [ + COLS.SUGGEST_SIRET: [ "11111111100002", "33333333300001", "55555555500001", ], - COLS.REMPLACER_NOM: ["APRES a1", "APRES a2", "APRES a3"], + COLS.SUGGEST_NOM: ["APRES a1", "APRES a2", "APRES a3"], COLS.SUGGEST_COHORT_CODE: [ COHORTS.CLOSED_REP_SAME_SIREN.code, COHORTS.CLOSED_REP_OTHER_SIREN.code, @@ -76,10 +76,10 @@ def df_replaced(self, atype, source): COHORTS.CLOSED_REP_OTHER_SIREN.label, COHORTS.CLOSED_REP_OTHER_SIREN.label, ], - COLS.REMPLACER_ADRESSE: ["Adresse1", "Adresse2", "Adresse3"], - COLS.REMPLACER_CODE_POSTAL: ["12345", "67890", "12345"], - COLS.REMPLACER_VILLE: ["Ville1", "Ville2", "Ville3"], - COLS.REMPLACER_NAF: ["naf1", "naf2", "naf3"], + COLS.SUGGEST_ADRESSE: ["Adresse1", "Adresse2", "Adresse3"], + COLS.SUGGEST_CODE_POSTAL: ["12345", "67890", "12345"], + COLS.SUGGEST_VILLE: ["Ville1", "Ville2", "Ville3"], + COLS.SUGGEST_NAF: ["naf1", "naf2", "naf3"], } ) diff --git a/dbt/models/marts/enrich/marts_enrich_acteurs_closed_replaced.sql b/dbt/models/marts/enrich/marts_enrich_acteurs_closed_replaced.sql index 050842253..53fbf95b5 100644 --- a/dbt/models/marts/enrich/marts_enrich_acteurs_closed_replaced.sql +++ b/dbt/models/marts/enrich/marts_enrich_acteurs_closed_replaced.sql @@ -12,13 +12,13 @@ WITH potential_replacements AS ( candidates.*, -- Replacements - replacements.siret AS remplacer_siret, - LEFT(candidates.acteur_siret,9) = LEFT(replacements.siret,9) AS remplacer_siret_is_from_same_siren, - replacements.nom AS remplacer_nom, - replacements.naf AS remplacer_naf, - replacements.ville AS remplacer_ville, - replacements.code_postal AS remplacer_code_postal, - replacements.adresse AS remplacer_adresse, + replacements.siret AS suggest_siret, + LEFT(candidates.acteur_siret,9) = LEFT(replacements.siret,9) AS suggest_siret_is_from_same_siren, + replacements.nom AS suggest_nom, + replacements.naf AS suggest_naf, + replacements.ville AS suggest_ville, + replacements.code_postal AS suggest_code_postal, + replacements.adresse AS suggest_adresse, -- Matching udf_columns_words_in_common_count( @@ -63,4 +63,4 @@ WITH potential_replacements AS ( SELECT * FROM potential_replacements WHERE replacement_priority=1 /* We don't want to propose replacements with unavailable names */ -AND remplacer_nom != {{ value_unavailable() }} \ No newline at end of 
file +AND suggest_nom != {{ value_unavailable() }} \ No newline at end of file diff --git a/dbt/models/marts/enrich/marts_enrich_acteurs_closed_suggest_replaced_other_siren.sql b/dbt/models/marts/enrich/marts_enrich_acteurs_closed_suggest_replaced_other_siren.sql index d973f2ca5..e3827f59d 100644 --- a/dbt/models/marts/enrich/marts_enrich_acteurs_closed_suggest_replaced_other_siren.sql +++ b/dbt/models/marts/enrich/marts_enrich_acteurs_closed_suggest_replaced_other_siren.sql @@ -10,4 +10,4 @@ SELECT '🚪 Acteurs Fermés: 🟡 remplacés par SIRET d''un autre SIREN' AS suggestion_cohorte_label, * FROM {{ ref('marts_enrich_acteurs_closed_replaced') }} -WHERE remplacer_siret_is_from_same_siren IS FALSE +WHERE suggest_siret_is_from_same_siren IS FALSE diff --git a/dbt/models/marts/enrich/marts_enrich_acteurs_closed_suggest_replaced_same_siren.sql b/dbt/models/marts/enrich/marts_enrich_acteurs_closed_suggest_replaced_same_siren.sql index c098377c1..4a95f9881 100644 --- a/dbt/models/marts/enrich/marts_enrich_acteurs_closed_suggest_replaced_same_siren.sql +++ b/dbt/models/marts/enrich/marts_enrich_acteurs_closed_suggest_replaced_same_siren.sql @@ -10,4 +10,4 @@ SELECT '🚪 Acteurs Fermés: 🟢 remplacés par SIRET du même SIREN' AS suggestion_cohorte_label, * FROM {{ ref('marts_enrich_acteurs_closed_replaced') }} -WHERE remplacer_siret_is_from_same_siren IS TRUE +WHERE suggest_siret_is_from_same_siren IS TRUE diff --git a/dbt/models/marts/enrich/marts_enrich_acteurs_villes_suggest.sql b/dbt/models/marts/enrich/marts_enrich_acteurs_villes_suggest.sql new file mode 100644 index 000000000..67007050f --- /dev/null +++ b/dbt/models/marts/enrich/marts_enrich_acteurs_villes_suggest.sql @@ -0,0 +1,29 @@ +{{ + config( + materialized = 'table', + tags=['marts', 'enrich', 'ville','cities', 'ban'], + ) +}} + +SELECT + acteurs.identifiant_unique AS acteur_id, + acteurs.ville AS acteur_ville, + acteurs.code_postal AS acteur_code_postal, + ban.ville_ancienne AS ban_ville_ancienne, + ban.ville AS ban_ville, + ban.code_postal AS ban_code_postal, + ban.ville AS suggest_ville +FROM {{ ref('marts_carte_acteur') }} AS acteurs +JOIN {{ ref('int_ban_villes') }} AS ban ON ban.code_postal = acteurs.code_postal +WHERE acteurs.statut = 'ACTIF' +AND acteurs.code_postal IS NOT NULL and acteurs.code_postal != '' and LENGTH(acteurs.code_postal) = 5 +/* Only suggest if 1 difference */ +AND ( + acteurs.ville != ban.ville_ancienne + OR acteurs.ville != ban.ville +) +/* BUT also a match somewhere */ +AND ( + udf_normalize_string_for_match(acteurs.ville,3) = udf_normalize_string_for_match(ban.ville_ancienne,3) + OR udf_normalize_string_for_match(acteurs.ville,3) = udf_normalize_string_for_match(ban.ville,3) +) diff --git a/dbt/models/marts/enrich/marts_enrich_acteurs_villes_suggest_new.sql b/dbt/models/marts/enrich/marts_enrich_acteurs_villes_suggest_new.sql new file mode 100644 index 000000000..a2a3fcf10 --- /dev/null +++ b/dbt/models/marts/enrich/marts_enrich_acteurs_villes_suggest_new.sql @@ -0,0 +1,15 @@ +{{ + config( + materialized = 'view', + alias = 'marts_enrich_acteurs_villes_suggest_new', + tags=['marts', 'enrich', 'ville', 'ban','acteurs','nouvelle','new'], + ) +}} + +SELECT + 'acteurs_villes_anciennes_nouvelles' AS suggestion_cohorte_code, + '🌆 Changement de ville: 🟡 ancienne -> nouvelle' AS suggestion_cohorte_label, + * +FROM {{ ref('marts_enrich_acteurs_villes_suggest') }} +WHERE udf_normalize_string_for_match(acteur_ville,3) != udf_normalize_string_for_match(suggest_ville,3) +AND ban_ville_ancienne IS NOT NULL \ No newline at end 
of file diff --git a/dbt/models/marts/enrich/marts_enrich_acteurs_villes_suggest_other.sql b/dbt/models/marts/enrich/marts_enrich_acteurs_villes_suggest_other.sql new file mode 100644 index 000000000..db3d68fa7 --- /dev/null +++ b/dbt/models/marts/enrich/marts_enrich_acteurs_villes_suggest_other.sql @@ -0,0 +1,15 @@ +{{ + config( + materialized = 'view', + alias = 'marts_enrich_acteurs_villes_suggest_other', + tags=['marts', 'enrich', 'ville', 'ban','acteurs','nouvelle','other'], + ) +}} + +SELECT + 'acteurs_villes_other' AS suggestion_cohorte_code, + '🌆 Changement de ville: 🔴 autre' AS suggestion_cohorte_label, + * +FROM {{ ref('marts_enrich_acteurs_villes_suggest') }} +WHERE udf_normalize_string_for_match(acteur_ville,3) != udf_normalize_string_for_match(suggest_ville,3) +AND ban_ville_ancienne IS NULL \ No newline at end of file diff --git a/dbt/models/marts/enrich/schema.yml b/dbt/models/marts/enrich/schema.yml index f076f21ea..2b25e895f 100644 --- a/dbt/models/marts/enrich/schema.yml +++ b/dbt/models/marts/enrich/schema.yml @@ -15,11 +15,11 @@ models: description: SIRET de l'acteur fermé data_tests: - not_null - - name: remplacer_siret + - name: suggest_siret description: SIRET de l'établissement qui remplace l'acteur fermé data_tests: - not_null - - name: remplacer_cohorte + - name: suggest_cohorte description: "Si le SIRET de remplacement appartient à la même entreprise (meme_siret) ou non (autre_siret)" data_tests: - not_null @@ -27,7 +27,7 @@ models: # values: ['siret_du_meme_siren', 'siret_dun_autre_siren'] - name: acteur_nom description: Nom de l'acteur fermé - - name: remplacer_nom + - name: suggest_nom description: Nom de l'établissement qui remplace l'acteur fermé - name: noms_nombre_mots_commun description: Nombre de mots en commun entre le nom de l'acteur et celui du remplaçant From 6e8b4aa7a0e7d0bab550fd9dabfac13211b33a37 Mon Sep 17 00:00:00 2001 From: maxcorbeau Date: Mon, 21 Apr 2025 10:10:54 +0200 Subject: [PATCH 33/50] cohorte: simplification, label uniquement --- dags/enrich/config/__init__.py | 2 +- dags/enrich/config/cohorts.py | 23 ++------ dags/enrich/config/columns.py | 2 +- .../enrich_dbt_model_suggest_task.py | 8 +-- .../enrich_dbt_model_suggest.py | 3 +- .../enrich_dbt_model_to_suggestions.py | 59 +++++++++---------- .../test_enrich_acteurs_closed_suggestions.py | 28 ++++----- ...ch_acteurs_closed_suggest_not_replaced.sql | 3 +- ...rs_closed_suggest_replaced_other_siren.sql | 3 +- ...urs_closed_suggest_replaced_same_siren.sql | 3 +- ...arts_enrich_acteurs_villes_suggest_new.sql | 3 +- ...ts_enrich_acteurs_villes_suggest_other.sql | 3 +- 12 files changed, 54 insertions(+), 86 deletions(-) diff --git a/dags/enrich/config/__init__.py b/dags/enrich/config/__init__.py index 26fceffac..b6c1b63e6 100644 --- a/dags/enrich/config/__init__.py +++ b/dags/enrich/config/__init__.py @@ -1,4 +1,4 @@ -from .cohorts import COHORTS, Cohort # noqa: F401 +from .cohorts import COHORTS # noqa: F401 from .columns import COLS # noqa: F401 from .dbt import DBT # noqa: F401 from .models import DAG_ID_TO_CONFIG_MODEL, EnrichActeursClosedConfig # noqa: F401 diff --git a/dags/enrich/config/cohorts.py b/dags/enrich/config/cohorts.py index c601baaca..1db756e8e 100644 --- a/dags/enrich/config/cohorts.py +++ b/dags/enrich/config/cohorts.py @@ -2,26 +2,11 @@ from dataclasses import dataclass -INTRO = "🚪 Acteurs Fermés:" - - -@dataclass(frozen=True) -class Cohort: - code: str - label: str +CLOSED = "🚪 Acteurs Fermés:" @dataclass(frozen=True) class COHORTS: - CLOSED_NOT_REPLACED: Cohort = 
Cohort( - code="acteurs_closed_not_replaced", - label=f"{INTRO} 🔴 non remplacés", - ) - CLOSED_REP_OTHER_SIREN: Cohort = Cohort( - code="acteurs_closed_replaced_other_siren", - label=f"{INTRO} 🟡 remplacés par SIRET d'un autre SIREN", - ) - CLOSED_REP_SAME_SIREN: Cohort = Cohort( - code="acteurs_closed_replaced_same_siren", - label=f"{INTRO} 🟢 remplacés par SIRET du même SIREN", - ) + CLOSED_NOT_REPLACED = f"{CLOSED} 🔴 non remplacés" + CLOSED_REP_OTHER_SIREN = f"{CLOSED} 🟡 remplacés par SIRET d'un autre SIREN" + CLOSED_REP_SAME_SIREN = f"{CLOSED} 🟢 remplacés par SIRET du même SIREN" diff --git a/dags/enrich/config/columns.py b/dags/enrich/config/columns.py index 618eb0c37..e8dedff44 100644 --- a/dags/enrich/config/columns.py +++ b/dags/enrich/config/columns.py @@ -29,7 +29,7 @@ class COLS: # Suggestions SUGGEST_COHORT_CODE: str = "suggestion_cohorte_code" - SUGGEST_COHORT_LABEL: str = "suggestion_cohorte_label" + SUGGEST_COHORT: str = "suggest_cohort" # Replacements SUGGEST_SIRET: str = "suggest_siret" diff --git a/dags/enrich/tasks/airflow_logic/enrich_dbt_model_suggest_task.py b/dags/enrich/tasks/airflow_logic/enrich_dbt_model_suggest_task.py index 04aa2f81f..dfd03366a 100644 --- a/dags/enrich/tasks/airflow_logic/enrich_dbt_model_suggest_task.py +++ b/dags/enrich/tasks/airflow_logic/enrich_dbt_model_suggest_task.py @@ -7,7 +7,7 @@ from airflow.models.taskinstance import TaskInstance from airflow.operators.python import PythonOperator from airflow.utils.trigger_rule import TriggerRule -from enrich.config import XCOMS, Cohort, xcom_pull +from enrich.config import XCOMS, xcom_pull from enrich.tasks.business_logic.enrich_dbt_model_suggest import ( enrich_dbt_model_suggest, ) @@ -31,7 +31,7 @@ def task_info_get(task_id, df_xcom_key): def enrich_dbt_model_suggest_wrapper( task_id: str, - cohort: Cohort, + cohort: str, dbt_model_name: str, ti: TaskInstance, dag: DAG, @@ -55,13 +55,13 @@ def enrich_dbt_model_suggest_wrapper( def enrich_dbt_model_suggest_task( - dag: DAG, task_id: str, cohort: Cohort, dbt_model_name: str + dag: DAG, task_id: str, cohort: str, dbt_model_name: str ) -> PythonOperator: return PythonOperator( task_id=task_id, python_callable=enrich_dbt_model_suggest_wrapper, op_args=[task_id, cohort, dbt_model_name], dag=dag, - doc_md=f"**Suggestions** pour la cohorte: **{cohort.label}**", + doc_md=f"**Suggestions** pour la cohorte: **{cohort}**", trigger_rule=TriggerRule.ALL_DONE, ) diff --git a/dags/enrich/tasks/business_logic/enrich_dbt_model_suggest.py b/dags/enrich/tasks/business_logic/enrich_dbt_model_suggest.py index e748fd4c6..d17ece76d 100644 --- a/dags/enrich/tasks/business_logic/enrich_dbt_model_suggest.py +++ b/dags/enrich/tasks/business_logic/enrich_dbt_model_suggest.py @@ -1,6 +1,5 @@ import logging -from enrich.config import Cohort from enrich.tasks.business_logic.enrich_dbt_model_read import enrich_dbt_model_read from enrich.tasks.business_logic.enrich_dbt_model_to_suggestions import ( enrich_dbt_model_to_suggestions, @@ -12,7 +11,7 @@ def enrich_dbt_model_suggest( dbt_model_name: str, filters: list[dict], - cohort: Cohort, + cohort: str, identifiant_action: str, dry_run: bool = True, ) -> bool: diff --git a/dags/enrich/tasks/business_logic/enrich_dbt_model_to_suggestions.py b/dags/enrich/tasks/business_logic/enrich_dbt_model_to_suggestions.py index 60e24b7ee..bdd89f979 100644 --- a/dags/enrich/tasks/business_logic/enrich_dbt_model_to_suggestions.py +++ b/dags/enrich/tasks/business_logic/enrich_dbt_model_to_suggestions.py @@ -5,13 +5,13 @@ from 
cluster.tasks.business_logic.cluster_acteurs_parents_choose_new import ( parent_id_generate, ) -from enrich.config import COHORTS, COLS, Cohort +from enrich.config import COHORTS, COLS from utils import logging_utils as log logger = logging.getLogger(__name__) -def suggestion_change_prepare( +def changes_prepare( model, model_params: dict, order: int, @@ -19,7 +19,7 @@ def suggestion_change_prepare( entity_type: str, ) -> dict: """Generic utility to prepare, validate and - serialize 1 suggestion change for all suggestion types""" + serialize 1 suggestion change for ANY suggestion types""" from data.models.change import SuggestionChange model(**model_params).validate() @@ -32,10 +32,10 @@ def suggestion_change_prepare( ).model_dump() -def suggestion_change_prepare_closed_not_replaced( +def changes_prepare_closed_not_replaced( row: dict, ) -> list[dict]: - """Prepare suggestions for closed not replaced cohorts""" + """Prepare suggestion changes for closed not replaced cohorts""" from data.models.changes import ChangeActeurUpdateData from qfdmo.models import ActeurStatus @@ -54,7 +54,7 @@ def suggestion_change_prepare_closed_not_replaced( }, } changes.append( - suggestion_change_prepare( + changes_prepare( model=ChangeActeurUpdateData, model_params=model_params, order=1, @@ -65,10 +65,10 @@ def suggestion_change_prepare_closed_not_replaced( return changes -def suggestion_change_prepare_closed_replaced( +def changes_prepare_closed_replaced( row: dict, ) -> list[dict]: - """Prepare suggestions for closed replaced cohorts""" + """Prepare suggestion changes for closed replaced cohorts""" from data.models.changes import ( ChangeActeurCreateAsChild, ChangeActeurCreateAsParent, @@ -97,7 +97,7 @@ def suggestion_change_prepare_closed_replaced( }, } changes.append( - suggestion_change_prepare( + changes_prepare( model=ChangeActeurCreateAsParent, model_params=params_parent, order=1, @@ -126,7 +126,7 @@ def suggestion_change_prepare_closed_replaced( params_child_new["data"]["longitude"] = row[COLS.ACTEUR_LONGITUDE] params_child_new["data"]["latitude"] = row[COLS.ACTEUR_LATITUDE] changes.append( - suggestion_change_prepare( + changes_prepare( model=ChangeActeurCreateAsChild, model_params=params_child_new, order=2, @@ -148,7 +148,7 @@ def suggestion_change_prepare_closed_replaced( params_child_old["data"]["siret_is_closed"] = True params_child_old["data"]["statut"] = ActeurStatus.INACTIF changes.append( - suggestion_change_prepare( + changes_prepare( model=ChangeActeurUpdateData, model_params=params_child_old, order=3, @@ -159,9 +159,17 @@ def suggestion_change_prepare_closed_replaced( return changes +# Mapping cohorts with their respective changes preparation function +COHORTS_TO_PREPARE_CHANGES = { + COHORTS.CLOSED_NOT_REPLACED: changes_prepare_closed_not_replaced, + COHORTS.CLOSED_REP_OTHER_SIREN: changes_prepare_closed_replaced, + COHORTS.CLOSED_REP_SAME_SIREN: changes_prepare_closed_replaced, +} + + def enrich_dbt_model_to_suggestions( df: pd.DataFrame, - cohort: Cohort, + cohort: str, identifiant_action: str, dry_run: bool = True, ) -> bool: @@ -176,9 +184,9 @@ def enrich_dbt_model_to_suggestions( if df is None or df.empty: raise ValueError("df vide: on devrait pas être ici") - cohort_codes = list(df[COLS.SUGGEST_COHORT_CODE].unique()) - if len(cohort_codes) != 1 or cohort_codes[0] != cohort.code: - msg = f"Problème cohorte: obtenu {cohort_codes=} vs. 
attendu {cohort.code=}" + cohorts = list(df[COLS.SUGGEST_COHORT].unique()) + if len(cohorts) != 1 or cohorts[0] != cohort: + msg = f"Problème cohorte: obtenu {cohorts=} vs. attendu {cohort=}" raise ValueError(msg) # Suggestions @@ -187,21 +195,9 @@ def enrich_dbt_model_to_suggestions( row = dict(row) try: - # ----------------------------------------- - # NOT REPLACED - # ----------------------------------------- - if cohort == COHORTS.CLOSED_NOT_REPLACED: - changes = suggestion_change_prepare_closed_not_replaced(row) - - # ----------------------------------------- - # REPLACED - # ----------------------------------------- - elif cohort in [ - COHORTS.CLOSED_REP_OTHER_SIREN, - COHORTS.CLOSED_REP_SAME_SIREN, - ]: - changes = suggestion_change_prepare_closed_replaced(row) + changes = COHORTS_TO_PREPARE_CHANGES[cohort](row) + # We tolerate some errors except Exception as e: log.preview("🔴 Suggestion problématique", row) logger.error(f"Erreur de préparation des changements: {e}") @@ -212,13 +208,14 @@ def enrich_dbt_model_to_suggestions( { "contexte": {}, "suggestion": { - "title": cohort.label, + "title": cohort, "summary": [], "changes": changes, }, } ) + # we need some working suggestions, can't have it all fail if not suggestions: raise ValueError("Aucune suggestion à écrire, pas normal") @@ -235,7 +232,7 @@ def enrich_dbt_model_to_suggestions( # ----------------------------------------- db_cohort = SuggestionCohorte( identifiant_action=identifiant_action, - identifiant_execution=f"{cohort.label}", + identifiant_execution=f"{cohort}", statut=SuggestionStatut.AVALIDER, type_action=SuggestionAction.ENRICH_ACTEURS_CLOSED, metadata={"🔢 Nombre de suggestions": len(suggestions)}, diff --git a/dags_unit_tests/enrich/tasks/test_enrich_acteurs_closed_suggestions.py b/dags_unit_tests/enrich/tasks/test_enrich_acteurs_closed_suggestions.py index d7d35cb5f..b10577897 100644 --- a/dags_unit_tests/enrich/tasks/test_enrich_acteurs_closed_suggestions.py +++ b/dags_unit_tests/enrich/tasks/test_enrich_acteurs_closed_suggestions.py @@ -39,8 +39,7 @@ def df_not_replaced(self, atype, source): COLS.ACTEUR_NOM: ["AVANT a01", "AVANT a02"], COLS.ACTEUR_TYPE_ID: [atype.pk, atype.pk], COLS.ACTEUR_SOURCE_ID: [source.pk, source.pk], - COLS.SUGGEST_COHORT_CODE: [COHORTS.CLOSED_NOT_REPLACED.code] * 2, - COLS.SUGGEST_COHORT_LABEL: [COHORTS.CLOSED_NOT_REPLACED.label] * 2, + COLS.SUGGEST_COHORT: [COHORTS.CLOSED_NOT_REPLACED] * 2, } ) @@ -66,15 +65,10 @@ def df_replaced(self, atype, source): "55555555500001", ], COLS.SUGGEST_NOM: ["APRES a1", "APRES a2", "APRES a3"], - COLS.SUGGEST_COHORT_CODE: [ - COHORTS.CLOSED_REP_SAME_SIREN.code, - COHORTS.CLOSED_REP_OTHER_SIREN.code, - COHORTS.CLOSED_REP_OTHER_SIREN.code, - ], - COLS.SUGGEST_COHORT_LABEL: [ - COHORTS.CLOSED_REP_SAME_SIREN.label, - COHORTS.CLOSED_REP_OTHER_SIREN.label, - COHORTS.CLOSED_REP_OTHER_SIREN.label, + COLS.SUGGEST_COHORT: [ + COHORTS.CLOSED_REP_SAME_SIREN, + COHORTS.CLOSED_REP_OTHER_SIREN, + COHORTS.CLOSED_REP_OTHER_SIREN, ], COLS.SUGGEST_ADRESSE: ["Adresse1", "Adresse2", "Adresse3"], COLS.SUGGEST_CODE_POSTAL: ["12345", "67890", "12345"], @@ -84,25 +78,23 @@ def df_replaced(self, atype, source): ) def test_df_replaced(self, df_replaced): - assert sorted(df_replaced[COLS.SUGGEST_COHORT_LABEL].unique()) == sorted( + assert sorted(df_replaced[COLS.SUGGEST_COHORT].unique()) == sorted( [ - COHORTS.CLOSED_REP_SAME_SIREN.label, - COHORTS.CLOSED_REP_OTHER_SIREN.label, + COHORTS.CLOSED_REP_SAME_SIREN, + COHORTS.CLOSED_REP_OTHER_SIREN, ] ) @pytest.fixture def 
df_replaced_meme_siret(self, df_replaced): return df_replaced[ - df_replaced[COLS.SUGGEST_COHORT_LABEL] - == COHORTS.CLOSED_REP_SAME_SIREN.label + df_replaced[COLS.SUGGEST_COHORT] == COHORTS.CLOSED_REP_SAME_SIREN ] @pytest.fixture def df_replaced_autre_siret(self, df_replaced): return df_replaced[ - df_replaced[COLS.SUGGEST_COHORT_LABEL] - == COHORTS.CLOSED_REP_OTHER_SIREN.label + df_replaced[COLS.SUGGEST_COHORT] == COHORTS.CLOSED_REP_OTHER_SIREN ] @pytest.fixture diff --git a/dbt/models/marts/enrich/marts_enrich_acteurs_closed_suggest_not_replaced.sql b/dbt/models/marts/enrich/marts_enrich_acteurs_closed_suggest_not_replaced.sql index 49cffb4ea..5b49c3c15 100644 --- a/dbt/models/marts/enrich/marts_enrich_acteurs_closed_suggest_not_replaced.sql +++ b/dbt/models/marts/enrich/marts_enrich_acteurs_closed_suggest_not_replaced.sql @@ -10,8 +10,7 @@ AND for which we couldn't find replacements }} SELECT - 'acteurs_closed_not_replaced' AS suggestion_cohorte_code, - '🚪 Acteurs Fermés: 🔴 non remplacés' AS suggestion_cohorte_label, + '🚪 Acteurs Fermés: 🔴 non remplacés' AS suggest_cohort, * FROM {{ ref('marts_enrich_acteurs_closed_candidates') }} WHERE diff --git a/dbt/models/marts/enrich/marts_enrich_acteurs_closed_suggest_replaced_other_siren.sql b/dbt/models/marts/enrich/marts_enrich_acteurs_closed_suggest_replaced_other_siren.sql index e3827f59d..1cb1d63cb 100644 --- a/dbt/models/marts/enrich/marts_enrich_acteurs_closed_suggest_replaced_other_siren.sql +++ b/dbt/models/marts/enrich/marts_enrich_acteurs_closed_suggest_replaced_other_siren.sql @@ -6,8 +6,7 @@ }} SELECT - 'acteurs_closed_replaced_other_siren' AS suggestion_cohorte_code, - '🚪 Acteurs Fermés: 🟡 remplacés par SIRET d''un autre SIREN' AS suggestion_cohorte_label, + '🚪 Acteurs Fermés: 🟡 remplacés par SIRET d''un autre SIREN' AS suggest_cohort, * FROM {{ ref('marts_enrich_acteurs_closed_replaced') }} WHERE suggest_siret_is_from_same_siren IS FALSE diff --git a/dbt/models/marts/enrich/marts_enrich_acteurs_closed_suggest_replaced_same_siren.sql b/dbt/models/marts/enrich/marts_enrich_acteurs_closed_suggest_replaced_same_siren.sql index 4a95f9881..30f6e24cf 100644 --- a/dbt/models/marts/enrich/marts_enrich_acteurs_closed_suggest_replaced_same_siren.sql +++ b/dbt/models/marts/enrich/marts_enrich_acteurs_closed_suggest_replaced_same_siren.sql @@ -6,8 +6,7 @@ }} SELECT - 'acteurs_closed_replaced_same_siren' AS suggestion_cohorte_code, - '🚪 Acteurs Fermés: 🟢 remplacés par SIRET du même SIREN' AS suggestion_cohorte_label, + '🚪 Acteurs Fermés: 🟢 remplacés par SIRET du même SIREN' AS suggest_cohort, * FROM {{ ref('marts_enrich_acteurs_closed_replaced') }} WHERE suggest_siret_is_from_same_siren IS TRUE diff --git a/dbt/models/marts/enrich/marts_enrich_acteurs_villes_suggest_new.sql b/dbt/models/marts/enrich/marts_enrich_acteurs_villes_suggest_new.sql index a2a3fcf10..449e6b3d7 100644 --- a/dbt/models/marts/enrich/marts_enrich_acteurs_villes_suggest_new.sql +++ b/dbt/models/marts/enrich/marts_enrich_acteurs_villes_suggest_new.sql @@ -7,8 +7,7 @@ }} SELECT - 'acteurs_villes_anciennes_nouvelles' AS suggestion_cohorte_code, - '🌆 Changement de ville: 🟡 ancienne -> nouvelle' AS suggestion_cohorte_label, + '🌆 Changement de ville: 🟡 ancienne -> nouvelle' AS suggest_cohort, * FROM {{ ref('marts_enrich_acteurs_villes_suggest') }} WHERE udf_normalize_string_for_match(acteur_ville,3) != udf_normalize_string_for_match(suggest_ville,3) diff --git a/dbt/models/marts/enrich/marts_enrich_acteurs_villes_suggest_other.sql 
b/dbt/models/marts/enrich/marts_enrich_acteurs_villes_suggest_other.sql index db3d68fa7..c3752fca0 100644 --- a/dbt/models/marts/enrich/marts_enrich_acteurs_villes_suggest_other.sql +++ b/dbt/models/marts/enrich/marts_enrich_acteurs_villes_suggest_other.sql @@ -7,8 +7,7 @@ }} SELECT - 'acteurs_villes_other' AS suggestion_cohorte_code, - '🌆 Changement de ville: 🔴 autre' AS suggestion_cohorte_label, + '🌆 Changement de ville: 🔴 autre' AS suggest_cohort, * FROM {{ ref('marts_enrich_acteurs_villes_suggest') }} WHERE udf_normalize_string_for_match(acteur_ville,3) != udf_normalize_string_for_match(suggest_ville,3) From a47623396aeb3d9e30a406ec04ad5f9d17d9d2a8 Mon Sep 17 00:00:00 2001 From: maxcorbeau Date: Mon, 21 Apr 2025 10:11:59 +0200 Subject: [PATCH 34/50] fix migration after rebase --- ...osed_and_more.py => 0148_acteur_siret_is_closed_and_more.py} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename qfdmo/migrations/{0141_acteur_siret_is_closed_and_more.py => 0148_acteur_siret_is_closed_and_more.py} (96%) diff --git a/qfdmo/migrations/0141_acteur_siret_is_closed_and_more.py b/qfdmo/migrations/0148_acteur_siret_is_closed_and_more.py similarity index 96% rename from qfdmo/migrations/0141_acteur_siret_is_closed_and_more.py rename to qfdmo/migrations/0148_acteur_siret_is_closed_and_more.py index 0f6bba9bf..0cb2c97ee 100644 --- a/qfdmo/migrations/0141_acteur_siret_is_closed_and_more.py +++ b/qfdmo/migrations/0148_acteur_siret_is_closed_and_more.py @@ -6,7 +6,7 @@ class Migration(migrations.Migration): dependencies = [ - ("qfdmo", "0140_drop_views"), + ("qfdmo", "0147_groupeaction_fill_alter_action_couleur_and_more"), ] operations = [ From 62513337a743788191c71f3ed11e3b3f26d25d07 Mon Sep 17 00:00:00 2001 From: maxcorbeau Date: Mon, 21 Apr 2025 13:44:15 +0200 Subject: [PATCH 35/50] gestion contexte + del marts villes --- .../marts_enrich_acteurs_villes_suggest.sql | 29 ------------------- ...arts_enrich_acteurs_villes_suggest_new.sql | 14 --------- ...ts_enrich_acteurs_villes_suggest_other.sql | 14 --------- 3 files changed, 57 deletions(-) delete mode 100644 dbt/models/marts/enrich/marts_enrich_acteurs_villes_suggest.sql delete mode 100644 dbt/models/marts/enrich/marts_enrich_acteurs_villes_suggest_new.sql delete mode 100644 dbt/models/marts/enrich/marts_enrich_acteurs_villes_suggest_other.sql diff --git a/dbt/models/marts/enrich/marts_enrich_acteurs_villes_suggest.sql b/dbt/models/marts/enrich/marts_enrich_acteurs_villes_suggest.sql deleted file mode 100644 index 67007050f..000000000 --- a/dbt/models/marts/enrich/marts_enrich_acteurs_villes_suggest.sql +++ /dev/null @@ -1,29 +0,0 @@ -{{ - config( - materialized = 'table', - tags=['marts', 'enrich', 'ville','cities', 'ban'], - ) -}} - -SELECT - acteurs.identifiant_unique AS acteur_id, - acteurs.ville AS acteur_ville, - acteurs.code_postal AS acteur_code_postal, - ban.ville_ancienne AS ban_ville_ancienne, - ban.ville AS ban_ville, - ban.code_postal AS ban_code_postal, - ban.ville AS suggest_ville -FROM {{ ref('marts_carte_acteur') }} AS acteurs -JOIN {{ ref('int_ban_villes') }} AS ban ON ban.code_postal = acteurs.code_postal -WHERE acteurs.statut = 'ACTIF' -AND acteurs.code_postal IS NOT NULL and acteurs.code_postal != '' and LENGTH(acteurs.code_postal) = 5 -/* Only suggest if 1 difference */ -AND ( - acteurs.ville != ban.ville_ancienne - OR acteurs.ville != ban.ville -) -/* BUT also a match somewhere */ -AND ( - udf_normalize_string_for_match(acteurs.ville,3) = udf_normalize_string_for_match(ban.ville_ancienne,3) - OR 
udf_normalize_string_for_match(acteurs.ville,3) = udf_normalize_string_for_match(ban.ville,3) -) diff --git a/dbt/models/marts/enrich/marts_enrich_acteurs_villes_suggest_new.sql b/dbt/models/marts/enrich/marts_enrich_acteurs_villes_suggest_new.sql deleted file mode 100644 index 449e6b3d7..000000000 --- a/dbt/models/marts/enrich/marts_enrich_acteurs_villes_suggest_new.sql +++ /dev/null @@ -1,14 +0,0 @@ -{{ - config( - materialized = 'view', - alias = 'marts_enrich_acteurs_villes_suggest_new', - tags=['marts', 'enrich', 'ville', 'ban','acteurs','nouvelle','new'], - ) -}} - -SELECT - '🌆 Changement de ville: 🟡 ancienne -> nouvelle' AS suggest_cohort, - * -FROM {{ ref('marts_enrich_acteurs_villes_suggest') }} -WHERE udf_normalize_string_for_match(acteur_ville,3) != udf_normalize_string_for_match(suggest_ville,3) -AND ban_ville_ancienne IS NOT NULL \ No newline at end of file diff --git a/dbt/models/marts/enrich/marts_enrich_acteurs_villes_suggest_other.sql b/dbt/models/marts/enrich/marts_enrich_acteurs_villes_suggest_other.sql deleted file mode 100644 index c3752fca0..000000000 --- a/dbt/models/marts/enrich/marts_enrich_acteurs_villes_suggest_other.sql +++ /dev/null @@ -1,14 +0,0 @@ -{{ - config( - materialized = 'view', - alias = 'marts_enrich_acteurs_villes_suggest_other', - tags=['marts', 'enrich', 'ville', 'ban','acteurs','nouvelle','other'], - ) -}} - -SELECT - '🌆 Changement de ville: 🔴 autre' AS suggest_cohort, - * -FROM {{ ref('marts_enrich_acteurs_villes_suggest') }} -WHERE udf_normalize_string_for_match(acteur_ville,3) != udf_normalize_string_for_match(suggest_ville,3) -AND ban_ville_ancienne IS NULL \ No newline at end of file From 1e82cc71b06867e0ec785c229847c4ea26798e12 Mon Sep 17 00:00:00 2001 From: maxcorbeau Date: Mon, 21 Apr 2025 13:46:06 +0200 Subject: [PATCH 36/50] gestion contexte --- .../enrich_dbt_model_to_suggestions.py | 23 ++++++++++++------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/dags/enrich/tasks/business_logic/enrich_dbt_model_to_suggestions.py b/dags/enrich/tasks/business_logic/enrich_dbt_model_to_suggestions.py index bdd89f979..028849dc0 100644 --- a/dags/enrich/tasks/business_logic/enrich_dbt_model_to_suggestions.py +++ b/dags/enrich/tasks/business_logic/enrich_dbt_model_to_suggestions.py @@ -34,7 +34,7 @@ def changes_prepare( def changes_prepare_closed_not_replaced( row: dict, -) -> list[dict]: +) -> tuple[list[dict], dict]: """Prepare suggestion changes for closed not replaced cohorts""" from data.models.changes import ChangeActeurUpdateData from qfdmo.models import ActeurStatus @@ -62,12 +62,13 @@ def changes_prepare_closed_not_replaced( entity_type="acteur_displayed", ) ) - return changes + contexte = {} # changes are self-explanatory + return changes, contexte def changes_prepare_closed_replaced( row: dict, -) -> list[dict]: +) -> tuple[list[dict], dict]: """Prepare suggestion changes for closed replaced cohorts""" from data.models.changes import ( ChangeActeurCreateAsChild, @@ -156,7 +157,8 @@ def changes_prepare_closed_replaced( entity_type="acteur_displayed", ) ) - return changes + contexte = {} # changes are self-explanatory + return changes, contexte # Mapping cohorts with their respective changes preparation function @@ -180,6 +182,12 @@ def enrich_dbt_model_to_suggestions( SuggestionStatut, ) + COHORTS_TO_SUGGESTION_ACTION = { + COHORTS.CLOSED_NOT_REPLACED: SuggestionAction.ENRICH_ACTEURS_CLOSED, + COHORTS.CLOSED_REP_OTHER_SIREN: SuggestionAction.ENRICH_ACTEURS_CLOSED, + COHORTS.CLOSED_REP_SAME_SIREN: 
SuggestionAction.ENRICH_ACTEURS_CLOSED, + } + # Validation if df is None or df.empty: raise ValueError("df vide: on devrait pas être ici") @@ -195,7 +203,7 @@ def enrich_dbt_model_to_suggestions( row = dict(row) try: - changes = COHORTS_TO_PREPARE_CHANGES[cohort](row) + changes, contexte = COHORTS_TO_PREPARE_CHANGES[cohort](row) # We tolerate some errors except Exception as e: @@ -206,10 +214,9 @@ def enrich_dbt_model_to_suggestions( # Creating a suggestion with the given changes suggestions.append( { - "contexte": {}, + "contexte": contexte, "suggestion": { "title": cohort, - "summary": [], "changes": changes, }, } @@ -234,7 +241,7 @@ def enrich_dbt_model_to_suggestions( identifiant_action=identifiant_action, identifiant_execution=f"{cohort}", statut=SuggestionStatut.AVALIDER, - type_action=SuggestionAction.ENRICH_ACTEURS_CLOSED, + type_action=COHORTS_TO_SUGGESTION_ACTION[cohort], metadata={"🔢 Nombre de suggestions": len(suggestions)}, ) db_cohort.save() From a13e3f7c7010aec5ec588db4abd6006c355c76e9 Mon Sep 17 00:00:00 2001 From: maxcorbeau Date: Mon, 21 Apr 2025 15:17:36 +0200 Subject: [PATCH 37/50] renommage fichier test --- ...rs_closed_suggestions.py => test_enrich_suggestions_closed.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename dags_unit_tests/enrich/tasks/{test_enrich_acteurs_closed_suggestions.py => test_enrich_suggestions_closed.py} (100%) diff --git a/dags_unit_tests/enrich/tasks/test_enrich_acteurs_closed_suggestions.py b/dags_unit_tests/enrich/tasks/test_enrich_suggestions_closed.py similarity index 100% rename from dags_unit_tests/enrich/tasks/test_enrich_acteurs_closed_suggestions.py rename to dags_unit_tests/enrich/tasks/test_enrich_suggestions_closed.py From 0d74a1e2dc5c88b3d9c9d89542cada4976264ae8 Mon Sep 17 00:00:00 2001 From: maxcorbeau Date: Mon, 21 Apr 2025 15:26:18 +0200 Subject: [PATCH 38/50] regroupement logique code --- .../enrich_dbt_model_to_suggestions.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/dags/enrich/tasks/business_logic/enrich_dbt_model_to_suggestions.py b/dags/enrich/tasks/business_logic/enrich_dbt_model_to_suggestions.py index 028849dc0..19f9cb1d2 100644 --- a/dags/enrich/tasks/business_logic/enrich_dbt_model_to_suggestions.py +++ b/dags/enrich/tasks/business_logic/enrich_dbt_model_to_suggestions.py @@ -197,13 +197,19 @@ def enrich_dbt_model_to_suggestions( msg = f"Problème cohorte: obtenu {cohorts=} vs. 
attendu {cohort=}" raise ValueError(msg) - # Suggestions + # Creating suggestion suggestions = [] for _, row in df.iterrows(): row = dict(row) try: changes, contexte = COHORTS_TO_PREPARE_CHANGES[cohort](row) + suggestions.append( + { + "contexte": contexte, + "suggestion": {"title": cohort, "changes": changes}, + } + ) # We tolerate some errors except Exception as e: @@ -211,17 +217,6 @@ def enrich_dbt_model_to_suggestions( logger.error(f"Erreur de préparation des changements: {e}") continue - # Creating a suggestion with the given changes - suggestions.append( - { - "contexte": contexte, - "suggestion": { - "title": cohort, - "changes": changes, - }, - } - ) - # we need some working suggestions, can't have it all fail if not suggestions: raise ValueError("Aucune suggestion à écrire, pas normal") From 040ce579aa92f450532f10e507da98e25867e18e Mon Sep 17 00:00:00 2001 From: maxcorbeau Date: Mon, 21 Apr 2025 15:55:49 +0200 Subject: [PATCH 39/50] automatiser row -> suggest data --- dags/enrich/config/__init__.py | 2 +- dags/enrich/config/columns.py | 26 ++++++---- .../enrich_dbt_model_to_suggestions.py | 43 +++++++++------- .../enrich/tasks/test_enrich_suggestions.py | 31 ++++++++++++ .../tasks/test_enrich_suggestions_closed.py | 16 ++++++ .../marts_enrich_acteurs_closed_replaced.sql | 50 ++++++++++--------- 6 files changed, 116 insertions(+), 52 deletions(-) create mode 100644 dags_unit_tests/enrich/tasks/test_enrich_suggestions.py diff --git a/dags/enrich/config/__init__.py b/dags/enrich/config/__init__.py index b6c1b63e6..7246ecb0a 100644 --- a/dags/enrich/config/__init__.py +++ b/dags/enrich/config/__init__.py @@ -1,5 +1,5 @@ from .cohorts import COHORTS # noqa: F401 -from .columns import COLS # noqa: F401 +from .columns import COLS, SUGGEST_PREFIX # noqa: F401 from .dbt import DBT # noqa: F401 from .models import DAG_ID_TO_CONFIG_MODEL, EnrichActeursClosedConfig # noqa: F401 from .paths import DIR_SQL_READ # noqa: F401 diff --git a/dags/enrich/config/columns.py b/dags/enrich/config/columns.py index e8dedff44..e9c48b924 100644 --- a/dags/enrich/config/columns.py +++ b/dags/enrich/config/columns.py @@ -2,6 +2,10 @@ from dataclasses import dataclass +# All values we want to suggest via our enrichment DBT models +# should start with this prefix +SUGGEST_PREFIX = "suggest" + @dataclass(frozen=True) class COLS: @@ -28,17 +32,17 @@ class COLS: AE_DIRIGEANTS_NOMS: str = "ae_dirigeants_noms_prenoms" # Suggestions - SUGGEST_COHORT_CODE: str = "suggestion_cohorte_code" - SUGGEST_COHORT: str = "suggest_cohort" - - # Replacements - SUGGEST_SIRET: str = "suggest_siret" - SUGGEST_NOM: str = "suggest_nom" - SUGGEST_ADRESSE: str = "suggest_adresse" - SUGGEST_CODE_POSTAL: str = "suggest_code_postal" - SUGGEST_VILLE: str = "suggest_ville" - SUGGEST_NAF: str = "suggest_naf" - + SUGGEST_COHORT: str = f"{SUGGEST_PREFIX}_cohort" + SUGGEST_SIRET: str = f"{SUGGEST_PREFIX}_siret" + SUGGEST_SIREN: str = f"{SUGGEST_PREFIX}_siren" + SUGGEST_NOM: str = f"{SUGGEST_PREFIX}_nom" + SUGGEST_ADRESSE: str = f"{SUGGEST_PREFIX}_adresse" + SUGGEST_CODE_POSTAL: str = f"{SUGGEST_PREFIX}_code_postal" + SUGGEST_VILLE: str = f"{SUGGEST_PREFIX}_ville" + SUGGEST_NAF: str = f"{SUGGEST_PREFIX}_naf_principal" + SUGGEST_LONGITUDE: str = f"{SUGGEST_PREFIX}_longitude" + SUGGEST_LATITUDE: str = f"{SUGGEST_PREFIX}_latitude" + SUGGEST_ACTEUR_TYPE_ID: str = f"{SUGGEST_PREFIX}_acteur_type_id" # Matching MATCH_WORDS: str = "match_words" MATCH_SCORE: str = "match_score" diff --git 
a/dags/enrich/tasks/business_logic/enrich_dbt_model_to_suggestions.py b/dags/enrich/tasks/business_logic/enrich_dbt_model_to_suggestions.py index 19f9cb1d2..277e8e41c 100644 --- a/dags/enrich/tasks/business_logic/enrich_dbt_model_to_suggestions.py +++ b/dags/enrich/tasks/business_logic/enrich_dbt_model_to_suggestions.py @@ -5,12 +5,32 @@ from cluster.tasks.business_logic.cluster_acteurs_parents_choose_new import ( parent_id_generate, ) -from enrich.config import COHORTS, COLS +from enrich.config import COHORTS, COLS, SUGGEST_PREFIX from utils import logging_utils as log logger = logging.getLogger(__name__) +def row_to_suggest_data(row: dict) -> dict: + """Construct the data dict from all row props starting with SUGGEST_PREFIX""" + pre = SUGGEST_PREFIX + keys_ok = [k for k in row.keys() if k.startswith(f"{pre}_")] + keys_ok.remove(f"{pre}_cohort") + + # Validation + keys_fail = [ + k + for k in row.keys() + if pre in k and k not in keys_ok and not k.startswith(f"{pre}_") + ] + if keys_fail: + msg = f"Colonnes invalides avec {pre} mais sans {pre}_: {keys_fail}" + raise KeyError(msg) + + # Construct the data dict + return {k.replace(pre + "_", ""): row[k] for k in keys_ok} + + def changes_prepare( model, model_params: dict, @@ -81,21 +101,13 @@ def changes_prepare_closed_replaced( today = datetime.now(timezone.utc).strftime("%Y-%m-%d") # Parent parent_id = parent_id_generate([str(row[COLS.SUGGEST_SIRET])]) + parent_data = row_to_suggest_data(row) + parent_data["identifiant_unique"] = parent_id + parent_data["source"] = None + parent_data["statut"] = ActeurStatus.ACTIF params_parent = { "id": parent_id, - "data": { - "identifiant_unique": parent_id, - "nom": row[COLS.SUGGEST_NOM], - "adresse": row[COLS.SUGGEST_ADRESSE], - "code_postal": row[COLS.SUGGEST_CODE_POSTAL], - "ville": row[COLS.SUGGEST_VILLE], - "siren": row[COLS.SUGGEST_SIRET][:9], - "siret": row[COLS.SUGGEST_SIRET], - "naf_principal": row[COLS.SUGGEST_NAF], - "acteur_type": row[COLS.ACTEUR_TYPE_ID], - "source": None, - "statut": ActeurStatus.ACTIF, - }, + "data": parent_data, } changes.append( changes_prepare( @@ -123,9 +135,6 @@ def changes_prepare_closed_replaced( f"détecté le {today} comme fermé dans AE, " f"remplacé par SIRET {row[COLS.SUGGEST_SIRET]}" ) - if row[COLS.ACTEUR_LONGITUDE] is not None and row[COLS.ACTEUR_LATITUDE] is not None: - params_child_new["data"]["longitude"] = row[COLS.ACTEUR_LONGITUDE] - params_child_new["data"]["latitude"] = row[COLS.ACTEUR_LATITUDE] changes.append( changes_prepare( model=ChangeActeurCreateAsChild, diff --git a/dags_unit_tests/enrich/tasks/test_enrich_suggestions.py b/dags_unit_tests/enrich/tasks/test_enrich_suggestions.py new file mode 100644 index 000000000..86302e574 --- /dev/null +++ b/dags_unit_tests/enrich/tasks/test_enrich_suggestions.py @@ -0,0 +1,31 @@ +import pytest +from enrich.tasks.business_logic.enrich_dbt_model_to_suggestions import ( + row_to_suggest_data, +) + + +class TestEnrichSuggestionsRowToSuggestData: + + def test_row_to_suggest_data(self): + row = { + "suggest_cohort": "cohort", + "suggest_siret": "12345678901234", + "foo": "bar", + } + data = row_to_suggest_data(row) + assert data == {"siret": "12345678901234"} + + @pytest.mark.parametrize( + "key", + ["suggest", "suggestion_siret", "siret_suggest"], + ) + def test_raise_if_inconsistent_suggest_keys(self, key): + row = {"suggest_cohort": "cohort"} # must always be present + row[key] = "12345678901234" + with pytest.raises(KeyError, match="Colonnes invalides"): + row_to_suggest_data(row) + + def 
test_raise_if_missing_cohort(self): + row = {"suggest_siret": "12345678901234"} + with pytest.raises(ValueError, match="not in list"): + row_to_suggest_data(row) diff --git a/dags_unit_tests/enrich/tasks/test_enrich_suggestions_closed.py b/dags_unit_tests/enrich/tasks/test_enrich_suggestions_closed.py index b10577897..9fdb456c3 100644 --- a/dags_unit_tests/enrich/tasks/test_enrich_suggestions_closed.py +++ b/dags_unit_tests/enrich/tasks/test_enrich_suggestions_closed.py @@ -64,6 +64,7 @@ def df_replaced(self, atype, source): "33333333300001", "55555555500001", ], + COLS.SUGGEST_SIREN: ["111111111", "333333333", "555555555"], COLS.SUGGEST_NOM: ["APRES a1", "APRES a2", "APRES a3"], COLS.SUGGEST_COHORT: [ COHORTS.CLOSED_REP_SAME_SIREN, @@ -74,6 +75,9 @@ def df_replaced(self, atype, source): COLS.SUGGEST_CODE_POSTAL: ["12345", "67890", "12345"], COLS.SUGGEST_VILLE: ["Ville1", "Ville2", "Ville3"], COLS.SUGGEST_NAF: ["naf1", "naf2", "naf3"], + COLS.SUGGEST_LONGITUDE: [1, 2, 3], + COLS.SUGGEST_LATITUDE: [11, 22, 33], + COLS.SUGGEST_ACTEUR_TYPE_ID: [atype.pk, atype.pk, atype.pk], } ) @@ -182,6 +186,8 @@ def test_cohorte_meme_siren(self, acteurs, atype, source, df_replaced_meme_siret assert parent.naf_principal == "naf1" assert parent.acteur_type == atype assert parent.source is None + assert parent.location.x == 1 + assert parent.location.y == 11 child = RevisionActeur.objects.get(pk="a1") assert child.statut == ActeurStatus.INACTIF @@ -191,6 +197,8 @@ def test_cohorte_meme_siren(self, acteurs, atype, source, df_replaced_meme_siret f"remplacé par SIRET 11111111100002" ) assert child.siret_is_closed is True + assert child.location.x == 1 + assert child.location.y == 11 def test_cohorte_autre_siren(self, acteurs, df_replaced_autre_siret): from data.models.suggestion import Suggestion, SuggestionCohorte @@ -223,6 +231,8 @@ def test_cohorte_autre_siren(self, acteurs, df_replaced_autre_siret): assert parent.code_postal == "67890" assert parent.ville == "Ville2" assert parent.naf_principal == "naf2" + assert parent.location.x == 2 + assert parent.location.y == 22 child = RevisionActeur.objects.get(pk="a2") assert child.statut == ActeurStatus.INACTIF @@ -232,6 +242,8 @@ def test_cohorte_autre_siren(self, acteurs, df_replaced_autre_siret): f"remplacé par SIRET 33333333300001" ) assert child.siret_is_closed is True + assert child.location.x == 2 + assert child.location.y == 22 parent_id = parent_id_generate(["55555555500001"]) parent = RevisionActeur.objects.get(pk=parent_id) @@ -240,6 +252,8 @@ def test_cohorte_autre_siren(self, acteurs, df_replaced_autre_siret): assert parent.code_postal == "12345" assert parent.ville == "Ville3" assert parent.naf_principal == "naf3" + assert parent.location.x == 3 + assert parent.location.y == 33 child = RevisionActeur.objects.get(pk="a3") assert child.statut == ActeurStatus.INACTIF @@ -248,3 +262,5 @@ def test_cohorte_autre_siren(self, acteurs, df_replaced_autre_siret): f"SIRET 44444444400001 détecté le {TODAY} comme fermé dans AE, " f"remplacé par SIRET 55555555500001" ) + assert child.location.x == 3 + assert child.location.y == 33 diff --git a/dbt/models/marts/enrich/marts_enrich_acteurs_closed_replaced.sql b/dbt/models/marts/enrich/marts_enrich_acteurs_closed_replaced.sql index 53fbf95b5..94c964759 100644 --- a/dbt/models/marts/enrich/marts_enrich_acteurs_closed_replaced.sql +++ b/dbt/models/marts/enrich/marts_enrich_acteurs_closed_replaced.sql @@ -11,56 +11,60 @@ WITH potential_replacements AS ( -- Candidates acteur data candidates.*, - -- Replacements - 
replacements.siret AS suggest_siret, - LEFT(candidates.acteur_siret,9) = LEFT(replacements.siret,9) AS suggest_siret_is_from_same_siren, - replacements.nom AS suggest_nom, - replacements.naf AS suggest_naf, - replacements.ville AS suggest_ville, - replacements.code_postal AS suggest_code_postal, - replacements.adresse AS suggest_adresse, + -- Suggestions + acteur_type_id AS suggest_acteur_type_id, + acteur_longitude AS suggest_longitude, + acteur_latitude AS suggest_latitude, + suggests.siret AS suggest_siret, + LEFT(suggests.siret,9) AS suggest_siren, + LEFT(candidates.acteur_siret,9) = LEFT(suggests.siret,9) AS suggest_siret_is_from_same_siren, + suggests.nom AS suggest_nom, + suggests.naf AS suggest_naf, + suggests.ville AS suggest_ville, + suggests.code_postal AS suggest_code_postal, + suggests.adresse AS suggest_adresse, -- Matching udf_columns_words_in_common_count( candidates.acteur_nom_normalise, - udf_normalize_string_for_match(replacements.nom) + udf_normalize_string_for_match(suggests.nom) ) AS noms_nombre_mots_commun, ROW_NUMBER() OVER ( PARTITION BY candidates.acteur_siret ORDER BY - -- Prioritize replacements from same company + -- Prioritize suggests from same company CASE - WHEN LEFT(candidates.acteur_siret,9) = LEFT(replacements.siret,9) THEN 1 + WHEN LEFT(candidates.acteur_siret,9) = LEFT(suggests.siret,9) THEN 1 ELSE 0 END DESC, -- Then etablissements with more words in common udf_columns_words_in_common_count( candidates.acteur_nom_normalise, - udf_normalize_string_for_match(replacements.nom) + udf_normalize_string_for_match(suggests.nom) ) DESC ) AS replacement_priority /* - JOINS: candidates are our acteurs, replacements are etablissements + JOINS: candidates are our acteurs, suggests are etablissements with a matching naf, code_postal, adresse and adresse_numero */ FROM {{ ref('marts_enrich_acteurs_closed_candidates') }} AS candidates - INNER JOIN {{ ref('int_ae_etablissement') }} AS replacements - ON replacements.naf = candidates.etab_naf - AND replacements.code_postal = candidates.etab_code_postal - AND replacements.adresse_numero = candidates.etab_adresse_numero - AND udf_normalize_string_for_match(replacements.adresse) = udf_normalize_string_for_match(candidates.etab_adresse) - WHERE replacements.est_actif + INNER JOIN {{ ref('int_ae_etablissement') }} AS suggests + ON suggests.naf = candidates.etab_naf + AND suggests.code_postal = candidates.etab_code_postal + AND suggests.adresse_numero = candidates.etab_adresse_numero + AND udf_normalize_string_for_match(suggests.adresse) = udf_normalize_string_for_match(candidates.etab_adresse) + WHERE suggests.est_actif -- Fields which must be non-NULL for a replacement to be considered - AND replacements.code_postal IS NOT NULL - AND replacements.adresse IS NOT NULL + AND suggests.code_postal IS NOT NULL + AND suggests.adresse IS NOT NULL /* To reduce false positives with generic addresses such as ZA, ZI containing multiple instances of similar stores (e.g. 
supermarkets), we force presence of street number, which later will be used as condition for matching */ - AND replacements.adresse_numero IS NOT NULL + AND suggests.adresse_numero IS NOT NULL ) SELECT * FROM potential_replacements WHERE replacement_priority=1 -/* We don't want to propose replacements with unavailable names */ +/* We don't want to propose suggests with unavailable names */ AND suggest_nom != {{ value_unavailable() }} \ No newline at end of file From 6c77d1658764dc40f90a4558bfff474715140724 Mon Sep 17 00:00:00 2001 From: maxcorbeau Date: Mon, 21 Apr 2025 16:03:42 +0200 Subject: [PATCH 40/50] profiter pour corriger les siren --- .../tasks/business_logic/enrich_dbt_model_to_suggestions.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dags/enrich/tasks/business_logic/enrich_dbt_model_to_suggestions.py b/dags/enrich/tasks/business_logic/enrich_dbt_model_to_suggestions.py index 277e8e41c..41971a205 100644 --- a/dags/enrich/tasks/business_logic/enrich_dbt_model_to_suggestions.py +++ b/dags/enrich/tasks/business_logic/enrich_dbt_model_to_suggestions.py @@ -65,9 +65,8 @@ def changes_prepare_closed_not_replaced( "data": { "identifiant_unique": row[COLS.ACTEUR_ID], "statut": ActeurStatus.INACTIF, - # TODO: fix inconsistency between acteur_siret and siret - # in non-replaced model "siret": row[COLS.ACTEUR_SIRET], + "siren": row[COLS.ACTEUR_SIRET][:9], "siret_is_closed": True, "acteur_type": row[COLS.ACTEUR_TYPE_ID], "source": row[COLS.ACTEUR_SOURCE_ID], From 035c6d1bb0f643f203652ac68eb0dd7a9ab1dc86 Mon Sep 17 00:00:00 2001 From: maxcorbeau Date: Mon, 21 Apr 2025 16:57:19 +0200 Subject: [PATCH 41/50] commande dbt au mauvais endroit --- dags/enrich/config/models.py | 2 +- dags/enrich/dags/enrich_acteurs_closed.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/dags/enrich/config/models.py b/dags/enrich/config/models.py index 2005c0839..3ec5d7eab 100644 --- a/dags/enrich/config/models.py +++ b/dags/enrich/config/models.py @@ -47,7 +47,7 @@ class EnrichBaseConfig(BaseModel): 🔴 Désactiver uniquement pour des tests.""", ) dbt_models_refresh_command: str = Field( - default="dbt build --select tag:marts,tag:enrich,tag:closed", + default="", description="🔄 Commande DBT à exécuter pour rafraîchir les modèles", ) filter_contains__acteur_commentaires: Optional[str] = Field( diff --git a/dags/enrich/dags/enrich_acteurs_closed.py b/dags/enrich/dags/enrich_acteurs_closed.py index 24397914e..bb806ae64 100644 --- a/dags/enrich/dags/enrich_acteurs_closed.py +++ b/dags/enrich/dags/enrich_acteurs_closed.py @@ -42,6 +42,9 @@ params=config_to_airflow_params( EnrichActeursClosedConfig( dbt_models_refresh=False, + dbt_models_refresh_command=( + "dbt build --select tag:marts,tag:enrich,tag:closed" + ), filter_equals__acteur_statut="ACTIF", ) ), From cd13fb463df275b1f2b7233140d3323ba40d425d Mon Sep 17 00:00:00 2001 From: maxcorbeau Date: Mon, 21 Apr 2025 17:02:18 +0200 Subject: [PATCH 42/50] =?UTF-8?q?renommage/d=C3=A9placement=20row=20to=20d?= =?UTF-8?q?ata?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../enrich_dbt_model_row_to_suggest_data.py | 22 +++++++++++++++ .../enrich_dbt_model_to_suggestions.py | 27 ++++--------------- ...y => test_enrich_dbt_model_row_to_data.py} | 12 ++++----- 3 files changed, 33 insertions(+), 28 deletions(-) create mode 100644 dags/enrich/tasks/business_logic/enrich_dbt_model_row_to_suggest_data.py rename dags_unit_tests/enrich/tasks/{test_enrich_suggestions.py => 
test_enrich_dbt_model_row_to_data.py} (71%) diff --git a/dags/enrich/tasks/business_logic/enrich_dbt_model_row_to_suggest_data.py b/dags/enrich/tasks/business_logic/enrich_dbt_model_row_to_suggest_data.py new file mode 100644 index 000000000..e22f6bd41 --- /dev/null +++ b/dags/enrich/tasks/business_logic/enrich_dbt_model_row_to_suggest_data.py @@ -0,0 +1,22 @@ +from enrich.config import SUGGEST_PREFIX + + +def dbt_model_row_to_suggest_data(row: dict) -> dict: + """Construct the pydantic model_params.data dict from a + dbt model's row based on fields prefixed with SUGGEST_PREFIX""" + pre = SUGGEST_PREFIX + keys_ok = [k for k in row.keys() if k.startswith(f"{pre}_")] + keys_ok.remove(f"{pre}_cohort") + + # Validation + keys_fail = [ + k + for k in row.keys() + if pre in k and k not in keys_ok and not k.startswith(f"{pre}_") + ] + if keys_fail: + msg = f"Colonnes invalides avec {pre} mais sans {pre}_: {keys_fail}" + raise KeyError(msg) + + # Construct the data dict + return {k.replace(pre + "_", ""): row[k] for k in keys_ok} diff --git a/dags/enrich/tasks/business_logic/enrich_dbt_model_to_suggestions.py b/dags/enrich/tasks/business_logic/enrich_dbt_model_to_suggestions.py index 41971a205..3a93a1bc9 100644 --- a/dags/enrich/tasks/business_logic/enrich_dbt_model_to_suggestions.py +++ b/dags/enrich/tasks/business_logic/enrich_dbt_model_to_suggestions.py @@ -5,32 +5,15 @@ from cluster.tasks.business_logic.cluster_acteurs_parents_choose_new import ( parent_id_generate, ) -from enrich.config import COHORTS, COLS, SUGGEST_PREFIX +from enrich.config import COHORTS, COLS +from enrich.tasks.business_logic.enrich_dbt_model_row_to_suggest_data import ( + dbt_model_row_to_suggest_data, +) from utils import logging_utils as log logger = logging.getLogger(__name__) -def row_to_suggest_data(row: dict) -> dict: - """Construct the data dict from all row props starting with SUGGEST_PREFIX""" - pre = SUGGEST_PREFIX - keys_ok = [k for k in row.keys() if k.startswith(f"{pre}_")] - keys_ok.remove(f"{pre}_cohort") - - # Validation - keys_fail = [ - k - for k in row.keys() - if pre in k and k not in keys_ok and not k.startswith(f"{pre}_") - ] - if keys_fail: - msg = f"Colonnes invalides avec {pre} mais sans {pre}_: {keys_fail}" - raise KeyError(msg) - - # Construct the data dict - return {k.replace(pre + "_", ""): row[k] for k in keys_ok} - - def changes_prepare( model, model_params: dict, @@ -100,7 +83,7 @@ def changes_prepare_closed_replaced( today = datetime.now(timezone.utc).strftime("%Y-%m-%d") # Parent parent_id = parent_id_generate([str(row[COLS.SUGGEST_SIRET])]) - parent_data = row_to_suggest_data(row) + parent_data = dbt_model_row_to_suggest_data(row) parent_data["identifiant_unique"] = parent_id parent_data["source"] = None parent_data["statut"] = ActeurStatus.ACTIF diff --git a/dags_unit_tests/enrich/tasks/test_enrich_suggestions.py b/dags_unit_tests/enrich/tasks/test_enrich_dbt_model_row_to_data.py similarity index 71% rename from dags_unit_tests/enrich/tasks/test_enrich_suggestions.py rename to dags_unit_tests/enrich/tasks/test_enrich_dbt_model_row_to_data.py index 86302e574..4c96476e2 100644 --- a/dags_unit_tests/enrich/tasks/test_enrich_suggestions.py +++ b/dags_unit_tests/enrich/tasks/test_enrich_dbt_model_row_to_data.py @@ -1,10 +1,10 @@ import pytest -from enrich.tasks.business_logic.enrich_dbt_model_to_suggestions import ( - row_to_suggest_data, +from enrich.tasks.business_logic.enrich_dbt_model_row_to_suggest_data import ( + dbt_model_row_to_suggest_data, ) -class 
TestEnrichSuggestionsRowToSuggestData: +class TestEnrichDbtModelRowToData: def test_row_to_suggest_data(self): row = { @@ -12,7 +12,7 @@ def test_row_to_suggest_data(self): "suggest_siret": "12345678901234", "foo": "bar", } - data = row_to_suggest_data(row) + data = dbt_model_row_to_suggest_data(row) assert data == {"siret": "12345678901234"} @pytest.mark.parametrize( @@ -23,9 +23,9 @@ def test_raise_if_inconsistent_suggest_keys(self, key): row = {"suggest_cohort": "cohort"} # must always be present row[key] = "12345678901234" with pytest.raises(KeyError, match="Colonnes invalides"): - row_to_suggest_data(row) + dbt_model_row_to_suggest_data(row) def test_raise_if_missing_cohort(self): row = {"suggest_siret": "12345678901234"} with pytest.raises(ValueError, match="not in list"): - row_to_suggest_data(row) + dbt_model_row_to_suggest_data(row) From eeb4b50db2f70678b82e98357596ef4ea1a728b7 Mon Sep 17 00:00:00 2001 From: maxcorbeau Date: Wed, 23 Apr 2025 15:36:22 +0200 Subject: [PATCH 43/50] =?UTF-8?q?migration:=20suppression=20RGPD=20=C3=A0?= =?UTF-8?q?=20venir=20via=20autre=20PR?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ...011_alter_suggestioncohorte_type_action.py | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 data/migrations/0011_alter_suggestioncohorte_type_action.py diff --git a/data/migrations/0011_alter_suggestioncohorte_type_action.py b/data/migrations/0011_alter_suggestioncohorte_type_action.py new file mode 100644 index 000000000..dbbdecf36 --- /dev/null +++ b/data/migrations/0011_alter_suggestioncohorte_type_action.py @@ -0,0 +1,32 @@ +# Generated by Django 5.1.6 on 2025-04-23 13:35 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("data", "0010_alter_suggestioncohorte_type_action"), + ] + + operations = [ + migrations.AlterField( + model_name="suggestioncohorte", + name="type_action", + field=models.CharField( + blank=True, + choices=[ + ("CRAWL_URLS", "🔗 URLs scannées"), + ("ENRICH_ACTEURS_CLOSED", "🚪 Acteurs fermés"), + ("CLUSTERING", "regroupement/déduplication des acteurs"), + ("SOURCE_AJOUT", "ingestion de source de données - nouveau acteur"), + ( + "SOURCE_MODIFICATION", + "ingestion de source de données - modification d'acteur existant", + ), + ("SOURCE_SUPRESSION", "ingestion de source de données"), + ], + max_length=50, + ), + ), + ] From 385ff09f4c2ed16695c8cb10af25986c13bdd962 Mon Sep 17 00:00:00 2001 From: maxcorbeau Date: Wed, 23 Apr 2025 08:08:46 +0200 Subject: [PATCH 44/50] DAG de rafraîchissement --- dags/enrich/dags/enrich_dbt_models_refresh.py | 58 +++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 dags/enrich/dags/enrich_dbt_models_refresh.py diff --git a/dags/enrich/dags/enrich_dbt_models_refresh.py b/dags/enrich/dags/enrich_dbt_models_refresh.py new file mode 100644 index 000000000..239761556 --- /dev/null +++ b/dags/enrich/dags/enrich_dbt_models_refresh.py @@ -0,0 +1,58 @@ +""" +DAG to refresh the DBT models needed to +enrich acteurs (Annuaire Entreprises, BAN) +""" + +import re + +from airflow import DAG +from airflow.models.baseoperator import chain +from airflow.operators.bash import BashOperator +from enrich.config import ( + EnrichActeursClosedConfig, +) +from shared.config import CATCHUPS, SCHEDULES, START_DATES, config_to_airflow_params + +with DAG( + dag_id="enrich_dbt_models_refresh", + dag_display_name="🔄 Enrichir - Rafraîchir les modèles DBT", + default_args={ + "owner":
"airflow", + "depends_on_past": False, + "email_on_failure": False, + "email_on_retry": False, + "retries": 0, + }, + description=( + "Un DAG pour rafraîchir les modèles DBT nécessaires" + "à l'enrichissement des acteurs" + ), + tags=["dbt", "annuaire", "entreprises", "ae", "ban", "marts"], + schedule=SCHEDULES.DAILY, + catchup=CATCHUPS.AWLAYS_FALSE, + start_date=START_DATES.YESTERDAY, + params=config_to_airflow_params( + EnrichActeursClosedConfig( + dbt_models_refresh=True, + dbt_models_refresh_command=""" + dbt build --select model1 + dbt build --select model2 + """, + ) + ), +) as dag: + commands = [ + x.strip() + for x in dag.params.get("dbt_models_refresh_command").split("\n") # type: ignore + if x.strip() + ] + ops = [] + for command in commands: + cmd_id = re.sub(r"__+", "_", re.sub(r"[^a-zA-Z0-9]+", "_", command)) + ops.append( + BashOperator( + task_id=f"enrich_{cmd_id}", + bash_command=command, + ) + ) + chain(*ops) From 0773bc6229aaedf9f2aad7cc16a9bd04095e5b5a Mon Sep 17 00:00:00 2001 From: maxcorbeau Date: Wed, 23 Apr 2025 08:20:37 +0200 Subject: [PATCH 45/50] =?UTF-8?q?sp=C3=A9cifier=20les=20mod=C3=A8les=20dbt?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- dags/enrich/dags/enrich_dbt_models_refresh.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dags/enrich/dags/enrich_dbt_models_refresh.py b/dags/enrich/dags/enrich_dbt_models_refresh.py index 239761556..4a08ad5d4 100644 --- a/dags/enrich/dags/enrich_dbt_models_refresh.py +++ b/dags/enrich/dags/enrich_dbt_models_refresh.py @@ -35,8 +35,10 @@ EnrichActeursClosedConfig( dbt_models_refresh=True, dbt_models_refresh_command=""" - dbt build --select model1 - dbt build --select model2 + dbt build --select +int_ae_unite_legale + dbt build --select int_ae_etablissement + dbt build --select +int_ban_adresses + dbt build --select int_ban_villes """, ) ), From 6a689c78963f7b0872a14f0343d06bee2b189401 Mon Sep 17 00:00:00 2001 From: maxcorbeau Date: Wed, 23 Apr 2025 08:33:13 +0200 Subject: [PATCH 46/50] changer param dag en liste --- dags/enrich/config/__init__.py | 6 +++- dags/enrich/config/models.py | 7 ++++ dags/enrich/dags/enrich_dbt_models_refresh.py | 35 +++++++++---------- dags/shared/config/models.py | 1 + 4 files changed, 29 insertions(+), 20 deletions(-) diff --git a/dags/enrich/config/__init__.py b/dags/enrich/config/__init__.py index 7246ecb0a..c826578a0 100644 --- a/dags/enrich/config/__init__.py +++ b/dags/enrich/config/__init__.py @@ -1,7 +1,11 @@ from .cohorts import COHORTS # noqa: F401 from .columns import COLS, SUGGEST_PREFIX # noqa: F401 from .dbt import DBT # noqa: F401 -from .models import DAG_ID_TO_CONFIG_MODEL, EnrichActeursClosedConfig # noqa: F401 +from .models import ( # noqa: F401 + DAG_ID_TO_CONFIG_MODEL, + EnrichActeursClosedConfig, + EnrichDbtModelsRefreshConfig, +) from .paths import DIR_SQL_READ # noqa: F401 from .tasks import TASKS # noqa: F401 from .xcoms import XCOMS, xcom_pull # noqa: F401 diff --git a/dags/enrich/config/models.py b/dags/enrich/config/models.py index 3ec5d7eab..1c0ec3f97 100644 --- a/dags/enrich/config/models.py +++ b/dags/enrich/config/models.py @@ -82,6 +82,13 @@ class EnrichActeursClosedConfig(EnrichBaseConfig): ) +class EnrichDbtModelsRefreshConfig(BaseModel): + dbt_models_refresh_commands: list[str] = Field( + default=[], + description="🔄 Liste de commandes DBT à exécuter pour rafraîchir les modèles", + ) + + DAG_ID_TO_CONFIG_MODEL = { "enrich_acteurs_closed": EnrichActeursClosedConfig, } diff --git 
a/dags/enrich/dags/enrich_dbt_models_refresh.py b/dags/enrich/dags/enrich_dbt_models_refresh.py index 4a08ad5d4..1de40bd85 100644 --- a/dags/enrich/dags/enrich_dbt_models_refresh.py +++ b/dags/enrich/dags/enrich_dbt_models_refresh.py @@ -9,7 +9,7 @@ from airflow.models.baseoperator import chain from airflow.operators.bash import BashOperator from enrich.config import ( - EnrichActeursClosedConfig, + EnrichDbtModelsRefreshConfig, ) from shared.config import CATCHUPS, SCHEDULES, START_DATES, config_to_airflow_params @@ -32,29 +32,26 @@ catchup=CATCHUPS.AWLAYS_FALSE, start_date=START_DATES.YESTERDAY, params=config_to_airflow_params( - EnrichActeursClosedConfig( - dbt_models_refresh=True, - dbt_models_refresh_command=""" - dbt build --select +int_ae_unite_legale - dbt build --select int_ae_etablissement - dbt build --select +int_ban_adresses - dbt build --select int_ban_villes - """, + EnrichDbtModelsRefreshConfig( + dbt_models_refresh_commands=[ + "dbt build --select +int_ae_unite_legale", + "dbt build --select int_ae_etablissement", + "dbt build --select +int_ban_adresses", + "dbt build --select int_ban_villes", + ], ) ), ) as dag: - commands = [ - x.strip() - for x in dag.params.get("dbt_models_refresh_command").split("\n") # type: ignore - if x.strip() - ] - ops = [] - for command in commands: - cmd_id = re.sub(r"__+", "_", re.sub(r"[^a-zA-Z0-9]+", "_", command)) - ops.append( + tasks = [] + for command in dag.params.get("dbt_models_refresh_commands", []): + cmd = command.strip() + if not cmd: + continue + cmd_id = re.sub(r"__+", "_", re.sub(r"[^a-zA-Z0-9]+", "_", cmd)) + tasks.append( BashOperator( task_id=f"enrich_{cmd_id}", bash_command=command, ) ) - chain(*ops) + chain(*tasks) diff --git a/dags/shared/config/models.py b/dags/shared/config/models.py index 89740aa14..51b403470 100644 --- a/dags/shared/config/models.py +++ b/dags/shared/config/models.py @@ -7,6 +7,7 @@ bool: "boolean", str: "string", typing.Optional[str]: ["null", "string"], + list[str]: ["array"], } From 63225b09a4956c8a24b314f146aace4acb54a510 Mon Sep 17 00:00:00 2001 From: maxcorbeau Date: Wed, 23 Apr 2025 09:55:43 +0200 Subject: [PATCH 47/50] =?UTF-8?q?=C3=A9viter=20probl=C3=A8me=20d=C3=A9pend?= =?UTF-8?q?ances?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- dags/enrich/dags/enrich_dbt_models_refresh.py | 3 ++- .../ae_annuaire_entreprises/int_ae_etablissement.sql | 3 +++ .../ae_annuaire_entreprises/int_ae_unite_legale.sql | 2 ++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/dags/enrich/dags/enrich_dbt_models_refresh.py b/dags/enrich/dags/enrich_dbt_models_refresh.py index 1de40bd85..caaee8c05 100644 --- a/dags/enrich/dags/enrich_dbt_models_refresh.py +++ b/dags/enrich/dags/enrich_dbt_models_refresh.py @@ -35,7 +35,7 @@ EnrichDbtModelsRefreshConfig( dbt_models_refresh_commands=[ "dbt build --select +int_ae_unite_legale", - "dbt build --select int_ae_etablissement", + "dbt build --select +int_ae_etablissement", "dbt build --select +int_ban_adresses", "dbt build --select int_ban_villes", ], @@ -48,6 +48,7 @@ if not cmd: continue cmd_id = re.sub(r"__+", "_", re.sub(r"[^a-zA-Z0-9]+", "_", cmd)) + cmd += " --debug --threads 1" tasks.append( BashOperator( task_id=f"enrich_{cmd_id}", diff --git a/dbt/models/intermediate/ae_annuaire_entreprises/int_ae_etablissement.sql b/dbt/models/intermediate/ae_annuaire_entreprises/int_ae_etablissement.sql index b313e9eac..e4479943c 100644 --- a/dbt/models/intermediate/ae_annuaire_entreprises/int_ae_etablissement.sql +++ 
diff --git a/dbt/models/intermediate/ae_annuaire_entreprises/int_ae_etablissement.sql b/dbt/models/intermediate/ae_annuaire_entreprises/int_ae_etablissement.sql
index b313e9eac..e4479943c 100644
--- a/dbt/models/intermediate/ae_annuaire_entreprises/int_ae_etablissement.sql
+++ b/dbt/models/intermediate/ae_annuaire_entreprises/int_ae_etablissement.sql
@@ -1,3 +1,6 @@
+-- depends_on: {{ ref('base_ae_unite_legale') }}
+-- depends_on: {{ ref('base_ae_etablissement') }}
+
 /*
 Notes:
  - 🖊️ Renaming columns to follow our naming convention
diff --git a/dbt/models/intermediate/ae_annuaire_entreprises/int_ae_unite_legale.sql b/dbt/models/intermediate/ae_annuaire_entreprises/int_ae_unite_legale.sql
index 162f3b84e..48d67956c 100644
--- a/dbt/models/intermediate/ae_annuaire_entreprises/int_ae_unite_legale.sql
+++ b/dbt/models/intermediate/ae_annuaire_entreprises/int_ae_unite_legale.sql
@@ -1,3 +1,5 @@
+-- depends_on: {{ ref('base_ae_unite_legale') }}
+
 /*
 Notes:
  - 🖊️ Renaming columns to follow our naming convention

From 60b373f063dac06cdb0c18ac90379b2cdba0f877 Mon Sep 17 00:00:00 2001
From: maxcorbeau
Date: Wed, 23 Apr 2025 13:21:11 +0200
Subject: [PATCH 48/50] help the linter

---
 dags/shared/config/models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dags/shared/config/models.py b/dags/shared/config/models.py
index 51b403470..021ae2024 100644
--- a/dags/shared/config/models.py
+++ b/dags/shared/config/models.py
@@ -28,7 +28,7 @@ def config_to_airflow_params(model_instance: BaseModel) -> dict[str, Param]:
     model_cls = model_instance.__class__
     for field_name, field_info in model_cls.model_fields.items():
         field_value = getattr(model_instance, field_name)  # Get value from instance
-
+        assert field_info.annotation is not None
         params[field_name] = Param(
             field_value,
             type=PYDANTIC_TYPE_TO_AIRFLOW_TYPE[field_info.annotation],

From 9346c6f67842910d4b828284cbc9057d73c4d9d8 Mon Sep 17 00:00:00 2001
From: maxcorbeau
Date: Wed, 23 Apr 2025 15:16:07 +0200
Subject: [PATCH 49/50] fix: new cmd not being used
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 dags/enrich/dags/enrich_dbt_models_refresh.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/dags/enrich/dags/enrich_dbt_models_refresh.py b/dags/enrich/dags/enrich_dbt_models_refresh.py
index caaee8c05..c34a50955 100644
--- a/dags/enrich/dags/enrich_dbt_models_refresh.py
+++ b/dags/enrich/dags/enrich_dbt_models_refresh.py
@@ -8,9 +8,7 @@
 from airflow import DAG
 from airflow.models.baseoperator import chain
 from airflow.operators.bash import BashOperator
-from enrich.config import (
-    EnrichDbtModelsRefreshConfig,
-)
+from enrich.config import EnrichDbtModelsRefreshConfig
 from shared.config import CATCHUPS, SCHEDULES, START_DATES, config_to_airflow_params
 
 with DAG(
@@ -43,8 +41,8 @@
     ),
 ) as dag:
     tasks = []
-    for command in dag.params.get("dbt_models_refresh_commands", []):
-        cmd = command.strip()
+    for cmd in dag.params.get("dbt_models_refresh_commands", []):
+        cmd = cmd.strip()
         if not cmd:
             continue
         cmd_id = re.sub(r"__+", "_", re.sub(r"[^a-zA-Z0-9]+", "_", cmd))
@@ -52,7 +50,7 @@
         tasks.append(
             BashOperator(
                 task_id=f"enrich_{cmd_id}",
-                bash_command=command,
+                bash_command=cmd,
             )
         )
     chain(*tasks)
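
Note on patch 49: the bug was classic variable shadowing. The loop built a cleaned-up cmd but kept passing the untouched command, so the " --debug --threads 1" suffix added in patch 47 never reached BashOperator. A minimal repro of before and after:

# Before the fix: two names for the same command, only one of them updated.
commands = ["dbt build --select int_ban_villes"]

built_before = []
for command in commands:
    cmd = command.strip()
    cmd += " --debug --threads 1"
    built_before.append(command)  # bug: 'command' never received the suffix

built_after = []
for cmd in commands:
    cmd = cmd.strip()
    cmd += " --debug --threads 1"
    built_after.append(cmd)  # fix: a single name, so the suffix is preserved

assert built_before == ["dbt build --select int_ban_villes"]
assert built_after == ["dbt build --select int_ban_villes --debug --threads 1"]
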
From 47c6c5c862c8a79cdedfaab258273e194d2a5a5b Mon Sep 17 00:00:00 2001
From: maxcorbeau
Date: Thu, 24 Apr 2025 07:23:47 +0200
Subject: [PATCH 50/50] one failing cmd does not block the others
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 dags/enrich/dags/enrich_dbt_models_refresh.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/dags/enrich/dags/enrich_dbt_models_refresh.py b/dags/enrich/dags/enrich_dbt_models_refresh.py
index c34a50955..c03ec8d44 100644
--- a/dags/enrich/dags/enrich_dbt_models_refresh.py
+++ b/dags/enrich/dags/enrich_dbt_models_refresh.py
@@ -8,6 +8,7 @@
 from airflow import DAG
 from airflow.models.baseoperator import chain
 from airflow.operators.bash import BashOperator
+from airflow.utils.trigger_rule import TriggerRule
 from enrich.config import EnrichDbtModelsRefreshConfig
 from shared.config import CATCHUPS, SCHEDULES, START_DATES, config_to_airflow_params
 
@@ -51,6 +52,7 @@
             BashOperator(
                 task_id=f"enrich_{cmd_id}",
                 bash_command=cmd,
+                trigger_rule=TriggerRule.ALL_DONE,
            )
         )
     chain(*tasks)
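
Note on patch 50: with Airflow's default ALL_SUCCESS trigger rule, one failed dbt command would skip every task downstream of it in the chain. ALL_DONE runs each task once its upstream has finished in any state, so the remaining commands still execute, although the DAG run as a whole is still marked failed. A minimal sketch in isolation; the dag_id and commands are placeholders, not part of the patch series:

from datetime import datetime

from airflow import DAG
from airflow.models.baseoperator import chain
from airflow.operators.bash import BashOperator
from airflow.utils.trigger_rule import TriggerRule

with DAG(
    dag_id="trigger_rule_all_done_demo",  # placeholder id for illustration
    schedule=None,
    start_date=datetime(2025, 1, 1),
    catchup=False,
) as dag:
    ok_1 = BashOperator(task_id="ok_1", bash_command="echo step 1")
    boom = BashOperator(
        task_id="boom",
        bash_command="exit 1",  # simulates one failing dbt command
        trigger_rule=TriggerRule.ALL_DONE,
    )
    ok_2 = BashOperator(
        task_id="ok_2",
        bash_command="echo step 3 still runs",
        trigger_rule=TriggerRule.ALL_DONE,  # runs even though 'boom' failed
    )
    chain(ok_1, boom, ok_2)
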