diff --git a/dags/enrich/config/__init__.py b/dags/enrich/config/__init__.py index c2ace783b..c85294e29 100644 --- a/dags/enrich/config/__init__.py +++ b/dags/enrich/config/__init__.py @@ -5,6 +5,7 @@ DAG_ID_TO_CONFIG_MODEL, EnrichActeursClosedConfig, EnrichActeursRGPDConfig, + EnrichActeursVillesConfig, EnrichDbtModelsRefreshConfig, ) from .tasks import TASKS # noqa: F401 diff --git a/dags/enrich/config/cohorts.py b/dags/enrich/config/cohorts.py index ae4f73d4d..9181304a8 100644 --- a/dags/enrich/config/cohorts.py +++ b/dags/enrich/config/cohorts.py @@ -11,3 +11,5 @@ class COHORTS: ) CLOSED_REP_SAME_SIREN = "🚪 Acteurs Fermés: 🟢 remplacés par SIRET du même SIREN" RGPD = "Anonymisation RGPD" + VILLES_TYPO = "🌆 Changement de ville: 🟢 variation d'ortographe" + VILLES_NEW = "🌆 Changement de ville: 🟡 ancienne -> nouvelle" diff --git a/dags/enrich/config/columns.py b/dags/enrich/config/columns.py index b6cbdbc2b..1ec997172 100644 --- a/dags/enrich/config/columns.py +++ b/dags/enrich/config/columns.py @@ -13,9 +13,7 @@ class COLS: # Acteurs ACTEUR_ID: str = "acteur_id" ACTEUR_TYPE_ID: str = "acteur_type_id" - ACTEUR_TYPE_CODE: str = "acteur_type_code" ACTEUR_SOURCE_ID: str = "acteur_source_id" - ACTEUR_SOURCE_CODE: str = "acteur_source_code" ACTEUR_SIRET: str = "acteur_siret" ACTEUR_NOM: str = "acteur_nom" ACTEUR_NOMS_ORIGINE: str = "acteur_noms_origine" @@ -43,6 +41,18 @@ class COLS: SUGGEST_VILLE: str = "suggest_ville" SUGGEST_NAF: str = "suggest_naf" + # Suggestions + SUGGEST_COHORT_CODE: str = "suggestion_cohorte_code" + SUGGEST_COHORT: str = "suggest_cohort" + + # Replacements + SUGGEST_SIRET: str = "suggest_siret" + SUGGEST_NOM: str = "suggest_nom" + SUGGEST_ADRESSE: str = "suggest_adresse" + SUGGEST_CODE_POSTAL: str = "suggest_code_postal" + SUGGEST_VILLE: str = "suggest_ville" + SUGGEST_NAF: str = "suggest_naf" + # Suggestions SUGGEST_COHORT: str = f"{SUGGEST_PREFIX}_cohort" SUGGEST_SIRET: str = f"{SUGGEST_PREFIX}_siret" diff --git a/dags/enrich/config/dbt.py b/dags/enrich/config/dbt.py index bf1a6fb83..ab7f09ef1 100644 --- a/dags/enrich/config/dbt.py +++ b/dags/enrich/config/dbt.py @@ -19,4 +19,5 @@ class DBT: MARTS_ENRICH_AE_CLOSED_NOT_REPLACED: str = ( "marts_enrich_acteurs_closed_suggest_not_replaced" ) - MARTS_ENRICH_AE_RGPD: str = "marts_enrich_ae_rgpd" + MARTS_ENRICH_VILLES_TYPO: str = "marts_enrich_acteurs_villes_suggest_typo" + MARTS_ENRICH_VILLES_NEW: str = "marts_enrich_acteurs_villes_suggest_new" diff --git a/dags/enrich/config/models.py b/dags/enrich/config/models.py index db757e5b2..4dbfa7fff 100644 --- a/dags/enrich/config/models.py +++ b/dags/enrich/config/models.py @@ -100,8 +100,13 @@ class EnrichDbtModelsRefreshConfig(BaseModel): ) +class EnrichActeursVillesConfig(EnrichBaseConfig): + pass + + DAG_ID_TO_CONFIG_MODEL = { "enrich_acteurs_closed": EnrichActeursClosedConfig, "enrich_acteurs_rgpd": EnrichActeursRGPDConfig, "enrich_dbt_models_refresh": EnrichDbtModelsRefreshConfig, + "enrich_acteurs_villes": EnrichActeursVillesConfig, } diff --git a/dags/enrich/config/tasks.py b/dags/enrich/config/tasks.py index a3dc8cc45..19ce8ac84 100644 --- a/dags/enrich/config/tasks.py +++ b/dags/enrich/config/tasks.py @@ -28,3 +28,7 @@ class TASKS: ) ENRICH_DBT_MODELS_REFRESH: str = "enrich_dbt_models_refresh" READ_AE_RGPD: str = "enrich_ae_rgpd_read" + + # Villes + ENRICH_VILLES_TYPO: str = "enrich_acteurs_villes_typo" + ENRICH_VILLES_NEW: str = "enrich_acteurs_villes_new" diff --git a/dags/enrich/dags/enrich_acteurs_villes.py b/dags/enrich/dags/enrich_acteurs_villes.py new file mode 100644 index 000000000..17520d8d7 --- /dev/null +++ b/dags/enrich/dags/enrich_acteurs_villes.py @@ -0,0 +1,63 @@ +"""DAG to suggestion city corrections based on BAN data""" + +from airflow import DAG +from enrich.config import ( + COHORTS, + DBT, + TASKS, + EnrichActeursVillesConfig, +) +from enrich.tasks.airflow_logic.enrich_config_create_task import ( + enrich_config_create_task, +) +from enrich.tasks.airflow_logic.enrich_dbt_model_suggest_task import ( + enrich_dbt_model_suggest_task, +) +from enrich.tasks.airflow_logic.enrich_dbt_models_refresh_task import ( + enrich_dbt_models_refresh_task, +) +from shared.config import CATCHUPS, SCHEDULES, START_DATES, config_to_airflow_params + +with DAG( + dag_id="enrich_acteurs_villes", + dag_display_name="🌆 Enrichir - Acteurs Villes", + default_args={ + "owner": "airflow", + "depends_on_past": False, + "email_on_failure": False, + "email_on_retry": False, + "retries": 0, + }, + description=("Un DAG pour suggérer des corrections de villes"), + tags=["annuaire", "entreprises", "ae", "acteurs", "juridique"], + schedule=SCHEDULES.NONE, + catchup=CATCHUPS.AWLAYS_FALSE, + start_date=START_DATES.YESTERDAY, + params=config_to_airflow_params( + EnrichActeursVillesConfig( + dbt_models_refresh=True, + dbt_models_refresh_command=( + "dbt build --select tag:marts,tag:enrich,tag:villes" + ), + filter_equals__acteur_statut="ACTIF", + ) + ), +) as dag: + # Instantiation + config = enrich_config_create_task(dag) + dbt_refresh = enrich_dbt_models_refresh_task(dag) + suggest_typo = enrich_dbt_model_suggest_task( + dag, + task_id=TASKS.ENRICH_VILLES_TYPO, + cohort=COHORTS.VILLES_TYPO, + dbt_model_name=DBT.MARTS_ENRICH_VILLES_TYPO, + ) + suggest_new = enrich_dbt_model_suggest_task( + dag, + task_id=TASKS.ENRICH_VILLES_NEW, + cohort=COHORTS.VILLES_NEW, + dbt_model_name=DBT.MARTS_ENRICH_VILLES_NEW, + ) + config >> dbt_refresh # type: ignore + dbt_refresh >> suggest_typo # type: ignore + dbt_refresh >> suggest_new # type: ignore diff --git a/dags/enrich/dags/enrich_dbt_models_refresh.py b/dags/enrich/dags/enrich_dbt_models_refresh.py index ab1f542a0..58db0f531 100644 --- a/dags/enrich/dags/enrich_dbt_models_refresh.py +++ b/dags/enrich/dags/enrich_dbt_models_refresh.py @@ -30,10 +30,8 @@ params=config_to_airflow_params( EnrichDbtModelsRefreshConfig( dbt_models_refresh_commands=[ - "dbt build --select +int_ae_unite_legale", - "dbt build --select +int_ae_etablissement", - "dbt build --select +int_ban_adresses", - "dbt build --select int_ban_villes", + "dbt build --select +tag:intermediate,tag:ae", + "dbt build --select +tag:intermediate,tag:ban", ], ) ), diff --git a/dags/enrich/tasks/business_logic/enrich_dbt_model_to_suggestions.py b/dags/enrich/tasks/business_logic/enrich_dbt_model_to_suggestions.py index 533ebc1d0..db3dad33c 100644 --- a/dags/enrich/tasks/business_logic/enrich_dbt_model_to_suggestions.py +++ b/dags/enrich/tasks/business_logic/enrich_dbt_model_to_suggestions.py @@ -35,6 +35,30 @@ def changes_prepare( ).model_dump() +def changes_prepare_villes(row: dict) -> tuple[list[dict], dict]: + """Prepare suggestions for villes cohorts""" + from data.models.changes import ChangeActeurUpdateData + + changes = [] + model_params = { + "id": row[COLS.ACTEUR_ID], + "data": { + "ville": row[COLS.SUGGEST_VILLE], + }, + } + changes.append( + changes_prepare( + model=ChangeActeurUpdateData, + model_params=model_params, + order=1, + reason="On fait confiance à la BAN", + entity_type="acteur_displayed", + ) + ) + contexte = {} # changes are self-explanatory + return changes, contexte + + def changes_prepare_rgpd( row: dict, ) -> tuple[list[dict], dict]: @@ -184,6 +208,8 @@ def changes_prepare_closed_replaced( COHORTS.CLOSED_REP_OTHER_SIREN: changes_prepare_closed_replaced, COHORTS.CLOSED_REP_SAME_SIREN: changes_prepare_closed_replaced, COHORTS.RGPD: changes_prepare_rgpd, + COHORTS.VILLES_TYPO: changes_prepare_villes, + COHORTS.VILLES_NEW: changes_prepare_villes, } @@ -200,11 +226,17 @@ def enrich_dbt_model_to_suggestions( SuggestionStatut, ) + # TODO: once all suggestions have been migrated to pydantic, we no + # longer need SuggestionCohorte.type_action and any of the following + # identifiant_execution = cohort AND pydantic models take care of + # handling the specifics COHORTS_TO_SUGGESTION_ACTION = { COHORTS.CLOSED_NOT_REPLACED: SuggestionAction.ENRICH_ACTEURS_CLOSED, COHORTS.CLOSED_REP_OTHER_SIREN: SuggestionAction.ENRICH_ACTEURS_CLOSED, COHORTS.CLOSED_REP_SAME_SIREN: SuggestionAction.ENRICH_ACTEURS_CLOSED, COHORTS.RGPD: SuggestionAction.ENRICH_ACTEURS_RGPD, + COHORTS.VILLES_TYPO: SuggestionAction.ENRICH_ACTEURS_VILLES_TYPO, + COHORTS.VILLES_NEW: SuggestionAction.ENRICH_ACTEURS_VILLES_NEW, } # Validation @@ -223,12 +255,12 @@ def enrich_dbt_model_to_suggestions( try: changes, contexte = COHORTS_TO_PREPARE_CHANGES[cohort](row) - suggestions.append( - { - "contexte": contexte, - "suggestion": {"title": cohort, "changes": changes}, - } - ) + suggestion = { + "contexte": contexte, + "suggestion": {"title": cohort, "changes": changes}, + } + log.preview("🔢 Suggestion", suggestion) + suggestions.append(suggestion) # We tolerate some errors except Exception as e: diff --git a/dags_unit_tests/enrich/config/test_enrich_acteurs_closed_config.py b/dags_unit_tests/enrich/config/test_enrich_acteurs_closed_config.py index cb51d00b2..1015c7df2 100644 --- a/dags_unit_tests/enrich/config/test_enrich_acteurs_closed_config.py +++ b/dags_unit_tests/enrich/config/test_enrich_acteurs_closed_config.py @@ -1,5 +1,5 @@ import pytest -from enrich.config.models import EnrichActeursClosedConfig +from dags.enrich.config.models import EnrichActeursClosedConfig class TestEnrichClosedConfig: diff --git a/dags_unit_tests/enrich/tasks/test_enrich_suggestions_cities.py b/dags_unit_tests/enrich/tasks/test_enrich_suggestions_cities.py new file mode 100644 index 000000000..fe7df7dd9 --- /dev/null +++ b/dags_unit_tests/enrich/tasks/test_enrich_suggestions_cities.py @@ -0,0 +1,100 @@ +import pandas as pd +import pytest +from enrich.config import COHORTS, COLS +from enrich.tasks.business_logic.enrich_dbt_model_to_suggestions import ( + enrich_dbt_model_to_suggestions, +) + + +@pytest.mark.django_db +class TestEnrichSuggestionsCities: + + @pytest.fixture + def df_new(self): + return pd.DataFrame( + { + COLS.SUGGEST_COHORT: [COHORTS.VILLES_NEW] * 2, + COLS.SUGGEST_VILLE: ["new town 1", "new town 2"], + COLS.ACTEUR_ID: ["new1", "new2"], + COLS.ACTEUR_VILLE: ["old town 1", "old town 2"], + } + ) + + @pytest.fixture + def df_typo(self): + return pd.DataFrame( + { + COLS.SUGGEST_COHORT: [COHORTS.VILLES_TYPO] * 2, + COLS.SUGGEST_VILLE: ["Paris", "Laval"], + COLS.ACTEUR_ID: ["typo1", "typo2"], + COLS.ACTEUR_VILLE: ["Pâris", "Lâval"], + } + ) + + @pytest.fixture + def acteurs(self, df_new, df_typo): + # Creating acteurs as presence required to apply changes + from unit_tests.qfdmo.acteur_factory import ActeurFactory + + for _, row in pd.concat([df_new, df_typo]).iterrows(): + ActeurFactory( + identifiant_unique=row[COLS.ACTEUR_ID], + ville=row[COLS.ACTEUR_VILLE], + ) + + def test_cohort_new(self, acteurs, df_new): + from data.models.suggestion import Suggestion, SuggestionCohorte + from qfdmo.models import RevisionActeur + + # Write suggestions to DB + enrich_dbt_model_to_suggestions( + df=df_new, + cohort=COHORTS.VILLES_NEW, + identifiant_action="test_new", + dry_run=False, + ) + + # Check suggestions have been written to DB + cohort = SuggestionCohorte.objects.get(identifiant_action="test_new") + suggestions = Suggestion.objects.filter(suggestion_cohorte=cohort) + assert len(suggestions) == 2 + + # Apply suggestions + for suggestion in suggestions: + suggestion.apply() + + # Verify changes + # 2 revisions should be created but not parent + new1 = RevisionActeur.objects.get(pk="new1") + assert new1.ville == "new town 1" + + new2 = RevisionActeur.objects.get(pk="new2") + assert new2.ville == "new town 2" + + def test_cohort_typo(self, acteurs, df_typo): + from data.models.suggestion import Suggestion, SuggestionCohorte + from qfdmo.models import RevisionActeur + + # Write suggestions to DB + enrich_dbt_model_to_suggestions( + df=df_typo, + cohort=COHORTS.VILLES_TYPO, + identifiant_action="test_typo", + dry_run=False, + ) + + # Check suggestions have been written to DB + cohort = SuggestionCohorte.objects.get(identifiant_action="test_typo") + suggestions = Suggestion.objects.filter(suggestion_cohorte=cohort) + assert len(suggestions) == 2 + + # Apply suggestions + for suggestion in suggestions: + suggestion.apply() + + # Verify changes + typo1 = RevisionActeur.objects.get(pk="typo1") + assert typo1.ville == "Paris" + + typo2 = RevisionActeur.objects.get(pk="typo2") + assert typo2.ville == "Laval" diff --git a/data/migrations/0013_alter_suggestioncohorte_type_action.py b/data/migrations/0013_alter_suggestioncohorte_type_action.py new file mode 100644 index 000000000..9701ed417 --- /dev/null +++ b/data/migrations/0013_alter_suggestioncohorte_type_action.py @@ -0,0 +1,35 @@ +# Generated by Django 5.1.6 on 2025-04-28 05:22 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("data", "0012_alter_suggestioncohorte_type_action"), + ] + + operations = [ + migrations.AlterField( + model_name="suggestioncohorte", + name="type_action", + field=models.CharField( + blank=True, + choices=[ + ("CRAWL_URLS", "🔗 URLs scannées"), + ("ENRICH_ACTEURS_CLOSED", "🚪 Acteurs fermés"), + ("ENRICH_ACTEURS_RGPD", "🕵 Anonymisation RGPD"), + ("ENRICH_ACTEURS_VILLES_TYPO", "🏙️ Acteurs villes typographiques"), + ("ENRICH_ACTEURS_VILLES_NEW", "🏙️ Acteurs villes nouvelles"), + ("CLUSTERING", "regroupement/déduplication des acteurs"), + ("SOURCE_AJOUT", "ingestion de source de données - nouveau acteur"), + ( + "SOURCE_MODIFICATION", + "ingestion de source de données - modification d'acteur existant", + ), + ("SOURCE_SUPRESSION", "ingestion de source de données"), + ], + max_length=50, + ), + ), + ] diff --git a/data/models/changes/__init__.py b/data/models/changes/__init__.py index ab172c0c0..6d1b5e002 100644 --- a/data/models/changes/__init__.py +++ b/data/models/changes/__init__.py @@ -10,7 +10,6 @@ from .sample_model_do_nothing import SampleModelDoNothing CHANGE_MODELS = { - ChangeActeurRgpdAnonymize.name(): ChangeActeurRgpdAnonymize, ChangeActeurUpdateData.name(): ChangeActeurUpdateData, ChangeActeurCreateAsChild.name(): ChangeActeurCreateAsChild, ChangeActeurCreateAsParent.name(): ChangeActeurCreateAsParent, @@ -20,4 +19,5 @@ ChangeActeurNothingBase.name(): ChangeActeurNothingBase, ChangeActeurKeepAsParent.name(): ChangeActeurKeepAsParent, SampleModelDoNothing.name(): SampleModelDoNothing, + ChangeActeurRgpdAnonymize.name(): ChangeActeurRgpdAnonymize, } diff --git a/data/models/suggestion.py b/data/models/suggestion.py index b71955c73..01225b812 100644 --- a/data/models/suggestion.py +++ b/data/models/suggestion.py @@ -56,6 +56,14 @@ class SuggestionAction(models.TextChoices): CRAWL_URLS = SUGGESTION_CRAWL_URLS, "🔗 URLs scannées" ENRICH_ACTEURS_CLOSED = "ENRICH_ACTEURS_CLOSED", "🚪 Acteurs fermés" ENRICH_ACTEURS_RGPD = "ENRICH_ACTEURS_RGPD", "🕵 Anonymisation RGPD" + ENRICH_ACTEURS_VILLES_TYPO = ( + "ENRICH_ACTEURS_VILLES_TYPO", + "🏙️ Acteurs villes typographiques", + ) + ENRICH_ACTEURS_VILLES_NEW = ( + "ENRICH_ACTEURS_VILLES_NEW", + "🏙️ Acteurs villes nouvelles", + ) CLUSTERING = SUGGESTION_CLUSTERING, "regroupement/déduplication des acteurs" SOURCE_AJOUT = ( SUGGESTION_SOURCE_AJOUT, @@ -181,6 +189,8 @@ def display_contexte_details(self): SuggestionAction.CLUSTERING, SuggestionAction.CRAWL_URLS, SuggestionAction.ENRICH_ACTEURS_RGPD, + SuggestionAction.ENRICH_ACTEURS_VILLES_TYPO, + SuggestionAction.ENRICH_ACTEURS_VILLES_NEW, ]: context["details_open"] = True @@ -202,6 +212,8 @@ def display_suggestion_details(self): elif self.suggestion_cohorte.type_action in [ SuggestionAction.ENRICH_ACTEURS_CLOSED, SuggestionAction.ENRICH_ACTEURS_RGPD, + SuggestionAction.ENRICH_ACTEURS_VILLES_TYPO, + SuggestionAction.ENRICH_ACTEURS_VILLES_NEW, ]: template_name = "data/_partials/suggestion_details_changes.html" template_context = self.suggestion @@ -328,6 +340,8 @@ def apply(self): SuggestionAction.CRAWL_URLS, SuggestionAction.ENRICH_ACTEURS_CLOSED, SuggestionAction.ENRICH_ACTEURS_RGPD, + SuggestionAction.ENRICH_ACTEURS_VILLES_TYPO, + SuggestionAction.ENRICH_ACTEURS_VILLES_NEW, ]: changes = self.suggestion["changes"] changes.sort(key=lambda x: x["order"]) diff --git a/dbt/models/base/ae_annuaire_entreprises/base_ae_etablissement.sql b/dbt/models/base/ae_annuaire_entreprises/base_ae_etablissement.sql index 2cb1da23f..fc9294b45 100644 --- a/dbt/models/base/ae_annuaire_entreprises/base_ae_etablissement.sql +++ b/dbt/models/base/ae_annuaire_entreprises/base_ae_etablissement.sql @@ -47,4 +47,4 @@ in acteurs + a little more if we can suggestion models to have more data */ ORDER BY siret DESC LIMIT 1000000 -{% endif %} +{% endif %} \ No newline at end of file diff --git a/dbt/models/base/ban/base_ban_adresses.sql b/dbt/models/base/ban/base_ban_adresses.sql new file mode 100644 index 000000000..f278c4f94 --- /dev/null +++ b/dbt/models/base/ban/base_ban_adresses.sql @@ -0,0 +1,27 @@ +{{ + config( + materialized = 'view', + tags=['base', 'ban', 'adresses'], + ) +}} +-- Large source: only reading what's needed +SELECT + /* Creating complete adresse to do lookups + and compare vs. ours rep = ex: "bis" */ + udf_columns_concat_unique_non_empty(numero,rep,nom_voie) AS adresse, + /* Also keeping separate column for numero + as it's a common suggestion filter */ + numero AS adresse_numero, + nom_commune AS ville, + /* We only keep ville_ancienne if it's different from current ville */ + CASE + WHEN nom_ancienne_commune = nom_commune THEN NULL + ELSE nom_ancienne_commune + END AS ville_ancienne, + code_postal, + LEFT(code_postal, 2) AS code_departement, + lat as latitude, + lon as longitude +FROM {{ source('ban', 'clone_ban_adresses_in_use') }} +WHERE code_postal IS NOT NULL AND code_postal != '' +ORDER BY code_postal ASC \ No newline at end of file diff --git a/dbt/models/base/ban/base_ban_lieux_dits.sql b/dbt/models/base/ban/base_ban_lieux_dits.sql new file mode 100644 index 000000000..0be29e959 --- /dev/null +++ b/dbt/models/base/ban/base_ban_lieux_dits.sql @@ -0,0 +1,8 @@ +{{ + config( + materialized = 'view', + tags=['base', 'ban', 'lieux_dits'], + ) +}} + +SELECT * FROM {{ source('ban', 'clone_ban_lieux_dits_in_use') }} \ No newline at end of file diff --git a/dbt/models/intermediate/ban/int_ban_adresses.sql b/dbt/models/intermediate/ban/int_ban_adresses.sql new file mode 100644 index 000000000..ba11403ad --- /dev/null +++ b/dbt/models/intermediate/ban/int_ban_adresses.sql @@ -0,0 +1,22 @@ +/* +post_hook = partial indexes on high-cardinality columns only for NOT NULL +so we can still speed up the JOINS/FILTERS +*/ +{{ + config( + materialized = 'table', + tags=['intermediate', 'ban', 'adresses'], + indexes=[ + {'columns': ['code_postal']}, + {'columns': ['code_departement']}, + ], + post_hook=[ + "CREATE INDEX ON {{ this }}(ville_ancienne) WHERE ville_ancienne IS NOT NULL", + "CREATE INDEX ON {{ this }}(adresse_numero) WHERE adresse_numero IS NOT NULL", + ] + ) +}} + + +SELECT * +FROM {{ ref('base_ban_adresses') }} \ No newline at end of file diff --git a/dbt/models/intermediate/ban/int_ban_villes.sql b/dbt/models/intermediate/ban/int_ban_villes.sql new file mode 100644 index 000000000..8446b910d --- /dev/null +++ b/dbt/models/intermediate/ban/int_ban_villes.sql @@ -0,0 +1,28 @@ +/* +post_hook = partial indexes on high-cardinality columns only for NOT NULL +so we can still speed up the JOINS/FILTERS +*/ +{{ + config( + materialized = 'table', + tags=['intermediate', 'ban', 'villes'], + indexes=[ + {'columns': ['ville_ancienne']}, + {'columns': ['ville']}, + {'columns': ['code_postal']}, + {'columns': ['code_departement']}, + ], + post_hook=[ + "CREATE INDEX ON {{ this }}(ville_ancienne) WHERE ville_ancienne IS NOT NULL", + ] + ) +}} + + +SELECT + ville_ancienne, + ville, + code_postal, + code_departement +FROM {{ ref('base_ban_adresses') }} +GROUP BY 1,2,3,4 \ No newline at end of file diff --git a/dbt/models/marts/enrich/marts_enrich_acteurs_closed_replaced.sql b/dbt/models/marts/enrich/marts_enrich_acteurs_closed_replaced.sql index 4ce20c003..94c964759 100644 --- a/dbt/models/marts/enrich/marts_enrich_acteurs_closed_replaced.sql +++ b/dbt/models/marts/enrich/marts_enrich_acteurs_closed_replaced.sql @@ -67,4 +67,4 @@ WITH potential_replacements AS ( SELECT * FROM potential_replacements WHERE replacement_priority=1 /* We don't want to propose suggests with unavailable names */ -AND suggest_nom != {{ value_unavailable() }} +AND suggest_nom != {{ value_unavailable() }} \ No newline at end of file diff --git a/dbt/models/marts/enrich/marts_enrich_acteurs_villes_candidates.sql b/dbt/models/marts/enrich/marts_enrich_acteurs_villes_candidates.sql new file mode 100644 index 000000000..55ca81441 --- /dev/null +++ b/dbt/models/marts/enrich/marts_enrich_acteurs_villes_candidates.sql @@ -0,0 +1,29 @@ +{{ + config( + materialized = 'table', + tags=['marts', 'enrich', 'villes', 'cities', 'ban'], + ) +}} + +SELECT + acteurs.identifiant_unique AS acteur_id, + acteurs.ville AS acteur_ville, + acteurs.code_postal AS acteur_code_postal, + ban.ville_ancienne AS ban_ville_ancienne, + ban.ville AS ban_ville, + ban.code_postal AS ban_code_postal, + ban.ville AS suggest_ville +FROM {{ ref('marts_carte_acteur') }} AS acteurs +JOIN {{ ref('int_ban_villes') }} AS ban ON ban.code_postal = acteurs.code_postal +WHERE acteurs.statut = 'ACTIF' +AND acteurs.code_postal IS NOT NULL and acteurs.code_postal != '' and LENGTH(acteurs.code_postal) = 5 +/* Only suggest if 1 difference */ +AND ( + acteurs.ville != ban.ville_ancienne + OR acteurs.ville != ban.ville +) +/* BUT also a match somewhere */ +AND ( + udf_normalize_string_for_match(acteurs.ville,3) = udf_normalize_string_for_match(ban.ville_ancienne,3) + OR udf_normalize_string_for_match(acteurs.ville,3) = udf_normalize_string_for_match(ban.ville,3) +) diff --git a/dbt/models/marts/enrich/marts_enrich_acteurs_villes_suggest_new.sql b/dbt/models/marts/enrich/marts_enrich_acteurs_villes_suggest_new.sql new file mode 100644 index 000000000..3e6e3c0ea --- /dev/null +++ b/dbt/models/marts/enrich/marts_enrich_acteurs_villes_suggest_new.sql @@ -0,0 +1,14 @@ +{{ + config( + materialized = 'view', + alias = 'marts_enrich_acteurs_villes_suggest_new', + tags=['marts', 'enrich', 'villes', 'cities', 'ban', 'acteurs', 'nouvelle', 'new'], + ) +}} + +SELECT + '🌆 Changement de ville: 🟡 ancienne -> nouvelle' AS suggest_cohort, + * +FROM {{ ref('marts_enrich_acteurs_villes_candidates') }} +WHERE udf_normalize_string_for_match(acteur_ville,3) != udf_normalize_string_for_match(suggest_ville,3) +AND ban_ville_ancienne IS NOT NULL \ No newline at end of file diff --git a/dbt/models/marts/enrich/marts_enrich_acteurs_villes_suggest_typo.sql b/dbt/models/marts/enrich/marts_enrich_acteurs_villes_suggest_typo.sql new file mode 100644 index 000000000..2896b99b1 --- /dev/null +++ b/dbt/models/marts/enrich/marts_enrich_acteurs_villes_suggest_typo.sql @@ -0,0 +1,14 @@ +{{ + config( + materialized = 'view', + alias = 'marts_enrich_acteurs_villes_suggest_typo', + tags=['marts', 'enrich', 'villes', 'cities', 'ban','acteurs','typo','ortographe'], + ) +}} + +SELECT + '🌆 Changement de ville: 🟢 variation d''ortographe' AS suggest_cohort, + * +FROM {{ ref('marts_enrich_acteurs_villes_candidates') }} +WHERE udf_normalize_string_for_match(acteur_ville,3) = udf_normalize_string_for_match(suggest_ville,3) +AND ban_ville_ancienne IS NULL diff --git a/dbt/models/source/source_ban_base_adresse_nationale.yml b/dbt/models/source/source_ban_base_adresse_nationale.yml new file mode 100644 index 000000000..f94198026 --- /dev/null +++ b/dbt/models/source/source_ban_base_adresse_nationale.yml @@ -0,0 +1,9 @@ +version: 2 + +sources: + - name: ban + description: "Base Adresse Nationale (BAN)" + schema: public + tables: + - name: clone_ban_adresses_in_use + - name: clone_ban_lieux_dits_in_use