Dédupliquer les lignes qui ont plusieurs EO et préfixer le source_code par « ocab » (#1610)

kolok · fabienheureux · web-flow · commit 8d133312dcb9 · 2025-05-19T06:38:03.000Z
Co-authored-by: Fabien Le Frapper <contact@fabienlefrapper.me>
diff --git a/dags/sources/dags/source_ocab.py b/dags/sources/dags/source_ocab.py
@@ -24,8 +24,8 @@
                 "destination": "nom",
             },
             {
-                "origin": "enseigne_commerciale",
-                "destination": "nom_commercial",
+                "origin": "consignes_dacces",
+                "destination": "description",
             },
             {
                 "origin": "longitudewgs84",
@@ -56,6 +56,11 @@
                 "transformation": "clean_sous_categorie_codes",
                 "destination": "sous_categorie_codes",
             },
+            {
+                "origin": "horaires_douverture",
+                "transformation": "clean_horaires_osm",
+                "destination": "horaires_osm",
+            },
             # 3. Ajout des colonnes avec une valeur par défaut
             {
                 "column": "statut",
@@ -94,10 +99,15 @@
                 "transformation": "clean_adresse",
                 "destination": ["adresse", "code_postal", "ville"],
             },
+            {
+                "origin": ["telephone", "code_postal"],
+                "transformation": "clean_telephone",
+                "destination": ["telephone"],
+            },
             {
                 "origin": [
-                    # "point_dapport_de_service_reparation",
-                    # "point_de_reparation",
+                    "point_dapport_de_service_reparation",
+                    "point_de_reparation",
                     "point_dapport_pour_reemploi",
                     "point_de_collecte_ou_de_reprise_des_dechets",
                 ],
@@ -106,8 +116,8 @@
             },
             {
                 "origin": [
-                    # "point_dapport_de_service_reparation",
-                    # "point_de_reparation",
+                    "point_dapport_de_service_reparation",
+                    "point_de_reparation",
                     "point_dapport_pour_reemploi",
                     "point_de_collecte_ou_de_reprise_des_dechets",
                 ],
@@ -131,12 +141,18 @@
             {"remove": "id_point_apport_ou_reparation"},
             {"remove": "point_de_collecte_ou_de_reprise_des_dechets"},
             {"remove": "point_dapport_pour_reemploi"},
+            {"remove": "point_de_reparation"},
+            {"remove": "point_dapport_de_service_reparation"},
             # 6. Colonnes à garder (rien à faire, utilisé pour le controle)
         ],
         "endpoint": (
             "https://data.pointsapport.ademe.fr/data-fair/api/v1/datasets/"
             "donnees-eo-ocab/lines?size=10000"
         ),
+        "oca": {
+            "prefix": "ocab",
+            "deduplication_source": True,
+        },
         "validate_address_with_ban": False,
         "product_mapping": get_mapping_config(),
     },
diff --git a/dags/sources/tasks/airflow_logic/config_management.py b/dags/sources/tasks/airflow_logic/config_management.py
@@ -43,6 +43,11 @@ class NormalizationColumnKeep(BaseModel):
     keep: str
 
 
+class OCAConfig(BaseModel):
+    prefix: str | None = None
+    deduplication_source: bool = False
+
+
 class DAGConfig(BaseModel):
     normalization_rules: list[
         Union[
@@ -62,6 +67,7 @@ class DAGConfig(BaseModel):
     product_mapping: dict
     source_code: Optional[str] = None
     validate_address_with_ban: bool = False
+    oca: OCAConfig | None = None
 
     @field_validator("endpoint")
     def validate_endpoint(cls, endpoint):
@@ -112,6 +118,22 @@ def get_expected_columns(self) -> set[str]:
         columns -= set(removed_columns)
         return columns
 
+    @property
+    def oca_deduplication_source(self) -> bool:
+        if self.oca is None:
+            return False
+        return bool(self.oca.deduplication_source)
+
+    @property
+    def is_oca(self) -> bool:
+        return bool(self.oca)
+
+    @property
+    def oca_prefix(self) -> str | None:
+        if self.oca is None:
+            return None
+        return self.oca.prefix
+
 
 # DEPRECATED
 def get_nested_config_parameter(
diff --git a/dags/sources/tasks/business_logic/source_data_normalize.py b/dags/sources/tasks/business_logic/source_data_normalize.py
@@ -205,6 +205,25 @@ def _display_warning_about_missing_location(df: pd.DataFrame) -> None:
             log.preview("Acteurs sans localisation", df_acteur_sans_loc)
 
 
+def _manage_oca_config(df: pd.DataFrame, dag_config: DAGConfig) -> pd.DataFrame:
+    if dag_config.oca_deduplication_source:
+        df = df.assign(source_code=df["source_code"].str.split("|")).explode(
+            "source_code"
+        )
+    if oca_prefix := dag_config.oca_prefix:
+        df["source_code"] = df["source_code"].apply(
+            lambda x: oca_prefix + "_" + x.strip().lower()
+        )
+    # Recalcul de l'identifiant unique
+    normalisation_function = get_transformation_function(
+        "clean_identifiant_unique", dag_config
+    )
+    df[["identifiant_unique"]] = df[["identifiant_externe", "source_code"]].apply(
+        normalisation_function, axis=1
+    )
+    return df
+
+
 def source_data_normalize(
     df_acteur_from_source: pd.DataFrame,
     dag_config: DAGConfig,
@@ -239,6 +258,10 @@ def source_data_normalize(
     # Merge and delete undesired lines
     df, metadata = _remove_undesired_lines(df, dag_config)
 
+    # OCA config: split/prefix source_code, then recompute identifiant_unique
+    if dag_config.is_oca:
+        df = _manage_oca_config(df, dag_config)
+
     # Check that the dataframe has the expected columns
     expected_columns = dag_config.get_expected_columns()
 
diff --git a/dags_unit_tests/sources/tasks/business_logic/test_source_data_normalize.py b/dags_unit_tests/sources/tasks/business_logic/test_source_data_normalize.py
@@ -215,7 +215,7 @@ def test_remove_explicit_null(self, null_value):
                 "product_mapping": {"product1": "code1"},
             }
         )
-        df, metadata = source_data_normalize(
+        df, _ = source_data_normalize(
             dag_config=dag_config,
             df_acteur_from_source=pd.DataFrame(
                 {
@@ -238,6 +238,115 @@ def test_remove_explicit_null(self, null_value):
         )
 
 
+class TestDfApplyOCA:
+    """
+    Tests of source_data_normalize when an `oca` configuration is set
+    """
+
+    @pytest.fixture
+    def dag_config_kwargs(self):
+        return {
+            "normalization_rules": [
+                {"keep": "identifiant_unique"},
+                {"keep": "nom"},
+                {"keep": "source_code"},
+                {"keep": "identifiant_externe"},
+            ],
+            "product_mapping": {},
+            "endpoint": "https://example.com/api",
+        }
+
+    @pytest.fixture
+    def df_acteur(self):
+        return pd.DataFrame(
+            {
+                "identifiant_unique": ["id1", "id2"],
+                "source_code": ["oca_1|oca_2", "oca_2"],
+                "identifiant_externe": ["ext1", "ext2"],
+                "nom": ["nom1", "nom2"],
+            }
+        )
+
+    def test_apply_oca_config(self, dag_config_kwargs, df_acteur):
+        dag_config_kwargs["oca"] = {"prefix": "ocatest", "deduplication_source": True}
+
+        df, _ = source_data_normalize(
+            df_acteur_from_source=df_acteur,
+            dag_config=DAGConfig.model_validate(dag_config_kwargs),
+            dag_id="dag_id",
+        )
+
+        expected_df = pd.DataFrame(
+            {
+                "identifiant_unique": [
+                    "ocatest_oca_1_ext1",
+                    "ocatest_oca_2_ext1",
+                    "ocatest_oca_2_ext2",
+                ],
+                "source_code": ["ocatest_oca_1", "ocatest_oca_2", "ocatest_oca_2"],
+                "identifiant_externe": ["ext1", "ext1", "ext2"],
+                "nom": ["nom1", "nom1", "nom2"],
+            }
+        )
+
+        pd.testing.assert_frame_equal(
+            df.reset_index(drop=True), expected_df.reset_index(drop=True)
+        )
+
+    def test_apply_oca_config_no_deduplication_source(
+        self, dag_config_kwargs, df_acteur
+    ):
+        dag_config_kwargs["oca"] = {"prefix": "ocatest"}
+
+        df, _ = source_data_normalize(
+            df_acteur_from_source=df_acteur,
+            dag_config=DAGConfig.model_validate(dag_config_kwargs),
+            dag_id="dag_id",
+        )
+
+        expected_df = pd.DataFrame(
+            {
+                "identifiant_unique": [
+                    "ocatest_oca_1|oca_2_ext1",
+                    "ocatest_oca_2_ext2",
+                ],
+                "source_code": ["ocatest_oca_1|oca_2", "ocatest_oca_2"],
+                "identifiant_externe": ["ext1", "ext2"],
+                "nom": ["nom1", "nom2"],
+            }
+        )
+
+        pd.testing.assert_frame_equal(
+            df.reset_index(drop=True), expected_df.reset_index(drop=True)
+        )
+
+    def test_apply_oca_config_no_prefix(self, dag_config_kwargs, df_acteur):
+        dag_config_kwargs["oca"] = {"deduplication_source": True}
+
+        df, _ = source_data_normalize(
+            df_acteur_from_source=df_acteur,
+            dag_config=DAGConfig.model_validate(dag_config_kwargs),
+            dag_id="dag_id",
+        )
+
+        expected_df = pd.DataFrame(
+            {
+                "identifiant_unique": [
+                    "oca_1_ext1",
+                    "oca_2_ext1",
+                    "oca_2_ext2",
+                ],
+                "source_code": ["oca_1", "oca_2", "oca_2"],
+                "identifiant_externe": ["ext1", "ext1", "ext2"],
+                "nom": ["nom1", "nom1", "nom2"],
+            }
+        )
+
+        pd.testing.assert_frame_equal(
+            df.reset_index(drop=True), expected_df.reset_index(drop=True)
+        )
+
+
 class TestDfNormalizePharmacie:
     """
     Test de la fonction df_normalize_pharmacie