Commit 93cbfee

Test fix for filesystem iceberg
1 parent 563ffa9 commit 93cbfee

File tree

3 files changed: +32, -20 lines changed

docs/website/docs/general-usage/merge-loading.md

Lines changed: 1 addition & 1 deletion
@@ -554,7 +554,7 @@ The `upsert` merge strategy is currently supported for these destinations:
 - `mssql`
 - `postgres`
 - `snowflake`
-- `filesystem` with `delta` table format (see limitations [here](../dlt-ecosystem/destinations/delta-iceberg#known-limitations))
+- `filesystem` with `delta` table format (see limitations [here](../dlt-ecosystem/destinations/delta-iceberg#known-limitations)) and `iceberg` table format
 :::

 The `upsert` merge strategy does primary-key based *upserts*:
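
As a quick illustration of the documented behavior, here is a minimal sketch (not part of this commit) of an upsert merge into the `filesystem` destination with the `iceberg` table format; the resource name, sample rows, and local bucket URL are illustrative assumptions:

```py
import dlt

# Hypothetical resource: primary-key based upsert into an iceberg table.
# The bucket URL and sample data are assumptions for illustration only.
@dlt.resource(
    primary_key="id",
    write_disposition={"disposition": "merge", "strategy": "upsert"},
    table_format="iceberg",
)
def issues():
    yield [{"id": 1, "state": "open"}, {"id": 2, "state": "closed"}]

pipeline = dlt.pipeline(
    pipeline_name="iceberg_upsert_demo",
    destination=dlt.destinations.filesystem("file:///tmp/iceberg_bucket"),
    dataset_name="github",
)
info = pipeline.run(issues())
print(info)
```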

tests/load/pipeline/test_merge_disposition.py

Lines changed: 22 additions & 9 deletions
@@ -820,6 +820,9 @@ def test_merge_no_merge_keys(destination_config: DestinationTestConfiguration) -> None:
     ids=lambda x: x.name,
 )
 def test_pipeline_load_parquet(destination_config: DestinationTestConfiguration) -> None:
+    import os
+
+    os.environ["LOAD__WORKERS"] = "1"
     p = destination_config.setup_pipeline("github_3", dev_mode=True)
     # do not save state to destination so jobs counting is easier
     p.config.restore_from_destination = False
@@ -828,11 +831,22 @@ def test_pipeline_load_parquet(destination_config: DestinationTestConfiguration) -> None:
     github_data.max_table_nesting = 2
     github_data_copy = github()
     github_data_copy.max_table_nesting = 2
-    info = p.run(
-        [github_data, github_data_copy],
-        write_disposition="merge",
-        **destination_config.run_kwargs,
-    )
+    # iceberg filesystem requires input data without duplicates
+    if (
+        destination_config.table_format == "iceberg"
+        and destination_config.destination_type == "filesystem"
+    ):
+        info = p.run(
+            github_data,
+            write_disposition="merge",
+            **destination_config.run_kwargs,
+        )
+    else:
+        info = p.run(
+            [github_data, github_data_copy],
+            write_disposition="merge",
+            **destination_config.run_kwargs,
+        )
     assert_load_info(info)
     # make sure it was parquet or sql transforms
     expected_formats = ["parquet"]
@@ -844,10 +858,9 @@ def test_pipeline_load_parquet(destination_config: DestinationTestConfiguration) -> None:

     github_1_counts = load_table_counts(p, *[t["name"] for t in p.default_schema.data_tables()])
     expected_rows = 100
-    # if table_format is set we use upsert which does not deduplicate input data
-    if not destination_config.supports_merge or (
-        destination_config.table_format and destination_config.destination_type != "athena"
-    ):
+    # if table_format is set to delta we use upsert which does not deduplicate input data
+    # otherwise the data is either deduplicated or it's iceberg filesystem for which we didn't pass duplicates at all
+    if destination_config.table_format == "delta":
         expected_rows *= 2
     assert github_1_counts["issues"] == expected_rows
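
The branch added above for the iceberg filesystem path reflects that its upsert expects no duplicate primary keys in the input. A minimal sketch (an assumption, not part of the commit) of pre-deduplicating rows by primary key before such a merge run:

```py
# Hypothetical helper: keep only the last row seen per primary key so the
# merge input contains each key at most once.
def dedupe_by_key(rows, key="id"):
    latest = {}
    for row in rows:
        latest[row[key]] = row
    return list(latest.values())

raw = [
    {"id": 1, "state": "open"},
    {"id": 1, "state": "closed"},  # duplicate key: the later row wins
    {"id": 2, "state": "open"},
]
assert dedupe_by_key(raw) == [
    {"id": 1, "state": "closed"},
    {"id": 2, "state": "open"},
]
```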

tests/load/pipeline/test_open_table_pipeline.py

Lines changed: 9 additions & 10 deletions
@@ -375,16 +375,15 @@ def nested_table():
     assert len(rows_dict["nested_table__child"]) == 3
     assert len(rows_dict["nested_table__child__grandchild"]) == 5

-    if destination_config.supports_merge:
-        # now drop children and grandchildren, use merge write disposition to create and pass full table chain
-        # also for tables that do not have jobs
-        info = pipeline.run(
-            [{"foo": 3}] * 10000,
-            table_name="nested_table",
-            primary_key="foo",
-            write_disposition="merge",
-        )
-        assert_load_info(info)
+    # now drop children and grandchildren, use merge write disposition to create and pass full table chain
+    # also for tables that do not have jobs
+    info = pipeline.run(
+        [{"foo": i} for i in range(3, 10003)],
+        table_name="nested_table",
+        primary_key="foo",
+        write_disposition="merge",
+    )
+    assert_load_info(info)


 @pytest.mark.parametrize(
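
The switch from `[{"foo": 3}] * 10000` to a range-based comprehension gives every row a distinct primary key, so the merge input carries no duplicates (the requirement noted for the iceberg filesystem path in the previous test). A tiny check of that difference (illustrative only, not from the commit):

```py
repeated = [{"foo": 3}] * 10000                    # one key repeated 10000 times
distinct = [{"foo": i} for i in range(3, 10003)]   # 10000 distinct keys

assert len({row["foo"] for row in repeated}) == 1
assert len({row["foo"] for row in distinct}) == 10000
```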
