
Commit c17d230

Test fix for filesystem iceberg
1 parent f07f995 commit c17d230

File tree

7 files changed: +93 additions, -137 deletions

dlt/common/libs/pyiceberg.py
docs/website/docs/dlt-ecosystem/destinations/iceberg.md
docs/website/docs/general-usage/merge-loading.md
poetry.lock
pyproject.toml
tests/load/pipeline/test_merge_disposition.py
tests/load/pipeline/test_open_table_pipeline.py

dlt/common/libs/pyiceberg.py

Lines changed: 16 additions & 0 deletions
@@ -26,6 +26,7 @@
     from pyiceberg.catalog import Catalog as IcebergCatalog
     from pyiceberg.exceptions import NoSuchTableError
     import pyarrow as pa
+    import pyiceberg.io.pyarrow as _pio
 except ModuleNotFoundError:
     raise MissingDependencyException(
         "dlt pyiceberg helpers",
@@ -34,6 +35,20 @@
     )
 
 
+# TODO: remove with pyiceberg's release after 0.9.1
+_orig_get_kwargs = _pio._get_parquet_writer_kwargs
+
+
+def _patched_get_parquet_writer_kwargs(table_properties):  # type: ignore[no-untyped-def]
+    """Return the original kwargs **plus** store_decimal_as_integer=True."""
+    kwargs = _orig_get_kwargs(table_properties)
+    kwargs.setdefault("store_decimal_as_integer", True)
+    return kwargs
+
+
+_pio._get_parquet_writer_kwargs = _patched_get_parquet_writer_kwargs
+
+
 def ensure_iceberg_compatible_arrow_schema(schema: pa.Schema) -> pa.Schema:
     ARROW_TO_ICEBERG_COMPATIBLE_ARROW_TYPE_MAP = {
         pa.types.is_time32: pa.time64("us"),
@@ -82,6 +97,7 @@ def merge_iceberg_table(
     else:
         join_cols = get_columns_names_with_prop(schema, "primary_key")
 
+    # TODO: replace the batching method with transaction with pyiceberg's release after 0.9.1
     for rb in data.to_batches(max_chunksize=1_000):
         batch_tbl = pa.Table.from_batches([rb])
         batch_tbl = ensure_iceberg_compatible_arrow_data(batch_tbl)
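
The monkey-patch above forces `store_decimal_as_integer=True` into pyiceberg's parquet writer kwargs. As orientation, here is a minimal sketch of what that flag changes at the pyarrow level, assuming pyarrow >= 16 (where `pyarrow.parquet.write_table` accepts the option); the file name is illustrative:

import decimal
import pyarrow as pa
import pyarrow.parquet as pq

tbl = pa.table({"amount": pa.array([decimal.Decimal("1.23")], type=pa.decimal128(10, 2))})

# By default decimals are written as FIXED_LEN_BYTE_ARRAY; with the flag set (as the
# patched kwargs do), decimals with precision <= 18 get an integer physical type.
pq.write_table(tbl, "amount.parquet", store_decimal_as_integer=True)
print(pq.ParquetFile("amount.parquet").schema)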

docs/website/docs/dlt-ecosystem/destinations/iceberg.md

Lines changed: 5 additions & 1 deletion
@@ -120,7 +120,11 @@ The [S3-compatible](./filesystem.md#using-s3-compatible-storage) interface for G
 The `az` [scheme](./filesystem.md#supported-schemes) is not supported when using the `iceberg` table format. Please use the `abfss` scheme. This is because `pyiceberg`, which dlt used under the hood, currently does not support `az`.
 
 ## Table format `merge` support
-The [`upsert`](../../general-usage/merge-loading.md#upsert-strategy) merge strategy is supported for `iceberg`. This strategy requires that the input data contains no duplicate rows based on the key columns, and that the target table also does not contain duplicates on those keys.
+The [`upsert`](../../general-usage/merge-loading.md#upsert-strategy) merge strategy is supported for `iceberg`. This strategy requires that the input data contains no duplicate rows based on the key columns, and that the target table also does not contain duplicates on those keys.
+
+:::warning
+Until _pyiceberg_ > 0.9.1 is released, upsert is executed in chunks of **1000** rows.
+:::
 
 ```py
 @dlt.resource(
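
The 1000-row chunking mentioned in the warning comes from `merge_iceberg_table` splitting the input with pyarrow and upserting each batch separately (see the first file in this commit). A minimal illustration of that split; the table contents here are made up:

import pyarrow as pa

data = pa.table({"id": list(range(2_500)), "value": ["x"] * 2_500})

# the upsert is applied per record batch of at most 1_000 rows
batches = data.to_batches(max_chunksize=1_000)
print([b.num_rows for b in batches])  # [1000, 1000, 500]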

docs/website/docs/general-usage/merge-loading.md

Lines changed: 1 addition & 1 deletion
@@ -554,7 +554,7 @@ The `upsert` merge strategy is currently supported for these destinations:
 - `mssql`
 - `postgres`
 - `snowflake`
-- `filesystem` with `delta` table format (see limitations [here](../dlt-ecosystem/destinations/delta-iceberg#known-limitations))
+- `filesystem` with `delta` table format (see limitations [here](../dlt-ecosystem/destinations/delta-iceberg#known-limitations)) and `iceberg` table format
 :::
 
 The `upsert` merge strategy does primary-key based *upserts*:

poetry.lock

Lines changed: 42 additions & 113 deletions
Some generated files are not rendered by default.

pyproject.toml

Lines changed: 1 addition & 3 deletions
@@ -104,9 +104,7 @@ db-dtypes = { version = ">=1.2.0", optional = true }
 # https://github.com/apache/airflow/issues/28723
 # pyiceberg = { version = ">=0.7.1", optional = true, extras = ["sql-sqlite"] }
 # we will rely on manual installation of `sqlalchemy>=2.0.18` instead
-# replace pyiceberg's version with the one released after 0.9.1
-pyiceberg = { git = "https://github.com/apache/iceberg-python.git", rev = "260ef54e3920d435ae3b2ccda090e66f9c1ac015", optional = true }
-# pyiceberg = { version = ">=0.9.1" , optional = true }
+pyiceberg = { version = ">=0.9.1" , optional = true }
 
 databricks-sdk = {version = ">=0.38.0", optional = true}
 pywin32 = {version = ">=306", optional = true, platform = "win32"}

tests/load/pipeline/test_merge_disposition.py

Lines changed: 19 additions & 9 deletions
@@ -828,11 +828,22 @@ def test_pipeline_load_parquet(destination_config: DestinationTestConfiguration)
     github_data.max_table_nesting = 2
     github_data_copy = github()
     github_data_copy.max_table_nesting = 2
-    info = p.run(
-        [github_data, github_data_copy],
-        write_disposition="merge",
-        **destination_config.run_kwargs,
-    )
+    # iceberg filesystem requires input data without duplicates
+    if (
+        destination_config.table_format == "iceberg"
+        and destination_config.destination_type == "filesystem"
+    ):
+        info = p.run(
+            github_data,
+            write_disposition="merge",
+            **destination_config.run_kwargs,
+        )
+    else:
+        info = p.run(
+            [github_data, github_data_copy],
+            write_disposition="merge",
+            **destination_config.run_kwargs,
+        )
     assert_load_info(info)
     # make sure it was parquet or sql transforms
     expected_formats = ["parquet"]
@@ -844,10 +855,9 @@ def test_pipeline_load_parquet(destination_config: DestinationTestConfiguration)
 
     github_1_counts = load_table_counts(p)
     expected_rows = 100
-    # if table_format is set we use upsert which does not deduplicate input data
-    if not destination_config.supports_merge or (
-        destination_config.table_format and destination_config.destination_type != "athena"
-    ):
+    # if table_format is set to delta we use upsert which does not deduplicate input data
+    # otherwise the data is either deduplicated or it's iceberg filesystem for which we didn't pass duplicates at all
+    if destination_config.table_format == "delta":
        expected_rows *= 2
     assert github_1_counts["issues"] == expected_rows
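
For context on the new iceberg/filesystem branch: running two identical `github()` sources duplicates every issue on its key columns, and iceberg upsert requires duplicate-free input, so only a single source is passed there. A quick, hypothetical way to see that requirement with pyarrow (not part of the test file):

import pyarrow as pa
import pyarrow.compute as pc

def has_duplicate_keys(tbl: pa.Table, key: str) -> bool:
    # duplicates exist when the number of distinct key values is below the row count
    return pc.count_distinct(tbl[key]).as_py() < tbl.num_rows

single_run = pa.table({"id": [1, 2, 3]})
double_run = pa.concat_tables([single_run, single_run])  # two identical sources
print(has_duplicate_keys(single_run, "id"), has_duplicate_keys(double_run, "id"))  # False True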

tests/load/pipeline/test_open_table_pipeline.py

Lines changed: 9 additions & 10 deletions
@@ -375,16 +375,15 @@ def nested_table():
     assert len(rows_dict["nested_table__child"]) == 3
     assert len(rows_dict["nested_table__child__grandchild"]) == 5
 
-    if destination_config.supports_merge:
-        # now drop children and grandchildren, use merge write disposition to create and pass full table chain
-        # also for tables that do not have jobs
-        info = pipeline.run(
-            [{"foo": 3}] * 10000,
-            table_name="nested_table",
-            primary_key="foo",
-            write_disposition="merge",
-        )
-        assert_load_info(info)
+    # now drop children and grandchildren, use merge write disposition to create and pass full table chain
+    # also for tables that do not have jobs
+    info = pipeline.run(
+        [{"foo": i} for i in range(3, 10003)],
+        table_name="nested_table",
+        primary_key="foo",
+        write_disposition="merge",
+    )
+    assert_load_info(info)
 
 
 @pytest.mark.parametrize(
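
The input change matters for the same reason: `[{"foo": 3}] * 10000` repeats one primary-key value 10,000 times, while the range-based comprehension yields unique keys, so the merge write disposition can also run against iceberg. A small standalone check (illustrative only, not from the test file):

old_rows = [{"foo": 3}] * 10000                     # every row shares primary key 3
new_rows = [{"foo": i} for i in range(3, 10003)]    # 10_000 rows, all keys unique

assert len({r["foo"] for r in old_rows}) == 1
assert len({r["foo"] for r in new_rows}) == len(new_rows)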
