Commit da81062

Merge branch 'main' into add-toc-ecosystem
2 parents: 0d34925 + 2bf3dc9

File tree: 18 files changed, +253 / -46 lines

README.md (1 addition & 1 deletion)

@@ -5,7 +5,7 @@
 
 -----------------
 
-# pandas: powerful Python data analysis toolkit
+# pandas: A Powerful Python Data Analysis Toolkit
 
 | | |
 | --- | --- |

ci/deps/actions-310-minimum_versions.yaml (1 addition & 1 deletion)

@@ -18,7 +18,7 @@ dependencies:
   - pytest-xdist>=3.4.0
   - pytest-localserver>=0.8.1
   - pytest-qt>=4.4.0
-  - boto3
+  - boto3=1.37.3
 
   # required dependencies
   - python-dateutil=2.8.2

ci/deps/actions-310.yaml (1 addition & 1 deletion)

@@ -16,7 +16,7 @@ dependencies:
   - pytest-xdist>=3.4.0
   - pytest-localserver>=0.8.1
   - pytest-qt>=4.4.0
-  - boto3
+  - boto3=1.37.3
 
   # required dependencies
   - python-dateutil

ci/deps/actions-311-downstream_compat.yaml (1 addition & 1 deletion)

@@ -17,7 +17,7 @@ dependencies:
   - pytest-xdist>=3.4.0
   - pytest-localserver>=0.8.1
   - pytest-qt>=4.4.0
-  - boto3
+  - boto3=1.37.3
 
   # required dependencies
   - python-dateutil

ci/deps/actions-311.yaml (1 addition & 1 deletion)

@@ -16,7 +16,7 @@ dependencies:
   - pytest-xdist>=3.4.0
   - pytest-localserver>=0.8.1
   - pytest-qt>=4.4.0
-  - boto3
+  - boto3=1.37.3
 
   # required dependencies
   - python-dateutil

ci/deps/actions-312.yaml (1 addition & 1 deletion)

@@ -16,7 +16,7 @@ dependencies:
   - pytest-xdist>=3.4.0
   - pytest-localserver>=0.8.1
   - pytest-qt>=4.4.0
-  - boto3
+  - boto3=1.37.3
 
   # required dependencies
   - python-dateutil

ci/deps/actions-313.yaml (1 addition & 1 deletion)

@@ -16,7 +16,7 @@ dependencies:
   - pytest-xdist>=3.4.0
   - pytest-localserver>=0.8.1
   - pytest-qt>=4.4.0
-  - boto3
+  - boto3=1.37.3
 
   # required dependencies
   - python-dateutil

doc/source/getting_started/install.rst (1 addition & 1 deletion)

@@ -308,7 +308,7 @@ Dependency Minimum Version pip ex
 `zlib <https://github.com/madler/zlib>`__ hdf5 Compression for HDF5
 `fastparquet <https://github.com/dask/fastparquet>`__ 2024.2.0 - Parquet reading / writing (pyarrow is default)
 `pyarrow <https://github.com/apache/arrow>`__ 10.0.1 parquet, feather Parquet, ORC, and feather reading / writing
-`PyIceberg <https://py.iceberg.apache.org/>`__ 0.7.1 iceberg Apache Iceberg reading
+`PyIceberg <https://py.iceberg.apache.org/>`__ 0.7.1 iceberg Apache Iceberg reading / writing
 `pyreadstat <https://github.com/Roche/pyreadstat>`__ 1.2.6 spss SPSS files (.sav) reading
 `odfpy <https://github.com/eea/odfpy>`__ 1.4.1 excel Open document format (.odf, .ods, .odt) reading / writing
 ====================================================== ================== ================ ==========================================================
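As a side note (not part of the commit), the table documents PyIceberg 0.7.1 as the minimum optional dependency; a minimal sketch for checking that an environment meets it, assuming the package is installed under the distribution name "pyiceberg":

    # Hedged sketch: verify the optional PyIceberg dependency satisfies the
    # documented minimum version (0.7.1).
    import importlib.metadata

    from packaging.version import Version

    installed = Version(importlib.metadata.version("pyiceberg"))
    assert installed >= Version("0.7.1"), f"PyIceberg {installed} is older than 0.7.1"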

doc/source/reference/io.rst (1 addition & 0 deletions)

@@ -162,6 +162,7 @@ Iceberg
    :toctree: api/
 
    read_iceberg
+   DataFrame.to_iceberg
 
 .. warning:: ``read_iceberg`` is experimental and may change without warning.
 

doc/source/user_guide/io.rst (25 additions & 2 deletions)

@@ -29,7 +29,7 @@ The pandas I/O API is a set of top level ``reader`` functions accessed like
 binary,`HDF5 Format <https://support.hdfgroup.org/documentation/hdf5/latest/_intro_h_d_f5.html>`__, :ref:`read_hdf<io.hdf5>`, :ref:`to_hdf<io.hdf5>`
 binary,`Feather Format <https://github.com/wesm/feather>`__, :ref:`read_feather<io.feather>`, :ref:`to_feather<io.feather>`
 binary,`Parquet Format <https://parquet.apache.org/>`__, :ref:`read_parquet<io.parquet>`, :ref:`to_parquet<io.parquet>`
-binary,`Apache Iceberg <https://iceberg.apache.org/>`__, :ref:`read_iceberg<io.iceberg>` , NA
+binary,`Apache Iceberg <https://iceberg.apache.org/>`__, :ref:`read_iceberg<io.iceberg>` , :ref:`to_iceberg<io.iceberg>`
 binary,`ORC Format <https://orc.apache.org/>`__, :ref:`read_orc<io.orc>`, :ref:`to_orc<io.orc>`
 binary,`Stata <https://en.wikipedia.org/wiki/Stata>`__, :ref:`read_stata<io.stata_reader>`, :ref:`to_stata<io.stata_writer>`
 binary,`SAS <https://en.wikipedia.org/wiki/SAS_(software)>`__, :ref:`read_sas<io.sas_reader>` , NA

@@ -5417,7 +5417,7 @@ engines to safely work with the same tables at the same time.
 
 Iceberg support predicate pushdown and column pruning, which are available to pandas
 users via the ``row_filter`` and ``selected_fields`` parameters of the :func:`~pandas.read_iceberg`
-function. This is convenient to extract from large tables a subset that fits in memory asa
+function. This is convenient to extract from large tables a subset that fits in memory as a
 pandas ``DataFrame``.
 
 Internally, pandas uses PyIceberg_ to query Iceberg.

@@ -5497,6 +5497,29 @@ parameter:
 Reading a particular snapshot is also possible providing the snapshot ID as an argument to
 ``snapshot_id``.
 
+A ``DataFrame`` can be saved to Iceberg with the :meth:`DataFrame.to_iceberg`
+method:
+
+.. code-block:: python
+
+    df.to_iceberg("my_table", catalog_name="my_catalog")
+
+The catalog is specified in the same way as for :func:`read_iceberg`, via the
+``catalog_name`` and ``catalog_properties`` parameters.
+
+The location of the table can be specified with the ``location`` parameter:
+
+.. code-block:: python
+
+    df.to_iceberg(
+        "my_table",
+        catalog_name="my_catalog",
+        location="s3://my-data-lake/my-iceberg-tables",
+    )
+
+It is possible to add properties to the table snapshot by passing a dictionary to the
+``snapshot_properties`` parameter.
+
 More information about the Iceberg format can be found in the `Apache Iceberg official
 page <https://iceberg.apache.org/>`__.
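Taken together, the documentation added above amounts to the following round trip. This is an illustrative sketch, not part of the commit: "my_catalog" and "my_table" are placeholder names, and a catalog configured for PyIceberg plus the optional pyiceberg/pyarrow dependencies are assumed.

    # Round trip based on the docs added above: write with snapshot properties,
    # then read back with predicate pushdown and column pruning.
    import pandas as pd

    df = pd.DataFrame({"id": [1, 2, 3], "value": ["a", "b", "c"]})

    # Write the table, attaching custom snapshot properties.
    df.to_iceberg(
        "my_table",
        catalog_name="my_catalog",
        snapshot_properties={"written-by": "docs-example"},
    )

    # Read back only the rows and columns of interest.
    subset = pd.read_iceberg(
        "my_table",
        catalog_name="my_catalog",
        row_filter="id > 1",
        selected_fields=("id",),
    )
    print(subset)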

doc/source/whatsnew/v3.0.0.rst (2 additions & 1 deletion)

@@ -79,7 +79,7 @@ Other enhancements
 - :py:class:`frozenset` elements in pandas objects are now natively printed (:issue:`60690`)
 - Add ``"delete_rows"`` option to ``if_exists`` argument in :meth:`DataFrame.to_sql` deleting all records of the table before inserting data (:issue:`37210`).
 - Added half-year offset classes :class:`HalfYearBegin`, :class:`HalfYearEnd`, :class:`BHalfYearBegin` and :class:`BHalfYearEnd` (:issue:`60928`)
-- Added support to read from Apache Iceberg tables with the new :func:`read_iceberg` function (:issue:`61383`)
+- Added support for reading from and writing to Apache Iceberg tables with the new :func:`read_iceberg` function and :meth:`DataFrame.to_iceberg` method (:issue:`61383`)
 - Errors occurring during SQL I/O will now throw a generic :class:`.DatabaseError` instead of the raw Exception type from the underlying driver manager library (:issue:`60748`)
 - Implemented :meth:`Series.str.isascii` and :meth:`Series.str.isascii` (:issue:`59091`)
 - Improved deprecation message for offset aliases (:issue:`60820`)

@@ -712,6 +712,7 @@ Timezones
 Numeric
 ^^^^^^^
 - Bug in :meth:`DataFrame.corr` where numerical precision errors resulted in correlations above ``1.0`` (:issue:`61120`)
+- Bug in :meth:`DataFrame.cov` raises a ``TypeError`` instead of returning potentially incorrect results or other errors (:issue:`53115`)
 - Bug in :meth:`DataFrame.quantile` where the column type was not preserved when ``numeric_only=True`` with a list-like ``q`` produced an empty result (:issue:`59035`)
 - Bug in :meth:`Series.dot` returning ``object`` dtype for :class:`ArrowDtype` and nullable-dtype data (:issue:`61375`)
 - Bug in ``np.matmul`` with :class:`Index` inputs raising a ``TypeError`` (:issue:`57079`)

pandas/core/frame.py (56 additions & 0 deletions)

@@ -3547,6 +3547,62 @@ def to_xml(
 
         return xml_formatter.write_output()
 
+    def to_iceberg(
+        self,
+        table_identifier: str,
+        catalog_name: str | None = None,
+        *,
+        catalog_properties: dict[str, Any] | None = None,
+        location: str | None = None,
+        append: bool = False,
+        snapshot_properties: dict[str, str] | None = None,
+    ) -> None:
+        """
+        Write a DataFrame to an Apache Iceberg table.
+
+        .. versionadded:: 3.0.0
+
+        .. warning::
+
+           to_iceberg is experimental and may change without warning.
+
+        Parameters
+        ----------
+        table_identifier : str
+            Table identifier.
+        catalog_name : str, optional
+            The name of the catalog.
+        catalog_properties : dict of {str: str}, optional
+            The properties that are used next to the catalog configuration.
+        location : str, optional
+            Location for the table.
+        append : bool, default False
+            If ``True``, append data to the table, instead of replacing the content.
+        snapshot_properties : dict of {str: str}, optional
+            Custom properties to be added to the snapshot summary
+
+        See Also
+        --------
+        read_iceberg : Read an Apache Iceberg table.
+        DataFrame.to_parquet : Write a DataFrame in Parquet format.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(data={"col1": [1, 2], "col2": [4, 3]})
+        >>> df.to_iceberg("my_table", catalog_name="my_catalog")  # doctest: +SKIP
+        """
+        from pandas.io.iceberg import to_iceberg
+
+        to_iceberg(
+            self,
+            table_identifier,
+            catalog_name,
+            catalog_properties=catalog_properties,
+            location=location,
+            append=append,
+            snapshot_properties=snapshot_properties,
+        )
+
     # ----------------------------------------------------------------------
     @doc(INFO_DOCSTRING, **frame_sub_kwargs)
     def info(
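As a usage note (not part of the diff), the ``append`` flag documented above distinguishes replacing a table from adding rows to it. A hedged sketch with placeholder catalog and table names, assuming a configured Iceberg catalog and the optional pyiceberg/pyarrow dependencies:

    # Append semantics as described in the docstring above.
    import pandas as pd

    batch1 = pd.DataFrame({"col1": [1, 2], "col2": [4, 3]})
    batch2 = pd.DataFrame({"col1": [5], "col2": [6]})

    batch1.to_iceberg("my_table", catalog_name="my_catalog")  # replaces any existing content
    batch2.to_iceberg("my_table", catalog_name="my_catalog", append=True)  # adds rows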

pandas/io/iceberg.py (59 additions & 1 deletion)

@@ -10,6 +10,7 @@
 def read_iceberg(
     table_identifier: str,
     catalog_name: str | None = None,
+    *,
     catalog_properties: dict[str, Any] | None = None,
     row_filter: str | None = None,
     selected_fields: tuple[str] | None = None,

@@ -21,6 +22,8 @@ def read_iceberg(
     """
     Read an Apache Iceberg table into a pandas DataFrame.
 
+    .. versionadded:: 3.0.0
+
     .. warning::
 
        read_iceberg is experimental and may change without warning.

@@ -71,7 +74,6 @@ def read_iceberg(
     """
     pyiceberg_catalog = import_optional_dependency("pyiceberg.catalog")
     pyiceberg_expressions = import_optional_dependency("pyiceberg.expressions")
-
     if catalog_properties is None:
         catalog_properties = {}
     catalog = pyiceberg_catalog.load_catalog(catalog_name, **catalog_properties)

@@ -91,3 +93,59 @@ def read_iceberg(
         limit=limit,
     )
     return result.to_pandas()
+
+
+def to_iceberg(
+    df: DataFrame,
+    table_identifier: str,
+    catalog_name: str | None = None,
+    *,
+    catalog_properties: dict[str, Any] | None = None,
+    location: str | None = None,
+    append: bool = False,
+    snapshot_properties: dict[str, str] | None = None,
+) -> None:
+    """
+    Write a DataFrame to an Apache Iceberg table.
+
+    .. versionadded:: 3.0.0
+
+    Parameters
+    ----------
+    table_identifier : str
+        Table identifier.
+    catalog_name : str, optional
+        The name of the catalog.
+    catalog_properties : dict of {str: str}, optional
+        The properties that are used next to the catalog configuration.
+    location : str, optional
+        Location for the table.
+    append : bool, default False
+        If ``True``, append data to the table, instead of replacing the content.
+    snapshot_properties : dict of {str: str}, optional
+        Custom properties to be added to the snapshot summary
+
+    See Also
+    --------
+    read_iceberg : Read an Apache Iceberg table.
+    DataFrame.to_parquet : Write a DataFrame in Parquet format.
+    """
+    pa = import_optional_dependency("pyarrow")
+    pyiceberg_catalog = import_optional_dependency("pyiceberg.catalog")
+    if catalog_properties is None:
+        catalog_properties = {}
+    catalog = pyiceberg_catalog.load_catalog(catalog_name, **catalog_properties)
+    arrow_table = pa.Table.from_pandas(df)
+    table = catalog.create_table_if_not_exists(
+        identifier=table_identifier,
+        schema=arrow_table.schema,
+        location=location,
+        # we could add `partition_spec`, `sort_order` and `properties` in the
+        # future, but it may not be trivial without exposing PyIceberg objects
+    )
+    if snapshot_properties is None:
+        snapshot_properties = {}
+    if append:
+        table.append(arrow_table, snapshot_properties=snapshot_properties)
+    else:
+        table.overwrite(arrow_table, snapshot_properties=snapshot_properties)
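The helper above is a thin wrapper over PyIceberg. For orientation (not part of the commit), roughly the same write expressed directly against PyIceberg, with placeholder catalog and table names and a pre-configured catalog assumed:

    # Rough equivalent of to_iceberg() above using PyIceberg directly.
    # Assumes a catalog named "my_catalog" is configured (e.g. via ~/.pyiceberg.yaml)
    # and pyarrow is installed.
    import pandas as pd
    import pyarrow as pa
    from pyiceberg.catalog import load_catalog

    df = pd.DataFrame({"col1": [1, 2], "col2": [4, 3]})
    arrow_table = pa.Table.from_pandas(df)

    catalog = load_catalog("my_catalog")
    table = catalog.create_table_if_not_exists(
        identifier="my_table",
        schema=arrow_table.schema,
    )
    table.overwrite(arrow_table, snapshot_properties={"source": "pandas"})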

pandas/tests/generic/test_to_xarray.py (1 addition & 0 deletions)

@@ -93,6 +93,7 @@ def test_to_xarray_index_types(self, index_flat, request):
             isinstance(index.dtype, StringDtype)
             and index.dtype.storage == "pyarrow"
             and Version(xarray.__version__) > Version("2024.9.0")
+            and Version(xarray.__version__) < Version("2025.6.0")
         ):
             request.applymarker(
                 pytest.mark.xfail(
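The change above bounds an existing version-gated xfail to a specific range of xarray releases. As a general illustration (not taken from the diff), the pattern looks roughly like this, assuming packaging is available for version comparisons:

    # Generic sketch of a version-bounded xfail: mark the test as expected to
    # fail only when the installed xarray falls inside the affected range.
    import pytest
    import xarray
    from packaging.version import Version


    def test_roundtrip(request):
        if Version("2024.9.0") < Version(xarray.__version__) < Version("2025.6.0"):
            request.applymarker(
                pytest.mark.xfail(reason="known issue in this xarray version range")
            )
        # ... actual assertions go here ...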
