diff --git a/.gitattributes b/.gitattributes
index d94c19e7edb1f..bc7dec642df0f 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -84,5 +84,3 @@ pandas/tests/io/parser/data export-ignore
# Include cibw script in sdist since it's needed for building wheels
scripts/cibw_before_build.sh -export-ignore
-scripts/cibw_before_build_windows.sh -export-ignore
-scripts/cibw_before_test_windows.sh -export-ignore
diff --git a/.github/actions/build_pandas/action.yml b/.github/actions/build_pandas/action.yml
index b92bacd1a537c..2d208cb38725a 100644
--- a/.github/actions/build_pandas/action.yml
+++ b/.github/actions/build_pandas/action.yml
@@ -4,6 +4,9 @@ inputs:
editable:
description: Whether to build pandas in editable mode (default true)
default: true
+ werror:
+ description: Enable werror flag for build
+ default: true
runs:
using: composite
steps:
@@ -26,9 +29,9 @@ runs:
run: |
if [[ ${{ inputs.editable }} == "true" ]]; then
pip install -e . --no-build-isolation -v --no-deps \
- -Csetup-args="--werror"
+ ${{ inputs.werror == 'true' && '-Csetup-args="--werror"' || '' }}
else
pip install . --no-build-isolation -v --no-deps \
- -Csetup-args="--werror"
+ ${{ inputs.werror == 'true' && '-Csetup-args="--werror"' || '' }}
fi
shell: bash -el {0}
diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml
index e1d2d1ea846b8..728019b06e053 100644
--- a/.github/workflows/code-checks.yml
+++ b/.github/workflows/code-checks.yml
@@ -21,7 +21,7 @@ permissions:
jobs:
docstring_typing_manual_hooks:
name: Docstring validation, typing, and other manual pre-commit hooks
- runs-on: ubuntu-22.04
+ runs-on: ubuntu-24.04
defaults:
run:
shell: bash -el {0}
@@ -102,7 +102,7 @@ jobs:
asv-benchmarks:
name: ASV Benchmarks
- runs-on: ubuntu-22.04
+ runs-on: ubuntu-24.04
defaults:
run:
shell: bash -el {0}
@@ -133,7 +133,7 @@ jobs:
build_docker_dev_environment:
name: Build Docker Dev Environment
- runs-on: ubuntu-22.04
+ runs-on: ubuntu-24.04
defaults:
run:
shell: bash -el {0}
@@ -160,7 +160,7 @@ jobs:
requirements-dev-text-installable:
name: Test install requirements-dev.txt
- runs-on: ubuntu-22.04
+ runs-on: ubuntu-24.04
concurrency:
# https://github.community/t/concurrecy-not-work-for-push/183068/7
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index 4d0066bc0b48d..44a9b4bfa20b8 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -13,7 +13,7 @@ permissions:
jobs:
analyze:
- runs-on: ubuntu-22.04
+ runs-on: ubuntu-24.04
permissions:
actions: read
contents: read
diff --git a/.github/workflows/comment-commands.yml b/.github/workflows/comment-commands.yml
index 62956f5825782..b843363ae8c4d 100644
--- a/.github/workflows/comment-commands.yml
+++ b/.github/workflows/comment-commands.yml
@@ -10,7 +10,7 @@ permissions:
jobs:
issue_assign:
- runs-on: ubuntu-22.04
+ runs-on: ubuntu-24.04
if: (!github.event.issue.pull_request) && github.event.comment.body == 'take'
concurrency:
group: ${{ github.actor }}-issue-assign
@@ -19,7 +19,7 @@ jobs:
echo "Assigning issue ${{ github.event.issue.number }} to ${{ github.event.comment.user.login }}"
curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -d '{"assignees": ["${{ github.event.comment.user.login }}"]}' https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/assignees
preview_docs:
- runs-on: ubuntu-22.04
+ runs-on: ubuntu-24.04
if: github.event.issue.pull_request && github.event.comment.body == '/preview'
concurrency:
group: ${{ github.actor }}-preview-docs
@@ -29,7 +29,7 @@ jobs:
previewer-server: "https://pandas.pydata.org/preview"
artifact-job: "Doc Build and Upload"
asv_run:
- runs-on: ubuntu-22.04
+ runs-on: ubuntu-24.04
# TODO: Support more benchmarking options later, against different branches, against self, etc
if: github.event.issue.pull_request && startsWith(github.event.comment.body, '@github-actions benchmark')
defaults:
diff --git a/.github/workflows/deprecation-tracking-bot.yml b/.github/workflows/deprecation-tracking-bot.yml
index 3d4cab7be09c5..334a5d77b407b 100644
--- a/.github/workflows/deprecation-tracking-bot.yml
+++ b/.github/workflows/deprecation-tracking-bot.yml
@@ -17,7 +17,7 @@ jobs:
deprecation_update:
permissions:
issues: write
- runs-on: ubuntu-22.04
+ runs-on: ubuntu-24.04
env:
DEPRECATION_TRACKER_ISSUE: 56596
steps:
diff --git a/.github/workflows/docbuild-and-upload.yml b/.github/workflows/docbuild-and-upload.yml
index 294334ca1d54b..ba9e30e088c66 100644
--- a/.github/workflows/docbuild-and-upload.yml
+++ b/.github/workflows/docbuild-and-upload.yml
@@ -23,7 +23,7 @@ permissions:
jobs:
web_and_docs:
name: Doc Build and Upload
- runs-on: ubuntu-22.04
+ runs-on: ubuntu-24.04
concurrency:
# https://github.community/t/concurrecy-not-work-for-push/183068/7
diff --git a/.github/workflows/package-checks.yml b/.github/workflows/package-checks.yml
index 331af6e05b650..9800cc1694313 100644
--- a/.github/workflows/package-checks.yml
+++ b/.github/workflows/package-checks.yml
@@ -21,7 +21,7 @@ defaults:
jobs:
pip:
if: ${{ github.event.label.name == 'Build' || contains(github.event.pull_request.labels.*.name, 'Build') || github.event_name == 'push'}}
- runs-on: ubuntu-22.04
+ runs-on: ubuntu-24.04
strategy:
matrix:
extra: ["test", "pyarrow", "performance", "computation", "fss", "aws", "gcp", "excel", "parquet", "feather", "hdf5", "spss", "postgresql", "mysql", "sql-other", "html", "xml", "plot", "output-formatting", "clipboard", "compression", "all"]
@@ -50,7 +50,7 @@ jobs:
shell: bash -el {0}
conda_forge_recipe:
if: ${{ github.event.label.name == 'Build' || contains(github.event.pull_request.labels.*.name, 'Build') || github.event_name == 'push'}}
- runs-on: ubuntu-22.04
+ runs-on: ubuntu-24.04
strategy:
matrix:
python-version: ['3.10', '3.11']
diff --git a/.github/workflows/stale-pr.yml b/.github/workflows/stale-pr.yml
index 792afe8f4faf5..3a51dbefc6bb0 100644
--- a/.github/workflows/stale-pr.yml
+++ b/.github/workflows/stale-pr.yml
@@ -12,7 +12,7 @@ jobs:
permissions:
pull-requests: write
if: github.repository_owner == 'pandas-dev'
- runs-on: ubuntu-22.04
+ runs-on: ubuntu-24.04
steps:
- uses: actions/stale@v9
with:
diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
index 08c41a1eeb21f..59512ddc91a8a 100644
--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@@ -26,8 +26,8 @@ jobs:
timeout-minutes: 90
strategy:
matrix:
- platform: [ubuntu-22.04, ubuntu-24.04-arm]
- env_file: [actions-310.yaml, actions-311.yaml, actions-312.yaml]
+ platform: [ubuntu-24.04, ubuntu-24.04-arm]
+ env_file: [actions-310.yaml, actions-311.yaml, actions-312.yaml, actions-313.yaml]
# Prevent the include jobs from overriding other jobs
pattern: [""]
pandas_future_infer_string: ["0"]
@@ -36,11 +36,15 @@ jobs:
env_file: actions-311-downstream_compat.yaml
pattern: "not slow and not network and not single_cpu"
pytest_target: "pandas/tests/test_downstream.py"
- platform: ubuntu-22.04
+ platform: ubuntu-24.04
- name: "Minimum Versions"
env_file: actions-310-minimum_versions.yaml
pattern: "not slow and not network and not single_cpu"
- platform: ubuntu-22.04
+ platform: ubuntu-24.04
+ - name: "Freethreading"
+ env_file: actions-313-freethreading.yaml
+ pattern: "not slow and not network and not single_cpu"
+ platform: ubuntu-24.04
- name: "Locale: it_IT"
env_file: actions-311.yaml
pattern: "not slow and not network and not single_cpu"
@@ -51,7 +55,7 @@ jobs:
# Also install it_IT (its encoding is ISO8859-1) but do not activate it.
# It will be temporarily activated during tests with locale.setlocale
extra_loc: "it_IT"
- platform: ubuntu-22.04
+ platform: ubuntu-24.04
- name: "Locale: zh_CN"
env_file: actions-311.yaml
pattern: "not slow and not network and not single_cpu"
@@ -62,30 +66,30 @@ jobs:
# Also install zh_CN (its encoding is gb2312) but do not activate it.
# It will be temporarily activated during tests with locale.setlocale
extra_loc: "zh_CN"
- platform: ubuntu-22.04
+ platform: ubuntu-24.04
- name: "Future infer strings"
env_file: actions-312.yaml
pandas_future_infer_string: "1"
- platform: ubuntu-22.04
+ platform: ubuntu-24.04
- name: "Future infer strings (without pyarrow)"
env_file: actions-311.yaml
pandas_future_infer_string: "1"
- platform: ubuntu-22.04
+ platform: ubuntu-24.04
- name: "Pypy"
env_file: actions-pypy-39.yaml
pattern: "not slow and not network and not single_cpu"
test_args: "--max-worker-restart 0"
- platform: ubuntu-22.04
+ platform: ubuntu-24.04
- name: "Numpy Dev"
env_file: actions-311-numpydev.yaml
pattern: "not slow and not network and not single_cpu"
test_args: "-W error::DeprecationWarning -W error::FutureWarning"
- platform: ubuntu-22.04
+ platform: ubuntu-24.04
- name: "Pyarrow Nightly"
env_file: actions-311-pyarrownightly.yaml
pattern: "not slow and not network and not single_cpu"
pandas_future_infer_string: "1"
- platform: ubuntu-22.04
+ platform: ubuntu-24.04
fail-fast: false
name: ${{ matrix.name || format('{0} {1}', matrix.platform, matrix.env_file) }}
env:
@@ -165,6 +169,9 @@ jobs:
- name: Build Pandas
id: build
uses: ./.github/actions/build_pandas
+ with:
+ # xref https://github.com/cython/cython/issues/6870
+ werror: ${{ matrix.name != 'Freethreading' }}
# TODO: Re-enable once Pypy has Pypy 3.10 on conda-forge
if: ${{ matrix.name != 'Pypy' }}
@@ -188,7 +195,7 @@ jobs:
matrix:
# Note: Don't use macOS latest since macos 14 appears to be arm64 only
os: [macos-13, macos-14, windows-latest]
- env_file: [actions-310.yaml, actions-311.yaml, actions-312.yaml]
+ env_file: [actions-310.yaml, actions-311.yaml, actions-312.yaml, actions-313.yaml]
fail-fast: false
runs-on: ${{ matrix.os }}
name: ${{ format('{0} {1}', matrix.os, matrix.env_file) }}
@@ -220,7 +227,7 @@ jobs:
uses: ./.github/actions/run-tests
Linux-32-bit:
- runs-on: ubuntu-22.04
+ runs-on: ubuntu-24.04
container:
image: quay.io/pypa/manylinux2014_i686
options: --platform linux/386
@@ -241,12 +248,14 @@ jobs:
fi
- name: Build environment and Run Tests
# https://github.com/numpy/numpy/issues/24703#issuecomment-1722379388
+ # Note: Pinned to Cython 3.0.10 to avoid numerical instability in 32-bit environments
+ # https://github.com/pandas-dev/pandas/pull/61423
run: |
/opt/python/cp313-cp313/bin/python -m venv ~/virtualenvs/pandas-dev
. ~/virtualenvs/pandas-dev/bin/activate
python -m pip install --no-cache-dir -U pip wheel setuptools meson[ninja]==1.2.1 meson-python==0.13.1
python -m pip install numpy -Csetup-args="-Dallow-noblas=true"
- python -m pip install --no-cache-dir versioneer[toml] cython python-dateutil pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0
+ python -m pip install --no-cache-dir versioneer[toml] cython==3.0.10 python-dateutil pytest>=7.3.2 pytest-xdist>=3.4.0 hypothesis>=6.84.0
python -m pip install --no-cache-dir --no-build-isolation -e . -Csetup-args="--werror"
python -m pip list --no-cache-dir
PANDAS_CI=1 python -m pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml
@@ -256,7 +265,7 @@ jobs:
cancel-in-progress: true
Linux-Musl:
- runs-on: ubuntu-22.04
+ runs-on: ubuntu-24.04
container:
image: quay.io/pypa/musllinux_1_2_x86_64
steps:
@@ -316,7 +325,7 @@ jobs:
# To freeze this file, uncomment out the ``if: false`` condition, and migrate the jobs
# to the corresponding posix/windows-macos/sdist etc. workflows.
# Feel free to modify this comment as necessary.
- # if: false # Uncomment this to freeze the workflow, comment it to unfreeze
+ if: false
defaults:
run:
shell: bash -eou pipefail {0}
@@ -325,7 +334,7 @@ jobs:
fail-fast: false
matrix:
# Separate out macOS 13 and 14, since macOS 14 is arm64 only
- os: [ubuntu-22.04, macOS-13, macOS-14, windows-latest]
+ os: [ubuntu-24.04, macOS-13, macOS-14, windows-latest]
timeout-minutes: 90
@@ -362,48 +371,6 @@ jobs:
- name: Run Tests
uses: ./.github/actions/run-tests
- python-freethreading:
- defaults:
- run:
- shell: bash -eou pipefail {0}
- runs-on: ubuntu-22.04
-
- timeout-minutes: 90
-
- concurrency:
- # https://github.community/t/concurrecy-not-work-for-push/183068/7
- group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-python-freethreading-dev
- cancel-in-progress: true
-
- env:
- PYTEST_WORKERS: "auto"
- PANDAS_CI: 1
- PATTERN: "not slow and not network and not clipboard and not single_cpu"
- PYTEST_TARGET: pandas
-
- steps:
- - uses: actions/checkout@v4
- with:
- fetch-depth: 0
-
- - name: Set up Python Free-threading Version
- uses: deadsnakes/action@v3.2.0
- with:
- python-version: 3.13-dev
- nogil: true
-
- - name: Build Environment
- run: |
- python --version
- python -m pip install --upgrade pip setuptools wheel numpy meson[ninja]==1.2.1 meson-python==0.13.1
- python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple cython
- python -m pip install versioneer[toml] python-dateutil pytz tzdata hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0 pytest-cov
- python -m pip install -ve . --no-build-isolation --no-index --no-deps -Csetup-args="--werror"
- python -m pip list
-
- - name: Run Tests
- uses: ./.github/actions/run-tests
-
# NOTE: this job must be kept in sync with the Pyodide build job in wheels.yml
emscripten:
# Note: the Python version, Emscripten toolchain version are determined
@@ -413,7 +380,7 @@ jobs:
# The Node.js version can be determined via Pyodide:
# https://pyodide.org/en/stable/usage/index.html#node-js
name: Pyodide build
- runs-on: ubuntu-22.04
+ runs-on: ubuntu-24.04
concurrency:
# https://github.community/t/concurrecy-not-work-for-push/183068/7
group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-wasm
diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
index f330d0e6cb41a..4de7aec4f551a 100644
--- a/.github/workflows/wheels.yml
+++ b/.github/workflows/wheels.yml
@@ -40,7 +40,7 @@ jobs:
(github.event_name == 'pull_request' &&
contains(github.event.pull_request.labels.*.name, 'Build')) ||
(github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && ( ! endsWith(github.ref, 'dev0')))
- runs-on: ubuntu-22.04
+ runs-on: ubuntu-24.04
env:
IS_PUSH: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') }}
IS_SCHEDULE_DISPATCH: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
@@ -92,26 +92,30 @@ jobs:
# GitHub Actions doesn't support pairing matrix values together, let's improvise
# https://github.com/github/feedback/discussions/7835#discussioncomment-1769026
buildplat:
- - [ubuntu-22.04, manylinux_x86_64]
- - [ubuntu-22.04, musllinux_x86_64]
+ - [ubuntu-24.04, manylinux_x86_64]
+ - [ubuntu-24.04, musllinux_x86_64]
- [ubuntu-24.04-arm, manylinux_aarch64]
+ - [ubuntu-24.04-arm, musllinux_aarch64]
- [macos-13, macosx_x86_64]
# Note: M1 images on Github Actions start from macOS 14
- [macos-14, macosx_arm64]
- [windows-2022, win_amd64]
+ - [windows-11-arm, win_arm64]
# TODO: support PyPy?
python: [["cp310", "3.10"], ["cp311", "3.11"], ["cp312", "3.12"], ["cp313", "3.13"], ["cp313t", "3.13"]]
include:
- # TODO: Remove this plus installing build deps in cibw_before_build.sh
- # after pandas can be built with a released NumPy/Cython
- - python: ["cp313t", "3.13"]
- cibw_build_frontend: 'pip; args: --no-build-isolation'
# Build Pyodide wheels and upload them to Anaconda.org
# NOTE: this job is similar to the one in unit-tests.yml except for the fact
# that it uses cibuildwheel instead of a standard Pyodide xbuildenv setup.
- - buildplat: [ubuntu-22.04, pyodide_wasm32]
+ - buildplat: [ubuntu-24.04, pyodide_wasm32]
python: ["cp312", "3.12"]
cibw_build_frontend: 'build'
+ exclude:
+ - buildplat: [windows-11-arm, win_arm64]
+ python: ["cp310", "3.10"]
+ # BackendUnavailable: Cannot import 'mesonpy'
+ - buildplat: [windows-11-arm, win_arm64]
+ python: ["cp313t", "3.13"]
env:
IS_PUSH: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') }}
@@ -122,6 +126,12 @@ jobs:
with:
fetch-depth: 0
+ - name: Set up MSVC environment for ARM64
+ if: matrix.buildplat[1] == 'win_arm64'
+ uses: ilammy/msvc-dev-cmd@v1
+ with:
+ arch: arm64
+
# TODO: Build wheels from sdist again
# There's some sort of weird race condition?
# within Github that makes the sdist be missing files
@@ -159,9 +169,13 @@ jobs:
env:
CIBW_BUILD: ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }}
CIBW_BUILD_FRONTEND: ${{ matrix.cibw_build_frontend || 'pip' }}
- CIBW_PLATFORM: ${{ matrix.buildplat[1] == 'pyodide_wasm32' && 'pyodide' || 'auto' }}
+ CIBW_PLATFORM: ${{ (matrix.buildplat[1] == 'pyodide_wasm32' && 'pyodide') || (matrix.buildplat[1] == 'win_arm64' && 'windows') || 'auto' }}
+ CIBW_ARCHS: ${{ matrix.buildplat[1] == 'win_arm64' && 'ARM64' || 'auto' }}
+ CIBW_BEFORE_BUILD_WINDOWS: 'python -m pip install delvewheel'
- - name: Set up Python
+ - name: Set up Python for validation/upload (non-ARM64 Windows & other OS)
+ # micromamba is not available for ARM64 Windows
+ if: matrix.buildplat[1] != 'win_arm64'
uses: mamba-org/setup-micromamba@v2
with:
environment-name: wheel-env
@@ -174,6 +188,12 @@ jobs:
cache-downloads: true
cache-environment: true
+ - name: Install wheel for win_arm64
+ # installing wheel here because micromamba step was skipped
+ if: matrix.buildplat[1] == 'win_arm64'
+ shell: bash -el {0}
+ run: python -m pip install wheel
+
- name: Validate wheel RECORD
shell: bash -el {0}
run: for whl in $(ls wheelhouse); do wheel unpack wheelhouse/$whl -d /tmp; done
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 5308c98e96937..b5856810b749e 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -19,7 +19,7 @@ ci:
skip: [pyright, mypy]
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
- rev: v0.11.4
+ rev: v0.11.12
hooks:
- id: ruff
args: [--exit-non-zero-on-fix]
@@ -74,7 +74,7 @@ repos:
hooks:
- id: isort
- repo: https://github.com/asottile/pyupgrade
- rev: v3.19.1
+ rev: v3.20.0
hooks:
- id: pyupgrade
args: [--py310-plus]
@@ -95,14 +95,14 @@ repos:
- id: sphinx-lint
args: ["--enable", "all", "--disable", "line-too-long"]
- repo: https://github.com/pre-commit/mirrors-clang-format
- rev: v20.1.0
+ rev: v20.1.5
hooks:
- id: clang-format
files: ^pandas/_libs/src|^pandas/_libs/include
args: [-i]
types_or: [c, c++]
- repo: https://github.com/trim21/pre-commit-mirror-meson
- rev: v1.7.2
+ rev: v1.8.1
hooks:
- id: meson-fmt
args: ['--inplace']
@@ -140,7 +140,7 @@ repos:
pass_filenames: false
types: [python]
stages: [manual]
- - id: mypy
+ - id: stubtest
# note: assumes python env is setup and activated
# note: requires pandas dev to be installed
name: mypy (stubtest)
diff --git a/Dockerfile b/Dockerfile
index 4090a4adb1af8..e778312fd3aa2 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,16 +1,20 @@
FROM python:3.10.8
WORKDIR /home/pandas
-RUN apt-get update && apt-get -y upgrade
-RUN apt-get install -y build-essential bash-completion
+RUN apt-get update && \
+ apt-get --no-install-recommends -y upgrade && \
+ apt-get --no-install-recommends -y install \
+ build-essential \
+ bash-completion \
+ # hdf5 needed for pytables installation
+ libhdf5-dev \
+ # libgles2-mesa needed for pytest-qt
+ libgles2-mesa-dev && \
+ rm -rf /var/lib/apt/lists/*
-# hdf5 needed for pytables installation
-# libgles2-mesa needed for pytest-qt
-RUN apt-get install -y libhdf5-dev libgles2-mesa-dev
-
-RUN python -m pip install --upgrade pip
COPY requirements-dev.txt /tmp
-RUN python -m pip install -r /tmp/requirements-dev.txt
+RUN python -m pip install --no-cache-dir --upgrade pip && \
+ python -m pip install --no-cache-dir -r /tmp/requirements-dev.txt
RUN git config --global --add safe.directory /home/pandas
ENV SHELL="/bin/bash"
diff --git a/MANIFEST.in b/MANIFEST.in
index c59151f340545..a7d7d7eb4e062 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -65,5 +65,3 @@ graft pandas/_libs/include
# Include cibw script in sdist since it's needed for building wheels
include scripts/cibw_before_build.sh
-include scripts/cibw_before_build_windows.sh
-include scripts/cibw_before_test_windows.sh
diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json
index 30c692115eab1..d286e57ce6b51 100644
--- a/asv_bench/asv.conf.json
+++ b/asv_bench/asv.conf.json
@@ -42,7 +42,7 @@
// followed by the pip installed packages).
"matrix": {
"pip+build": [],
- "Cython": ["3.0"],
+ "Cython": [],
"matplotlib": [],
"sqlalchemy": [],
"scipy": [],
diff --git a/ci/deps/actions-310-minimum_versions.yaml b/ci/deps/actions-310-minimum_versions.yaml
index a8fb678970b20..eaba29c2f796e 100644
--- a/ci/deps/actions-310-minimum_versions.yaml
+++ b/ci/deps/actions-310-minimum_versions.yaml
@@ -8,7 +8,7 @@ dependencies:
# build dependencies
- versioneer
- - cython>=0.29.33
+ - cython<4.0.0a0
- meson=1.2.1
- meson-python=0.13.1
@@ -26,13 +26,12 @@ dependencies:
# optional dependencies
- beautifulsoup4=4.12.3
- - blosc=1.21.3
- bottleneck=1.3.6
- fastparquet=2024.2.0
- - fsspec=2024.2.0
+ - fsspec=2023.12.2
- html5lib=1.1
- hypothesis=6.84.0
- - gcsfs=2024.2.0
+ - gcsfs=2023.12.2
- jinja2=3.1.3
- lxml=4.9.2
- matplotlib=3.8.3
@@ -43,6 +42,7 @@ dependencies:
- openpyxl=3.1.2
- psycopg2=2.9.6
- pyarrow=10.0.1
+ - pyiceberg=0.7.1
- pymysql=1.1.0
- pyqt=5.15.9
- pyreadstat=1.2.6
@@ -50,7 +50,7 @@ dependencies:
- python-calamine=0.1.7
- pytz=2023.4
- pyxlsb=1.0.10
- - s3fs=2024.2.0
+ - s3fs=2023.12.2
- scipy=1.12.0
- sqlalchemy=2.0.0
- tabulate=0.9.0
diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml
index e1c7463f6432d..fbeabb56a62d0 100644
--- a/ci/deps/actions-310.yaml
+++ b/ci/deps/actions-310.yaml
@@ -6,7 +6,7 @@ dependencies:
# build dependencies
- versioneer
- - cython>=0.29.33
+ - cython<4.0.0a0
- meson=1.2.1
- meson-python=0.13.1
@@ -24,13 +24,12 @@ dependencies:
# optional dependencies
- beautifulsoup4>=4.12.3
- - blosc>=1.21.3
- bottleneck>=1.3.6
- fastparquet>=2024.2.0
- - fsspec>=2024.2.0
+ - fsspec>=2023.12.2
- html5lib>=1.1
- hypothesis>=6.84.0
- - gcsfs>=2024.2.0
+ - gcsfs>=2023.12.2
- jinja2>=3.1.3
- lxml>=4.9.2
- matplotlib>=3.8.3
@@ -41,6 +40,7 @@ dependencies:
- openpyxl>=3.1.2
- psycopg2>=2.9.6
- pyarrow>=10.0.1
+ - pyiceberg>=0.7.1
- pymysql>=1.1.0
- pyqt>=5.15.9
- pyreadstat>=1.2.6
@@ -48,11 +48,11 @@ dependencies:
- python-calamine>=0.1.7
- pytz>=2023.4
- pyxlsb>=1.0.10
- - s3fs>=2024.2.0
+ - s3fs>=2023.12.2
- scipy>=1.12.0
- sqlalchemy>=2.0.0
- tabulate>=0.9.0
- - xarray>=2024.1.1, <=2024.9.0
+ - xarray>=2024.1.1
- xlrd>=2.0.1
- xlsxwriter>=3.2.0
- zstandard>=0.22.0
diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml
index ff8feee9dbf9a..07f865868573d 100644
--- a/ci/deps/actions-311-downstream_compat.yaml
+++ b/ci/deps/actions-311-downstream_compat.yaml
@@ -7,7 +7,7 @@ dependencies:
# build dependencies
- versioneer
- - cython>=0.29.33
+ - cython<4.0.0a0
- meson=1.2.1
- meson-python=0.13.1
@@ -25,13 +25,12 @@ dependencies:
# optional dependencies
- beautifulsoup4>=4.12.3
- - blosc>=1.21.3
- bottleneck>=1.3.6
- fastparquet>=2024.2.0
- - fsspec>=2024.2.0
+ - fsspec>=2023.12.2
- html5lib>=1.1
- hypothesis>=6.84.0
- - gcsfs>=2024.2.0
+ - gcsfs>=2023.12.2
- jinja2>=3.1.3
- lxml>=4.9.2
- matplotlib>=3.8.3
@@ -42,6 +41,7 @@ dependencies:
- openpyxl>=3.1.2
- psycopg2>=2.9.6
- pyarrow>=10.0.1
+ - pyiceberg>=0.7.1
- pymysql>=1.1.0
- pyqt>=5.15.9
- pyreadstat>=1.2.6
@@ -49,11 +49,11 @@ dependencies:
- python-calamine>=0.1.7
- pytz>=2023.4
- pyxlsb>=1.0.10
- - s3fs>=2024.2.0
+ - s3fs>=2023.12.2
- scipy>=1.12.0
- sqlalchemy>=2.0.0
- tabulate>=0.9.0
- - xarray>=2024.1.1, <=2024.9.0
+ - xarray>=2024.1.1
- xlrd>=2.0.1
- xlsxwriter>=3.2.0
- zstandard>=0.22.0
@@ -63,14 +63,12 @@ dependencies:
- cftime
- dask
- ipython
- - geopandas-base
- seaborn
- scikit-learn
- statsmodels
- coverage
- pandas-datareader
- pyyaml
- - py
- pip:
- adbc-driver-postgresql>=0.10.0
- adbc-driver-sqlite>=0.8.0
diff --git a/ci/deps/actions-311-numpydev.yaml b/ci/deps/actions-311-numpydev.yaml
index 325a6d45d74fd..99cbe0415b4f9 100644
--- a/ci/deps/actions-311-numpydev.yaml
+++ b/ci/deps/actions-311-numpydev.yaml
@@ -8,7 +8,7 @@ dependencies:
- versioneer
- meson=1.2.1
- meson-python=0.13.1
- - cython>=0.29.33
+ - cython<4.0.0a0
# test dependencies
- pytest>=7.3.2
diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml
index 2d3d11c294e12..da0cecda0fb46 100644
--- a/ci/deps/actions-311-pyarrownightly.yaml
+++ b/ci/deps/actions-311-pyarrownightly.yaml
@@ -7,7 +7,7 @@ dependencies:
# build dependencies
- versioneer
- meson=1.2.1
- - cython>=0.29.33
+ - cython<4.0.0a0
- meson-python=0.13.1
# test dependencies
diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml
index f7d5dd75aff82..9ab82d69504fb 100644
--- a/ci/deps/actions-311.yaml
+++ b/ci/deps/actions-311.yaml
@@ -6,7 +6,7 @@ dependencies:
# build dependencies
- versioneer
- - cython>=0.29.33
+ - cython<4.0.0a0
- meson=1.2.1
- meson-python=0.13.1
@@ -24,13 +24,12 @@ dependencies:
# optional dependencies
- beautifulsoup4>=4.12.3
- - blosc>=1.21.3
- bottleneck>=1.3.6
- fastparquet>=2024.2.0
- - fsspec>=2024.2.0
+ - fsspec>=2023.12.2
- html5lib>=1.1
- hypothesis>=6.84.0
- - gcsfs>=2024.2.0
+ - gcsfs>=2023.12.2
- jinja2>=3.1.3
- lxml>=4.9.2
- matplotlib>=3.8.3
@@ -42,17 +41,18 @@ dependencies:
- openpyxl>=3.1.2
- psycopg2>=2.9.6
- pyarrow>=10.0.1
+ - pyiceberg>=0.7.1
- pymysql>=1.1.0
- pyreadstat>=1.2.6
- pytables>=3.8.0
- python-calamine>=0.1.7
- pytz>=2023.4
- pyxlsb>=1.0.10
- - s3fs>=2024.2.0
+ - s3fs>=2023.12.2
- scipy>=1.12.0
- sqlalchemy>=2.0.0
- tabulate>=0.9.0
- - xarray>=2024.1.1, <=2024.9.0
+ - xarray>=2024.1.1
- xlrd>=2.0.1
- xlsxwriter>=3.2.0
- zstandard>=0.22.0
diff --git a/ci/deps/actions-312.yaml b/ci/deps/actions-312.yaml
index f1d17c72da2c5..d12fe4f7d6e49 100644
--- a/ci/deps/actions-312.yaml
+++ b/ci/deps/actions-312.yaml
@@ -6,7 +6,7 @@ dependencies:
# build dependencies
- versioneer
- - cython>=0.29.33
+ - cython<4.0.0a0
- meson=1.2.1
- meson-python=0.13.1
@@ -24,13 +24,12 @@ dependencies:
# optional dependencies
- beautifulsoup4>=4.12.3
- - blosc>=1.21.3
- bottleneck>=1.3.6
- fastparquet>=2024.2.0
- - fsspec>=2024.2.0
+ - fsspec>=2023.12.2
- html5lib>=1.1
- hypothesis>=6.84.0
- - gcsfs>=2024.2.0
+ - gcsfs>=2023.12.2
- jinja2>=3.1.3
- lxml>=4.9.2
- matplotlib>=3.8.3
@@ -42,17 +41,18 @@ dependencies:
- openpyxl>=3.1.2
- psycopg2>=2.9.6
- pyarrow>=10.0.1
+ - pyiceberg>=0.7.1
- pymysql>=1.1.0
- pyreadstat>=1.2.6
- pytables>=3.8.0
- python-calamine>=0.1.7
- pytz>=2023.4
- pyxlsb>=1.0.10
- - s3fs>=2024.2.0
+ - s3fs>=2023.12.2
- scipy>=1.12.0
- sqlalchemy>=2.0.0
- tabulate>=0.9.0
- - xarray>=2024.1.1, <=2024.9.0
+ - xarray>=2024.1.1
- xlrd>=2.0.1
- xlsxwriter>=3.2.0
- zstandard>=0.22.0
diff --git a/ci/deps/actions-313-freethreading.yaml b/ci/deps/actions-313-freethreading.yaml
new file mode 100644
index 0000000000000..14e3ade976b01
--- /dev/null
+++ b/ci/deps/actions-313-freethreading.yaml
@@ -0,0 +1,29 @@
+name: pandas-dev-313-freethreading
+channels:
+ - conda-forge
+dependencies:
+ - python-freethreading
+
+ # build dependencies
+ - setuptools
+ - versioneer
+ - cython<4.0.0a0
+ - meson=1.8.0
+ - meson-python=0.18.0
+
+ # test dependencies
+ - pytest>=7.3.2
+ - pytest-xdist>=3.4.0
+
+ # required dependencies
+ - python-dateutil
+ - numpy
+
+ # optional dependencies
+ - hypothesis>=6.84.0
+
+ - pip:
+ # No free-threaded coveragepy (with the C-extension) on conda-forge yet
+ - pytest-cov
+ - "tzdata>=2022.7"
+ - "--extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple"
diff --git a/ci/deps/actions-313.yaml b/ci/deps/actions-313.yaml
new file mode 100644
index 0000000000000..57ed0ea062398
--- /dev/null
+++ b/ci/deps/actions-313.yaml
@@ -0,0 +1,63 @@
+name: pandas-dev-313
+channels:
+ - conda-forge
+dependencies:
+ - python=3.13
+
+ # build dependencies
+ - versioneer
+ - cython<4.0.0a0
+ - meson=1.2.1
+ - meson-python=0.13.1
+
+ # test dependencies
+ - pytest>=7.3.2
+ - pytest-cov
+ - pytest-xdist>=3.4.0
+ - pytest-localserver>=0.8.1
+ - pytest-qt>=4.4.0
+ - boto3
+
+ # required dependencies
+ - python-dateutil
+ - numpy
+
+ # optional dependencies
+ - beautifulsoup4>=4.12.3
+ - blosc>=1.21.3
+ - bottleneck>=1.3.6
+ - fastparquet>=2024.2.0
+ - fsspec>=2023.12.2
+ - html5lib>=1.1
+ - hypothesis>=6.84.0
+ - gcsfs>=2023.12.2
+ - jinja2>=3.1.3
+ - lxml>=4.9.2
+ - matplotlib>=3.8.3
+ - numba>=0.59.0
+ - numexpr>=2.9.0
+ - odfpy>=1.4.1
+ - qtpy>=2.3.0
+ - pyqt>=5.15.9
+ - openpyxl>=3.1.2
+ - psycopg2>=2.9.6
+ - pyarrow>=10.0.1
+ - pymysql>=1.1.0
+ - pyreadstat>=1.2.6
+ - pytables>=3.8.0
+ - python-calamine>=0.1.7
+ - pytz>=2023.4
+ - pyxlsb>=1.0.10
+ - s3fs>=2023.12.2
+ - scipy>=1.12.0
+ - sqlalchemy>=2.0.0
+ - tabulate>=0.9.0
+ - xarray>=2024.1.1
+ - xlrd>=2.0.1
+ - xlsxwriter>=3.2.0
+ - zstandard>=0.22.0
+
+ - pip:
+ - adbc-driver-postgresql>=0.10.0
+ - adbc-driver-sqlite>=0.8.0
+ - tzdata>=2022.7
diff --git a/ci/deps/actions-pypy-39.yaml b/ci/deps/actions-pypy-39.yaml
index 90933b24b88db..e0ddc6954e4a4 100644
--- a/ci/deps/actions-pypy-39.yaml
+++ b/ci/deps/actions-pypy-39.yaml
@@ -9,7 +9,7 @@ dependencies:
# build dependencies
- versioneer
- - cython>=0.29.33
+ - cython<4.0.0a0
- meson=1.2.1
- meson-python=0.13.1
diff --git a/doc/cheatsheet/Pandas_Cheat_Sheet.pdf b/doc/cheatsheet/Pandas_Cheat_Sheet.pdf
index 3582e0c0dabf9..33fbf2507ed62 100644
Binary files a/doc/cheatsheet/Pandas_Cheat_Sheet.pdf and b/doc/cheatsheet/Pandas_Cheat_Sheet.pdf differ
diff --git a/doc/cheatsheet/Pandas_Cheat_Sheet.pptx b/doc/cheatsheet/Pandas_Cheat_Sheet.pptx
index 746f508516964..5ce2e3be48d55 100644
Binary files a/doc/cheatsheet/Pandas_Cheat_Sheet.pptx and b/doc/cheatsheet/Pandas_Cheat_Sheet.pptx differ
diff --git a/doc/cheatsheet/README.md b/doc/cheatsheet/README.md
index b8599acff2f6e..b72c093b4ba2f 100644
--- a/doc/cheatsheet/README.md
+++ b/doc/cheatsheet/README.md
@@ -12,7 +12,7 @@ This cheat sheet, originally written by Irv Lustig, [Princeton Consultants](http
| Pandas_Cheat_Sheet_JA | Japanese |
|
|
| Pandas_Cheat_Sheet_FA | Persian |
|
|
-
+The English version has additional material that is not in the versions in other languages.
**Alternative**
diff --git a/doc/source/_static/css/getting_started.css b/doc/source/_static/css/getting_started.css
index b02311eb66080..55141f8955066 100644
--- a/doc/source/_static/css/getting_started.css
+++ b/doc/source/_static/css/getting_started.css
@@ -249,6 +249,7 @@ ul.task-bullet > li > p:first-child {
.tutorial-card .card-header {
--bs-card-cap-color: var(--pst-color-text-base);
+ color: var(--pst-color-text-base);
cursor: pointer;
background-color: var(--pst-color-surface);
border: 1px solid var(--pst-color-border)
@@ -256,6 +257,7 @@ ul.task-bullet > li > p:first-child {
.tutorial-card .card-body {
background-color: var(--pst-color-on-background);
+ color: var(--pst-color-text-base);
}
.tutorial-card .badge {
diff --git a/doc/source/development/contributing_environment.rst b/doc/source/development/contributing_environment.rst
index 98bd4b00d016b..d7b779debcd5e 100644
--- a/doc/source/development/contributing_environment.rst
+++ b/doc/source/development/contributing_environment.rst
@@ -251,7 +251,7 @@ This option allows you to configure where meson stores your built C extensions,
Sometimes, it might be useful to compile pandas with debugging symbols, when debugging C extensions.
Appending ``-Csetup-args="-Ddebug=true"`` will do the trick.
-With pip, it is possible to chain together multiple config settings (for example specifying both a build directory
+With pip, it is possible to chain together multiple config settings. For example, specifying both a build directory
and building with debug symbols would look like
``-Cbuilddir="your builddir here" -Csetup-args="-Dbuildtype=debug"``.
diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst
index 1d651ac570d8b..1589fea5f8953 100644
--- a/doc/source/getting_started/install.rst
+++ b/doc/source/getting_started/install.rst
@@ -299,16 +299,16 @@ Dependency Minimum Versi
Other data sources
^^^^^^^^^^^^^^^^^^
-Installable with ``pip install "pandas[hdf5, parquet, feather, spss, excel]"``
+Installable with ``pip install "pandas[hdf5, parquet, iceberg, feather, spss, excel]"``
====================================================== ================== ================ ==========================================================
Dependency Minimum Version pip extra Notes
====================================================== ================== ================ ==========================================================
`PyTables `__ 3.8.0 hdf5 HDF5-based reading / writing
-`blosc `__ 1.21.3 hdf5 Compression for HDF5; only available on ``conda``
`zlib `__ hdf5 Compression for HDF5
`fastparquet `__ 2024.2.0 - Parquet reading / writing (pyarrow is default)
`pyarrow `__ 10.0.1 parquet, feather Parquet, ORC, and feather reading / writing
+`PyIceberg `__ 0.7.1 iceberg Apache Iceberg reading / writing
`pyreadstat `__ 1.2.6 spss SPSS files (.sav) reading
`odfpy `__ 1.4.1 excel Open document format (.odf, .ods, .odt) reading / writing
====================================================== ================== ================ ==========================================================
@@ -329,10 +329,10 @@ Installable with ``pip install "pandas[fss, aws, gcp]"``
============================================ ================== =============== ==========================================================
Dependency Minimum Version pip extra Notes
============================================ ================== =============== ==========================================================
-`fsspec `__ 2024.2.0 fss, gcp, aws Handling files aside from simple local and HTTP (required
+`fsspec `__ 2023.12.2 fss, gcp, aws Handling files aside from simple local and HTTP (required
dependency of s3fs, gcsfs).
-`gcsfs `__ 2024.2.0 gcp Google Cloud Storage access
-`s3fs `__ 2024.2.0 aws Amazon S3 access
+`gcsfs `__ 2023.12.2 gcp Google Cloud Storage access
+`s3fs `__ 2023.12.2 aws Amazon S3 access
============================================ ================== =============== ==========================================================
Clipboard
diff --git a/doc/source/getting_started/intro_tutorials/includes/titanic.rst b/doc/source/getting_started/intro_tutorials/includes/titanic.rst
index 6e03b848aab06..41159516200fa 100644
--- a/doc/source/getting_started/intro_tutorials/includes/titanic.rst
+++ b/doc/source/getting_started/intro_tutorials/includes/titanic.rst
@@ -11,7 +11,7 @@ This tutorial uses the Titanic data set, stored as CSV. The data
consists of the following data columns:
- PassengerId: Id of every passenger.
-- Survived: Indication whether passenger survived. ``0`` for yes and ``1`` for no.
+- Survived: Indication whether passenger survived. ``0`` for no and ``1`` for yes.
- Pclass: One out of the 3 ticket classes: Class ``1``, Class ``2`` and Class ``3``.
- Name: Name of passenger.
- Sex: Gender of passenger.
diff --git a/doc/source/reference/arrays.rst b/doc/source/reference/arrays.rst
index 5be08f163e6ce..d37eebef5c0c0 100644
--- a/doc/source/reference/arrays.rst
+++ b/doc/source/reference/arrays.rst
@@ -664,6 +664,7 @@ Data type introspection
api.types.is_datetime64_dtype
api.types.is_datetime64_ns_dtype
api.types.is_datetime64tz_dtype
+ api.types.is_dtype_equal
api.types.is_extension_array_dtype
api.types.is_float_dtype
api.types.is_int64_dtype
diff --git a/doc/source/reference/io.rst b/doc/source/reference/io.rst
index 805fb8b783459..37d9e7f6b7dbd 100644
--- a/doc/source/reference/io.rst
+++ b/doc/source/reference/io.rst
@@ -156,6 +156,16 @@ Parquet
read_parquet
DataFrame.to_parquet
+Iceberg
+~~~~~~~
+.. autosummary::
+ :toctree: api/
+
+ read_iceberg
+ DataFrame.to_iceberg
+
+.. warning:: ``read_iceberg`` is experimental and may change without warning.
+
ORC
~~~
.. autosummary::
diff --git a/doc/source/user_guide/10min.rst b/doc/source/user_guide/10min.rst
index 72bb93d21a99f..8beaa73090673 100644
--- a/doc/source/user_guide/10min.rst
+++ b/doc/source/user_guide/10min.rst
@@ -178,12 +178,26 @@ Getitem (``[]``)
~~~~~~~~~~~~~~~~
For a :class:`DataFrame`, passing a single label selects a column and
-yields a :class:`Series` equivalent to ``df.A``:
+yields a :class:`Series`:
.. ipython:: python
df["A"]
+If the label only contains letters, numbers, and underscores, and does not start with a
+number, you can alternatively use the column name attribute:
+
+.. ipython:: python
+
+ df.A
+
+Passing a list of column labels selects multiple columns, which can be useful
+for getting a subset/rearranging:
+
+.. ipython:: python
+
+ df[["B", "A"]]
+
For a :class:`DataFrame`, passing a slice ``:`` selects matching rows:
.. ipython:: python
diff --git a/doc/source/user_guide/index.rst b/doc/source/user_guide/index.rst
index f0d6a76f0de5b..230b2b86b2ffd 100644
--- a/doc/source/user_guide/index.rst
+++ b/doc/source/user_guide/index.rst
@@ -78,6 +78,7 @@ Guides
boolean
visualization
style
+ user_defined_functions
groupby
window
timeseries
diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst
index ed5c7806b2e23..605f9501c5b23 100644
--- a/doc/source/user_guide/indexing.rst
+++ b/doc/source/user_guide/indexing.rst
@@ -325,7 +325,7 @@ The ``.loc`` attribute is the primary access method. The following are valid inp
* A single label, e.g. ``5`` or ``'a'`` (Note that ``5`` is interpreted as a *label* of the index. This use is **not** an integer position along the index.).
* A list or array of labels ``['a', 'b', 'c']``.
-* A slice object with labels ``'a':'f'`` (Note that contrary to usual Python
+* A slice object with labels ``'a':'f'``. Note that contrary to usual Python
slices, **both** the start and the stop are included, when present in the
index! See :ref:`Slicing with labels `.
* A boolean array.
@@ -1461,16 +1461,33 @@ Looking up values by index/column labels
Sometimes you want to extract a set of values given a sequence of row labels
and column labels, this can be achieved by ``pandas.factorize`` and NumPy indexing.
-For instance:
-.. ipython:: python
+For heterogeneous column types, we subset columns to avoid unnecessary NumPy conversions:
+
+.. code-block:: python
+
+ def pd_lookup_het(df, row_labels, col_labels):
+ rows = df.index.get_indexer(row_labels)
+ cols = df.columns.get_indexer(col_labels)
+ sub = df.take(np.unique(cols), axis=1)
+ sub = sub.take(np.unique(rows), axis=0)
+ rows = sub.index.get_indexer(row_labels)
+ values = sub.melt()["value"]
+ cols = sub.columns.get_indexer(col_labels)
+ flat_index = rows + cols * len(sub)
+ result = values[flat_index]
+ return result
+
+For homogeneous column types, it is fastest to skip column subsetting and go directly to NumPy:
+
+.. code-block:: python
- df = pd.DataFrame({'col': ["A", "A", "B", "B"],
- 'A': [80, 23, np.nan, 22],
- 'B': [80, 55, 76, 67]})
- df
- idx, cols = pd.factorize(df['col'])
- df.reindex(cols, axis=1).to_numpy()[np.arange(len(df)), idx]
+ def pd_lookup_hom(df, row_labels, col_labels):
+ rows = df.index.get_indexer(row_labels)
+ df = df.loc[:, sorted(set(col_labels))]
+ cols = df.columns.get_indexer(col_labels)
+ result = df.to_numpy()[rows, cols]
+ return result
Formerly this could be achieved with the dedicated ``DataFrame.lookup`` method
which was deprecated in version 1.2.0 and removed in version 2.0.0.
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index 23da52f26358f..25f1e11e6b603 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -26,9 +26,10 @@ The pandas I/O API is a set of top level ``reader`` functions accessed like
text, Local clipboard, :ref:`read_clipboard`, :ref:`to_clipboard`
binary,`MS Excel `__ , :ref:`read_excel`, :ref:`to_excel`
binary,`OpenDocument `__, :ref:`read_excel`, NA
- binary,`HDF5 Format `__, :ref:`read_hdf`, :ref:`to_hdf`
+ binary,`HDF5 Format `__, :ref:`read_hdf`, :ref:`to_hdf`
binary,`Feather Format `__, :ref:`read_feather`, :ref:`to_feather`
binary,`Parquet Format `__, :ref:`read_parquet`, :ref:`to_parquet`
+ binary,`Apache Iceberg `__, :ref:`read_iceberg` , :ref:`to_iceberg`
binary,`ORC Format `__, :ref:`read_orc`, :ref:`to_orc`
binary,`Stata `__, :ref:`read_stata`, :ref:`to_stata`
binary,`SAS `__, :ref:`read_sas` , NA
@@ -1414,7 +1415,7 @@ of multi-columns indices.
.. note::
If an ``index_col`` is not specified (e.g. you don't have an index, or wrote it
- with ``df.to_csv(..., index=False)``, then any ``names`` on the columns index will
+ with ``df.to_csv(..., index=False)``), then any ``names`` on the columns index will
be *lost*.
.. ipython:: python
@@ -5403,6 +5404,125 @@ The above example creates a partitioned dataset that may look like:
except OSError:
pass
+.. _io.iceberg:
+
+Iceberg
+-------
+
+.. versionadded:: 3.0.0
+
+Apache Iceberg is a high-performance, open-source format for large analytic tables.
+Iceberg enables the use of SQL tables for big data while making it possible for different
+engines to safely work with the same tables at the same time.
+
+Iceberg supports predicate pushdown and column pruning, which are available to pandas
+users via the ``row_filter`` and ``selected_fields`` parameters of the :func:`~pandas.read_iceberg`
+function. This makes it convenient to extract from large tables a subset that fits in memory as a
+pandas ``DataFrame``.
+
+Internally, pandas uses PyIceberg_ to query Iceberg.
+
+.. _PyIceberg: https://py.iceberg.apache.org/
+
+A simple example loading all data from an Iceberg table ``my_table`` defined in the
+``my_catalog`` catalog:
+
+.. code-block:: python
+
+ df = pd.read_iceberg("my_table", catalog_name="my_catalog")
+
+Catalogs must be defined in the ``.pyiceberg.yaml`` file, usually in the home directory.
+It is possible to change properties of the catalog definition with the
+``catalog_properties`` parameter:
+
+.. code-block:: python
+
+ df = pd.read_iceberg(
+ "my_table",
+ catalog_name="my_catalog",
+ catalog_properties={"s3.secret-access-key": "my_secret"},
+ )
+
+It is also possible to fully specify the catalog in ``catalog_properties`` and not provide
+a ``catalog_name``:
+
+.. code-block:: python
+
+ df = pd.read_iceberg(
+ "my_table",
+ catalog_properties={
+ "uri": "http://127.0.0.1:8181",
+ "s3.endpoint": "http://127.0.0.1:9000",
+ },
+ )
+
+To create the ``DataFrame`` with only a subset of the columns:
+
+.. code-block:: python
+
+ df = pd.read_iceberg(
+ "my_table",
+ catalog_name="my_catalog",
+ selected_fields=["my_column_3", "my_column_7"]
+ )
+
+This will execute faster, since the other columns won't be read, and it will also save
+memory, since the data from the other columns won't be loaded into the underlying memory of
+the ``DataFrame``.
+
+To fetch only a subset of the rows, use the ``limit`` parameter:
+
+.. code-block:: python
+
+ df = pd.read_iceberg(
+ "my_table",
+ catalog_name="my_catalog",
+ limit=100,
+ )
+
+This will create a ``DataFrame`` with 100 rows, assuming the table has at least that many.
+
+To fetch a subset of the rows based on a condition, use the ``row_filter`` parameter:
+
+.. code-block:: python
+
+ df = pd.read_iceberg(
+ "my_table",
+ catalog_name="my_catalog",
+ row_filter="distance > 10.0",
+ )
+
+Reading a particular snapshot is also possible by providing its ID via the ``snapshot_id``
+parameter.
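+
+For illustration, reading from a specific snapshot could look like the following (the
+snapshot ID shown is only a placeholder):
+
+.. code-block:: python
+
+    df = pd.read_iceberg(
+        "my_table",
+        catalog_name="my_catalog",
+        snapshot_id=3051729675574597004,
+    )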
+
+To save a ``DataFrame`` to Iceberg, use the :meth:`DataFrame.to_iceberg` method:
+
+.. code-block:: python
+
+ df.to_iceberg("my_table", catalog_name="my_catalog")
+
+Specifying the catalog works in the same way as for :func:`read_iceberg`, with the
+``catalog_name`` and ``catalog_properties`` parameters.
+
+The location of the table can be specified with the ``location`` parameter:
+
+.. code-block:: python
+
+ df.to_iceberg(
+ "my_table",
+ catalog_name="my_catalog",
+        location="s3://my-data-lake/my-iceberg-tables",
+ )
+
+It is possible to add properties to the table snapshot by passing a dictionary to the
+``snapshot_properties`` parameter.
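+
+For illustration, this could look like the following (the property key and value are
+placeholders):
+
+.. code-block:: python
+
+    df.to_iceberg(
+        "my_table",
+        catalog_name="my_catalog",
+        snapshot_properties={"written-by": "my-pipeline"},
+    )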
+
+More information about the Iceberg format can be found in the `Apache Iceberg official
+page `__.
+
.. _io.orc:
ORC
diff --git a/doc/source/user_guide/merging.rst b/doc/source/user_guide/merging.rst
index e96d18be8a0c5..af377dd7a32f2 100644
--- a/doc/source/user_guide/merging.rst
+++ b/doc/source/user_guide/merging.rst
@@ -957,7 +957,7 @@ location.
:func:`merge_ordered`
---------------------
-:func:`merge_ordered` combines order data such as numeric or time series data
+:func:`merge_ordered` combines ordered data such as numeric or time series data
with optional filling of missing data with ``fill_method``.
.. ipython:: python
diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst
index e15939eb49239..56f4c80cbde16 100644
--- a/doc/source/user_guide/missing_data.rst
+++ b/doc/source/user_guide/missing_data.rst
@@ -258,9 +258,6 @@ will convert your data to use the nullable data types supporting :class:`NA`,
such as :class:`Int64Dtype` or :class:`ArrowDtype`. This is especially helpful after reading
in data sets from IO methods where data types were inferred.
-In this example, while the dtypes of all columns are changed, we show the results for
-the first 10 columns.
-
.. ipython:: python
import io
@@ -434,7 +431,7 @@ where the index and column aligns between the original object and the filled obj
.. note::
- :meth:`DataFrame.where` can also be used to fill NA values.Same result as above.
+ :meth:`DataFrame.where` can also be used to fill NA values. Same result as above.
.. ipython:: python
diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst
index 8c5e98791a9ef..bc5a2d5ed5735 100644
--- a/doc/source/user_guide/reshaping.rst
+++ b/doc/source/user_guide/reshaping.rst
@@ -395,7 +395,7 @@ variables and the values representing the presence of those variables per row.
pd.get_dummies(df["key"])
df["key"].str.get_dummies()
-``prefix`` adds a prefix to the the column names which is useful for merging the result
+``prefix`` adds a prefix to the column names which is useful for merging the result
with the original :class:`DataFrame`:
.. ipython:: python
diff --git a/doc/source/user_guide/sparse.rst b/doc/source/user_guide/sparse.rst
index 25bcb8bcc0c93..624086f7a8505 100644
--- a/doc/source/user_guide/sparse.rst
+++ b/doc/source/user_guide/sparse.rst
@@ -40,8 +40,8 @@ and in the Python interpreter.
.. ipython:: python
- 'dense : {:0.2f} bytes'.format(df.memory_usage().sum() / 1e3)
- 'sparse: {:0.2f} bytes'.format(sdf.memory_usage().sum() / 1e3)
+ f'dense: {df.memory_usage().sum()} bytes'
+ f'sparse: {sdf.memory_usage().sum()} bytes'
Functionally, their behavior should be nearly
identical to their dense counterparts.
diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst
index 10260cb011d90..ac0fc9e53ee94 100644
--- a/doc/source/user_guide/timeseries.rst
+++ b/doc/source/user_guide/timeseries.rst
@@ -2458,7 +2458,7 @@ you can use the ``tz_convert`` method.
For ``pytz`` time zones, it is incorrect to pass a time zone object directly into
the ``datetime.datetime`` constructor
- (e.g., ``datetime.datetime(2011, 1, 1, tzinfo=pytz.timezone('US/Eastern'))``.
+ (e.g., ``datetime.datetime(2011, 1, 1, tzinfo=pytz.timezone('US/Eastern'))``).
Instead, the datetime needs to be localized using the ``localize`` method
on the ``pytz`` time zone object.
diff --git a/doc/source/user_guide/user_defined_functions.rst b/doc/source/user_guide/user_defined_functions.rst
new file mode 100644
index 0000000000000..6f7fdaddac622
--- /dev/null
+++ b/doc/source/user_guide/user_defined_functions.rst
@@ -0,0 +1,419 @@
+.. _udf:
+
+{{ header }}
+
+*****************************
+User-Defined Functions (UDFs)
+*****************************
+
+In pandas, User-Defined Functions (UDFs) provide a way to extend the library’s
+functionality by allowing users to apply custom computations to their data. While
+pandas comes with a set of built-in functions for data manipulation, UDFs offer
+flexibility when built-in methods are not sufficient. These functions can be
+applied at different levels: element-wise, row-wise, column-wise, or group-wise,
+and behave differently depending on the method used.
+
+Here’s a simple example to illustrate a UDF applied to a Series:
+
+.. ipython:: python
+
+ s = pd.Series([1, 2, 3])
+
+ # Simple UDF that adds 1 to a value
+ def add_one(x):
+ return x + 1
+
+ # Apply the function element-wise using .map
+ s.map(add_one)
+
+Why Not To Use User-Defined Functions
+-------------------------------------
+
+While UDFs provide flexibility, they come with significant drawbacks, primarily
+related to performance and behavior. When using UDFs, pandas must perform inference
+on the result, and that inference could be incorrect. Furthermore, unlike vectorized operations,
+UDFs are slower because pandas can't optimize their computations, leading to
+inefficient processing.
+
+.. note::
+ In general, most tasks can and should be accomplished using pandas’ built-in methods or vectorized operations.
+
+Despite their drawbacks, UDFs can be helpful when:
+
+* **Custom Computations Are Needed**: Implementing complex logic or domain-specific calculations that pandas'
+ built-in methods cannot handle.
+* **Extending pandas' Functionality**: Applying external libraries or specialized algorithms unavailable in pandas.
+* **Handling Complex Grouped Operations**: Performing operations on grouped data that standard methods do not support.
+
+For example:
+
+.. code-block:: python
+
+ from sklearn.linear_model import LinearRegression
+
+ # Sample data
+ df = pd.DataFrame({
+ 'group': ['A', 'A', 'A', 'B', 'B', 'B'],
+ 'x': [1, 2, 3, 1, 2, 3],
+ 'y': [2, 4, 6, 1, 2, 1.5]
+ })
+
+ # Function to fit a model to each group
+ def fit_model(group):
+ model = LinearRegression()
+ model.fit(group[['x']], group['y'])
+ group['y_pred'] = model.predict(group[['x']])
+ return group
+
+ result = df.groupby('group').apply(fit_model)
+
+
+Methods that support User-Defined Functions
+-------------------------------------------
+
+User-Defined Functions can be applied across various pandas methods:
+
++-------------------------------+------------------------+--------------------------+----------------------------------------------------------------------------------------------------------------------------------------------+
+| Method | Function Input | Function Output | Description |
++===============================+========================+==========================+==============================================================================================================================================+
+| :ref:`udf.map` | Scalar | Scalar | Apply a function to each element |
++-------------------------------+------------------------+--------------------------+----------------------------------------------------------------------------------------------------------------------------------------------+
+| :ref:`udf.apply` (axis=0) | Column (Series) | Column (Series) | Apply a function to each column |
++-------------------------------+------------------------+--------------------------+----------------------------------------------------------------------------------------------------------------------------------------------+
+| :ref:`udf.apply` (axis=1) | Row (Series) | Row (Series) | Apply a function to each row |
++-------------------------------+------------------------+--------------------------+----------------------------------------------------------------------------------------------------------------------------------------------+
+| :ref:`udf.pipe` | Series or DataFrame | Series or DataFrame | Chain functions together to apply to Series or Dataframe |
++-------------------------------+------------------------+--------------------------+----------------------------------------------------------------------------------------------------------------------------------------------+
+| :ref:`udf.filter` | Series or DataFrame | Boolean | Only accepts UDFs in group by. Function is called for each group, and the group is removed from the result if the function returns ``False`` |
++-------------------------------+------------------------+--------------------------+----------------------------------------------------------------------------------------------------------------------------------------------+
+| :ref:`udf.agg` | Series or DataFrame | Scalar or Series | Aggregate and summarizes values, e.g., sum or custom reducer |
++-------------------------------+------------------------+--------------------------+----------------------------------------------------------------------------------------------------------------------------------------------+
+| :ref:`udf.transform` (axis=0) | Column (Series) | Column (Series) | Same as :meth:`apply` with (axis=0), but it raises an exception if the function changes the shape of the data |
++-------------------------------+------------------------+--------------------------+----------------------------------------------------------------------------------------------------------------------------------------------+
+| :ref:`udf.transform` (axis=1) | Row (Series) | Row (Series) | Same as :meth:`apply` with (axis=1), but it raises an exception if the function changes the shape of the data |
++-------------------------------+------------------------+--------------------------+----------------------------------------------------------------------------------------------------------------------------------------------+
+
+When applying UDFs in pandas, it is essential to select the appropriate method based
+on your specific task. Each method has its strengths and is designed for different use
+cases. Understanding the purpose and behavior of each method will help you make informed
+decisions, ensuring more efficient and maintainable code.
+
+.. note::
+    Some of these methods can also be applied to groupby, resample, and various window objects.
+ See :ref:`groupby`, :ref:`resample()`, :ref:`rolling()`, :ref:`expanding()`,
+ and :ref:`ewm()` for details.
+
+
+.. _udf.map:
+
+:meth:`Series.map` and :meth:`DataFrame.map`
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The :meth:`map` method is used specifically to apply element-wise UDFs. This means the function
+will be called for each element in the ``Series`` or ``DataFrame``, with the individual value or
+the cell as the function argument.
+
+.. ipython:: python
+
+ temperature_celsius = pd.DataFrame({
+ "NYC": [14, 21, 23],
+ "Los Angeles": [22, 28, 31],
+ })
+
+ def to_fahrenheit(value):
+ return value * (9 / 5) + 32
+
+ temperature_celsius.map(to_fahrenheit)
+
+In this example, the function ``to_fahrenheit`` will be called 6 times, once for each value
+in the ``DataFrame``, and the result of each call will be placed in the corresponding cell
+of the resulting ``DataFrame``.
+
+In general, ``map`` will be slow, as it will not make use of vectorization. Instead, a Python
+function call is required for each value, which slows things down significantly when
+working with medium or large data.
+
+When to use: Use :meth:`map` for applying element-wise UDFs to DataFrames or Series.
+
+.. _udf.apply:
+
+:meth:`Series.apply` and :meth:`DataFrame.apply`
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The :meth:`apply` method allows you to apply UDFs for a whole column or row. This is different
+from :meth:`map` in that the function will be called for each column (or row), not for each individual value.
+
+.. ipython:: python
+
+ temperature_celsius = pd.DataFrame({
+ "NYC": [14, 21, 23],
+ "Los Angeles": [22, 28, 31],
+ })
+
+ def to_fahrenheit(column):
+ return column * (9 / 5) + 32
+
+ temperature_celsius.apply(to_fahrenheit)
+
+In the example, ``to_fahrenheit`` will be called only twice, as opposed to the 6 times with :meth:`map`.
+This will be faster than using :meth:`map`, since the operations for each column are vectorized, and the
+overhead of iterating over data in Python and calling Python functions is significantly reduced.
+
+In some cases, the function may require all the data to compute the result, so :meth:`apply`
+is needed, since with :meth:`map` the function can only access one element at a time.
+
+.. ipython:: python
+
+ temperature = pd.DataFrame({
+ "NYC": [14, 21, 23],
+ "Los Angeles": [22, 28, 31],
+ })
+
+ def normalize(column):
+ return column / column.mean()
+
+ temperature.apply(normalize)
+
+In the example, the ``normalize`` function needs to compute the mean of the whole column in order
+to divide each element by it. So, we cannot call the function for each element, but we need the
+function to receive the whole column.
+
+:meth:`apply` can also execute a function by row, by specifying ``axis=1``.
+
+.. ipython:: python
+
+ temperature = pd.DataFrame({
+ "NYC": [14, 21, 23],
+ "Los Angeles": [22, 28, 31],
+ })
+
+ def hotter(row):
+ return row["Los Angeles"] - row["NYC"]
+
+ temperature.apply(hotter, axis=1)
+
+In the example, the function ``hotter`` will be called 3 times, once for each row, and each
+call will receive the whole row as its argument, allowing computations that require more than
+one value in the row.
+
+``apply`` is also available for :meth:`SeriesGroupBy.apply`, :meth:`DataFrameGroupBy.apply`,
+:meth:`Rolling.apply`, :meth:`Expanding.apply` and :meth:`Resampler.apply`. You can read more
+about ``apply`` in groupby operations in :ref:`groupby.apply`.
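+
+For illustration, here is a minimal sketch of a UDF used with :meth:`Rolling.apply`; the window
+size is arbitrary and the data is the same small example frame used above:
+
+.. code-block:: python
+
+    temperature = pd.DataFrame({
+        "NYC": [14, 21, 23],
+        "Los Angeles": [22, 28, 31],
+    })
+
+    def temperature_range(window):
+        # Each call receives one rolling window as a Series and must return a scalar
+        return window.max() - window.min()
+
+    # Difference between the warmest and coldest day in each 2-day window
+    temperature.rolling(2).apply(temperature_range)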
+
+When to use: :meth:`apply` is suitable when no alternative vectorized method or UDF method is available,
+but consider optimizing performance with vectorized operations wherever possible.
+
+.. _udf.pipe:
+
+:meth:`Series.pipe` and :meth:`DataFrame.pipe`
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The ``pipe`` method is similar to ``map`` and ``apply``, but the function receives the whole ``Series``
+or ``DataFrame`` it is called on.
+
+.. ipython:: python
+
+ temperature = pd.DataFrame({
+ "NYC": [14, 21, 23],
+ "Los Angeles": [22, 28, 31],
+ })
+
+ def normalize(df):
+ return df / df.mean().mean()
+
+ temperature.pipe(normalize)
+
+This is equivalent to calling the ``normalize`` function with the ``DataFrame`` as the parameter.
+
+.. ipython:: python
+
+ normalize(temperature)
+
+The main advantage of using ``pipe`` is readability. It allows method chaining and clearer code when
+calling multiple functions.
+
+.. ipython:: python
+
+ temperature_celsius = pd.DataFrame({
+ "NYC": [14, 21, 23],
+ "Los Angeles": [22, 28, 31],
+ })
+
+ def multiply_by_9(value):
+ return value * 9
+
+ def divide_by_5(value):
+ return value / 5
+
+ def add_32(value):
+ return value + 32
+
+ # Without `pipe`:
+ fahrenheit = add_32(divide_by_5(multiply_by_9(temperature_celsius)))
+
+ # With `pipe`:
+ fahrenheit = (temperature_celsius.pipe(multiply_by_9)
+ .pipe(divide_by_5)
+ .pipe(add_32))
+
+``pipe`` is also available for :meth:`SeriesGroupBy.pipe`, :meth:`DataFrameGroupBy.pipe` and
+:meth:`Resampler.pipe`. You can read more about ``pipe`` in groupby operations in :ref:`groupby.pipe`.
+
+When to use: Use :meth:`pipe` when you need to create a pipeline of operations and want to keep the code readable and maintainable.
+
+.. _udf.filter:
+
+:meth:`Series.filter` and :meth:`DataFrame.filter`
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The ``filter`` method is used to select a subset of the data that matches certain criteria.
+:meth:`Series.filter` and :meth:`DataFrame.filter` do not support user defined functions,
+but :meth:`SeriesGroupBy.filter` and :meth:`DataFrameGroupBy.filter` do. You can read more
+about ``filter`` in groupby operations in :ref:`groupby.filter`.
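+
+For illustration, here is a minimal sketch of a UDF used with :meth:`DataFrameGroupBy.filter`;
+the data is made up for the example:
+
+.. code-block:: python
+
+    temperature = pd.DataFrame({
+        "city": ["NYC", "NYC", "Los Angeles", "Los Angeles"],
+        "temp": [14, 21, 22, 28],
+    })
+
+    def is_warm_city(group):
+        # Keep only the groups (cities) whose mean temperature is above 20 degrees
+        return group["temp"].mean() > 20
+
+    temperature.groupby("city").filter(is_warm_city)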
+
+.. _udf.agg:
+
+:meth:`Series.agg` and :meth:`DataFrame.agg`
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The ``agg`` method is used to aggregate a set of data points into a single one.
+The most common aggregation functions, such as ``min``, ``max``, ``mean`` and ``sum``,
+are already implemented in pandas. ``agg`` allows you to implement other custom aggregation
+functions.
+
+.. ipython:: python
+
+ temperature = pd.DataFrame({
+ "NYC": [14, 21, 23],
+ "Los Angeles": [22, 28, 31],
+ })
+
+ def highest_jump(column):
+ return column.pct_change().max()
+
+ temperature.agg(highest_jump)
+
+
+When to use: Use :meth:`agg` for performing custom aggregations, where the operation returns
+a scalar value for each input.
+
+.. _udf.transform:
+
+:meth:`Series.transform` and :meth:`DataFrame.transform`
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The ``transform`` method is similar to an aggregation, with the difference that the result is
+broadcast back to the original data.
+
+.. ipython:: python
+
+ temperature = pd.DataFrame({
+ "NYC": [14, 21, 23],
+ "Los Angeles": [22, 28, 31]},
+ index=pd.date_range("2000-01-01", "2000-01-03"))
+
+ def warm_up_all_days(column):
+ return pd.Series(column.max(), index=column.index)
+
+ temperature.transform(warm_up_all_days)
+
+In the example, the ``warm_up_all_days`` function computes the ``max`` like an aggregation, but instead
+of returning just the maximum value, it returns a ``DataFrame`` with the same shape as the original one,
+with the value for each day replaced by the maximum temperature of the city.
+
+``transform`` is also available for :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.transform` and
+:meth:`Resampler.transform`, where it's more common. You can read more about ``transform`` in groupby
+operations in :ref:`groupby.transform`.
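+
+For illustration, here is a minimal sketch of a UDF used with :meth:`DataFrameGroupBy.transform`;
+the data is made up for the example:
+
+.. code-block:: python
+
+    temperature = pd.DataFrame({
+        "city": ["NYC", "NYC", "Los Angeles", "Los Angeles"],
+        "temp": [14, 21, 22, 28],
+    })
+
+    def warmest_in_city(temp):
+        # Aggregate each group to its maximum; transform broadcasts it back to every row
+        return temp.max()
+
+    temperature.groupby("city")["temp"].transform(warmest_in_city)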
+
+When to use: Use :meth:`transform` when you need to perform an aggregation whose result is
+broadcast back to the original structure of the DataFrame.
+
+
+Performance
+-----------
+
+While UDFs provide flexibility, their use is generally discouraged as they can introduce
+performance issues, especially when written in pure Python. To improve efficiency,
+consider using built-in ``NumPy`` or ``pandas`` functions instead of UDFs
+for common operations.
+
+.. note::
+ If performance is critical, explore **vectorized operations** before resorting
+ to UDFs.
+
+Vectorized Operations
+~~~~~~~~~~~~~~~~~~~~~
+
+Below is a comparison of using a UDF versus using vectorized operations:
+
+.. code-block:: python
+
+ # User-defined function
+ def calc_ratio(row):
+ return 100 * (row["one"] / row["two"])
+
+ df["new_col"] = df.apply(calc_ratio, axis=1)
+
+ # Vectorized Operation
+ df["new_col2"] = 100 * (df["one"] / df["two"])
+
+Measuring how long each operation takes:
+
+.. code-block:: text
+
+ User-defined function: 5.6435 secs
+ Vectorized: 0.0043 secs
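+
+The exact numbers depend on the size of the data and on the hardware. A minimal sketch of how
+such a comparison could be measured is shown below; the frame used here is an assumption, not
+necessarily the one timed above:
+
+.. code-block:: python
+
+    import timeit
+
+    import numpy as np
+    import pandas as pd
+
+    df = pd.DataFrame(
+        np.random.default_rng(0).random((1_000_000, 2)), columns=["one", "two"]
+    )
+
+    def calc_ratio(row):
+        return 100 * (row["one"] / row["two"])
+
+    udf_time = timeit.timeit(lambda: df.apply(calc_ratio, axis=1), number=1)
+    vectorized_time = timeit.timeit(lambda: 100 * (df["one"] / df["two"]), number=1)
+    print(f"User-defined function: {udf_time:.4f} secs")
+    print(f"Vectorized: {vectorized_time:.4f} secs")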
+
+Vectorized operations in pandas are significantly faster than using :meth:`DataFrame.apply`
+with UDFs because they leverage highly optimized C functions
+via ``NumPy`` to process entire arrays at once. This approach avoids the overhead of looping
+through rows in Python and making separate function calls for each row, which is slow and
+inefficient. Additionally, ``NumPy`` arrays benefit from memory efficiency and CPU-level
+optimizations, making vectorized operations the preferred choice whenever possible.
+
+
+Improving Performance with UDFs
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In scenarios where UDFs are necessary, there are still ways to mitigate their performance drawbacks.
+One approach is to use **Numba**, a Just-In-Time (JIT) compiler that can significantly speed up numerical
+Python code by compiling Python functions to optimized machine code at runtime.
+
+By annotating your UDFs with ``@numba.jit``, you can achieve performance closer to vectorized operations,
+especially for computationally heavy tasks.
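+
+A minimal sketch of this pattern is shown below, assuming Numba is installed; the function and
+data are illustrative only:
+
+.. code-block:: python
+
+    import numba
+    import numpy as np
+    import pandas as pd
+
+    @numba.jit(nopython=True)
+    def calc_ratio_numba(one, two):
+        # Plain NumPy arrays and an explicit loop compile well with Numba
+        result = np.empty(len(one))
+        for i in range(len(one)):
+            result[i] = 100 * (one[i] / two[i])
+        return result
+
+    df = pd.DataFrame({"one": [1.0, 2.0, 3.0], "two": [4.0, 5.0, 6.0]})
+    df["ratio"] = calc_ratio_numba(df["one"].to_numpy(), df["two"].to_numpy())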
+
+.. note::
+ You may also refer to the user guide on `Enhancing performance `_
+ for a more detailed guide to using **Numba**.
+
+Using :meth:`DataFrame.pipe` for Composable Logic
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Another useful pattern for improving readability and composability, especially when mixing
+vectorized logic with UDFs, is to use the :meth:`DataFrame.pipe` method.
+
+:meth:`DataFrame.pipe` doesn't improve performance directly, but it enables cleaner
+method chaining by passing the entire object into a function. This is especially helpful
+when chaining custom transformations:
+
+.. code-block:: python
+
+ def add_ratio_column(df):
+ df["ratio"] = 100 * (df["one"] / df["two"])
+ return df
+
+ df = (
+ df
+ .query("one > 0")
+ .pipe(add_ratio_column)
+ .dropna()
+ )
+
+This is functionally equivalent to calling ``add_ratio_column(df)``, but keeps your code
+clean and composable. The function you pass to :meth:`DataFrame.pipe` can use vectorized operations,
+row-wise UDFs, or any other logic; :meth:`DataFrame.pipe` is agnostic.
+
+.. note::
+ While :meth:`DataFrame.pipe` does not improve performance on its own,
+ it promotes clean, modular design and allows both vectorized and UDF-based logic
+ to be composed in method chains.
diff --git a/doc/source/whatsnew/v0.11.0.rst b/doc/source/whatsnew/v0.11.0.rst
index 28c9d46f21fd8..903632b488cca 100644
--- a/doc/source/whatsnew/v0.11.0.rst
+++ b/doc/source/whatsnew/v0.11.0.rst
@@ -70,7 +70,7 @@ See the section :ref:`Selection by Position ` for substitutes.
Dtypes
~~~~~~
-Numeric dtypes will propagate and can coexist in DataFrames. If a dtype is passed (either directly via the ``dtype`` keyword, a passed ``ndarray``, or a passed ``Series``, then it will be preserved in DataFrame operations. Furthermore, different numeric dtypes will **NOT** be combined. The following example will give you a taste.
+Numeric dtypes will propagate and can coexist in DataFrames. If a dtype is passed (either directly via the ``dtype`` keyword, a passed ``ndarray``, or a passed ``Series``), then it will be preserved in DataFrame operations. Furthermore, different numeric dtypes will **NOT** be combined. The following example will give you a taste.
.. ipython:: python
diff --git a/doc/source/whatsnew/v0.12.0.rst b/doc/source/whatsnew/v0.12.0.rst
index 08d3a6b188322..f2674938e7726 100644
--- a/doc/source/whatsnew/v0.12.0.rst
+++ b/doc/source/whatsnew/v0.12.0.rst
@@ -245,7 +245,7 @@ IO enhancements
format. (:issue:`3571`, :issue:`1651`, :issue:`3141`)
- If an ``index_col`` is not specified (e.g. you don't have an index, or wrote it
- with ``df.to_csv(..., index=False``), then any ``names`` on the columns index will
+ with ``df.to_csv(..., index=False)``), then any ``names`` on the columns index will
be *lost*.
.. ipython:: python
diff --git a/doc/source/whatsnew/v0.16.1.rst b/doc/source/whatsnew/v0.16.1.rst
index cbf5b7703bd79..b376530358f53 100644
--- a/doc/source/whatsnew/v0.16.1.rst
+++ b/doc/source/whatsnew/v0.16.1.rst
@@ -353,7 +353,7 @@ Deprecations
Index representation
~~~~~~~~~~~~~~~~~~~~
-The string representation of ``Index`` and its sub-classes have now been unified. These will show a single-line display if there are few values; a wrapped multi-line display for a lot of values (but less than ``display.max_seq_items``; if lots of items (> ``display.max_seq_items``) will show a truncated display (the head and tail of the data). The formatting for ``MultiIndex`` is unchanged (a multi-line wrapped display). The display width responds to the option ``display.max_seq_items``, which is defaulted to 100. (:issue:`6482`)
+The string representation of ``Index`` and its sub-classes have now been unified. These will show a single-line display if there are few values; a wrapped multi-line display for a lot of values (but less than ``display.max_seq_items``); if lots of items (> ``display.max_seq_items``) will show a truncated display (the head and tail of the data). The formatting for ``MultiIndex`` is unchanged (a multi-line wrapped display). The display width responds to the option ``display.max_seq_items``, which is defaulted to 100. (:issue:`6482`)
Previous behavior
diff --git a/doc/source/whatsnew/v0.19.0.rst b/doc/source/whatsnew/v0.19.0.rst
index 1ae711113773f..0b1f6a2249a6c 100644
--- a/doc/source/whatsnew/v0.19.0.rst
+++ b/doc/source/whatsnew/v0.19.0.rst
@@ -1547,7 +1547,7 @@ Bug fixes
- Bug in checking for any null objects in a ``TimedeltaIndex``, which always returned ``True`` (:issue:`13603`)
- Bug in ``Series`` arithmetic raises ``TypeError`` if it contains datetime-like as ``object`` dtype (:issue:`13043`)
- Bug ``Series.isnull()`` and ``Series.notnull()`` ignore ``Period('NaT')`` (:issue:`13737`)
-- Bug ``Series.fillna()`` and ``Series.dropna()`` don't affect to ``Period('NaT')`` (:issue:`13737`
+- Bug ``Series.fillna()`` and ``Series.dropna()`` don't affect to ``Period('NaT')`` (:issue:`13737`)
- Bug in ``.fillna(value=np.nan)`` incorrectly raises ``KeyError`` on a ``category`` dtyped ``Series`` (:issue:`14021`)
- Bug in extension dtype creation where the created types were not is/identical (:issue:`13285`)
- Bug in ``.resample(..)`` where incorrect warnings were triggered by IPython introspection (:issue:`13618`)
diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst
index 60e77a8c5d8c5..0f40f5bfa5fc9 100644
--- a/doc/source/whatsnew/v0.24.0.rst
+++ b/doc/source/whatsnew/v0.24.0.rst
@@ -1019,7 +1019,7 @@ operations has been changed to match the arithmetic operations in these cases.
The affected cases are:
- operating against a 2-dimensional ``np.ndarray`` with either 1 row or 1 column will now broadcast the same way a ``np.ndarray`` would (:issue:`23000`).
-- a list or tuple with length matching the number of rows in the :class:`DataFrame` will now raise ``ValueError`` instead of operating column-by-column (:issue:`22880`.
+- a list or tuple with length matching the number of rows in the :class:`DataFrame` will now raise ``ValueError`` instead of operating column-by-column (:issue:`22880`).
- a list or tuple with length matching the number of columns in the :class:`DataFrame` will now operate row-by-row instead of raising ``ValueError`` (:issue:`22880`).
.. ipython:: python
@@ -1556,7 +1556,7 @@ Performance improvements
(i.e. ``x in cat``-style checks are much faster). :meth:`CategoricalIndex.contains`
is likewise much faster (:issue:`21369`, :issue:`21508`)
- Improved performance of :meth:`HDFStore.groups` (and dependent functions like
- :meth:`HDFStore.keys`. (i.e. ``x in store`` checks are much faster)
+ :meth:`HDFStore.keys` (i.e. ``x in store`` checks) are much faster)
(:issue:`21372`)
- Improved the performance of :func:`pandas.get_dummies` with ``sparse=True`` (:issue:`21997`)
- Improved performance of :func:`IndexEngine.get_indexer_non_unique` for sorted, non-unique indexes (:issue:`9466`)
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index 98cb9c4ad7b45..1aac68b90ff2f 100755
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -1114,7 +1114,7 @@ Numeric
- Bug in :class:`UInt64Index` precision loss while constructing from a list with values in the ``np.uint64`` range (:issue:`29526`)
- Bug in :class:`NumericIndex` construction that caused indexing to fail when integers in the ``np.uint64`` range were used (:issue:`28023`)
- Bug in :class:`NumericIndex` construction that caused :class:`UInt64Index` to be casted to :class:`Float64Index` when integers in the ``np.uint64`` range were used to index a :class:`DataFrame` (:issue:`28279`)
-- Bug in :meth:`Series.interpolate` when using method=`index` with an unsorted index, would previously return incorrect results. (:issue:`21037`)
+- Bug in :meth:`Series.interpolate` when using ``method='index'`` with an unsorted index, would previously return incorrect results. (:issue:`21037`)
- Bug in :meth:`DataFrame.round` where a :class:`DataFrame` with a :class:`CategoricalIndex` of :class:`IntervalIndex` columns would incorrectly raise a ``TypeError`` (:issue:`30063`)
- Bug in :meth:`Series.pct_change` and :meth:`DataFrame.pct_change` when there are duplicated indices (:issue:`30463`)
- Bug in :class:`DataFrame` cumulative operations (e.g. cumsum, cummax) incorrect casting to object-dtype (:issue:`19296`)
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index b199b113d26f2..dff73bef79135 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -1039,7 +1039,7 @@ Missing
^^^^^^^
- Calling :meth:`fillna` on an empty :class:`Series` now correctly returns a shallow copied object. The behaviour is now consistent with :class:`Index`, :class:`DataFrame` and a non-empty :class:`Series` (:issue:`32543`).
- Bug in :meth:`Series.replace` when argument ``to_replace`` is of type dict/list and is used on a :class:`Series` containing ```` was raising a ``TypeError``. The method now handles this by ignoring ```` values when doing the comparison for the replacement (:issue:`32621`)
-- Bug in :meth:`~Series.any` and :meth:`~Series.all` incorrectly returning ```` for all ``False`` or all ``True`` values using the nulllable Boolean dtype and with ``skipna=False`` (:issue:`33253`)
+- Bug in :meth:`~Series.any` and :meth:`~Series.all` incorrectly returning ```` for all ``False`` or all ``True`` values using the nullable Boolean dtype and with ``skipna=False`` (:issue:`33253`)
- Clarified documentation on interpolate with ``method=akima``. The ``der`` parameter must be scalar or ``None`` (:issue:`33426`)
- :meth:`DataFrame.interpolate` uses the correct axis convention now. Previously interpolating along columns lead to interpolation along indices and vice versa. Furthermore interpolating with methods ``pad``, ``ffill``, ``bfill`` and ``backfill`` are identical to using these methods with :meth:`DataFrame.fillna` (:issue:`12918`, :issue:`29146`)
- Bug in :meth:`DataFrame.interpolate` when called on a :class:`DataFrame` with column names of string type was throwing a ValueError. The method is now independent of the type of the column names (:issue:`33956`)
diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index 12ab4f27d1e62..ebde7cb14684b 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -793,7 +793,7 @@ Groupby/resample/rolling
- Bug in :meth:`DataFrame.resample` that would throw a ``ValueError`` when resampling from ``"D"`` to ``"24H"`` over a transition into daylight savings time (DST) (:issue:`35219`)
- Bug when combining methods :meth:`DataFrame.groupby` with :meth:`DataFrame.resample` and :meth:`DataFrame.interpolate` raising a ``TypeError`` (:issue:`35325`)
- Bug in :meth:`.DataFrameGroupBy.apply` where a non-nuisance grouping column would be dropped from the output columns if another groupby method was called before ``.apply`` (:issue:`34656`)
-- Bug when subsetting columns on a :class:`.DataFrameGroupBy` (e.g. ``df.groupby('a')[['b']])``) would reset the attributes ``axis``, ``dropna``, ``group_keys``, ``level``, ``mutated``, ``sort``, and ``squeeze`` to their default values (:issue:`9959`)
+- Bug when subsetting columns on a :class:`.DataFrameGroupBy` (e.g. ``df.groupby('a')[['b']]``) would reset the attributes ``axis``, ``dropna``, ``group_keys``, ``level``, ``mutated``, ``sort``, and ``squeeze`` to their default values (:issue:`9959`)
- Bug in :meth:`.DataFrameGroupBy.tshift` failing to raise ``ValueError`` when a frequency cannot be inferred for the index of a group (:issue:`35937`)
- Bug in :meth:`DataFrame.groupby` does not always maintain column index name for ``any``, ``all``, ``bfill``, ``ffill``, ``shift`` (:issue:`29764`)
- Bug in :meth:`.DataFrameGroupBy.apply` raising error with ``np.nan`` group(s) when ``dropna=False`` (:issue:`35889`)
diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
index 7b1aef07e5f00..cf016c882c225 100644
--- a/doc/source/whatsnew/v1.4.0.rst
+++ b/doc/source/whatsnew/v1.4.0.rst
@@ -666,7 +666,7 @@ be removed in a future version. Use :func:`pandas.concat` instead (:issue:`35407
.. code-block:: ipython
- In [1]: pd.Series([1, 2]).append(pd.Series([3, 4])
+ In [1]: pd.Series([1, 2]).append(pd.Series([3, 4]))
Out [1]:
:1: FutureWarning: The series.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
0 1
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
index 43aa63c284f38..0bede60758331 100644
--- a/doc/source/whatsnew/v1.5.0.rst
+++ b/doc/source/whatsnew/v1.5.0.rst
@@ -287,7 +287,7 @@ and attributes without holding entire tree in memory (:issue:`45442`).
In [1]: df = pd.read_xml(
... "/path/to/downloaded/enwikisource-latest-pages-articles.xml",
- ... iterparse = {"page": ["title", "ns", "id"]})
+ ... iterparse = {"page": ["title", "ns", "id"]}
... )
df
Out[2]:
diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
index 329ef2859f56f..e32417e367427 100644
--- a/doc/source/whatsnew/v2.2.0.rst
+++ b/doc/source/whatsnew/v2.2.0.rst
@@ -815,8 +815,8 @@ Conversion
^^^^^^^^^^
- Bug in :meth:`DataFrame.astype` when called with ``str`` on unpickled array - the array might change in-place (:issue:`54654`)
- Bug in :meth:`DataFrame.astype` where ``errors="ignore"`` had no effect for extension types (:issue:`54654`)
+- Bug in :meth:`DataFrame.loc` was not throwing "incompatible dtype warning" (see `PDEP6 `_) when assigning a ``Series`` with a different dtype using a full column setter (e.g. ``df.loc[:, 'a'] = incompatible_value``) (:issue:`39584`)
- Bug in :meth:`Series.convert_dtypes` not converting all NA column to ``null[pyarrow]`` (:issue:`55346`)
-- Bug in :meth:``DataFrame.loc`` was not throwing "incompatible dtype warning" (see `PDEP6 `_) when assigning a ``Series`` with a different dtype using a full column setter (e.g. ``df.loc[:, 'a'] = incompatible_value``) (:issue:`39584`)
Strings
^^^^^^^
@@ -826,7 +826,7 @@ Strings
- Bug in :meth:`Index.str.cat` always casting result to object dtype (:issue:`56157`)
- Bug in :meth:`Series.__mul__` for :class:`ArrowDtype` with ``pyarrow.string`` dtype and ``string[pyarrow]`` for the pyarrow backend (:issue:`51970`)
- Bug in :meth:`Series.str.find` when ``start < 0`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56411`)
-- Bug in :meth:`Series.str.fullmatch` when ``dtype=pandas.ArrowDtype(pyarrow.string()))`` allows partial matches when regex ends in literal //$ (:issue:`56652`)
+- Bug in :meth:`Series.str.fullmatch` when ``dtype=pandas.ArrowDtype(pyarrow.string())`` allows partial matches when regex ends in literal //$ (:issue:`56652`)
- Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56404`)
- Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with arguments of type ``tuple[str, ...]`` for :class:`ArrowDtype` with ``pyarrow.string`` dtype (:issue:`56579`)
- Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with arguments of type ``tuple[str, ...]`` for ``string[pyarrow]`` (:issue:`54942`)
diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst
index ac74e6a8e5f77..6433fe8d2b060 100644
--- a/doc/source/whatsnew/v2.3.0.rst
+++ b/doc/source/whatsnew/v2.3.0.rst
@@ -1,6 +1,6 @@
.. _whatsnew_230:
-What's new in 2.3.0 (Month XX, 2024)
+What's new in 2.3.0 (June 4, 2025)
------------------------------------
These are the changes in pandas 2.3.0. See :ref:`release` for a full changelog
@@ -10,37 +10,26 @@ including other versions of pandas.
.. ---------------------------------------------------------------------------
-.. _whatsnew_230.upcoming_changes:
-
-Upcoming changes in pandas 3.0
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-
.. _whatsnew_230.enhancements:
Enhancements
~~~~~~~~~~~~
-.. _whatsnew_230.enhancements.enhancement1:
-
-enhancement1
-^^^^^^^^^^^^
-
-
.. _whatsnew_230.enhancements.other:
Other enhancements
^^^^^^^^^^^^^^^^^^
+- :meth:`pandas.api.interchange.from_dataframe` now uses the `PyCapsule Interface `_ if available, only falling back to the Dataframe Interchange Protocol if that fails (:issue:`60739`)
- The semantics for the ``copy`` keyword in ``__array__`` methods (i.e. called
when using ``np.array()`` or ``np.asarray()`` on pandas objects) has been
updated to work correctly with NumPy >= 2 (:issue:`57739`)
-- :meth:`Series.str.decode` result now has ``StringDtype`` when ``future.infer_string`` is True (:issue:`60709`)
-- :meth:`~Series.to_hdf` and :meth:`~DataFrame.to_hdf` now round-trip with ``StringDtype`` (:issue:`60663`)
+- :meth:`Series.str.decode` result now has :class:`StringDtype` when ``future.infer_string`` is True (:issue:`60709`)
+- :meth:`~Series.to_hdf` and :meth:`~DataFrame.to_hdf` now round-trip with :class:`StringDtype` (:issue:`60663`)
- Improved ``repr`` of :class:`.NumpyExtensionArray` to account for NEP51 (:issue:`61085`)
- The :meth:`Series.str.decode` has gained the argument ``dtype`` to control the dtype of the result (:issue:`60940`)
-- The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns (:issue:`60633`)
-- The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`)
+- The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for :class:`StringDtype` columns (:issue:`60633`)
+- The :meth:`~Series.sum` reduction is now implemented for :class:`StringDtype` columns (:issue:`59853`)
.. ---------------------------------------------------------------------------
.. _whatsnew_230.notable_bug_fixes:
@@ -50,19 +39,29 @@ Notable bug fixes
These are bug fixes that might have notable behavior changes.
-.. _whatsnew_230.notable_bug_fixes.notable_bug_fix1:
+.. _whatsnew_230.notable_bug_fixes.string_comparisons:
+
+Comparisons between different string dtypes
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-notable_bug_fix1
-^^^^^^^^^^^^^^^^
+In previous versions, comparing :class:`Series` of different string dtypes (e.g. ``pd.StringDtype("pyarrow", na_value=pd.NA)`` against ``pd.StringDtype("python", na_value=np.nan)``) would result in inconsistent resulting dtype or incorrectly raise. pandas will now use the hierarchy
+
+ object < (python, NaN) < (pyarrow, NaN) < (python, NA) < (pyarrow, NA)
+
+in determining the result dtype when there are different string dtypes compared. Some examples:
+
+- When ``pd.StringDtype("pyarrow", na_value=pd.NA)`` is compared against any other string dtype, the result will always be ``boolean[pyarrow]``.
+- When ``pd.StringDtype("python", na_value=pd.NA)`` is compared against ``pd.StringDtype("pyarrow", na_value=np.nan)``, the result will be ``boolean``, the NumPy-backed nullable extension array.
+- When ``pd.StringDtype("python", na_value=pd.NA)`` is compared against ``pd.StringDtype("python", na_value=np.nan)``, the result will be ``boolean``, the NumPy-backed nullable extension array.
.. _whatsnew_230.api_changes:
API changes
~~~~~~~~~~~
-- When enabling the ``future.infer_string`` option: Index set operations (like
- union or intersection) will now ignore the dtype of an empty ``RangeIndex`` or
- empty ``Index`` with object dtype when determining the dtype of the resulting
+- When enabling the ``future.infer_string`` option, :class:`Index` set operations (like
+ union or intersection) will now ignore the dtype of an empty :class:`RangeIndex` or
+ empty :class:`Index` with ``object`` dtype when determining the dtype of the resulting
Index (:issue:`60797`)
.. ---------------------------------------------------------------------------
@@ -73,120 +72,35 @@ Deprecations
- Deprecated allowing non-``bool`` values for ``na`` in :meth:`.str.contains`, :meth:`.str.startswith`, and :meth:`.str.endswith` for dtypes that do not already disallow these (:issue:`59615`)
- Deprecated the ``"pyarrow_numpy"`` storage option for :class:`StringDtype` (:issue:`60152`)
-.. ---------------------------------------------------------------------------
-.. _whatsnew_230.performance:
-
-Performance improvements
-~~~~~~~~~~~~~~~~~~~~~~~~
--
--
-
.. ---------------------------------------------------------------------------
.. _whatsnew_230.bug_fixes:
Bug fixes
~~~~~~~~~
-Categorical
-^^^^^^^^^^^
--
--
-
-Datetimelike
-^^^^^^^^^^^^
--
--
-
-Timedelta
-^^^^^^^^^
--
--
-
-Timezones
-^^^^^^^^^
--
--
-
Numeric
^^^^^^^
-- Enabled :class:`Series.mode` and :class:`DataFrame.mode` with ``dropna=False`` to sort the result for all dtypes in the presence of NA values; previously only certain dtypes would sort (:issue:`60702`)
--
-
-Conversion
-^^^^^^^^^^
--
--
+- Bug in :meth:`Series.mode` and :meth:`DataFrame.mode` with ``dropna=False`` where not all dtypes would sort in the presence of ``NA`` values (:issue:`60702`)
+- Bug in :meth:`Series.round` where a ``TypeError`` would always raise with ``object`` dtype (:issue:`61206`)
Strings
^^^^^^^
-- Bug in :meth:`.DataFrameGroupBy.min`, :meth:`.DataFrameGroupBy.max`, :meth:`.Resampler.min`, :meth:`.Resampler.max` on string input of all NA values would return float dtype; now returns string (:issue:`60810`)
-- Bug in :meth:`DataFrame.sum` with ``axis=1``, :meth:`.DataFrameGroupBy.sum` or :meth:`.SeriesGroupBy.sum` with ``skipna=True``, and :meth:`.Resampler.sum` on :class:`StringDtype` with all NA values resulted in ``0`` and is now the empty string ``""`` (:issue:`60229`)
-- Bug in :meth:`Series.__pos__` and :meth:`DataFrame.__pos__` did not raise for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`60710`)
-- Bug in :meth:`Series.rank` for :class:`StringDtype` with ``storage="pyarrow"`` incorrectly returning integer results in case of ``method="average"`` and raising an error if it would truncate results (:issue:`59768`)
+- Bug in :meth:`.DataFrameGroupBy.min`, :meth:`.DataFrameGroupBy.max`, :meth:`.Resampler.min`, :meth:`.Resampler.max` where all NA values of string dtype would return float instead of string dtype (:issue:`60810`)
+- Bug in :meth:`DataFrame.sum` with ``axis=1``, :meth:`.DataFrameGroupBy.sum` or :meth:`.SeriesGroupBy.sum` with ``skipna=True``, and :meth:`.Resampler.sum` with all NA values of :class:`StringDtype` resulted in ``0`` instead of the empty string ``""`` (:issue:`60229`)
+- Bug in :meth:`Series.__pos__` and :meth:`DataFrame.__pos__` where an ``Exception`` was not raised for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`60710`)
+- Bug in :meth:`Series.rank` for :class:`StringDtype` with ``storage="pyarrow"`` that incorrectly returned integer results with ``method="average"`` and raised an error if it would truncate results (:issue:`59768`)
- Bug in :meth:`Series.replace` with :class:`StringDtype` when replacing with a non-string value was not upcasting to ``object`` dtype (:issue:`60282`)
+- Bug in :meth:`Series.str.center` with :class:`StringDtype` with ``storage="pyarrow"`` not matching the python behavior in corner cases with an odd number of fill characters (:issue:`54792`)
- Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`)
-- Bug in ``ser.str.slice`` with negative ``step`` with :class:`ArrowDtype` and :class:`StringDtype` with ``storage="pyarrow"`` giving incorrect results (:issue:`59710`)
-- Bug in the ``center`` method on :class:`Series` and :class:`Index` object ``str`` accessors with pyarrow-backed dtype not matching the python behavior in corner cases with an odd number of fill characters (:issue:`54792`)
-
-Interval
-^^^^^^^^
--
--
+- Bug in :meth:`Series.str.slice` with negative ``step`` with :class:`ArrowDtype` and :class:`StringDtype` with ``storage="pyarrow"`` giving incorrect results (:issue:`59710`)
Indexing
^^^^^^^^
-- Fixed bug in :meth:`Index.get_indexer` round-tripping through string dtype when ``infer_string`` is enabled (:issue:`55834`)
--
-
-Missing
-^^^^^^^
--
--
-
-MultiIndex
-^^^^^^^^^^
--
--
+- Bug in :meth:`Index.get_indexer` round-tripping through string dtype when ``infer_string`` is enabled (:issue:`55834`)
I/O
^^^
-- :meth:`DataFrame.to_excel` was storing decimals as strings instead of numbers (:issue:`49598`)
--
-
-Period
-^^^^^^
--
--
-
-Plotting
-^^^^^^^^
--
--
-
-Groupby/resample/rolling
-^^^^^^^^^^^^^^^^^^^^^^^^
--
--
-
-Reshaping
-^^^^^^^^^
--
--
-
-Sparse
-^^^^^^
--
--
-
-ExtensionArray
-^^^^^^^^^^^^^^
--
--
-
-Styler
-^^^^^^
--
--
+- Bug in :meth:`DataFrame.to_excel` which stored decimals as strings instead of numbers (:issue:`49598`)
Other
^^^^^
@@ -198,3 +112,5 @@ Other
Contributors
~~~~~~~~~~~~
+
+.. contributors:: v2.2.3..v2.3.0|HEAD
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index ab3316e7fca4c..b76d722be4e6d 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -30,7 +30,6 @@ Other enhancements
^^^^^^^^^^^^^^^^^^
- :class:`pandas.api.typing.FrozenList` is available for typing the outputs of :attr:`MultiIndex.names`, :attr:`MultiIndex.codes` and :attr:`MultiIndex.levels` (:issue:`58237`)
- :class:`pandas.api.typing.SASReader` is available for typing the output of :func:`read_sas` (:issue:`55689`)
-- :meth:`pandas.api.interchange.from_dataframe` now uses the `PyCapsule Interface `_ if available, only falling back to the Dataframe Interchange Protocol if that fails (:issue:`60739`)
- Added :meth:`.Styler.to_typst` to write Styler objects to file, buffer or string in Typst format (:issue:`57617`)
- Added missing :meth:`pandas.Series.info` to API reference (:issue:`60926`)
- :class:`pandas.api.typing.NoDefault` is available for typing ``no_default``
@@ -52,6 +51,7 @@ Other enhancements
- :class:`Rolling` and :class:`Expanding` now support ``pipe`` method (:issue:`57076`)
- :class:`Series` now supports the Arrow PyCapsule Interface for export (:issue:`59518`)
- :func:`DataFrame.to_excel` argument ``merge_cells`` now accepts a value of ``"columns"`` to only merge :class:`MultiIndex` column header header cells (:issue:`35384`)
+- :func:`set_option` now accepts a dictionary of options, simplifying configuration of multiple settings at once (:issue:`61093`)
- :meth:`DataFrame.corrwith` now accepts ``min_periods`` as optional arguments, as in :meth:`DataFrame.corr` and :meth:`Series.corr` (:issue:`9490`)
- :meth:`DataFrame.cummin`, :meth:`DataFrame.cummax`, :meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods now have a ``numeric_only`` parameter (:issue:`53072`)
- :meth:`DataFrame.ewm` now allows ``adjust=False`` when ``times`` is provided (:issue:`54328`)
@@ -73,11 +73,13 @@ Other enhancements
- :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`)
- :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`)
- :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`)
+- :meth:`Series.map` now accepts an ``engine`` parameter to allow execution with a third-party execution engine (:issue:`61125`)
- :meth:`Series.str.get_dummies` now accepts a ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`)
- :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`)
- :py:class:`frozenset` elements in pandas objects are now natively printed (:issue:`60690`)
- Add ``"delete_rows"`` option to ``if_exists`` argument in :meth:`DataFrame.to_sql` deleting all records of the table before inserting data (:issue:`37210`).
- Added half-year offset classes :class:`HalfYearBegin`, :class:`HalfYearEnd`, :class:`BHalfYearBegin` and :class:`BHalfYearEnd` (:issue:`60928`)
+- Added support to read and write from and to Apache Iceberg tables with the new :func:`read_iceberg` and :meth:`DataFrame.to_iceberg` functions (:issue:`61383`)
- Errors occurring during SQL I/O will now throw a generic :class:`.DatabaseError` instead of the raw Exception type from the underlying driver manager library (:issue:`60748`)
- Implemented :meth:`Series.str.isascii` and :meth:`Series.str.isascii` (:issue:`59091`)
- Improved deprecation message for offset aliases (:issue:`60820`)
@@ -693,6 +695,7 @@ Datetimelike
- Bug in :meth:`to_datetime` on float array with missing values throwing ``FloatingPointError`` (:issue:`58419`)
- Bug in :meth:`to_datetime` on float32 df with year, month, day etc. columns leads to precision issues and incorrect result. (:issue:`60506`)
- Bug in :meth:`to_datetime` reports incorrect index in case of any failure scenario. (:issue:`58298`)
+- Bug in :meth:`to_datetime` with ``format="ISO8601"`` and ``utc=True`` where naive timestamps incorrectly inherited timezone offset from previous timestamps in a series. (:issue:`61389`)
- Bug in :meth:`to_datetime` wrongly converts when ``arg`` is a ``np.datetime64`` object with unit of ``ps``. (:issue:`60341`)
- Bug in setting scalar values with mismatched resolution into arrays with non-nanosecond ``datetime64``, ``timedelta64`` or :class:`DatetimeTZDtype` incorrectly truncating those scalars (:issue:`56410`)
@@ -703,13 +706,15 @@ Timedelta
Timezones
^^^^^^^^^
--
+- Bug in :meth:`DatetimeIndex.union`, :meth:`DatetimeIndex.intersection`, and :meth:`DatetimeIndex.symmetric_difference` changing timezone to UTC when merging two DatetimeIndex objects with the same timezone but different units (:issue:`60080`)
-
Numeric
^^^^^^^
- Bug in :meth:`DataFrame.corr` where numerical precision errors resulted in correlations above ``1.0`` (:issue:`61120`)
+- Bug in :meth:`DataFrame.cov` raises a ``TypeError`` instead of returning potentially incorrect results or other errors (:issue:`53115`)
- Bug in :meth:`DataFrame.quantile` where the column type was not preserved when ``numeric_only=True`` with a list-like ``q`` produced an empty result (:issue:`59035`)
+- Bug in :meth:`Series.dot` returning ``object`` dtype for :class:`ArrowDtype` and nullable-dtype data (:issue:`61375`)
- Bug in ``np.matmul`` with :class:`Index` inputs raising a ``TypeError`` (:issue:`57079`)
Conversion
@@ -772,6 +777,7 @@ I/O
- Bug in :meth:`DataFrame.to_stata` when writing more than 32,000 value labels. (:issue:`60107`)
- Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`)
- Bug in :meth:`HDFStore.get` was failing to save data of dtype datetime64[s] correctly (:issue:`59004`)
+- Bug in :meth:`HDFStore.select` causing queries on categorical string columns to return unexpected results (:issue:`57608`)
- Bug in :meth:`read_csv` causing segmentation fault when ``encoding_errors`` is not a string. (:issue:`59059`)
- Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`)
- Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
@@ -785,6 +791,7 @@ I/O
- Bug in :meth:`read_stata` where extreme value integers were incorrectly interpreted as missing for format versions 111 and prior (:issue:`58130`)
- Bug in :meth:`read_stata` where the missing code for double was not recognised for format versions 105 and prior (:issue:`58149`)
- Bug in :meth:`set_option` where setting the pandas option ``display.html.use_mathjax`` to ``False`` has no effect (:issue:`59884`)
+- Bug in :meth:`to_csv` where ``quotechar`` is not escaped when ``escapechar`` is not None (:issue:`61407`)
- Bug in :meth:`to_excel` where :class:`MultiIndex` columns would be merged to a single row when ``merge_cells=False`` is passed (:issue:`60274`)
Period
@@ -799,6 +806,7 @@ Plotting
- Bug in :meth:`DataFrame.plot.bar` with ``stacked=True`` where labels on stacked bars with zero-height segments were incorrectly positioned at the base instead of the label position of the previous segment (:issue:`59429`)
- Bug in :meth:`DataFrame.plot.line` raising ``ValueError`` when set both color and a ``dict`` style (:issue:`59461`)
- Bug in :meth:`DataFrame.plot` that causes a shift to the right when the frequency multiplier is greater than one. (:issue:`57587`)
+- Bug in :meth:`DataFrame.plot` where ``title`` would require extra titles when plotting more than one column per subplot. (:issue:`61019`)
- Bug in :meth:`Series.plot` preventing a line and bar from being aligned on the same plot (:issue:`61161`)
- Bug in :meth:`Series.plot` preventing a line and scatter plot from being aligned (:issue:`61005`)
- Bug in :meth:`Series.plot` with ``kind="pie"`` with :class:`ArrowDtype` (:issue:`59192`)
@@ -817,6 +825,7 @@ Groupby/resample/rolling
- Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` were not keeping the index name when the index had :class:`ArrowDtype` timestamp dtype (:issue:`61222`)
- Bug in :meth:`DataFrame.resample` changing index type to :class:`MultiIndex` when the dataframe is empty and using an upsample method (:issue:`55572`)
- Bug in :meth:`DataFrameGroupBy.agg` that raises ``AttributeError`` when there is dictionary input and duplicated columns, instead of returning a DataFrame with the aggregation of all duplicate columns. (:issue:`55041`)
+- Bug in :meth:`DataFrameGroupBy.agg` where applying a user-defined function to an empty DataFrame returned a Series instead of an empty DataFrame. (:issue:`61503`)
- Bug in :meth:`DataFrameGroupBy.apply` and :meth:`SeriesGroupBy.apply` for empty data frame with ``group_keys=False`` still creating output index using group keys. (:issue:`60471`)
- Bug in :meth:`DataFrameGroupBy.apply` that was returning a completely empty DataFrame when all return values of ``func`` were ``None`` instead of returning an empty DataFrame with the original columns and dtypes. (:issue:`57775`)
- Bug in :meth:`DataFrameGroupBy.apply` with ``as_index=False`` that was returning :class:`MultiIndex` instead of returning :class:`Index`. (:issue:`58291`)
@@ -825,7 +834,7 @@ Groupby/resample/rolling
- Bug in :meth:`DataFrameGroupby.transform` and :meth:`SeriesGroupby.transform` with a reducer and ``observed=False`` that coerces dtype to float when there are unobserved categories. (:issue:`55326`)
- Bug in :meth:`Rolling.apply` for ``method="table"`` where column order was not being respected due to the columns getting sorted by default. (:issue:`59666`)
- Bug in :meth:`Rolling.apply` where the applied function could be called on fewer than ``min_period`` periods if ``method="table"``. (:issue:`58868`)
-- Bug in :meth:`Series.resample` could raise when the the date range ended shortly before a non-existent time. (:issue:`58380`)
+- Bug in :meth:`Series.resample` could raise when the date range ended shortly before a non-existent time. (:issue:`58380`)
Reshaping
^^^^^^^^^
@@ -842,6 +851,8 @@ Reshaping
- Bug in :meth:`DataFrame.stack` with the new implementation where ``ValueError`` is raised when ``level=[]`` (:issue:`60740`)
- Bug in :meth:`DataFrame.unstack` producing incorrect results when manipulating empty :class:`DataFrame` with an :class:`ExtentionDtype` (:issue:`59123`)
- Bug in :meth:`concat` where concatenating DataFrame and Series with ``ignore_index = True`` drops the series name (:issue:`60723`, :issue:`56257`)
+- Bug in :func:`melt` where calling with duplicate column names in ``id_vars`` raised a misleading ``AttributeError`` (:issue:`61475`)
+- Bug in :meth:`DataFrame.merge` where user-provided suffixes could result in duplicate column names if the resulting names matched existing columns. Now raises a :class:`MergeError` in such cases. (:issue:`61402`)
Sparse
^^^^^^
@@ -873,6 +884,7 @@ Other
- Bug in :func:`eval` with ``engine="numexpr"`` returning unexpected result for float division. (:issue:`59736`)
- Bug in :func:`to_numeric` raising ``TypeError`` when ``arg`` is a :class:`Timedelta` or :class:`Timestamp` scalar. (:issue:`59944`)
- Bug in :func:`unique` on :class:`Index` not always returning :class:`Index` (:issue:`57043`)
+- Bug in :meth:`DataFrame.apply` raising ``RecursionError`` when passing ``func=list[int]``. (:issue:`61565`)
- Bug in :meth:`DataFrame.apply` where passing ``engine="numba"`` ignored ``args`` passed to the applied function (:issue:`58712`)
- Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which caused an exception when using NumPy attributes via ``@`` notation, e.g., ``df.eval("@np.floor(a)")``. (:issue:`58041`)
- Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which did not allow to use ``tan`` function. (:issue:`55091`)
@@ -881,6 +893,7 @@ Other
- Bug in :meth:`DataFrame.query` which raised an exception when querying integer column names using backticks. (:issue:`60494`)
- Bug in :meth:`DataFrame.shift` where passing a ``freq`` on a DataFrame with no columns did not shift the index correctly. (:issue:`60102`)
- Bug in :meth:`DataFrame.sort_index` when passing ``axis="columns"`` and ``ignore_index=True`` and ``ascending=False`` not returning a :class:`RangeIndex` columns (:issue:`57293`)
+- Bug in :meth:`DataFrame.sort_values` where sorting by a column explicitly named ``None`` raised a ``KeyError`` instead of sorting by the column as expected. (:issue:`61512`)
- Bug in :meth:`DataFrame.transform` that was returning the wrong order unless the index was monotonically increasing. (:issue:`57069`)
- Bug in :meth:`DataFrame.where` where using a non-bool type array in the function would return a ``ValueError`` instead of a ``TypeError`` (:issue:`56330`)
- Bug in :meth:`Index.sort_values` when passing a key function that turns values into tuples, e.g. ``key=natsort.natsort_key``, would raise ``TypeError`` (:issue:`56081`)
@@ -900,6 +913,7 @@ Other
- Bug in ``Series.list`` methods not preserving the original name. (:issue:`60522`)
- Bug in printing a :class:`DataFrame` with a :class:`DataFrame` stored in :attr:`DataFrame.attrs` raised a ``ValueError`` (:issue:`60455`)
- Bug in printing a :class:`Series` with a :class:`DataFrame` stored in :attr:`Series.attrs` raised a ``ValueError`` (:issue:`60568`)
+- Fixed bug where the :class:`DataFrame` constructor misclassified array-like objects with a ``.name`` attribute as :class:`Series` or :class:`Index` (:issue:`61443`)
- Fixed regression in :meth:`DataFrame.from_records` not initializing subclasses properly (:issue:`57008`)
.. ***DO NOT USE THIS SECTION***
diff --git a/environment.yml b/environment.yml
index da0b2a012c3fc..74186bd2581c4 100644
--- a/environment.yml
+++ b/environment.yml
@@ -3,12 +3,12 @@ name: pandas-dev
channels:
- conda-forge
dependencies:
- - python=3.10
+ - python=3.11
- pip
# build dependencies
- versioneer
- - cython~=3.0.5
+ - cython<4.0.0a0
- meson=1.2.1
- meson-python=0.13.1
@@ -27,13 +27,12 @@ dependencies:
# optional dependencies
- beautifulsoup4>=4.12.3
- - blosc
- bottleneck>=1.3.6
- fastparquet>=2024.2.0
- - fsspec>=2024.2.0
+ - fsspec>=2023.12.2
- html5lib>=1.1
- hypothesis>=6.84.0
- - gcsfs>=2024.2.0
+ - gcsfs>=2023.12.2
- ipython
- pickleshare # Needed for IPython Sphinx directive in the docs GH#60429
- jinja2>=3.1.3
@@ -45,17 +44,18 @@ dependencies:
- odfpy>=1.4.1
- psycopg2>=2.9.6
- pyarrow>=10.0.1
+ - pyiceberg>=0.7.1
- pymysql>=1.1.0
- pyreadstat>=1.2.6
- pytables>=3.8.0
- python-calamine>=0.1.7
- pytz>=2023.4
- pyxlsb>=1.0.10
- - s3fs>=2024.2.0
+ - s3fs>=2023.12.2
- scipy>=1.12.0
- sqlalchemy>=2.0.0
- tabulate>=0.9.0
- - xarray>=2024.1.1, <=2024.9.0
+ - xarray>=2024.1.1
- xlrd>=2.0.1
- xlsxwriter>=3.2.0
- zstandard>=0.22.0
@@ -83,8 +83,6 @@ dependencies:
# documentation
- gitpython # obtain contributors from git for whatsnew
- - gitdb
- - google-auth
- natsort # DataFrame.sort_values doctest
- numpydoc
- pydata-sphinx-theme=0.16
diff --git a/meson.build b/meson.build
index 66583095a6e77..6a00e52481108 100644
--- a/meson.build
+++ b/meson.build
@@ -47,6 +47,28 @@ endif
cy = meson.get_compiler('cython')
if cy.version().version_compare('>=3.1.0')
add_project_arguments('-Xfreethreading_compatible=true', language: 'cython')
+
+ # Use shared utility code to reduce wheel sizes
+ # copied from https://github.com/scikit-learn/scikit-learn/pull/31151/files
+ cy = find_program(cy.cmd_array()[0])
+ cython_shared_src = custom_target(
+ install: false,
+ output: '_cyutility.c',
+ command: [
+ cy,
+ '-3',
+ '-Xfreethreading_compatible=true',
+ '--fast-fail',
+ '--generate-shared=' + meson.current_build_dir() / '_cyutility.c',
+ ],
+ )
+
+ py.extension_module(
+ '_cyutility',
+ cython_shared_src,
+ subdir: 'pandas/_libs',
+ install: true,
+ )
endif
# Needed by pandas.test() when it looks for the pytest ini options
diff --git a/pandas/__init__.py b/pandas/__init__.py
index 7d6dd7b7c1a88..8b92ad6cdfebb 100644
--- a/pandas/__init__.py
+++ b/pandas/__init__.py
@@ -164,6 +164,7 @@
read_stata,
read_sas,
read_spss,
+ read_iceberg,
)
from pandas.io.json._normalize import json_normalize
@@ -319,6 +320,7 @@
"read_fwf",
"read_hdf",
"read_html",
+ "read_iceberg",
"read_json",
"read_orc",
"read_parquet",
diff --git a/pandas/_config/config.py b/pandas/_config/config.py
index ce53e05608ba7..d42d90d44f82f 100644
--- a/pandas/_config/config.py
+++ b/pandas/_config/config.py
@@ -199,9 +199,9 @@ def set_option(*args) -> None:
Parameters
----------
- *args : str | object
- Arguments provided in pairs, which will be interpreted as (pattern, value)
- pairs.
+ *args : str | object | dict
+ Arguments provided in pairs, which will be interpreted as (pattern, value),
+ or as a single dictionary containing multiple option-value pairs.
pattern: str
Regexp which should match a single option
value: object
@@ -239,6 +239,8 @@ def set_option(*args) -> None:
Examples
--------
+ Option-Value Pair Input:
+
>>> pd.set_option("display.max_columns", 4)
>>> df = pd.DataFrame([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]])
>>> df
@@ -247,8 +249,23 @@ def set_option(*args) -> None:
1 6 7 ... 9 10
[2 rows x 5 columns]
>>> pd.reset_option("display.max_columns")
+
+ Dictionary Input:
+
+ >>> pd.set_option({"display.max_columns": 4, "display.precision": 1})
+ >>> df = pd.DataFrame([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]])
+ >>> df
+ 0 1 ... 3 4
+ 0 1 2 ... 4 5
+ 1 6 7 ... 9 10
+ [2 rows x 5 columns]
+ >>> pd.reset_option("display.max_columns")
+ >>> pd.reset_option("display.precision")
"""
- # must at least 1 arg deal with constraints later
+ # Handle dictionary input
+ if len(args) == 1 and isinstance(args[0], dict):
+ args = tuple(kv for item in args[0].items() for kv in item)
+
nargs = len(args)
if not nargs or nargs % 2 != 0:
raise ValueError("Must provide an even number of non-keyword arguments")
@@ -440,9 +457,10 @@ def option_context(*args) -> Generator[None]:
Parameters
----------
- *args : str | object
+ *args : str | object | dict
An even amount of arguments provided in pairs which will be
- interpreted as (pattern, value) pairs.
+ interpreted as (pattern, value) pairs. Alternatively, a single
+ dictionary of {pattern: value} may be provided.
Returns
-------
@@ -471,7 +489,12 @@ def option_context(*args) -> Generator[None]:
>>> from pandas import option_context
>>> with option_context("display.max_rows", 10, "display.max_columns", 5):
... pass
+ >>> with option_context({"display.max_rows": 10, "display.max_columns": 5}):
+ ... pass
"""
+ if len(args) == 1 and isinstance(args[0], dict):
+ args = tuple(kv for item in args[0].items() for kv in item)
+
if len(args) % 2 != 0 or len(args) < 2:
raise ValueError(
"Provide an even amount of arguments as "
diff --git a/pandas/_libs/arrays.pyi b/pandas/_libs/arrays.pyi
index dda23d9dec98b..60e4ff3fab74e 100644
--- a/pandas/_libs/arrays.pyi
+++ b/pandas/_libs/arrays.pyi
@@ -1,4 +1,4 @@
-from typing import Sequence
+from collections.abc import Sequence
import numpy as np
diff --git a/pandas/_libs/hashtable.pyi b/pandas/_libs/hashtable.pyi
index 7a810a988e50e..5ee359d84a6ed 100644
--- a/pandas/_libs/hashtable.pyi
+++ b/pandas/_libs/hashtable.pyi
@@ -1,6 +1,6 @@
+from collections.abc import Hashable
from typing import (
Any,
- Hashable,
Literal,
overload,
)
diff --git a/pandas/_libs/internals.pyi b/pandas/_libs/internals.pyi
index ffe6c7730bcdc..a680304d55ea2 100644
--- a/pandas/_libs/internals.pyi
+++ b/pandas/_libs/internals.pyi
@@ -1,6 +1,8 @@
-from typing import (
+from collections.abc import (
Iterator,
Sequence,
+)
+from typing import (
final,
overload,
)
diff --git a/pandas/_libs/json.pyi b/pandas/_libs/json.pyi
index bc4fe68573b94..349320d69d707 100644
--- a/pandas/_libs/json.pyi
+++ b/pandas/_libs/json.pyi
@@ -1,6 +1,6 @@
+from collections.abc import Callable
from typing import (
Any,
- Callable,
)
def ujson_dumps(
diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi
index daaaacee3487d..331233f37f63d 100644
--- a/pandas/_libs/lib.pyi
+++ b/pandas/_libs/lib.pyi
@@ -1,12 +1,14 @@
# TODO(npdtypes): Many types specified here can be made more specific/accurate;
# the more specific versions are specified in comments
+from collections.abc import (
+ Callable,
+ Generator,
+ Hashable,
+)
from decimal import Decimal
from typing import (
Any,
- Callable,
Final,
- Generator,
- Hashable,
Literal,
TypeAlias,
overload,
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 38d9a8f62417c..3b7d659c2150e 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -2,6 +2,7 @@ from collections import abc
from decimal import Decimal
from enum import Enum
from sys import getsizeof
+from types import GenericAlias
from typing import (
Literal,
_GenericAlias,
@@ -777,7 +778,10 @@ cpdef ndarray[object] ensure_string_array(
return out
arr = arr.to_numpy(dtype=object)
elif not util.is_array(arr):
- arr = np.array(arr, dtype="object")
+ # GH#61155: Guarantee a 1-d result when array is a list of lists
+ input_arr = arr
+ arr = np.empty(len(arr), dtype="object")
+ arr[:] = input_arr
result = np.asarray(arr, dtype="object")
@@ -1295,7 +1299,7 @@ cdef bint c_is_list_like(object obj, bint allow_sets) except -1:
getattr(obj, "__iter__", None) is not None and not isinstance(obj, type)
# we do not count strings/unicode/bytes as list-like
# exclude Generic types that have __iter__
- and not isinstance(obj, (str, bytes, _GenericAlias))
+ and not isinstance(obj, (str, bytes, _GenericAlias, GenericAlias))
# exclude zero-dimensional duck-arrays, effectively scalars
and not (hasattr(obj, "ndim") and obj.ndim == 0)
# exclude sets if allow_sets is False
diff --git a/pandas/_libs/meson.build b/pandas/_libs/meson.build
index a50976767928a..33fc65e5034d0 100644
--- a/pandas/_libs/meson.build
+++ b/pandas/_libs/meson.build
@@ -148,6 +148,12 @@ if get_option('buildtype') == 'debug'
cython_args += ['--gdb']
endif
+# Use shared utility code to reduce wheel sizes
+# copied from https://github.com/scikit-learn/scikit-learn/pull/31151/files
+if cy.version().version_compare('>=3.1.0')
+ cython_args += ['--shared=pandas._libs._cyutility']
+endif
+
foreach ext_name, ext_dict : libs_sources
py.extension_module(
ext_name,
diff --git a/pandas/_libs/ops.pyi b/pandas/_libs/ops.pyi
index 6738a1dff4a9e..81fe81930539d 100644
--- a/pandas/_libs/ops.pyi
+++ b/pandas/_libs/ops.pyi
@@ -1,7 +1,9 @@
-from typing import (
- Any,
+from collections.abc import (
Callable,
Iterable,
+)
+from typing import (
+ Any,
Literal,
TypeAlias,
overload,
diff --git a/pandas/_libs/parsers.pyi b/pandas/_libs/parsers.pyi
index 253bb7303cefb..d18f54c546232 100644
--- a/pandas/_libs/parsers.pyi
+++ b/pandas/_libs/parsers.pyi
@@ -1,5 +1,5 @@
+from collections.abc import Hashable
from typing import (
- Hashable,
Literal,
)
diff --git a/pandas/_libs/properties.pyi b/pandas/_libs/properties.pyi
index aaa44a0cf47bf..bbde6ec454202 100644
--- a/pandas/_libs/properties.pyi
+++ b/pandas/_libs/properties.pyi
@@ -1,5 +1,5 @@
+from collections.abc import Sequence
from typing import (
- Sequence,
overload,
)
diff --git a/pandas/_libs/sparse.pyi b/pandas/_libs/sparse.pyi
index 536265b25425e..8727b1a5b0420 100644
--- a/pandas/_libs/sparse.pyi
+++ b/pandas/_libs/sparse.pyi
@@ -1,4 +1,4 @@
-from typing import Sequence
+from collections.abc import Sequence
import numpy as np
diff --git a/pandas/_libs/testing.pyi b/pandas/_libs/testing.pyi
index ab87e58eba9b9..4758483b3b5e7 100644
--- a/pandas/_libs/testing.pyi
+++ b/pandas/_libs/testing.pyi
@@ -1,4 +1,4 @@
-from typing import Mapping
+from collections.abc import Mapping
def assert_dict_equal(a: Mapping, b: Mapping, compare_keys: bool = ...) -> bool: ...
def assert_almost_equal(
diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx
index c4acf72ab87d8..45552108f8c15 100644
--- a/pandas/_libs/tslibs/conversion.pyx
+++ b/pandas/_libs/tslibs/conversion.pyx
@@ -797,7 +797,7 @@ cdef int64_t parse_pydatetime(
dts : *npy_datetimestruct
Needed to use in pydatetime_to_dt64, which writes to it.
creso : NPY_DATETIMEUNIT
- Resolution to store the the result.
+ Resolution to store the result.
Raises
------
diff --git a/pandas/_libs/tslibs/meson.build b/pandas/_libs/tslibs/meson.build
index 052a8568b76af..ac43dc7db5fb7 100644
--- a/pandas/_libs/tslibs/meson.build
+++ b/pandas/_libs/tslibs/meson.build
@@ -28,6 +28,12 @@ if get_option('buildtype') == 'debug'
cython_args += ['--gdb']
endif
+# Use shared utility code to reduce wheel sizes
+# copied from https://github.com/scikit-learn/scikit-learn/pull/31151/files
+if cy.version().version_compare('>=3.1.0')
+ cython_args += ['--shared=pandas._libs._cyutility']
+endif
+
foreach ext_name, ext_dict : tslibs_sources
py.extension_module(
ext_name,
diff --git a/pandas/_libs/tslibs/nattype.pyi b/pandas/_libs/tslibs/nattype.pyi
index d3b10fbe79cb9..ff3bb5b70801e 100644
--- a/pandas/_libs/tslibs/nattype.pyi
+++ b/pandas/_libs/tslibs/nattype.pyi
@@ -1,7 +1,5 @@
from datetime import (
- date as date_,
datetime,
- time as time_,
timedelta,
tzinfo as _tzinfo,
)
@@ -99,7 +97,6 @@ class NaTType:
ambiguous: bool | Literal["raise"] | NaTType = ...,
nonexistent: TimestampNonexistent = ...,
) -> NaTType: ...
- def combine(cls, date: date_, time: time_) -> NoReturn: ...
@property
def tzinfo(self) -> None: ...
@property
diff --git a/pandas/_libs/tslibs/offsets.pyi b/pandas/_libs/tslibs/offsets.pyi
index f9f56d38c5e0a..ad579a5e41522 100644
--- a/pandas/_libs/tslibs/offsets.pyi
+++ b/pandas/_libs/tslibs/offsets.pyi
@@ -1,3 +1,4 @@
+from collections.abc import Collection
from datetime import (
datetime,
time,
@@ -5,7 +6,6 @@ from datetime import (
)
from typing import (
Any,
- Collection,
Literal,
TypeVar,
overload,
diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx
index fb89f1328529d..b443aa7bede22 100644
--- a/pandas/_libs/tslibs/strptime.pyx
+++ b/pandas/_libs/tslibs/strptime.pyx
@@ -444,6 +444,9 @@ def array_strptime(
else:
val = str(val)
+ out_local = 0
+ out_tzoffset = 0
+
if fmt == "ISO8601":
string_to_dts_succeeded = not string_to_dts(
val, &dts, &out_bestunit, &out_local,
diff --git a/pandas/_libs/tslibs/timezones.pyi b/pandas/_libs/tslibs/timezones.pyi
index 4e9f0c6ae6c33..26ffa568a8480 100644
--- a/pandas/_libs/tslibs/timezones.pyi
+++ b/pandas/_libs/tslibs/timezones.pyi
@@ -1,8 +1,8 @@
+from collections.abc import Callable
from datetime import (
datetime,
tzinfo,
)
-from typing import Callable
import numpy as np
diff --git a/pandas/_libs/tslibs/tzconversion.pyi b/pandas/_libs/tslibs/tzconversion.pyi
index 2108fa0f35547..07ee46858577a 100644
--- a/pandas/_libs/tslibs/tzconversion.pyi
+++ b/pandas/_libs/tslibs/tzconversion.pyi
@@ -1,8 +1,8 @@
+from collections.abc import Iterable
from datetime import (
timedelta,
tzinfo,
)
-from typing import Iterable
import numpy as np
diff --git a/pandas/_libs/window/aggregations.pyi b/pandas/_libs/window/aggregations.pyi
index b4bdd7e05cf0e..99413751cd5c2 100644
--- a/pandas/_libs/window/aggregations.pyi
+++ b/pandas/_libs/window/aggregations.pyi
@@ -1,6 +1,6 @@
+from collections.abc import Callable
from typing import (
Any,
- Callable,
Literal,
)
diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx
index 04b3f8ab461fa..0c8ea28b60ce8 100644
--- a/pandas/_libs/window/aggregations.pyx
+++ b/pandas/_libs/window/aggregations.pyx
@@ -1354,8 +1354,8 @@ def roll_quantile(const float64_t[:] values, ndarray[int64_t] start,
if interpolation_type == LINEAR:
vlow = skiplist_get(skiplist, idx, &ret)
vhigh = skiplist_get(skiplist, idx + 1, &ret)
- output[i] = ((vlow + (vhigh - vlow) *
- (idx_with_fraction - idx)))
+ output[i] = (vlow + (vhigh - vlow) *
+ (idx_with_fraction - idx))
elif interpolation_type == LOWER:
output[i] = skiplist_get(skiplist, idx, &ret)
elif interpolation_type == HIGHER:
diff --git a/pandas/_libs/window/meson.build b/pandas/_libs/window/meson.build
index 1d49bba47e139..8c00a98b1241a 100644
--- a/pandas/_libs/window/meson.build
+++ b/pandas/_libs/window/meson.build
@@ -1,7 +1,14 @@
+cy_args = ['-X always_allow_keywords=true']
+# Use shared utility code to reduce wheel sizes
+# copied from https://github.com/scikit-learn/scikit-learn/pull/31151/files
+if cy.version().version_compare('>=3.1.0')
+ cy_args += ['--shared=pandas._libs._cyutility']
+endif
+
py.extension_module(
'aggregations',
['aggregations.pyx'],
- cython_args: ['-X always_allow_keywords=true'],
+ cython_args: cy_args,
include_directories: [inc_np, inc_pd],
subdir: 'pandas/_libs/window',
override_options: ['cython_language=cpp'],
@@ -11,7 +18,7 @@ py.extension_module(
py.extension_module(
'indexers',
['indexers.pyx'],
- cython_args: ['-X always_allow_keywords=true'],
+ cython_args: cy_args,
include_directories: [inc_np, inc_pd],
subdir: 'pandas/_libs/window',
install: true,
diff --git a/pandas/_testing/contexts.py b/pandas/_testing/contexts.py
index 99826de51e1bf..da147c117ad43 100644
--- a/pandas/_testing/contexts.py
+++ b/pandas/_testing/contexts.py
@@ -3,6 +3,7 @@
from contextlib import contextmanager
import os
from pathlib import Path
+import sys
import tempfile
from typing import (
IO,
@@ -81,7 +82,9 @@ def setTZ(tz) -> None:
pass
else:
os.environ["TZ"] = tz
- time.tzset()
+ # Next line allows typing checks to pass on Windows
+ if sys.platform != "win32":
+ time.tzset()
orig_tz = os.environ.get("TZ")
setTZ(tz)
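Reviewer note: `time.tzset` is POSIX-only, so type checkers run against the Windows stubs flag the attribute unless the call is gated on the platform. The same pattern in isolation:

    # time.tzset does not exist on Windows; gating on sys.platform keeps the call
    # correct at runtime and lets type checkers narrow the platform.
    import os
    import sys
    import time

    def set_timezone(tz: str) -> None:
        os.environ["TZ"] = tz
        if sys.platform != "win32":
            time.tzset()

    set_timezone("UTC")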
diff --git a/pandas/_typing.py b/pandas/_typing.py
index 4365ee85f72e3..889252bb00438 100644
--- a/pandas/_typing.py
+++ b/pandas/_typing.py
@@ -1,5 +1,6 @@
from __future__ import annotations
+from builtins import type as type_t # pyright: ignore[reportUnusedImport]
from collections.abc import (
Callable,
Hashable,
@@ -20,22 +21,23 @@
TYPE_CHECKING,
Any,
Literal,
- Optional,
Protocol,
- Type as type_t,
+ TypeAlias,
TypeVar,
Union,
overload,
)
import numpy as np
+import numpy.typing as npt
# To prevent import cycles place any internal imports in the branch below
# and use a string literal forward reference to it in subsequent types
# https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles
-if TYPE_CHECKING:
- import numpy.typing as npt
+# Note that Union is needed when a Union includes a pandas type
+
+if TYPE_CHECKING:
from pandas._libs import (
NaTType,
Period,
@@ -76,19 +78,12 @@
from pandas.io.formats.format import EngFormatter
from pandas.tseries.holiday import AbstractHolidayCalendar
- ScalarLike_co = Union[
- int,
- float,
- complex,
- str,
- bytes,
- np.generic,
- ]
+ ScalarLike_co: TypeAlias = int | float | complex | str | bytes | np.generic
# numpy compatible types
- NumpyValueArrayLike = Union[ScalarLike_co, npt.ArrayLike]
+ NumpyValueArrayLike: TypeAlias = ScalarLike_co | npt.ArrayLike
# Name "npt._ArrayLikeInt_co" is not defined [name-defined]
- NumpySorter = Optional[npt._ArrayLikeInt_co] # type: ignore[name-defined]
+ NumpySorter: TypeAlias = npt._ArrayLikeInt_co | None # type: ignore[name-defined]
from typing import (
ParamSpec,
@@ -107,7 +102,6 @@
from typing_extensions import Unpack # pyright: ignore[reportUnusedImport]
else:
- npt: Any = None
ParamSpec: Any = None
Self: Any = None
TypeGuard: Any = None
@@ -120,10 +114,10 @@
# array-like
-ArrayLike = Union["ExtensionArray", np.ndarray]
+ArrayLike: TypeAlias = Union["ExtensionArray", np.ndarray]
ArrayLikeT = TypeVar("ArrayLikeT", "ExtensionArray", np.ndarray)
-AnyArrayLike = Union[ArrayLike, "Index", "Series"]
-TimeArrayLike = Union["DatetimeArray", "TimedeltaArray"]
+AnyArrayLike: TypeAlias = Union[ArrayLike, "Index", "Series"]
+TimeArrayLike: TypeAlias = Union["DatetimeArray", "TimedeltaArray"]
# list-like
@@ -152,31 +146,31 @@ def count(self, value: Any, /) -> int: ...
def __reversed__(self) -> Iterator[_T_co]: ...
-ListLike = Union[AnyArrayLike, SequenceNotStr, range]
+ListLike: TypeAlias = AnyArrayLike | SequenceNotStr | range
# scalars
-PythonScalar = Union[str, float, bool]
-DatetimeLikeScalar = Union["Period", "Timestamp", "Timedelta"]
-PandasScalar = Union["Period", "Timestamp", "Timedelta", "Interval"]
-Scalar = Union[PythonScalar, PandasScalar, np.datetime64, np.timedelta64, date]
-IntStrT = TypeVar("IntStrT", bound=Union[int, str])
-
+PythonScalar: TypeAlias = str | float | bool
+DatetimeLikeScalar: TypeAlias = Union["Period", "Timestamp", "Timedelta"]
+PandasScalar: TypeAlias = Union["Period", "Timestamp", "Timedelta", "Interval"]
+Scalar: TypeAlias = PythonScalar | PandasScalar | np.datetime64 | np.timedelta64 | date
+IntStrT = TypeVar("IntStrT", bound=int | str)
# timestamp and timedelta convertible types
-TimestampConvertibleTypes = Union[
+TimestampConvertibleTypes: TypeAlias = Union[
"Timestamp", date, np.datetime64, np.int64, float, str
]
-TimestampNonexistent = Union[
- Literal["shift_forward", "shift_backward", "NaT", "raise"], timedelta
-]
-TimedeltaConvertibleTypes = Union[
+TimestampNonexistent: TypeAlias = (
+ Literal["shift_forward", "shift_backward", "NaT", "raise"] | timedelta
+)
+
+TimedeltaConvertibleTypes: TypeAlias = Union[
"Timedelta", timedelta, np.timedelta64, np.int64, float, str
]
-Timezone = Union[str, tzinfo]
+Timezone: TypeAlias = str | tzinfo
-ToTimestampHow = Literal["s", "e", "start", "end"]
+ToTimestampHow: TypeAlias = Literal["s", "e", "start", "end"]
# NDFrameT is stricter and ensures that the same subclass of NDFrame always is
# used. E.g. `def func(a: NDFrameT) -> NDFrameT: ...` means that if a
@@ -188,69 +182,66 @@ def __reversed__(self) -> Iterator[_T_co]: ...
FreqIndexT = TypeVar("FreqIndexT", "DatetimeIndex", "PeriodIndex", "TimedeltaIndex")
NumpyIndexT = TypeVar("NumpyIndexT", np.ndarray, "Index")
-AxisInt = int
-Axis = Union[AxisInt, Literal["index", "columns", "rows"]]
-IndexLabel = Union[Hashable, Sequence[Hashable]]
-Level = Hashable
-Shape = tuple[int, ...]
-Suffixes = Sequence[Optional[str]]
-Ordered = Optional[bool]
-JSONSerializable = Optional[Union[PythonScalar, list, dict]]
-Frequency = Union[str, "BaseOffset"]
-Axes = ListLike
-
-RandomState = Union[
- int,
- np.ndarray,
- np.random.Generator,
- np.random.BitGenerator,
- np.random.RandomState,
-]
+AxisInt: TypeAlias = int
+Axis: TypeAlias = AxisInt | Literal["index", "columns", "rows"]
+IndexLabel: TypeAlias = Hashable | Sequence[Hashable]
+Level: TypeAlias = Hashable
+Shape: TypeAlias = tuple[int, ...]
+Suffixes: TypeAlias = Sequence[str | None]
+Ordered: TypeAlias = bool | None
+JSONSerializable: TypeAlias = PythonScalar | list | dict | None
+Frequency: TypeAlias = Union[str, "BaseOffset"]
+Axes: TypeAlias = ListLike
+
+RandomState: TypeAlias = (
+ int
+ | np.ndarray
+ | np.random.Generator
+ | np.random.BitGenerator
+ | np.random.RandomState
+)
+
# dtypes
-NpDtype = Union[str, np.dtype, type_t[Union[str, complex, bool, object]]]
-Dtype = Union["ExtensionDtype", NpDtype]
-AstypeArg = Union["ExtensionDtype", "npt.DTypeLike"]
+NpDtype: TypeAlias = str | np.dtype | type[str | complex | bool | object]
+Dtype: TypeAlias = Union["ExtensionDtype", NpDtype]
+AstypeArg: TypeAlias = Union["ExtensionDtype", npt.DTypeLike]
# DtypeArg specifies all allowable dtypes in a functions its dtype argument
-DtypeArg = Union[Dtype, Mapping[Hashable, Dtype]]
-DtypeObj = Union[np.dtype, "ExtensionDtype"]
+DtypeArg: TypeAlias = Dtype | Mapping[Hashable, Dtype]
+DtypeObj: TypeAlias = Union[np.dtype, "ExtensionDtype"]
# converters
-ConvertersArg = dict[Hashable, Callable[[Dtype], Dtype]]
+ConvertersArg: TypeAlias = dict[Hashable, Callable[[Dtype], Dtype]]
# parse_dates
-ParseDatesArg = Union[
- bool, list[Hashable], list[list[Hashable]], dict[Hashable, list[Hashable]]
-]
+ParseDatesArg: TypeAlias = (
+ bool | list[Hashable] | list[list[Hashable]] | dict[Hashable, list[Hashable]]
+)
# For functions like rename that convert one label to another
-Renamer = Union[Mapping[Any, Hashable], Callable[[Any], Hashable]]
+Renamer: TypeAlias = Mapping[Any, Hashable] | Callable[[Any], Hashable]
# to maintain type information across generic functions and parametrization
T = TypeVar("T")
# used in decorators to preserve the signature of the function it decorates
# see https://mypy.readthedocs.io/en/stable/generics.html#declaring-decorators
-FuncType = Callable[..., Any]
+FuncType: TypeAlias = Callable[..., Any]
F = TypeVar("F", bound=FuncType)
TypeT = TypeVar("TypeT", bound=type)
# types of vectorized key functions for DataFrame::sort_values and
# DataFrame::sort_index, among others
-ValueKeyFunc = Optional[Callable[["Series"], Union["Series", AnyArrayLike]]]
-IndexKeyFunc = Optional[Callable[["Index"], Union["Index", AnyArrayLike]]]
+ValueKeyFunc: TypeAlias = Callable[["Series"], Union["Series", AnyArrayLike]] | None
+IndexKeyFunc: TypeAlias = Callable[["Index"], Union["Index", AnyArrayLike]] | None
# types of `func` kwarg for DataFrame.aggregate and Series.aggregate
-AggFuncTypeBase = Union[Callable, str]
-AggFuncTypeDict = MutableMapping[
- Hashable, Union[AggFuncTypeBase, list[AggFuncTypeBase]]
+AggFuncTypeBase: TypeAlias = Callable | str
+AggFuncTypeDict: TypeAlias = MutableMapping[
+ Hashable, AggFuncTypeBase | list[AggFuncTypeBase]
]
-AggFuncType = Union[
- AggFuncTypeBase,
- list[AggFuncTypeBase],
- AggFuncTypeDict,
-]
-AggObjType = Union[
+AggFuncType: TypeAlias = AggFuncTypeBase | list[AggFuncTypeBase] | AggFuncTypeDict
+AggObjType: TypeAlias = Union[
"Series",
"DataFrame",
"GroupBy",
@@ -260,7 +251,7 @@ def __reversed__(self) -> Iterator[_T_co]: ...
"Resampler",
]
-PythonFuncType = Callable[[Any], Any]
+PythonFuncType: TypeAlias = Callable[[Any], Any]
# filenames and file-like-objects
AnyStr_co = TypeVar("AnyStr_co", str, bytes, covariant=True)
@@ -330,31 +321,30 @@ def closed(self) -> bool:
...
-FilePath = Union[str, "PathLike[str]"]
+FilePath: TypeAlias = str | PathLike[str]
# for arbitrary kwargs passed during reading/writing files
-StorageOptions = Optional[dict[str, Any]]
-
+StorageOptions: TypeAlias = dict[str, Any] | None
# compression keywords and compression
-CompressionDict = dict[str, Any]
-CompressionOptions = Optional[
- Union[Literal["infer", "gzip", "bz2", "zip", "xz", "zstd", "tar"], CompressionDict]
-]
+CompressionDict: TypeAlias = dict[str, Any]
+CompressionOptions: TypeAlias = (
+ Literal["infer", "gzip", "bz2", "zip", "xz", "zstd", "tar"] | CompressionDict | None
+)
# types in DataFrameFormatter
-FormattersType = Union[
- list[Callable], tuple[Callable, ...], Mapping[Union[str, int], Callable]
-]
-ColspaceType = Mapping[Hashable, Union[str, int]]
-FloatFormatType = Union[str, Callable, "EngFormatter"]
-ColspaceArgType = Union[
- str, int, Sequence[Union[str, int]], Mapping[Hashable, Union[str, int]]
-]
+FormattersType: TypeAlias = (
+ list[Callable] | tuple[Callable, ...] | Mapping[str | int, Callable]
+)
+ColspaceType: TypeAlias = Mapping[Hashable, str | int]
+FloatFormatType: TypeAlias = Union[str, Callable, "EngFormatter"]
+ColspaceArgType: TypeAlias = (
+ str | int | Sequence[str | int] | Mapping[Hashable, str | int]
+)
# Arguments for fillna()
-FillnaOptions = Literal["backfill", "bfill", "ffill", "pad"]
-InterpolateOptions = Literal[
+FillnaOptions: TypeAlias = Literal["backfill", "bfill", "ffill", "pad"]
+InterpolateOptions: TypeAlias = Literal[
"linear",
"time",
"index",
@@ -376,7 +366,7 @@ def closed(self) -> bool:
]
# internals
-Manager = Union["BlockManager", "SingleBlockManager"]
+Manager: TypeAlias = Union["BlockManager", "SingleBlockManager"]
# indexing
# PositionalIndexer -> valid 1D positional indexer, e.g. can pass
@@ -389,63 +379,62 @@ def closed(self) -> bool:
# https://github.com/python/typing/issues/684#issuecomment-548203158
# https://bugs.python.org/issue41810
# Using List[int] here rather than Sequence[int] to disallow tuples.
-ScalarIndexer = Union[int, np.integer]
-SequenceIndexer = Union[slice, list[int], np.ndarray]
-PositionalIndexer = Union[ScalarIndexer, SequenceIndexer]
-PositionalIndexerTuple = tuple[PositionalIndexer, PositionalIndexer]
-PositionalIndexer2D = Union[PositionalIndexer, PositionalIndexerTuple]
-if TYPE_CHECKING:
- TakeIndexer = Union[Sequence[int], Sequence[np.integer], npt.NDArray[np.integer]]
-else:
- TakeIndexer = Any
+ScalarIndexer: TypeAlias = int | np.integer
+SequenceIndexer: TypeAlias = slice | list[int] | np.ndarray
+PositionalIndexer: TypeAlias = ScalarIndexer | SequenceIndexer
+PositionalIndexerTuple: TypeAlias = tuple[PositionalIndexer, PositionalIndexer]
+PositionalIndexer2D: TypeAlias = PositionalIndexer | PositionalIndexerTuple
+TakeIndexer: TypeAlias = Sequence[int] | Sequence[np.integer] | npt.NDArray[np.integer]
# Shared by functions such as drop and astype
-IgnoreRaise = Literal["ignore", "raise"]
+IgnoreRaise: TypeAlias = Literal["ignore", "raise"]
# Windowing rank methods
-WindowingRankType = Literal["average", "min", "max"]
+WindowingRankType: TypeAlias = Literal["average", "min", "max"]
# read_csv engines
-CSVEngine = Literal["c", "python", "pyarrow", "python-fwf"]
+CSVEngine: TypeAlias = Literal["c", "python", "pyarrow", "python-fwf"]
# read_json engines
-JSONEngine = Literal["ujson", "pyarrow"]
+JSONEngine: TypeAlias = Literal["ujson", "pyarrow"]
# read_xml parsers
-XMLParsers = Literal["lxml", "etree"]
+XMLParsers: TypeAlias = Literal["lxml", "etree"]
# read_html flavors
-HTMLFlavors = Literal["lxml", "html5lib", "bs4"]
+HTMLFlavors: TypeAlias = Literal["lxml", "html5lib", "bs4"]
# Interval closed type
-IntervalLeftRight = Literal["left", "right"]
-IntervalClosedType = Union[IntervalLeftRight, Literal["both", "neither"]]
+IntervalLeftRight: TypeAlias = Literal["left", "right"]
+IntervalClosedType: TypeAlias = IntervalLeftRight | Literal["both", "neither"]
# datetime and NaTType
-DatetimeNaTType = Union[datetime, "NaTType"]
-DateTimeErrorChoices = Literal["raise", "coerce"]
+DatetimeNaTType: TypeAlias = Union[datetime, "NaTType"]
+DateTimeErrorChoices: TypeAlias = Literal["raise", "coerce"]
# sort_index
-SortKind = Literal["quicksort", "mergesort", "heapsort", "stable"]
-NaPosition = Literal["first", "last"]
+SortKind: TypeAlias = Literal["quicksort", "mergesort", "heapsort", "stable"]
+NaPosition: TypeAlias = Literal["first", "last"]
# Arguments for nsmallest and nlargest
-NsmallestNlargestKeep = Literal["first", "last", "all"]
+NsmallestNlargestKeep: TypeAlias = Literal["first", "last", "all"]
# quantile interpolation
-QuantileInterpolation = Literal["linear", "lower", "higher", "midpoint", "nearest"]
+QuantileInterpolation: TypeAlias = Literal[
+ "linear", "lower", "higher", "midpoint", "nearest"
+]
# plotting
-PlottingOrientation = Literal["horizontal", "vertical"]
+PlottingOrientation: TypeAlias = Literal["horizontal", "vertical"]
# dropna
-AnyAll = Literal["any", "all"]
+AnyAll: TypeAlias = Literal["any", "all"]
# merge
-MergeHow = Literal[
+MergeHow: TypeAlias = Literal[
"left", "right", "inner", "outer", "cross", "left_anti", "right_anti"
]
-MergeValidate = Literal[
+MergeValidate: TypeAlias = Literal[
"one_to_one",
"1:1",
"one_to_many",
@@ -457,8 +446,8 @@ def closed(self) -> bool:
]
# join
-JoinHow = Literal["left", "right", "inner", "outer"]
-JoinValidate = Literal[
+JoinHow: TypeAlias = Literal["left", "right", "inner", "outer"]
+JoinValidate: TypeAlias = Literal[
"one_to_one",
"1:1",
"one_to_many",
@@ -470,25 +459,28 @@ def closed(self) -> bool:
]
# reindex
-ReindexMethod = Union[FillnaOptions, Literal["nearest"]]
+ReindexMethod: TypeAlias = FillnaOptions | Literal["nearest"]
-MatplotlibColor = Union[str, Sequence[float]]
-TimeGrouperOrigin = Union[
+MatplotlibColor: TypeAlias = str | Sequence[float]
+TimeGrouperOrigin: TypeAlias = Union[
"Timestamp", Literal["epoch", "start", "start_day", "end", "end_day"]
]
-TimeAmbiguous = Union[Literal["infer", "NaT", "raise"], "npt.NDArray[np.bool_]"]
-TimeNonexistent = Union[
- Literal["shift_forward", "shift_backward", "NaT", "raise"], timedelta
-]
-DropKeep = Literal["first", "last", False]
-CorrelationMethod = Union[
- Literal["pearson", "kendall", "spearman"], Callable[[np.ndarray, np.ndarray], float]
-]
-AlignJoin = Literal["outer", "inner", "left", "right"]
-DtypeBackend = Literal["pyarrow", "numpy_nullable"]
+TimeAmbiguous: TypeAlias = Literal["infer", "NaT", "raise"] | npt.NDArray[np.bool_]
+TimeNonexistent: TypeAlias = (
+ Literal["shift_forward", "shift_backward", "NaT", "raise"] | timedelta
+)
+
+DropKeep: TypeAlias = Literal["first", "last", False]
+CorrelationMethod: TypeAlias = (
+ Literal["pearson", "kendall", "spearman"]
+ | Callable[[np.ndarray, np.ndarray], float]
+)
-TimeUnit = Literal["s", "ms", "us", "ns"]
-OpenFileErrors = Literal[
+AlignJoin: TypeAlias = Literal["outer", "inner", "left", "right"]
+DtypeBackend: TypeAlias = Literal["pyarrow", "numpy_nullable"]
+
+TimeUnit: TypeAlias = Literal["s", "ms", "us", "ns"]
+OpenFileErrors: TypeAlias = Literal[
"strict",
"ignore",
"replace",
@@ -499,34 +491,32 @@ def closed(self) -> bool:
]
# update
-UpdateJoin = Literal["left"]
+UpdateJoin: TypeAlias = Literal["left"]
# applymap
-NaAction = Literal["ignore"]
+NaAction: TypeAlias = Literal["ignore"]
# from_dict
-FromDictOrient = Literal["columns", "index", "tight"]
+FromDictOrient: TypeAlias = Literal["columns", "index", "tight"]
# to_stata
-ToStataByteorder = Literal[">", "<", "little", "big"]
+ToStataByteorder: TypeAlias = Literal[">", "<", "little", "big"]
# ExcelWriter
-ExcelWriterIfSheetExists = Literal["error", "new", "replace", "overlay"]
-ExcelWriterMergeCells = Union[bool, Literal["columns"]]
+ExcelWriterIfSheetExists: TypeAlias = Literal["error", "new", "replace", "overlay"]
+ExcelWriterMergeCells: TypeAlias = bool | Literal["columns"]
# Offsets
-OffsetCalendar = Union[np.busdaycalendar, "AbstractHolidayCalendar"]
+OffsetCalendar: TypeAlias = Union[np.busdaycalendar, "AbstractHolidayCalendar"]
# read_csv: usecols
-UsecolsArgType = Union[
- SequenceNotStr[Hashable],
- range,
- AnyArrayLike,
- Callable[[HashableT], bool],
- None,
-]
+UsecolsArgType: TypeAlias = (
+ SequenceNotStr[Hashable] | range | AnyArrayLike | Callable[[HashableT], bool] | None
+)
# maintain the sub-type of any hashable sequence
SequenceT = TypeVar("SequenceT", bound=Sequence[Hashable])
-SliceType = Optional[Hashable]
+SliceType: TypeAlias = Hashable | None
+
+__all__ = ["type_t"]
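Reviewer note: the rewritten aliases keep `Union[...]` only where a member is a string forward reference to a pandas class, because PEP 604 unions are evaluated at import time and a bare string has no `__or__`. A minimal sketch of the distinction, using `Decimal` as a stand-in for a type that is only imported under TYPE_CHECKING:

    from typing import TYPE_CHECKING, TypeAlias, Union

    if TYPE_CHECKING:
        from decimal import Decimal  # stand-in for a type only needed by the checker

    IntOrStr: TypeAlias = int | str                   # fine: both operands are real types
    MaybeDecimal: TypeAlias = Union["Decimal", None]  # "Decimal" | None would raise TypeError at runtime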
diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py
index 5c2e3d9b07c22..f01dfab0de829 100644
--- a/pandas/compat/_optional.py
+++ b/pandas/compat/_optional.py
@@ -23,13 +23,12 @@
"adbc-driver-postgresql": "0.10.0",
"adbc-driver-sqlite": "0.8.0",
"bs4": "4.12.3",
- "blosc": "1.21.3",
"bottleneck": "1.3.6",
"fastparquet": "2024.2.0",
- "fsspec": "2024.2.0",
+ "fsspec": "2023.12.2",
"html5lib": "1.1",
"hypothesis": "6.84.0",
- "gcsfs": "2024.2.0",
+ "gcsfs": "2023.12.2",
"jinja2": "3.1.3",
"lxml.etree": "4.9.2",
"matplotlib": "3.8.3",
@@ -40,12 +39,13 @@
"psycopg2": "2.9.6", # (dt dec pq3 ext lo64)
"pymysql": "1.1.0",
"pyarrow": "10.0.1",
+ "pyiceberg": "0.7.1",
"pyreadstat": "1.2.6",
"pytest": "7.3.2",
"python-calamine": "0.1.7",
"pytz": "2023.4",
"pyxlsb": "1.0.10",
- "s3fs": "2024.2.0",
+ "s3fs": "2023.12.2",
"scipy": "1.12.0",
"sqlalchemy": "2.0.0",
"tables": "3.8.0",
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index e6847b380a7e8..7fc391d3ffb51 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -47,6 +47,7 @@
is_bool_dtype,
is_complex_dtype,
is_dict_like,
+ is_dtype_equal,
is_extension_array_dtype,
is_float,
is_float_dtype,
@@ -511,6 +512,7 @@ def isin(comps: ListLike, values: ListLike) -> npt.NDArray[np.bool_]:
len(values) > 0
and values.dtype.kind in "iufcb"
and not is_signed_integer_dtype(comps)
+ and not is_dtype_equal(values, comps)
):
# GH#46485 Use object to avoid upcast to float64 later
# TODO: Share with _find_common_type_compat
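Reviewer note: the added `is_dtype_equal` check restricts the object-dtype fallback (used to avoid a lossy upcast to float64) to the case where `values` and `comps` genuinely carry different dtypes. The helper itself, for reference:

    # is_dtype_equal compares dtypes across the different ways they can be spelled.
    import numpy as np
    from pandas.api.types import is_dtype_equal

    print(is_dtype_equal(np.dtype("int64"), "int64"))  # True
    print(is_dtype_equal("uint64", "int64"))           # False
    print(is_dtype_equal(int, float))                  # False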
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index d7187b57a69e4..0b90bcea35100 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -33,7 +33,6 @@
infer_dtype_from_scalar,
)
from pandas.core.dtypes.common import (
- CategoricalDtype,
is_array_like,
is_bool_dtype,
is_float_dtype,
@@ -730,9 +729,7 @@ def __setstate__(self, state) -> None:
def _cmp_method(self, other, op) -> ArrowExtensionArray:
pc_func = ARROW_CMP_FUNCS[op.__name__]
- if isinstance(
- other, (ArrowExtensionArray, np.ndarray, list, BaseMaskedArray)
- ) or isinstance(getattr(other, "dtype", None), CategoricalDtype):
+ if isinstance(other, (ExtensionArray, np.ndarray, list)):
try:
result = pc_func(self._pa_array, self._box_pa(other))
except pa.ArrowNotImplementedError:
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
index df1aa21e9203c..3d2ad109a55ba 100644
--- a/pandas/core/arrays/categorical.py
+++ b/pandas/core/arrays/categorical.py
@@ -1666,7 +1666,7 @@ def __array__(
Parameters
----------
dtype : np.dtype or None
- Specifies the the dtype for the array.
+ Specifies the dtype for the array.
copy : bool or None, optional
See :func:`numpy.asarray`.
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 7227ea77ca433..8048306df91a2 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -123,10 +123,10 @@ class StringDtype(StorageExtensionDtype):
Examples
--------
>>> pd.StringDtype()
- string[python]
+ <StringDtype(na_value=<NA>)>
>>> pd.StringDtype(storage="pyarrow")
- string[pyarrow]
+ <StringDtype(na_value=<NA>)>
"""
@property
@@ -198,11 +198,8 @@ def __init__(
self._na_value = na_value
def __repr__(self) -> str:
- if self._na_value is libmissing.NA:
- return f"{self.name}[{self.storage}]"
- else:
- # TODO add more informative repr
- return self.name
+ storage = "" if self.storage == "pyarrow" else "storage='python', "
+ return f""
def __eq__(self, other: object) -> bool:
# we need to override the base class __eq__ because na_value (NA or NaN)
@@ -1018,7 +1015,30 @@ def searchsorted(
return super().searchsorted(value=value, side=side, sorter=sorter)
def _cmp_method(self, other, op):
- from pandas.arrays import BooleanArray
+ from pandas.arrays import (
+ ArrowExtensionArray,
+ BooleanArray,
+ )
+
+ if (
+ isinstance(other, BaseStringArray)
+ and self.dtype.na_value is not libmissing.NA
+ and other.dtype.na_value is libmissing.NA
+ ):
+ # NA has priority over NaN semantics
+ return NotImplemented
+
+ if isinstance(other, ArrowExtensionArray):
+ if isinstance(other, BaseStringArray):
+ # pyarrow storage has priority over python storage
+ # (except if we have NA semantics and other not)
+ if not (
+ self.dtype.na_value is libmissing.NA
+ and other.dtype.na_value is not libmissing.NA
+ ):
+ return NotImplemented
+ else:
+ return NotImplemented
if isinstance(other, StringArray):
other = other._ndarray
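Reviewer note: with the angle brackets restored in the hunk above, the repr now spells out the dtype parameters instead of the bare name. A hedged usage example (exact output depends on the installed defaults):

    import pandas as pd

    # Indicative output only; the storage and na_value shown depend on configuration.
    print(repr(pd.StringDtype(storage="python")))   # e.g. <StringDtype(storage='python', na_value=<NA>)>
    print(repr(pd.StringDtype(storage="pyarrow")))  # e.g. <StringDtype(na_value=<NA>)>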
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index a39d64429d162..9668981df827b 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -473,6 +473,14 @@ def value_counts(self, dropna: bool = True) -> Series:
return result
def _cmp_method(self, other, op):
+ if (
+ isinstance(other, (BaseStringArray, ArrowExtensionArray))
+ and self.dtype.na_value is not libmissing.NA
+ and other.dtype.na_value is libmissing.NA
+ ):
+ # NA has priority over NaN semantics
+ return NotImplemented
+
result = super()._cmp_method(other, op)
if self.dtype.na_value is np.nan:
if op == operator.ne:
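Reviewer note: taken together, the two `_cmp_method` guards establish a priority order for mixed string comparisons: NA semantics take precedence over NaN semantics, and pyarrow storage over python storage, by returning `NotImplemented` so the other operand's implementation runs. A hedged sketch of the user-visible effect (assumes pyarrow is installed):

    import pandas as pd

    left = pd.array(["a", "b"], dtype=pd.StringDtype("python"))
    right = pd.array(["a", "c"], dtype=pd.StringDtype("pyarrow"))
    # Expected: an element-wise boolean result [True, False], produced by whichever
    # side has priority rather than raising.
    print(left == right)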
diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py
index 166c9d47294cd..77b7d9ad11a6c 100644
--- a/pandas/core/computation/pytables.py
+++ b/pandas/core/computation/pytables.py
@@ -239,7 +239,8 @@ def stringify(value):
if conv_val not in metadata:
result = -1
else:
- result = metadata.searchsorted(conv_val, side="left")
+ # Find the index of the first match of conv_val in metadata
+ result = np.flatnonzero(metadata == conv_val)[0]
return TermValue(result, result, "integer")
elif kind == "integer":
try:
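Reviewer note: `searchsorted` assumes the metadata array is sorted, which categorical metadata need not be; `np.flatnonzero` returns the position of the actual match. The difference in isolation:

    import numpy as np

    metadata = np.array(["b", "a", "c"], dtype=object)
    conv_val = "a"
    print(metadata.searchsorted(conv_val, side="left"))  # 0: binary search assumes sorted input
    print(np.flatnonzero(metadata == conv_val)[0])       # 1: index of the real match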
diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
index e92f2363b69f1..68d99937f728c 100644
--- a/pandas/core/dtypes/common.py
+++ b/pandas/core/dtypes/common.py
@@ -655,24 +655,38 @@ def is_dtype_equal(source, target) -> bool:
Parameters
----------
- source : The first dtype to compare
- target : The second dtype to compare
+ source : type or str
+ The first dtype to compare.
+ target : type or str
+ The second dtype to compare.
Returns
-------
boolean
Whether or not the two dtypes are equal.
+ See Also
+ --------
+ api.types.is_categorical_dtype : Check whether the provided array or dtype
+ is of the Categorical dtype.
+ api.types.is_string_dtype : Check whether the provided array or dtype
+ is of the string dtype.
+ api.types.is_object_dtype : Check whether an array-like or dtype is of the
+ object dtype.
+
Examples
--------
+ >>> from pandas.api.types import is_dtype_equal
>>> is_dtype_equal(int, float)
False
>>> is_dtype_equal("int", int)
True
>>> is_dtype_equal(object, "category")
False
+ >>> from pandas.core.dtypes.dtypes import CategoricalDtype
>>> is_dtype_equal(CategoricalDtype(), "category")
True
+ >>> from pandas.core.dtypes.dtypes import DatetimeTZDtype
>>> is_dtype_equal(DatetimeTZDtype(tz="UTC"), "datetime64")
False
"""
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 6158e19737185..8053c17437c5e 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -112,6 +112,10 @@
BaseMaskedDtype,
ExtensionDtype,
)
+from pandas.core.dtypes.generic import (
+ ABCIndex,
+ ABCSeries,
+)
from pandas.core.dtypes.missing import (
isna,
notna,
@@ -333,15 +337,15 @@
to SQL left anti join; preserve key order.
* right_anti: use only keys from right frame that are not in left frame, similar
to SQL right anti join; preserve key order.
-on : label or list
+on : Hashable or a sequence of the previous
Column or index level names to join on. These must be found in both
DataFrames. If `on` is None and not merging on indexes then this defaults
to the intersection of the columns in both DataFrames.
-left_on : label or list, or array-like
+left_on : Hashable or a sequence of the previous, or array-like
Column or index level names to join on in the left DataFrame. Can also
be an array or list of arrays of the length of the left DataFrame.
These arrays are treated as if they are columns.
-right_on : label or list, or array-like
+right_on : Hashable or a sequence of the previous, or array-like
Column or index level names to join on in the right DataFrame. Can also
be an array or list of arrays of the length of the right DataFrame.
These arrays are treated as if they are columns.
@@ -795,12 +799,12 @@ def __init__(
dtype,
copy,
)
- elif getattr(data, "name", None) is not None:
+ elif isinstance(data, (ABCSeries, ABCIndex)) and data.name is not None:
# i.e. Series/Index with non-None name
mgr = dict_to_mgr(
# error: Item "ndarray" of "Union[ndarray, Series, Index]" has no
# attribute "name"
- {data.name: data}, # type: ignore[union-attr]
+ {data.name: data},
index,
columns,
dtype=dtype,
@@ -3543,6 +3547,62 @@ def to_xml(
return xml_formatter.write_output()
+ def to_iceberg(
+ self,
+ table_identifier: str,
+ catalog_name: str | None = None,
+ *,
+ catalog_properties: dict[str, Any] | None = None,
+ location: str | None = None,
+ append: bool = False,
+ snapshot_properties: dict[str, str] | None = None,
+ ) -> None:
+ """
+ Write a DataFrame to an Apache Iceberg table.
+
+ .. versionadded:: 3.0.0
+
+ .. warning::
+
+ to_iceberg is experimental and may change without warning.
+
+ Parameters
+ ----------
+ table_identifier : str
+ Table identifier.
+ catalog_name : str, optional
+ The name of the catalog.
+ catalog_properties : dict of {str: str}, optional
+ The properties that are used next to the catalog configuration.
+ location : str, optional
+ Location for the table.
+ append : bool, default False
+ If ``True``, append data to the table, instead of replacing the content.
+ snapshot_properties : dict of {str: str}, optional
+ Custom properties to be added to the snapshot summary
+
+ See Also
+ --------
+ read_iceberg : Read an Apache Iceberg table.
+ DataFrame.to_parquet : Write a DataFrame in Parquet format.
+
+ Examples
+ --------
+ >>> df = pd.DataFrame(data={"col1": [1, 2], "col2": [4, 3]})
+ >>> df.to_iceberg("my_table", catalog_name="my_catalog") # doctest: +SKIP
+ """
+ from pandas.io.iceberg import to_iceberg
+
+ to_iceberg(
+ self,
+ table_identifier,
+ catalog_name,
+ catalog_properties=catalog_properties,
+ location=location,
+ append=append,
+ snapshot_properties=snapshot_properties,
+ )
+
# ----------------------------------------------------------------------
@doc(INFO_DOCSTRING, **frame_sub_kwargs)
def info(
@@ -4477,18 +4537,58 @@ def _get_item(self, item: Hashable) -> Series:
@overload
def query(
- self, expr: str, *, inplace: Literal[False] = ..., **kwargs
+ self,
+ expr: str,
+ *,
+ parser: Literal["pandas", "python"] = ...,
+ engine: Literal["python", "numexpr"] | None = ...,
+ local_dict: dict[str, Any] | None = ...,
+ global_dict: dict[str, Any] | None = ...,
+ resolvers: list[Mapping] | None = ...,
+ level: int = ...,
+ inplace: Literal[False] = ...,
) -> DataFrame: ...
@overload
- def query(self, expr: str, *, inplace: Literal[True], **kwargs) -> None: ...
+ def query(
+ self,
+ expr: str,
+ *,
+ parser: Literal["pandas", "python"] = ...,
+ engine: Literal["python", "numexpr"] | None = ...,
+ local_dict: dict[str, Any] | None = ...,
+ global_dict: dict[str, Any] | None = ...,
+ resolvers: list[Mapping] | None = ...,
+ level: int = ...,
+ inplace: Literal[True],
+ ) -> None: ...
@overload
def query(
- self, expr: str, *, inplace: bool = ..., **kwargs
+ self,
+ expr: str,
+ *,
+ parser: Literal["pandas", "python"] = ...,
+ engine: Literal["python", "numexpr"] | None = ...,
+ local_dict: dict[str, Any] | None = ...,
+ global_dict: dict[str, Any] | None = ...,
+ resolvers: list[Mapping] | None = ...,
+ level: int = ...,
+ inplace: bool = ...,
) -> DataFrame | None: ...
- def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | None:
+ def query(
+ self,
+ expr: str,
+ *,
+ parser: Literal["pandas", "python"] = "pandas",
+ engine: Literal["python", "numexpr"] | None = None,
+ local_dict: dict[str, Any] | None = None,
+ global_dict: dict[str, Any] | None = None,
+ resolvers: list[Mapping] | None = None,
+ level: int = 0,
+ inplace: bool = False,
+ ) -> DataFrame | None:
"""
Query the columns of a DataFrame with a boolean expression.
@@ -4507,11 +4607,41 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | No
See the documentation for :meth:`DataFrame.eval` for details on
referring to column names and variables in the query string.
+ parser : {'pandas', 'python'}, default 'pandas'
+ The parser to use to construct the syntax tree from the expression. The
+ default of ``'pandas'`` parses code slightly different than standard
+ Python. Alternatively, you can parse an expression using the
+ ``'python'`` parser to retain strict Python semantics. See the
+ :ref:`enhancing performance <enhancingperf.eval>` documentation for
+ more details.
+ engine : {'python', 'numexpr'}, default 'numexpr'
+
+ The engine used to evaluate the expression. Supported engines are
+
+ - None : tries to use ``numexpr``, falls back to ``python``
+ - ``'numexpr'`` : This default engine evaluates pandas objects using
+ numexpr for large speed ups in complex expressions with large frames.
+ - ``'python'`` : Performs operations as if you had ``eval``'d in top
+ level python. This engine is generally not that useful.
+
+ More backends may be available in the future.
+ local_dict : dict or None, optional
+ A dictionary of local variables, taken from locals() by default.
+ global_dict : dict or None, optional
+ A dictionary of global variables, taken from globals() by default.
+ resolvers : list of dict-like or None, optional
+ A list of objects implementing the ``__getitem__`` special method that
+ you can use to inject an additional collection of namespaces to use for
+ variable lookup. For example, this is used in the
+ :meth:`~DataFrame.query` method to inject the
+ ``DataFrame.index`` and ``DataFrame.columns``
+ variables that refer to their respective :class:`~pandas.DataFrame`
+ instance attributes.
+ level : int, optional
+ The number of prior stack frames to traverse and add to the current
+ scope. Most users will **not** need to change this parameter.
inplace : bool
Whether to modify the DataFrame rather than creating a new one.
- **kwargs
- See the documentation for :func:`eval` for complete details
- on the keyword arguments accepted by :meth:`DataFrame.query`.
Returns
-------
@@ -4624,10 +4754,17 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | No
if not isinstance(expr, str):
msg = f"expr must be a string to be evaluated, {type(expr)} given"
raise ValueError(msg)
- kwargs["level"] = kwargs.pop("level", 0) + 1
- kwargs["target"] = None
- res = self.eval(expr, **kwargs)
+ res = self.eval(
+ expr,
+ level=level + 1,
+ parser=parser,
+ target=None,
+ engine=engine,
+ local_dict=local_dict,
+ global_dict=global_dict,
+ resolvers=resolvers or (),
+ )
try:
result = self.loc[res]
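Reviewer note: `query` now names the keywords it forwards to `eval` instead of accepting `**kwargs`, so the overloads above give a real signature. A hedged usage example with the now-explicit parameters:

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    out = df.query(
        "a > @threshold",
        parser="pandas",
        engine="python",
        local_dict={"threshold": 1},
    )
    print(out)  # rows where a > 1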
@@ -7314,7 +7451,7 @@ def value_counts(
Parameters
----------
- subset : label or list of labels, optional
+ subset : Hashable or a sequence of the previous, optional
Columns to use when counting unique combinations.
normalize : bool, default False
Return proportions rather than frequencies.
@@ -7465,7 +7602,7 @@ def nlargest(
----------
n : int
Number of rows to return.
- columns : label or list of labels
+ columns : Hashable or a sequence of the previous
Column label(s) to order by.
keep : {'first', 'last', 'all'}, default 'first'
Where there are duplicate values:
@@ -9177,11 +9314,11 @@ def groupby(
Parameters
----------%s
- columns : str or object or a list of str
+ columns : Hashable or a sequence of the previous
Column to use to make new frame's columns.
- index : str or object or a list of str, optional
+ index : Hashable or a sequence of the previous, optional
Column to use to make new frame's index. If not given, uses existing index.
- values : str, object or a list of the previous, optional
+ values : Hashable or a sequence of the previous, optional
Column(s) to use for populating new frame's values. If not
specified, all remaining columns will be used and the result will
have hierarchically indexed columns.
@@ -9320,12 +9457,12 @@ def pivot(
----------%s
values : list-like or scalar, optional
Column or columns to aggregate.
- index : column, Grouper, array, or list of the previous
+ index : column, Grouper, array, or sequence of the previous
Keys to group by on the pivot table index. If a list is passed,
it can contain any of the other types (except list). If an array is
passed, it must be the same length as the data and will be used in
the same manner as column values.
- columns : column, Grouper, array, or list of the previous
+ columns : column, Grouper, array, or sequence of the previous
Keys to group by on the pivot table column. If a list is passed,
it can contain any of the other types (except list). If an array is
passed, it must be the same length as the data and will be used in
@@ -9854,7 +9991,7 @@ def unstack(
----------
level : int, str, or list of these, default -1 (last level)
Level(s) of index to unstack, can pass level name.
- fill_value : int, str or dict
+ fill_value : scalar
Replace NaN with this value if the unstack produces missing values.
sort : bool, default True
Sort the level(s) in the resulting MultiIndex columns.
@@ -11353,6 +11490,12 @@ def cov(
c -0.150812 0.191417 0.895202
"""
data = self._get_numeric_data() if numeric_only else self
+ if any(blk.dtype.kind in "mM" for blk in self._mgr.blocks):
+ msg = (
+ "DataFrame contains columns with dtype datetime64 "
+ "or timedelta64, which are not supported for cov."
+ )
+ raise TypeError(msg)
cols = data.columns
idx = cols.copy()
mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False)
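Reviewer note: the new `cov` guard rejects frames that still hold datetime64/timedelta64 columns instead of silently coercing them. A hedged example of the error path:

    import pandas as pd

    df = pd.DataFrame(
        {"x": [1.0, 2.0], "ts": pd.to_datetime(["2024-01-01", "2024-01-02"])}
    )
    try:
        df.cov()
    except TypeError as err:
        print(err)  # datetime64/timedelta64 columns are not supported for cov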
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 486a6a2a02be3..8aae4609b1833 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -1645,11 +1645,7 @@ def _is_label_reference(self, key: Level, axis: Axis = 0) -> bool:
axis_int = self._get_axis_number(axis)
other_axes = (ax for ax in range(self._AXIS_LEN) if ax != axis_int)
- return (
- key is not None
- and is_hashable(key)
- and any(key in self.axes[ax] for ax in other_axes)
- )
+ return is_hashable(key) and any(key in self.axes[ax] for ax in other_axes)
@final
def _is_label_or_level_reference(self, key: Level, axis: AxisInt = 0) -> bool:
@@ -3964,7 +3960,7 @@ def take(self, indices, axis: Axis = 0, **kwargs) -> Self:
----------
indices : array-like
An array of ints indicating which positions to take.
- axis : {0 or 'index', 1 or 'columns', None}, default 0
+ axis : {0 or 'index', 1 or 'columns'}, default 0
The axis on which to select elements. ``0`` means that we are
selecting rows, ``1`` means that we are selecting columns.
For `Series` this parameter is unused and defaults to 0.
@@ -6819,12 +6815,12 @@ def convert_dtypes(
2 3 z 20 200.0
>>> dfn.dtypes
- a Int32
- b string[python]
- c boolean
- d string[python]
- e Int64
- f Float64
+ a Int32
+ b string
+ c boolean
+ d string
+ e Int64
+ f Float64
dtype: object
Start with a Series of strings and missing data represented by ``np.nan``.
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index b520ad69aae96..49b80337c700e 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -504,11 +504,13 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs)
# inference. We default to using the existing dtype.
# xref GH#51445
obj = self._obj_with_exclusions
- return self.obj._constructor(
- [],
- name=self.obj.name,
- index=self._grouper.result_index,
- dtype=obj.dtype,
+ return self._wrap_aggregated_output(
+ self.obj._constructor(
+ [],
+ name=self.obj.name,
+ index=self._grouper.result_index,
+ dtype=obj.dtype,
+ )
)
return self._python_agg_general(func, *args, **kwargs)
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index ff3879018674e..4e1ea07907cdb 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -1731,10 +1731,16 @@ def name(self) -> Hashable:
"""
Return Index or MultiIndex name.
+ Returns
+ -------
+ label (hashable object)
+ The name of the Index.
+
See Also
--------
Index.set_names: Able to set new names partially and by level.
Index.rename: Able to set new names partially and by level.
+ Series.name: Corresponding Series property.
Examples
--------
@@ -1907,12 +1913,12 @@ def set_names(self, names, *, level=None, inplace: bool = False) -> Self | None:
Parameters
----------
- names : label or list of label or dict-like for MultiIndex
+ names : Hashable or a sequence of the previous or dict-like for MultiIndex
Name(s) to set.
.. versionchanged:: 1.3.0
- level : int, label or list of int or label, optional
+ level : int, Hashable or a sequence of the previous, optional
If the index is a MultiIndex and names is not dict-like, level(s) to set
(None for all levels). Otherwise level must be None.
@@ -2017,7 +2023,7 @@ def rename(self, name, *, inplace: bool = False) -> Self | None:
Parameters
----------
- name : label or list of labels
+ name : Hashable or a sequence of the previous
Name(s) to set.
inplace : bool, default False
Modifies the object directly, instead of creating a new Index or
@@ -2961,10 +2967,14 @@ def _dti_setop_align_tzs(self, other: Index, setop: str_t) -> tuple[Index, Index
and self.tz is not None
and other.tz is not None
):
- # GH#39328, GH#45357
- left = self.tz_convert("UTC")
- right = other.tz_convert("UTC")
- return left, right
+ # GH#39328, GH#45357, GH#60080
+ # If both timezones are the same, no need to convert to UTC
+ if self.tz == other.tz:
+ return self, other
+ else:
+ left = self.tz_convert("UTC")
+ right = other.tz_convert("UTC")
+ return left, right
return self, other
@final
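Reviewer note: the added branch skips the UTC round-trip when both indexes already share a timezone, so set operations keep that timezone (GH#60080). A hedged illustration of the preserved behaviour:

    import pandas as pd

    left = pd.date_range("2024-01-01", periods=2, tz="Europe/Berlin")
    right = pd.date_range("2024-01-02", periods=2, tz="Europe/Berlin")
    # Expected: the union keeps Europe/Berlin instead of coming back as UTC.
    print(left.union(right).tz)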
diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py
index b990eca39b3dd..c2fbef1089d5a 100644
--- a/pandas/core/interchange/from_dataframe.py
+++ b/pandas/core/interchange/from_dataframe.py
@@ -563,7 +563,6 @@ def set_nulls(
if null_kind == ColumnNullType.USE_SENTINEL:
null_pos = pd.Series(data) == sentinel_val
elif null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK):
- assert validity, "Expected to have a validity buffer for the mask"
valid_buff, valid_dtype = validity
null_pos = buffer_to_ndarray(
valid_buff, valid_dtype, offset=col.offset, length=col.size()
diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py
index 202bebde88c2c..d64c7e33657d4 100644
--- a/pandas/core/internals/__init__.py
+++ b/pandas/core/internals/__init__.py
@@ -6,9 +6,9 @@
)
__all__ = [
- "Block",
+ "Block", # pyright:ignore[reportUnsupportedDunderAll)]
"BlockManager",
- "ExtensionBlock",
+ "ExtensionBlock", # pyright:ignore[reportUnsupportedDunderAll)]
"SingleBlockManager",
"concatenate_managers",
"make_block",
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
index e238bb78bbdfa..cb290fde7095c 100644
--- a/pandas/core/internals/managers.py
+++ b/pandas/core/internals/managers.py
@@ -1800,6 +1800,8 @@ def as_array(
arr = np.asarray(blk.values, dtype=dtype)
else:
arr = np.array(blk.values, dtype=dtype, copy=copy)
+ if passed_nan and blk.dtype.kind in "mM":
+ arr[isna(blk.values)] = na_value
if not copy:
arr = arr.view()
@@ -1865,6 +1867,8 @@ def _interleave(
else:
arr = blk.get_values(dtype)
result[rl.indexer] = arr
+ if na_value is not lib.no_default and blk.dtype.kind in "mM":
+ result[rl.indexer][isna(arr)] = na_value
itemmask[rl.indexer] = 1
if not itemmask.all():
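Reviewer note: both hunks make an explicit `na_value` apply to datetime64/timedelta64 blocks as well when materialising an ndarray, instead of leaving NaT's integer representation behind. A hedged sketch:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"ts": pd.to_datetime(["2024-01-01", None])})
    arr = df.to_numpy(dtype="float64", na_value=np.nan)
    # Expected: the missing timestamp comes out as nan, not as NaT's sentinel integer.
    print(arr)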
diff --git a/pandas/core/ops/invalid.py b/pandas/core/ops/invalid.py
index 395db1617cb63..62aa79a881717 100644
--- a/pandas/core/ops/invalid.py
+++ b/pandas/core/ops/invalid.py
@@ -25,7 +25,7 @@
def invalid_comparison(
left: ArrayLike,
- right: ArrayLike | Scalar,
+ right: ArrayLike | list | Scalar,
op: Callable[[Any, Any], bool],
) -> npt.NDArray[np.bool_]:
"""
diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py
index f4cb82816bbcf..20b4cd2185bb4 100644
--- a/pandas/core/reshape/melt.py
+++ b/pandas/core/reshape/melt.py
@@ -182,6 +182,10 @@ def melt(
value_vars_was_not_none = value_vars is not None
value_vars = ensure_list_vars(value_vars, "value_vars", frame.columns)
+ # GH61475 - prevent AttributeError when duplicate column in id_vars
+ if len(frame.columns.get_indexer_for(id_vars)) > len(id_vars):
+ raise ValueError("id_vars cannot contain duplicate columns.")
+
if id_vars or value_vars:
if col_level is not None:
level = frame.columns.get_level_values(col_level)
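Reviewer note: the guard turns a confusing AttributeError into an explicit error when `id_vars` names a duplicated column. A hedged example of the new failure mode:

    import pandas as pd

    df = pd.DataFrame([[1, 2, 3]], columns=["a", "a", "b"])
    try:
        df.melt(id_vars=["a"], value_vars=["b"])
    except ValueError as err:
        print(err)  # id_vars cannot contain duplicate columns.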
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
index 68d61da0cf7dd..34f3e2c626378 100644
--- a/pandas/core/reshape/merge.py
+++ b/pandas/core/reshape/merge.py
@@ -198,15 +198,15 @@ def merge(
to SQL left anti join; preserve key order.
* right_anti: use only keys from right frame that are not in left frame, similar
to SQL right anti join; preserve key order.
- on : label or list
+ on : Hashable or a sequence of the previous
Column or index level names to join on. These must be found in both
DataFrames. If `on` is None and not merging on indexes then this defaults
to the intersection of the columns in both DataFrames.
- left_on : label or list, or array-like
+ left_on : Hashable or a sequence of the previous, or array-like
Column or index level names to join on in the left DataFrame. Can also
be an array or list of arrays of the length of the left DataFrame.
These arrays are treated as if they are columns.
- right_on : label or list, or array-like
+ right_on : Hashable or a sequence of the previous, or array-like
Column or index level names to join on in the right DataFrame. Can also
be an array or list of arrays of the length of the right DataFrame.
These arrays are treated as if they are columns.
@@ -536,13 +536,13 @@ def merge_ordered(
First pandas object to merge.
right : DataFrame or named Series
Second pandas object to merge.
- on : label or list
+ on : Hashable or a sequence of the previous
Field names to join on. Must be found in both DataFrames.
- left_on : label or list, or array-like
+ left_on : Hashable or a sequence of the previous, or array-like
Field names to join on in left DataFrame. Can be a vector or list of
vectors of the length of the DataFrame to use a particular vector as
the join key instead of columns.
- right_on : label or list, or array-like
+ right_on : Hashable or a sequence of the previous, or array-like
Field names to join on in right DataFrame or vector/list of vectors per
left_on docs.
left_by : column name or list of column names
@@ -3062,13 +3062,16 @@ def renamer(x, suffix: str | None):
if not llabels.is_unique:
# Only warn when duplicates are caused because of suffixes, already duplicated
# columns in origin should not warn
- dups = llabels[(llabels.duplicated()) & (~left.duplicated())].tolist()
+ dups.extend(llabels[(llabels.duplicated()) & (~left.duplicated())].tolist())
if not rlabels.is_unique:
dups.extend(rlabels[(rlabels.duplicated()) & (~right.duplicated())].tolist())
+ # Suffix addition creates a duplicate of a pre-existing column name
+ dups.extend(llabels.intersection(right.difference(to_rename)).tolist())
+ dups.extend(rlabels.intersection(left.difference(to_rename)).tolist())
if dups:
raise MergeError(
f"Passing 'suffixes' which cause duplicate columns {set(dups)} is "
- f"not allowed.",
+ "not allowed.",
)
return llabels, rlabels
diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py
index 4e77f0a6bf5bf..ac89f19b80a0f 100644
--- a/pandas/core/reshape/pivot.py
+++ b/pandas/core/reshape/pivot.py
@@ -76,12 +76,12 @@ def pivot_table(
Input pandas DataFrame object.
values : list-like or scalar, optional
Column or columns to aggregate.
- index : column, Grouper, array, or list of the previous
+ index : column, Grouper, array, or sequence of the previous
Keys to group by on the pivot table index. If a list is passed,
it can contain any of the other types (except list). If an array is
passed, it must be the same length as the data and will be used in
the same manner as column values.
- columns : column, Grouper, array, or list of the previous
+ columns : column, Grouper, array, or sequence of the previous
Keys to group by on the pivot table column. If a list is passed,
it can contain any of the other types (except list). If an array is
passed, it must be the same length as the data and will be used in
@@ -708,11 +708,11 @@ def pivot(
----------
data : DataFrame
Input pandas DataFrame object.
- columns : str or object or a list of str
+ columns : Hashable or a sequence of the previous
Column to use to make new frame's columns.
- index : str or object or a list of str, optional
+ index : Hashable or a sequence of the previous, optional
Column to use to make new frame's index. If not given, uses existing index.
- values : str, object or a list of the previous, optional
+ values : Hashable or a sequence of the previous, optional
Column(s) to use for populating new frame's values. If not
specified, all remaining columns will be used and the result will
have hierarchically indexed columns.
diff --git a/pandas/core/series.py b/pandas/core/series.py
index d6a982c65e9fd..7a26be875e7b5 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -2514,6 +2514,8 @@ def round(self, decimals: int = 0, *args, **kwargs) -> Series:
dtype: float64
"""
nv.validate_round(args, kwargs)
+ if self.dtype == "object":
+ raise TypeError("Expected numeric dtype, got object instead.")
new_mgr = self._mgr.round(decimals=decimals)
return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes).__finalize__(
self, method="round"
@@ -2951,8 +2953,9 @@ def dot(self, other: AnyArrayLike | DataFrame) -> Series | np.ndarray:
)
if isinstance(other, ABCDataFrame):
+ common_type = find_common_type([self.dtypes] + list(other.dtypes))
return self._constructor(
- np.dot(lvals, rvals), index=other.columns, copy=False
+ np.dot(lvals, rvals), index=other.columns, copy=False, dtype=common_type
).__finalize__(self, method="dot")
elif isinstance(other, Series):
return np.dot(lvals, rvals)
@@ -4325,6 +4328,7 @@ def map(
self,
func: Callable | Mapping | Series | None = None,
na_action: Literal["ignore"] | None = None,
+ engine: Callable | None = None,
**kwargs,
) -> Series:
"""
@@ -4341,6 +4345,25 @@ def map(
na_action : {None, 'ignore'}, default None
If 'ignore', propagate NaN values, without passing them to the
mapping correspondence.
+ engine : decorator, optional
+ Choose the execution engine to use to run the function. Only used for
+ functions. If ``map`` is called with a mapping or ``Series``, an
+ exception will be raised. If ``engine`` is not provided the function will
+ be executed by the regular Python interpreter.
+
+ Options include JIT compilers such as Numba, Bodo or Blosc2, which in some
+ cases can speed up the execution. To use an executor you can provide the
+ decorators ``numba.jit``, ``numba.njit``, ``bodo.jit`` or ``blosc2.jit``.
+ You can also provide the decorator with parameters, like
+ ``numba.jit(nogil=True)``.
+
+ Not all functions can be executed with all execution engines. In general,
+ JIT compilers will require type stability in the function (no variable
+ should change data type during the execution). And not all pandas and
+ NumPy APIs are supported. Check the engine documentation for limitations.
+
+ .. versionadded:: 3.0.0
+
**kwargs
Additional keyword arguments to pass as keywords arguments to
`arg`.
@@ -4420,6 +4443,25 @@ def map(
else:
raise ValueError("The `func` parameter is required")
+ if engine is not None:
+ if not callable(func):
+ raise ValueError(
+ "The engine argument can only be specified when func is a function"
+ )
+ if not hasattr(engine, "__pandas_udf__"):
+ raise ValueError(f"Not a valid engine: {engine!r}")
+ result = engine.__pandas_udf__.map( # type: ignore[attr-defined]
+ data=self,
+ func=func,
+ args=(),
+ kwargs=kwargs,
+ decorator=engine,
+ skip_na=na_action == "ignore",
+ )
+ if not isinstance(result, Series):
+ result = Series(result, index=self.index, name=self.name)
+ return result.__finalize__(self, method="map")
+
if callable(func):
func = functools.partial(func, **kwargs)
new_values = self._map_values(func, na_action=na_action)
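Reviewer note: among the `Series` changes above, `round` now rejects object dtype up front rather than failing later inside the block manager. A hedged example of the new error:

    import pandas as pd

    ser = pd.Series([1.234, "x"], dtype=object)
    try:
        ser.round(2)
    except TypeError as err:
        print(err)  # Expected numeric dtype, got object instead.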
diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py
index 81fa508ae6d23..bf30c215596f2 100644
--- a/pandas/core/shared_docs.py
+++ b/pandas/core/shared_docs.py
@@ -380,7 +380,7 @@
replaced with `value`
- str: string exactly matching `to_replace` will be replaced
with `value`
- - regex: regexs matching `to_replace` will be replaced with
+ - regex: regexes matching `to_replace` will be replaced with
`value`
* list of str, regex, or numeric:
@@ -388,7 +388,7 @@
- First, if `to_replace` and `value` are both lists, they
**must** be the same length.
- Second, if ``regex=True`` then all of the strings in **both**
- lists will be interpreted as regexs otherwise they will match
+ lists will be interpreted as regexes otherwise they will match
directly. This doesn't matter much for `value` since there
are only a few possible substitution regexes you can use.
- str, regex and numeric rules apply as above.
diff --git a/pandas/io/api.py b/pandas/io/api.py
index d4982399a604b..5900c94384384 100644
--- a/pandas/io/api.py
+++ b/pandas/io/api.py
@@ -10,6 +10,7 @@
)
from pandas.io.feather_format import read_feather
from pandas.io.html import read_html
+from pandas.io.iceberg import read_iceberg
from pandas.io.json import read_json
from pandas.io.orc import read_orc
from pandas.io.parquet import read_parquet
@@ -47,6 +48,7 @@
"read_fwf",
"read_hdf",
"read_html",
+ "read_iceberg",
"read_json",
"read_orc",
"read_parquet",
diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py
index 75bcb51ef4be2..1b9eb6303fe74 100644
--- a/pandas/io/formats/csvs.py
+++ b/pandas/io/formats/csvs.py
@@ -90,9 +90,9 @@ def __init__(
self.index_label = self._initialize_index_label(index_label)
self.errors = errors
self.quoting = quoting or csvlib.QUOTE_MINIMAL
- self.quotechar = self._initialize_quotechar(quotechar)
self.doublequote = doublequote
self.escapechar = escapechar
+ self.quotechar = self._initialize_quotechar(quotechar)
self.lineterminator = lineterminator or os.linesep
self.date_format = date_format
self.cols = self._initialize_columns(cols)
@@ -141,7 +141,7 @@ def _get_index_label_flat(self) -> Sequence[Hashable]:
return [""] if index_label is None else [index_label]
def _initialize_quotechar(self, quotechar: str | None) -> str | None:
- if self.quoting != csvlib.QUOTE_NONE:
+ if self.quoting != csvlib.QUOTE_NONE or self.escapechar is not None:
# prevents crash in _csv
return quotechar
return None
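Reviewer note: initialising `quotechar` after `escapechar` means the quote character is kept whenever an escapechar is supplied, even under `QUOTE_NONE`, so the csv writer can escape embedded quotes instead of emitting them raw. A hedged usage example:

    import csv
    import io

    import pandas as pd

    df = pd.DataFrame({"a": ['he said "hi"']})
    buf = io.StringIO()
    df.to_csv(buf, quoting=csv.QUOTE_NONE, escapechar="\\", index=False)
    # Expected: the embedded quotes are written escaped, e.g. he said \"hi\"
    print(buf.getvalue())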
diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py
index f1be0b41ad7f7..097e508d4889a 100644
--- a/pandas/io/formats/format.py
+++ b/pandas/io/formats/format.py
@@ -67,7 +67,6 @@
ExtensionArray,
TimedeltaArray,
)
-from pandas.core.arrays.string_ import StringDtype
from pandas.core.base import PandasObject
import pandas.core.common as com
from pandas.core.indexes.api import (
@@ -115,7 +114,7 @@
columns : array-like, optional, default None
The subset of columns to write. Writes all columns by default.
col_space : %(col_space_type)s, optional
- %(col_space)s.
+ %(col_space)s
header : %(header_type)s, optional
%(header)s.
index : bool, optional, default True
@@ -1218,8 +1217,6 @@ def _format(x):
return self.na_rep
elif isinstance(x, PandasObject):
return str(x)
- elif isinstance(x, StringDtype):
- return repr(x)
else:
# object dtype
return str(formatter(x))
diff --git a/pandas/io/iceberg.py b/pandas/io/iceberg.py
new file mode 100644
index 0000000000000..dcb675271031e
--- /dev/null
+++ b/pandas/io/iceberg.py
@@ -0,0 +1,151 @@
+from typing import (
+ Any,
+)
+
+from pandas.compat._optional import import_optional_dependency
+
+from pandas import DataFrame
+
+
+def read_iceberg(
+ table_identifier: str,
+ catalog_name: str | None = None,
+ *,
+ catalog_properties: dict[str, Any] | None = None,
+ row_filter: str | None = None,
+ selected_fields: tuple[str] | None = None,
+ case_sensitive: bool = True,
+ snapshot_id: int | None = None,
+ limit: int | None = None,
+ scan_properties: dict[str, Any] | None = None,
+) -> DataFrame:
+ """
+ Read an Apache Iceberg table into a pandas DataFrame.
+
+ .. versionadded:: 3.0.0
+
+ .. warning::
+
+ read_iceberg is experimental and may change without warning.
+
+ Parameters
+ ----------
+ table_identifier : str
+ Table identifier.
+ catalog_name : str, optional
+ The name of the catalog.
+ catalog_properties : dict of {str: str}, optional
+ The properties that are used in addition to the catalog configuration.
+ row_filter : str, optional
+ A string that describes the desired rows.
+ selected_fields : tuple of str, optional
+ A tuple of strings representing the column names to return in the output
+ dataframe.
+ case_sensitive : bool, default True
+ If True, column matching is case sensitive.
+ snapshot_id : int, optional
+ Snapshot ID to time travel to. By default the table will be scanned as of the
+ current snapshot ID.
+ limit : int, optional
+ An integer representing the number of rows to return in the scan result.
+ By default all matching rows will be fetched.
+ scan_properties : dict of {str: obj}, optional
+ Additional Table properties as a dictionary of string key value pairs to use
+ for this scan.
+
+ Returns
+ -------
+ DataFrame
+ DataFrame based on the Iceberg table.
+
+ See Also
+ --------
+ read_parquet : Read a Parquet file.
+
+ Examples
+ --------
+ >>> df = pd.read_iceberg(
+ ... table_identifier="my_table",
+ ... catalog_name="my_catalog",
+ ... catalog_properties={"s3.secret-access-key": "my-secret"},
+ ... row_filter="trip_distance >= 10.0",
+ ... selected_fields=("VendorID", "tpep_pickup_datetime"),
+ ... ) # doctest: +SKIP
+ """
+ pyiceberg_catalog = import_optional_dependency("pyiceberg.catalog")
+ pyiceberg_expressions = import_optional_dependency("pyiceberg.expressions")
+ if catalog_properties is None:
+ catalog_properties = {}
+ catalog = pyiceberg_catalog.load_catalog(catalog_name, **catalog_properties)
+ table = catalog.load_table(table_identifier)
+ if row_filter is None:
+ row_filter = pyiceberg_expressions.AlwaysTrue()
+ if selected_fields is None:
+ selected_fields = ("*",)
+ if scan_properties is None:
+ scan_properties = {}
+ result = table.scan(
+ row_filter=row_filter,
+ selected_fields=selected_fields,
+ case_sensitive=case_sensitive,
+ snapshot_id=snapshot_id,
+ options=scan_properties,
+ limit=limit,
+ )
+ return result.to_pandas()
+
+
+def to_iceberg(
+ df: DataFrame,
+ table_identifier: str,
+ catalog_name: str | None = None,
+ *,
+ catalog_properties: dict[str, Any] | None = None,
+ location: str | None = None,
+ append: bool = False,
+ snapshot_properties: dict[str, str] | None = None,
+) -> None:
+ """
+ Write a DataFrame to an Apache Iceberg table.
+
+ .. versionadded:: 3.0.0
+
+ Parameters
+ ----------
+ table_identifier : str
+ Table identifier.
+ catalog_name : str, optional
+ The name of the catalog.
+ catalog_properties : dict of {str: str}, optional
+ The properties that are used in addition to the catalog configuration.
+ location : str, optional
+ Location for the table.
+ append : bool, default False
+ If ``True``, append data to the table, instead of replacing the content.
+ snapshot_properties : dict of {str: str}, optional
+ Custom properties to be added to the snapshot summary.
+
+ See Also
+ --------
+ read_iceberg : Read an Apache Iceberg table.
+ DataFrame.to_parquet : Write a DataFrame in Parquet format.
+ """
+ pa = import_optional_dependency("pyarrow")
+ pyiceberg_catalog = import_optional_dependency("pyiceberg.catalog")
+ if catalog_properties is None:
+ catalog_properties = {}
+ catalog = pyiceberg_catalog.load_catalog(catalog_name, **catalog_properties)
+ arrow_table = pa.Table.from_pandas(df)
+ table = catalog.create_table_if_not_exists(
+ identifier=table_identifier,
+ schema=arrow_table.schema,
+ location=location,
+ # we could add `partition_spec`, `sort_order` and `properties` in the
+ # future, but it may not be trivial without exposing PyIceberg objects
+ )
+ if snapshot_properties is None:
+ snapshot_properties = {}
+ if append:
+ table.append(arrow_table, snapshot_properties=snapshot_properties)
+ else:
+ table.overwrite(arrow_table, snapshot_properties=snapshot_properties)
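A hedged usage sketch of the two new functions; the SQLite-backed catalog mirrors the tests added below, and the URI, warehouse, and table identifier are placeholders (the "ns" namespace is assumed to already exist in the catalog):

import pandas as pd
from pandas.io.iceberg import read_iceberg, to_iceberg

df = pd.DataFrame({"A": [1, 2, 3], "B": ["foo", "foo", "foo"]})
catalog_properties = {"uri": "sqlite:////tmp/catalog.sqlite", "warehouse": "file:///tmp"}

# Create or replace the table, then read a filtered subset back.
to_iceberg(df, "ns.my_table", catalog_properties=catalog_properties)
result = read_iceberg("ns.my_table", catalog_properties=catalog_properties, row_filter="A > 1")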
diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py
index 0e06cb10d2029..1c7e1ab57b2a9 100644
--- a/pandas/plotting/_matplotlib/core.py
+++ b/pandas/plotting/_matplotlib/core.py
@@ -802,7 +802,13 @@ def _adorn_subplots(self, fig: Figure) -> None:
if self.title:
if self.subplots:
if is_list_like(self.title):
- if len(self.title) != self.nseries:
+ if not isinstance(self.subplots, bool):
+ if len(self.subplots) != len(self.title):
+ raise ValueError(
+ f"The number of titles ({len(self.title)}) must equal "
+ f"the number of subplots ({len(self.subplots)})."
+ )
+ elif len(self.title) != self.nseries:
raise ValueError(
"The length of `title` must equal the number "
"of columns if using `title` of type `list` "
@@ -1934,13 +1940,14 @@ def _make_plot(self, fig: Figure) -> None:
self.subplots: list[Any]
- if bool(self.subplots) and self.stacked:
- for i, sub_plot in enumerate(self.subplots):
- if len(sub_plot) <= 1:
- continue
- for plot in sub_plot:
- _stacked_subplots_ind[int(plot)] = i
- _stacked_subplots_offsets.append([0, 0])
+ if not isinstance(self.subplots, bool):
+ if bool(self.subplots) and self.stacked:
+ for i, sub_plot in enumerate(self.subplots):
+ if len(sub_plot) <= 1:
+ continue
+ for plot in sub_plot:
+ _stacked_subplots_ind[int(plot)] = i
+ _stacked_subplots_offsets.append([0, 0])
for i, (label, y) in enumerate(self._iter_data(data=data)):
ax = self._get_ax(i)
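A sketch of the validation added above, matching the new tests in test_misc.py below (requires matplotlib; the data are illustrative):

import pandas as pd

df = pd.DataFrame([(30, 10, 10, 10), (20, 20, 20, 20)], columns=list("ABCD"))
# Grouping "A", "B" and "D" on one Axes yields two subplots, so two titles are valid;
# a mismatched count now raises "The number of titles (...) must equal the number of subplots (...)".
df.plot(subplots=[("A", "B", "D")], kind="bar", title=["A&B&D", "C"])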
diff --git a/pandas/plotting/_matplotlib/style.py b/pandas/plotting/_matplotlib/style.py
index 962f9711d9916..7cf63c8621392 100644
--- a/pandas/plotting/_matplotlib/style.py
+++ b/pandas/plotting/_matplotlib/style.py
@@ -22,8 +22,6 @@
from pandas.core.dtypes.common import is_list_like
-import pandas.core.common as com
-
if TYPE_CHECKING:
from matplotlib.colors import Colormap
@@ -251,31 +249,17 @@ def _is_floats_color(color: Color | Collection[Color]) -> bool:
def _get_colors_from_color_type(color_type: str, num_colors: int) -> list[Color]:
"""Get colors from user input color type."""
if color_type == "default":
- return _get_default_colors(num_colors)
+ prop_cycle = mpl.rcParams["axes.prop_cycle"]
+ return [
+ c["color"]
+ for c in itertools.islice(prop_cycle, min(num_colors, len(prop_cycle)))
+ ]
elif color_type == "random":
- return _get_random_colors(num_colors)
+ return np.random.default_rng(num_colors).random((num_colors, 3)).tolist()
else:
raise ValueError("color_type must be either 'default' or 'random'")
-def _get_default_colors(num_colors: int) -> list[Color]:
- """Get `num_colors` of default colors from matplotlib rc params."""
- colors = [c["color"] for c in mpl.rcParams["axes.prop_cycle"]]
- return colors[0:num_colors]
-
-
-def _get_random_colors(num_colors: int) -> list[Color]:
- """Get `num_colors` of random colors."""
- return [_random_color(num) for num in range(num_colors)]
-
-
-def _random_color(column: int) -> list[float]:
- """Get a random color represented as a list of length 3"""
- # GH17525 use common._random_state to avoid resetting the seed
- rs = com.random_state(column)
- return rs.rand(3).tolist()
-
-
def _is_single_string_color(color: Color) -> bool:
"""Check if `color` is a single string color.
diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py
index 75ff40f1eca90..0f2d824f37ffc 100644
--- a/pandas/plotting/_misc.py
+++ b/pandas/plotting/_misc.py
@@ -412,7 +412,7 @@ def andrews_curves(
>>> df = pd.read_csv(
... "https://raw.githubusercontent.com/pandas-dev/"
... "pandas/main/pandas/tests/io/data/csv/iris.csv"
- ... )
+ ... ) # doctest: +SKIP
>>> pd.plotting.andrews_curves(df, "Name") # doctest: +SKIP
"""
plot_backend = _get_plot_backend("matplotlib")
@@ -551,7 +551,7 @@ def parallel_coordinates(
>>> df = pd.read_csv(
... "https://raw.githubusercontent.com/pandas-dev/"
... "pandas/main/pandas/tests/io/data/csv/iris.csv"
- ... )
+ ... ) # doctest: +SKIP
>>> pd.plotting.parallel_coordinates(
... df, "Name", color=("#556270", "#4ECDC4", "#C7F464")
... ) # doctest: +SKIP
@@ -633,6 +633,15 @@ def autocorrelation_plot(series: Series, ax: Axes | None = None, **kwargs) -> Ax
"""
Autocorrelation plot for time series.
+ This method generates an autocorrelation plot for a given time series,
+ which helps to identify any periodic structure or correlation within the
+ data across various lags. It shows the correlation of a time series with a
+ delayed copy of itself as a function of delay. Autocorrelation plots are useful for
+ checking randomness in a data set. If the data are random, the autocorrelations
+ should be near zero for any and all time-lag separations. If the data are not
+ random, then one or more of the autocorrelations will be significantly
+ non-zero.
+
Parameters
----------
series : Series
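A short usage sketch of the plot described by the expanded docstring (the series is illustrative):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
# A noisy sine wave has visible structure, so its autocorrelations are far from zero.
ser = pd.Series(np.sin(np.linspace(0, 20, 200)) + rng.normal(scale=0.1, size=200))
ax = pd.plotting.autocorrelation_plot(ser)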
diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py
index 2ba90948be399..871e977cbe2f8 100644
--- a/pandas/tests/api/test_api.py
+++ b/pandas/tests/api/test_api.py
@@ -168,6 +168,7 @@ class TestPDApi(Base):
"read_parquet",
"read_orc",
"read_spss",
+ "read_iceberg",
]
# top-level json funcs
diff --git a/pandas/tests/apply/conftest.py b/pandas/tests/apply/conftest.py
new file mode 100644
index 0000000000000..aecf82f5a9419
--- /dev/null
+++ b/pandas/tests/apply/conftest.py
@@ -0,0 +1,63 @@
+import numpy as np
+import pytest
+
+from pandas import (
+ DataFrame,
+ Series,
+)
+from pandas.api.executors import BaseExecutionEngine
+
+
+class MockExecutionEngine(BaseExecutionEngine):
+ """
+ Execution Engine to test if the execution engine interface receives and
+ uses all parameters provided by the user.
+
+ This engine simply delegates to the default Python engine by calling it; no extra
+ functionality is implemented here.
+
+ When testing, this will be called when this engine is provided, and then the
+ same pandas.map and pandas.apply function will be called, but without engine,
+ executing the default behavior from the python engine.
+ """
+
+ def map(data, func, args, kwargs, decorator, skip_na):
+ kwargs_to_pass = kwargs if isinstance(data, DataFrame) else {}
+ return data.map(func, na_action="ignore" if skip_na else None, **kwargs_to_pass)
+
+ def apply(data, func, args, kwargs, decorator, axis):
+ if isinstance(data, Series):
+ return data.apply(func, convert_dtype=True, args=args, by_row=False)
+ elif isinstance(data, DataFrame):
+ return data.apply(
+ func,
+ axis=axis,
+ raw=False,
+ result_type=None,
+ args=args,
+ by_row="compat",
+ **kwargs,
+ )
+ else:
+ assert isinstance(data, np.ndarray)
+
+ def wrap_function(func):
+ # https://github.com/numpy/numpy/issues/8352
+ def wrapper(*args, **kwargs):
+ result = func(*args, **kwargs)
+ if isinstance(result, str):
+ result = np.array(result, dtype=object)
+ return result
+
+ return wrapper
+
+ return np.apply_along_axis(wrap_function(func), axis, data, *args, **kwargs)
+
+
+class MockEngineDecorator:
+ __pandas_udf__ = MockExecutionEngine
+
+
+@pytest.fixture(params=[None, MockEngineDecorator])
+def engine(request):
+ return request.param
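For illustration, the decorator-style engine above is what the parametrised fixture feeds into Series.map and DataFrame.apply through the engine argument (a sketch, assuming the conftest module is importable outside a pytest run):

import pandas as pd
from pandas.tests.apply.conftest import MockEngineDecorator

ser = pd.Series([1.5, float("nan"), 3.0])
# engine=None uses the default Python engine; the mock decorator routes the same
# call through BaseExecutionEngine.map with identical semantics.
result = ser.map(lambda x: x * 2, na_action="ignore", engine=MockEngineDecorator)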
diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py
index dde1158dc7951..a9afb5dbd11d7 100644
--- a/pandas/tests/apply/test_frame_apply.py
+++ b/pandas/tests/apply/test_frame_apply.py
@@ -17,63 +17,11 @@
date_range,
)
import pandas._testing as tm
-from pandas.api.executors import BaseExecutionEngine
+from pandas.tests.apply.conftest import MockEngineDecorator
from pandas.tests.frame.common import zip_frames
from pandas.util.version import Version
-class MockExecutionEngine(BaseExecutionEngine):
- """
- Execution Engine to test if the execution engine interface receives and
- uses all parameters provided by the user.
-
- Making this engine work as the default Python engine by calling it, no extra
- functionality is implemented here.
-
- When testing, this will be called when this engine is provided, and then the
- same pandas.map and pandas.apply function will be called, but without engine,
- executing the default behavior from the python engine.
- """
-
- def map(data, func, args, kwargs, decorator, skip_na):
- kwargs_to_pass = kwargs if isinstance(data, DataFrame) else {}
- return data.map(
- func, action_na="ignore" if skip_na else False, **kwargs_to_pass
- )
-
- def apply(data, func, args, kwargs, decorator, axis):
- if isinstance(data, Series):
- return data.apply(func, convert_dtype=True, args=args, by_row=False)
- elif isinstance(data, DataFrame):
- return data.apply(
- func,
- axis=axis,
- raw=False,
- result_type=None,
- args=args,
- by_row="compat",
- **kwargs,
- )
- else:
- assert isinstance(data, np.ndarray)
-
- def wrap_function(func):
- # https://github.com/numpy/numpy/issues/8352
- def wrapper(*args, **kwargs):
- result = func(*args, **kwargs)
- if isinstance(result, str):
- result = np.array(result, dtype=object)
- return result
-
- return wrapper
-
- return np.apply_along_axis(wrap_function(func), axis, data, *args, **kwargs)
-
-
-class MockEngineDecorator:
- __pandas_udf__ = MockExecutionEngine
-
-
@pytest.fixture
def int_frame_const_col():
"""
diff --git a/pandas/tests/apply/test_series_apply.py b/pandas/tests/apply/test_series_apply.py
index 9541b0b7495c7..896c5c5fca9f7 100644
--- a/pandas/tests/apply/test_series_apply.py
+++ b/pandas/tests/apply/test_series_apply.py
@@ -376,13 +376,13 @@ def test_demo():
@pytest.mark.parametrize("func", [str, lambda x: str(x)])
-def test_apply_map_evaluate_lambdas_the_same(string_series, func, by_row):
+def test_apply_map_evaluate_lambdas_the_same(string_series, func, by_row, engine):
# test that we are evaluating row-by-row first if by_row="compat"
# else vectorized evaluation
result = string_series.apply(func, by_row=by_row)
if by_row:
- expected = string_series.map(func)
+ expected = string_series.map(func, engine=engine)
tm.assert_series_equal(result, expected)
else:
assert result == str(string_series)
diff --git a/pandas/tests/arrays/integer/test_arithmetic.py b/pandas/tests/arrays/integer/test_arithmetic.py
index dee3deeee0f2f..9fbea2022c87b 100644
--- a/pandas/tests/arrays/integer/test_arithmetic.py
+++ b/pandas/tests/arrays/integer/test_arithmetic.py
@@ -178,25 +178,9 @@ def test_error_invalid_values(data, all_arithmetic_operators):
ops = getattr(s, op)
# invalid scalars
- msg = "|".join(
- [
- r"can only perform ops with numeric values",
- r"IntegerArray cannot perform the operation mod",
- r"unsupported operand type",
- r"can only concatenate str \(not \"int\"\) to str",
- "not all arguments converted during string",
- "ufunc '.*' not supported for the input types, and the inputs could not",
- "ufunc '.*' did not contain a loop with signature matching types",
- "Addition/subtraction of integers and integer-arrays with Timestamp",
- "has no kernel",
- "not implemented",
- "The 'out' kwarg is necessary. Use numpy.strings.multiply without it.",
- "not supported for dtype",
- ]
- )
- with pytest.raises(TypeError, match=msg):
+ with tm.external_error_raised(TypeError):
ops("foo")
- with pytest.raises(TypeError, match=msg):
+ with tm.external_error_raised(TypeError):
ops(pd.Timestamp("20180101"))
# invalid array-likes
@@ -214,25 +198,10 @@ def test_error_invalid_values(data, all_arithmetic_operators):
# more-correct than np.nan here.
tm.assert_series_equal(res, expected)
else:
- with pytest.raises(TypeError, match=msg):
+ with tm.external_error_raised(TypeError):
ops(str_ser)
- msg = "|".join(
- [
- "can only perform ops with numeric values",
- "cannot perform .* with this index type: DatetimeArray",
- "Addition/subtraction of integers and integer-arrays "
- "with DatetimeArray is no longer supported. *",
- "unsupported operand type",
- r"can only concatenate str \(not \"int\"\) to str",
- "not all arguments converted during string",
- "cannot subtract DatetimeArray from ndarray",
- "has no kernel",
- "not implemented",
- "not supported for dtype",
- ]
- )
- with pytest.raises(TypeError, match=msg):
+ with tm.external_error_raised(TypeError):
ops(pd.Series(pd.date_range("20180101", periods=len(s))))
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index 336a0fef69170..736c0e1782fc0 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -10,10 +10,12 @@
from pandas._config import using_string_dtype
+from pandas.compat import HAS_PYARROW
from pandas.compat.pyarrow import (
pa_version_under12p0,
pa_version_under19p0,
)
+import pandas.util._test_decorators as td
from pandas.core.dtypes.common import is_dtype_equal
@@ -45,6 +47,25 @@ def cls(dtype):
return dtype.construct_array_type()
+def string_dtype_highest_priority(dtype1, dtype2):
+ if HAS_PYARROW:
+ DTYPE_HIERARCHY = [
+ pd.StringDtype("python", na_value=np.nan),
+ pd.StringDtype("pyarrow", na_value=np.nan),
+ pd.StringDtype("python", na_value=pd.NA),
+ pd.StringDtype("pyarrow", na_value=pd.NA),
+ ]
+ else:
+ DTYPE_HIERARCHY = [
+ pd.StringDtype("python", na_value=np.nan),
+ pd.StringDtype("python", na_value=pd.NA),
+ ]
+
+ h1 = DTYPE_HIERARCHY.index(dtype1)
+ h2 = DTYPE_HIERARCHY.index(dtype2)
+ return DTYPE_HIERARCHY[max(h1, h2)]
+
+
def test_dtype_constructor():
pytest.importorskip("pyarrow")
@@ -103,6 +124,18 @@ def test_repr(dtype):
assert repr(df.A.array) == expected
+def test_dtype_repr(dtype):
+ if dtype.storage == "pyarrow":
+ if dtype.na_value is pd.NA:
+ assert repr(dtype) == "<StringDtype(storage='pyarrow', na_value=<NA>)>"
+ else:
+ assert repr(dtype) == "<StringDtype(storage='pyarrow', na_value=nan)>"
+ elif dtype.na_value is pd.NA:
+ assert repr(dtype) == "<StringDtype(na_value=<NA>)>"
+ else:
+ assert repr(dtype) == "<StringDtype(na_value=nan)>"
+
+
def test_none_to_nan(cls, dtype):
a = cls._from_sequence(["a", None, "b"], dtype=dtype)
assert a[1] is not None
@@ -319,13 +352,18 @@ def test_comparison_methods_scalar_not_string(comparison_op, dtype):
tm.assert_extension_array_equal(result, expected)
-def test_comparison_methods_array(comparison_op, dtype):
+def test_comparison_methods_array(comparison_op, dtype, dtype2):
op_name = f"__{comparison_op.__name__}__"
a = pd.array(["a", None, "c"], dtype=dtype)
- other = [None, None, "c"]
- result = getattr(a, op_name)(other)
- if dtype.na_value is np.nan:
+ other = pd.array([None, None, "c"], dtype=dtype2)
+ result = comparison_op(a, other)
+
+ # ensure operation is commutative
+ result2 = comparison_op(other, a)
+ tm.assert_equal(result, result2)
+
+ if dtype.na_value is np.nan and dtype2.na_value is np.nan:
if operator.ne == comparison_op:
expected = np.array([True, True, False])
else:
@@ -333,11 +371,56 @@ def test_comparison_methods_array(comparison_op, dtype):
expected[-1] = getattr(other[-1], op_name)(a[-1])
tm.assert_numpy_array_equal(result, expected)
- result = getattr(a, op_name)(pd.NA)
+ else:
+ max_dtype = string_dtype_highest_priority(dtype, dtype2)
+ if max_dtype.storage == "python":
+ expected_dtype = "boolean"
+ else:
+ expected_dtype = "bool[pyarrow]"
+
+ expected = np.full(len(a), fill_value=None, dtype="object")
+ expected[-1] = getattr(other[-1], op_name)(a[-1])
+ expected = pd.array(expected, dtype=expected_dtype)
+ tm.assert_extension_array_equal(result, expected)
+
+
+@td.skip_if_no("pyarrow")
+def test_comparison_methods_array_arrow_extension(comparison_op, dtype2):
+ # Test pd.ArrowDtype(pa.string()) against other string arrays
+ import pyarrow as pa
+
+ op_name = f"__{comparison_op.__name__}__"
+ dtype = pd.ArrowDtype(pa.string())
+ a = pd.array(["a", None, "c"], dtype=dtype)
+ other = pd.array([None, None, "c"], dtype=dtype2)
+ result = comparison_op(a, other)
+
+ # ensure operation is commutative
+ result2 = comparison_op(other, a)
+ tm.assert_equal(result, result2)
+
+ expected = pd.array([None, None, True], dtype="bool[pyarrow]")
+ expected[-1] = getattr(other[-1], op_name)(a[-1])
+ tm.assert_extension_array_equal(result, expected)
+
+
+def test_comparison_methods_list(comparison_op, dtype):
+ op_name = f"__{comparison_op.__name__}__"
+
+ a = pd.array(["a", None, "c"], dtype=dtype)
+ other = [None, None, "c"]
+ result = comparison_op(a, other)
+
+ # ensure operation is commutative
+ result2 = comparison_op(other, a)
+ tm.assert_equal(result, result2)
+
+ if dtype.na_value is np.nan:
if operator.ne == comparison_op:
- expected = np.array([True, True, True])
+ expected = np.array([True, True, False])
else:
expected = np.array([False, False, False])
+ expected[-1] = getattr(other[-1], op_name)(a[-1])
tm.assert_numpy_array_equal(result, expected)
else:
@@ -347,10 +430,6 @@ def test_comparison_methods_array(comparison_op, dtype):
expected = pd.array(expected, dtype=expected_dtype)
tm.assert_extension_array_equal(result, expected)
- result = getattr(a, op_name)(pd.NA)
- expected = pd.array([None, None, None], dtype=expected_dtype)
- tm.assert_extension_array_equal(result, expected)
-
def test_constructor_raises(cls):
if cls is pd.arrays.StringArray:
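A small sketch of the dtype-priority behaviour the helper above encodes, in line with the updated comparison tests (the dtype combination is chosen for illustration):

import numpy as np
import pandas as pd

left = pd.array(["a", None, "c"], dtype=pd.StringDtype("python", na_value=np.nan))
right = pd.array([None, None, "c"], dtype=pd.StringDtype("python", na_value=pd.NA))
# The NA-based dtype ranks higher, so the comparison result uses the masked
# "boolean" dtype, and the operation is commutative.
result = left == right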
diff --git a/pandas/tests/config/test_config.py b/pandas/tests/config/test_config.py
index aaf6178866ecd..a6bc40469cada 100644
--- a/pandas/tests/config/test_config.py
+++ b/pandas/tests/config/test_config.py
@@ -195,6 +195,24 @@ def test_set_option_multiple(self):
assert cf.get_option("b.c") is None
assert cf.get_option("b.b") == 10.0
+ def test_set_option_dict(self):
+ # GH 61093
+
+ cf.register_option("a", 1, "doc")
+ cf.register_option("b.c", "hullo", "doc2")
+ cf.register_option("b.b", None, "doc2")
+
+ assert cf.get_option("a") == 1
+ assert cf.get_option("b.c") == "hullo"
+ assert cf.get_option("b.b") is None
+
+ options_dict = {"a": "2", "b.c": None, "b.b": 10.0}
+ cf.set_option(options_dict)
+
+ assert cf.get_option("a") == "2"
+ assert cf.get_option("b.c") is None
+ assert cf.get_option("b.b") == 10.0
+
def test_validation(self):
cf.register_option("a", 1, "doc", validator=cf.is_int)
cf.register_option("d", 1, "doc", validator=cf.is_nonnegative_int)
@@ -377,6 +395,33 @@ def f():
f()
+ def test_set_ContextManager_dict(self):
+ def eq(val):
+ assert cf.get_option("a") == val
+ assert cf.get_option("b.c") == val
+
+ cf.register_option("a", 0)
+ cf.register_option("b.c", 0)
+
+ eq(0)
+ with cf.option_context({"a": 15, "b.c": 15}):
+ eq(15)
+ with cf.option_context({"a": 25, "b.c": 25}):
+ eq(25)
+ eq(15)
+ eq(0)
+
+ cf.set_option("a", 17)
+ cf.set_option("b.c", 17)
+ eq(17)
+
+ # Test that option_context can be used as a decorator too
+ @cf.option_context({"a": 123, "b.c": 123})
+ def f():
+ eq(123)
+
+ f()
+
def test_attribute_access(self):
holder = []
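The dict form accepted by set_option/option_context (GH 61093) can be sketched with public display options, assuming the same signature is exposed at the pandas top level:

import pandas as pd

# One call sets several options at once.
pd.set_option({"display.max_rows": 10, "display.max_columns": 5})

# The same mapping works when option_context is used as a context manager or decorator.
with pd.option_context({"display.max_rows": 20, "display.max_columns": 10}):
    pass  # options are restored on exit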
diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py
index db98751324ebc..7fd0395009adb 100644
--- a/pandas/tests/dtypes/test_inference.py
+++ b/pandas/tests/dtypes/test_inference.py
@@ -250,6 +250,15 @@ class MyDataFrame(DataFrame, Generic[T]): ...
assert inference.is_list_like(tst)
+def test_is_list_like_native_container_types():
+ # GH 61565
+ # is_list_like was yielding false positives for native container types
+ assert not inference.is_list_like(list[int])
+ assert not inference.is_list_like(list[str])
+ assert not inference.is_list_like(tuple[int])
+ assert not inference.is_list_like(tuple[str])
+
+
def test_is_sequence():
is_seq = inference.is_sequence
assert is_seq((1, 2))
diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
index 25129111180d6..96c014f549056 100644
--- a/pandas/tests/extension/test_string.py
+++ b/pandas/tests/extension/test_string.py
@@ -31,6 +31,7 @@
from pandas.api.types import is_string_dtype
from pandas.core.arrays import ArrowStringArray
from pandas.core.arrays.string_ import StringDtype
+from pandas.tests.arrays.string_.test_string import string_dtype_highest_priority
from pandas.tests.extension import base
@@ -202,10 +203,13 @@ def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result):
dtype = cast(StringDtype, tm.get_dtype(obj))
if op_name in ["__add__", "__radd__"]:
cast_to = dtype
+ dtype_other = tm.get_dtype(other) if not isinstance(other, str) else None
+ if isinstance(dtype_other, StringDtype):
+ cast_to = string_dtype_highest_priority(dtype, dtype_other)
elif dtype.na_value is np.nan:
cast_to = np.bool_ # type: ignore[assignment]
elif dtype.storage == "pyarrow":
- cast_to = "boolean[pyarrow]" # type: ignore[assignment]
+ cast_to = "bool[pyarrow]" # type: ignore[assignment]
else:
cast_to = "boolean" # type: ignore[assignment]
return pointwise_result.astype(cast_to)
@@ -236,10 +240,10 @@ def test_arith_series_with_array(
if (
using_infer_string
and all_arithmetic_operators == "__radd__"
- and (
- (dtype.na_value is pd.NA) or (dtype.storage == "python" and HAS_PYARROW)
- )
+ and dtype.na_value is pd.NA
+ and (HAS_PYARROW or dtype.storage == "pyarrow")
):
+ # TODO(infer_string)
mark = pytest.mark.xfail(
reason="The pointwise operation result will be inferred to "
"string[nan, pyarrow], which does not match the input dtype"
diff --git a/pandas/tests/frame/methods/test_dot.py b/pandas/tests/frame/methods/test_dot.py
index 3e01f67c8794b..b365ceb2ab61c 100644
--- a/pandas/tests/frame/methods/test_dot.py
+++ b/pandas/tests/frame/methods/test_dot.py
@@ -153,3 +153,19 @@ def test_arrow_dtype(dtype, exp_dtype):
expected = DataFrame([[1, 2], [3, 4], [5, 6]], dtype=exp_dtype)
tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+ "dtype,exp_dtype",
+ [("Float32", "Float64"), ("Int16", "Int32"), ("float[pyarrow]", "double[pyarrow]")],
+)
+def test_arrow_dtype_series(dtype, exp_dtype):
+ pytest.importorskip("pyarrow")
+
+ cols = ["a", "b"]
+ series_a = Series([1, 2], index=cols, dtype="int32")
+ df_b = DataFrame([[1, 0], [0, 1]], index=cols, dtype=dtype)
+ result = series_a.dot(df_b)
+ expected = Series([1, 2], dtype=exp_dtype)
+
+ tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/frame/methods/test_sort_values.py b/pandas/tests/frame/methods/test_sort_values.py
index e728526519e9d..9a628c2ee9f73 100644
--- a/pandas/tests/frame/methods/test_sort_values.py
+++ b/pandas/tests/frame/methods/test_sort_values.py
@@ -630,6 +630,13 @@ def test_sort_values_no_op_reset_index(self):
expected = DataFrame({"A": [10, 20], "B": [1, 5]})
tm.assert_frame_equal(result, expected)
+ def test_sort_by_column_named_none(self):
+ # GH#61512
+ df = DataFrame([[3, 1], [2, 2]], columns=[None, "C1"])
+ result = df.sort_values(by=None)
+ expected = DataFrame([[2, 2], [3, 1]], columns=[None, "C1"], index=[1, 0])
+ tm.assert_frame_equal(result, expected)
+
class TestDataFrameSortKey: # test key sorting (issue 27237)
def test_sort_values_inplace_key(self, sort_by_key):
diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py
index 9eafc69013ffe..34d120145b381 100644
--- a/pandas/tests/frame/methods/test_to_csv.py
+++ b/pandas/tests/frame/methods/test_to_csv.py
@@ -1450,3 +1450,22 @@ def test_to_csv_warn_when_zip_tar_and_append_mode(self, tmp_path):
RuntimeWarning, match=msg, raise_on_extra_warnings=False
):
df.to_csv(tar_path, mode="a")
+
+ def test_to_csv_escape_quotechar(self):
+ # GH61514
+ df = DataFrame(
+ {
+ "col_a": ["a", "a2"],
+ "col_b": ['b"c', None],
+ "col_c": ['de,f"', '"c'],
+ }
+ )
+
+ result = df.to_csv(quotechar='"', escapechar="\\", quoting=csv.QUOTE_NONE)
+ expected_rows = [
+ ",col_a,col_b,col_c",
+ '0,a,b\\"c,de\\,f\\"',
+ '1,a2,,\\"c',
+ ]
+ expected = tm.convert_rows_list_to_csv_str(expected_rows)
+ assert result == expected
diff --git a/pandas/tests/frame/methods/test_to_numpy.py b/pandas/tests/frame/methods/test_to_numpy.py
index 36088cceb13f1..f68d7f533645d 100644
--- a/pandas/tests/frame/methods/test_to_numpy.py
+++ b/pandas/tests/frame/methods/test_to_numpy.py
@@ -3,7 +3,9 @@
from pandas import (
DataFrame,
+ NaT,
Timestamp,
+ date_range,
)
import pandas._testing as tm
@@ -41,3 +43,37 @@ def test_to_numpy_mixed_dtype_to_str(self):
result = df.to_numpy(dtype=str)
expected = np.array([["2020-01-01 00:00:00", "100.0"]], dtype=str)
tm.assert_numpy_array_equal(result, expected)
+
+ def test_to_numpy_datetime_with_na(self):
+ # GH #53115
+ dti = date_range("2016-01-01", periods=3)
+ df = DataFrame(dti)
+ df.iloc[0, 0] = NaT
+ expected = np.array([[np.nan], [1.45169280e18], [1.45177920e18]])
+ result = df.to_numpy(float, na_value=np.nan)
+ tm.assert_numpy_array_equal(result, expected)
+
+ df = DataFrame(
+ {
+ "a": [Timestamp("1970-01-01"), Timestamp("1970-01-02"), NaT],
+ "b": [
+ Timestamp("1970-01-01"),
+ np.nan,
+ Timestamp("1970-01-02"),
+ ],
+ "c": [
+ 1,
+ np.nan,
+ 2,
+ ],
+ }
+ )
+ expected = np.array(
+ [
+ [0.00e00, 0.00e00, 1.00e00],
+ [8.64e04, np.nan, np.nan],
+ [np.nan, 8.64e04, 2.00e00],
+ ]
+ )
+ result = df.to_numpy(float, na_value=np.nan)
+ tm.assert_numpy_array_equal(result, expected)
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
index 037a2ae294bb2..2426c89dbcff5 100644
--- a/pandas/tests/frame/test_constructors.py
+++ b/pandas/tests/frame/test_constructors.py
@@ -2780,6 +2780,19 @@ def test_construction_nan_value_timedelta64_dtype(self):
)
tm.assert_frame_equal(result, expected)
+ def test_dataframe_from_array_like_with_name_attribute(self):
+ # GH#61443
+ class DummyArray(np.ndarray):
+ def __new__(cls, input_array):
+ obj = np.asarray(input_array).view(cls)
+ obj.name = "foo"
+ return obj
+
+ dummy = DummyArray(np.eye(3))
+ df = DataFrame(dummy)
+ expected = DataFrame(np.eye(3))
+ tm.assert_frame_equal(df, expected)
+
class TestDataFrameConstructorIndexInference:
def test_frame_from_dict_of_series_overlapping_monthly_period_indexes(self):
diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py
index 127f0fc50a747..cc23c292b66dc 100644
--- a/pandas/tests/frame/test_reductions.py
+++ b/pandas/tests/frame/test_reductions.py
@@ -1917,6 +1917,39 @@ def test_df_empty_nullable_min_count_1(self, opname, dtype, exp_dtype):
expected = Series([pd.NA, pd.NA], dtype=exp_dtype, index=Index([0, 1]))
tm.assert_series_equal(result, expected)
+ @pytest.mark.parametrize(
+ "data",
+ [
+ {"a": [0, 1, 2], "b": [pd.NaT, pd.NaT, pd.NaT]},
+ {"a": [0, 1, 2], "b": [Timestamp("1990-01-01"), pd.NaT, pd.NaT]},
+ {
+ "a": [0, 1, 2],
+ "b": [
+ Timestamp("1990-01-01"),
+ Timestamp("1991-01-01"),
+ Timestamp("1992-01-01"),
+ ],
+ },
+ {
+ "a": [0, 1, 2],
+ "b": [pd.Timedelta("1 days"), pd.Timedelta("2 days"), pd.NaT],
+ },
+ {
+ "a": [0, 1, 2],
+ "b": [
+ pd.Timedelta("1 days"),
+ pd.Timedelta("2 days"),
+ pd.Timedelta("3 days"),
+ ],
+ },
+ ],
+ )
+ def test_df_cov_pd_nat(self, data):
+ # GH #53115
+ df = DataFrame(data)
+ with pytest.raises(TypeError, match="not supported for cov"):
+ df.cov()
+
def test_sum_timedelta64_skipna_false():
# GH#17235
diff --git a/pandas/tests/generic/test_to_xarray.py b/pandas/tests/generic/test_to_xarray.py
index 9fe9bca8abdc9..8917e4e3f3854 100644
--- a/pandas/tests/generic/test_to_xarray.py
+++ b/pandas/tests/generic/test_to_xarray.py
@@ -6,11 +6,13 @@
DataFrame,
MultiIndex,
Series,
+ StringDtype,
date_range,
)
import pandas._testing as tm
+from pandas.util.version import Version
-pytest.importorskip("xarray")
+xarray = pytest.importorskip("xarray")
class TestDataFrameToXArray:
@@ -29,13 +31,17 @@ def df(self):
}
)
- def test_to_xarray_index_types(self, index_flat, df, using_infer_string):
+ def test_to_xarray_index_types(self, index_flat, df, request):
index = index_flat
# MultiIndex is tested in test_to_xarray_with_multiindex
if len(index) == 0:
pytest.skip("Test doesn't make sense for empty index")
-
- from xarray import Dataset
+ elif Version(xarray.__version__) <= Version("2024.9.0"):
+ request.applymarker(
+ pytest.mark.xfail(
+ reason="Categorical column not preserved.",
+ )
+ )
df.index = index[:4]
df.index.name = "foo"
@@ -45,29 +51,22 @@ def test_to_xarray_index_types(self, index_flat, df, using_infer_string):
assert len(result.coords) == 1
assert len(result.data_vars) == 8
tm.assert_almost_equal(list(result.coords.keys()), ["foo"])
- assert isinstance(result, Dataset)
+ assert isinstance(result, xarray.Dataset)
# idempotency
# datetimes w/tz are preserved
# column names are lost
expected = df.copy()
- expected["f"] = expected["f"].astype(
- object if not using_infer_string else "str"
- )
expected.columns.name = None
tm.assert_frame_equal(result.to_dataframe(), expected)
def test_to_xarray_empty(self, df):
- from xarray import Dataset
-
df.index.name = "foo"
result = df[0:0].to_xarray()
assert result.sizes["foo"] == 0
- assert isinstance(result, Dataset)
+ assert isinstance(result, xarray.Dataset)
def test_to_xarray_with_multiindex(self, df, using_infer_string):
- from xarray import Dataset
-
# MultiIndex
df.index = MultiIndex.from_product([["a"], range(4)], names=["one", "two"])
result = df.to_xarray()
@@ -76,7 +75,7 @@ def test_to_xarray_with_multiindex(self, df, using_infer_string):
assert len(result.coords) == 2
assert len(result.data_vars) == 8
tm.assert_almost_equal(list(result.coords.keys()), ["one", "two"])
- assert isinstance(result, Dataset)
+ assert isinstance(result, xarray.Dataset)
result = result.to_dataframe()
expected = df.copy()
@@ -88,12 +87,21 @@ def test_to_xarray_with_multiindex(self, df, using_infer_string):
class TestSeriesToXArray:
- def test_to_xarray_index_types(self, index_flat):
+ def test_to_xarray_index_types(self, index_flat, request):
index = index_flat
+ if (
+ isinstance(index.dtype, StringDtype)
+ and index.dtype.storage == "pyarrow"
+ and Version(xarray.__version__) > Version("2024.9.0")
+ ):
+ request.applymarker(
+ pytest.mark.xfail(
+ reason="xarray calling reshape of ArrowExtensionArray",
+ raises=NotImplementedError,
+ )
+ )
# MultiIndex is tested in test_to_xarray_with_multiindex
- from xarray import DataArray
-
ser = Series(range(len(index)), index=index, dtype="int64")
ser.index.name = "foo"
result = ser.to_xarray()
@@ -101,30 +109,26 @@ def test_to_xarray_index_types(self, index_flat):
assert len(result) == len(index)
assert len(result.coords) == 1
tm.assert_almost_equal(list(result.coords.keys()), ["foo"])
- assert isinstance(result, DataArray)
+ assert isinstance(result, xarray.DataArray)
# idempotency
tm.assert_series_equal(result.to_series(), ser)
def test_to_xarray_empty(self):
- from xarray import DataArray
-
ser = Series([], dtype=object)
ser.index.name = "foo"
result = ser.to_xarray()
assert len(result) == 0
assert len(result.coords) == 1
tm.assert_almost_equal(list(result.coords.keys()), ["foo"])
- assert isinstance(result, DataArray)
+ assert isinstance(result, xarray.DataArray)
def test_to_xarray_with_multiindex(self):
- from xarray import DataArray
-
mi = MultiIndex.from_product([["a", "b"], range(3)], names=["one", "two"])
ser = Series(range(6), dtype="int64", index=mi)
result = ser.to_xarray()
assert len(result) == 2
tm.assert_almost_equal(list(result.coords.keys()), ["one", "two"])
- assert isinstance(result, DataArray)
+ assert isinstance(result, xarray.DataArray)
res = result.to_series()
tm.assert_series_equal(res, ser)
diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py
index b7e6e55739c17..4f6c27bd327cb 100644
--- a/pandas/tests/groupby/aggregate/test_aggregate.py
+++ b/pandas/tests/groupby/aggregate/test_aggregate.py
@@ -1807,3 +1807,20 @@ def test_groupby_aggregation_func_list_multi_index_duplicate_columns():
index=Index(["level1.1", "level1.2"]),
)
tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_aggregate_empty_builtin_sum():
+ df = DataFrame(columns=["Group", "Data"])
+ result = df.groupby(["Group"], as_index=False)["Data"].agg("sum")
+ expected = DataFrame(columns=["Group", "Data"])
+ tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_aggregate_empty_udf():
+ def func(x):
+ return sum(x)
+
+ df = DataFrame(columns=["Group", "Data"])
+ result = df.groupby(["Group"], as_index=False)["Data"].agg(func)
+ expected = DataFrame(columns=["Group", "Data"])
+ tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py
index 864b9e5d55991..0012074b9f995 100644
--- a/pandas/tests/groupby/test_raises.py
+++ b/pandas/tests/groupby/test_raises.py
@@ -671,7 +671,7 @@ def test_groupby_raises_category_on_category(
"nunique": (None, ""),
"pct_change": (TypeError, "unsupported operand type"),
"prod": (TypeError, "category type does not support prod operations"),
- "quantile": (TypeError, ""),
+ "quantile": (TypeError, "No matching signature found"),
"rank": (None, ""),
"sem": (
TypeError,
diff --git a/pandas/tests/indexes/datetimes/test_setops.py b/pandas/tests/indexes/datetimes/test_setops.py
index 7ef6efad0ff6f..7a68cb867c94e 100644
--- a/pandas/tests/indexes/datetimes/test_setops.py
+++ b/pandas/tests/indexes/datetimes/test_setops.py
@@ -201,6 +201,69 @@ def test_union_same_timezone_different_units(self):
expected = date_range("2000-01-01", periods=3, tz="UTC").as_unit("us")
tm.assert_index_equal(result, expected)
+ def test_union_same_nonzero_timezone_different_units(self):
+ # GH 60080 - fix timezone being changed to UTC when units differ
+ # but timezone is the same
+ tz = "UTC+05:00"
+ idx1 = date_range("2000-01-01", periods=3, tz=tz).as_unit("us")
+ idx2 = date_range("2000-01-01", periods=3, tz=tz).as_unit("ns")
+
+ # Check pre-conditions
+ assert idx1.tz == idx2.tz
+ assert idx1.dtype != idx2.dtype # Different units
+
+ # Test union preserves timezone when units differ
+ result = idx1.union(idx2)
+ expected = date_range("2000-01-01", periods=3, tz=tz).as_unit("ns")
+ tm.assert_index_equal(result, expected)
+
+ def test_union_different_dates_same_timezone_different_units(self):
+ # GH 60080 - fix timezone being changed to UTC when units differ
+ # but timezone is the same
+ tz = "UTC+05:00"
+ idx1 = date_range("2000-01-01", periods=3, tz=tz).as_unit("us")
+ idx3 = date_range("2000-01-03", periods=3, tz=tz).as_unit("us")
+
+ # Test with different dates to ensure it's not just returning one of the inputs
+ result = idx1.union(idx3)
+ expected = DatetimeIndex(
+ ["2000-01-01", "2000-01-02", "2000-01-03", "2000-01-04", "2000-01-05"],
+ tz=tz,
+ ).as_unit("us")
+ tm.assert_index_equal(result, expected)
+
+ def test_intersection_same_timezone_different_units(self):
+ # GH 60080 - fix timezone being changed to UTC when units differ
+ # but timezone is the same
+ tz = "UTC+05:00"
+ idx1 = date_range("2000-01-01", periods=3, tz=tz).as_unit("us")
+ idx2 = date_range("2000-01-01", periods=3, tz=tz).as_unit("ns")
+
+ # Check pre-conditions
+ assert idx1.tz == idx2.tz
+ assert idx1.dtype != idx2.dtype # Different units
+
+ # Test intersection
+ result = idx1.intersection(idx2)
+ expected = date_range("2000-01-01", periods=3, tz=tz).as_unit("ns")
+ tm.assert_index_equal(result, expected)
+
+ def test_symmetric_difference_same_timezone_different_units(self):
+ # GH 60080 - fix timezone being changed to UTC when units differ
+ # but timezone is the same
+ tz = "UTC+05:00"
+ idx1 = date_range("2000-01-01", periods=3, tz=tz).as_unit("us")
+ idx4 = date_range("2000-01-02", periods=3, tz=tz).as_unit("ns")
+
+ # Check pre-conditions
+ assert idx1.tz == idx4.tz
+ assert idx1.dtype != idx4.dtype # Different units
+
+ # Test symmetric_difference
+ result = idx1.symmetric_difference(idx4)
+ expected = DatetimeIndex(["2000-01-01", "2000-01-04"], tz=tz).as_unit("ns")
+ tm.assert_index_equal(result, expected)
+
# TODO: moved from test_datetimelike; de-duplicate with version below
def test_intersection2(self):
first = date_range("2020-01-01", periods=10)
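A compact sketch of what the four new tests above assert (GH 60080): set operations between indexes that share a timezone but differ in unit keep that timezone rather than coercing to UTC.

import pandas as pd

tz = "UTC+05:00"
idx_us = pd.date_range("2000-01-01", periods=3, tz=tz).as_unit("us")
idx_ns = pd.date_range("2000-01-01", periods=3, tz=tz).as_unit("ns")
# Same wall-clock values, different resolutions: the union keeps tz="UTC+05:00".
result = idx_us.union(idx_ns)
assert result.tz == idx_us.tz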
diff --git a/pandas/tests/io/formats/test_to_string.py b/pandas/tests/io/formats/test_to_string.py
index 63c975fd831e7..0866581535c2f 100644
--- a/pandas/tests/io/formats/test_to_string.py
+++ b/pandas/tests/io/formats/test_to_string.py
@@ -777,9 +777,9 @@ def test_to_string_string_dtype(self):
result = df.dtypes.to_string()
expected = dedent(
"""\
- x string[pyarrow]
- y string[python]
- z int64[pyarrow]"""
+ x string
+ y string
+ z int64[pyarrow]"""
)
assert result == expected
diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py
index b3ab6b48508e1..5cfefeb469e8a 100644
--- a/pandas/tests/io/pytables/test_store.py
+++ b/pandas/tests/io/pytables/test_store.py
@@ -23,6 +23,9 @@
timedelta_range,
)
import pandas._testing as tm
+from pandas.api.types import (
+ CategoricalDtype,
+)
from pandas.tests.io.pytables.common import (
_maybe_remove,
ensure_clean_store,
@@ -1107,3 +1110,23 @@ def test_store_bool_index(tmp_path, setup_path):
df.to_hdf(path, key="a")
result = read_hdf(path, "a")
tm.assert_frame_equal(expected, result)
+
+
+@pytest.mark.parametrize("model", ["name", "longname", "verylongname"])
+def test_select_categorical_string_columns(tmp_path, model):
+ # Corresponding to BUG: 57608
+
+ path = tmp_path / "test.h5"
+
+ models = CategoricalDtype(categories=["name", "longname", "verylongname"])
+ df = DataFrame(
+ {"modelId": ["name", "longname", "longname"], "value": [1, 2, 3]}
+ ).astype({"modelId": models, "value": int})
+
+ with HDFStore(path, "w") as store:
+ store.append("df", df, data_columns=["modelId"])
+
+ with HDFStore(path, "r") as store:
+ result = store.select("df", "modelId == model")
+ expected = df[df["modelId"] == model]
+ tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/test_iceberg.py b/pandas/tests/io/test_iceberg.py
new file mode 100644
index 0000000000000..916c1d2af9b12
--- /dev/null
+++ b/pandas/tests/io/test_iceberg.py
@@ -0,0 +1,222 @@
+"""
+Tests for the Apache Iceberg format.
+
+Tests in this file use a simple Iceberg catalog based on SQLite, with the same
+data used for Parquet tests (``pandas/tests/io/data/parquet/simple.parquet``).
+"""
+
+import collections
+import importlib
+import pathlib
+
+import pytest
+
+import pandas as pd
+import pandas._testing as tm
+
+from pandas.io.iceberg import read_iceberg
+
+pytestmark = pytest.mark.single_cpu
+
+pyiceberg = pytest.importorskip("pyiceberg")
+pyiceberg_catalog = pytest.importorskip("pyiceberg.catalog")
+pq = pytest.importorskip("pyarrow.parquet")
+
+Catalog = collections.namedtuple("Catalog", ["name", "uri", "warehouse"])
+
+
+@pytest.fixture
+def catalog(request, tmp_path):
+ # the catalog stores the full path of data files, so the catalog needs to be
+ # created dynamically, and not saved in pandas/tests/io/data as other formats
+ uri = f"sqlite:///{tmp_path}/catalog.sqlite"
+ warehouse = f"file://{tmp_path}"
+ catalog_name = request.param if hasattr(request, "param") else None
+ catalog = pyiceberg_catalog.load_catalog(
+ catalog_name or "default",
+ type="sql",
+ uri=uri,
+ warehouse=warehouse,
+ )
+ catalog.create_namespace("ns")
+
+ df = pq.read_table(
+ pathlib.Path(__file__).parent / "data" / "parquet" / "simple.parquet"
+ )
+ table = catalog.create_table("ns.my_table", schema=df.schema)
+ table.append(df)
+
+ if catalog_name is not None:
+ config_path = pathlib.Path.home() / ".pyiceberg.yaml"
+ with open(config_path, "w", encoding="utf-8") as f:
+ f.write(f"""\
+catalog:
+ {catalog_name}:
+ type: sql
+ uri: {uri}
+ warehouse: {warehouse}""")
+
+ importlib.reload(pyiceberg_catalog) # needed to reload the config file
+
+ yield Catalog(name=catalog_name or "default", uri=uri, warehouse=warehouse)
+
+ if catalog_name is not None:
+ config_path.unlink()
+
+
+class TestIceberg:
+ def test_read(self, catalog):
+ expected = pd.DataFrame(
+ {
+ "A": [1, 2, 3],
+ "B": ["foo", "foo", "foo"],
+ }
+ )
+ result = read_iceberg(
+ "ns.my_table",
+ catalog_properties={"uri": catalog.uri},
+ )
+ tm.assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize("catalog", ["default", "pandas_tests"], indirect=True)
+ def test_read_by_catalog_name(self, catalog):
+ expected = pd.DataFrame(
+ {
+ "A": [1, 2, 3],
+ "B": ["foo", "foo", "foo"],
+ }
+ )
+ result = read_iceberg(
+ "ns.my_table",
+ catalog_name=catalog.name,
+ )
+ tm.assert_frame_equal(result, expected)
+
+ def test_read_with_row_filter(self, catalog):
+ expected = pd.DataFrame(
+ {
+ "A": [2, 3],
+ "B": ["foo", "foo"],
+ }
+ )
+ result = read_iceberg(
+ "ns.my_table",
+ catalog_properties={"uri": catalog.uri},
+ row_filter="A > 1",
+ )
+ tm.assert_frame_equal(result, expected)
+
+ def test_read_with_case_sensitive(self, catalog):
+ expected = pd.DataFrame(
+ {
+ "A": [1, 2, 3],
+ }
+ )
+ result = read_iceberg(
+ "ns.my_table",
+ catalog_properties={"uri": catalog.uri},
+ selected_fields=["a"],
+ case_sensitive=False,
+ )
+ tm.assert_frame_equal(result, expected)
+
+ with pytest.raises(ValueError, match="^Could not find column"):
+ read_iceberg(
+ "ns.my_table",
+ catalog_properties={"uri": catalog.uri},
+ selected_fields=["a"],
+ case_sensitive=True,
+ )
+
+ def test_read_with_limit(self, catalog):
+ expected = pd.DataFrame(
+ {
+ "A": [1, 2],
+ "B": ["foo", "foo"],
+ }
+ )
+ result = read_iceberg(
+ "ns.my_table",
+ catalog_properties={"uri": catalog.uri},
+ limit=2,
+ )
+ tm.assert_frame_equal(result, expected)
+
+ def test_write(self, catalog):
+ df = pd.DataFrame(
+ {
+ "A": [1, 2, 3],
+ "B": ["foo", "foo", "foo"],
+ }
+ )
+ df.to_iceberg(
+ "ns.new_table",
+ catalog_properties={"uri": catalog.uri},
+ location=catalog.warehouse,
+ )
+ result = read_iceberg(
+ "ns.new_table",
+ catalog_properties={"uri": catalog.uri},
+ )
+ tm.assert_frame_equal(result, df)
+
+ @pytest.mark.parametrize("catalog", ["default", "pandas_tests"], indirect=True)
+ def test_write_by_catalog_name(self, catalog):
+ df = pd.DataFrame(
+ {
+ "A": [1, 2, 3],
+ "B": ["foo", "foo", "foo"],
+ }
+ )
+ df.to_iceberg(
+ "ns.new_table",
+ catalog_name=catalog.name,
+ )
+ result = read_iceberg(
+ "ns.new_table",
+ catalog_name=catalog.name,
+ )
+ tm.assert_frame_equal(result, df)
+
+ def test_write_existing_table_with_append_true(self, catalog):
+ original = read_iceberg(
+ "ns.my_table",
+ catalog_properties={"uri": catalog.uri},
+ )
+ new = pd.DataFrame(
+ {
+ "A": [1, 2, 3],
+ "B": ["foo", "foo", "foo"],
+ }
+ )
+ expected = pd.concat([original, new], ignore_index=True)
+ new.to_iceberg(
+ "ns.my_table",
+ catalog_properties={"uri": catalog.uri},
+ location=catalog.warehouse,
+ append=True,
+ )
+ result = read_iceberg(
+ "ns.my_table",
+ catalog_properties={"uri": catalog.uri},
+ )
+ tm.assert_frame_equal(result, expected)
+
+ def test_write_existing_table_with_append_false(self, catalog):
+ df = pd.DataFrame(
+ {
+ "A": [1, 2, 3],
+ "B": ["foo", "foo", "foo"],
+ }
+ )
+ df.to_iceberg(
+ "ns.my_table",
+ catalog_properties={"uri": catalog.uri},
+ location=catalog.warehouse,
+ append=False,
+ )
+ result = read_iceberg(
+ "ns.my_table",
+ catalog_properties={"uri": catalog.uri},
+ )
+ tm.assert_frame_equal(result, df)
diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py
index db55b73bfb125..4a6a5635eb68c 100644
--- a/pandas/tests/io/test_sql.py
+++ b/pandas/tests/io/test_sql.py
@@ -1,7 +1,6 @@
from __future__ import annotations
import contextlib
-from contextlib import closing
import csv
from datetime import (
date,
@@ -2498,10 +2497,8 @@ def test_sqlalchemy_integer_overload_mapping(conn, request, integer):
sql.SQLTable("test_type", db, frame=df)
-@pytest.mark.parametrize("conn", all_connectable)
-def test_database_uri_string(conn, request, test_frame1):
+def test_database_uri_string(request, test_frame1):
pytest.importorskip("sqlalchemy")
- conn = request.getfixturevalue(conn)
# Test read_sql and .to_sql method with a database URI (GH10654)
# db_uri = 'sqlite:///:memory:' # raises
# sqlalchemy.exc.OperationalError: (sqlite3.OperationalError) near
@@ -2520,10 +2517,8 @@ def test_database_uri_string(conn, request, test_frame1):
@td.skip_if_installed("pg8000")
-@pytest.mark.parametrize("conn", all_connectable)
-def test_pg8000_sqlalchemy_passthrough_error(conn, request):
+def test_pg8000_sqlalchemy_passthrough_error(request):
pytest.importorskip("sqlalchemy")
- conn = request.getfixturevalue(conn)
# using driver that will not be installed on CI to trigger error
# in sqlalchemy.create_engine -> test passing of this error to user
db_uri = "postgresql+pg8000://user:pass@host/dbname"
@@ -2584,10 +2579,10 @@ def test_sql_open_close(test_frame3):
# between the writing and reading (as in many real situations).
with tm.ensure_clean() as name:
- with closing(sqlite3.connect(name)) as conn:
+ with contextlib.closing(sqlite3.connect(name)) as conn:
assert sql.to_sql(test_frame3, "test_frame3_legacy", conn, index=False) == 4
- with closing(sqlite3.connect(name)) as conn:
+ with contextlib.closing(sqlite3.connect(name)) as conn:
result = sql.read_sql_query("SELECT * FROM test_frame3_legacy;", conn)
tm.assert_frame_equal(test_frame3, result)
@@ -2731,25 +2726,26 @@ def test_delete_rows_is_atomic(conn_name, request):
replacing_df = DataFrame({"a": [5, 6, 7], "b": [8, 8, 8]}, dtype="int32")
conn = request.getfixturevalue(conn_name)
- pandasSQL = pandasSQL_builder(conn)
+ with pandasSQL_builder(conn) as pandasSQL:
+ with pandasSQL.run_transaction() as cur:
+ cur.execute(table_stmt)
- with pandasSQL.run_transaction() as cur:
- cur.execute(table_stmt)
+ with pandasSQL.run_transaction():
+ pandasSQL.to_sql(original_df, table_name, if_exists="append", index=False)
- with pandasSQL.run_transaction():
- pandasSQL.to_sql(original_df, table_name, if_exists="append", index=False)
+ # inserting duplicated values in a UNIQUE constraint column
+ with pytest.raises(pd.errors.DatabaseError):
+ with pandasSQL.run_transaction():
+ pandasSQL.to_sql(
+ replacing_df, table_name, if_exists="delete_rows", index=False
+ )
- # inserting duplicated values in a UNIQUE constraint column
- with pytest.raises(pd.errors.DatabaseError):
+ # failed "delete_rows" is rolled back preserving original data
with pandasSQL.run_transaction():
- pandasSQL.to_sql(
- replacing_df, table_name, if_exists="delete_rows", index=False
+ result_df = pandasSQL.read_query(
+ f"SELECT * FROM {table_name}", dtype="int32"
)
-
- # failed "delete_rows" is rolled back preserving original data
- with pandasSQL.run_transaction():
- result_df = pandasSQL.read_query(f"SELECT * FROM {table_name}", dtype="int32")
- tm.assert_frame_equal(result_df, original_df)
+ tm.assert_frame_equal(result_df, original_df)
@pytest.mark.parametrize("conn", all_connectable)
@@ -2759,10 +2755,10 @@ def test_roundtrip(conn, request, test_frame1):
conn_name = conn
conn = request.getfixturevalue(conn)
- pandasSQL = pandasSQL_builder(conn)
- with pandasSQL.run_transaction():
- assert pandasSQL.to_sql(test_frame1, "test_frame_roundtrip") == 4
- result = pandasSQL.read_query("SELECT * FROM test_frame_roundtrip")
+ with pandasSQL_builder(conn) as pandasSQL:
+ with pandasSQL.run_transaction():
+ assert pandasSQL.to_sql(test_frame1, "test_frame_roundtrip") == 4
+ result = pandasSQL.read_query("SELECT * FROM test_frame_roundtrip")
if "adbc" in conn_name:
result = result.rename(columns={"__index_level_0__": "level_0"})
@@ -3577,13 +3573,6 @@ def test_options_get_engine():
assert isinstance(get_engine("sqlalchemy"), SQLAlchemyEngine)
-def test_get_engine_auto_error_message():
- # Expect different error messages from get_engine(engine="auto")
- # if engines aren't installed vs. are installed but bad version
- pass
- # TODO(GH#36893) fill this in when we add more engines
-
-
@pytest.mark.parametrize("conn", all_connectable)
@pytest.mark.parametrize("func", ["read_sql", "read_sql_query"])
def test_read_sql_dtype_backend(
diff --git a/pandas/tests/io/xml/test_to_xml.py b/pandas/tests/io/xml/test_to_xml.py
index 50fef2c5eb4eb..4446dbe320b69 100644
--- a/pandas/tests/io/xml/test_to_xml.py
+++ b/pandas/tests/io/xml/test_to_xml.py
@@ -1345,7 +1345,7 @@ def test_ea_dtypes(any_numeric_ea_dtype, parser):
assert equalize_decl(result).strip() == expected
-def test_unsuported_compression(parser, geom_df):
+def test_unsupported_compression(parser, geom_df):
with pytest.raises(ValueError, match="Unrecognized compression type"):
with tm.ensure_clean() as path:
geom_df.to_xml(path, parser=parser, compression="7z")
diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py
index d897d251909fe..cf8ae28c4d9b5 100644
--- a/pandas/tests/io/xml/test_xml.py
+++ b/pandas/tests/io/xml/test_xml.py
@@ -1961,7 +1961,7 @@ def test_wrong_compression(parser, compression, compression_only):
read_xml(path, parser=parser, compression=attempted_compression)
-def test_unsuported_compression(parser):
+def test_unsupported_compression(parser):
with pytest.raises(ValueError, match="Unrecognized compression type"):
with tm.ensure_clean() as path:
read_xml(path, parser=parser, compression="7z")
diff --git a/pandas/tests/libs/test_lib.py b/pandas/tests/libs/test_lib.py
index 17dae1879f3b8..f619ba4dd204b 100644
--- a/pandas/tests/libs/test_lib.py
+++ b/pandas/tests/libs/test_lib.py
@@ -297,3 +297,13 @@ def test_ensure_string_array_copy():
assert not np.shares_memory(arr, result)
assert arr[1] is None
assert result[1] is np.nan
+
+
+def test_ensure_string_array_list_of_lists():
+ # GH#61155: ensure list of lists doesn't get converted to string
+ arr = [list("test"), list("word")]
+ result = lib.ensure_string_array(arr)
+
+ # Each item in result should still be a list, not a stringified version
+ expected = np.array(["['t', 'e', 's', 't']", "['w', 'o', 'r', 'd']"], dtype=object)
+ tm.assert_numpy_array_equal(result, expected)
diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py
index f4a0f1e792ae6..d3e1d7f60384b 100644
--- a/pandas/tests/plotting/test_misc.py
+++ b/pandas/tests/plotting/test_misc.py
@@ -31,6 +31,8 @@
plt = pytest.importorskip("matplotlib.pyplot")
cm = pytest.importorskip("matplotlib.cm")
+import re
+
from pandas.plotting._matplotlib.style import get_standard_colors
@@ -727,7 +729,11 @@ def _df_bar_subplot_checker(df_bar_data, df_bar_df, subplot_data_df, subplot_col
].reset_index()
for i in range(len(subplot_columns))
]
- expected_total_height = df_bar_df.loc[:, subplot_columns].sum(axis=1)
+
+ if len(subplot_columns) == 1:
+ expected_total_height = df_bar_df.loc[:, subplot_columns[0]]
+ else:
+ expected_total_height = df_bar_df.loc[:, subplot_columns].sum(axis=1)
for i in range(len(subplot_columns)):
sliced_df = subplot_sliced_by_source[i]
@@ -743,7 +749,6 @@ def _df_bar_subplot_checker(df_bar_data, df_bar_df, subplot_data_df, subplot_col
tm.assert_series_equal(
height_iter, expected_total_height, check_names=False, check_dtype=False
)
-
else:
# Checks each preceding bar ends where the next one starts
next_start_coord = subplot_sliced_by_source[i + 1]["y_coord"]
@@ -816,3 +821,44 @@ def test_bar_2_subplots_1_triple_stacked(df_bar_data, df_bar_df, subplot_divisio
_df_bar_subplot_checker(
df_bar_data, df_bar_df, subplot_data_df_list[i], subplot_division[i]
)
+
+
+def test_bar_subplots_stacking_bool(df_bar_data, df_bar_df):
+ subplot_division = [("A"), ("B"), ("C"), ("D")]
+ ax = df_bar_df.plot(subplots=True, kind="bar", stacked=True)
+ subplot_data_df_list = _df_bar_xyheight_from_ax_helper(
+ df_bar_data, ax, subplot_division
+ )
+ for i in range(len(subplot_data_df_list)):
+ _df_bar_subplot_checker(
+ df_bar_data, df_bar_df, subplot_data_df_list[i], subplot_division[i]
+ )
+
+
+def test_plot_bar_label_count_default():
+ df = DataFrame(
+ [(30, 10, 10, 10), (20, 20, 20, 20), (10, 30, 30, 10)], columns=list("ABCD")
+ )
+ df.plot(subplots=True, kind="bar", title=["A", "B", "C", "D"])
+
+
+def test_plot_bar_label_count_expected_fail():
+ df = DataFrame(
+ [(30, 10, 10, 10), (20, 20, 20, 20), (10, 30, 30, 10)], columns=list("ABCD")
+ )
+ error_regex = re.escape(
+ "The number of titles (4) must equal the number of subplots (3)."
+ )
+ with pytest.raises(ValueError, match=error_regex):
+ df.plot(
+ subplots=[("A", "B")],
+ kind="bar",
+ title=["A&B", "C", "D", "Extra Title"],
+ )
+
+
+def test_plot_bar_label_count_expected_success():
+ df = DataFrame(
+ [(30, 10, 10, 10), (20, 20, 20, 20), (10, 30, 30, 10)], columns=list("ABCD")
+ )
+ df.plot(subplots=[("A", "B", "D")], kind="bar", title=["A&B&D", "C"])
diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py
index f0f67aebd85ec..f3418ad047afe 100644
--- a/pandas/tests/reshape/merge/test_merge.py
+++ b/pandas/tests/reshape/merge/test_merge.py
@@ -3060,3 +3060,12 @@ def test_merge_on_all_nan_column():
{"x": [1, 2, 3], "y": [np.nan, np.nan, np.nan], "z": [4, 5, 6], "zz": [4, 5, 6]}
)
tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("suffixes", [("_dup", ""), ("", "_dup")])
+def test_merge_for_suffix_collisions(suffixes):
+ # GH#61402
+ df1 = DataFrame({"col1": [1], "col2": [2]})
+ df2 = DataFrame({"col1": [1], "col2": [2], "col2_dup": [3]})
+ with pytest.raises(MergeError, match="duplicate columns"):
+ merge(df1, df2, on="col1", suffixes=suffixes)
diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py
index 95aa5291cb45a..02544c9518d10 100644
--- a/pandas/tests/reshape/test_melt.py
+++ b/pandas/tests/reshape/test_melt.py
@@ -555,6 +555,14 @@ def test_melt_multiindex_columns_var_name_too_many(self):
):
df.melt(var_name=["first", "second", "third"])
+ def test_melt_duplicate_column_header_raises(self):
+ # GH61475
+ df = DataFrame([[1, 2, 3], [3, 4, 5]], columns=["A", "A", "B"])
+ msg = "id_vars cannot contain duplicate columns."
+
+ with pytest.raises(ValueError, match=msg):
+ df.melt(id_vars=["A"], value_vars=["B"])
+
class TestLreshape:
def test_pairs(self):
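A small illustration of the case the new melt test guards against, assuming the ValueError added on this branch (GH61475):

```python
import pandas as pd

# "A" appears twice in the columns, so using it as an id variable is ambiguous.
df = pd.DataFrame([[1, 2, 3], [3, 4, 5]], columns=["A", "A", "B"])
try:
    df.melt(id_vars=["A"], value_vars=["B"])
except ValueError as err:
    print(err)  # "id_vars cannot contain duplicate columns."
```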
diff --git a/pandas/tests/series/methods/test_map.py b/pandas/tests/series/methods/test_map.py
index 384b7ce3dc985..0ec973dea23d5 100644
--- a/pandas/tests/series/methods/test_map.py
+++ b/pandas/tests/series/methods/test_map.py
@@ -21,6 +21,10 @@
)
import pandas._testing as tm
+# The fixture is mostly used in pandas/tests/apply, so it's defined in that
+# conftest, which is out of scope here, so we need to import it manually.
+from pandas.tests.apply.conftest import engine # noqa: F401
+
def test_series_map_box_timedelta():
# GH#11349
@@ -32,16 +36,20 @@ def f(x):
ser.map(f)
-def test_map_callable(datetime_series):
+def test_map_callable(datetime_series, engine): # noqa: F811
with np.errstate(all="ignore"):
- tm.assert_series_equal(datetime_series.map(np.sqrt), np.sqrt(datetime_series))
+ tm.assert_series_equal(
+ datetime_series.map(np.sqrt, engine=engine), np.sqrt(datetime_series)
+ )
# map function element-wise
- tm.assert_series_equal(datetime_series.map(math.exp), np.exp(datetime_series))
+ tm.assert_series_equal(
+ datetime_series.map(math.exp, engine=engine), np.exp(datetime_series)
+ )
# empty series
s = Series(dtype=object, name="foo", index=Index([], name="bar"))
- rs = s.map(lambda x: x)
+ rs = s.map(lambda x: x, engine=engine)
tm.assert_series_equal(s, rs)
# check all metadata (GH 9322)
@@ -52,7 +60,7 @@ def test_map_callable(datetime_series):
# index but no data
s = Series(index=[1, 2, 3], dtype=np.float64)
- rs = s.map(lambda x: x)
+ rs = s.map(lambda x: x, engine=engine)
tm.assert_series_equal(s, rs)
@@ -269,10 +277,10 @@ def test_map_decimal(string_series):
assert isinstance(result.iloc[0], Decimal)
-def test_map_na_exclusion():
+def test_map_na_exclusion(engine): # noqa: F811
s = Series([1.5, np.nan, 3, np.nan, 5])
- result = s.map(lambda x: x * 2, na_action="ignore")
+ result = s.map(lambda x: x * 2, na_action="ignore", engine=engine)
exp = s * 2
tm.assert_series_equal(result, exp)
@@ -628,3 +636,18 @@ def test_map_no_func_or_arg():
def test_map_func_is_none():
with pytest.raises(ValueError, match="The `func` parameter is required"):
Series([1, 2]).map(func=None)
+
+
+@pytest.mark.parametrize("func", [{}, {1: 2}, Series([3, 4])])
+def test_map_engine_no_function(func):
+ s = Series([1, 2])
+
+ with pytest.raises(ValueError, match="engine argument can only be specified"):
+ s.map(func, engine="something")
+
+
+def test_map_engine_not_executor():
+ s = Series([1, 2])
+
+ with pytest.raises(ValueError, match="Not a valid engine: 'something'"):
+ s.map(lambda x: x, engine="something")
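A hedged sketch of the two validation paths the new map tests cover, assuming the `engine` keyword that Series.map gains on this branch (the set of accepted engines comes from the apply conftest fixture and is not shown here):

```python
import pandas as pd

ser = pd.Series([1, 2])

# engine may only be combined with a callable func...
try:
    ser.map({1: 2}, engine="something")
except ValueError as err:
    print(err)  # "engine argument can only be specified ..."

# ...and unknown engine names are rejected even for callables.
try:
    ser.map(lambda x: x, engine="something")
except ValueError as err:
    print(err)  # "Not a valid engine: 'something'"
```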
diff --git a/pandas/tests/series/methods/test_round.py b/pandas/tests/series/methods/test_round.py
index c330b7a7dfbbb..a78f77e990ae1 100644
--- a/pandas/tests/series/methods/test_round.py
+++ b/pandas/tests/series/methods/test_round.py
@@ -72,3 +72,10 @@ def test_round_ea_boolean(self):
tm.assert_series_equal(result, expected)
result.iloc[0] = False
tm.assert_series_equal(ser, expected)
+
+ def test_round_dtype_object(self):
+ # GH#61206
+ ser = Series([0.2], dtype="object")
+ msg = "Expected numeric dtype, got object instead."
+ with pytest.raises(TypeError, match=msg):
+ ser.round()
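Illustrative only, assuming the stricter behaviour this branch tests (GH#61206): rounding an object-dtype Series raises instead of passing silently:

```python
import pandas as pd

ser = pd.Series([0.2], dtype="object")
try:
    ser.round()
except TypeError as err:
    print(err)  # "Expected numeric dtype, got object instead."
```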
diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py
index a7b1c56ff4df2..d7398ffe259cb 100644
--- a/pandas/tests/test_downstream.py
+++ b/pandas/tests/test_downstream.py
@@ -103,7 +103,7 @@ def test_xarray_cftimeindex_nearest():
cftime = pytest.importorskip("cftime")
xarray = pytest.importorskip("xarray")
- times = xarray.cftime_range("0001", periods=2)
+ times = xarray.date_range("0001", periods=2, use_cftime=True)
key = cftime.DatetimeGregorian(2000, 1, 1)
result = times.get_indexer([key], method="nearest")
expected = 1
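The downstream test switches from the deprecated `xarray.cftime_range` to `xarray.date_range(..., use_cftime=True)`; a minimal sketch of the replacement call, assuming a recent xarray with cftime installed:

```python
import xarray

# Equivalent index of cftime datetimes, built without the deprecated helper.
times = xarray.date_range("0001", periods=2, use_cftime=True)
print(times)  # CFTimeIndex with two entries starting at year 1
```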
diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py
index 616ae36c989be..b02fab70fb825 100644
--- a/pandas/tests/tools/test_to_datetime.py
+++ b/pandas/tests/tools/test_to_datetime.py
@@ -3514,6 +3514,54 @@ def test_to_datetime_mixed_not_necessarily_iso8601_coerce():
tm.assert_index_equal(result, DatetimeIndex(["2020-01-01 00:00:00", NaT]))
+def test_to_datetime_iso8601_utc_single_naive():
+ # GH#61389
+ result = to_datetime("2023-10-15T14:30:00", utc=True, format="ISO8601")
+ expected = Timestamp("2023-10-15 14:30:00+00:00")
+ assert result == expected
+
+
+def test_to_datetime_iso8601_utc_mixed_negative_offset():
+ # GH#61389
+ data = ["2023-10-15T10:30:00-12:00", "2023-10-15T14:30:00"]
+ result = to_datetime(data, utc=True, format="ISO8601")
+
+ expected = DatetimeIndex(
+ [Timestamp("2023-10-15 22:30:00+00:00"), Timestamp("2023-10-15 14:30:00+00:00")]
+ )
+ tm.assert_index_equal(result, expected)
+
+
+def test_to_datetime_iso8601_utc_mixed_positive_offset():
+ # GH#61389
+ data = ["2023-10-15T10:30:00+08:00", "2023-10-15T14:30:00"]
+ result = to_datetime(data, utc=True, format="ISO8601")
+
+ expected = DatetimeIndex(
+ [Timestamp("2023-10-15 02:30:00+00:00"), Timestamp("2023-10-15 14:30:00+00:00")]
+ )
+ tm.assert_index_equal(result, expected)
+
+
+def test_to_datetime_iso8601_utc_mixed_both_offsets():
+ # GH#61389
+ data = [
+ "2023-10-15T10:30:00+08:00",
+ "2023-10-15T12:30:00-05:00",
+ "2023-10-15T14:30:00",
+ ]
+ result = to_datetime(data, utc=True, format="ISO8601")
+
+ expected = DatetimeIndex(
+ [
+ Timestamp("2023-10-15 02:30:00+00:00"),
+ Timestamp("2023-10-15 17:30:00+00:00"),
+ Timestamp("2023-10-15 14:30:00+00:00"),
+ ]
+ )
+ tm.assert_index_equal(result, expected)
+
+
def test_unknown_tz_raises():
# GH#18702, GH#51476
dtstr = "2014 Jan 9 05:15 FAKE"
diff --git a/pyproject.toml b/pyproject.toml
index 3f7b6a672e1b0..b17a1eacfa717 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ requires = [
"meson-python>=0.13.1",
"meson>=1.2.1,<2",
"wheel",
- "Cython~=3.0.5", # Note: sync with setup.py, environment.yml and asv.conf.json
+ "Cython<4.0.0a0", # Note: sync with setup.py, environment.yml and asv.conf.json
# Force numpy higher than 2.0rc1, so that built wheels are compatible
# with both numpy 1 and 2
"numpy>=2.0.0rc1",
@@ -62,15 +62,14 @@ test = ['hypothesis>=6.84.0', 'pytest>=7.3.2', 'pytest-xdist>=3.4.0']
pyarrow = ['pyarrow>=10.0.1']
performance = ['bottleneck>=1.3.6', 'numba>=0.59.0', 'numexpr>=2.9.0']
computation = ['scipy>=1.12.0', 'xarray>=2024.1.1']
-fss = ['fsspec>=2024.2.0']
-aws = ['s3fs>=2024.2.0']
-gcp = ['gcsfs>=2024.2.0']
+fss = ['fsspec>=2023.12.2']
+aws = ['s3fs>=2023.12.2']
+gcp = ['gcsfs>=2023.12.2']
excel = ['odfpy>=1.4.1', 'openpyxl>=3.1.2', 'python-calamine>=0.1.7', 'pyxlsb>=1.0.10', 'xlrd>=2.0.1', 'xlsxwriter>=3.2.0']
parquet = ['pyarrow>=10.0.1']
feather = ['pyarrow>=10.0.1']
-hdf5 = [# blosc only available on conda (https://github.com/Blosc/python-blosc/issues/297)
- #'blosc>=1.20.1',
- 'tables>=3.8.0']
+iceberg = ['pyiceberg>=0.7.1']
+hdf5 = ['tables>=3.8.0']
spss = ['pyreadstat>=1.2.6']
postgresql = ['SQLAlchemy>=2.0.0', 'psycopg2>=2.9.6', 'adbc-driver-postgresql>=0.10.0']
mysql = ['SQLAlchemy>=2.0.0', 'pymysql>=1.1.0']
@@ -85,12 +84,10 @@ timezone = ['pytz>=2023.4']
all = ['adbc-driver-postgresql>=0.10.0',
'adbc-driver-sqlite>=0.8.0',
'beautifulsoup4>=4.12.3',
- # blosc only available on conda (https://github.com/Blosc/python-blosc/issues/297)
- #'blosc>=1.21.3',
'bottleneck>=1.3.6',
'fastparquet>=2024.2.0',
- 'fsspec>=2024.2.0',
- 'gcsfs>=2024.2.0',
+ 'fsspec>=2023.12.2',
+ 'gcsfs>=2023.12.2',
'html5lib>=1.1',
'hypothesis>=6.84.0',
'jinja2>=3.1.3',
@@ -102,6 +99,7 @@ all = ['adbc-driver-postgresql>=0.10.0',
'openpyxl>=3.1.2',
'psycopg2>=2.9.6',
'pyarrow>=10.0.1',
+ 'pyiceberg>=0.7.1',
'pymysql>=1.1.0',
'PyQt5>=5.15.9',
'pyreadstat>=1.2.6',
@@ -112,7 +110,7 @@ all = ['adbc-driver-postgresql>=0.10.0',
'pyxlsb>=1.0.10',
'qtpy>=2.3.0',
'scipy>=1.12.0',
- 's3fs>=2024.2.0',
+ 's3fs>=2023.12.2',
'SQLAlchemy>=2.0.0',
'tables>=3.8.0',
'tabulate>=0.9.0',
@@ -156,12 +154,12 @@ test-command = """
pd.test(extra_args=["-m not clipboard and not single_cpu and not slow and not network and not db", "-n 2", "--no-strict-data-files"]); \
pd.test(extra_args=["-m not clipboard and single_cpu and not slow and not network and not db", "--no-strict-data-files"]);' \
"""
-free-threaded-support = true
+enable = ["cpython-freethreading"]
before-build = "PACKAGE_DIR={package} bash {package}/scripts/cibw_before_build.sh"
[tool.cibuildwheel.windows]
environment = {}
-before-build = "pip install delvewheel && bash {package}/scripts/cibw_before_build_windows.sh"
+before-build = "pip install delvewheel"
test-command = """
set PANDAS_CI='1' && \
python -c "import pandas as pd; \
diff --git a/requirements-dev.txt b/requirements-dev.txt
index f16b905bcddfb..6515797bc3b9d 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -3,7 +3,7 @@
pip
versioneer[toml]
-cython~=3.0.5
+cython<4.0.0a0
meson[ninja]==1.2.1
meson-python==0.13.1
pytest>=7.3.2
@@ -16,13 +16,12 @@ coverage
python-dateutil
numpy<3
beautifulsoup4>=4.12.3
-blosc
bottleneck>=1.3.6
fastparquet>=2024.2.0
-fsspec>=2024.2.0
+fsspec>=2023.12.2
html5lib>=1.1
hypothesis>=6.84.0
-gcsfs>=2024.2.0
+gcsfs>=2023.12.2
ipython
pickleshare
jinja2>=3.1.3
@@ -34,17 +33,18 @@ openpyxl>=3.1.2
odfpy>=1.4.1
psycopg2-binary>=2.9.6
pyarrow>=10.0.1
+pyiceberg>=0.7.1
pymysql>=1.1.0
pyreadstat>=1.2.6
tables>=3.8.0
python-calamine>=0.1.7
pytz>=2023.4
pyxlsb>=1.0.10
-s3fs>=2024.2.0
+s3fs>=2023.12.2
scipy>=1.12.0
SQLAlchemy>=2.0.0
tabulate>=0.9.0
-xarray>=2024.1.1, <=2024.9.0
+xarray>=2024.1.1
xlrd>=2.0.1
xlsxwriter>=3.2.0
zstandard>=0.22.0
@@ -58,8 +58,6 @@ mypy==1.13.0
tokenize-rt
pre-commit>=4.2.0
gitpython
-gitdb
-google-auth
natsort
numpydoc
pydata-sphinx-theme==0.16
diff --git a/scripts/cibw_before_build.sh b/scripts/cibw_before_build.sh
index d326dd3637314..274848972bd7e 100644
--- a/scripts/cibw_before_build.sh
+++ b/scripts/cibw_before_build.sh
@@ -3,11 +3,3 @@
for file in $PACKAGE_DIR/LICENSES/*; do
cat $file >> $PACKAGE_DIR/LICENSE
done
-
-# TODO: Delete when there's a PyPI Cython release that supports free-threaded Python 3.13.
-FREE_THREADED_BUILD="$(python -c"import sysconfig; print(bool(sysconfig.get_config_var('Py_GIL_DISABLED')))")"
-if [[ $FREE_THREADED_BUILD == "True" ]]; then
- python -m pip install -U pip
- python -m pip install -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple cython
- python -m pip install numpy ninja meson-python versioneer[toml]
-fi
diff --git a/scripts/cibw_before_build_windows.sh b/scripts/cibw_before_build_windows.sh
deleted file mode 100644
index 8f001db566a1d..0000000000000
--- a/scripts/cibw_before_build_windows.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-# Add 3rd party licenses, like numpy does
-for file in $PACKAGE_DIR/LICENSES/*; do
- cat $file >> $PACKAGE_DIR/LICENSE
-done
-
-# TODO: Delete when there's a PyPI Cython release that supports free-threaded Python 3.13
-FREE_THREADED_BUILD="$(python -c"import sysconfig; print(bool(sysconfig.get_config_var('Py_GIL_DISABLED')))")"
-if [[ $FREE_THREADED_BUILD == "True" ]]; then
- python -m pip install -U pip
- python -m pip install -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple cython
- python -m pip install ninja meson-python versioneer[toml] numpy
-fi
diff --git a/scripts/run_stubtest.py b/scripts/run_stubtest.py
index df88c61061f12..e87a7d53f4ff3 100644
--- a/scripts/run_stubtest.py
+++ b/scripts/run_stubtest.py
@@ -85,9 +85,11 @@
]
# create allowlist
- with tempfile.NamedTemporaryFile(mode="w+t") as allow:
- allow.write("\n".join(_ALLOWLIST))
- allow.flush()
+ with tempfile.TemporaryDirectory() as td:
+ allow = os.path.join(td, "test")
+ with open(allow, "w+t") as allow:
+ allow.write("\n".join(_ALLOWLIST))
+ allow.flush()
args = pyi_modules + [
"--ignore-missing-stub",
diff --git a/scripts/tests/data/deps_expected_random.yaml b/scripts/tests/data/deps_expected_random.yaml
index 8e85c91ead24e..d4ecd9f64a68d 100644
--- a/scripts/tests/data/deps_expected_random.yaml
+++ b/scripts/tests/data/deps_expected_random.yaml
@@ -23,7 +23,6 @@ dependencies:
# optional dependencies
- beautifulsoup4>=5.9.3
- - blosc
- bottleneck>=1.3.2
- fastparquet>=0.6.3
- fsspec>=2021.07.0
diff --git a/scripts/tests/data/deps_minimum.toml b/scripts/tests/data/deps_minimum.toml
index f789d5998a30c..21c269f573b3d 100644
--- a/scripts/tests/data/deps_minimum.toml
+++ b/scripts/tests/data/deps_minimum.toml
@@ -63,9 +63,7 @@ gcp = ['gcsfs>=2021.07.0']
excel = ['odfpy>=1.4.1', 'openpyxl>=3.0.7', 'python-calamine>=0.1.7', 'pyxlsb>=1.0.8', 'xlrd>=2.0.1', 'xlsxwriter>=1.4.3']
parquet = ['pyarrow>=7.0.0']
feather = ['pyarrow>=7.0.0']
-hdf5 = [# blosc only available on conda (https://github.com/Blosc/python-blosc/issues/297)
- #'blosc>=1.20.1',
- 'tables>=3.6.1']
+hdf5 = ['tables>=3.6.1']
spss = ['pyreadstat>=1.1.2']
postgresql = ['SQLAlchemy>=1.4.16', 'psycopg2>=2.8.6']
mysql = ['SQLAlchemy>=1.4.16', 'pymysql>=1.1.0']
@@ -77,8 +75,6 @@ output_formatting = ['jinja2>=3.0.0', 'tabulate>=0.8.9']
clipboard = ['PyQt5>=5.15.1', 'qtpy>=2.3.0']
compression = ['zstandard>=0.15.2']
all = ['beautifulsoup4>=5.9.3',
- # blosc only available on conda (https://github.com/Blosc/python-blosc/issues/297)
- #'blosc>=1.21.0',
'bottleneck>=1.3.2',
'fastparquet>=0.6.3',
'fsspec>=2021.07.0',
diff --git a/scripts/tests/data/deps_unmodified_random.yaml b/scripts/tests/data/deps_unmodified_random.yaml
index 5b47d45973161..4b0f4ffb51b92 100644
--- a/scripts/tests/data/deps_unmodified_random.yaml
+++ b/scripts/tests/data/deps_unmodified_random.yaml
@@ -23,7 +23,6 @@ dependencies:
# optional dependencies
- beautifulsoup4
- - blosc
- bottleneck>=1.3.2
- fastparquet>=0.6.3
- fsspec>=2021.07.0
diff --git a/scripts/validate_min_versions_in_sync.py b/scripts/validate_min_versions_in_sync.py
index 1001b00450354..7908aaef3d890 100755
--- a/scripts/validate_min_versions_in_sync.py
+++ b/scripts/validate_min_versions_in_sync.py
@@ -36,7 +36,7 @@
SETUP_PATH = pathlib.Path("pyproject.toml").resolve()
YAML_PATH = pathlib.Path("ci/deps")
ENV_PATH = pathlib.Path("environment.yml")
-EXCLUDE_DEPS = {"tzdata", "blosc", "pyqt", "pyqt5"}
+EXCLUDE_DEPS = {"tzdata", "pyqt", "pyqt5"}
EXCLUSION_LIST = frozenset(["python=3.8[build=*_pypy]"])
# pandas package is not available
# in pre-commit environment
diff --git a/web/pandas/about/roadmap.md b/web/pandas/about/roadmap.md
index aba95ec2c03fc..278143c01e7dc 100644
--- a/web/pandas/about/roadmap.md
+++ b/web/pandas/about/roadmap.md
@@ -58,27 +58,6 @@ library, making their behavior more consistent with the handling of
NumPy arrays. We'll do this by cleaning up pandas' internals and
adding new methods to the extension array interface.
-### String data type
-
-Currently, pandas stores text data in an `object` -dtype NumPy array.
-The current implementation has two primary drawbacks: First, `object`
--dtype is not specific to strings: any Python object can be stored in an
-`object` -dtype array, not just strings. Second: this is not efficient.
-The NumPy memory model isn't especially well-suited to variable width
-text data.
-
-To solve the first issue, we propose a new extension type for string
-data. This will initially be opt-in, with users explicitly requesting
-`dtype="string"`. The array backing this string dtype may initially be
-the current implementation: an `object` -dtype NumPy array of Python
-strings.
-
-To solve the second issue (performance), we'll explore alternative
-in-memory array libraries (for example, Apache Arrow). As part of the
-work, we may need to implement certain operations expected by pandas
-users (for example the algorithm used in, `Series.str.upper`). That work
-may be done outside of pandas.
-
### Apache Arrow interoperability
[Apache Arrow](https://arrow.apache.org) is a cross-language development
diff --git a/web/pandas/config.yml b/web/pandas/config.yml
index cb5447591dab6..bdc76ec22a310 100644
--- a/web/pandas/config.yml
+++ b/web/pandas/config.yml
@@ -82,15 +82,11 @@ maintainers:
- simonjayhawkins
- topper-123
- alimcmaster1
- - bashtage
- Dr-Irv
- rhshadrach
- phofl
- attack68
- fangchenli
- - lithomas1
- - lukemanley
- - noatamir
inactive:
- lodagro
- jseabold
@@ -108,6 +104,10 @@ maintainers:
- mzeitlin11
- twoertwein
- MarcoGorelli
+ - bashtage
+ - noatamir
+ - lithomas1
+ - lukemanley
workgroups:
coc:
name: Code of Conduct
diff --git a/web/pandas/pdeps/0001-purpose-and-guidelines.md b/web/pandas/pdeps/0001-purpose-and-guidelines.md
index 7f5f0326eba6c..ed084a730ecdc 100644
--- a/web/pandas/pdeps/0001-purpose-and-guidelines.md
+++ b/web/pandas/pdeps/0001-purpose-and-guidelines.md
@@ -8,6 +8,8 @@
[Noa Tamir](https://github.com/noatamir)
- Revision: 3
+[TOC]
+
## PDEP definition, purpose and scope
A PDEP (pandas enhancement proposal) is a proposal for a **major** change in
diff --git a/web/pandas/pdeps/0004-consistent-to-datetime-parsing.md b/web/pandas/pdeps/0004-consistent-to-datetime-parsing.md
index 68c6dfa26d1f1..b3f277326319e 100644
--- a/web/pandas/pdeps/0004-consistent-to-datetime-parsing.md
+++ b/web/pandas/pdeps/0004-consistent-to-datetime-parsing.md
@@ -6,6 +6,8 @@
- Author: [Marco Gorelli](https://github.com/MarcoGorelli)
- Revision: 2
+[TOC]
+
## Abstract
The suggestion is that:
diff --git a/web/pandas/pdeps/0005-no-default-index-mode.md b/web/pandas/pdeps/0005-no-default-index-mode.md
index d543a4718e896..81222b51817d5 100644
--- a/web/pandas/pdeps/0005-no-default-index-mode.md
+++ b/web/pandas/pdeps/0005-no-default-index-mode.md
@@ -6,6 +6,8 @@
- Author: [Marco Gorelli](https://github.com/MarcoGorelli)
- Revision: 2
+[TOC]
+
## Abstract
The suggestion is to add a ``NoRowIndex`` class. Internally, it would act a bit like
diff --git a/web/pandas/pdeps/0006-ban-upcasting.md b/web/pandas/pdeps/0006-ban-upcasting.md
index ae5872186bf23..59f2cc35bf6ee 100644
--- a/web/pandas/pdeps/0006-ban-upcasting.md
+++ b/web/pandas/pdeps/0006-ban-upcasting.md
@@ -6,6 +6,8 @@
- Author: [Marco Gorelli](https://github.com/MarcoGorelli) ([original issue](https://github.com/pandas-dev/pandas/issues/39584) by [Joris Van den Bossche](https://github.com/jorisvandenbossche))
- Revision: 1
+[TOC]
+
## Abstract
The suggestion is that setitem-like operations would
diff --git a/web/pandas/pdeps/0007-copy-on-write.md b/web/pandas/pdeps/0007-copy-on-write.md
index f5adb6a571120..5e35cf01de977 100644
--- a/web/pandas/pdeps/0007-copy-on-write.md
+++ b/web/pandas/pdeps/0007-copy-on-write.md
@@ -6,6 +6,8 @@
- Author: [Joris Van den Bossche](https://github.com/jorisvandenbossche)
- Revision: 1
+[TOC]
+
## Abstract
Short summary of the proposal:
@@ -525,7 +527,7 @@ following cases:
* Selecting a single column (as a Series) out of a DataFrame is always a view
(``df['a']``)
* Slicing columns from a DataFrame creating a subset DataFrame (``df[['a':'b']]`` or
- ``df.loc[:, 'a': 'b']``) is a view _if_ the the original DataFrame consists of a
+ ``df.loc[:, 'a': 'b']``) is a view _if_ the original DataFrame consists of a
single block (single dtype, consolidated) and _if_ you are slicing (so not a list
selection). In all other cases, getting a subset is always a copy.
* Selecting rows _can_ return a view, when the row indexer is a `slice` object.
diff --git a/web/pandas/pdeps/0009-io-extensions.md b/web/pandas/pdeps/0009-io-extensions.md
index aeda990cea7df..baa661957e951 100644
--- a/web/pandas/pdeps/0009-io-extensions.md
+++ b/web/pandas/pdeps/0009-io-extensions.md
@@ -7,6 +7,8 @@
- Author: [Marc Garcia](https://github.com/datapythonista)
- Revision: 1
+[TOC]
+
## PDEP Summary
This document proposes that third-party projects implementing I/O or memory
diff --git a/web/pandas/pdeps/0010-required-pyarrow-dependency.md b/web/pandas/pdeps/0010-required-pyarrow-dependency.md
index 0c3bf3c776988..60ed8c4b910eb 100644
--- a/web/pandas/pdeps/0010-required-pyarrow-dependency.md
+++ b/web/pandas/pdeps/0010-required-pyarrow-dependency.md
@@ -8,6 +8,8 @@
[Patrick Hoefler](https://github.com/phofl)
- Revision: 1
+[TOC]
+
## Abstract
This PDEP proposes that:
diff --git a/web/pandas/pdeps/0012-compact-and-reversible-JSON-interface.md b/web/pandas/pdeps/0012-compact-and-reversible-JSON-interface.md
index 71f669825f979..1c513c3bb517b 100644
--- a/web/pandas/pdeps/0012-compact-and-reversible-JSON-interface.md
+++ b/web/pandas/pdeps/0012-compact-and-reversible-JSON-interface.md
@@ -8,36 +8,7 @@
- Author: [Philippe THOMY](https://github.com/loco-philippe)
- Revision: 3
-##### Summary
-
-- [Abstract](./0012-compact-and-reversible-JSON-interface.md/#Abstract)
- - [Problem description](./0012-compact-and-reversible-JSON-interface.md/#Problem-description)
- - [Feature Description](./0012-compact-and-reversible-JSON-interface.md/#Feature-Description)
-- [Scope](./0012-compact-and-reversible-JSON-interface.md/#Scope)
-- [Motivation](./0012-compact-and-reversible-JSON-interface.md/#Motivation)
- - [Why is it important to have a compact and reversible JSON interface ?](./0012-compact-and-reversible-JSON-interface.md/#Why-is-it-important-to-have-a-compact-and-reversible-JSON-interface-?)
- - [Is it relevant to take an extended type into account ?](./0012-compact-and-reversible-JSON-interface.md/#Is-it-relevant-to-take-an-extended-type-into-account-?)
- - [Is this only useful for pandas ?](./0012-compact-and-reversible-JSON-interface.md/#Is-this-only-useful-for-pandas-?)
-- [Description](./0012-compact-and-reversible-JSON-interface.md/#Description)
- - [Data typing](./0012-compact-and-reversible-JSON-interface.md/#Data-typing)
- - [Correspondence between TableSchema and pandas](./panda0012-compact-and-reversible-JSON-interfaces_PDEP.md/#Correspondence-between-TableSchema-and-pandas)
- - [JSON format](./0012-compact-and-reversible-JSON-interface.md/#JSON-format)
- - [Conversion](./0012-compact-and-reversible-JSON-interface.md/#Conversion)
-- [Usage and impact](./0012-compact-and-reversible-JSON-interface.md/#Usage-and-impact)
- - [Usage](./0012-compact-and-reversible-JSON-interface.md/#Usage)
- - [Compatibility](./0012-compact-and-reversible-JSON-interface.md/#Compatibility)
- - [Impacts on the pandas framework](./0012-compact-and-reversible-JSON-interface.md/#Impacts-on-the-pandas-framework)
- - [Risk to do / risk not to do](./0012-compact-and-reversible-JSON-interface.md/#Risk-to-do-/-risk-not-to-do)
-- [Implementation](./0012-compact-and-reversible-JSON-interface.md/#Implementation)
- - [Modules](./0012-compact-and-reversible-JSON-interface.md/#Modules)
- - [Implementation options](./0012-compact-and-reversible-JSON-interface.md/#Implementation-options)
-- [F.A.Q.](./0012-compact-and-reversible-JSON-interface.md/#F.A.Q.)
-- [Synthesis](./0012-compact-and-reversible-JSON-interface.md/Synthesis)
-- [Core team decision](./0012-compact-and-reversible-JSON-interface.md/#Core-team-decision)
-- [Timeline](./0012-compact-and-reversible-JSON-interface.md/#Timeline)
-- [PDEP history](./0012-compact-and-reversible-JSON-interface.md/#PDEP-history)
-
--------------------------
+[TOC]
## Abstract
diff --git a/web/pandas/pdeps/0014-string-dtype.md b/web/pandas/pdeps/0014-string-dtype.md
index 5b74f71216454..35b5725341534 100644
--- a/web/pandas/pdeps/0014-string-dtype.md
+++ b/web/pandas/pdeps/0014-string-dtype.md
@@ -220,8 +220,8 @@ in pandas 2.3 and removed in pandas 3.0.
The `storage` keyword of `StringDtype` is kept to disambiguate the underlying
storage of the string data (using pyarrow or python objects), but an additional
-`na_value` is introduced to disambiguate the the variants using NA semantics
-and NaN semantics.
+`na_value` is introduced to disambiguate the variants using NA semantics and
+NaN semantics.
Overview of the different ways to specify a dtype and the resulting concrete
dtype of the data:
diff --git a/web/pandas/static/css/pandas.css b/web/pandas/static/css/pandas.css
index ec9a4bd502dd1..3bc4d81f21b64 100644
--- a/web/pandas/static/css/pandas.css
+++ b/web/pandas/static/css/pandas.css
@@ -1,3 +1,6 @@
+html {
+ scroll-padding-top: 5rem;
+}
body {
padding-top: 5em;
color: #444;
@@ -103,3 +106,22 @@ blockquote {
color: #787878;
font-size: 18px;
}
+.toc {
+ background: #f9f9f9;
+ padding: 1em;
+ border: 0.1em solid darkgrey;
+ border-radius: 0.4em;
+ display: inline-block;
+ margin: 1em 0;
+}
+.toc .toctitle {
+ font-weight: bold;
+ padding-bottom: 1em;
+}
+a.headerlink {
+ opacity: 0;
+}
+h2:hover a.headerlink, h3:hover a.headerlink {
+ opacity: 1;
+ transition: opacity 0.5s;
+}
diff --git a/web/pandas/versions.json b/web/pandas/versions.json
index 2d2599ae8585b..1274d2125bb2b 100644
--- a/web/pandas/versions.json
+++ b/web/pandas/versions.json
@@ -5,11 +5,16 @@
"url": "https://pandas.pydata.org/docs/dev/"
},
{
- "name": "2.2 (stable)",
- "version": "2.2",
+ "name": "2.3 (stable)",
+ "version": "2.3",
"url": "https://pandas.pydata.org/docs/",
"preferred": true
},
+ {
+ "name": "2.2",
+ "version": "2.2",
+ "url": "https://pandas.pydata.org/pandas-docs/version/2.2/"
+ },
{
"name": "2.1",
"version": "2.1",
diff --git a/web/pandas_web.py b/web/pandas_web.py
index b3872b829c73a..34ac3743148ba 100755
--- a/web/pandas_web.py
+++ b/web/pandas_web.py
@@ -441,6 +441,16 @@ def main(
For ``.md`` and ``.html`` files, render them with the context
before copying them. ``.md`` files are transformed to HTML.
"""
+ # Sanity check: validate that versions.json is valid JSON
+ versions_path = os.path.join(source_path, "versions.json")
+ with open(versions_path, encoding="utf-8") as f:
+ try:
+ json.load(f)
+ except json.JSONDecodeError as e:
+ raise RuntimeError(
+ f"Invalid versions.json: {e}. Ensure it is valid JSON."
+ ) from e
+
config_fname = os.path.join(source_path, "config.yml")
shutil.rmtree(target_path, ignore_errors=True)
@@ -466,9 +476,29 @@ def main(
with open(os.path.join(source_path, fname), encoding="utf-8") as f:
content = f.read()
if extension == ".md":
- body = markdown.markdown(
- content, extensions=context["main"]["markdown_extensions"]
- )
+ if "pdeps/" in fname:
+ from markdown.extensions.toc import TocExtension
+
+ body = markdown.markdown(
+ content,
+ extensions=[
+ # Ignore the title of the PDEP in the table of contents
+ TocExtension(
+ title="Table of Contents",
+ toc_depth="2-3",
+ permalink=" #",
+ ),
+ "tables",
+ "fenced_code",
+ "meta",
+ "footnotes",
+ "codehilite",
+ ],
+ )
+ else:
+ body = markdown.markdown(
+ content, extensions=context["main"]["markdown_extensions"]
+ )
# Apply Bootstrap's table formatting manually
# Python-Markdown doesn't let us config table attributes by hand
body = body.replace("