Skip to content

Commit 47645fe

Browse files
[backport 2.3.x] ENH(string dtype): fallback for HDF5 with UTF-8 surrogates (pandas-dev#60993) (pandas-dev#61639)
Co-authored-by: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com>
1 parent 68cdc1d commit 47645fe

File tree

2 files changed

+96
-40
lines changed

2 files changed

+96
-40
lines changed

pandas/io/pytables.py

Lines changed: 85 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
)
4141
from pandas._libs.lib import is_string_array
4242
from pandas._libs.tslibs import timezones
43+
from pandas.compat import HAS_PYARROW
4344
from pandas.compat._optional import import_optional_dependency
4445
from pandas.compat.pickle_compat import patch_pickle
4546
from pandas.errors import (
@@ -391,6 +392,13 @@ def read_hdf(
391392
DataFrame.to_hdf : Write a HDF file from a DataFrame.
392393
HDFStore : Low-level access to HDF files.
393394
395+
Notes
396+
-----
397+
When ``errors="surrogatepass"``, ``pd.options.future.infer_string`` is true,
398+
and PyArrow is installed, if a UTF-16 surrogate is encountered when decoding
399+
to UTF-8, the resulting dtype will be
400+
``pd.StringDtype(storage="python", na_value=np.nan)``.
401+
394402
Examples
395403
--------
396404
>>> df = pd.DataFrame([[1, 1.0, 'a']], columns=['x', 'y', 'z']) # doctest: +SKIP
@@ -2182,6 +2190,20 @@ def convert(
21822190
# making an Index instance could throw a number of different errors
21832191
try:
21842192
new_pd_index = factory(values, **kwargs)
2193+
except UnicodeEncodeError as err:
2194+
if (
2195+
errors == "surrogatepass"
2196+
and get_option("future.infer_string")
2197+
and str(err).endswith("surrogates not allowed")
2198+
and HAS_PYARROW
2199+
):
2200+
new_pd_index = factory(
2201+
values,
2202+
dtype=StringDtype(storage="python", na_value=np.nan),
2203+
**kwargs,
2204+
)
2205+
else:
2206+
raise
21852207
except ValueError:
21862208
# if the output freq is different than what we recorded,
21872209
# it should be None (see also 'doc example part 2')
@@ -3097,12 +3119,29 @@ def read_index_node(
30973119
**kwargs,
30983120
)
30993121
else:
3100-
index = factory(
3101-
_unconvert_index(
3102-
data, kind, encoding=self.encoding, errors=self.errors
3103-
),
3104-
**kwargs,
3105-
)
3122+
try:
3123+
index = factory(
3124+
_unconvert_index(
3125+
data, kind, encoding=self.encoding, errors=self.errors
3126+
),
3127+
**kwargs,
3128+
)
3129+
except UnicodeEncodeError as err:
3130+
if (
3131+
self.errors == "surrogatepass"
3132+
and get_option("future.infer_string")
3133+
and str(err).endswith("surrogates not allowed")
3134+
and HAS_PYARROW
3135+
):
3136+
index = factory(
3137+
_unconvert_index(
3138+
data, kind, encoding=self.encoding, errors=self.errors
3139+
),
3140+
dtype=StringDtype(storage="python", na_value=np.nan),
3141+
**kwargs,
3142+
)
3143+
else:
3144+
raise
31063145

31073146
index.name = name
31083147

@@ -3236,13 +3275,24 @@ def read(
32363275
self.validate_read(columns, where)
32373276
index = self.read_index("index", start=start, stop=stop)
32383277
values = self.read_array("values", start=start, stop=stop)
3239-
result = Series(values, index=index, name=self.name, copy=False)
3240-
if (
3241-
using_string_dtype()
3242-
and isinstance(values, np.ndarray)
3243-
and is_string_array(values, skipna=True)
3244-
):
3245-
result = result.astype(StringDtype(na_value=np.nan))
3278+
try:
3279+
result = Series(values, index=index, name=self.name, copy=False)
3280+
except UnicodeEncodeError as err:
3281+
if (
3282+
self.errors == "surrogatepass"
3283+
and get_option("future.infer_string")
3284+
and str(err).endswith("surrogates not allowed")
3285+
and HAS_PYARROW
3286+
):
3287+
result = Series(
3288+
values,
3289+
index=index,
3290+
name=self.name,
3291+
copy=False,
3292+
dtype=StringDtype(storage="python", na_value=np.nan),
3293+
)
3294+
else:
3295+
raise
32463296
return result
32473297

32483298
def write(self, obj, **kwargs) -> None:
@@ -4704,7 +4754,24 @@ def read(
47044754
values = values.reshape((1, values.shape[0]))
47054755

47064756
if isinstance(values, np.ndarray):
4707-
df = DataFrame(values.T, columns=cols_, index=index_, copy=False)
4757+
try:
4758+
df = DataFrame(values.T, columns=cols_, index=index_, copy=False)
4759+
except UnicodeEncodeError as err:
4760+
if (
4761+
self.errors == "surrogatepass"
4762+
and get_option("future.infer_string")
4763+
and str(err).endswith("surrogates not allowed")
4764+
and HAS_PYARROW
4765+
):
4766+
df = DataFrame(
4767+
values.T,
4768+
columns=cols_,
4769+
index=index_,
4770+
copy=False,
4771+
dtype=StringDtype(storage="python", na_value=np.nan),
4772+
)
4773+
else:
4774+
raise
47084775
elif isinstance(values, Index):
47094776
df = DataFrame(values, columns=cols_, index=index_)
47104777
else:
@@ -4714,23 +4781,10 @@ def read(
47144781
assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype)
47154782

47164783
# If str / string dtype is stored in meta, use that.
4717-
converted = False
47184784
for column in cols_:
47194785
dtype = getattr(self.table.attrs, f"{column}_meta", None)
47204786
if dtype in ["str", "string"]:
47214787
df[column] = df[column].astype(dtype)
4722-
converted = True
4723-
# Otherwise try inference.
4724-
if (
4725-
not converted
4726-
and using_string_dtype()
4727-
and isinstance(values, np.ndarray)
4728-
and is_string_array(
4729-
values,
4730-
skipna=True,
4731-
)
4732-
):
4733-
df = df.astype(StringDtype(na_value=np.nan))
47344788
frames.append(df)
47354789

47364790
if len(frames) == 1:
@@ -5194,7 +5248,7 @@ def _convert_string_array(data: np.ndarray, encoding: str, errors: str) -> np.nd
51945248
# encode if needed
51955249
if len(data):
51965250
data = (
5197-
Series(data.ravel(), copy=False)
5251+
Series(data.ravel(), copy=False, dtype="object")
51985252
.str.encode(encoding, errors)
51995253
._values.reshape(data.shape)
52005254
)
@@ -5234,7 +5288,9 @@ def _unconvert_string_array(
52345288
dtype = f"U{itemsize}"
52355289

52365290
if isinstance(data[0], bytes):
5237-
ser = Series(data, copy=False).str.decode(encoding, errors=errors)
5291+
ser = Series(data, copy=False).str.decode(
5292+
encoding, errors=errors, dtype="object"
5293+
)
52385294
data = ser.to_numpy()
52395295
data.flags.writeable = True
52405296
else:

pandas/tests/io/pytables/test_store.py

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,6 @@
77
import numpy as np
88
import pytest
99

10-
from pandas._config import using_string_dtype
11-
12-
from pandas.compat import HAS_PYARROW
13-
1410
import pandas as pd
1511
from pandas import (
1612
DataFrame,
@@ -398,20 +394,24 @@ def test_to_hdf_with_min_itemsize(tmp_path, setup_path):
398394
tm.assert_series_equal(read_hdf(path, "ss4"), concat([df["B"], df2["B"]]))
399395

400396

401-
@pytest.mark.xfail(
402-
using_string_dtype() and HAS_PYARROW,
403-
reason="TODO(infer_string): can't encode '\ud800': surrogates not allowed",
404-
)
405397
@pytest.mark.parametrize("format", ["fixed", "table"])
406-
def test_to_hdf_errors(tmp_path, format, setup_path):
398+
def test_to_hdf_errors(tmp_path, format, setup_path, using_infer_string):
407399
data = ["\ud800foo"]
408-
ser = Series(data, index=Index(data))
400+
ser = Series(data, index=Index(data, dtype="object"), dtype="object")
409401
path = tmp_path / setup_path
410402
# GH 20835
411403
ser.to_hdf(path, key="table", format=format, errors="surrogatepass")
412404

413405
result = read_hdf(path, "table", errors="surrogatepass")
414-
tm.assert_series_equal(result, ser)
406+
407+
if using_infer_string:
408+
# https://github.com/pandas-dev/pandas/pull/60993
409+
# Surrogates fallback to python storage.
410+
dtype = pd.StringDtype(storage="python", na_value=np.nan)
411+
else:
412+
dtype = "object"
413+
expected = Series(data, index=Index(data, dtype=dtype), dtype=dtype)
414+
tm.assert_series_equal(result, expected)
415415

416416

417417
def test_create_table_index(setup_path):

0 commit comments

Comments
 (0)