40
40
)
41
41
from pandas ._libs .lib import is_string_array
42
42
from pandas ._libs .tslibs import timezones
43
+ from pandas .compat import HAS_PYARROW
43
44
from pandas .compat ._optional import import_optional_dependency
44
45
from pandas .compat .pickle_compat import patch_pickle
45
46
from pandas .errors import (
@@ -391,6 +392,13 @@ def read_hdf(
391
392
DataFrame.to_hdf : Write an HDF file from a DataFrame.
392
393
HDFStore : Low-level access to HDF files.
393
394
395
+ Notes
396
+ -----
397
+ When ``errors="surrogatepass"``, ``pd.options.future.infer_string`` is true,
398
+ and PyArrow is installed, if a UTF-16 surrogate is encountered when decoding
399
+ to UTF-8, the resulting dtype will be
400
+ ``pd.StringDtype(storage="python", na_value=np.nan)``.
401
+
394
402
Examples
395
403
--------
396
404
>>> df = pd.DataFrame([[1, 1.0, 'a']], columns=['x', 'y', 'z']) # doctest: +SKIP
@@ -2182,6 +2190,20 @@ def convert(
2182
2190
# making an Index instance could throw a number of different errors
2183
2191
try :
2184
2192
new_pd_index = factory (values , ** kwargs )
2193
+ except UnicodeEncodeError as err :
2194
+ if (
2195
+ errors == "surrogatepass"
2196
+ and get_option ("future.infer_string" )
2197
+ and str (err ).endswith ("surrogates not allowed" )
2198
+ and HAS_PYARROW
2199
+ ):
2200
+ new_pd_index = factory (
2201
+ values ,
2202
+ dtype = StringDtype (storage = "python" , na_value = np .nan ),
2203
+ ** kwargs ,
2204
+ )
2205
+ else :
2206
+ raise
2185
2207
except ValueError :
2186
2208
# if the output freq is different than what we recorded,
2187
2209
# it should be None (see also 'doc example part 2')
@@ -3097,12 +3119,29 @@ def read_index_node(
3097
3119
** kwargs ,
3098
3120
)
3099
3121
else :
3100
- index = factory (
3101
- _unconvert_index (
3102
- data , kind , encoding = self .encoding , errors = self .errors
3103
- ),
3104
- ** kwargs ,
3105
- )
3122
+ try :
3123
+ index = factory (
3124
+ _unconvert_index (
3125
+ data , kind , encoding = self .encoding , errors = self .errors
3126
+ ),
3127
+ ** kwargs ,
3128
+ )
3129
+ except UnicodeEncodeError as err :
3130
+ if (
3131
+ self .errors == "surrogatepass"
3132
+ and get_option ("future.infer_string" )
3133
+ and str (err ).endswith ("surrogates not allowed" )
3134
+ and HAS_PYARROW
3135
+ ):
3136
+ index = factory (
3137
+ _unconvert_index (
3138
+ data , kind , encoding = self .encoding , errors = self .errors
3139
+ ),
3140
+ dtype = StringDtype (storage = "python" , na_value = np .nan ),
3141
+ ** kwargs ,
3142
+ )
3143
+ else :
3144
+ raise
3106
3145
3107
3146
index .name = name
3108
3147
@@ -3236,13 +3275,24 @@ def read(
3236
3275
self .validate_read (columns , where )
3237
3276
index = self .read_index ("index" , start = start , stop = stop )
3238
3277
values = self .read_array ("values" , start = start , stop = stop )
3239
- result = Series (values , index = index , name = self .name , copy = False )
3240
- if (
3241
- using_string_dtype ()
3242
- and isinstance (values , np .ndarray )
3243
- and is_string_array (values , skipna = True )
3244
- ):
3245
- result = result .astype (StringDtype (na_value = np .nan ))
3278
+ try :
3279
+ result = Series (values , index = index , name = self .name , copy = False )
3280
+ except UnicodeEncodeError as err :
3281
+ if (
3282
+ self .errors == "surrogatepass"
3283
+ and get_option ("future.infer_string" )
3284
+ and str (err ).endswith ("surrogates not allowed" )
3285
+ and HAS_PYARROW
3286
+ ):
3287
+ result = Series (
3288
+ values ,
3289
+ index = index ,
3290
+ name = self .name ,
3291
+ copy = False ,
3292
+ dtype = StringDtype (storage = "python" , na_value = np .nan ),
3293
+ )
3294
+ else :
3295
+ raise
3246
3296
return result
3247
3297
3248
3298
def write (self , obj , ** kwargs ) -> None :
@@ -4704,7 +4754,24 @@ def read(
4704
4754
values = values .reshape ((1 , values .shape [0 ]))
4705
4755
4706
4756
if isinstance (values , np .ndarray ):
4707
- df = DataFrame (values .T , columns = cols_ , index = index_ , copy = False )
4757
+ try :
4758
+ df = DataFrame (values .T , columns = cols_ , index = index_ , copy = False )
4759
+ except UnicodeEncodeError as err :
4760
+ if (
4761
+ self .errors == "surrogatepass"
4762
+ and get_option ("future.infer_string" )
4763
+ and str (err ).endswith ("surrogates not allowed" )
4764
+ and HAS_PYARROW
4765
+ ):
4766
+ df = DataFrame (
4767
+ values .T ,
4768
+ columns = cols_ ,
4769
+ index = index_ ,
4770
+ copy = False ,
4771
+ dtype = StringDtype (storage = "python" , na_value = np .nan ),
4772
+ )
4773
+ else :
4774
+ raise
4708
4775
elif isinstance (values , Index ):
4709
4776
df = DataFrame (values , columns = cols_ , index = index_ )
4710
4777
else :
@@ -4714,23 +4781,10 @@ def read(
4714
4781
assert (df .dtypes == values .dtype ).all (), (df .dtypes , values .dtype )
4715
4782
4716
4783
# If str / string dtype is stored in meta, use that.
4717
- converted = False
4718
4784
for column in cols_ :
4719
4785
dtype = getattr (self .table .attrs , f"{ column } _meta" , None )
4720
4786
if dtype in ["str" , "string" ]:
4721
4787
df [column ] = df [column ].astype (dtype )
4722
- converted = True
4723
- # Otherwise try inference.
4724
- if (
4725
- not converted
4726
- and using_string_dtype ()
4727
- and isinstance (values , np .ndarray )
4728
- and is_string_array (
4729
- values ,
4730
- skipna = True ,
4731
- )
4732
- ):
4733
- df = df .astype (StringDtype (na_value = np .nan ))
4734
4788
frames .append (df )
4735
4789
4736
4790
if len (frames ) == 1 :
@@ -5194,7 +5248,7 @@ def _convert_string_array(data: np.ndarray, encoding: str, errors: str) -> np.nd
5194
5248
# encode if needed
5195
5249
if len (data ):
5196
5250
data = (
5197
- Series (data .ravel (), copy = False )
5251
+ Series (data .ravel (), copy = False , dtype = "object" )
5198
5252
.str .encode (encoding , errors )
5199
5253
._values .reshape (data .shape )
5200
5254
)
@@ -5234,7 +5288,9 @@ def _unconvert_string_array(
5234
5288
dtype = f"U{ itemsize } "
5235
5289
5236
5290
if isinstance (data [0 ], bytes ):
5237
- ser = Series (data , copy = False ).str .decode (encoding , errors = errors )
5291
+ ser = Series (data , copy = False ).str .decode (
5292
+ encoding , errors = errors , dtype = "object"
5293
+ )
5238
5294
data = ser .to_numpy ()
5239
5295
data .flags .writeable = True
5240
5296
else :
0 commit comments