diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 03a386708323d..61951e25bb35f 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -773,6 +773,7 @@ I/O - Bug in :meth:`DataFrame.to_excel` when writing empty :class:`DataFrame` with :class:`MultiIndex` on both axes (:issue:`57696`) - Bug in :meth:`DataFrame.to_excel` where the :class:`MultiIndex` index with a period level was not a date (:issue:`60099`) - Bug in :meth:`DataFrame.to_stata` when exporting a column containing both long strings (Stata strL) and :class:`pd.NA` values (:issue:`23633`) +- Bug in :meth:`DataFrame.to_stata` when a string's encoded byte length differs from its character length, as with multi-byte characters (:issue:`61583`) - Bug in :meth:`DataFrame.to_stata` when writing :class:`DataFrame` and ``byteorder=`big```. (:issue:`58969`) - Bug in :meth:`DataFrame.to_stata` when writing more than 32,000 value labels. (:issue:`60107`) - Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. 
(:issue:`16098`) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index cd290710ddbaa..092c24f0d31c3 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -2739,7 +2739,7 @@ def _encode_strings(self) -> None: encoded = self.data[col].str.encode(self._encoding) # If larger than _max_string_length do nothing if ( - max_len_string_array(ensure_object(encoded._values)) + max_len_string_array(ensure_object(self.data[col]._values)) <= self._max_string_length ): self.data[col] = encoded @@ -3263,11 +3263,15 @@ def generate_blob(self, gso_table: dict[str, tuple[int, int]]) -> bytes: bio.write(gso_type) # llll - utf8_string = bytes(strl, "utf-8") - bio.write(struct.pack(len_type, len(utf8_string) + 1)) + if isinstance(strl, str): + strl_convert = bytes(strl, "utf-8") + else: + strl_convert = strl + + bio.write(struct.pack(len_type, len(strl_convert) + 1)) # xxx...xxx - bio.write(utf8_string) + bio.write(strl_convert) bio.write(null) return bio.getvalue() diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index e73de78847c8f..b155c0cca4aa6 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -2601,3 +2601,13 @@ def test_strl_missings(temp_file, version): ] ) df.to_stata(temp_file, version=version) + + +@pytest.mark.parametrize("version", [117, 118, 119, None]) +def test_ascii_error(temp_file, version): + # GH #61583 + # Check that 2-byte unicode characters don't cause an export error + df = DataFrame({"doubleByteCol": ["§" * 1500]}) + df.to_stata(temp_file, write_index=0, version=version) + df_input = read_stata(temp_file) + tm.assert_frame_equal(df, df_input)