Skip to content

Commit e3145c6

Browse files
committed
Try only fixing floats
1 parent ed1d9e3 commit e3145c6

File tree

1 file changed

+5
-9
lines changed

1 file changed

+5
-9
lines changed

tests/workflows/test_from_csv_to_parquet.py

Lines changed: 5 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -114,23 +114,19 @@ def test_from_csv_to_parquet(
114114
sep="\t",
115115
names=SCHEMA.keys(),
116116
# 'dtype' and 'converters' cannot overlap
117-
dtype={
118-
col: dtype for col, dtype in SCHEMA.items() if dtype == "string[pyarrow]"
119-
},
117+
dtype={col: dtype for col, dtype in SCHEMA.items() if dtype != "Float64"},
120118
storage_options=s3.storage_options,
121119
on_bad_lines="skip",
122-
# Some bad files have '#' in numeric values
120+
# Some bad files have '#' in float values
123121
converters={
124122
col: lambda v: float(v.replace("#", "") or "NaN")
125123
for col, dtype in SCHEMA.items()
126-
if dtype != "string[pyarrow]"
124+
if dtype == "Float64"
127125
},
128126
)
129127

130-
# Now we can safely convert the numeric columns
131-
df = df.astype(
132-
{col: dtype for col, dtype in SCHEMA.items() if dtype != "string[pyarrow]"}
133-
)
128+
# Now we can safely convert the float columns
129+
df = df.astype({col: dtype for col, dtype in SCHEMA.items() if dtype == "Float64"})
134130

135131
df = df.map_partitions(
136132
lambda xdf: xdf.drop_duplicates(subset=["SOURCEURL"], keep="first")

0 commit comments

Comments (0)