File tree 1 file changed +5
-9
lines changed
1 file changed +5
-9
lines changed Original file line number Diff line number Diff line change @@ -114,23 +114,19 @@ def test_from_csv_to_parquet(
114
114
sep = "\t" ,
115
115
names = SCHEMA .keys (),
116
116
# 'dtype' and 'converters' cannot overlap
117
- dtype = {
118
- col : dtype for col , dtype in SCHEMA .items () if dtype == "string[pyarrow]"
119
- },
117
+ dtype = {col : dtype for col , dtype in SCHEMA .items () if dtype != "Float64" },
120
118
storage_options = s3 .storage_options ,
121
119
on_bad_lines = "skip" ,
122
- # Some bad files have '#' in numeric values
120
+ # Some bad files have '#' in float values
123
121
converters = {
124
122
col : lambda v : float (v .replace ("#" , "" ) or "NaN" )
125
123
for col , dtype in SCHEMA .items ()
126
- if dtype != "string[pyarrow]"
124
+ if dtype == "Float64"
127
125
},
128
126
)
129
127
130
- # Now we can safely convert the numeric columns
131
- df = df .astype (
132
- {col : dtype for col , dtype in SCHEMA .items () if dtype != "string[pyarrow]" }
133
- )
128
+ # Now we can safely convert the float columns
129
+ df = df .astype ({col : dtype for col , dtype in SCHEMA .items () if dtype == "Float64" })
134
130
135
131
df = df .map_partitions (
136
132
lambda xdf : xdf .drop_duplicates (subset = ["SOURCEURL" ], keep = "first" )
You can’t perform that action at this time.
0 commit comments