@@ -26,7 +26,7 @@ type TransformConfig struct {
26
26
RowGroupLength int
27
27
TransformSchema SchemaTransformer
28
28
TransformColumn ColumnTransformer
29
- BeforeClose func (* file.Reader , * file. Writer ) error
29
+ BeforeClose func (* file.Reader , * pqarrow. FileWriter ) error
30
30
}
31
31
32
32
func getWriterProperties (config * TransformConfig , fileReader * file.Reader ) (* parquet.WriterProperties , error ) {
@@ -104,7 +104,16 @@ func TransformByColumn(config *TransformConfig) error {
104
104
return propErr
105
105
}
106
106
107
- fileWriter := file .NewParquetWriter (config .Writer , outputSchema .Root (), file .WithWriterProps (writerProperties ))
107
+ arrowSchema , arrowSchemaErr := pqarrow .FromParquet (outputSchema , & arrowReadProperties , fileReader .MetaData ().KeyValueMetadata ())
108
+ if arrowSchemaErr != nil {
109
+ return arrowSchemaErr
110
+ }
111
+
112
+ fileWriter , fileWriterErr := pqarrow .NewFileWriter (arrowSchema , config .Writer , writerProperties , pqarrow .DefaultWriterProps ())
113
+ if fileWriterErr != nil {
114
+ return fileWriterErr
115
+ }
116
+
108
117
ctx := pqarrow .NewArrowWriteContext (context .Background (), nil )
109
118
110
119
if config .RowGroupLength > 0 {
@@ -120,7 +129,8 @@ func TransformByColumn(config *TransformConfig) error {
120
129
numRows := fileReader .NumRows ()
121
130
numRowsWritten := int64 (0 )
122
131
for {
123
- rowGroupWriter := fileWriter .AppendRowGroup ()
132
+ fileWriter .NewRowGroup ()
133
+ numRowsInGroup := 0
124
134
for fieldNum := 0 ; fieldNum < numFields ; fieldNum += 1 {
125
135
colReader := columnReaders [fieldNum ]
126
136
arr , readErr := colReader .NextBatch (int64 (config .RowGroupLength ))
@@ -139,18 +149,14 @@ func TransformByColumn(config *TransformConfig) error {
139
149
}
140
150
arr = transformed
141
151
}
142
- colWriter , colWriterErr := pqarrow . NewArrowColumnWriter ( arr , 0 , int64 ( arr . Len ()), outputManifest , rowGroupWriter , fieldNum )
143
- if colWriterErr != nil {
144
- return colWriterErr
152
+ if numRowsInGroup == 0 {
153
+ // TODO: propose fileWriter.RowGroupNumRows()
154
+ numRowsInGroup = arr . Len ()
145
155
}
146
- if err := colWriter . Write ( ctx ); err != nil {
156
+ if err := fileWriter . WriteColumnChunked ( arr , 0 , int64 ( arr . Len ()) ); err != nil {
147
157
return err
148
158
}
149
159
}
150
- numRowsInGroup , err := rowGroupWriter .NumRows ()
151
- if err != nil {
152
- return err
153
- }
154
160
numRowsWritten += int64 (numRowsInGroup )
155
161
if numRowsWritten >= numRows {
156
162
break
@@ -160,7 +166,7 @@ func TransformByColumn(config *TransformConfig) error {
160
166
numRowGroups := fileReader .NumRowGroups ()
161
167
for rowGroupIndex := 0 ; rowGroupIndex < numRowGroups ; rowGroupIndex += 1 {
162
168
rowGroupReader := arrowReader .RowGroup (rowGroupIndex )
163
- rowGroupWriter := fileWriter .AppendRowGroup ()
169
+ fileWriter .NewRowGroup ()
164
170
for fieldNum := 0 ; fieldNum < numFields ; fieldNum += 1 {
165
171
arr , readErr := rowGroupReader .Column (fieldNum ).Read (ctx )
166
172
if readErr != nil {
@@ -175,11 +181,7 @@ func TransformByColumn(config *TransformConfig) error {
175
181
}
176
182
arr = transformed
177
183
}
178
- colWriter , colWriterErr := pqarrow .NewArrowColumnWriter (arr , 0 , int64 (arr .Len ()), outputManifest , rowGroupWriter , fieldNum )
179
- if colWriterErr != nil {
180
- return colWriterErr
181
- }
182
- if err := colWriter .Write (ctx ); err != nil {
184
+ if err := fileWriter .WriteColumnChunked (arr , 0 , int64 (arr .Len ())); err != nil {
183
185
return err
184
186
}
185
187
}
0 commit comments