Skip to content

Commit ba009c9

Browse files
authored
Merge pull request #109 from planetlabs/use-pqarrow
Use the pqarrow file writer
2 parents e01e089 + 2689577 commit ba009c9

File tree

4 files changed

+89
-19
lines changed

4 files changed

+89
-19
lines changed

internal/geoparquet/geoparquet.go

+2-1
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ import (
1212
"github.com/apache/arrow/go/v14/parquet"
1313
"github.com/apache/arrow/go/v14/parquet/compress"
1414
"github.com/apache/arrow/go/v14/parquet/file"
15+
"github.com/apache/arrow/go/v14/parquet/pqarrow"
1516
"github.com/apache/arrow/go/v14/parquet/schema"
1617
"github.com/paulmach/orb/encoding/wkb"
1718
"github.com/paulmach/orb/encoding/wkt"
@@ -154,7 +155,7 @@ func FromParquet(input parquet.ReaderAtSeeker, output io.Writer, convertOptions
154155
return arrow.NewChunked(builder.Type(), transformed), nil
155156
}
156157

157-
beforeClose := func(fileReader *file.Reader, fileWriter *file.Writer) error {
158+
beforeClose := func(fileReader *file.Reader, fileWriter *pqarrow.FileWriter) error {
158159
metadata := getMetadata(fileReader, convertOptions)
159160
for name, geometryCol := range metadata.Columns {
160161
if !datasetInfo.HasCollection(name) {

internal/pqutil/transform.go

+19-17
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ type TransformConfig struct {
2626
RowGroupLength int
2727
TransformSchema SchemaTransformer
2828
TransformColumn ColumnTransformer
29-
BeforeClose func(*file.Reader, *file.Writer) error
29+
BeforeClose func(*file.Reader, *pqarrow.FileWriter) error
3030
}
3131

3232
func getWriterProperties(config *TransformConfig, fileReader *file.Reader) (*parquet.WriterProperties, error) {
@@ -104,7 +104,16 @@ func TransformByColumn(config *TransformConfig) error {
104104
return propErr
105105
}
106106

107-
fileWriter := file.NewParquetWriter(config.Writer, outputSchema.Root(), file.WithWriterProps(writerProperties))
107+
arrowSchema, arrowSchemaErr := pqarrow.FromParquet(outputSchema, &arrowReadProperties, fileReader.MetaData().KeyValueMetadata())
108+
if arrowSchemaErr != nil {
109+
return arrowSchemaErr
110+
}
111+
112+
fileWriter, fileWriterErr := pqarrow.NewFileWriter(arrowSchema, config.Writer, writerProperties, pqarrow.DefaultWriterProps())
113+
if fileWriterErr != nil {
114+
return fileWriterErr
115+
}
116+
108117
ctx := pqarrow.NewArrowWriteContext(context.Background(), nil)
109118

110119
if config.RowGroupLength > 0 {
@@ -120,7 +129,8 @@ func TransformByColumn(config *TransformConfig) error {
120129
numRows := fileReader.NumRows()
121130
numRowsWritten := int64(0)
122131
for {
123-
rowGroupWriter := fileWriter.AppendRowGroup()
132+
fileWriter.NewRowGroup()
133+
numRowsInGroup := 0
124134
for fieldNum := 0; fieldNum < numFields; fieldNum += 1 {
125135
colReader := columnReaders[fieldNum]
126136
arr, readErr := colReader.NextBatch(int64(config.RowGroupLength))
@@ -139,18 +149,14 @@ func TransformByColumn(config *TransformConfig) error {
139149
}
140150
arr = transformed
141151
}
142-
colWriter, colWriterErr := pqarrow.NewArrowColumnWriter(arr, 0, int64(arr.Len()), outputManifest, rowGroupWriter, fieldNum)
143-
if colWriterErr != nil {
144-
return colWriterErr
152+
if numRowsInGroup == 0 {
153+
// TODO: propose fileWriter.RowGroupNumRows()
154+
numRowsInGroup = arr.Len()
145155
}
146-
if err := colWriter.Write(ctx); err != nil {
156+
if err := fileWriter.WriteColumnChunked(arr, 0, int64(arr.Len())); err != nil {
147157
return err
148158
}
149159
}
150-
numRowsInGroup, err := rowGroupWriter.NumRows()
151-
if err != nil {
152-
return err
153-
}
154160
numRowsWritten += int64(numRowsInGroup)
155161
if numRowsWritten >= numRows {
156162
break
@@ -160,7 +166,7 @@ func TransformByColumn(config *TransformConfig) error {
160166
numRowGroups := fileReader.NumRowGroups()
161167
for rowGroupIndex := 0; rowGroupIndex < numRowGroups; rowGroupIndex += 1 {
162168
rowGroupReader := arrowReader.RowGroup(rowGroupIndex)
163-
rowGroupWriter := fileWriter.AppendRowGroup()
169+
fileWriter.NewRowGroup()
164170
for fieldNum := 0; fieldNum < numFields; fieldNum += 1 {
165171
arr, readErr := rowGroupReader.Column(fieldNum).Read(ctx)
166172
if readErr != nil {
@@ -175,11 +181,7 @@ func TransformByColumn(config *TransformConfig) error {
175181
}
176182
arr = transformed
177183
}
178-
colWriter, colWriterErr := pqarrow.NewArrowColumnWriter(arr, 0, int64(arr.Len()), outputManifest, rowGroupWriter, fieldNum)
179-
if colWriterErr != nil {
180-
return colWriterErr
181-
}
182-
if err := colWriter.Write(ctx); err != nil {
184+
if err := fileWriter.WriteColumnChunked(arr, 0, int64(arr.Len())); err != nil {
183185
return err
184186
}
185187
}

internal/pqutil/transform_test.go

+66
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"fmt"
77
"math"
88
"strconv"
9+
"strings"
910
"testing"
1011

1112
"github.com/apache/arrow/go/v14/arrow"
@@ -14,6 +15,7 @@ import (
1415
"github.com/apache/arrow/go/v14/parquet"
1516
"github.com/apache/arrow/go/v14/parquet/compress"
1617
"github.com/apache/arrow/go/v14/parquet/file"
18+
"github.com/apache/arrow/go/v14/parquet/pqarrow"
1719
"github.com/apache/arrow/go/v14/parquet/schema"
1820
"github.com/planetlabs/gpq/internal/pqutil"
1921
"github.com/planetlabs/gpq/internal/test"
@@ -123,6 +125,70 @@ func TestTransformByColumn(t *testing.T) {
123125
}
124126
}
125127

128+
func makeOvertureData(t *testing.T) (string, []byte) {
129+
schema := arrow.NewSchema([]arrow.Field{
130+
{Name: "sources", Nullable: true, Type: arrow.ListOf(arrow.StructOf(
131+
arrow.Field{Name: "property", Nullable: true, Type: arrow.BinaryTypes.String},
132+
arrow.Field{Name: "dataset", Nullable: true, Type: arrow.BinaryTypes.String},
133+
arrow.Field{Name: "recordId", Nullable: true, Type: arrow.BinaryTypes.String},
134+
arrow.Field{Name: "confidence", Nullable: true, Type: arrow.PrimitiveTypes.Float64},
135+
))},
136+
{Name: "bbox", Nullable: false, Type: arrow.StructOf(
137+
arrow.Field{Name: "minx", Nullable: true, Type: arrow.PrimitiveTypes.Float64},
138+
arrow.Field{Name: "maxx", Nullable: true, Type: arrow.PrimitiveTypes.Float64},
139+
arrow.Field{Name: "miny", Nullable: true, Type: arrow.PrimitiveTypes.Float64},
140+
arrow.Field{Name: "maxy", Nullable: true, Type: arrow.PrimitiveTypes.Float64},
141+
)},
142+
}, nil)
143+
144+
expected := `[
145+
{
146+
"sources": [
147+
{
148+
"property": "",
149+
"recordId": "record-1",
150+
"dataset": "test",
151+
"confidence": null
152+
}
153+
],
154+
"bbox": {
155+
"minx": -180,
156+
"maxx": -180,
157+
"miny": -90,
158+
"maxy": -90
159+
}
160+
}
161+
]`
162+
record, _, err := array.RecordFromJSON(memory.DefaultAllocator, schema, strings.NewReader(expected))
163+
require.NoError(t, err)
164+
165+
output := &bytes.Buffer{}
166+
writer, err := pqarrow.NewFileWriter(schema, output, nil, pqarrow.DefaultWriterProps())
167+
require.NoError(t, err)
168+
169+
require.NoError(t, writer.Write(record))
170+
require.NoError(t, writer.Close())
171+
172+
return expected, output.Bytes()
173+
}
174+
175+
func TestTransformOverture(t *testing.T) {
176+
// minimal reproduction of https://github.com/planetlabs/gpq/issues/102
177+
expected, parquetData := makeOvertureData(t)
178+
179+
input := bytes.NewReader(parquetData)
180+
output := &bytes.Buffer{}
181+
config := &pqutil.TransformConfig{
182+
Reader: input,
183+
Writer: output,
184+
}
185+
186+
require.NoError(t, pqutil.TransformByColumn(config))
187+
188+
outputAsJSON := test.ParquetToJSON(t, bytes.NewReader(output.Bytes()))
189+
assert.JSONEq(t, expected, outputAsJSON)
190+
}
191+
126192
func TestTransformByRowGroupLength(t *testing.T) {
127193
numRows := 100
128194
rows := make([]map[string]any, numRows)

internal/validator/validator_test.go

+2-1
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ import (
2828

2929
"github.com/apache/arrow/go/v14/parquet"
3030
"github.com/apache/arrow/go/v14/parquet/file"
31+
"github.com/apache/arrow/go/v14/parquet/pqarrow"
3132
"github.com/paulmach/orb"
3233
"github.com/paulmach/orb/encoding/wkb"
3334
"github.com/planetlabs/gpq/internal/geojson"
@@ -87,7 +88,7 @@ func (s *Suite) copyWithMetadata(input parquet.ReaderAtSeeker, output io.Writer,
8788
config := &pqutil.TransformConfig{
8889
Reader: input,
8990
Writer: output,
90-
BeforeClose: func(fileReader *file.Reader, fileWriter *file.Writer) error {
91+
BeforeClose: func(fileReader *file.Reader, fileWriter *pqarrow.FileWriter) error {
9192
return fileWriter.AppendKeyValueMetadata(geoparquet.MetadataKey, metadata)
9293
},
9394
}

0 commit comments

Comments
 (0)