Skip to content

Commit

Permalink
apacheGH-34330: [Go][Parquet]: Add Extension type support (apache#34631)
Browse files Browse the repository at this point in the history
Follow-up instead of apache#34356
* Closes: apache#34330

This is not yet complete but I would love some direction on where should I add tests and in what other places I should handle the new extension type.

Lead-authored-by: Kemal Hadimli <[email protected]>
Co-authored-by: Herman Schaaf <[email protected]>
Co-authored-by: Yevgeny Pats <[email protected]>
Signed-off-by: Matt Topol <[email protected]>
  • Loading branch information
3 people authored and rtpsw committed May 16, 2023
1 parent a3d5218 commit a334a06
Show file tree
Hide file tree
Showing 32 changed files with 599 additions and 224 deletions.
2 changes: 1 addition & 1 deletion go/arrow/array/array_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ import (
"github.com/apache/arrow/go/v12/arrow"
"github.com/apache/arrow/go/v12/arrow/array"
"github.com/apache/arrow/go/v12/arrow/internal/testing/tools"
"github.com/apache/arrow/go/v12/arrow/internal/testing/types"
"github.com/apache/arrow/go/v12/arrow/memory"
"github.com/apache/arrow/go/v12/internal/types"
"github.com/stretchr/testify/assert"
)

Expand Down
2 changes: 1 addition & 1 deletion go/arrow/array/dictionary_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@ import (
"github.com/apache/arrow/go/v12/arrow/array"
"github.com/apache/arrow/go/v12/arrow/bitutil"
"github.com/apache/arrow/go/v12/arrow/decimal128"
"github.com/apache/arrow/go/v12/arrow/internal/testing/types"
"github.com/apache/arrow/go/v12/arrow/memory"
"github.com/apache/arrow/go/v12/internal/types"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/stretchr/testify/suite"
Expand Down
2 changes: 1 addition & 1 deletion go/arrow/array/diff_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ import (

"github.com/apache/arrow/go/v12/arrow"
"github.com/apache/arrow/go/v12/arrow/array"
"github.com/apache/arrow/go/v12/arrow/internal/testing/types"
"github.com/apache/arrow/go/v12/arrow/memory"
"github.com/apache/arrow/go/v12/internal/types"
)

type diffTestCase struct {
Expand Down
2 changes: 1 addition & 1 deletion go/arrow/array/extension_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ import (

"github.com/apache/arrow/go/v12/arrow"
"github.com/apache/arrow/go/v12/arrow/array"
"github.com/apache/arrow/go/v12/arrow/internal/testing/types"
"github.com/apache/arrow/go/v12/arrow/memory"
"github.com/apache/arrow/go/v12/internal/types"
"github.com/stretchr/testify/suite"
)

Expand Down
15 changes: 13 additions & 2 deletions go/arrow/array/table_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
package array_test

import (
"errors"
"fmt"
"reflect"
"testing"
Expand Down Expand Up @@ -157,7 +158,17 @@ func TestChunkedInvalid(t *testing.T) {
if e == nil {
t.Fatalf("expected a panic")
}
if got, want := e.(string), "arrow/array: mismatch data type"; got != want {

err, ok := e.(error)
if !ok {
t.Fatalf("expected an error")
}

if !errors.Is(err, arrow.ErrInvalid) {
t.Fatalf("should be an ErrInvalid")
}

if got, want := err.Error(), fmt.Sprintf("%s: arrow/array: mismatch data type float64 vs int32", arrow.ErrInvalid); got != want {
t.Fatalf("invalid error. got=%q, want=%q", got, want)
}
}()
Expand Down Expand Up @@ -313,7 +324,7 @@ func TestColumn(t *testing.T) {
return c
}(),
field: arrow.Field{Name: "f32", Type: arrow.PrimitiveTypes.Float32},
err: fmt.Errorf("arrow/array: inconsistent data type"),
err: fmt.Errorf("%w: arrow/array: inconsistent data type float64 vs float32", arrow.ErrInvalid),
},
} {
t.Run("", func(t *testing.T) {
Expand Down
2 changes: 1 addition & 1 deletion go/arrow/compute/cast_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,9 @@ import (
"github.com/apache/arrow/go/v12/arrow/decimal128"
"github.com/apache/arrow/go/v12/arrow/decimal256"
"github.com/apache/arrow/go/v12/arrow/internal/testing/gen"
"github.com/apache/arrow/go/v12/arrow/internal/testing/types"
"github.com/apache/arrow/go/v12/arrow/memory"
"github.com/apache/arrow/go/v12/arrow/scalar"
"github.com/apache/arrow/go/v12/internal/types"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/stretchr/testify/suite"
Expand Down
2 changes: 1 addition & 1 deletion go/arrow/compute/internal/exec/span_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,9 @@ import (
"github.com/apache/arrow/go/v12/arrow/compute/internal/exec"
"github.com/apache/arrow/go/v12/arrow/decimal128"
"github.com/apache/arrow/go/v12/arrow/endian"
"github.com/apache/arrow/go/v12/arrow/internal/testing/types"
"github.com/apache/arrow/go/v12/arrow/memory"
"github.com/apache/arrow/go/v12/arrow/scalar"
"github.com/apache/arrow/go/v12/internal/types"
"github.com/stretchr/testify/assert"
)

Expand Down
2 changes: 1 addition & 1 deletion go/arrow/compute/vector_selection_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,9 @@ import (
"github.com/apache/arrow/go/v12/arrow/compute/internal/exec"
"github.com/apache/arrow/go/v12/arrow/compute/internal/kernels"
"github.com/apache/arrow/go/v12/arrow/internal/testing/gen"
"github.com/apache/arrow/go/v12/arrow/internal/testing/types"
"github.com/apache/arrow/go/v12/arrow/memory"
"github.com/apache/arrow/go/v12/arrow/scalar"
"github.com/apache/arrow/go/v12/internal/types"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/stretchr/testify/suite"
Expand Down
15 changes: 8 additions & 7 deletions go/arrow/csv/reader_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ import (
"github.com/apache/arrow/go/v12/arrow/csv"
"github.com/apache/arrow/go/v12/arrow/decimal128"
"github.com/apache/arrow/go/v12/arrow/decimal256"
"github.com/apache/arrow/go/v12/arrow/internal/testing/types"
"github.com/apache/arrow/go/v12/arrow/memory"
"github.com/apache/arrow/go/v12/internal/types"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
Expand Down Expand Up @@ -165,10 +165,10 @@ func Example_withChunk() {
}

func TestCSVReadInvalidFields(t *testing.T) {
tests := []struct {
Name string
Data string
Fields []arrow.Field
tests := []struct {
Name string
Data string
Fields []arrow.Field
ExpectedError bool
}{
{
Expand Down Expand Up @@ -201,13 +201,14 @@ func TestCSVReadInvalidFields(t *testing.T) {
t.Run(tc.Name, func(t *testing.T) {
f := bytes.NewBufferString(tc.Data)
schema := arrow.NewSchema(tc.Fields, nil)

r := csv.NewReader(
f, schema,
csv.WithComma(','),
)
defer r.Release()
for r.Next() {}
for r.Next() {
}
parseErr := r.Err()
if tc.ExpectedError && parseErr == nil {
t.Fatal("Expected error, but none found")
Expand Down
3 changes: 1 addition & 2 deletions go/arrow/csv/writer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@ import (
"github.com/apache/arrow/go/v12/arrow/csv"
"github.com/apache/arrow/go/v12/arrow/decimal128"
"github.com/apache/arrow/go/v12/arrow/decimal256"
"github.com/apache/arrow/go/v12/arrow/internal/testing/types"
"github.com/apache/arrow/go/v12/arrow/memory"
"github.com/apache/arrow/go/v12/internal/types"
"github.com/google/uuid"
)

Expand Down Expand Up @@ -257,7 +257,6 @@ func testCSVWriter(t *testing.T, data [][]string, writeHeader bool, fmtr func(bo
listBuilderInt64.AppendValues([]int64{7, 8, 9}, nil)
b.Field(18).(*array.BinaryBuilder).AppendValues([][]byte{{0, 1, 2}, {3, 4, 5}, {}}, nil)
b.Field(19).(*types.UUIDBuilder).AppendValues([]uuid.UUID{uuid.MustParse("00000000-0000-0000-0000-000000000001"), uuid.MustParse("00000000-0000-0000-0000-000000000002"), uuid.MustParse("00000000-0000-0000-0000-000000000003")}, nil)


for _, field := range b.Fields() {
field.AppendNull()
Expand Down
2 changes: 1 addition & 1 deletion go/arrow/datatype_extension_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ import (
"testing"

"github.com/apache/arrow/go/v12/arrow"
"github.com/apache/arrow/go/v12/arrow/internal/testing/types"
"github.com/apache/arrow/go/v12/internal/types"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/suite"
)
Expand Down
2 changes: 1 addition & 1 deletion go/arrow/internal/arrdata/arrdata.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,9 @@ import (
"github.com/apache/arrow/go/v12/arrow/array"
"github.com/apache/arrow/go/v12/arrow/decimal128"
"github.com/apache/arrow/go/v12/arrow/float16"
"github.com/apache/arrow/go/v12/arrow/internal/testing/types"
"github.com/apache/arrow/go/v12/arrow/ipc"
"github.com/apache/arrow/go/v12/arrow/memory"
"github.com/apache/arrow/go/v12/internal/types"
)

var (
Expand Down
2 changes: 1 addition & 1 deletion go/arrow/internal/flight_integration/scenario.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,9 @@ import (
"github.com/apache/arrow/go/v12/arrow/flight/flightsql"
"github.com/apache/arrow/go/v12/arrow/flight/flightsql/schema_ref"
"github.com/apache/arrow/go/v12/arrow/internal/arrjson"
"github.com/apache/arrow/go/v12/arrow/internal/testing/types"
"github.com/apache/arrow/go/v12/arrow/ipc"
"github.com/apache/arrow/go/v12/arrow/memory"
"github.com/apache/arrow/go/v12/internal/types"
"golang.org/x/xerrors"
"google.golang.org/grpc"
"google.golang.org/grpc/codes"
Expand Down
2 changes: 1 addition & 1 deletion go/arrow/ipc/cmd/arrow-json-integration-test/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ import (
"github.com/apache/arrow/go/v12/arrow/array"
"github.com/apache/arrow/go/v12/arrow/arrio"
"github.com/apache/arrow/go/v12/arrow/internal/arrjson"
"github.com/apache/arrow/go/v12/arrow/internal/testing/types"
"github.com/apache/arrow/go/v12/arrow/ipc"
"github.com/apache/arrow/go/v12/internal/types"
)

func main() {
Expand Down
2 changes: 1 addition & 1 deletion go/arrow/ipc/endian_swap_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@ import (
"github.com/apache/arrow/go/v12/arrow"
"github.com/apache/arrow/go/v12/arrow/array"
"github.com/apache/arrow/go/v12/arrow/endian"
"github.com/apache/arrow/go/v12/arrow/internal/testing/types"
"github.com/apache/arrow/go/v12/arrow/memory"
"github.com/apache/arrow/go/v12/internal/types"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
Expand Down
2 changes: 1 addition & 1 deletion go/arrow/ipc/metadata_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ import (
"github.com/apache/arrow/go/v12/arrow/array"
"github.com/apache/arrow/go/v12/arrow/internal/dictutils"
"github.com/apache/arrow/go/v12/arrow/internal/flatbuf"
"github.com/apache/arrow/go/v12/arrow/internal/testing/types"
"github.com/apache/arrow/go/v12/arrow/memory"
"github.com/apache/arrow/go/v12/internal/types"
flatbuffers "github.com/google/flatbuffers/go"
"github.com/stretchr/testify/assert"
)
Expand Down
34 changes: 17 additions & 17 deletions go/arrow/table.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
package arrow

import (
"fmt"
"sync/atomic"

"github.com/apache/arrow/go/v12/arrow/internal/debug"
Expand All @@ -42,20 +43,19 @@ type Table interface {
// To get strongly typed data from a Column, you need to iterate the
// chunks and type assert each individual Array. For example:
//
// switch column.DataType().ID {
// case arrow.INT32:
// for _, c := range column.Data().Chunks() {
// arr := c.(*array.Int32)
// // do something with arr
// }
// case arrow.INT64:
// for _, c := range column.Data().Chunks() {
// arr := c.(*array.Int64)
// // do something with arr
// }
// case ...
// switch column.DataType().ID {
// case arrow.INT32:
// for _, c := range column.Data().Chunks() {
// arr := c.(*array.Int32)
// // do something with arr
// }
//
// case arrow.INT64:
// for _, c := range column.Data().Chunks() {
// arr := c.(*array.Int64)
// // do something with arr
// }
// case ...
// }
type Column struct {
field Field
data *Chunked
Expand All @@ -69,7 +69,7 @@ type Column struct {
// of the ref counting.
func NewColumnFromArr(field Field, arr Array) Column {
if !TypeEqual(field.Type, arr.DataType()) {
panic("arrow/array: inconsistent data type")
panic(fmt.Errorf("%w: arrow/array: inconsistent data type %s vs %s", ErrInvalid, field.Type, arr.DataType()))
}

arr.Retain()
Expand Down Expand Up @@ -98,7 +98,7 @@ func NewColumn(field Field, chunks *Chunked) *Column {

if !TypeEqual(col.data.DataType(), col.field.Type) {
col.data.Release()
panic("arrow/array: inconsistent data type")
panic(fmt.Errorf("%w: arrow/array: inconsistent data type %s vs %s", ErrInvalid, col.data.DataType(), col.field.Type))
}

return &col
Expand Down Expand Up @@ -148,9 +148,9 @@ func NewChunked(dtype DataType, chunks []Array) *Chunked {
if chunk == nil {
continue
}

if !TypeEqual(chunk.DataType(), dtype) {
panic("arrow/array: mismatch data type")
panic(fmt.Errorf("%w: arrow/array: mismatch data type %s vs %s", ErrInvalid, chunk.DataType().String(), dtype.String()))
}
chunk.Retain()
arr.chunks = append(arr.chunks, chunk)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -209,13 +209,13 @@ func (UUIDType) ExtensionName() string { return "uuid" }
func (UUIDType) Serialize() string { return "uuid-serialized" }

// Deserialize expects storageType to be FixedSizeBinaryType{ByteWidth: 16} and the data to be
// "uuid-serialized" in order to correctly create a UuidType for testing deserialize.
// "uuid-serialized" in order to correctly create a UUIDType for testing deserialize.
func (UUIDType) Deserialize(storageType arrow.DataType, data string) (arrow.ExtensionType, error) {
if string(data) != "uuid-serialized" {
return nil, fmt.Errorf("type identifier did not match: '%s'", string(data))
}
if !arrow.TypeEqual(storageType, &arrow.FixedSizeBinaryType{ByteWidth: 16}) {
return nil, fmt.Errorf("invalid storage type for UuidType: %s", storageType.Name())
return nil, fmt.Errorf("invalid storage type for UUIDType: %s", storageType.Name())
}
return NewUUIDType(), nil
}
Expand Down Expand Up @@ -258,15 +258,13 @@ func (a Parametric2Array) ValueString(i int) string {
return fmt.Sprintf("%d", arr.Value(i))
}


// A type where ExtensionName is always the same
type Parametric1Type struct {
arrow.ExtensionBase

param int32
}


func NewParametric1Type(p int32) *Parametric1Type {
ret := &Parametric1Type{param: p}
ret.ExtensionBase.Storage = arrow.PrimitiveTypes.Int32
Expand Down Expand Up @@ -516,13 +514,11 @@ func (SmallintType) Deserialize(storageType arrow.DataType, data string) (arrow.
}

var (
_ arrow.ExtensionType = (*UUIDType)(nil)
_ arrow.ExtensionType = (*Parametric1Type)(nil)
_ arrow.ExtensionType = (*Parametric2Type)(nil)
_ arrow.ExtensionType = (*ExtStructType)(nil)
_ arrow.ExtensionType = (*DictExtensionType)(nil)
_ arrow.ExtensionType = (*SmallintType)(nil)
_ array.ExtensionArray = (*UUIDArray)(nil)
_ array.ExtensionArray = (*Parametric1Array)(nil)
_ array.ExtensionArray = (*Parametric2Array)(nil)
_ array.ExtensionArray = (*ExtStructArray)(nil)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@ import (

"github.com/apache/arrow/go/v12/arrow"
"github.com/apache/arrow/go/v12/arrow/array"
"github.com/apache/arrow/go/v12/arrow/internal/testing/types"
"github.com/apache/arrow/go/v12/arrow/memory"
"github.com/apache/arrow/go/v12/internal/types"
"github.com/google/uuid"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
Expand Down
6 changes: 6 additions & 0 deletions go/parquet/internal/encoding/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -339,6 +339,12 @@ func (b *BufferWriter) Finish() *memory.Buffer {
return buf
}

// Release the underlying buffer and not allocate anything else. To re-use this buffer, Reset() or Finish() should be called
func (b *BufferWriter) Release() {
b.buffer.Release()
b.buffer = nil
}

func (b *BufferWriter) Truncate() {
b.pos = 0
b.offset = 0
Expand Down
Loading

0 comments on commit a334a06

Please sign in to comment.