Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ARROW-8226: [Go] Add 64-bit offset Binary Builder and String Builder #13719

Merged
merged 2 commits into from
Jul 29, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions go/arrow/array/array.go
Original file line number Diff line number Diff line change
Expand Up @@ -176,8 +176,8 @@ func init() {
arrow.EXTENSION: func(data arrow.ArrayData) arrow.Array { return NewExtensionData(data) },
arrow.FIXED_SIZE_LIST: func(data arrow.ArrayData) arrow.Array { return NewFixedSizeListData(data) },
arrow.DURATION: func(data arrow.ArrayData) arrow.Array { return NewDurationData(data) },
arrow.LARGE_STRING: unsupportedArrayType,
arrow.LARGE_BINARY: unsupportedArrayType,
arrow.LARGE_STRING: func(data arrow.ArrayData) arrow.Array { return NewLargeStringData(data) },
arrow.LARGE_BINARY: func(data arrow.ArrayData) arrow.Array { return NewLargeBinaryData(data) },
arrow.LARGE_LIST: unsupportedArrayType,
arrow.INTERVAL: func(data arrow.ArrayData) arrow.Array { return NewIntervalData(data) },
arrow.INTERVAL_MONTH_DAY_NANO: func(data arrow.ArrayData) arrow.Array { return NewMonthDayNanoIntervalData(data) },
Expand Down
4 changes: 2 additions & 2 deletions go/arrow/array/array_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@ func TestMakeFromData(t *testing.T) {
{name: "float64", d: &testDataType{arrow.FLOAT64}},
{name: "string", d: &testDataType{arrow.STRING}, size: 3},
{name: "binary", d: &testDataType{arrow.BINARY}, size: 3},
{name: "large_string", d: &testDataType{arrow.LARGE_STRING}, size: 3},
{name: "large_binary", d: &testDataType{arrow.LARGE_BINARY}, size: 3},
{name: "fixed_size_binary", d: &testDataType{arrow.FIXED_SIZE_BINARY}},
{name: "date32", d: &testDataType{arrow.DATE32}},
{name: "date64", d: &testDataType{arrow.DATE64}},
Expand Down Expand Up @@ -114,8 +116,6 @@ func TestMakeFromData(t *testing.T) {
// unsupported types
{name: "sparse union", d: &testDataType{arrow.SPARSE_UNION}, expPanic: true, expError: "unsupported data type: SPARSE_UNION"},
{name: "dense union", d: &testDataType{arrow.DENSE_UNION}, expPanic: true, expError: "unsupported data type: DENSE_UNION"},
{name: "large string", d: &testDataType{arrow.LARGE_STRING}, expPanic: true, expError: "unsupported data type: LARGE_STRING"},
{name: "large binary", d: &testDataType{arrow.LARGE_BINARY}, expPanic: true, expError: "unsupported data type: LARGE_BINARY"},
{name: "large list", d: &testDataType{arrow.LARGE_LIST}, expPanic: true, expError: "unsupported data type: LARGE_LIST"},
{name: "decimal256", d: &testDataType{arrow.DECIMAL256}, expPanic: true, expError: "unsupported data type: DECIMAL256"},

Expand Down
129 changes: 129 additions & 0 deletions go/arrow/array/binary.go
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,135 @@ func arrayEqualBinary(left, right *Binary) bool {
return true
}

// LargeBinary is an array of variable-length byte values whose value
// boundaries are stored as 64-bit offsets.
type LargeBinary struct {
array
// valueOffsets holds the start offset of every value plus one trailing end
// offset; logical value i spans
// valueBytes[valueOffsets[offset+i]:valueOffsets[offset+i+1]].
valueOffsets []int64
// valueBytes is the concatenation of all values' bytes.
valueBytes []byte
}

// NewLargeBinaryData builds a LargeBinary array backed by data and hands it
// back with a reference count of 1.
//
// data must be a *Data; any other implementation makes the unchecked type
// assertion panic. setData additionally panics when the buffers are malformed.
func NewLargeBinaryData(data arrow.ArrayData) *LargeBinary {
	arr := new(LargeBinary)
	arr.refCount = 1
	arr.setData(data.(*Data))
	return arr
}

// Value returns the bytes of the i-th logical element as a sub-slice of the
// array's value buffer (no copy is made).
//
// It panics with "arrow/array: index out of range" unless 0 <= i < length.
func (a *LargeBinary) Value(i int) []byte {
	if n := a.array.data.length; i < 0 || n <= i {
		panic("arrow/array: index out of range")
	}
	// Translate the logical index into a physical position in the offsets.
	pos := a.array.data.offset + i
	start, stop := a.valueOffsets[pos], a.valueOffsets[pos+1]
	return a.valueBytes[start:stop]
}

// ValueString returns the i-th element viewed as a string.
//
// The slice header returned by Value is reinterpreted as a string header, so
// the result shares memory with the array's value buffer instead of copying.
// It panics under the same conditions as Value.
func (a *LargeBinary) ValueString(i int) string {
	raw := a.Value(i)
	return *(*string)(unsafe.Pointer(&raw))
}

// ValueOffset returns the position in the value buffer at which the i-th
// logical element begins.
//
// It panics with "arrow/array: index out of range" unless 0 <= i < length.
func (a *LargeBinary) ValueOffset(i int) int64 {
	if n := a.array.data.length; i < 0 || n <= i {
		panic("arrow/array: index out of range")
	}
	pos := a.array.data.offset + i
	return a.valueOffsets[pos]
}

// ValueLen returns the number of bytes in the i-th logical element, computed
// as the difference between its two bounding offsets.
//
// It panics with "arrow/array: index out of range" unless 0 <= i < length.
func (a *LargeBinary) ValueLen(i int) int {
	if n := a.array.data.length; i < 0 || n <= i {
		panic("arrow/array: index out of range")
	}
	pos := a.array.data.offset + i
	start, stop := a.valueOffsets[pos], a.valueOffsets[pos+1]
	return int(stop - start)
}

// ValueOffsets returns the length+1 offsets that delimit this array's logical
// elements. The result is a window onto the underlying offsets slice, not a
// copy, and the offsets are not rebased for sliced arrays.
func (a *LargeBinary) ValueOffsets() []int64 {
	first := a.array.data.offset
	return a.valueOffsets[first : first+a.array.data.length+1]
}

// ValueBytes returns the contiguous region of the value buffer covered by this
// array's logical elements: from the first element's start offset to the last
// element's end offset. The result aliases the value buffer.
func (a *LargeBinary) ValueBytes() []byte {
	first := a.array.data.offset
	last := first + a.array.data.length
	lo, hi := a.valueOffsets[first], a.valueOffsets[last]
	return a.valueBytes[lo:hi]
}

// String renders the array as a bracketed, space-separated list. Null entries
// are written as (null); every other entry is written as a Go-quoted string.
func (a *LargeBinary) String() string {
	var sb strings.Builder
	sb.WriteString("[")
	for i := 0; i < a.Len(); i++ {
		if i != 0 {
			sb.WriteString(" ")
		}
		if a.IsNull(i) {
			sb.WriteString("(null)")
			continue
		}
		fmt.Fprintf(&sb, "%q", a.ValueString(i))
	}
	sb.WriteString("]")
	return sb.String()
}

// setData binds the array to data and caches typed views of its buffers:
// buffers[1] is reinterpreted as the int64 offsets and buffers[2] is used as
// the raw value bytes. A nil buffer leaves the corresponding view untouched.
//
// It panics when:
//   - data does not carry exactly 3 buffers;
//   - the array is non-empty and the offsets buffer holds fewer than
//     offset+length+1 entries;
//   - the array is non-empty and the final offset is negative or lies beyond
//     the end of the value bytes.
//
// Empty arrays skip the offset validation entirely.
func (a *LargeBinary) setData(data *Data) {
	if len(data.buffers) != 3 {
		panic("len(data.buffers) != 3")
	}

	a.array.setData(data)

	if valueData := data.buffers[2]; valueData != nil {
		a.valueBytes = valueData.Bytes()
	}

	if valueOffsets := data.buffers[1]; valueOffsets != nil {
		a.valueOffsets = arrow.Int64Traits.CastFromBytes(valueOffsets.Bytes())
	}

	if a.array.data.length < 1 {
		return
	}

	expNumOffsets := a.array.data.offset + a.array.data.length + 1
	if len(a.valueOffsets) < expNumOffsets {
		panic(fmt.Errorf("arrow/array: large binary offset buffer must have at least %d values", expNumOffsets))
	}

	// Compare in int64: narrowing the offset to int would truncate on 32-bit
	// platforms and could let an out-of-range offset slip through. A negative
	// offset can never address the value buffer, so reject it here instead of
	// failing later while slicing.
	if last := a.valueOffsets[expNumOffsets-1]; last < 0 || last > int64(len(a.valueBytes)) {
		panic("arrow/array: large binary offsets out of bounds of data buffer")
	}
}

// getOneForMarshal returns the i-th element in the form handed to the JSON
// encoder: an untyped nil for a null entry, otherwise the element's []byte.
func (a *LargeBinary) getOneForMarshal(i int) interface{} {
	if !a.IsNull(i) {
		return a.Value(i)
	}
	return nil
}

// MarshalJSON encodes the array as a JSON list with one entry per element.
// Null elements become JSON null; all other elements are []byte values, which
// encoding/json emits as base64-encoded strings.
func (a *LargeBinary) MarshalJSON() ([]byte, error) {
	n := a.Len()
	out := make([]interface{}, 0, n)
	for i := 0; i < n; i++ {
		out = append(out, a.getOneForMarshal(i))
	}
	return json.Marshal(out)
}

// arrayEqualLargeBinary reports whether every non-null element of left has the
// same bytes as the element at the same index in right.
//
// Only left's null flags are consulted and only left's length bounds the loop.
// NOTE(review): presumably the caller has already verified that both arrays
// have matching lengths and null positions — confirm at the call site.
func arrayEqualLargeBinary(left, right *LargeBinary) bool {
	for i := 0; i < left.Len(); i++ {
		if !left.IsNull(i) && !bytes.Equal(left.Value(i), right.Value(i)) {
			return false
		}
	}
	return true
}

// Compile-time checks that the binary array types satisfy arrow.Array.
var (
	_ arrow.Array = (*Binary)(nil)
	_ arrow.Array = (*LargeBinary)(nil)
)
163 changes: 163 additions & 0 deletions go/arrow/array/binary_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,48 @@ func TestBinary(t *testing.T) {
b.Release()
}

// TestLargeBinary exercises a builder configured for the LargeBinary type:
// it must refuse to produce a 32-bit Binary array, must produce correct
// LargeBinary arrays through both NewLargeBinaryArray and NewArray, and must
// be reusable after each array is taken from it.
func TestLargeBinary(t *testing.T) {
	mem := memory.NewCheckedAllocator(memory.NewGoAllocator())
	defer mem.AssertSize(t, 0)

	bldr := NewBinaryBuilder(mem, arrow.BinaryTypes.LargeBinary)

	values := [][]byte{
		[]byte("AAA"),
		nil,
		[]byte("BBBB"),
	}
	valid := []bool{true, false, true}

	// verify checks the contents shared by both arrays built below.
	verify := func(arr *LargeBinary) {
		assert.Equal(t, 3, arr.Len())
		assert.Equal(t, 1, arr.NullN())
		assert.Equal(t, []byte("AAA"), arr.Value(0))
		assert.Equal(t, []byte{}, arr.Value(1))
		assert.Equal(t, []byte("BBBB"), arr.Value(2))
	}

	bldr.AppendValues(values, valid)

	// A balanced Retain/Release pair must leave the builder usable.
	bldr.Retain()
	bldr.Release()

	// A large-binary builder cannot hand out a 32-bit offset array.
	assert.Panics(t, func() {
		bldr.NewBinaryArray()
	})

	first := bldr.NewLargeBinaryArray()
	verify(first)
	first.Release()

	// Test builder reset and NewArray API.
	bldr.AppendValues(values, valid)
	second := bldr.NewArray().(*LargeBinary)
	verify(second)
	second.Release()

	bldr.Release()
}

func TestBinarySliceData(t *testing.T) {
mem := memory.NewCheckedAllocator(memory.NewGoAllocator())
defer mem.AssertSize(t, 0)
Expand Down Expand Up @@ -336,6 +378,33 @@ func TestBinaryValueOffset(t *testing.T) {
}
}

// TestLargeBinaryValueOffset checks that ValueOffset on a sliced array reports
// each element's start position within the parent's value buffer.
func TestLargeBinaryValueOffset(t *testing.T) {
	alloc := memory.NewCheckedAllocator(memory.NewGoAllocator())
	defer alloc.AssertSize(t, 0)

	var (
		values = []string{"a", "bc", "", "", "hijk", "lm", "", "opq", "", "tu"}
		valids = []bool{true, true, false, false, true, true, true, true, false, true}
	)

	bldr := NewBinaryBuilder(alloc, arrow.BinaryTypes.LargeBinary)
	defer bldr.Release()
	bldr.AppendStringValues(values, valids)

	arr := bldr.NewArray().(*LargeBinary)
	defer arr.Release()

	slice := NewSlice(arr, 2, 9).(*LargeBinary)
	defer slice.Release()

	// The two elements before the slice ("a", "bc") occupy 3 bytes, so the
	// first sliced element starts at offset 3.
	want := 3
	for i, v := range values[2:9] {
		assert.EqualValues(t, want, slice.ValueOffset(i))
		want += len(v)
	}
}

func TestBinaryValueLen(t *testing.T) {
mem := memory.NewCheckedAllocator(memory.NewGoAllocator())
defer mem.AssertSize(t, 0)
Expand All @@ -361,6 +430,31 @@ func TestBinaryValueLen(t *testing.T) {
}
}

// TestLargeBinaryValueLen checks that ValueLen on a sliced array reports the
// byte length of each element in the slice.
func TestLargeBinaryValueLen(t *testing.T) {
	alloc := memory.NewCheckedAllocator(memory.NewGoAllocator())
	defer alloc.AssertSize(t, 0)

	var (
		values = []string{"a", "bc", "", "", "hijk", "lm", "", "opq", "", "tu"}
		valids = []bool{true, true, false, false, true, true, true, true, false, true}
	)

	bldr := NewBinaryBuilder(alloc, arrow.BinaryTypes.LargeBinary)
	defer bldr.Release()
	bldr.AppendStringValues(values, valids)

	arr := bldr.NewArray().(*LargeBinary)
	defer arr.Release()

	slice := NewSlice(arr, 2, 9).(*LargeBinary)
	defer slice.Release()

	for i, v := range values[2:9] {
		assert.Equal(t, len(v), slice.ValueLen(i))
	}
}

func TestBinaryValueOffsets(t *testing.T) {
mem := memory.NewCheckedAllocator(memory.NewGoAllocator())
defer mem.AssertSize(t, 0)
Expand All @@ -384,6 +478,29 @@ func TestBinaryValueOffsets(t *testing.T) {
assert.Equal(t, []int32{3, 3, 3, 7, 9, 9, 12, 12}, slice.ValueOffsets())
}

// TestLargeBinaryValueOffsets checks the offsets window returned for a full
// array and for a slice of it; the slice's offsets are not rebased to zero.
func TestLargeBinaryValueOffsets(t *testing.T) {
	alloc := memory.NewCheckedAllocator(memory.NewGoAllocator())
	defer alloc.AssertSize(t, 0)

	var (
		values = []string{"a", "bc", "", "", "hijk", "lm", "", "opq", "", "tu"}
		valids = []bool{true, true, false, false, true, true, true, true, false, true}
	)

	bldr := NewBinaryBuilder(alloc, arrow.BinaryTypes.LargeBinary)
	defer bldr.Release()
	bldr.AppendStringValues(values, valids)

	arr := bldr.NewArray().(*LargeBinary)
	defer arr.Release()

	wantAll := []int64{0, 1, 3, 3, 3, 7, 9, 9, 12, 12, 14}
	assert.Equal(t, wantAll, arr.ValueOffsets())

	slice := NewSlice(arr, 2, 9).(*LargeBinary)
	defer slice.Release()

	wantSlice := []int64{3, 3, 3, 7, 9, 9, 12, 12}
	assert.Equal(t, wantSlice, slice.ValueOffsets())
}

func TestBinaryValueBytes(t *testing.T) {
mem := memory.NewCheckedAllocator(memory.NewGoAllocator())
defer mem.AssertSize(t, 0)
Expand All @@ -407,6 +524,29 @@ func TestBinaryValueBytes(t *testing.T) {
assert.Equal(t, []byte{'h', 'i', 'j', 'k', 'l', 'm', 'o', 'p', 'q'}, slice.ValueBytes())
}

// TestLargeBinaryValueBytes checks that ValueBytes returns exactly the bytes
// covered by the array's elements, both for a full array and for a slice.
func TestLargeBinaryValueBytes(t *testing.T) {
	alloc := memory.NewCheckedAllocator(memory.NewGoAllocator())
	defer alloc.AssertSize(t, 0)

	var (
		values = []string{"a", "bc", "", "", "hijk", "lm", "", "opq", "", "tu"}
		valids = []bool{true, true, false, false, true, true, true, true, false, true}
	)

	bldr := NewBinaryBuilder(alloc, arrow.BinaryTypes.LargeBinary)
	defer bldr.Release()
	bldr.AppendStringValues(values, valids)

	arr := bldr.NewArray().(*LargeBinary)
	defer arr.Release()

	wantAll := []byte{'a', 'b', 'c', 'h', 'i', 'j', 'k', 'l', 'm', 'o', 'p', 'q', 't', 'u'}
	assert.Equal(t, wantAll, arr.ValueBytes())

	slice := NewSlice(arr, 2, 9).(*LargeBinary)
	defer slice.Release()

	wantSlice := []byte{'h', 'i', 'j', 'k', 'l', 'm', 'o', 'p', 'q'}
	assert.Equal(t, wantSlice, slice.ValueBytes())
}

func TestBinaryStringer(t *testing.T) {
mem := memory.NewCheckedAllocator(memory.NewGoAllocator())
defer mem.AssertSize(t, 0)
Expand All @@ -430,6 +570,29 @@ func TestBinaryStringer(t *testing.T) {
}
}

// TestLargeBinaryStringer checks the String rendering of an array containing
// nulls, empty values, and a non-ASCII value.
func TestLargeBinaryStringer(t *testing.T) {
	alloc := memory.NewCheckedAllocator(memory.NewGoAllocator())
	defer alloc.AssertSize(t, 0)

	var (
		values = []string{"a", "bc", "", "é", "", "hijk", "lm", "", "opq", "", "tu"}
		valids = []bool{true, true, false, true, false, true, true, true, true, false, true}
	)

	bldr := NewBinaryBuilder(alloc, arrow.BinaryTypes.LargeBinary)
	defer bldr.Release()
	bldr.AppendStringValues(values, valids)

	arr := bldr.NewArray().(*LargeBinary)
	defer arr.Release()

	const want = `["a" "bc" (null) "é" (null) "hijk" "lm" "" "opq" (null) "tu"]`
	if got := arr.String(); got != want {
		t.Fatalf("invalid stringer:\ngot= %s\nwant=%s\n", got, want)
	}
}

func TestBinaryInvalidOffsets(t *testing.T) {
const expectedPanic = "arrow/array: binary offsets out of bounds of data buffer"

Expand Down
Loading