Skip to content

Commit

Permalink
ARROW-8226: [Go] Add 64-bit offset Binary Builder and String Builder
Browse files Browse the repository at this point in the history
  • Loading branch information
zeroshade committed Jul 26, 2022
1 parent a5a2837 commit 4b514ed
Show file tree
Hide file tree
Showing 20 changed files with 1,162 additions and 101 deletions.
4 changes: 2 additions & 2 deletions go/arrow/array/array.go
Original file line number Diff line number Diff line change
Expand Up @@ -176,8 +176,8 @@ func init() {
arrow.EXTENSION: func(data arrow.ArrayData) arrow.Array { return NewExtensionData(data) },
arrow.FIXED_SIZE_LIST: func(data arrow.ArrayData) arrow.Array { return NewFixedSizeListData(data) },
arrow.DURATION: func(data arrow.ArrayData) arrow.Array { return NewDurationData(data) },
arrow.LARGE_STRING: unsupportedArrayType,
arrow.LARGE_BINARY: unsupportedArrayType,
arrow.LARGE_STRING: func(data arrow.ArrayData) arrow.Array { return NewLargeStringData(data) },
arrow.LARGE_BINARY: func(data arrow.ArrayData) arrow.Array { return NewLargeBinaryData(data) },
arrow.LARGE_LIST: unsupportedArrayType,
arrow.INTERVAL: func(data arrow.ArrayData) arrow.Array { return NewIntervalData(data) },
arrow.INTERVAL_MONTH_DAY_NANO: func(data arrow.ArrayData) arrow.Array { return NewMonthDayNanoIntervalData(data) },
Expand Down
4 changes: 2 additions & 2 deletions go/arrow/array/array_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@ func TestMakeFromData(t *testing.T) {
{name: "float64", d: &testDataType{arrow.FLOAT64}},
{name: "string", d: &testDataType{arrow.STRING}, size: 3},
{name: "binary", d: &testDataType{arrow.BINARY}, size: 3},
{name: "large_string", d: &testDataType{arrow.LARGE_STRING}, size: 3},
{name: "large_binary", d: &testDataType{arrow.LARGE_BINARY}, size: 3},
{name: "fixed_size_binary", d: &testDataType{arrow.FIXED_SIZE_BINARY}},
{name: "date32", d: &testDataType{arrow.DATE32}},
{name: "date64", d: &testDataType{arrow.DATE64}},
Expand Down Expand Up @@ -114,8 +116,6 @@ func TestMakeFromData(t *testing.T) {
// unsupported types
{name: "sparse union", d: &testDataType{arrow.SPARSE_UNION}, expPanic: true, expError: "unsupported data type: SPARSE_UNION"},
{name: "dense union", d: &testDataType{arrow.DENSE_UNION}, expPanic: true, expError: "unsupported data type: DENSE_UNION"},
{name: "large string", d: &testDataType{arrow.LARGE_STRING}, expPanic: true, expError: "unsupported data type: LARGE_STRING"},
{name: "large binary", d: &testDataType{arrow.LARGE_BINARY}, expPanic: true, expError: "unsupported data type: LARGE_BINARY"},
{name: "large list", d: &testDataType{arrow.LARGE_LIST}, expPanic: true, expError: "unsupported data type: LARGE_LIST"},
{name: "decimal256", d: &testDataType{arrow.DECIMAL256}, expPanic: true, expError: "unsupported data type: DECIMAL256"},

Expand Down
129 changes: 129 additions & 0 deletions go/arrow/array/binary.go
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,135 @@ func arrayEqualBinary(left, right *Binary) bool {
return true
}

// LargeBinary is an array of variable-length byte values whose offsets are
// stored as int64 (compare the valueOffsets field below).
type LargeBinary struct {
	array
	// valueOffsets is the typed view of the offsets buffer; element i spans
	// valueBytes[valueOffsets[i]:valueOffsets[i+1]].
	valueOffsets []int64
	// valueBytes is the backing buffer that holds every value's bytes.
	valueBytes []byte
}

// NewLargeBinaryData builds a LargeBinary array on top of data.
//
// The returned array starts with a reference count of 1. data must have the
// concrete type *Data; any other implementation makes the type assertion
// panic.
func NewLargeBinaryData(data arrow.ArrayData) *LargeBinary {
	arr := new(LargeBinary)
	arr.refCount = 1
	arr.setData(data.(*Data))
	return arr
}

// Value returns the bytes of the i-th element as a sub-slice of the array's
// value buffer (no copy is made).
//
// It panics if i is negative or not less than the array length.
func (a *LargeBinary) Value(i int) []byte {
	if n := a.array.data.length; i < 0 || i >= n {
		panic("arrow/array: index out of range")
	}
	// Translate the logical index into a position in the offsets buffer.
	pos := a.array.data.offset + i
	start, stop := a.valueOffsets[pos], a.valueOffsets[pos+1]
	return a.valueBytes[start:stop]
}

// ValueString returns the i-th element as a string.
//
// The slice returned by Value is reinterpreted as a string through an unsafe
// pointer cast rather than converted, so the result aliases the array's value
// buffer instead of copying it. Bounds checking is delegated to Value.
func (a *LargeBinary) ValueString(i int) string {
	raw := a.Value(i)
	return *(*string)(unsafe.Pointer(&raw))
}

// ValueOffset returns the starting offset of the i-th element within the
// value buffer.
//
// It panics if i is negative or not less than the array length.
func (a *LargeBinary) ValueOffset(i int) int64 {
	if i >= 0 && i < a.array.data.length {
		return a.valueOffsets[a.array.data.offset+i]
	}
	panic("arrow/array: index out of range")
}

// ValueLen returns the length in bytes of the i-th element, computed as the
// difference between its end and start offsets.
//
// It panics if i is negative or not less than the array length.
func (a *LargeBinary) ValueLen(i int) int {
	if n := a.array.data.length; i < 0 || i >= n {
		panic("arrow/array: index out of range")
	}
	pos := a.array.data.offset + i
	span := a.valueOffsets[pos+1] - a.valueOffsets[pos]
	return int(span)
}

// ValueOffsets returns the offsets covering this array's elements: length+1
// entries beginning at the array's own offset. The result is a sub-slice of
// the underlying offsets, not a copy.
func (a *LargeBinary) ValueOffsets() []int64 {
	first := a.array.data.offset
	// One extra entry so the final element's end offset is included.
	return a.valueOffsets[first : first+a.array.data.length+1]
}

// ValueBytes returns the portion of the value buffer that holds this array's
// elements, from the first element's start offset up to the last element's
// end offset. The result is a sub-slice of the underlying buffer, not a copy.
func (a *LargeBinary) ValueBytes() []byte {
	first := a.array.data.offset
	last := first + a.array.data.length
	lo, hi := a.valueOffsets[first], a.valueOffsets[last]
	return a.valueBytes[lo:hi]
}

// String renders the array as a bracketed, space-separated list. Null
// elements are printed as (null); every other element is printed as a
// Go-quoted string.
func (a *LargeBinary) String() string {
	var sb strings.Builder
	sb.WriteString("[")
	for i, n := 0, a.Len(); i < n; i++ {
		if i != 0 {
			sb.WriteString(" ")
		}
		if a.IsNull(i) {
			sb.WriteString("(null)")
			continue
		}
		fmt.Fprintf(&sb, "%q", a.ValueString(i))
	}
	sb.WriteString("]")
	return sb.String()
}

// setData binds the array to data and caches typed views of its buffers.
//
// data must carry exactly three buffers, otherwise setData panics.
// buffers[1] is reinterpreted as the int64 offsets and buffers[2] supplies
// the value bytes; either may be nil, in which case the corresponding view is
// left untouched.
//
// For arrays with at least one element the offsets are validated: setData
// panics if fewer than offset+length+1 offsets are available, or if the last
// required offset points past the end of the value bytes.
func (a *LargeBinary) setData(data *Data) {
	if len(data.buffers) != 3 {
		panic("len(data.buffers) != 3")
	}

	a.array.setData(data)

	if valueData := data.buffers[2]; valueData != nil {
		a.valueBytes = valueData.Bytes()
	}

	if valueOffsets := data.buffers[1]; valueOffsets != nil {
		a.valueOffsets = arrow.Int64Traits.CastFromBytes(valueOffsets.Bytes())
	}

	// Empty arrays have nothing to validate.
	if a.array.data.length < 1 {
		return
	}

	expNumOffsets := a.array.data.offset + a.array.data.length + 1
	if len(a.valueOffsets) < expNumOffsets {
		panic(fmt.Errorf("arrow/array: large binary offset buffer must have at least %d values", expNumOffsets))
	}

	// Compare in int64: narrowing the offset to int would silently truncate
	// on platforms where int is 32 bits and could let an out-of-range offset
	// pass this check.
	if a.valueOffsets[expNumOffsets-1] > int64(len(a.valueBytes)) {
		panic("arrow/array: large binary offsets out of bounds of data buffer")
	}
}

// getOneForMarshal returns the i-th element in the form used for JSON
// marshalling: nil for a null element, otherwise the element's bytes.
func (a *LargeBinary) getOneForMarshal(i int) interface{} {
	if !a.IsNull(i) {
		return a.Value(i)
	}
	return nil
}

// MarshalJSON encodes the array as a JSON list with one entry per element,
// using getOneForMarshal to produce each entry.
func (a *LargeBinary) MarshalJSON() ([]byte, error) {
	out := make([]interface{}, a.Len())
	for idx := range out {
		out[idx] = a.getOneForMarshal(idx)
	}
	// encoding/json renders []byte values as base64-encoded strings, so no
	// explicit encoding step is needed here.
	return json.Marshal(out)
}

// arrayEqualLargeBinary reports whether every non-null element of left has
// the same bytes as the element at the same index in right.
//
// Only left's null flags are consulted and only left's length bounds the
// loop. NOTE(review): presumably the caller has already checked that both
// arrays have equal length and matching null positions — confirm.
func arrayEqualLargeBinary(left, right *LargeBinary) bool {
	for i, n := 0, left.Len(); i < n; i++ {
		if !left.IsNull(i) && !bytes.Equal(left.Value(i), right.Value(i)) {
			return false
		}
	}
	return true
}

// Compile-time checks that the binary array types satisfy arrow.Array.
var (
	_ arrow.Array = (*Binary)(nil)
	_ arrow.Array = (*LargeBinary)(nil)
)
163 changes: 163 additions & 0 deletions go/arrow/array/binary_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,48 @@ func TestBinary(t *testing.T) {
b.Release()
}

// TestLargeBinary builds a LargeBinary array through both NewLargeBinaryArray
// and the generic NewArray, checking length, null count and element values
// each time.
func TestLargeBinary(t *testing.T) {
	mem := memory.NewCheckedAllocator(memory.NewGoAllocator())
	defer mem.AssertSize(t, 0)

	bldr := NewBinaryBuilder(mem, arrow.BinaryTypes.LargeBinary)

	var (
		values = [][]byte{
			[]byte("AAA"),
			nil,
			[]byte("BBBB"),
		}
		valid = []bool{true, false, true}
	)

	// verify checks an array built from values/valid and then releases it.
	verify := func(arr *LargeBinary) {
		assert.Equal(t, 3, arr.Len())
		assert.Equal(t, 1, arr.NullN())
		assert.Equal(t, []byte("AAA"), arr.Value(0))
		assert.Equal(t, []byte{}, arr.Value(1))
		assert.Equal(t, []byte("BBBB"), arr.Value(2))
		arr.Release()
	}

	bldr.AppendValues(values, valid)

	// A balanced Retain/Release pair must leave the builder usable.
	bldr.Retain()
	bldr.Release()

	// A builder created for LargeBinary must refuse to emit a Binary array.
	assert.Panics(t, func() {
		bldr.NewBinaryArray()
	})

	verify(bldr.NewLargeBinaryArray())

	// Test builder reset and NewArray API.
	bldr.AppendValues(values, valid)
	verify(bldr.NewArray().(*LargeBinary))

	bldr.Release()
}

func TestBinarySliceData(t *testing.T) {
mem := memory.NewCheckedAllocator(memory.NewGoAllocator())
defer mem.AssertSize(t, 0)
Expand Down Expand Up @@ -336,6 +378,33 @@ func TestBinaryValueOffset(t *testing.T) {
}
}

// TestLargeBinaryValueOffset checks that ValueOffset on a sliced LargeBinary
// array reports each element's starting offset in the value buffer.
func TestLargeBinaryValueOffset(t *testing.T) {
	mem := memory.NewCheckedAllocator(memory.NewGoAllocator())
	defer mem.AssertSize(t, 0)

	const from, to = 2, 9

	values := []string{"a", "bc", "", "", "hijk", "lm", "", "opq", "", "tu"}
	valids := []bool{true, true, false, false, true, true, true, true, false, true}

	bldr := NewBinaryBuilder(mem, arrow.BinaryTypes.LargeBinary)
	defer bldr.Release()

	bldr.AppendStringValues(values, valids)

	full := bldr.NewArray().(*LargeBinary)
	defer full.Release()

	sliced := NewSlice(full, from, to).(*LargeBinary)
	defer sliced.Release()

	// The two values before the slice ("a" and "bc") occupy 3 bytes, so the
	// first sliced element starts at offset 3.
	want := 3
	for i, v := range values[from:to] {
		assert.EqualValues(t, want, sliced.ValueOffset(i))
		want += len(v)
	}
}

func TestBinaryValueLen(t *testing.T) {
mem := memory.NewCheckedAllocator(memory.NewGoAllocator())
defer mem.AssertSize(t, 0)
Expand All @@ -361,6 +430,31 @@ func TestBinaryValueLen(t *testing.T) {
}
}

// TestLargeBinaryValueLen checks that ValueLen on a sliced LargeBinary array
// matches the byte length of each source string.
func TestLargeBinaryValueLen(t *testing.T) {
	mem := memory.NewCheckedAllocator(memory.NewGoAllocator())
	defer mem.AssertSize(t, 0)

	const from, to = 2, 9

	values := []string{"a", "bc", "", "", "hijk", "lm", "", "opq", "", "tu"}
	valids := []bool{true, true, false, false, true, true, true, true, false, true}

	bldr := NewBinaryBuilder(mem, arrow.BinaryTypes.LargeBinary)
	defer bldr.Release()

	bldr.AppendStringValues(values, valids)

	full := bldr.NewArray().(*LargeBinary)
	defer full.Release()

	sliced := NewSlice(full, from, to).(*LargeBinary)
	defer sliced.Release()

	for i, v := range values[from:to] {
		assert.Equal(t, len(v), sliced.ValueLen(i))
	}
}

func TestBinaryValueOffsets(t *testing.T) {
mem := memory.NewCheckedAllocator(memory.NewGoAllocator())
defer mem.AssertSize(t, 0)
Expand All @@ -384,6 +478,29 @@ func TestBinaryValueOffsets(t *testing.T) {
assert.Equal(t, []int32{3, 3, 3, 7, 9, 9, 12, 12}, slice.ValueOffsets())
}

// TestLargeBinaryValueOffsets checks the offsets reported by ValueOffsets for
// a full LargeBinary array and for a slice of it.
func TestLargeBinaryValueOffsets(t *testing.T) {
	mem := memory.NewCheckedAllocator(memory.NewGoAllocator())
	defer mem.AssertSize(t, 0)

	values := []string{"a", "bc", "", "", "hijk", "lm", "", "opq", "", "tu"}
	valids := []bool{true, true, false, false, true, true, true, true, false, true}

	bldr := NewBinaryBuilder(mem, arrow.BinaryTypes.LargeBinary)
	defer bldr.Release()

	bldr.AppendStringValues(values, valids)

	full := bldr.NewArray().(*LargeBinary)
	defer full.Release()

	wantFull := []int64{0, 1, 3, 3, 3, 7, 9, 9, 12, 12, 14}
	assert.Equal(t, wantFull, full.ValueOffsets())

	sliced := NewSlice(full, 2, 9).(*LargeBinary)
	defer sliced.Release()

	// The slice keeps the parent's absolute offsets; they are not rebased to 0.
	wantSliced := []int64{3, 3, 3, 7, 9, 9, 12, 12}
	assert.Equal(t, wantSliced, sliced.ValueOffsets())
}

func TestBinaryValueBytes(t *testing.T) {
mem := memory.NewCheckedAllocator(memory.NewGoAllocator())
defer mem.AssertSize(t, 0)
Expand All @@ -407,6 +524,29 @@ func TestBinaryValueBytes(t *testing.T) {
assert.Equal(t, []byte{'h', 'i', 'j', 'k', 'l', 'm', 'o', 'p', 'q'}, slice.ValueBytes())
}

// TestLargeBinaryValueBytes checks the bytes reported by ValueBytes for a
// full LargeBinary array and for a slice of it.
func TestLargeBinaryValueBytes(t *testing.T) {
	mem := memory.NewCheckedAllocator(memory.NewGoAllocator())
	defer mem.AssertSize(t, 0)

	values := []string{"a", "bc", "", "", "hijk", "lm", "", "opq", "", "tu"}
	valids := []bool{true, true, false, false, true, true, true, true, false, true}

	bldr := NewBinaryBuilder(mem, arrow.BinaryTypes.LargeBinary)
	defer bldr.Release()

	bldr.AppendStringValues(values, valids)

	full := bldr.NewArray().(*LargeBinary)
	defer full.Release()

	wantFull := []byte{'a', 'b', 'c', 'h', 'i', 'j', 'k', 'l', 'm', 'o', 'p', 'q', 't', 'u'}
	assert.Equal(t, wantFull, full.ValueBytes())

	sliced := NewSlice(full, 2, 9).(*LargeBinary)
	defer sliced.Release()

	// Only the bytes belonging to elements 2 through 8 are expected.
	wantSliced := []byte{'h', 'i', 'j', 'k', 'l', 'm', 'o', 'p', 'q'}
	assert.Equal(t, wantSliced, sliced.ValueBytes())
}

func TestBinaryStringer(t *testing.T) {
mem := memory.NewCheckedAllocator(memory.NewGoAllocator())
defer mem.AssertSize(t, 0)
Expand All @@ -430,6 +570,29 @@ func TestBinaryStringer(t *testing.T) {
}
}

// TestLargeBinaryStringer checks the text produced by LargeBinary.String,
// covering nulls, empty strings and a non-ASCII value.
func TestLargeBinaryStringer(t *testing.T) {
	mem := memory.NewCheckedAllocator(memory.NewGoAllocator())
	defer mem.AssertSize(t, 0)

	const want = `["a" "bc" (null) "é" (null) "hijk" "lm" "" "opq" (null) "tu"]`

	values := []string{"a", "bc", "", "é", "", "hijk", "lm", "", "opq", "", "tu"}
	valids := []bool{true, true, false, true, false, true, true, true, true, false, true}

	bldr := NewBinaryBuilder(mem, arrow.BinaryTypes.LargeBinary)
	defer bldr.Release()

	bldr.AppendStringValues(values, valids)

	arr := bldr.NewArray().(*LargeBinary)
	defer arr.Release()

	if got := arr.String(); got != want {
		t.Fatalf("invalid stringer:\ngot= %s\nwant=%s\n", got, want)
	}
}

func TestBinaryInvalidOffsets(t *testing.T) {
const expectedPanic = "arrow/array: binary offsets out of bounds of data buffer"

Expand Down
Loading

0 comments on commit 4b514ed

Please sign in to comment.