From 519258fe5dd6fbd0bf9f94695867e3db99d2767d Mon Sep 17 00:00:00 2001 From: James Duong Date: Fri, 13 Oct 2023 17:49:06 -0700 Subject: [PATCH] GH-38254: [Java] Add reusable buffer getters to char/binary vectors Add a reusable buffer interface that can be populated by character and binary vectors to avoid allocations when consuming vector content. Optimize getObject() on VarCharVector/LargeVarCharVector to avoid an extra allocation of a byte array (copy from ArrowBuf directly to the resulting Text). --- .../apache/arrow/memory/ReusableBuffer.java | 47 ++++++++++ .../arrow/vector/FixedSizeBinaryVector.java | 13 +++ .../arrow/vector/LargeVarBinaryVector.java | 15 ++++ .../arrow/vector/LargeVarCharVector.java | 27 +++++- .../apache/arrow/vector/VarBinaryVector.java | 15 ++++ .../apache/arrow/vector/VarCharVector.java | 25 +++++- .../arrow/vector/util/ReusableByteArray.java | 80 +++++++++++++++++ .../org/apache/arrow/vector/util/Text.java | 37 +------- .../vector/TestFixedSizeBinaryVector.java | 16 ++++ .../vector/TestLargeVarBinaryVector.java | 75 +++++++++++----- .../arrow/vector/TestLargeVarCharVector.java | 21 +++++ .../apache/arrow/vector/TestValueVector.java | 44 ++++++++++ .../vector/util/TestReusableByteArray.java | 87 +++++++++++++++++++ 13 files changed, 436 insertions(+), 66 deletions(-) create mode 100644 java/memory/memory-core/src/main/java/org/apache/arrow/memory/ReusableBuffer.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/util/ReusableByteArray.java create mode 100644 java/vector/src/test/java/org/apache/arrow/vector/util/TestReusableByteArray.java diff --git a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/ReusableBuffer.java b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/ReusableBuffer.java new file mode 100644 index 0000000000000..053c365cd1b52 --- /dev/null +++ b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/ReusableBuffer.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.memory; + +/** + * A lightweight, automatically expanding container for holding byte data. + * @param The type of the underlying buffer. + */ +public interface ReusableBuffer { + /** + * Get the number of valid bytes in the data. + * + * @return the number of valid bytes in the data + */ + int getLength(); + + /** + * Get the buffer backing this ReusableBuffer. + */ + T getBuffer(); + + /** + * Set the buffer to the contents of the given ArrowBuf. + * The internal buffer must resize if it cannot fit the contents + * of the data. + * + * @param srcBytes the data to copy from + * @param start the first position of the new data + * @param len the number of bytes of the new data + */ + void set(ArrowBuf srcBytes, long start, int len); +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/FixedSizeBinaryVector.java b/java/vector/src/main/java/org/apache/arrow/vector/FixedSizeBinaryVector.java index 967d560d78dea..a09cd9865ec47 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/FixedSizeBinaryVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/FixedSizeBinaryVector.java @@ -21,6 +21,7 @@ import org.apache.arrow.memory.ArrowBuf; import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.ReusableBuffer; import org.apache.arrow.util.Preconditions; import org.apache.arrow.vector.complex.impl.FixedSizeBinaryReaderImpl; import org.apache.arrow.vector.complex.reader.FieldReader; @@ -116,6 +117,18 @@ public byte[] get(int index) { return dst; } + /** + * Read the value at the given position to the given output buffer. + * The caller is responsible for checking for nullity first. + * + * @param index position of element. + * @param outputBuffer the buffer to write into. + */ + public void read(int index, ReusableBuffer outputBuffer) { + final int startOffset = index * byteWidth; + outputBuffer.set(valueBuffer, startOffset, byteWidth); + } + /** * Get the element at the given index from the vector and * sets the state in holder. If element at given index diff --git a/java/vector/src/main/java/org/apache/arrow/vector/LargeVarBinaryVector.java b/java/vector/src/main/java/org/apache/arrow/vector/LargeVarBinaryVector.java index 0063a61da570a..d50370178801c 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/LargeVarBinaryVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/LargeVarBinaryVector.java @@ -18,6 +18,7 @@ package org.apache.arrow.vector; import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.ReusableBuffer; import org.apache.arrow.vector.complex.impl.LargeVarBinaryReaderImpl; import org.apache.arrow.vector.complex.reader.FieldReader; import org.apache.arrow.vector.holders.LargeVarBinaryHolder; @@ -112,6 +113,20 @@ public byte[] get(int index) { return result; } + /** + * Read the value at the given position to the given output buffer. + * The caller is responsible for checking for nullity first. + * + * @param index position of element. + * @param outputBuffer the buffer to write into. + */ + public void read(int index, ReusableBuffer outputBuffer) { + final long startOffset = getStartOffset(index); + final int dataLength = + (int) (offsetBuffer.getLong((long) (index + 1) * OFFSET_WIDTH) - startOffset); + outputBuffer.set(valueBuffer, startOffset, dataLength); + } + /** * Get the variable length element at specified index as Text. * diff --git a/java/vector/src/main/java/org/apache/arrow/vector/LargeVarCharVector.java b/java/vector/src/main/java/org/apache/arrow/vector/LargeVarCharVector.java index e9472c9f2c71e..cf0d97e814f42 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/LargeVarCharVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/LargeVarCharVector.java @@ -17,7 +17,10 @@ package org.apache.arrow.vector; +import static org.apache.arrow.vector.NullCheckingForGet.NULL_CHECKING_ENABLED; + import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.ReusableBuffer; import org.apache.arrow.vector.complex.impl.LargeVarCharReaderImpl; import org.apache.arrow.vector.complex.reader.FieldReader; import org.apache.arrow.vector.holders.LargeVarCharHolder; @@ -120,12 +123,28 @@ public byte[] get(int index) { * @return Text object for non-null element, null otherwise */ public Text getObject(int index) { - byte[] b = get(index); - if (b == null) { + assert index >= 0; + if (NULL_CHECKING_ENABLED && isSet(index) == 0) { return null; - } else { - return new Text(b); } + + final Text result = new Text(); + read(index, result); + return result; + } + + /** + * Read the value at the given position to the given output buffer. + * The caller is responsible for checking for nullity first. + * + * @param index position of element. + * @param outputBuffer the buffer to write into. + */ + public void read(int index, ReusableBuffer outputBuffer) { + final long startOffset = getStartOffset(index); + final int dataLength = + (int) (offsetBuffer.getLong((long) (index + 1) * OFFSET_WIDTH) - startOffset); + outputBuffer.set(valueBuffer, startOffset, dataLength); } /** diff --git a/java/vector/src/main/java/org/apache/arrow/vector/VarBinaryVector.java b/java/vector/src/main/java/org/apache/arrow/vector/VarBinaryVector.java index 34e072aaa8324..a5f1e67f12c6d 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/VarBinaryVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/VarBinaryVector.java @@ -20,6 +20,7 @@ import static org.apache.arrow.vector.NullCheckingForGet.NULL_CHECKING_ENABLED; import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.ReusableBuffer; import org.apache.arrow.vector.complex.impl.VarBinaryReaderImpl; import org.apache.arrow.vector.complex.reader.FieldReader; import org.apache.arrow.vector.holders.NullableVarBinaryHolder; @@ -113,6 +114,20 @@ public byte[] get(int index) { return result; } + /** + * Read the value at the given position to the given output buffer. + * The caller is responsible for checking for nullity first. + * + * @param index position of element. + * @param outputBuffer the buffer to write into. + */ + public void read(int index, ReusableBuffer outputBuffer) { + final int startOffset = getStartOffset(index); + final int dataLength = + offsetBuffer.getInt((long) (index + 1) * OFFSET_WIDTH) - startOffset; + outputBuffer.set(valueBuffer, startOffset, dataLength); + } + /** * Get the variable length element at specified index as Text. * diff --git a/java/vector/src/main/java/org/apache/arrow/vector/VarCharVector.java b/java/vector/src/main/java/org/apache/arrow/vector/VarCharVector.java index 2c83893819a1e..eaeb6e5121cb0 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/VarCharVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/VarCharVector.java @@ -20,6 +20,7 @@ import static org.apache.arrow.vector.NullCheckingForGet.NULL_CHECKING_ENABLED; import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.ReusableBuffer; import org.apache.arrow.vector.complex.impl.VarCharReaderImpl; import org.apache.arrow.vector.complex.reader.FieldReader; import org.apache.arrow.vector.holders.NullableVarCharHolder; @@ -119,12 +120,28 @@ public byte[] get(int index) { * @return Text object for non-null element, null otherwise */ public Text getObject(int index) { - byte[] b = get(index); - if (b == null) { + assert index >= 0; + if (NULL_CHECKING_ENABLED && isSet(index) == 0) { return null; - } else { - return new Text(b); } + + final Text result = new Text(); + read(index, result); + return result; + } + + /** + * Read the value at the given position to the given output buffer. + * The caller is responsible for checking for nullity first. + * + * @param index position of element. + * @param outputBuffer the buffer to write into. + */ + public void read(int index, ReusableBuffer outputBuffer) { + final int startOffset = getStartOffset(index); + final int dataLength = + offsetBuffer.getInt((long) (index + 1) * OFFSET_WIDTH) - startOffset; + outputBuffer.set(valueBuffer, startOffset, dataLength); } /** diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/ReusableByteArray.java b/java/vector/src/main/java/org/apache/arrow/vector/util/ReusableByteArray.java new file mode 100644 index 0000000000000..7c107bcaa590b --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/ReusableByteArray.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.util; + +import java.util.Arrays; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.ReusableBuffer; + +/** + * A wrapper around byte arrays for repeated writing. + */ +public class ReusableByteArray implements ReusableBuffer { + + protected static final byte[] EMPTY_BYTES = new byte[0]; + + protected byte[] bytes; + protected int length; + + public ReusableByteArray() { + bytes = EMPTY_BYTES; + } + + /** + * Get the number of bytes in the byte array. + * + * @return the number of bytes in the byte array + */ + @Override + public int getLength() { + return length; + } + + @Override + public byte[] getBuffer() { + return bytes; + } + + @Override + public void set(ArrowBuf srcBytes, long start, int len) { + setCapacity(len, false); + srcBytes.getBytes(start, bytes, 0, len); + length = len; + } + + /** + * Sets the capacity of this object to at least len bytes. If the + * current buffer is longer, then the capacity and existing content of the buffer are unchanged. + * If len is larger than the current capacity, the Text object's capacity is + * increased to match. + * + * @param len the number of bytes we need + * @param keepData should the old data be kept + */ + protected void setCapacity(int len, boolean keepData) { + if (bytes == null || bytes.length < len) { + if (bytes != null && keepData) { + bytes = Arrays.copyOf(bytes, Math.max(len, length << 1)); + } else { + bytes = new byte[len]; + } + } + } + +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/Text.java b/java/vector/src/main/java/org/apache/arrow/vector/util/Text.java index 778af0ca956df..06c72e11364c7 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/Text.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/Text.java @@ -29,7 +29,6 @@ import java.nio.charset.MalformedInputException; import java.text.CharacterIterator; import java.text.StringCharacterIterator; -import java.util.Arrays; import java.util.Optional; import com.fasterxml.jackson.core.JsonGenerationException; @@ -43,7 +42,7 @@ * Lifted from Hadoop 2.7.1 */ @JsonSerialize(using = Text.TextSerializer.class) -public class Text { +public class Text extends ReusableByteArray { private static ThreadLocal ENCODER_FACTORY = new ThreadLocal() { @@ -65,13 +64,9 @@ protected CharsetDecoder initialValue() { } }; - private static final byte[] EMPTY_BYTES = new byte[0]; - - private byte[] bytes; - private int length; public Text() { - bytes = EMPTY_BYTES; + super(); } /** @@ -123,15 +118,6 @@ public byte[] getBytes() { return bytes; } - /** - * Get the number of bytes in the byte array. - * - * @return the number of bytes in the byte array - */ - public int getLength() { - return length; - } - /** * Returns the Unicode Scalar Value (32-bit integer value) for the character at * position. Note that this method avoids using the converter or doing String @@ -278,25 +264,6 @@ public void clear() { length = 0; } - /** - * Sets the capacity of this Text object to at least len bytes. If the - * current buffer is longer, then the capacity and existing content of the buffer are unchanged. - * If len is larger than the current capacity, the Text object's capacity is - * increased to match. - * - * @param len the number of bytes we need - * @param keepData should the old data be kept - */ - private void setCapacity(int len, boolean keepData) { - if (bytes == null || bytes.length < len) { - if (bytes != null && keepData) { - bytes = Arrays.copyOf(bytes, Math.max(len, length << 1)); - } else { - bytes = new byte[len]; - } - } - } - @Override public String toString() { try { diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestFixedSizeBinaryVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestFixedSizeBinaryVector.java index c413f4e23ebc3..b9cd89e4ad731 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestFixedSizeBinaryVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestFixedSizeBinaryVector.java @@ -24,6 +24,7 @@ import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.vector.holders.FixedSizeBinaryHolder; import org.apache.arrow.vector.holders.NullableFixedSizeBinaryHolder; +import org.apache.arrow.vector.util.ReusableByteArray; import org.apache.arrow.vector.util.TransferPair; import org.junit.After; import org.junit.Before; @@ -286,4 +287,19 @@ public void testGetTransferPairWithField() { // Field inside a new vector created by reusing a field should be the same in memory as the original field. assertSame(fromVector.getField(), toVector.getField()); } + + @Test + public void testGetBytesRepeatedly() { + for (int i = 0; i < numValues; i++) { + vector.set(i, values[i]); + } + vector.setValueCount(numValues); + + ReusableByteArray reusableByteArray = new ReusableByteArray(); + for (int i = 0; i < numValues; i++) { + // verify results + vector.read(i, reusableByteArray); + assertArrayEquals(values[i], reusableByteArray.getBuffer()); + } + } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestLargeVarBinaryVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestLargeVarBinaryVector.java index 644827ce995e8..54de774b10582 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestLargeVarBinaryVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestLargeVarBinaryVector.java @@ -17,13 +17,18 @@ package org.apache.arrow.vector; +import static org.junit.Assert.assertArrayEquals; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertSame; import static org.junit.Assert.assertTrue; +import java.util.Arrays; + import org.apache.arrow.memory.ArrowBuf; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; import org.apache.arrow.vector.holders.NullableLargeVarBinaryHolder; +import org.apache.arrow.vector.util.ReusableByteArray; import org.junit.After; import org.junit.Before; import org.junit.Test; @@ -54,21 +59,20 @@ public void testSetNullableLargeVarBinaryHolder() { binHolder.isSet = 1; String str = "hello"; - ArrowBuf buf = allocator.buffer(16); - buf.setBytes(0, str.getBytes()); - - binHolder.start = 0; - binHolder.end = str.length(); - binHolder.buffer = buf; + try (ArrowBuf buf = allocator.buffer(16)) { + buf.setBytes(0, str.getBytes()); - vector.set(0, nullHolder); - vector.set(1, binHolder); + binHolder.start = 0; + binHolder.end = str.length(); + binHolder.buffer = buf; - // verify results - assertTrue(vector.isNull(0)); - assertEquals(str, new String(vector.get(1))); + vector.set(0, nullHolder); + vector.set(1, binHolder); - buf.close(); + // verify results + assertTrue(vector.isNull(0)); + assertEquals(str, new String(vector.get(1))); + } } } @@ -84,21 +88,46 @@ public void testSetNullableLargeVarBinaryHolderSafe() { binHolder.isSet = 1; String str = "hello world"; - ArrowBuf buf = allocator.buffer(16); - buf.setBytes(0, str.getBytes()); + try (ArrowBuf buf = allocator.buffer(16)) { + buf.setBytes(0, str.getBytes()); - binHolder.start = 0; - binHolder.end = str.length(); - binHolder.buffer = buf; + binHolder.start = 0; + binHolder.end = str.length(); + binHolder.buffer = buf; - vector.setSafe(0, binHolder); - vector.setSafe(1, nullHolder); + vector.setSafe(0, binHolder); + vector.setSafe(1, nullHolder); - // verify results - assertEquals(str, new String(vector.get(0))); - assertTrue(vector.isNull(1)); + // verify results + assertEquals(str, new String(vector.get(0))); + assertTrue(vector.isNull(1)); + } + } + } - buf.close(); + @Test + public void testGetBytesRepeatedly() { + try (LargeVarBinaryVector vector = new LargeVarBinaryVector("", allocator)) { + vector.allocateNew(5, 1); + + final String str = "hello world"; + final String str2 = "foo"; + vector.setSafe(0, str.getBytes()); + vector.setSafe(1, str2.getBytes()); + + // verify results + ReusableByteArray reusableByteArray = new ReusableByteArray(); + vector.read(0, reusableByteArray); + byte[] oldBuffer = reusableByteArray.getBuffer(); + assertArrayEquals(str.getBytes(), Arrays.copyOfRange(reusableByteArray.getBuffer(), + 0, reusableByteArray.getLength())); + + vector.read(1, reusableByteArray); + assertArrayEquals(str2.getBytes(), Arrays.copyOfRange(reusableByteArray.getBuffer(), + 0, reusableByteArray.getLength())); + + // There should not have been any reallocation since the newer value is smaller in length. + assertSame(oldBuffer, reusableByteArray.getBuffer()); } } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestLargeVarCharVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestLargeVarCharVector.java index 1b81c6b209fbb..47635d2ff60a8 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestLargeVarCharVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestLargeVarCharVector.java @@ -37,6 +37,7 @@ import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.Schema; import org.apache.arrow.vector.util.OversizedAllocationException; +import org.apache.arrow.vector.util.Text; import org.apache.arrow.vector.util.TransferPair; import org.junit.After; import org.junit.Assert; @@ -794,6 +795,26 @@ public void testNullableType() { } } + @Test + public void testGetTextRepeatedly() { + try (final LargeVarCharVector vector = new LargeVarCharVector("myvector", allocator)) { + + ValueVectorDataPopulator.setVector(vector, STR1, STR2); + vector.setValueCount(2); + + /* check the vector output */ + Text text = new Text(); + vector.read(0, text); + byte[] result = new byte[text.getLength()]; + System.arraycopy(text.getBytes(), 0, result, 0, text.getLength()); + assertArrayEquals(STR1, result); + vector.read(1, text); + result = new byte[text.getLength()]; + System.arraycopy(text.getBytes(), 0, result, 0, text.getLength()); + assertArrayEquals(STR2, text.getBytes()); + } + } + private void populateLargeVarcharVector(final LargeVarCharVector vector, int valueCount, String[] values) { for (int i = 0; i < valueCount; i += 3) { final String s = String.format("%010d", i); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java index 0928d3eb03082..51a45a33f128e 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java @@ -25,6 +25,7 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertSame; import static org.junit.Assert.assertTrue; import java.nio.ByteBuffer; @@ -56,6 +57,7 @@ import org.apache.arrow.vector.holders.NullableVarBinaryHolder; import org.apache.arrow.vector.holders.NullableVarCharHolder; import org.apache.arrow.vector.ipc.message.ArrowRecordBatch; +import org.apache.arrow.vector.testing.ValueVectorDataPopulator; import org.apache.arrow.vector.types.Types; import org.apache.arrow.vector.types.Types.MinorType; import org.apache.arrow.vector.types.pojo.ArrowType; @@ -63,6 +65,7 @@ import org.apache.arrow.vector.types.pojo.FieldType; import org.apache.arrow.vector.types.pojo.Schema; import org.apache.arrow.vector.util.OversizedAllocationException; +import org.apache.arrow.vector.util.ReusableByteArray; import org.apache.arrow.vector.util.Text; import org.apache.arrow.vector.util.TransferPair; import org.junit.After; @@ -1107,6 +1110,22 @@ public void testNullableVarType1() { } } + @Test + public void testGetTextRepeatedly() { + try (final VarCharVector vector = new VarCharVector("myvector", allocator)) { + + ValueVectorDataPopulator.setVector(vector, STR1, STR2); + vector.setValueCount(2); + + /* check the vector output */ + Text text = new Text(); + vector.read(0, text); + assertArrayEquals(STR1, text.getBytes()); + vector.read(1, text); + assertArrayEquals(STR2, text.getBytes()); + } + } + @Test /* VarBinaryVector */ public void testNullableVarType2() { @@ -1156,6 +1175,31 @@ public void testReallocateCheckSuccess() { } } + @Test + public void testGetBytesRepeatedly() { + try (VarBinaryVector vector = new VarBinaryVector("", allocator)) { + vector.allocateNew(5, 1); + + final String str = "hello world"; + final String str2 = "foo"; + vector.setSafe(0, str.getBytes()); + vector.setSafe(1, str2.getBytes()); + + // verify results + ReusableByteArray reusableByteArray = new ReusableByteArray(); + vector.read(0, reusableByteArray); + assertArrayEquals(str.getBytes(), Arrays.copyOfRange(reusableByteArray.getBuffer(), + 0, reusableByteArray.getLength())); + byte[] oldBuffer = reusableByteArray.getBuffer(); + + vector.read(1, reusableByteArray); + assertArrayEquals(str2.getBytes(), Arrays.copyOfRange(reusableByteArray.getBuffer(), + 0, reusableByteArray.getLength())); + + // There should not have been any reallocation since the newer value is smaller in length. + assertSame(oldBuffer, reusableByteArray.getBuffer()); + } + } /* * generic tests diff --git a/java/vector/src/test/java/org/apache/arrow/vector/util/TestReusableByteArray.java b/java/vector/src/test/java/org/apache/arrow/vector/util/TestReusableByteArray.java new file mode 100644 index 0000000000000..ed03548f2ec81 --- /dev/null +++ b/java/vector/src/test/java/org/apache/arrow/vector/util/TestReusableByteArray.java @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.util; + +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotSame; +import static org.junit.Assert.assertSame; +import static org.junit.Assert.assertTrue; + +import java.util.Arrays; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.BaseValueVector; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +public class TestReusableByteArray { + + private BufferAllocator allocator; + + @Before + public void prepare() { + // Permit allocating 4 vectors of max size. + allocator = new RootAllocator(4 * BaseValueVector.MAX_ALLOCATION_SIZE); + } + + @After + public void shutdown() { + allocator.close(); + } + + @Test + public void testSetByteArrayRepeatedly() { + ReusableByteArray byteArray = new ReusableByteArray(); + try (ArrowBuf workingBuf = allocator.buffer(100)) { + final String str = "test"; + workingBuf.setBytes(0, str.getBytes()); + byteArray.set(workingBuf, 0, str.getBytes().length); + assertEquals(str.getBytes().length, byteArray.getLength()); + assertArrayEquals(str.getBytes(), Arrays.copyOfRange(byteArray.getBuffer(), 0, byteArray.getLength())); + + // Test a longer string. Should require reallocation. + final String str2 = "test_longer"; + byte[] oldBuffer = byteArray.getBuffer(); + workingBuf.clear(); + workingBuf.setBytes(0, str2.getBytes()); + byteArray.set(workingBuf, 0, str2.getBytes().length); + assertEquals(str2.getBytes().length, byteArray.getLength()); + assertArrayEquals(str2.getBytes(), Arrays.copyOfRange(byteArray.getBuffer(), 0, byteArray.getLength())); + + // Verify reallocation needed. + assertNotSame(oldBuffer, byteArray.getBuffer()); + assertTrue(byteArray.getBuffer().length > oldBuffer.length); + + // Test writing a shorter string. Should not require reallocation. + final String str3 = "short"; + oldBuffer = byteArray.getBuffer(); + workingBuf.clear(); + workingBuf.setBytes(0, str3.getBytes()); + byteArray.set(workingBuf, 0, str3.getBytes().length); + assertEquals(str3.getBytes().length, byteArray.getLength()); + assertArrayEquals(str3.getBytes(), Arrays.copyOfRange(byteArray.getBuffer(), 0, byteArray.getLength())); + + // Verify reallocation was not needed. + assertSame(oldBuffer, byteArray.getBuffer()); + } + } +}