Skip to content

Commit

Permalink
ARROW-5920: [Java] Support sort & compare for all variable width vectors
Browse files Browse the repository at this point in the history
All types of variable-width vectors can reuse the same comparator for sorting & searching.

Author: liyafan82 <[email protected]>

Closes apache#4860 from liyafan82/fly_0712_varsort and squashes the following commits:

cbd8c3f <liyafan82>   Provide a utility to create the default comparator
46d2c11 <liyafan82>  Support sort & compare for all variable width vectors
  • Loading branch information
liyafan82 authored and pprudhvi committed Aug 11, 2019
1 parent 716e9e1 commit 8216643
Show file tree
Hide file tree
Showing 5 changed files with 63 additions and 25 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -17,20 +17,51 @@

package org.apache.arrow.algorithm.sort;

import static org.apache.arrow.vector.BaseVariableWidthVector.OFFSET_WIDTH;

import org.apache.arrow.vector.BaseFixedWidthVector;
import org.apache.arrow.vector.BaseVariableWidthVector;
import org.apache.arrow.vector.BigIntVector;
import org.apache.arrow.vector.Float4Vector;
import org.apache.arrow.vector.Float8Vector;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.vector.SmallIntVector;
import org.apache.arrow.vector.TinyIntVector;
import org.apache.arrow.vector.VarCharVector;
import org.apache.arrow.vector.holders.NullableVarCharHolder;
import org.apache.arrow.vector.ValueVector;

/**
* Default comparator implementations for different types of vectors.
*/
public class DefaultVectorComparators {

/**
* Create the default comparator for the vector.
* @param vector the vector.
* @param <T> the vector type.
* @return the default comparator.
*/
public static <T extends ValueVector> VectorValueComparator<T> createDefaultComparator(T vector) {
if (vector instanceof BaseFixedWidthVector) {
if (vector instanceof TinyIntVector) {
return (VectorValueComparator<T>) new ByteComparator();
} else if (vector instanceof SmallIntVector) {
return (VectorValueComparator<T>) new ShortComparator();
} else if (vector instanceof IntVector) {
return (VectorValueComparator<T>) new IntComparator();
} else if (vector instanceof BigIntVector) {
return (VectorValueComparator<T>) new LongComparator();
} else if (vector instanceof Float4Vector) {
return (VectorValueComparator<T>) new Float4Comparator();
} else if (vector instanceof Float8Vector) {
return (VectorValueComparator<T>) new Float8Comparator();
}
} else if (vector instanceof BaseVariableWidthVector) {
return (VectorValueComparator<T>) new VariableWidthComparator();
}

throw new IllegalArgumentException("No default comparator for " + vector.getClass().getCanonicalName());
}

/**
* Default comparator for bytes.
* The comparison is based on values, with null comes first.
Expand Down Expand Up @@ -169,26 +200,26 @@ public int compareNotNull(int index1, int index2) {
}

/**
* Default comparator for varchars.
* Default comparator for {@link org.apache.arrow.vector.BaseVariableWidthVector}.
* The comparison is in lexicographic order, with null comes first.
*/
public static class VarCharComparator extends VectorValueComparator<VarCharVector> {

private NullableVarCharHolder holder1 = new NullableVarCharHolder();
private NullableVarCharHolder holder2 = new NullableVarCharHolder();
public static class VariableWidthComparator extends VectorValueComparator<BaseVariableWidthVector> {

@Override
public int compareNotNull(int index1, int index2) {
vector1.get(index1, holder1);
vector2.get(index2, holder2);
int start1 = vector1.getOffsetBuffer().getInt(index1 * OFFSET_WIDTH);
int start2 = vector2.getOffsetBuffer().getInt(index2 * OFFSET_WIDTH);

int end1 = vector1.getOffsetBuffer().getInt((index1 + 1) * OFFSET_WIDTH);
int end2 = vector2.getOffsetBuffer().getInt((index2 + 1) * OFFSET_WIDTH);

int length1 = holder1.end - holder1.start;
int length2 = holder2.end - holder2.start;
int length1 = end1 - start1;
int length2 = end2 - start2;

int minLength = length1 < length2 ? length1 : length2;
for (int i = 0; i < minLength; i++) {
byte b1 = holder1.buffer.getByte(holder1.start + i);
byte b2 = holder2.buffer.getByte(holder2.start + i);
byte b1 = vector1.getDataBuffer().getByte(start1 + i);
byte b2 = vector2.getDataBuffer().getByte(start2 + i);

if (b1 != b2) {
return b1 - b2;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import org.apache.arrow.algorithm.sort.VectorValueComparator;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.BaseVariableWidthVector;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.vector.VarCharVector;
import org.junit.After;
Expand Down Expand Up @@ -68,7 +69,8 @@ public void testBinarySearchInt() {
negVector.set(0, -333);

// do search
VectorValueComparator<IntVector> comparator = new DefaultVectorComparators.IntComparator();
VectorValueComparator<IntVector> comparator =
DefaultVectorComparators.createDefaultComparator(rawVector);
for (int i = 0; i < VECTOR_LENGTH; i++) {
int result = VectorSearcher.binarySearch(rawVector, comparator, rawVector, i);
assertEquals(i, result);
Expand Down Expand Up @@ -99,7 +101,8 @@ public void testLinearSearchInt() {
negVector.set(0, -333);

// do search
VectorValueComparator<IntVector> comparator = new DefaultVectorComparators.IntComparator();
VectorValueComparator<IntVector> comparator =
DefaultVectorComparators.createDefaultComparator(rawVector);
for (int i = 0; i < VECTOR_LENGTH; i++) {
int result = VectorSearcher.linearSearch(rawVector, comparator, rawVector, i);
assertEquals(i, result);
Expand Down Expand Up @@ -137,7 +140,8 @@ public void testBinarySearchVarChar() {
negVector.set(0, "abcd".getBytes());

// do search
VectorValueComparator<VarCharVector> comparator = new DefaultVectorComparators.VarCharComparator();
VectorValueComparator<BaseVariableWidthVector> comparator =
DefaultVectorComparators.createDefaultComparator(rawVector);
for (int i = 0; i < VECTOR_LENGTH; i++) {
int result = VectorSearcher.binarySearch(rawVector, comparator, rawVector, i);
assertEquals(i, result);
Expand Down Expand Up @@ -175,7 +179,8 @@ public void testLinearSearchVarChar() {
negVector.set(0, "abcd".getBytes());

// do search
VectorValueComparator<VarCharVector> comparator = new DefaultVectorComparators.VarCharComparator();
VectorValueComparator<BaseVariableWidthVector> comparator =
DefaultVectorComparators.createDefaultComparator(rawVector);
for (int i = 0; i < VECTOR_LENGTH; i++) {
int result = VectorSearcher.linearSearch(rawVector, comparator, rawVector, i);
assertEquals(i, result);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ public void testSortInt() {

// sort the vector
FixedWidthInPlaceVectorSorter sorter = new FixedWidthInPlaceVectorSorter();
DefaultVectorComparators.IntComparator comparator = new DefaultVectorComparators.IntComparator();
VectorValueComparator<IntVector> comparator = DefaultVectorComparators.createDefaultComparator(vec);

sorter.sortInPlace(vec, comparator);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ public void testSortByte() {

// sort the vector
FixedWidthOutOfPlaceVectorSorter sorter = new FixedWidthOutOfPlaceVectorSorter();
DefaultVectorComparators.ByteComparator comparator = new DefaultVectorComparators.ByteComparator();
VectorValueComparator<TinyIntVector> comparator = DefaultVectorComparators.createDefaultComparator(vec);

TinyIntVector sortedVec =
(TinyIntVector) vec.getField().getFieldType().createNewSingleVector("", allocator, null);
Expand Down Expand Up @@ -117,7 +117,7 @@ public void testSortShort() {

// sort the vector
FixedWidthOutOfPlaceVectorSorter sorter = new FixedWidthOutOfPlaceVectorSorter();
DefaultVectorComparators.ShortComparator comparator = new DefaultVectorComparators.ShortComparator();
VectorValueComparator<SmallIntVector> comparator = DefaultVectorComparators.createDefaultComparator(vec);

SmallIntVector sortedVec =
(SmallIntVector) vec.getField().getFieldType().createNewSingleVector("", allocator, null);
Expand Down Expand Up @@ -164,7 +164,7 @@ public void testSortInt() {

// sort the vector
FixedWidthOutOfPlaceVectorSorter sorter = new FixedWidthOutOfPlaceVectorSorter();
DefaultVectorComparators.IntComparator comparator = new DefaultVectorComparators.IntComparator();
VectorValueComparator<IntVector> comparator = DefaultVectorComparators.createDefaultComparator(vec);

IntVector sortedVec = (IntVector) vec.getField().getFieldType().createNewSingleVector("", allocator, null);
sortedVec.allocateNew(vec.getValueCount());
Expand Down Expand Up @@ -210,7 +210,7 @@ public void testSortLong() {

// sort the vector
FixedWidthOutOfPlaceVectorSorter sorter = new FixedWidthOutOfPlaceVectorSorter();
DefaultVectorComparators.LongComparator comparator = new DefaultVectorComparators.LongComparator();
VectorValueComparator<BigIntVector> comparator = DefaultVectorComparators.createDefaultComparator(vec);

BigIntVector sortedVec = (BigIntVector) vec.getField().getFieldType().createNewSingleVector("", allocator, null);
sortedVec.allocateNew(vec.getValueCount());
Expand Down Expand Up @@ -256,7 +256,7 @@ public void testSortFloat() {

// sort the vector
FixedWidthOutOfPlaceVectorSorter sorter = new FixedWidthOutOfPlaceVectorSorter();
DefaultVectorComparators.Float4Comparator comparator = new DefaultVectorComparators.Float4Comparator();
VectorValueComparator<Float4Vector> comparator = DefaultVectorComparators.createDefaultComparator(vec);

Float4Vector sortedVec = (Float4Vector) vec.getField().getFieldType().createNewSingleVector("", allocator, null);
sortedVec.allocateNew(vec.getValueCount());
Expand Down Expand Up @@ -302,7 +302,7 @@ public void testSortDobule() {

// sort the vector
FixedWidthOutOfPlaceVectorSorter sorter = new FixedWidthOutOfPlaceVectorSorter();
DefaultVectorComparators.Float8Comparator comparator = new DefaultVectorComparators.Float8Comparator();
VectorValueComparator<Float8Vector> comparator = DefaultVectorComparators.createDefaultComparator(vec);

Float8Vector sortedVec = (Float8Vector) vec.getField().getFieldType().createNewSingleVector("", allocator, null);
sortedVec.allocateNew(vec.getValueCount());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@

import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.BaseVariableWidthVector;
import org.apache.arrow.vector.VarCharVector;
import org.junit.After;
import org.junit.Assert;
Expand Down Expand Up @@ -65,7 +66,8 @@ public void testSortString() {

// sort the vector
VariableWidthOutOfPlaceVectorSorter sorter = new VariableWidthOutOfPlaceVectorSorter();
DefaultVectorComparators.VarCharComparator comparator = new DefaultVectorComparators.VarCharComparator();
VectorValueComparator<BaseVariableWidthVector> comparator =
DefaultVectorComparators.createDefaultComparator(vec);

VarCharVector sortedVec =
(VarCharVector) vec.getField().getFieldType().createNewSingleVector("", allocator, null);
Expand Down

0 comments on commit 8216643

Please sign in to comment.