Skip to content

Commit

Permalink
Rewrite CollectionUtils dedup to work with any type (#85352)
Browse files Browse the repository at this point in the history
CollectionUtils contains a method for sorting and deduplicating an hppc
list of byte arrays. That is a very specific type. Yet the algorithm for
deduplicating a sorted list is very simple and does not need to be
specially typed.

This commit removes the sort portion of the method, as sorting is already
easily available (timsort in Java should be just fine for this purpose;
we don't need introsort), renames the method to uniquify, and makes it
take a generic List along with a Comparator.

relates #84735
  • Loading branch information
rjernst authored Mar 28, 2022
1 parent 9e588de commit 31918fb
Show file tree
Hide file tree
Showing 3 changed files with 62 additions and 71 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,6 @@

package org.elasticsearch.common.util;

import com.carrotsearch.hppc.ObjectArrayList;

import org.apache.lucene.util.IntroSorter;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.collect.Iterators;

Expand All @@ -20,8 +17,10 @@
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.IdentityHashMap;
import java.util.List;
import java.util.ListIterator;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
Expand All @@ -42,6 +41,33 @@ public static boolean isEmpty(Object[] array) {
return array == null || array.length == 0;
}

/**
 * Eliminate duplicates from a sorted list in-place.
 * <p>
 * Two elements are treated as duplicates when {@code cmp} compares them as equal (returns 0).
 * The first element of every run of equal elements is kept and the remaining ones are dropped.
 * Lists with zero or one element are returned untouched. The list must support
 * {@link ListIterator#set(Object)} and removal through {@code subList(...).clear()}.
 *
 * @param list A sorted list, which will be modified in place.
 * @param cmp A comparator the list is already sorted by.
 * @param <T> The element type of the list.
 */
public static <T> void uniquify(List<T> list, Comparator<T> cmp) {
    if (list.size() <= 1) {
        return;
    }

    // uniqueItr marks the end of the compacted, duplicate-free prefix;
    // existingItr scans every element of the original list.
    ListIterator<T> uniqueItr = list.listIterator();
    ListIterator<T> existingItr = list.listIterator();
    T uniqueValue = uniqueItr.next(); // get first element to compare with
    existingItr.next(); // advance the existing iterator to the second element, where we will begin comparing
    do {
        T existingValue = existingItr.next();
        if (cmp.compare(existingValue, uniqueValue) != 0) {
            // A new distinct value: claim the next slot of the compacted prefix for it.
            T slotValue = uniqueItr.next();
            if (slotValue != existingValue) {
                // The slot holds a different reference (a gap has opened), so overwrite it.
                uniqueItr.set(existingValue);
            }
            // Always compare subsequent elements against the value that was just kept,
            // never against the stale element that previously occupied the slot.
            uniqueValue = existingValue;
        }
    } while (existingItr.hasNext());

    // Lop off the rest of the list. Note with LinkedList this requires advancing back to this index,
    // but Java provides no way to efficiently remove from the end of a non random-access list.
    list.subList(uniqueItr.nextIndex(), list.size()).clear();
}

/**
* Return a rotated view of the given list with the given distance.
*/
Expand All @@ -62,61 +88,6 @@ public static <T> List<T> rotate(final List<T> list, int distance) {
return new RotatedList<>(list, d);
}

/**
 * Sorts the given list of byte arrays and then removes adjacent entries with equal contents,
 * compacting the survivors to the front and shrinking the list's element count accordingly.
 * Lists with fewer than two elements are left untouched.
 */
public static void sortAndDedup(final ObjectArrayList<byte[]> array) {
    final int total = array.size();
    if (total <= 1) {
        return;
    }
    sort(array);
    // Index of the next free slot in the compacted prefix; the first element is always kept.
    int kept = 1;
    for (int idx = 1; idx < total; idx++) {
        final boolean sameAsPrevious = Arrays.equals(array.get(idx), array.get(idx - 1));
        if (sameAsPrevious == false) {
            array.set(kept, array.get(idx));
            kept++;
        }
    }
    // Truncate by adjusting the element count directly.
    array.elementsCount = kept;
}

/**
 * Sorts the given list of byte arrays in place using an {@link IntroSorter}. Arrays are ordered
 * byte by byte with each byte treated as an unsigned value; when one array is a prefix of the
 * other, the shorter one sorts first.
 */
public static void sort(final ObjectArrayList<byte[]> array) {
    final IntroSorter sorter = new IntroSorter() {

        // Element captured by setPivot and compared against in comparePivot.
        byte[] pivotValue;

        @Override
        protected void swap(int i, int j) {
            final byte[] first = array.get(i);
            final byte[] second = array.get(j);
            array.set(i, second);
            array.set(j, first);
        }

        @Override
        protected int compare(int i, int j) {
            return compareUnsignedBytes(array.get(i), array.get(j));
        }

        @Override
        protected void setPivot(int i) {
            pivotValue = array.get(i);
        }

        @Override
        protected int comparePivot(int j) {
            return compareUnsignedBytes(pivotValue, array.get(j));
        }

        // Compares over the shared prefix as unsigned bytes, then falls back to length.
        private int compareUnsignedBytes(byte[] left, byte[] right) {
            final int shared = Math.min(left.length, right.length);
            for (int k = 0; k < shared; k++) {
                final int diff = (left[k] & 0xFF) - (right[k] & 0xFF);
                if (diff != 0) {
                    return diff;
                }
            }
            return left.length - right.length;
        }

    };
    sorter.sort(0, array.size());
}

public static int[] toArray(Collection<Integer> ints) {
Objects.requireNonNull(ints);
return ints.stream().mapToInt(s -> s).toArray();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@

package org.elasticsearch.index.mapper;

import com.carrotsearch.hppc.ObjectArrayList;

import org.apache.lucene.document.StoredField;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.BytesRef;
Expand All @@ -28,6 +26,8 @@

import java.io.IOException;
import java.time.ZoneId;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Base64;
import java.util.Collections;
import java.util.List;
Expand Down Expand Up @@ -194,30 +194,28 @@ protected String contentType() {

public static class CustomBinaryDocValuesField extends CustomDocValuesField {

private final ObjectArrayList<byte[]> bytesList;

private int totalSize = 0;
private final List<byte[]> bytesList;

public CustomBinaryDocValuesField(String name, byte[] bytes) {
super(name);
bytesList = new ObjectArrayList<>();
bytesList = new ArrayList<>();
add(bytes);
}

public void add(byte[] bytes) {
bytesList.add(bytes);
totalSize += bytes.length;
}

@Override
public BytesRef binaryValue() {
try {
CollectionUtils.sortAndDedup(bytesList);
int size = bytesList.size();
BytesStreamOutput out = new BytesStreamOutput(totalSize + (size + 1) * 5);
out.writeVInt(size); // write total number of values
for (int i = 0; i < size; i++) {
final byte[] value = bytesList.get(i);
bytesList.sort(Arrays::compareUnsigned);
CollectionUtils.uniquify(bytesList, Arrays::compareUnsigned);
int bytesSize = bytesList.stream().map(a -> a.length).reduce(0, Integer::sum);
int n = bytesList.size();
BytesStreamOutput out = new BytesStreamOutput(bytesSize + (n + 1) * 5);
out.writeVInt(n); // write total number of values
for (var value : bytesList) {
int valueLength = value.length;
out.writeVInt(valueLength);
out.writeBytes(value, 0, valueLength);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,10 @@
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

Expand All @@ -23,6 +25,7 @@
import static org.elasticsearch.common.util.CollectionUtils.limitSize;
import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.lessThan;

public class CollectionUtilsTests extends ESTestCase {
public void testRotateEmpty() {
Expand Down Expand Up @@ -54,6 +57,25 @@ public void testRotate() {
}
}

/**
 * Runs {@code CollectionUtils.uniquify} on an ArrayList copy and a LinkedList copy of the input,
 * asserting for each that the result is strictly increasing under {@code cmp} and has the
 * expected size.
 */
private <T> void assertUniquify(List<T> list, Comparator<T> cmp, int size) {
    final List<List<T>> copies = List.of(new ArrayList<T>(list), new LinkedList<T>(list));
    for (List<T> candidate : copies) {
        CollectionUtils.uniquify(candidate, cmp);
        // Every adjacent pair must compare strictly less-than, i.e. no duplicates remain.
        for (int next = 1; next < candidate.size(); next++) {
            final int order = cmp.compare(candidate.get(next - 1), candidate.get(next));
            assertThat(order, lessThan(0));
        }
        assertThat(candidate.size(), equalTo(size));
    }
}

/**
 * Exercises uniquify on integer lists under natural ordering: empty, single element,
 * no duplicates, all duplicates, and duplicates in the middle and at the end.
 */
public void testUniquify() {
    final Comparator<Integer> natural = Comparator.naturalOrder();
    // empty and single-element lists
    assertUniquify(List.<Integer>of(), natural, 0);
    assertUniquify(List.of(1), natural, 1);
    // already unique
    assertUniquify(List.of(1, 2, 3), natural, 3);
    // every element identical
    assertUniquify(List.of(1, 1, 1), natural, 1);
    // duplicate run in the middle, then at the tail
    assertUniquify(List.of(1, 2, 2, 3), natural, 3);
    assertUniquify(List.of(1, 2, 2, 2), natural, 2);
}

/**
 * Checks that calling eagerPartition on an empty list (with a second argument of 1)
 * produces a result equal to an empty list.
 */
public void testEmptyPartition() {
assertEquals(Collections.emptyList(), eagerPartition(Collections.emptyList(), 1));
}
Expand Down

0 comments on commit 31918fb

Please sign in to comment.