Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rewrite CollectionUtils dedup to work with any type #85352

Merged
merged 14 commits into from
Mar 28, 2022
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,10 @@

package org.elasticsearch.common.util;

import com.carrotsearch.hppc.ObjectArrayList;

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefArray;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.InPlaceMergeSorter;
import org.apache.lucene.util.IntroSorter;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.collect.Iterators;

Expand All @@ -27,6 +24,7 @@
import java.util.Comparator;
import java.util.IdentityHashMap;
import java.util.List;
import java.util.ListIterator;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
Expand All @@ -47,6 +45,33 @@ public static boolean isEmpty(Object[] array) {
return array == null || array.length == 0;
}

/**
* Eliminate duplicates from a list.
rjernst marked this conversation as resolved.
Show resolved Hide resolved
*
* @param list A sorted list, which will be modified in place.
* @param cmp A comparator the list is already sorted by.
*/
public static <T> void uniquify(List<T> list, Comparator<T> cmp) {
if (list.size() <= 1) {
return;
}

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we verify that the list implements random access?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The algorithm only requires a forward iterator. I've rewritten to use ListIterator instead of indices. The only caveat is that for LinkedList Java does not provide an efficient means to remove the rest of a list from a given point.

ListIterator<T> uniqueItr = list.listIterator();
ListIterator<T> existingItr = list.listIterator();
T uniqueValue = uniqueItr.next(); // get first element to compare with
existingItr.next(); // advance the existing iterator to the second element, where we will begin comparing
do {
T existingValue = existingItr.next();
if (cmp.compare(existingValue, uniqueValue) != 0 && (uniqueValue = uniqueItr.next()) != existingValue) {
uniqueItr.set(existingValue);
}
} while (existingItr.hasNext());

// Lop off the rest of the list. Note with LinkedList this requires advancing back to this index,
// but Java provides no way to efficiently remove from the end of a non random-access list.
list.subList(uniqueItr.nextIndex(), list.size()).clear();
}

/**
* Return a rotated view of the given list with the given distance.
*/
Expand All @@ -67,61 +92,6 @@ public static <T> List<T> rotate(final List<T> list, int distance) {
return new RotatedList<>(list, d);
}

/**
 * Sorts the given list of byte arrays and then removes duplicates in place.
 * <p>
 * After sorting, each element whose content differs from its predecessor is moved forward into
 * the next free slot, and the list's element count is shrunk to the number of elements kept.
 * Lists with fewer than two elements are left untouched.
 *
 * @param array the list to sort and de-duplicate; modified in place
 */
public static void sortAndDedup(final ObjectArrayList<byte[]> array) {
    final int total = array.size();
    if (total <= 1) {
        return;
    }

    sort(array);

    // The first element is always kept, so scanning starts at the second one.
    int kept = 1;
    for (int read = 1; read < total; read++) {
        final byte[] current = array.get(read);
        final byte[] previous = array.get(read - 1);
        if (Arrays.equals(current, previous) == false) {
            array.set(kept, current);
            kept++;
        }
    }
    array.elementsCount = kept;
}

/**
 * Sorts the given list of byte arrays in place using an {@link IntroSorter}.
 * <p>
 * Arrays are ordered lexicographically with each byte treated as an unsigned value; when one
 * array is a prefix of the other, the shorter array sorts first.
 *
 * @param array the list to sort; modified in place
 */
public static void sort(final ObjectArrayList<byte[]> array) {
    new IntroSorter() {

        // Element selected by the sorter via setPivot, compared against in comparePivot.
        byte[] pivot;

        @Override
        protected void swap(int i, int j) {
            final byte[] tmp = array.get(i);
            array.set(i, array.get(j));
            array.set(j, tmp);
        }

        @Override
        protected int compare(int i, int j) {
            // Arrays.compareUnsigned performs the unsigned lexicographic comparison,
            // falling back to the length difference when one array is a prefix of the other.
            return Arrays.compareUnsigned(array.get(i), array.get(j));
        }

        @Override
        protected void setPivot(int i) {
            pivot = array.get(i);
        }

        @Override
        protected int comparePivot(int j) {
            return Arrays.compareUnsigned(pivot, array.get(j));
        }

    }.sort(0, array.size());
}

public static int[] toArray(Collection<Integer> ints) {
Objects.requireNonNull(ints);
return ints.stream().mapToInt(s -> s).toArray();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@

package org.elasticsearch.index.mapper;

import com.carrotsearch.hppc.ObjectArrayList;

import org.apache.lucene.document.StoredField;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.BytesRef;
Expand All @@ -28,6 +26,8 @@

import java.io.IOException;
import java.time.ZoneId;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Base64;
import java.util.Collections;
import java.util.List;
Expand Down Expand Up @@ -194,30 +194,28 @@ protected String contentType() {

public static class CustomBinaryDocValuesField extends CustomDocValuesField {

private final ObjectArrayList<byte[]> bytesList;

private int totalSize = 0;
private final List<byte[]> bytesList;

public CustomBinaryDocValuesField(String name, byte[] bytes) {
super(name);
bytesList = new ObjectArrayList<>();
bytesList = new ArrayList<>();
add(bytes);
}

public void add(byte[] bytes) {
bytesList.add(bytes);
totalSize += bytes.length;
}

@Override
public BytesRef binaryValue() {
try {
CollectionUtils.sortAndDedup(bytesList);
int size = bytesList.size();
BytesStreamOutput out = new BytesStreamOutput(totalSize + (size + 1) * 5);
out.writeVInt(size); // write total number of values
for (int i = 0; i < size; i++) {
final byte[] value = bytesList.get(i);
bytesList.sort(Arrays::compareUnsigned);
CollectionUtils.uniquify(bytesList, Arrays::compareUnsigned);
int bytesSize = bytesList.stream().map(a -> a.length).reduce(0, Integer::sum);
int n = bytesList.size();
BytesStreamOutput out = new BytesStreamOutput(bytesSize + (n + 1) * 5);
out.writeVInt(n); // write total number of values
for (var value : bytesList) {
int valueLength = value.length;
out.writeVInt(valueLength);
out.writeBytes(value, 0, valueLength);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,11 @@
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.SortedSet;
Expand All @@ -31,6 +33,7 @@
import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.is;
import static org.hamcrest.Matchers.lessThan;

public class CollectionUtilsTests extends ESTestCase {
public void testRotateEmpty() {
Expand Down Expand Up @@ -62,6 +65,25 @@ public void testRotate() {
}
}

/**
 * Runs {@code CollectionUtils.uniquify} on an {@link ArrayList} copy and a {@link LinkedList} copy
 * of the given list and checks, for each copy, that neighbouring elements are strictly increasing
 * under {@code cmp} and that the resulting size equals {@code size}.
 */
private <T> void assertUniquify(List<T> list, Comparator<T> cmp, int size) {
    List<List<T>> copies = List.of(new ArrayList<>(list), new LinkedList<>(list));
    for (List<T> candidate : copies) {
        CollectionUtils.uniquify(candidate, cmp);
        // every element must compare strictly greater than the one before it
        for (int next = 1; next < candidate.size(); next++) {
            T left = candidate.get(next - 1);
            T right = candidate.get(next);
            assertThat(cmp.compare(left, right), lessThan(0));
        }
        assertThat(candidate.size(), equalTo(size));
    }
}

/**
 * Exercises uniquify on empty, single-element, duplicate-free and duplicate-containing
 * sorted integer lists, checking the number of elements that remain.
 */
public void testUniquify() {
    final Comparator<Integer> natural = Comparator.naturalOrder();

    // nothing to remove
    assertUniquify(List.of(), natural, 0);
    assertUniquify(List.of(1), natural, 1);
    assertUniquify(List.of(1, 2, 3), natural, 3);

    // duplicates at the start, in the middle and at the end
    assertUniquify(List.of(1, 1, 1), natural, 1);
    assertUniquify(List.of(1, 2, 2, 3), natural, 3);
    assertUniquify(List.of(1, 2, 2, 2), natural, 2);
}

public void testSortAndDedupByteRefArray() {
SortedSet<BytesRef> set = new TreeSet<>();
final int numValues = scaledRandomIntBetween(0, 10000);
Expand Down