Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rewrite CollectionUtils dedup to work with any type #85352

Merged
merged 14 commits into from
Mar 28, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,10 @@

package org.elasticsearch.common.util;

import com.carrotsearch.hppc.ObjectArrayList;

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefArray;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.InPlaceMergeSorter;
import org.apache.lucene.util.IntroSorter;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.collect.Iterators;

Expand Down Expand Up @@ -47,6 +44,27 @@ public static boolean isEmpty(Object[] array) {
return array == null || array.length == 0;
}

/**
 * Removes elements from {@code list} in place so that each run of consecutive elements that
 * compare equal under {@code cmp} is reduced to its first element.
 * <p>
 * Each element is compared only against the most recently kept element, so equal elements are
 * collapsed only when they are adjacent; callers are expected to pass a list that is already
 * sorted according to {@code cmp}.
 * <p>
 * The list is accessed by index ({@code get}/{@code set}) and its tail is dropped with
 * {@code subList(...).clear()}, so it must support those operations.
 * NOTE(review): indexed access may be slow on lists without random access - confirm callers.
 *
 * @param list the list to de-duplicate; modified in place
 * @param cmp  the comparator that decides whether two elements are duplicates (result {@code 0})
 */
public static <T> void unique(List<T> list, Comparator<T> cmp) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add javadocs that the list must be sorted according to the given comparator?
Also a bit of a nit-pick, but since this modifies the list in-place, I feel like naming the method after a verb would be more appropriate?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Changed to uniquify

// Zero or one element: there is nothing to compare, so nothing can be a duplicate.
if (list.size() <= 1) {
return;
}

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we verify that the list implements random access?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The algorithm only requires a forward iterator. I've rewritten to use ListIterator instead of indices. The only caveat is that for LinkedList Java does not provide an efficient means to remove the rest of a list from a given point.

// prevNdx is the index of the last element kept so far; prevValue is that element.
int prevNdx = 0;
T prevValue = list.get(0);
for (int i = 1; i < list.size(); ++i) {
T nextValue = list.get(i);
// The right-hand side is evaluated only when nextValue differs from prevValue, so prevNdx
// advances once per newly found distinct element.
// NOTE(review): prevNdx starts below i and grows by at most one per iteration, so
// "prevNdx++ != i" looks like it is always true; the set() below therefore also runs when
// the element is already in its final position.
if (cmp.compare(nextValue, prevValue) != 0 && prevNdx++ != i) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need to pre-increment rather than post-increment? Otherwise it looks to me like a list where all elements are unique would still overwrite all the time?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You are right, it should be pre-increment.

list.set(prevNdx, nextValue);
prevValue = nextValue;
}
}
// Turn the index of the last kept element into the number of kept elements.
++prevNdx;
if (prevNdx != list.size()) {
// lop off the rest of the list
list.subList(prevNdx, list.size()).clear();
}
}

/**
* Return a rotated view of the given list with the given distance.
*/
Expand All @@ -67,61 +85,6 @@ public static <T> List<T> rotate(final List<T> list, int distance) {
return new RotatedList<>(list, d);
}

/**
 * Sorts the given list of byte arrays with {@link #sort(ObjectArrayList)} and then removes
 * duplicates in place.
 * <p>
 * After sorting, every array whose content differs from its predecessor is moved to the front
 * section of the list; the list is then shortened by setting {@code elementsCount} to the number
 * of arrays kept. Lists with fewer than two elements are left untouched (and are not sorted).
 *
 * @param array the list to sort and de-duplicate; modified in place
 */
public static void sortAndDedup(final ObjectArrayList<byte[]> array) {
    final int total = array.size();
    if (total <= 1) {
        return;
    }
    sort(array);
    // The first element is always kept, so the write position starts at 1.
    int kept = 1;
    for (int idx = 1; idx < total; idx++) {
        final byte[] current = array.get(idx);
        if (Arrays.equals(current, array.get(idx - 1)) == false) {
            array.set(kept, current);
            kept++;
        }
    }
    array.elementsCount = kept;
}

/**
 * Sorts the given list of byte arrays in place using an {@link IntroSorter}.
 * <p>
 * Arrays are ordered byte by byte, treating each byte as an unsigned value (0..255); when one
 * array is a prefix of the other, the shorter array sorts first.
 *
 * @param array the list to sort; modified in place
 */
public static void sort(final ObjectArrayList<byte[]> array) {
    final IntroSorter sorter = new IntroSorter() {

        // Element captured by setPivot and used by comparePivot.
        private byte[] pivotValue;

        @Override
        protected void swap(int first, int second) {
            final byte[] held = array.get(first);
            array.set(first, array.get(second));
            array.set(second, held);
        }

        @Override
        protected int compare(int first, int second) {
            return unsignedLexCompare(array.get(first), array.get(second));
        }

        @Override
        protected void setPivot(int index) {
            pivotValue = array.get(index);
        }

        @Override
        protected int comparePivot(int index) {
            return unsignedLexCompare(pivotValue, array.get(index));
        }

        // Compares the common prefix as unsigned bytes, then falls back to the length difference.
        private int unsignedLexCompare(byte[] lhs, byte[] rhs) {
            final int shared = Math.min(lhs.length, rhs.length);
            for (int pos = 0; pos < shared; pos++) {
                final int diff = (lhs[pos] & 0xFF) - (rhs[pos] & 0xFF);
                if (diff != 0) {
                    return diff;
                }
            }
            return lhs.length - rhs.length;
        }
    };
    sorter.sort(0, array.size());
}

public static int[] toArray(Collection<Integer> ints) {
Objects.requireNonNull(ints);
return ints.stream().mapToInt(s -> s).toArray();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@

package org.elasticsearch.index.mapper;

import com.carrotsearch.hppc.ObjectArrayList;

import org.apache.lucene.document.StoredField;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.BytesRef;
Expand All @@ -28,6 +26,8 @@

import java.io.IOException;
import java.time.ZoneId;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Base64;
import java.util.Collections;
import java.util.List;
Expand Down Expand Up @@ -194,30 +194,28 @@ protected String contentType() {

public static class CustomBinaryDocValuesField extends CustomDocValuesField {

private final ObjectArrayList<byte[]> bytesList;

private int totalSize = 0;
private final List<byte[]> bytesList;

public CustomBinaryDocValuesField(String name, byte[] bytes) {
super(name);
bytesList = new ObjectArrayList<>();
bytesList = new ArrayList<>();
add(bytes);
}

public void add(byte[] bytes) {
bytesList.add(bytes);
totalSize += bytes.length;
}

@Override
public BytesRef binaryValue() {
try {
CollectionUtils.sortAndDedup(bytesList);
int size = bytesList.size();
BytesStreamOutput out = new BytesStreamOutput(totalSize + (size + 1) * 5);
out.writeVInt(size); // write total number of values
for (int i = 0; i < size; i++) {
final byte[] value = bytesList.get(i);
bytesList.sort(Arrays::compare);
CollectionUtils.unique(bytesList, Arrays::compare);
int bytesSize = bytesList.stream().map(a -> a.length).reduce(0, Integer::sum);
int n = bytesList.size();
BytesStreamOutput out = new BytesStreamOutput(bytesSize + (n + 1) * 5);
out.writeVInt(n); // write total number of values
for (var value : bytesList) {
int valueLength = value.length;
out.writeVInt(valueLength);
out.writeBytes(value, 0, valueLength);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
Expand All @@ -31,6 +32,7 @@
import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.is;
import static org.hamcrest.Matchers.lessThan;

public class CollectionUtilsTests extends ESTestCase {
public void testRotateEmpty() {
Expand Down Expand Up @@ -62,6 +64,24 @@ public void testRotate() {
}
}

/**
 * Runs {@link CollectionUtils#unique} on a mutable copy of {@code list} and checks the outcome:
 * every adjacent pair of the result must be strictly increasing under {@code cmp}, and the
 * result must contain exactly {@code size} elements.
 */
private <T> void assertUnique(List<T> list, Comparator<T> cmp, int size) {
    // Work on a copy: the inputs may be immutable and unique() modifies its argument.
    final List<T> deduped = new ArrayList<>(list);
    CollectionUtils.unique(deduped, cmp);
    for (int next = 1; next < deduped.size(); next++) {
        final int order = cmp.compare(deduped.get(next - 1), deduped.get(next));
        assertThat(order, lessThan(0));
    }
    assertThat(deduped.size(), equalTo(size));
}

/**
 * Exercises {@code unique} on sorted integer lists: empty, single element, no duplicates,
 * all duplicates, duplicates in the middle and duplicates at the end.
 */
public void testUnique() {
    final Comparator<Integer> natural = Comparator.naturalOrder();
    final List<Integer> empty = List.of();
    assertUnique(empty, natural, 0);
    assertUnique(List.of(1), natural, 1);
    assertUnique(List.of(1, 2, 3), natural, 3);
    assertUnique(List.of(1, 1, 1), natural, 1);
    assertUnique(List.of(1, 2, 2, 3), natural, 3);
    assertUnique(List.of(1, 2, 2, 2), natural, 2);
}

public void testSortAndDedupByteRefArray() {
SortedSet<BytesRef> set = new TreeSet<>();
final int numValues = scaledRandomIntBetween(0, 10000);
Expand Down