From 88704194bef26b8b83d0166d6938778f4b831fd4 Mon Sep 17 00:00:00 2001 From: Ketan Verma Date: Sun, 30 Jul 2023 23:11:52 +0530 Subject: [PATCH] Replace BytesRefHash and clean up alternative implementations Signed-off-by: Ketan Verma --- CHANGELOG.md | 1 + .../common/util/BytesRefHashBenchmark.java | 78 +--- .../opensearch/common/util/BytesRefHash.java | 364 ++++++++++++------ .../common/util/CompactBytesRefHash.java | 286 -------------- .../common/util/ReorganizingBytesRefHash.java | 301 --------------- .../bucket/terms/BytesKeyedBucketOrds.java | 7 +- .../bucket/terms/SignificanceLookup.java | 2 +- .../terms/StringRareTermsAggregator.java | 2 +- .../common/util/BytesRefHashTests.java | 61 +-- .../common/util/CompactBytesRefHashTests.java | 58 --- .../util/ReorganizingBytesRefHashTests.java | 70 ---- 11 files changed, 280 insertions(+), 950 deletions(-) delete mode 100644 server/src/main/java/org/opensearch/common/util/CompactBytesRefHash.java delete mode 100644 server/src/main/java/org/opensearch/common/util/ReorganizingBytesRefHash.java delete mode 100644 server/src/test/java/org/opensearch/common/util/CompactBytesRefHashTests.java delete mode 100644 server/src/test/java/org/opensearch/common/util/ReorganizingBytesRefHashTests.java diff --git a/CHANGELOG.md b/CHANGELOG.md index 861674753861a..676136037c889 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -100,6 +100,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Exclude 'benchmarks' from codecov report ([#8805](https://github.com/opensearch-project/OpenSearch/pull/8805)) - [Refactor] MediaTypeParser to MediaTypeParserRegistry ([#8636](https://github.com/opensearch-project/OpenSearch/pull/8636)) - Create separate SourceLookup instance per segment slice in SignificantTextAggregatorFactory ([#8807](https://github.com/opensearch-project/OpenSearch/pull/8807)) +- Performance improvements for BytesRefHash ([#8788](https://github.com/opensearch-project/OpenSearch/pull/8788)) ### Deprecated diff --git a/benchmarks/src/main/java/org/opensearch/common/util/BytesRefHashBenchmark.java b/benchmarks/src/main/java/org/opensearch/common/util/BytesRefHashBenchmark.java index 8ad04a42b190f..2e2a2399e9c0d 100644 --- a/benchmarks/src/main/java/org/opensearch/common/util/BytesRefHashBenchmark.java +++ b/benchmarks/src/main/java/org/opensearch/common/util/BytesRefHashBenchmark.java @@ -8,7 +8,6 @@ package org.opensearch.common.util; -import net.openhft.hashing.LongHashFunction; import org.apache.lucene.util.BytesRef; import org.openjdk.jmh.annotations.Benchmark; import org.openjdk.jmh.annotations.BenchmarkMode; @@ -23,7 +22,6 @@ import org.openjdk.jmh.annotations.TearDown; import org.openjdk.jmh.annotations.Warmup; import org.openjdk.jmh.infra.Blackhole; -import org.opensearch.common.lease.Releasable; import org.opensearch.common.lease.Releasables; import java.util.HashSet; @@ -32,7 +30,7 @@ import java.util.concurrent.TimeUnit; import java.util.stream.Stream; -@Fork(value = 5) +@Fork(value = 3) @Warmup(iterations = 1, time = 2) @Measurement(iterations = 3, time = 5) @BenchmarkMode(Mode.AverageTime) @@ -45,7 +43,7 @@ public class BytesRefHashBenchmark { public void add(Blackhole bh, Options opts) { for (int hit = 0; hit < NUM_HITS; hit++) { BytesRef key = opts.keys[hit % opts.keys.length]; - for (HashTable table : opts.tables) { + for (BytesRefHash table : opts.tables) { bh.consume(table.add(key)); } } @@ -53,9 +51,6 @@ public void add(Blackhole bh, Options opts) { @State(Scope.Benchmark) public static class Options { - 
@Param({ "baseline", "compact", "reorganizing" }) - public String type; - @Param({ "1", "2", @@ -163,14 +158,16 @@ public static class Options { @Param({ "8", "32", "128" }) public Integer length; - private HashTable[] tables; + private BytesRefHash[] tables; private BytesRef[] keys; @Setup public void setup() { assert size <= Math.pow(26, length) : "key length too small to generate the required number of keys"; - tables = Stream.generate(this::newHashTable).limit(NUM_TABLES).toArray(HashTable[]::new); + tables = Stream.generate(() -> new BytesRefHash(BigArrays.NON_RECYCLING_INSTANCE)) + .limit(NUM_TABLES) + .toArray(BytesRefHash[]::new); Random random = new Random(0); Set seen = new HashSet<>(); keys = new BytesRef[size]; @@ -193,68 +190,5 @@ public void setup() { public void tearDown() { Releasables.close(tables); } - - private HashTable newHashTable() { - switch (type) { - case "baseline": - return new HashTable() { - private final BytesRefHash table = new BytesRefHash(1, 0.6f, BigArrays.NON_RECYCLING_INSTANCE); - - @Override - public long add(BytesRef key) { - return table.add(key); - } - - @Override - public void close() { - table.close(); - } - }; - case "compact": - return new HashTable() { - private final CompactBytesRefHash table = new CompactBytesRefHash( - 1, - 0.6f, - key -> LongHashFunction.xx3().hashBytes(key.bytes, key.offset, key.length), - BigArrays.NON_RECYCLING_INSTANCE - ); - - @Override - public long add(BytesRef key) { - return table.add(key); - } - - @Override - public void close() { - table.close(); - } - }; - case "reorganizing": - return new HashTable() { - private final ReorganizingBytesRefHash table = new ReorganizingBytesRefHash( - 1, - 0.6f, - key -> LongHashFunction.xx3().hashBytes(key.bytes, key.offset, key.length), - BigArrays.NON_RECYCLING_INSTANCE - ); - - @Override - public long add(BytesRef key) { - return table.add(key); - } - - @Override - public void close() { - table.close(); - } - }; - default: - throw new IllegalArgumentException("invalid hash table type: " + type); - } - } - } - - private interface HashTable extends Releasable { - long add(BytesRef key); } } diff --git a/server/src/main/java/org/opensearch/common/util/BytesRefHash.java b/server/src/main/java/org/opensearch/common/util/BytesRefHash.java index ecc93d017beaf..087470dae2cb9 100644 --- a/server/src/main/java/org/opensearch/common/util/BytesRefHash.java +++ b/server/src/main/java/org/opensearch/common/util/BytesRefHash.java @@ -6,179 +6,289 @@ * compatible open source license. */ -/* - * Licensed to Elasticsearch under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. Elasticsearch licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Modifications Copyright OpenSearch Contributors. See - * GitHub history for details. 
- */ - package org.opensearch.common.util; +import net.openhft.hashing.LongHashFunction; import org.apache.lucene.util.BytesRef; import org.opensearch.common.lease.Releasable; import org.opensearch.common.lease.Releasables; import org.opensearch.core.common.util.ByteArray; +import java.security.AccessController; +import java.security.PrivilegedAction; + /** - * Specialized hash table implementation similar to Lucene's BytesRefHash that maps - * BytesRef values to ids. Collisions are resolved with open addressing and linear - * probing, growth is smooth thanks to {@link BigArrays}, hashes are cached for faster - * re-hashing and capacity is always a multiple of 2 for faster identification of buckets. - * This class is not thread-safe. + * Specialized hash table implementation that maps a {@link BytesRef} key to a long ordinal. + * + *

+ * It uses a compact byte-packing strategy to encode the ordinal and fingerprint information + * in the hash table value. It makes lookups faster by short-circuiting expensive equality checks + * for keys that collide onto the same hash table slot. * - * @opensearch.internal + *
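+ *
+ * <p>
+ * A minimal usage sketch (illustrative only; it assumes a {@link BigArrays} instance such as
+ * {@code BigArrays.NON_RECYCLING_INSTANCE}, which the updated benchmark above also uses):
+ * <pre>{@code
+ * try (BytesRefHash hash = new BytesRefHash(BigArrays.NON_RECYCLING_INSTANCE)) {
+ *     long ordinal = hash.add(new BytesRef("alpha"));   // new key: returns its ordinal (0, 1, 2, ...)
+ *     long dup     = hash.add(new BytesRef("alpha"));   // existing key: returns (-1 - ordinal)
+ *     long found   = hash.find(new BytesRef("alpha"));  // returns the ordinal, or -1 if absent
+ *     BytesRef key = hash.get(ordinal, new BytesRef()); // reads the key back into the scratch ref
+ * }
+ * }</pre>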

+ * This class is not thread-safe. + * + * @opensearch.internal */ -public final class BytesRefHash extends AbstractHash { +public class BytesRefHash implements Releasable { + private static final LongHashFunction XX3 = AccessController.doPrivileged( + (PrivilegedAction) () -> LongHashFunction.xx3(System.nanoTime()) + ); + + private static final long MAX_CAPACITY = 1L << 32; + private static final long DEFAULT_INITIAL_CAPACITY = 32; + private static final float DEFAULT_LOAD_FACTOR = 0.6f; + private static final Hasher DEFAULT_HASHER = key -> XX3.hashBytes(key.bytes, key.offset, key.length); + + private static final long MASK_ORDINAL = 0x00000000FFFFFFFFL; // extract ordinal + private static final long MASK_FINGERPRINT = 0xFFFFFFFF00000000L; // extract fingerprint + + /** + * Maximum load factor after which the capacity is doubled. + */ + private final float loadFactor; - private LongArray startOffsets; - private ByteArray bytes; - private IntArray hashes; // we cache hashes for faster re-hashing - private final BytesRef spare; + /** + * Calculates the hash of a {@link BytesRef} key. + */ + private final Hasher hasher; + + /** + * Utility class to allocate recyclable arrays. + */ + private final BigArrays bigArrays; + + /** + * Reusable BytesRef to read keys. + */ + private final BytesRef scratch = new BytesRef(); + + /** + * Current capacity of the hash table. This must be a power of two so that the hash table slot + * can be identified quickly using bitmasks, thus avoiding expensive modulo or integer division. + */ + private long capacity; + + /** + * Bitmask to identify the hash table slot from a key's hash. + */ + private long mask; + + /** + * Size threshold after which the hash table needs to be doubled in capacity. + */ + private long grow; + + /** + * Current size of the hash table. + */ + private long size; + + /** + * Underlying array to store the hash table values. + * + *

+ * Each hash table value (64-bit) uses the following byte packing strategy: + *

+     * |================================|================================|
+     * | Fingerprint                    | Ordinal                        |
+     * |--------------------------------|--------------------------------|
+     * | 32 bits                        | 32 bits                        |
+     * |================================|================================|
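+     *
+     * For illustration (hypothetical hash value): a key whose 64-bit hash is 0x9E3779B97F4A7C15,
+     * inserted as the fourth entry (ordinal 3), is stored as
+     *   (0x9E3779B97F4A7C15 & MASK_FINGERPRINT) | 3 = 0x9E3779B900000003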
+     * 
+ * + *

+     * This allows us to encode and manipulate additional information in the hash table
+     * itself without having to look elsewhere in memory, which is much slower.
+     *
+     *

+ * Terminology: table[index] = value = (fingerprint | ordinal) + */ + private LongArray table; - // Constructor with configurable capacity and default maximum load factor. - public BytesRefHash(long capacity, BigArrays bigArrays) { - this(capacity, DEFAULT_MAX_LOAD_FACTOR, bigArrays); + /** + * Underlying array to store the starting offsets of keys. + * + *

+ * Terminology: + *

+     *   offsets[ordinal] = starting offset (inclusive)
+     *   offsets[ordinal + 1] = ending offset (exclusive)
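+     *
+     *   e.g. (illustrative) after adding "a" (ordinal 0) and then "bc" (ordinal 1):
+     *   offsets = [0, 1, 3] and keys = ['a', 'b', 'c']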
+     * 
+ */ + private LongArray offsets; + + /** + * Underlying byte array to store the keys. + * + *

+ * Terminology: keys[start...end] = key + */ + private ByteArray keys; + + public BytesRefHash(final BigArrays bigArrays) { + this(DEFAULT_INITIAL_CAPACITY, DEFAULT_LOAD_FACTOR, DEFAULT_HASHER, bigArrays); } - // Constructor with configurable capacity and load factor. - public BytesRefHash(long capacity, float maxLoadFactor, BigArrays bigArrays) { - super(capacity, maxLoadFactor, bigArrays); - startOffsets = bigArrays.newLongArray(capacity + 1, false); - startOffsets.set(0, 0); - bytes = bigArrays.newByteArray(capacity * 3, false); - hashes = bigArrays.newIntArray(capacity, false); - spare = new BytesRef(); + public BytesRefHash(final long initialCapacity, final BigArrays bigArrays) { + this(initialCapacity, DEFAULT_LOAD_FACTOR, DEFAULT_HASHER, bigArrays); } - // BytesRef has a weak hashCode function so we try to improve it by rehashing using Murmur3 - // Feel free to remove rehashing if BytesRef gets a better hash function - private static int rehash(int hash) { - return BitMixer.mix32(hash); + public BytesRefHash(final long initialCapacity, final float loadFactor, final BigArrays bigArrays) { + this(initialCapacity, loadFactor, DEFAULT_HASHER, bigArrays); } - /** - * Return the key at 0 <= index <= capacity(). The result is undefined if the slot is unused. - *

-     * Beware that the content of the {@link BytesRef} may become invalid as soon as {@link #close()} is called

- */ - public BytesRef get(long id, BytesRef dest) { - final long startOffset = startOffsets.get(id); - final int length = (int) (startOffsets.get(id + 1) - startOffset); - bytes.get(startOffset, length, dest); - return dest; + public BytesRefHash(final long initialCapacity, final float loadFactor, final Hasher hasher, final BigArrays bigArrays) { + assert initialCapacity > 0 : "initial capacity must be greater than 0"; + assert loadFactor > 0 && loadFactor < 1 : "load factor must be between 0 and 1"; + + this.loadFactor = loadFactor; + this.hasher = hasher; + this.bigArrays = bigArrays; + + capacity = Math.max(1, Long.highestOneBit((long) (initialCapacity / loadFactor)) << 1); + mask = capacity - 1; + size = 0; + grow = (long) (capacity * loadFactor); + + table = bigArrays.newLongArray(capacity, false); + table.fill(0, capacity, -1); + offsets = bigArrays.newLongArray(initialCapacity + 1, false); + offsets.set(0, 0); + keys = bigArrays.newByteArray(initialCapacity * 3, false); } /** - * Get the id associated with key + * Adds the given key to the hash table and returns its ordinal. + * If the key exists already, it returns (-1 - ordinal). */ - public long find(BytesRef key, int code) { - final long slot = slot(rehash(code), mask); - for (long index = slot;; index = nextSlot(index, mask)) { - final long id = id(index); - if (id == -1L || key.bytesEquals(get(id, spare))) { - return id; + public long add(final BytesRef key) { + final long hash = hasher.hash(key); + final long fingerprint = hash & MASK_FINGERPRINT; + + for (long idx = hash & mask, value, ordinal;; idx = (idx + 1) & mask) { + if ((value = table.get(idx)) == -1) { + final long val = fingerprint | size; + if (size >= grow) { + growAndInsert(hash, val); + } else { + table.set(idx, val); + } + return append(key); + } else if (((value & MASK_FINGERPRINT) == fingerprint) && key.bytesEquals(get(ordinal = (value & MASK_ORDINAL), scratch))) { + return -1 - ordinal; } } } - /** Sugar for {@link #find(BytesRef, int) find(key, key.hashCode()} */ - public long find(BytesRef key) { - return find(key, key.hashCode()); - } + /** + * Returns the ordinal associated with the given key, or -1 if the key doesn't exist. + * + *

+     * Using the 64-bit hash value, up to 32 least significant bits (LSB) are used to identify the
+     * home slot in the hash table, and an additional 32 bits are used as the fingerprint.
+     * The fingerprint further increases the entropy and reduces the number of false positives,
+     * each of which would otherwise cost an expensive equality check against the keys' table.
+     *
+     *

+ * Total entropy bits = 32 + log2(capacity) + * + *
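+     *
+     * <p>
+     * As a sketch, the two pieces are derived from the same 64-bit hash exactly as the
+     * implementation below does, using this class's own fields:
+     * <pre>{@code
+     * long hash        = hasher.hash(key);
+     * long fingerprint = hash & MASK_FINGERPRINT; // upper 32 bits, matched before any equality check
+     * long homeSlot    = hash & mask;             // lower log2(capacity) bits select the starting slot
+     * }</pre>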

+ * Linear probing starts from the home slot, until a match or an empty slot is found. + * Values are first checked using their fingerprint (to reduce false positives), then verified + * in the keys' table using an equality check. + */ + public long find(final BytesRef key) { + final long hash = hasher.hash(key); + final long fingerprint = hash & MASK_FINGERPRINT; - private long set(BytesRef key, int code, long id) { - assert rehash(key.hashCode()) == code; - assert size < maxSize; - final long slot = slot(code, mask); - for (long index = slot;; index = nextSlot(index, mask)) { - final long curId = id(index); - if (curId == -1) { // means unset - id(index, id); - append(id, key, code); - ++size; - return id; - } else if (key.bytesEquals(get(curId, spare))) { - return -1 - curId; + for (long idx = hash & mask, value, ordinal;; idx = (idx + 1) & mask) { + if ((value = table.get(idx)) == -1) { + return -1; + } else if (((value & MASK_FINGERPRINT) == fingerprint) && key.bytesEquals(get(ordinal = (value & MASK_ORDINAL), scratch))) { + return ordinal; } } } - private void append(long id, BytesRef key, int code) { - assert size == id; - final long startOffset = startOffsets.get(size); - bytes = bigArrays.grow(bytes, startOffset + key.length); - bytes.set(startOffset, key.bytes, key.offset, key.length); - startOffsets = bigArrays.grow(startOffsets, size + 2); - startOffsets.set(size + 1, startOffset + key.length); - hashes = bigArrays.grow(hashes, id + 1); - hashes.set(id, code); + /** + * Returns the key associated with the given ordinal. + * The result is undefined for an unused ordinal. + * + *

+ * Beware that the content of the {@link BytesRef} may become invalid as soon as {@link #close()} is called + */ + public BytesRef get(final long ordinal, final BytesRef dest) { + final long start = offsets.get(ordinal); + final int length = (int) (offsets.get(ordinal + 1) - start); + keys.get(start, length, dest); + return dest; } - private boolean assertConsistent(long id, int code) { - get(id, spare); - return rehash(spare.hashCode()) == code; + /** + * Returns the number of mappings in this hash table. + */ + public long size() { + return size; } - private void reset(int code, long id) { - assert assertConsistent(id, code); - final long slot = slot(code, mask); - for (long index = slot;; index = nextSlot(index, mask)) { - final long curId = id(index); - if (curId == -1) { // means unset - id(index, id); - break; - } - } + /** + * Appends the key in the keys' and offsets' tables. + */ + private long append(final BytesRef key) { + final long start = offsets.get(size); + final long end = start + key.length; + offsets = bigArrays.grow(offsets, size + 2); + offsets.set(size + 1, end); + keys = bigArrays.grow(keys, end); + keys.set(start, key.bytes, key.offset, key.length); + return size++; } /** - * Try to add key. Return its newly allocated id if it wasn't in the hash table yet, or -1-id - * if it was already present in the hash table. + * Grows the hash table by doubling its capacity, inserting the provided value, + * and reinserting the previous values at their updated slots. */ - public long add(BytesRef key, int code) { - if (size >= maxSize) { - assert size == maxSize; - grow(); - } - assert size < maxSize; - return set(key, rehash(code), size); - } + private void growAndInsert(final long hash, final long value) { + // Ensure that the hash table doesn't grow too large. + // This implicitly also ensures that the ordinals are no larger than 2^32, thus, + // preventing them from polluting the fingerprint bits in the hash table values. + assert capacity < MAX_CAPACITY : "hash table already at the max capacity"; - /** Sugar to {@link #add(BytesRef, int) add(key, key.hashCode()}. */ - public long add(BytesRef key) { - return add(key, key.hashCode()); + capacity <<= 1; + mask = capacity - 1; + grow = (long) (capacity * loadFactor); + table = bigArrays.grow(table, capacity); + table.fill(0, capacity, -1); + table.set(hash & mask, value); + + for (long ordinal = 0; ordinal < size; ordinal++) { + reinsert(ordinal, hasher.hash(get(ordinal, scratch))); + } } - @Override - protected void removeAndAdd(long index) { - final long id = id(index, -1); - assert id >= 0; - final int code = hashes.get(id); - reset(code, id); + /** + * Reinserts the hash table value for an existing key stored at the given ordinal. + */ + private void reinsert(final long ordinal, final long hash) { + for (long idx = hash & mask;; idx = (idx + 1) & mask) { + if (table.get(idx) == -1) { + table.set(idx, (hash & MASK_FINGERPRINT) | ordinal); + return; + } + } } @Override public void close() { - try (Releasable releasable = Releasables.wrap(bytes, hashes, startOffsets)) { - super.close(); - } + Releasables.close(table, offsets, keys); } + /** + * Hasher calculates the hash of a {@link BytesRef} key. 
+ */ + @FunctionalInterface + public interface Hasher { + long hash(BytesRef key); + } } diff --git a/server/src/main/java/org/opensearch/common/util/CompactBytesRefHash.java b/server/src/main/java/org/opensearch/common/util/CompactBytesRefHash.java deleted file mode 100644 index d8d4690d14e90..0000000000000 --- a/server/src/main/java/org/opensearch/common/util/CompactBytesRefHash.java +++ /dev/null @@ -1,286 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The OpenSearch Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -package org.opensearch.common.util; - -import net.openhft.hashing.LongHashFunction; -import org.apache.lucene.util.BytesRef; -import org.opensearch.common.lease.Releasable; -import org.opensearch.common.lease.Releasables; -import org.opensearch.core.common.util.ByteArray; - -import java.security.AccessController; -import java.security.PrivilegedAction; - -/** - * Specialized hash table implementation that maps a {@link BytesRef} key to a long ordinal. - * - *

- * It uses a compact byte-packing strategy to encode the ordinal and fingerprint information - * in the hash table value. It makes lookups faster by short-circuiting expensive equality checks - * for keys that collide onto the same hash table slot. - * - *

- * This class is not thread-safe. - * - * @opensearch.internal - */ -public class CompactBytesRefHash implements Releasable { - private static final LongHashFunction XX3 = AccessController.doPrivileged( - (PrivilegedAction) () -> LongHashFunction.xx3(System.nanoTime()) - ); - - private static final long MAX_CAPACITY = 1L << 32; - private static final long DEFAULT_INITIAL_CAPACITY = 32; - private static final float DEFAULT_LOAD_FACTOR = 0.6f; - private static final Hasher DEFAULT_HASHER = key -> XX3.hashBytes(key.bytes, key.offset, key.length); - - private static final long MASK_ORDINAL = 0x00000000FFFFFFFFL; // extract ordinal - private static final long MASK_FINGERPRINT = 0xFFFFFFFF00000000L; // extract fingerprint - - /** - * Maximum load factor after which the capacity is doubled. - */ - private final float loadFactor; - - /** - * Calculates the hash of a {@link BytesRef} key. - */ - private final Hasher hasher; - - /** - * Utility class to allocate recyclable arrays. - */ - private final BigArrays bigArrays; - - /** - * Reusable BytesRef to read keys. - */ - private final BytesRef scratch = new BytesRef(); - - /** - * Current capacity of the hash table. This must be a power of two so that the hash table slot - * can be identified quickly using bitmasks, thus avoiding expensive modulo or integer division. - */ - private long capacity; - - /** - * Bitmask to identify the hash table slot from a key's hash. - */ - private long mask; - - /** - * Size threshold after which the hash table needs to be doubled in capacity. - */ - private long grow; - - /** - * Current size of the hash table. - */ - private long size; - - /** - * Underlying array to store the hash table values. - * - *

- * Each hash table value (64-bit) uses the following byte packing strategy: - *

-     * |================================|================================|
-     * | Fingerprint                    | Ordinal                        |
-     * |--------------------------------|--------------------------------|
-     * | 32 bits                        | 32 bits                        |
-     * |================================|================================|
-     * 
- * - *

- * This allows us to encode and manipulate additional information in the hash table - * itself without having to look elsewhere in the memory, which is much slower. - * - *

- * Terminology: table[index] = value = (fingerprint | ordinal) - */ - private LongArray table; - - /** - * Underlying array to store the starting offsets of keys. - * - *

- * Terminology: - *

-     *   offsets[ordinal] = starting offset (inclusive)
-     *   offsets[ordinal + 1] = ending offset (exclusive)
-     * 
- */ - private LongArray offsets; - - /** - * Underlying byte array to store the keys. - * - *

- * Terminology: keys[start...end] = key - */ - private ByteArray keys; - - public CompactBytesRefHash(final BigArrays bigArrays) { - this(DEFAULT_INITIAL_CAPACITY, DEFAULT_LOAD_FACTOR, DEFAULT_HASHER, bigArrays); - } - - public CompactBytesRefHash(final long initialCapacity, final float loadFactor, final Hasher hasher, final BigArrays bigArrays) { - assert initialCapacity > 0 : "initial capacity must be greater than 0"; - assert loadFactor > 0 && loadFactor < 1 : "load factor must be between 0 and 1"; - - this.loadFactor = loadFactor; - this.hasher = hasher; - this.bigArrays = bigArrays; - - capacity = Math.max(1, Long.highestOneBit((long) (initialCapacity / loadFactor)) << 1); - mask = capacity - 1; - size = 0; - grow = (long) (capacity * loadFactor); - - table = bigArrays.newLongArray(capacity, false); - table.fill(0, capacity, -1); - offsets = bigArrays.newLongArray(initialCapacity + 1, false); - offsets.set(0, 0); - keys = bigArrays.newByteArray(initialCapacity * 3, false); - } - - /** - * Adds the given key to the hash table and returns its ordinal. - * If the key exists already, it returns (-1 - ordinal). - */ - public long add(final BytesRef key) { - final long hash = hasher.hash(key); - final long fingerprint = hash & MASK_FINGERPRINT; - - for (long idx = hash & mask, value, ordinal;; idx = (idx + 1) & mask) { - if ((value = table.get(idx)) == -1) { - final long val = fingerprint | size; - if (size >= grow) { - growAndInsert(hash, val); - } else { - table.set(idx, val); - } - return append(key); - } else if (((value & MASK_FINGERPRINT) == fingerprint) && key.bytesEquals(get(ordinal = (value & MASK_ORDINAL), scratch))) { - return -1 - ordinal; - } - } - } - - /** - * Returns the ordinal associated with the given key, or -1 if the key doesn't exist. - * - *

- * Using the 64-bit hash value, up to 32 least significant bits (LSB) are used to identify the - * home slot in the hash table, and an additional 32 bits are used to identify the fingerprint. - * The fingerprint further increases the entropy and reduces the number of false lookups in the - * keys' table during equality checks, which is expensive. - * - *

- * Total entropy bits = 32 + log2(capacity) - * - *

- * Linear probing starts from the home slot, until a match or an empty slot is found. - * Values are first checked using their fingerprint (to reduce false positives), then verified - * in the keys' table using an equality check. - */ - public long find(final BytesRef key) { - final long hash = hasher.hash(key); - final long fingerprint = hash & MASK_FINGERPRINT; - - for (long idx = hash & mask, value, ordinal;; idx = (idx + 1) & mask) { - if ((value = table.get(idx)) == -1) { - return -1; - } else if (((value & MASK_FINGERPRINT) == fingerprint) && key.bytesEquals(get(ordinal = (value & MASK_ORDINAL), scratch))) { - return ordinal; - } - } - } - - /** - * Returns the key associated with the given ordinal. - * The result is undefined for an unused ordinal. - * - *

- * Beware that the content of the {@link BytesRef} may become invalid as soon as {@link #close()} is called - */ - public BytesRef get(final long ordinal, final BytesRef dest) { - final long start = offsets.get(ordinal); - final int length = (int) (offsets.get(ordinal + 1) - start); - keys.get(start, length, dest); - return dest; - } - - /** - * Returns the number of mappings in this hash table. - */ - public long size() { - return size; - } - - /** - * Appends the key in the keys' and offsets' tables. - */ - private long append(final BytesRef key) { - final long start = offsets.get(size); - final long end = start + key.length; - offsets = bigArrays.grow(offsets, size + 2); - offsets.set(size + 1, end); - keys = bigArrays.grow(keys, end); - keys.set(start, key.bytes, key.offset, key.length); - return size++; - } - - /** - * Grows the hash table by doubling its capacity, inserting the provided value, - * and reinserting the previous values at their updated slots. - */ - private void growAndInsert(final long hash, final long value) { - // Ensure that the hash table doesn't grow too large. - // This implicitly also ensures that the ordinals are no larger than 2^32, thus, - // preventing them from polluting the fingerprint bits in the hash table values. - assert capacity < MAX_CAPACITY : "hash table already at the max capacity"; - - capacity <<= 1; - mask = capacity - 1; - grow = (long) (capacity * loadFactor); - table = bigArrays.grow(table, capacity); - table.fill(0, capacity, -1); - table.set(hash & mask, value); - - for (long ordinal = 0; ordinal < size; ordinal++) { - reinsert(ordinal, hasher.hash(get(ordinal, scratch))); - } - } - - /** - * Reinserts the hash table value for an existing key stored at the given ordinal. - */ - private void reinsert(final long ordinal, final long hash) { - for (long idx = hash & mask;; idx = (idx + 1) & mask) { - if (table.get(idx) == -1) { - table.set(idx, (hash & MASK_FINGERPRINT) | ordinal); - return; - } - } - } - - @Override - public void close() { - Releasables.close(table, offsets, keys); - } - - /** - * Hasher calculates the hash of a {@link BytesRef} key. - */ - @FunctionalInterface - public interface Hasher { - long hash(BytesRef key); - } -} diff --git a/server/src/main/java/org/opensearch/common/util/ReorganizingBytesRefHash.java b/server/src/main/java/org/opensearch/common/util/ReorganizingBytesRefHash.java deleted file mode 100644 index 1806733cc1567..0000000000000 --- a/server/src/main/java/org/opensearch/common/util/ReorganizingBytesRefHash.java +++ /dev/null @@ -1,301 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The OpenSearch Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -package org.opensearch.common.util; - -import net.openhft.hashing.LongHashFunction; -import org.apache.lucene.util.BytesRef; -import org.opensearch.common.lease.Releasable; -import org.opensearch.common.lease.Releasables; -import org.opensearch.core.common.util.ByteArray; - -import java.security.AccessController; -import java.security.PrivilegedAction; - -/** - * Specialized hash table implementation that maps a {@link BytesRef} key to a long ordinal. - * - *

- * It organizes itself by moving keys around dynamically in order to reduce the - * longest probe sequence length (PSL), which makes lookups faster as keys are likely to - * be found in the same CPU cache line. It also uses fingerprints to short-circuit expensive - * equality checks for keys that collide onto the same hash table slot. - * - *

- * This class is not thread-safe. - * - * @opensearch.internal - */ -public class ReorganizingBytesRefHash implements Releasable { - private static final LongHashFunction XX3 = AccessController.doPrivileged( - (PrivilegedAction) () -> LongHashFunction.xx3(System.nanoTime()) - ); - - private static final long MAX_CAPACITY = 1L << 32; - private static final long DEFAULT_INITIAL_CAPACITY = 32; - private static final float DEFAULT_LOAD_FACTOR = 0.6f; - private static final Hasher DEFAULT_HASHER = key -> XX3.hashBytes(key.bytes, key.offset, key.length); - - private static final long MASK_ORDINAL = 0x00000000FFFFFFFFL; // extract ordinal - private static final long MASK_FINGERPRINT = 0x0000FFFF00000000L; // extract fingerprint - private static final long MASK_PSL = 0x7FFF000000000000L; // extract PSL - private static final long INCR_PSL = 0x0001000000000000L; // increment PSL by one - - /** - * Maximum load factor after which the capacity is doubled. - */ - private final float loadFactor; - - /** - * Calculates the hash of a {@link BytesRef} key. - */ - private final Hasher hasher; - - /** - * Utility class to allocate recyclable arrays. - */ - private final BigArrays bigArrays; - - /** - * Reusable BytesRef to read keys. - */ - private final BytesRef scratch = new BytesRef(); - - /** - * Current capacity of the hash table. This must be a power of two so that the hash table slot - * can be identified quickly using bitmasks, thus avoiding expensive modulo or integer division. - */ - private long capacity; - - /** - * Bitmask to identify the hash table slot from a key's hash. - */ - private long mask; - - /** - * Size threshold after which the hash table needs to be doubled in capacity. - */ - private long grow; - - /** - * Current size of the hash table. - */ - private long size; - - /** - * Underlying array to store the hash table values. - * - *

- * Each hash table value (64-bit) uses the following byte packing strategy: - *

-     * |=========|===============|================|================================|
-     * | Discard | PSL           | Fingerprint    | Ordinal                        |
-     * |    -    |---------------|----------------|--------------------------------|
-     * | 1 bit   | 15 bits       | 16 bits        | 32 bits                        |
-     * |=========|===============|================|================================|
-     * 
- * - *

- * This allows us to encode and manipulate additional information in the hash table - * itself without having to look elsewhere in the memory, which is much slower. - * - *

- * Terminology: table[index] = value = (discard | psl | fingerprint | ordinal) - */ - private LongArray table; - - /** - * Underlying array to store the starting offsets of keys. - * - *

- * Terminology: - *

-     *   offsets[ordinal] = starting offset (inclusive)
-     *   offsets[ordinal + 1] = ending offset (exclusive)
-     * 
- */ - private LongArray offsets; - - /** - * Underlying byte array to store the keys. - * - *

- * Terminology: keys[start...end] = key - */ - private ByteArray keys; - - public ReorganizingBytesRefHash(final BigArrays bigArrays) { - this(DEFAULT_INITIAL_CAPACITY, DEFAULT_LOAD_FACTOR, DEFAULT_HASHER, bigArrays); - } - - public ReorganizingBytesRefHash(final long initialCapacity, final float loadFactor, final Hasher hasher, final BigArrays bigArrays) { - assert initialCapacity > 0 : "initial capacity must be greater than 0"; - assert loadFactor > 0 && loadFactor < 1 : "load factor must be between 0 and 1"; - - this.loadFactor = loadFactor; - this.hasher = hasher; - this.bigArrays = bigArrays; - - capacity = Math.max(1, Long.highestOneBit((long) (initialCapacity / loadFactor)) << 1); - mask = capacity - 1; - size = 0; - grow = (long) (capacity * loadFactor); - - table = bigArrays.newLongArray(capacity, false); - table.fill(0, capacity, -1); - offsets = bigArrays.newLongArray(initialCapacity + 1, false); - offsets.set(0, 0); - keys = bigArrays.newByteArray(initialCapacity * 3, false); - } - - /** - * Adds the given key to the hash table and returns its ordinal. - * If the key exists already, it returns (-1 - ordinal). - */ - public long add(final BytesRef key) { - final long hash = hasher.hash(key); - final long fingerprint = hash & MASK_FINGERPRINT; - - for (long idx = hash & mask, value, ordinal;; idx = (idx + 1) & mask) { - if ((value = table.get(idx)) == -1) { - final long val = (fingerprint | size); - if (size >= grow) { - growAndInsert(hash, val); - } else { - insert(hash, val); - } - return append(key); - } else if (((value & MASK_FINGERPRINT) == fingerprint) && key.bytesEquals(get(ordinal = (value & MASK_ORDINAL), scratch))) { - return -(1 + ordinal); - } - } - } - - /** - * Returns the ordinal associated with the given key, or -1 if the key doesn't exist. - * - *

- * Using the 64-bit hash value, up to 32 least significant bits (LSB) are used to identify the - * home slot in the hash table, and an additional 16 bits are used to identify the fingerprint. - * The fingerprint further increases the entropy and reduces the number of false lookups in the - * keys' table during equality checks, which is expensive. - * - *

- * Total entropy bits = 16 + log2(capacity) - * - *

- * Linear probing starts from the home slot, until a match or an empty slot is found. - * Values are first checked using their fingerprint (to reduce false positives), then verified - * in the keys' table using an equality check. - */ - public long find(final BytesRef key) { - final long hash = hasher.hash(key); - final long fingerprint = hash & MASK_FINGERPRINT; - - for (long idx = hash & mask, value, ordinal;; idx = (idx + 1) & mask) { - if ((value = table.get(idx)) == -1) { - return -1; - } else if (((value & MASK_FINGERPRINT) == fingerprint) && key.bytesEquals(get(ordinal = (value & MASK_ORDINAL), scratch))) { - return ordinal; - } - } - } - - /** - * Returns the key associated with the given ordinal. - * The result is undefined for an unused ordinal. - * - *

- * Beware that the content of the {@link BytesRef} may become invalid as soon as {@link #close()} is called - */ - public BytesRef get(final long ordinal, final BytesRef dest) { - final long start = offsets.get(ordinal); - final int length = (int) (offsets.get(ordinal + 1) - start); - keys.get(start, length, dest); - return dest; - } - - /** - * Returns the number of mappings in this hash table. - */ - public long size() { - return size; - } - - /** - * Appends the key in the keys' and offsets' tables. - */ - private long append(final BytesRef key) { - final long start = offsets.get(size); - final long end = start + key.length; - offsets = bigArrays.grow(offsets, size + 2); - offsets.set(size + 1, end); - keys = bigArrays.grow(keys, end); - keys.set(start, key.bytes, key.offset, key.length); - return size++; - } - - /** - * Grows the hash table by doubling its capacity, inserting the provided value, - * and reinserting the previous values at their updated slots. - */ - private void growAndInsert(final long hash, final long value) { - // Ensure that the hash table doesn't grow too large. - // This implicitly also ensures that the ordinals are no larger than 2^32, thus, - // preventing them from polluting the fingerprint bits in the hash table values. - assert capacity < MAX_CAPACITY : "hash table already at the max capacity"; - - capacity <<= 1; - mask = capacity - 1; - grow = (long) (capacity * loadFactor); - table = bigArrays.grow(table, capacity); - table.fill(0, capacity, -1); - table.set(hash & mask, value); - - for (long ordinal = 0; ordinal < size; ordinal++) { - final long h = hasher.hash(get(ordinal, scratch)); - insert(h, (h & MASK_FINGERPRINT) | ordinal); - } - } - - /** - * Inserts the hash table value for a missing key. - */ - private void insert(final long hash, final long value) { - for (long idx = hash & mask, current = value, existing;; idx = (idx + 1) & mask) { - if ((existing = table.get(idx)) == -1) { - table.set(idx, current); - return; - } else if ((existing & MASK_PSL) < (current & MASK_PSL)) { - current = table.set(idx, current); - } - current += INCR_PSL; - } - } - - @Override - public void close() { - Releasables.close(table, offsets, keys); - } - - /** - * Returns the underlying hash table. - * Visible for unit-tests. - */ - LongArray getTable() { - return table; - } - - /** - * Hasher calculates the hash of a {@link BytesRef} key. 
- */ - @FunctionalInterface - public interface Hasher { - long hash(BytesRef key); - } -} diff --git a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/BytesKeyedBucketOrds.java b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/BytesKeyedBucketOrds.java index 034a435032546..e4dfbb7761fc1 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/BytesKeyedBucketOrds.java +++ b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/BytesKeyedBucketOrds.java @@ -37,7 +37,6 @@ import org.opensearch.common.util.BytesRefHash; import org.opensearch.common.lease.Releasable; import org.opensearch.common.lease.Releasables; -import org.opensearch.common.util.CompactBytesRefHash; import org.opensearch.search.aggregations.CardinalityUpperBound; /** @@ -129,10 +128,10 @@ public void readValue(BytesRef dest) {} * @opensearch.internal */ private static class FromSingle extends BytesKeyedBucketOrds { - private final CompactBytesRefHash ords; + private final BytesRefHash ords; private FromSingle(BigArrays bigArrays) { - ords = new CompactBytesRefHash(bigArrays); + ords = new BytesRefHash(bigArrays); } @Override @@ -191,7 +190,7 @@ private static class FromMany extends BytesKeyedBucketOrds { private final LongKeyedBucketOrds longToBucketOrds; private FromMany(BigArrays bigArrays) { - bytesToLong = new BytesRefHash(1, bigArrays); + bytesToLong = new BytesRefHash(bigArrays); longToBucketOrds = LongKeyedBucketOrds.build(bigArrays, CardinalityUpperBound.MANY); } diff --git a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/SignificanceLookup.java b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/SignificanceLookup.java index aa1409a7bec78..9fc606ff9b324 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/SignificanceLookup.java +++ b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/SignificanceLookup.java @@ -123,7 +123,7 @@ public void close() {} }; } return new BackgroundFrequencyForBytes() { - private final BytesRefHash termToPosition = new BytesRefHash(1, bigArrays); + private final BytesRefHash termToPosition = new BytesRefHash(bigArrays); private LongArray positionToFreq = bigArrays.newLongArray(1, false); @Override diff --git a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/StringRareTermsAggregator.java b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/StringRareTermsAggregator.java index b12c397b00fe2..8d98e7c2f40a7 100644 --- a/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/StringRareTermsAggregator.java +++ b/server/src/main/java/org/opensearch/search/aggregations/bucket/terms/StringRareTermsAggregator.java @@ -135,7 +135,7 @@ public InternalAggregation[] buildAggregations(long[] owningBucketOrds) throws I Arrays.fill(mergeMap, -1); long offset = 0; for (int owningOrdIdx = 0; owningOrdIdx < owningBucketOrds.length; owningOrdIdx++) { - try (BytesRefHash bucketsInThisOwningBucketToCollect = new BytesRefHash(1, context.bigArrays())) { + try (BytesRefHash bucketsInThisOwningBucketToCollect = new BytesRefHash(context.bigArrays())) { filters[owningOrdIdx] = newFilter(); List builtBuckets = new ArrayList<>(); BytesKeyedBucketOrds.BucketOrdsEnum collectedBuckets = bucketOrds.ordsEnum(owningBucketOrds[owningOrdIdx]); diff --git a/server/src/test/java/org/opensearch/common/util/BytesRefHashTests.java b/server/src/test/java/org/opensearch/common/util/BytesRefHashTests.java index 
8b719283ed71d..2c0b305d4d481 100644 --- a/server/src/test/java/org/opensearch/common/util/BytesRefHashTests.java +++ b/server/src/test/java/org/opensearch/common/util/BytesRefHashTests.java @@ -32,6 +32,7 @@ package org.opensearch.common.util; +import net.openhft.hashing.LongHashFunction; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; import org.apache.lucene.tests.util.TestUtil; @@ -44,6 +45,7 @@ import java.util.Map; import java.util.Map.Entry; import java.util.Set; +import java.util.stream.Stream; public class BytesRefHashTests extends OpenSearchTestCase { @@ -57,9 +59,13 @@ private void newHash() { if (hash != null) { hash.close(); } - // Test high load factors to make sure that collision resolution works fine - final float maxLoadFactor = 0.6f + randomFloat() * 0.39f; - hash = new BytesRefHash(randomIntBetween(0, 100), maxLoadFactor, randomBigArrays()); + LongHashFunction hasher = LongHashFunction.xx3(randomLong()); + hash = new BytesRefHash( + randomIntBetween(1, 100), // random capacity + 0.6f + randomFloat() * 0.39f, // random load factor to verify collision resolution + key -> hasher.hashBytes(key.bytes, key.offset, key.length), + randomBigArrays() + ); } @Override @@ -68,39 +74,34 @@ public void setUp() throws Exception { newHash(); } - public void testDuel() { - final int len = randomIntBetween(1, 100000); - final BytesRef[] values = new BytesRef[len]; - for (int i = 0; i < values.length; ++i) { - values[i] = new BytesRef(randomAlphaOfLength(5)); - } - final Map valueToId = new HashMap<>(); - final BytesRef[] idToValue = new BytesRef[values.length]; - final int iters = randomInt(1000000); - for (int i = 0; i < iters; ++i) { - final BytesRef value = randomFrom(values); - if (valueToId.containsKey(value)) { - assertEquals(-1 - valueToId.get(value), hash.add(value, value.hashCode())); + public void testFuzzy() { + Map reference = new HashMap<>(); + BytesRef[] keys = Stream.generate(() -> new BytesRef(randomAlphaOfLength(20))) + .limit(randomIntBetween(1000, 2000)) + .toArray(BytesRef[]::new); + + // Verify the behaviour of "add" and "find". + for (int i = 0; i < keys.length * 10; i++) { + BytesRef key = keys[i % keys.length]; + if (reference.containsKey(key)) { + long expectedOrdinal = reference.get(key); + assertEquals(-1 - expectedOrdinal, hash.add(key)); + assertEquals(expectedOrdinal, hash.find(key)); } else { - assertEquals(valueToId.size(), hash.add(value, value.hashCode())); - idToValue[valueToId.size()] = value; - valueToId.put(value, valueToId.size()); + assertEquals(-1, hash.find(key)); + reference.put(key, (long) reference.size()); + assertEquals((long) reference.get(key), hash.add(key)); } } - assertEquals(valueToId.size(), hash.size()); - for (final var next : valueToId.entrySet()) { - assertEquals(next.getValue().longValue(), hash.find(next.getKey(), next.getKey().hashCode())); + // Verify the behaviour of "get". + BytesRef scratch = new BytesRef(); + for (Map.Entry entry : reference.entrySet()) { + assertEquals(entry.getKey(), hash.get(entry.getValue(), scratch)); } - for (long i = 0; i < hash.capacity(); ++i) { - final long id = hash.id(i); - BytesRef spare = new BytesRef(); - if (id >= 0) { - hash.get(id, spare); - assertEquals(idToValue[(int) id], spare); - } - } + // Verify the behaviour of "size". 
+ assertEquals(reference.size(), hash.size()); hash.close(); } diff --git a/server/src/test/java/org/opensearch/common/util/CompactBytesRefHashTests.java b/server/src/test/java/org/opensearch/common/util/CompactBytesRefHashTests.java deleted file mode 100644 index 297fe82a2a505..0000000000000 --- a/server/src/test/java/org/opensearch/common/util/CompactBytesRefHashTests.java +++ /dev/null @@ -1,58 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The OpenSearch Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -package org.opensearch.common.util; - -import net.openhft.hashing.LongHashFunction; -import org.apache.lucene.util.BytesRef; -import org.opensearch.test.OpenSearchTestCase; - -import java.util.HashMap; -import java.util.Map; -import java.util.stream.Stream; - -public class CompactBytesRefHashTests extends OpenSearchTestCase { - - public void testFuzzy() { - LongHashFunction hasher = LongHashFunction.xx3(randomLong()); - Map reference = new HashMap<>(); - BytesRef[] keys = Stream.generate(() -> new BytesRef(randomAlphaOfLength(20))).limit(1000).toArray(BytesRef[]::new); - - try ( - CompactBytesRefHash h = new CompactBytesRefHash( - randomIntBetween(1, 100), // random capacity - 0.6f + randomFloat() * 0.39f, // random load factor to verify collision resolution - key -> hasher.hashBytes(key.bytes, key.offset, key.length), - BigArrays.NON_RECYCLING_INSTANCE - ) - ) { - // Verify the behaviour of "add" and "find". - for (int i = 0; i < keys.length * 10; i++) { - BytesRef key = keys[i % keys.length]; - if (reference.containsKey(key)) { - long expectedOrdinal = reference.get(key); - assertEquals(-1 - expectedOrdinal, h.add(key)); - assertEquals(expectedOrdinal, h.find(key)); - } else { - assertEquals(-1, h.find(key)); - reference.put(key, (long) reference.size()); - assertEquals((long) reference.get(key), h.add(key)); - } - } - - // Verify the behaviour of "get". - BytesRef scratch = new BytesRef(); - for (Map.Entry entry : reference.entrySet()) { - assertEquals(entry.getKey(), h.get(entry.getValue(), scratch)); - } - - // Verify the behaviour of "size". - assertEquals(reference.size(), h.size()); - } - } -} diff --git a/server/src/test/java/org/opensearch/common/util/ReorganizingBytesRefHashTests.java b/server/src/test/java/org/opensearch/common/util/ReorganizingBytesRefHashTests.java deleted file mode 100644 index e7ab6d1fa21a6..0000000000000 --- a/server/src/test/java/org/opensearch/common/util/ReorganizingBytesRefHashTests.java +++ /dev/null @@ -1,70 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The OpenSearch Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. 
- */ - -package org.opensearch.common.util; - -import net.openhft.hashing.LongHashFunction; -import org.apache.lucene.util.BytesRef; -import org.opensearch.test.OpenSearchTestCase; - -import java.util.HashMap; -import java.util.Map; -import java.util.stream.Stream; - -public class ReorganizingBytesRefHashTests extends OpenSearchTestCase { - - public void testFuzzy() { - LongHashFunction hasher = LongHashFunction.xx3(randomLong()); - Map reference = new HashMap<>(); - BytesRef[] keys = Stream.generate(() -> new BytesRef(randomAlphaOfLength(20))).limit(1000).toArray(BytesRef[]::new); - - try ( - ReorganizingBytesRefHash h = new ReorganizingBytesRefHash( - randomIntBetween(1, 100), // random capacity - 0.6f + randomFloat() * 0.39f, // random load factor to verify collision resolution - key -> hasher.hashBytes(key.bytes, key.offset, key.length), - BigArrays.NON_RECYCLING_INSTANCE - ) - ) { - // Verify the behaviour of "add" and "find". - for (int i = 0; i < keys.length * 10; i++) { - BytesRef key = keys[i % keys.length]; - if (reference.containsKey(key)) { - long expectedOrdinal = reference.get(key); - assertEquals(-1 - expectedOrdinal, h.add(key)); - assertEquals(expectedOrdinal, h.find(key)); - } else { - assertEquals(-1, h.find(key)); - reference.put(key, (long) reference.size()); - assertEquals((long) reference.get(key), h.add(key)); - } - } - - // Verify the behaviour of "get". - BytesRef scratch = new BytesRef(); - for (Map.Entry entry : reference.entrySet()) { - assertEquals(entry.getKey(), h.get(entry.getValue(), scratch)); - } - - // Verify the behaviour of "size". - assertEquals(reference.size(), h.size()); - - // Verify the calculation of PSLs. - long capacity = h.getTable().size(); - long mask = capacity - 1; - for (long idx = 0; idx < h.getTable().size(); idx++) { - long value = h.getTable().get(idx); - if (value != -1) { - BytesRef key = h.get((int) value, scratch); - long homeIdx = hasher.hashBytes(key.bytes, key.offset, key.length) & mask; - assertEquals((capacity + idx - homeIdx) & mask, value >>> 48); - } - } - } - } -}
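
As a standalone illustration of the add/find contract that the updated BytesRefHashTests above exercises, here is a minimal sketch (it assumes the OpenSearch server module and Lucene core are on the classpath; the class name is hypothetical):

import org.apache.lucene.util.BytesRef;
import org.opensearch.common.util.BigArrays;
import org.opensearch.common.util.BytesRefHash;

public class BytesRefHashExample {
    public static void main(String[] args) {
        try (BytesRefHash hash = new BytesRefHash(BigArrays.NON_RECYCLING_INSTANCE)) {
            BytesRef key = new BytesRef("opensearch");
            long ordinal = hash.add(key);   // first insertion returns a new ordinal: 0
            long dup = hash.add(key);       // repeated insertion returns (-1 - ordinal): -1
            long found = hash.find(key);    // lookup returns the ordinal: 0
            BytesRef read = hash.get(ordinal, new BytesRef());
            System.out.println(ordinal + " " + dup + " " + found + " " + read.utf8ToString());
        }
    }
}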