From 9e7991b087a456773d49c4939eb3265b9a61d871 Mon Sep 17 00:00:00 2001
From: Chris Larsen
Date: Mon, 23 Oct 2023 09:58:18 -0700
Subject: [PATCH] GH-38414 [Java] Add byte array hashing to ArrowBufHasher.

Used in the next Java PR to hash values when writing to a dictionary
in batched mode.
---
 .../memory/util/hash/ArrowBufHasher.java      |  9 ++
 .../arrow/memory/util/hash/MurmurHasher.java  | 52 ++++++++++++
 .../arrow/memory/util/hash/SimpleHasher.java  | 24 ++++++
 .../memory/util/TestArrowBufPointer.java      |  5 +
 .../memory/util/hash/TestArrowBufHasher.java  | 79 ++++++++++++++-----
 5 files changed, 144 insertions(+), 25 deletions(-)

diff --git a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/hash/ArrowBufHasher.java b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/hash/ArrowBufHasher.java
index 0de8e62a4a4b7..d1aa7e7df94a4 100644
--- a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/hash/ArrowBufHasher.java
+++ b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/hash/ArrowBufHasher.java
@@ -44,4 +44,13 @@ public interface ArrowBufHasher {
    * @return the hash code.
    */
   int hashCode(ArrowBuf buf, long offset, long length);
+
+  /**
+   * Calculates the hash code for a byte array.
+   * @param buf the non-null byte array.
+   * @param offset offset within the buffer for the memory region.
+   * @param length length of the memory region.
+   * @return the hash code.
+   */
+  int hashCode(byte[] buf, int offset, int length);
 }
diff --git a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/hash/MurmurHasher.java b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/hash/MurmurHasher.java
index 75fc3f0c45831..5aa0bd4ea117e 100644
--- a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/hash/MurmurHasher.java
+++ b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/hash/MurmurHasher.java
@@ -64,6 +64,11 @@ public int hashCode(ArrowBuf buf, long offset, long length) {
     return hashCode(buf.memoryAddress() + offset, length);
   }
 
+  @Override
+  public int hashCode(byte[] buf, int offset, int length) {
+    return hashCode(buf, offset, length, seed);
+  }
+
   /**
    * Calculates the hash code for a memory region.
    * @param buf the buffer for the memory region.
@@ -106,6 +111,37 @@ public static int hashCode(long address, long length, int seed) {
     return finalizeHashCode(hash, length);
   }
 
+  /**
+   * Calculates the hash code for a region of a byte array.
+   * @param buffer the non-null byte array to read.
+   * @param offset index of the first byte of the region.
+   * @param length number of bytes in the region.
+   * @param seed the seed.
+   * @return the hash code.
+   */
+  public static int hashCode(byte[] buffer, int offset, int length, int seed) {
+    // length is a byte count, so the region ends (exclusively) at offset + length
+    final int end = offset + length;
+    int index = offset;
+    int hash = seed;
+    while (index + 4 <= end) {
+      int intValue = readInt(buffer, index, 4);
+      hash = combineHashCode(hash, intValue);
+      index += 4;
+    }
+
+    if (index < end) {
+      // process the remaining 1-3 bytes as an integer in little endian
+      int intValue = 0;
+      for (int i = end - 1; i >= index; i--) {
+        intValue <<= 8;
+        intValue |= (buffer[i] & 0x000000ff);
+      }
+      hash = combineHashCode(hash, intValue);
+    }
+    return finalizeHashCode(hash, length);
+  }
+
   /**
    * Combine the current hash code and a new int value to calculate
    * a new hash code.
@@ -172,4 +208,20 @@ public boolean equals(Object o) {
   public int hashCode() {
     return seed;
   }
+
+  /**
+   * Reads {@code len} bytes starting at {@code offset} as a big-endian integer.
+   * @param buffer the array to read from.
+   * @param offset index of the first byte to read.
+   * @param len number of bytes to read (callers pass 4).
+   * @return the bytes packed into an int, first byte most significant.
+   */
+  private static int readInt(byte[] buffer, int offset, int len) {
+    int result = 0;
+    for (int i = offset; i < offset + len; i++) {
+      result <<= 8;
+      result |= (buffer[i] & 0x000000ff);
+    }
+    return result;
+  }
 }
diff --git a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/hash/SimpleHasher.java b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/hash/SimpleHasher.java
index da0ee482997f2..5c34e16bc414b 100644
--- a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/hash/SimpleHasher.java
+++ b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/hash/SimpleHasher.java
@@ -17,6 +17,7 @@
 
 package org.apache.arrow.memory.util.hash;
 
+import java.nio.ByteBuffer;
 import org.apache.arrow.memory.ArrowBuf;
 import org.apache.arrow.memory.util.MemoryUtil;
 
@@ -92,6 +93,29 @@ public int hashCode(ArrowBuf buf, long offset, long length) {
     return hashCode(buf.memoryAddress() + offset, length);
   }
 
+  @Override
+  public int hashCode(byte[] buf, int offset, int length) {
+    // A single view over [offset, offset + length) keeps every read inside the region;
+    // wrap() also rejects an offset/length pair that does not fit the array.
+    ByteBuffer region = ByteBuffer.wrap(buf, offset, length);
+    int hashValue = 0;
+    while (region.remaining() >= 8) {
+      int longHash = getLongHashCode(region.getLong());
+      hashValue = combineHashCode(hashValue, longHash);
+    }
+
+    if (region.remaining() >= 4) {
+      hashValue = combineHashCode(hashValue, region.getInt());
+    }
+
+    // fewer than 4 bytes are left; fold them in one at a time (sign-extended)
+    while (region.hasRemaining()) {
+      hashValue = combineHashCode(hashValue, region.get());
+    }
+
+    return finalizeHashCode(hashValue);
+  }
+
   protected int combineHashCode(int currentHashCode, int newHashCode) {
     return currentHashCode * 37 + newHashCode;
   }
diff --git a/java/memory/memory-core/src/test/java/org/apache/arrow/memory/util/TestArrowBufPointer.java b/java/memory/memory-core/src/test/java/org/apache/arrow/memory/util/TestArrowBufPointer.java
index a1d5624a7e8c0..49c10787fbe8d 100644
--- a/java/memory/memory-core/src/test/java/org/apache/arrow/memory/util/TestArrowBufPointer.java
+++ b/java/memory/memory-core/src/test/java/org/apache/arrow/memory/util/TestArrowBufPointer.java
@@ -204,6 +204,11 @@ public int hashCode(ArrowBuf buf, long offset, long length) {
       return SimpleHasher.INSTANCE.hashCode(buf, offset, length);
     }
 
+    @Override
+    public int hashCode(byte[] buf, int offset, int length) {
+      return 0;
+    }
+
     @Override
     public int hashCode() {
       return super.hashCode();
diff --git a/java/memory/memory-core/src/test/java/org/apache/arrow/memory/util/hash/TestArrowBufHasher.java b/java/memory/memory-core/src/test/java/org/apache/arrow/memory/util/hash/TestArrowBufHasher.java
index 3da0602bdfd9c..d016b7b50e25b 100644
--- a/java/memory/memory-core/src/test/java/org/apache/arrow/memory/util/hash/TestArrowBufHasher.java
+++ b/java/memory/memory-core/src/test/java/org/apache/arrow/memory/util/hash/TestArrowBufHasher.java
@@ -64,30 +64,39 @@ public void shutdown() {
   public void testHasher() {
     try (ArrowBuf buf1 = allocator.buffer(BUFFER_LENGTH);
          ArrowBuf buf2 = allocator.buffer(BUFFER_LENGTH)) {
+      byte[] ba1 = new byte[BUFFER_LENGTH];
+      byte[] ba2 = new byte[BUFFER_LENGTH];
+
       // prepare data
       for (int i = 0; i < BUFFER_LENGTH / 4; i++) {
         buf1.setFloat(i * 4, i / 10.0f);
         buf2.setFloat(i * 4, i / 10.0f);
       }
-
-      verifyHashCodesEqual(buf1, 0, 100, buf2, 0, 100);
-      verifyHashCodesEqual(buf1, 1, 5, buf2, 1, 5);
-      verifyHashCodesEqual(buf1, 10, 17, buf2, 10, 17);
-      verifyHashCodesEqual(buf1, 33, 25, buf2, 33, 25);
-      verifyHashCodesEqual(buf1, 22, 22, buf2, 22, 22);
-      verifyHashCodesEqual(buf1, 123, 333, buf2, 123, 333);
-      verifyHashCodesEqual(buf1, 374, 1, buf2, 374, 1);
-      verifyHashCodesEqual(buf1, 11, 0, buf2, 11, 0);
-      verifyHashCodesEqual(buf1, 75, 25, buf2, 75, 25);
-      verifyHashCodesEqual(buf1, 0, 1024, buf2, 0, 1024);
+      buf1.getBytes(0, ba1);
+      buf2.getBytes(0, ba2);
+
+      verifyHashCodesEqual(buf1, ba1, 0, 100, buf2, ba2, 0, 100);
+      verifyHashCodesEqual(buf1, ba1, 1, 5, buf2, ba2, 1, 5);
+      verifyHashCodesEqual(buf1, ba1, 10, 17, buf2, ba2, 10, 17);
+      verifyHashCodesEqual(buf1, ba1, 33, 25, buf2, ba2, 33, 25);
+      verifyHashCodesEqual(buf1, ba1, 22, 22, buf2, ba2, 22, 22);
+      verifyHashCodesEqual(buf1, ba1, 123, 333, buf2, ba2, 123, 333);
+      verifyHashCodesEqual(buf1, ba1, 374, 1, buf2, ba2, 374, 1);
+      verifyHashCodesEqual(buf1, ba1, 11, 0, buf2, ba2, 11, 0);
+      verifyHashCodesEqual(buf1, ba1, 75, 25, buf2, ba2, 75, 25);
+      verifyHashCodesEqual(buf1, ba1, 0, 1024, buf2, ba2, 0, 1024);
     }
   }
 
-  private void verifyHashCodesEqual(ArrowBuf buf1, int offset1, int length1,
-                                    ArrowBuf buf2, int offset2, int length2) {
+  private void verifyHashCodesEqual(ArrowBuf buf1, byte[] ba1, int offset1, int length1,
+                                    ArrowBuf buf2, byte[] ba2, int offset2, int length2) {
     int hashCode1 = hasher.hashCode(buf1, offset1, length1);
     int hashCode2 = hasher.hashCode(buf2, offset2, length2);
     assertEquals(hashCode1, hashCode2);
+
+    hashCode1 = hasher.hashCode(ba1, offset1, length1);
+    hashCode2 = hasher.hashCode(ba2, offset2, length2);
+    assertEquals(hashCode1, hashCode2);
   }
 
   @Test
@@ -116,30 +125,50 @@ public void testHasherNegative() {
   public void testHasherLessThanInt() {
     try (ArrowBuf buf1 = allocator.buffer(4);
          ArrowBuf buf2 = allocator.buffer(4)) {
-      buf1.writeBytes("foo1".getBytes(StandardCharsets.UTF_8));
-      buf2.writeBytes("bar2".getBytes(StandardCharsets.UTF_8));
+      byte[] ba1 = "foo1".getBytes(StandardCharsets.UTF_8);
+      byte[] ba2 = "bar2".getBytes(StandardCharsets.UTF_8);
+      buf1.writeBytes(ba1);
+      buf2.writeBytes(ba2);
 
       for (int i = 1; i <= 4; i ++) {
-        verifyHashCodeNotEqual(buf1, 0, i, buf2, 0, i);
+        verifyHashCodeNotEqual(buf1, ba1, 0, i, buf2, ba2, 0, i);
       }
     }
   }
 
-  private void verifyHashCodeNotEqual(ArrowBuf buf1, int offset1, int length1,
-                                      ArrowBuf buf2, int offset2, int length2) {
-    int hashCode1 = hasher.hashCode(buf1, 0, length1);
-    int hashCode2 = hasher.hashCode(buf2, 0, length2);
+  private void verifyHashCodeNotEqual(ArrowBuf buf1, byte[] ba1, int offset1, int length1,
+                                      ArrowBuf buf2, byte[] ba2, int offset2, int length2) {
+    int hashCode1 = hasher.hashCode(buf1, offset1, length1);
+    int hashCode2 = hasher.hashCode(buf2, offset2, length2);
     assertNotEquals(hashCode1, hashCode2);
+
+    hashCode1 = hasher.hashCode(ba1, offset1, length1);
+    hashCode2 = hasher.hashCode(ba2, offset2, length2);
+    assertNotEquals(hashCode1, hashCode2);
   }
 
+  @Test
+  public void testByteArrayOffset() {
+    byte[] data = new byte[64];
+    for (int i = 0; i < data.length; i++) {
+      data[i] = (byte) (i * 7 + 3);
+    }
+
+    // the hash of a region must depend only on its bytes, not on where it starts
+    for (int length = 0; length <= 17; length++) {
+      byte[] region = Arrays.copyOfRange(data, 10, 10 + length);
+      assertEquals(hasher.hashCode(region, 0, length), hasher.hashCode(data, 10, length));
+    }
+  }
+
   @Parameterized.Parameters(name = "hasher = {0}")
   public static Collection getHasher() {
     return Arrays.asList(
-      new Object[] {SimpleHasher.class.getSimpleName(),
-        SimpleHasher.INSTANCE},
-      new Object[] {MurmurHasher.class.getSimpleName(),
-        new MurmurHasher()
-      }
+        new Object[] {SimpleHasher.class.getSimpleName(),
+            SimpleHasher.INSTANCE},
+        new Object[] {MurmurHasher.class.getSimpleName(),
+            new MurmurHasher()
+        }
     );
   }
 }