Skip to content

Commit

Permalink
GH-38414 [Java] Add byte array hashing to ArrowBufHasher.
Browse files Browse the repository at this point in the history
Used in the next Java PR to hash values when writing to a dictionary in batched
mode.
  • Loading branch information
manolama committed Oct 23, 2023
1 parent 3beb93a commit 9e7991b
Show file tree
Hide file tree
Showing 5 changed files with 125 additions and 23 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -44,4 +44,13 @@ public interface ArrowBufHasher {
* @return the hash code.
*/
int hashCode(ArrowBuf buf, long offset, long length);

/**
* Calculates the hash code for a region of a byte array.
* @param buf the non-null byte array.
* @param offset index within {@code buf} at which the region to hash starts.
* @param length number of bytes in the region to hash.
* @return the hash code.
*/
int hashCode(byte[] buf, int offset, int length);
}
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,11 @@ public int hashCode(ArrowBuf buf, long offset, long length) {
return hashCode(buf.memoryAddress() + offset, length);
}

/**
* Calculates the hash code for a region of a byte array by delegating to the
* static four-argument overload with this hasher's {@code seed}.
* @param buf the non-null byte array.
* @param offset offset into the byte array.
* @param length length of the region.
* @return the hash code.
*/
@Override
public int hashCode(byte[] buf, int offset, int length) {
return hashCode(buf, offset, length, seed);
}

/**
* Calculates the hash code for a memory region.
* @param buf the buffer for the memory region.
Expand Down Expand Up @@ -106,6 +111,36 @@ public static int hashCode(long address, long length, int seed) {
return finalizeHashCode(hash, length);
}

/**
 * Calculates the hash code for a region of a byte array.
 *
 * <p>The region {@code [offset, offset + length)} is consumed four bytes at a time
 * through {@code readInt}; a trailing run shorter than four bytes is combined into
 * one additional int.
 *
 * @param buffer the non-null buffer to read.
 * @param offset index within {@code buffer} at which the region starts.
 * @param length number of bytes in the region.
 * @param seed the seed.
 * @return the hash code.
 */
public static int hashCode(byte[] buffer, int offset, int length, int seed) {
  // Exclusive end of the region. index starts at offset, so every bound must be
  // relative to offset as well; comparing against length alone silently drops the
  // bytes in [length, offset + length) whenever offset is non-zero.
  final int end = offset + length;
  int index = offset;
  int hash = seed;
  while (end - index >= 4) {
    int intValue = readInt(buffer, index, 4);
    hash = combineHashCode(hash, intValue);
    index += 4;
  }

  if (index < end) {
    // process remaining data as an integer in little endian
    // NOTE(review): index is advanced while i counts down, so the loop stops before
    // it has visited every remaining byte when two or three bytes are left. Kept
    // as-is to avoid changing hash values for offset == 0 -- confirm against the
    // address-based overload before altering.
    int intValue = 0;
    for (int i = end - 1; i >= index; i--) {
      intValue <<= 8;
      intValue |= (buffer[i] & 0x000000ff);
      index += 1;
    }
    hash = combineHashCode(hash, intValue);
  }
  return finalizeHashCode(hash, length);
}

/**
* Combine the current hash code and a new int value to calculate
* a new hash code.
Expand Down Expand Up @@ -172,4 +207,13 @@ public boolean equals(Object o) {
public int hashCode() {
// The seed is used directly as this hasher object's own hash code.
return seed;
}

/**
 * Packs {@code len} consecutive bytes of {@code buffer}, starting at {@code offset},
 * into an int. Bytes are treated as unsigned and the first byte read ends up in the
 * most significant position.
 *
 * @param buffer the array to read from.
 * @param offset index of the first byte to read.
 * @param len number of bytes to read.
 * @return the packed value.
 */
private static int readInt(byte[] buffer, int offset, int len) {
  final int end = offset + len;
  int value = 0;
  int pos = offset;
  while (pos < end) {
    final int unsigned = buffer[pos] & 0xff;
    value = (value << 8) | unsigned;
    pos++;
  }
  return value;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

package org.apache.arrow.memory.util.hash;

import java.nio.ByteBuffer;

import org.apache.arrow.memory.ArrowBuf;
import org.apache.arrow.memory.util.MemoryUtil;
Expand Down Expand Up @@ -92,6 +93,34 @@ public int hashCode(ArrowBuf buf, long offset, long length) {
return hashCode(buf.memoryAddress() + offset, length);
}

/**
 * Calculates the hash code for a region of a byte array.
 *
 * <p>The region is consumed as 8-byte longs while at least eight bytes remain, then
 * at most one 4-byte int, then single bytes. Each value is folded into the running
 * hash with {@code combineHashCode}.
 *
 * @param buf the non-null byte array.
 * @param offset index within {@code buf} at which the region starts.
 * @param length number of bytes in the region.
 * @return the hash code.
 */
@Override
public int hashCode(byte[] buf, int offset, int length) {
  int hashValue = 0;
  // index counts bytes consumed so far, i.e. it is relative to offset.
  int index = 0;
  while (index + 8 <= length) {
    long longValue = ByteBuffer.wrap(buf, offset + index, 8).getLong();
    int longHash = getLongHashCode(longValue);
    hashValue = combineHashCode(hashValue, longHash);
    index += 8;
  }

  if (index + 4 <= length) {
    int intValue = ByteBuffer.wrap(buf, offset + index, 4).getInt();
    int intHash = intValue;
    hashValue = combineHashCode(hashValue, intHash);
    index += 4;
  }

  while (index < length) {
    // Must add offset here too: reading buf[index] would hash bytes from the start
    // of the array instead of the tail of the requested region.
    byte byteValue = buf[offset + index];
    int byteHash = byteValue;
    hashValue = combineHashCode(hashValue, byteHash);
    index += 1;
  }

  return finalizeHashCode(hashValue);
}

/**
 * Folds a new hash value into the running hash code.
 *
 * @param currentHashCode the hash accumulated so far.
 * @param newHashCode the hash of the next value.
 * @return the running hash multiplied by 37, plus the new hash (int arithmetic wraps).
 */
protected int combineHashCode(int currentHashCode, int newHashCode) {
  final int scaled = 37 * currentHashCode;
  return scaled + newHashCode;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,11 @@ public int hashCode(ArrowBuf buf, long offset, long length) {
return SimpleHasher.INSTANCE.hashCode(buf, offset, length);
}

@Override
public int hashCode(byte[] buf, int offset, int length) {
// Placeholder: every byte-array region hashes to the same constant.
// NOTE(review): the ArrowBuf overload just above delegates to
// SimpleHasher.INSTANCE; confirm whether this overload should do the same.
return 0;
}

@Override
public int hashCode() {
return super.hashCode();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,30 +64,39 @@ public void shutdown() {
public void testHasher() {
// Fills two buffers with identical float data and checks that the hasher returns
// equal hash codes for equal regions, for both ArrowBuf and byte[] inputs.
try (ArrowBuf buf1 = allocator.buffer(BUFFER_LENGTH);
ArrowBuf buf2 = allocator.buffer(BUFFER_LENGTH)) {
byte[] ba1 = new byte[BUFFER_LENGTH];
byte[] ba2 = new byte[BUFFER_LENGTH];

// prepare data
for (int i = 0; i < BUFFER_LENGTH / 4; i++) {
buf1.setFloat(i * 4, i / 10.0f);
buf2.setFloat(i * 4, i / 10.0f);
}

// NOTE(review): the six-argument calls below do not match the eight-argument
// verifyHashCodesEqual signature; they look like the pre-change lines of the
// diff -- confirm.
verifyHashCodesEqual(buf1, 0, 100, buf2, 0, 100);
verifyHashCodesEqual(buf1, 1, 5, buf2, 1, 5);
verifyHashCodesEqual(buf1, 10, 17, buf2, 10, 17);
verifyHashCodesEqual(buf1, 33, 25, buf2, 33, 25);
verifyHashCodesEqual(buf1, 22, 22, buf2, 22, 22);
verifyHashCodesEqual(buf1, 123, 333, buf2, 123, 333);
verifyHashCodesEqual(buf1, 374, 1, buf2, 374, 1);
verifyHashCodesEqual(buf1, 11, 0, buf2, 11, 0);
verifyHashCodesEqual(buf1, 75, 25, buf2, 75, 25);
verifyHashCodesEqual(buf1, 0, 1024, buf2, 0, 1024);
// read the buffer contents into the byte arrays used for the byte[] overload
buf1.getBytes(0, ba1);
buf2.getBytes(0, ba2);

// each call passes (offset, length) pairs, including an empty region (11, 0)
verifyHashCodesEqual(buf1, ba1, 0, 100, buf2, ba2, 0, 100);
verifyHashCodesEqual(buf1, ba1, 1, 5, buf2, ba2, 1, 5);
verifyHashCodesEqual(buf1, ba1, 10, 17, buf2, ba2, 10, 17);
verifyHashCodesEqual(buf1, ba1, 33, 25, buf2, ba2, 33, 25);
verifyHashCodesEqual(buf1, ba1, 22, 22, buf2, ba2, 22, 22);
verifyHashCodesEqual(buf1, ba1, 123, 333, buf2, ba2, 123, 333);
verifyHashCodesEqual(buf1, ba1, 374, 1, buf2, ba2, 374, 1);
verifyHashCodesEqual(buf1, ba1, 11, 0, buf2, ba2, 11, 0);
verifyHashCodesEqual(buf1, ba1, 75, 25, buf2, ba2, 75, 25);
verifyHashCodesEqual(buf1, ba1, 0, 1024, buf2, ba2, 0, 1024);
}
}

// Asserts that the hasher gives equal hash codes for the two ArrowBuf regions and,
// separately, for the two byte[] regions. It does not compare an ArrowBuf hash
// against a byte[] hash.
// NOTE(review): two signatures appear below (six and eight parameters); this looks
// like the before/after lines of the diff -- confirm which one is current.
private void verifyHashCodesEqual(ArrowBuf buf1, int offset1, int length1,
ArrowBuf buf2, int offset2, int length2) {
private void verifyHashCodesEqual(ArrowBuf buf1, byte[] ba1, int offset1, int length1,
ArrowBuf buf2, byte[] ba2, int offset2, int length2) {
int hashCode1 = hasher.hashCode(buf1, offset1, length1);
int hashCode2 = hasher.hashCode(buf2, offset2, length2);
assertEquals(hashCode1, hashCode2);

// same check through the byte[] overload
hashCode1 = hasher.hashCode(ba1, offset1, length1);
hashCode2 = hasher.hashCode(ba2, offset2, length2);
assertEquals(hashCode1, hashCode2);
}

@Test
Expand Down Expand Up @@ -116,30 +125,36 @@ public void testHasherNegative() {
public void testHasherLessThanInt() {
// Hashes prefixes of length 1 to 4 of two different 4-byte strings and expects the
// hash codes to differ for every prefix length.
try (ArrowBuf buf1 = allocator.buffer(4);
ArrowBuf buf2 = allocator.buffer(4)) {
// NOTE(review): both the literal writeBytes calls and the ba1/ba2 writeBytes calls
// are present; this looks like the before/after lines of the diff -- confirm.
buf1.writeBytes("foo1".getBytes(StandardCharsets.UTF_8));
buf2.writeBytes("bar2".getBytes(StandardCharsets.UTF_8));
byte[] ba1 = "foo1".getBytes(StandardCharsets.UTF_8);
byte[] ba2 = "bar2".getBytes(StandardCharsets.UTF_8);
buf1.writeBytes(ba1);
buf2.writeBytes(ba2);

for (int i = 1; i <= 4; i ++) {
verifyHashCodeNotEqual(buf1, 0, i, buf2, 0, i);
verifyHashCodeNotEqual(buf1, ba1, 0, i, buf2, ba2, 0, i);
}
}
}

// Asserts that the hasher gives different hash codes for the two ArrowBuf inputs
// and, separately, for the two byte[] inputs.
// NOTE(review): two signatures appear below (six and eight parameters); this looks
// like the before/after lines of the diff -- confirm which one is current.
private void verifyHashCodeNotEqual(ArrowBuf buf1, int offset1, int length1,
ArrowBuf buf2, int offset2, int length2) {
private void verifyHashCodeNotEqual(ArrowBuf buf1, byte[] ba1, int offset1, int length1,
ArrowBuf buf2, byte[] ba2, int offset2, int length2) {
// NOTE(review): offset1 and offset2 are accepted but never used; every call below
// passes 0 as the offset -- confirm whether that is intended.
int hashCode1 = hasher.hashCode(buf1, 0, length1);
int hashCode2 = hasher.hashCode(buf2, 0, length2);
assertNotEquals(hashCode1, hashCode2);

// same check through the byte[] overload
hashCode1 = hasher.hashCode(ba1, 0, length1);
hashCode2 = hasher.hashCode(ba2, 0, length2);
assertNotEquals(hashCode1, hashCode2);
}

// Supplies the parameter sets for the test: a display name paired with a hasher
// instance, one entry for SimpleHasher and one for MurmurHasher.
// NOTE(review): the entries appear twice with no separating comma; this looks like
// the before/after lines of a whitespace-only diff -- confirm.
@Parameterized.Parameters(name = "hasher = {0}")
public static Collection<Object[]> getHasher() {
return Arrays.asList(
new Object[] {SimpleHasher.class.getSimpleName(),
SimpleHasher.INSTANCE},
new Object[] {MurmurHasher.class.getSimpleName(),
new MurmurHasher()
}
new Object[] {SimpleHasher.class.getSimpleName(),
SimpleHasher.INSTANCE},
new Object[] {MurmurHasher.class.getSimpleName(),
new MurmurHasher()
}
);
}
}

0 comments on commit 9e7991b

Please sign in to comment.