Skip to content

Commit bc27f17

Browse files
liyafan82Pindikura Ravindra
authored and committed
ARROW-6024: [Java] Provide more hash algorithms
Provide more hash algorithms to choose from for different scenarios. In particular, we provide the following hash algorithms: * Simple hasher: A hasher that calculates the hash code of integers as is, and does not perform any finalization. The computation is therefore extremely efficient, but the quality of the produced hash code may not be good. * Murmur finalizing hasher: Finalize the hash code by the Murmur hashing algorithm. Details of the algorithm can be found in https://en.wikipedia.org/wiki/MurmurHash. Murmur hashing is computationally expensive, as it involves several integer multiplications. However, the produced hash codes have good quality in the sense that they are uniformly distributed in the universe. * Jenkins finalizing hasher: Finalize the hash code by Bob Jenkins' algorithm. Details of this algorithm can be found in http://www.burtleburtle.net/bob/hash/integer.html. Jenkins hashing is less computationally expensive than Murmur hashing, as it involves no integer multiplication. Nevertheless, the produced hash codes also have good quality in the sense that they are uniformly distributed in the universe. * Non-negative hasher: Wrapper for another hasher, to make the generated hash code non-negative. This can be useful for scenarios such as hash tables. Closes apache#4934 from liyafan82/fly_0724_hash and squashes the following commits: e4d4663 <Pindikura Ravindra> Update MurmurHasher.java 8bcd5a4 <liyafan82> Provide more hash algorithms Lead-authored-by: Liya Fan <fan_li_ya@foxmail.com> Co-authored-by: Pindikura Ravindra <ravindra@dremio.com> Co-authored-by: liyafan82 <fan_li_ya@foxmail.com> Signed-off-by: Micah Kornfield <emkornfield@gmail.com>
1 parent af9e583 commit bc27f17

8 files changed

Lines changed: 371 additions & 247 deletions

File tree

java/memory/src/main/java/org/apache/arrow/memory/util/ArrowBufPointer.java

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,10 @@
1717

1818
package org.apache.arrow.memory.util;
1919

20+
import java.nio.ByteOrder;
21+
2022
import org.apache.arrow.memory.util.hash.ArrowBufHasher;
21-
import org.apache.arrow.memory.util.hash.DirectHasher;
23+
import org.apache.arrow.memory.util.hash.SimpleHasher;
2224
import org.apache.arrow.util.Preconditions;
2325

2426
import io.netty.buffer.ArrowBuf;
@@ -29,6 +31,8 @@
2931
*/
3032
public final class ArrowBufPointer {
3133

34+
public static final boolean LITTLE_ENDIAN = ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN;
35+
3236
/**
3337
* The hash code when the arrow buffer is null.
3438
*/
@@ -53,7 +57,7 @@ public final class ArrowBufPointer {
5357
* The default constructor.
5458
*/
5559
public ArrowBufPointer() {
56-
this(DirectHasher.INSTANCE);
60+
this(SimpleHasher.INSTANCE);
5761
}
5862

5963
/**
@@ -72,7 +76,7 @@ public ArrowBufPointer(ArrowBufHasher hasher) {
7276
* @param length the length of the memory region pointed to.
7377
*/
7478
public ArrowBufPointer(ArrowBuf buf, int offset, int length) {
75-
this(buf, offset, length, DirectHasher.INSTANCE);
79+
this(buf, offset, length, SimpleHasher.INSTANCE);
7680
}
7781

7882
/**

java/memory/src/main/java/org/apache/arrow/memory/util/ByteFunctionHelpers.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919

2020
import org.apache.arrow.memory.BoundsChecking;
2121
import org.apache.arrow.memory.util.hash.ArrowBufHasher;
22-
import org.apache.arrow.memory.util.hash.DirectHasher;
22+
import org.apache.arrow.memory.util.hash.SimpleHasher;
2323

2424
import io.netty.buffer.ArrowBuf;
2525
import io.netty.util.internal.PlatformDependent;
@@ -253,7 +253,7 @@ private static int memcmp(
253253
*/
254254
public static final int hash(final ArrowBuf buf, int start, int end) {
255255

256-
ArrowBufHasher hasher = DirectHasher.INSTANCE;
256+
ArrowBufHasher hasher = SimpleHasher.INSTANCE;
257257

258258
return hasher.hashCode(buf, start, end - start);
259259
}

java/memory/src/main/java/org/apache/arrow/memory/util/hash/ArrowBufHasher.java

Lines changed: 4 additions & 112 deletions
Original file line numberDiff line numberDiff line change
@@ -17,93 +17,24 @@
1717

1818
package org.apache.arrow.memory.util.hash;
1919

20-
import static io.netty.util.internal.PlatformDependent.getByte;
21-
import static io.netty.util.internal.PlatformDependent.getInt;
22-
import static io.netty.util.internal.PlatformDependent.getLong;
23-
24-
import java.nio.ByteOrder;
25-
2620
import io.netty.buffer.ArrowBuf;
2721

2822
/**
2923
* Utility for calculating the hash code for a consecutive memory region.
3024
* This class provides the basic framework for efficiently calculating the hash code.
31-
* It first splits the memory region into small segments with 8 bytes, 4 bytes and 1 byte,
32-
* and calculates hash codes for them separately. It produces the final hash code by combining
33-
* the hash codes and finalizing the resulting hash code.
34-
*
3525
* <p>
36-
* To compute the hash code, the user simply calls the hashCode methods with the starting
37-
* address and length of the memory region.
38-
* </p>
39-
* <p>
40-
* A default light-weight implementation of this class is given in {@link DirectHasher}. However, the users can
41-
* devise their own customized hasher by sub-classing this method and overriding the abstract methods.
42-
* In particular
43-
* <li>
44-
* {@link ArrowBufHasher#combineHashCode(int, int)} provides the method for combining hash
45-
* codes for individual small segments.
46-
* </li>
47-
* <li>
48-
* {@link ArrowBufHasher#finalizeHashCode(int)} provides the method for finalizing the hash code.
49-
* </li>
50-
* <li>
51-
* {@link ArrowBufHasher#getByteHashCode(byte)} provides the method for calculating the hash code
52-
* for 1-byte memory segment.
53-
* </li>
54-
* <li>
55-
* {@link ArrowBufHasher#getIntHashCode(int)} provides the method for calculating the hash code
56-
* for 4-byte memory segment.
57-
* </li>
58-
* <li>
59-
* {@link ArrowBufHasher#getLongHashCode(long)} provides the method for calculating the hash code
60-
* for 8-byte memory segment.
61-
* </li>
26+
* A default light-weight implementation is given in {@link SimpleHasher}.
6227
* </p>
6328
*/
64-
public abstract class ArrowBufHasher {
65-
66-
public static final boolean LITTLE_ENDIAN = ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN;
29+
public interface ArrowBufHasher {
6730

6831
/**
6932
* Calculates the hash code for a memory region.
7033
* @param address start address of the memory region.
7134
* @param length length of the memory region.
7235
* @return the hash code.
7336
*/
74-
public int hashCode(long address, int length) {
75-
int hashValue = 0;
76-
int index = 0;
77-
while (index + 8 <= length) {
78-
long longValue = getLong(address + index);
79-
if (!LITTLE_ENDIAN) {
80-
// assume the buffer is in little endian
81-
longValue = Long.reverseBytes(longValue);
82-
}
83-
int longHash = getLongHashCode(longValue);
84-
hashValue = combineHashCode(hashValue, longHash);
85-
index += 8;
86-
}
87-
88-
while (index + 4 <= length) {
89-
int intValue = getInt(address + index);
90-
if (!LITTLE_ENDIAN) {
91-
intValue = Integer.reverseBytes(intValue);
92-
}
93-
int intHash = getIntHashCode(intValue);
94-
hashValue = combineHashCode(hashValue, intHash);
95-
index += 4;
96-
}
97-
98-
while (index < length) {
99-
byte byteValue = getByte(address + index);
100-
int byteHash = getByteHashCode(byteValue);
101-
hashValue = combineHashCode(hashValue, byteHash);
102-
index += 1;
103-
}
104-
105-
return finalizeHashCode(hashValue);
106-
}
37+
int hashCode(long address, int length);
10738

10839
/**
10940
* Calculates the hash code for a memory region.
@@ -112,44 +43,5 @@ public int hashCode(long address, int length) {
11243
* @param length length of the memory region.
11344
* @return the hash code.
11445
*/
115-
public int hashCode(ArrowBuf buf, int offset, int length) {
116-
buf.checkBytes(offset, offset + length);
117-
return hashCode(buf.memoryAddress() + offset, length);
118-
}
119-
120-
/**
121-
* Calculates the hash code by combining the existing hash code and a new hash code.
122-
* @param currentHashCode the existing hash code.
123-
* @param newHashCode the new hash code.
124-
* @return the combined hash code.
125-
*/
126-
protected abstract int combineHashCode(int currentHashCode, int newHashCode);
127-
128-
/**
129-
* Gets the hash code for a byte value.
130-
* @param byteValue the byte value.
131-
* @return the hash code.
132-
*/
133-
protected abstract int getByteHashCode(byte byteValue);
134-
135-
/**
136-
* Gets the hash code for a integer value.
137-
* @param intValue the integer value.
138-
* @return the hash code.
139-
*/
140-
protected abstract int getIntHashCode(int intValue);
141-
142-
/**
143-
* Gets the hash code for a long value.
144-
* @param longValue the long value.
145-
* @return the hash code.
146-
*/
147-
protected abstract int getLongHashCode(long longValue);
148-
149-
/**
150-
* Finalize the hash code.
151-
* @param hashCode the current hash code.
152-
* @return the finalized hash code.
153-
*/
154-
protected abstract int finalizeHashCode(int hashCode);
46+
int hashCode(ArrowBuf buf, int offset, int length);
15547
}

java/memory/src/main/java/org/apache/arrow/memory/util/hash/DirectHasher.java

Lines changed: 0 additions & 87 deletions
This file was deleted.

0 commit comments

Comments
 (0)