exec: use go runtime hash functions in hashjoiner

The previous hash functions, derived from Java's hashCode implementations, were adequate for producing a hash that could be used for equality (my understanding is that hashCode is primarily used for this), but their distribution properties were unclear. Additionally, hashing was not supported for some types. This commit copies Go's non-cryptographic hash functions used for maps in order to support hashing for all types and to get good distribution properties with minimal performance impact. The hash algorithm used is derived from https://github.com/Cyan4973/xxHash and https://github.com/google/cityhash, and was tested using https://github.com/aappleby/smhasher. Benchmark results are shared in the PR. Release note: None
cockroachdb · Mar 4, 2019 · f63890e · f63890e
1 parent fb3c5c0
commit f63890e
Show file tree

Hide file tree

Showing 5 changed files with 219 additions and 29 deletions.
diff --git a/pkg/sql/exec/execgen/cmd/execgen/overloads.go b/pkg/sql/exec/execgen/cmd/execgen/overloads.go
@@ -245,10 +245,11 @@ type bytesCustomizer struct{}
 // variable-set semantics.
 type decimalCustomizer struct{}
 
-// float32Customizer and float64Customizer are necessary since float32 and
-// float64 require additional logic for hashing.
-type float32Customizer struct{}
-type float64Customizer struct{}
+// floatCustomizer generates the hash assignment for floats of the given bit width.
+type floatCustomizer struct{ width int }
+
+// intCustomizer generates the hash assignment for ints of the given bit width.
+type intCustomizer struct{ width int }
 
 func (boolCustomizer) getCmpOpAssignFunc() assignFunc {
 	return func(op overload, target, l, r string) string {
@@ -264,11 +265,11 @@ func (boolCustomizer) getCmpOpAssignFunc() assignFunc {
 func (boolCustomizer) getHashAssignFunc() assignFunc {
 	return func(op overload, target, v, _ string) string {
 		return fmt.Sprintf(`
-			x := uint64(0)
+			x := 0
 			if %[2]s {
     		x = 1
 			}
-			%[1]s = x
+			%[1]s = %[1]s*31 + uintptr(x)
 		`, target, v)
 	}
 }
@@ -289,11 +290,9 @@ func (bytesCustomizer) getCmpOpAssignFunc() assignFunc {
 func (bytesCustomizer) getHashAssignFunc() assignFunc {
 	return func(op overload, target, v, _ string) string {
 		return fmt.Sprintf(`
-			_temp := 1
-			for b := range %s {
-				_temp = _temp*31 + b
-			}
-			%s = uint64(hash)
+			sh := (*reflect.SliceHeader)(unsafe.Pointer(&%[1]s))
+			%[2]s = memhash(unsafe.Pointer(sh.Data), %[2]s, uintptr(len(%[1]s)))
+
 		`, v, target)
 	}
 }
@@ -319,20 +318,21 @@ func (decimalCustomizer) getHashAssignFunc() assignFunc {
 			if err != nil {
 				panic(fmt.Sprintf("%%v", err))
 			}
-			%[1]s = math.Float64bits(d)
+
+			%[1]s = f64hash(noescape(unsafe.Pointer(&d)), %[1]s)
 		`, target, v)
 	}
 }
 
-func (float32Customizer) getHashAssignFunc() assignFunc {
+func (c floatCustomizer) getHashAssignFunc() assignFunc {
 	return func(op overload, target, v, _ string) string {
-		return fmt.Sprintf("%s = uint64(math.Float32bits(%s))", target, v)
+		return fmt.Sprintf("%[1]s = f%[3]dhash(noescape(unsafe.Pointer(&%[2]s)), %[1]s)", target, v, c.width)
 	}
 }
 
-func (float64Customizer) getHashAssignFunc() assignFunc {
+func (c intCustomizer) getHashAssignFunc() assignFunc {
 	return func(op overload, target, v, _ string) string {
-		return fmt.Sprintf("%s = math.Float64bits(%s)", target, v)
+		return fmt.Sprintf("%[1]s = memhash%[3]d(noescape(unsafe.Pointer(&%[2]s)), %[1]s)", target, v, c.width)
 	}
 }
 
@@ -341,8 +341,12 @@ func registerTypeCustomizers() {
 	registerTypeCustomizer(types.Bool, boolCustomizer{})
 	registerTypeCustomizer(types.Bytes, bytesCustomizer{})
 	registerTypeCustomizer(types.Decimal, decimalCustomizer{})
-	registerTypeCustomizer(types.Float32, float32Customizer{})
-	registerTypeCustomizer(types.Float64, float64Customizer{})
+	registerTypeCustomizer(types.Float32, floatCustomizer{width: 32})
+	registerTypeCustomizer(types.Float64, floatCustomizer{width: 64})
+	registerTypeCustomizer(types.Int8, intCustomizer{width: 8})
+	registerTypeCustomizer(types.Int16, intCustomizer{width: 16})
+	registerTypeCustomizer(types.Int32, intCustomizer{width: 32})
+	registerTypeCustomizer(types.Int64, intCustomizer{width: 64})
 }
 
 // Avoid unused warning for Assign, which is only used in templates.

diff --git a/pkg/sql/exec/hash.go b/pkg/sql/exec/hash.go
@@ -0,0 +1,183 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the golang.org/LICENSE file.
+
+// Hashing algorithm inspired by
+//   xxhash: https://code.google.com/p/xxhash/
+// cityhash: https://code.google.com/p/cityhash/
+// Most of the code in this file is copied from the go runtime package. These
+// are the hash functions used for go maps.
+
+package exec
+
+import (
+	"math/rand"
+	"unsafe"
+)
+
+const (
+	ptrSize = 4 << (^uintptr(0) >> 63) // unsafe.Sizeof(uintptr(0)) but an ideal const
+	c0      = uintptr((8-ptrSize)/4*2860486313 + (ptrSize-4)/4*33054211828000289)
+	c1      = uintptr((8-ptrSize)/4*3267000013 + (ptrSize-4)/4*23344194077549503)
+	// Constants for multiplication: four random odd 64-bit numbers.
+	m1 = 16877499708836156737
+	m2 = 2820277070424839065
+	m3 = 9497967016996688599
+	m4 = 15839092249703872147
+)
+
+// hashKey is used to seed the hash function.
+var hashKey [4]uintptr
+
+func init() {
+	rand.Read((*[len(hashKey) * ptrSize]byte)(unsafe.Pointer(&hashKey))[:])
+	hashKey[0] |= 1 // make sure these numbers are odd
+	hashKey[1] |= 1
+	hashKey[2] |= 1
+	hashKey[3] |= 1
+}
+
+func readUnaligned32(p unsafe.Pointer) uint32 {
+	return *(*uint32)(p)
+}
+
+func readUnaligned64(p unsafe.Pointer) uint64 {
+	return *(*uint64)(p)
+}
+
+// add returns p advanced by x bytes. Should be a built-in for unsafe.Pointer?
+//go:nosplit
+func add(p unsafe.Pointer, x uintptr) unsafe.Pointer {
+	return unsafe.Pointer(uintptr(p) + x)
+}
+
+//go:linkname noescape runtime.noescape
+func noescape(p unsafe.Pointer) unsafe.Pointer
+
+func memhash(p unsafe.Pointer, seed, s uintptr) uintptr {
+	h := uint64(seed + s*hashKey[0])
+tail:
+	switch {
+	case s == 0:
+	case s < 4:
+		h ^= uint64(*(*byte)(p))
+		h ^= uint64(*(*byte)(add(p, s>>1))) << 8
+		h ^= uint64(*(*byte)(add(p, s-1))) << 16
+		h = rotl31(h*m1) * m2
+	case s <= 8:
+		h ^= uint64(readUnaligned32(p))
+		h ^= uint64(readUnaligned32(add(p, s-4))) << 32
+		h = rotl31(h*m1) * m2
+	case s <= 16:
+		h ^= readUnaligned64(p)
+		h = rotl31(h*m1) * m2
+		h ^= readUnaligned64(add(p, s-8))
+		h = rotl31(h*m1) * m2
+	case s <= 32:
+		h ^= readUnaligned64(p)
+		h = rotl31(h*m1) * m2
+		h ^= readUnaligned64(add(p, 8))
+		h = rotl31(h*m1) * m2
+		h ^= readUnaligned64(add(p, s-16))
+		h = rotl31(h*m1) * m2
+		h ^= readUnaligned64(add(p, s-8))
+		h = rotl31(h*m1) * m2
+	default:
+		v1 := h
+		v2 := uint64(seed * hashKey[1])
+		v3 := uint64(seed * hashKey[2])
+		v4 := uint64(seed * hashKey[3])
+		for s >= 32 {
+			v1 ^= readUnaligned64(p)
+			v1 = rotl31(v1*m1) * m2
+			p = add(p, 8)
+			v2 ^= readUnaligned64(p)
+			v2 = rotl31(v2*m2) * m3
+			p = add(p, 8)
+			v3 ^= readUnaligned64(p)
+			v3 = rotl31(v3*m3) * m4
+			p = add(p, 8)
+			v4 ^= readUnaligned64(p)
+			v4 = rotl31(v4*m4) * m1
+			p = add(p, 8)
+			s -= 32
+		}
+		h = v1 ^ v2 ^ v3 ^ v4
+		goto tail
+	}
+
+	h ^= h >> 29
+	h *= m3
+	h ^= h >> 32
+	return uintptr(h)
+}
+
+func memhash8(p unsafe.Pointer, h uintptr) uintptr {
+	return memhash(p, h, 1)
+}
+
+func memhash16(p unsafe.Pointer, h uintptr) uintptr {
+	return memhash(p, h, 2)
+}
+
+func memhash32(p unsafe.Pointer, seed uintptr) uintptr {
+	h := uint64(seed + 4*hashKey[0])
+	v := uint64(readUnaligned32(p))
+	h ^= v
+	h ^= v << 32
+	h = rotl31(h*m1) * m2
+	h ^= h >> 29
+	h *= m3
+	h ^= h >> 32
+	return uintptr(h)
+}
+
+func memhash64(p unsafe.Pointer, seed uintptr) uintptr {
+	h := uint64(seed + 8*hashKey[0])
+	h ^= uint64(readUnaligned32(p)) | uint64(readUnaligned32(add(p, 4)))<<32
+	h = rotl31(h*m1) * m2
+	h ^= h >> 29
+	h *= m3
+	h ^= h >> 32
+	return uintptr(h)
+}
+
+// Note: in order to get the compiler to issue rotl instructions, we
+// need to constant fold the shift amount by hand.
+// TODO: convince the compiler to issue rotl instructions after inlining.
+func rotl31(x uint64) uint64 {
+	return (x << 31) | (x >> (64 - 31))
+}
+
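+// NOTE: Because NaN != NaN, a Go map can contain any number of (mostly
+// useless) entries keyed with NaNs. To avoid long hash chains, a random number
+// is assigned as the hash value for a NaN (see the f != f cases below).
+// NOTE(review): confirm this map-oriented behavior suits hash-join NaN keys.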
+
+func f32hash(p unsafe.Pointer, h uintptr) uintptr {
+	f := *(*float32)(p)
+	switch {
+	case f == 0:
+		return c1 * (c0 ^ h) // +0, -0
+	case f != f:
+		// TODO(asubiotto): fastrand relies on some stack internals.
+		//return c1 * (c0 ^ h ^ uintptr(fastrand())) // any kind of NaN
+		return c1 * (c0 ^ h ^ uintptr(rand.Uint32()))
+	default:
+		return memhash(p, h, 4)
+	}
+}
+
+func f64hash(p unsafe.Pointer, h uintptr) uintptr {
+	f := *(*float64)(p)
+	switch {
+	case f == 0:
+		return c1 * (c0 ^ h) // +0, -0
+	case f != f:
+		// TODO(asubiotto): fastrand relies on some stack internals.
+		//return c1 * (c0 ^ h ^ uintptr(fastrand())) // any kind of NaN
+		return c1 * (c0 ^ h ^ uintptr(rand.Uint32())) // any kind of NaN
+	default:
+		return memhash(p, h, 8)
+	}
+}
diff --git a/pkg/sql/exec/hashjoiner.go b/pkg/sql/exec/hashjoiner.go
@@ -446,11 +446,9 @@ func (ht *hashTable) loadBatch(batch ColBatch) {
 // key is a tuple of various types, rehash is used to apply a transformation on
 // the resulting hash value based on an element of the key of a specified type.
 //
-// The current integer tuple hashing heuristic is based off of Java's
-// Arrays.hashCode(int[]) and only supports int8, int16, int32, and int64
-// elements. float32 and float64 are hashed according to their respective 32-bit
-// and 64-bit integer representation. bool keys are hashed as a 1 for true and 0
-// for false. bytes are hashed as an array of int8 integers.
+// We currently use the same hash functions that Go's maps use.
+// TODO(asubiotto): Once https://go-review.googlesource.com/c/go/+/155118/ is
+// in, we should use the public API.
 //
 // initHash initializes the hash value of each key to its initial state for
 // rehashing purposes.

diff --git a/pkg/sql/exec/hashjoiner_tmpl.go b/pkg/sql/exec/hashjoiner_tmpl.go
@@ -26,7 +26,8 @@ package exec
 import (
 	"bytes"
 	"fmt"
-	"math"
+	"reflect"
+	"unsafe"
 
 	"github.com/cockroachdb/cockroach/pkg/sql/exec/types"
 	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
@@ -37,8 +38,11 @@ import (
 // Dummy import to pull in "tree" package.
 var _ tree.Datum
 
-// Dummy import to pull in "math" package
-var _ = math.Pi
+// Dummy import to pull in "unsafe" package.
+var _ unsafe.Pointer
+
+// Dummy import to pull in "reflect" package.
+var _ reflect.SliceHeader
 
 // Dummy import to pull in "bytes" package.
 var _ bytes.Buffer
@@ -140,9 +144,9 @@ func _REHASH_BODY(buckets []uint64, keys []interface{}, nKeys uint64, _SEL_STRIN
 	// {{define "rehashBody"}}
 	for i := uint64(0); i < nKeys; i++ {
 		v := keys[_SEL_IND]
-		var hash uint64
-		_ASSIGN_HASH(hash, v)
-		buckets[i] = buckets[i]*31 + hash
+		p := uintptr(buckets[i])
+		_ASSIGN_HASH(p, v)
+		buckets[i] = uint64(p)
 	}
 	// {{end}}
 

diff --git a/pkg/sql/exec/noescape.s b/pkg/sql/exec/noescape.s
@@ -0,0 +1 @@
+// Empty assembly file: lets this package declare body-less functions so that go:linkname works.
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		// Empty assembly file to allow go:linkname to work.