
apbf: Switch to fast reduce method. #2584

Merged: 1 commit, Feb 4, 2021
53 changes: 27 additions & 26 deletions container/apbf/README.md
@@ -31,7 +31,7 @@ unbounded event stream with a tunable upper bound on the false positive rate.

### Additional Implementation Details

-This implementation deviates from the original paper in at least a couple of
+This implementation deviates from the original paper in at least the following
 important ways:

- It uses Dillinger-Manolis enhanced double hashing instead of the more
@@ -41,6 +41,7 @@ important ways:
 - Every filter is given a unique key for the internal hashing logic so each one
   will have a unique set of false positives and that key is automatically
   changed when the filter is manually reset by the caller
+- Lemire fast reduction is used instead of standard modular reduction

## Choosing Parameters

@@ -87,18 +88,18 @@ described above, for the parameter selection.

Capacity | Target FP | Actual Observed FP
---------|-----------|-------------------
-1000 | 0.1% | 0.099%
-10000 | 0.1% | 0.096%
+1000 | 0.1% | 0.097%
+10000 | 0.1% | 0.099%
 10000000 | 0.1% | 0.1%
-1000 | 1.0% | 0.948%
-10000 | 1.0% | 0.868%
+1000 | 1.0% | 0.867%
+10000 | 1.0% | 0.862%
 10000000 | 1.0% | 0.857%
-1000 | 2.0% | 1.448%
-10000 | 2.0% | 1.47%
-10000000 | 2.0% | 1.46%
+1000 | 2.0% | 1.464%
+10000 | 2.0% | 1.451%
+10000000 | 2.0% | 1.461%
 1000 | 10.0% | 6.74%
-10000 | 10.0% | 6.834%
-10000000 | 10.0% | 7.027%
+10000 | 10.0% | 6.96%
+10000000 | 10.0% | 7.024%

## Memory Usage

@@ -126,32 +127,32 @@ operations. The benchmarks are from a Ryzen 7 1700 processor.

Capacity | Target FP | Time / Op | Allocs / Op
---------|-----------|-------------|------------
-1000 | 0.1% | 159ns ± 5% | 0
-1000 | 0.01% | 208ns ± 0% | 0
-1000 | 0.001% | 245ns ± 0% | 0
-100000 | 0.01% | 198ns ± 3% | 0
-100000 | 0.0001% | 271ns ± 2% | 0
-1000000 | 0.00001% | 496ns ± 7% | 0
+1000 | 0.1% | 59ns ± 1% | 0
+1000 | 0.01% | 69ns ± 2% | 0
+1000 | 0.001% | 78ns ± 2% | 0
+100000 | 0.01% | 80ns ± 1% | 0
+100000 | 0.0001% | 110ns ± 1% | 0
+1000000 | 0.00001% | 205ns ± 2% | 0

### `Contains` (item matches filter, worst case)

Capacity | Target FP | Time / Op | Allocs / Op
---------|-----------|-------------|------------
-1000 | 0.1% | 175ns ± 1% | 0
-1000 | 0.01% | 228ns ± 1% | 0
-1000 | 0.001% | 267ns ± 1% | 0
-100000 | 0.01% | 211ns ± 1% | 0
-100000 | 0.0001% | 282ns ± 1% | 0
+1000 | 0.1% | 69ns ± 2% | 0
+1000 | 0.01% | 80ns ± 1% | 0
+1000 | 0.001% | 89ns ± 1% | 0
+100000 | 0.01% | 80ns ± 1% | 0
+100000 | 0.0001% | 98ns ± 1% | 0

### `Contains` (item does NOT match filter)

Capacity | Target FP | Time / Op | Allocs / Op
---------|-----------|-------------|------------
-1000 | 0.1% | 45.4ns ± 0% | 0
-1000 | 0.01% | 57.6ns ±21% | 0
-1000 | 0.001% | 56.2ns ±19% | 0
-100000 | 0.01% | 48.7ns ±15% | 0
-100000 | 0.0001% | 49.6ns ±19% | 0
+1000 | 0.1% | 42.0ns ±26% | 0
+1000 | 0.01% | 37.7ns ±5% | 0
+1000 | 0.001% | 37.0ns ±4% | 0
+100000 | 0.01% | 37.6ns ±10% | 0
+100000 | 0.0001% | 36.3ns ±6% | 0

## Installation and Updating

33 changes: 31 additions & 2 deletions container/apbf/filter.go
@@ -10,6 +10,7 @@ package apbf
 import (
 	"encoding/binary"
 	"math"
+	"math/bits"
 	"sync"
 	"time"

@@ -302,6 +303,34 @@ func (f *Filter) setBit(bit uint64) {
 	f.data[bit>>3] |= 1 << (bit & 7)
 }

+// fastReduce calculates a mapping that is more or less equivalent to x mod N.
+// However, instead of using a mod operation that can lead to slowness on many
+// processors when not using a power of two due to unnecessary division, this
+// uses a "multiply-and-shift" trick that eliminates all divisions as described
+// in a blog post by Daniel Lemire, located at the following site at the time
+// of this writing:
+// https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
+//
+// Since that link might disappear, the general idea is to multiply by N and
+// shift right by the bit width of the operands.  Since x and N are 64-bit
+// integers in this case, it becomes:
+//
+//	(x * N) / 2^64 == (x * N) >> 64
+//
+// This is a fair map since it maps integers in the range [0,2^64) to multiples
+// of N in [0, N*2^64) and then divides by 2^64 to map all multiples of N in
+// [0,2^64) to 0, all multiples of N in [2^64, 2*2^64) to 1, etc.  This results
+// in either ceil(2^64/N) or floor(2^64/N) multiples of N.
+func fastReduce(x, N uint64) uint64 {
+	// This uses math/bits to perform the 128-bit multiplication as the compiler
+	// will replace it with the relevant intrinsic on most architectures.
+	//
+	// The high 64 bits of a 128-bit product are the same as shifting the entire
+	// product right by 64 bits.
+	hi, _ := bits.Mul64(x, N)
+	return hi
+}
+
// Add inserts the provided data into the filter.
//
// This function is safe for concurrent access.
@@ -335,7 +364,7 @@ func (f *Filter) Add(data []byte) {
 	hash1, hash2 := siphash.Hash128(f.key0, f.key1, data)
 	derivedIdx, acc := deriveIndex(logicalSlice, hash1, hash2)
 	for i := uint8(0); i < f.k; i++ {
-		f.setBit(sliceBitOffset + derivedIdx%f.bitsPerSlice)
+		f.setBit(sliceBitOffset + fastReduce(derivedIdx, f.bitsPerSlice))

 		// Move to the next logical slice while wrapping around the ring buffer
 		// if needed.
@@ -404,7 +433,7 @@ func (f *Filter) Contains(data []byte) bool {
 	hash1, hash2 := siphash.Hash128(f.key0, f.key1, data)
 	derivedIdx, acc := deriveIndex(logicalSlice, hash1, hash2)
 	for {
-		if f.isBitSet(sliceBitOffset + derivedIdx%f.bitsPerSlice) {
+		if f.isBitSet(sliceBitOffset + fastReduce(derivedIdx, f.bitsPerSlice)) {
 			// Successful query when the required number of consecutive matches
 			// is achieved.
 			curMatches++
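To see the effect of the change in isolation, here is a minimal, self-contained sketch: the `fastReduce` body is taken verbatim from the diff above, while the `main` driver around it is illustrative only. It shows that the multiply-and-shift maps the 64-bit input range proportionally onto [0, N), just like a fair hash-to-bucket reduction, without any division:

```go
package main

import (
	"fmt"
	"math/bits"
)

// fastReduce maps x in [0, 2^64) onto [0, N) by taking the high 64 bits of
// the 128-bit product x*N, i.e. (x * N) >> 64, avoiding a division entirely.
func fastReduce(x, N uint64) uint64 {
	hi, _ := bits.Mul64(x, N)
	return hi
}

func main() {
	const N = 1000
	fmt.Println(fastReduce(0, N))          // smallest input maps to 0
	fmt.Println(fastReduce(1<<63, N))      // midpoint of the input range maps to N/2 = 500
	fmt.Println(fastReduce(^uint64(0), N)) // largest input maps to N-1 = 999
}
```

Note that the result for a given x generally differs from `x % N`; that is fine for this filter because the derived index being reduced is itself a hash, so only a uniform spread over [0, bitsPerSlice) is required, not modular arithmetic.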
2 changes: 1 addition & 1 deletion container/apbf/go.mod
@@ -1,5 +1,5 @@
 module github.com/decred/dcrd/container/apbf

-go 1.11
+go 1.13

 require github.com/dchest/siphash v1.2.2