Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize bloom filter #2588

Merged
merged 20 commits into from
Jan 8, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions network/p2p/gossip/bloom.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ import (
// [maxExpectedElements] elements anticipated at any moment, and a false
// positive probability of [falsePositiveProbability].
func NewBloomFilter(
StephenButtolph marked this conversation as resolved.
Show resolved Hide resolved
maxExpectedElements uint64,
maxExpectedElements int,
falsePositiveProbability float64,
) (*BloomFilter, error) {
bloom, err := bloom.New(bloom.OptimalParameters(
Expand Down Expand Up @@ -65,11 +65,15 @@ func ResetBloomFilterIfNeeded(
bloomFilter *BloomFilter,
falsePositiveProbability float64,
) (bool, error) {
if bloomFilter.bloom.FalsePositiveProbability() < falsePositiveProbability {
numSeeds, numBytes := bloomFilter.bloom.Parameters()
// TODO: Precalculate maxEntries, as it is independent of the current state
// of the bloom filter.
maxEntries := bloom.EstimateEntries(numSeeds, numBytes, falsePositiveProbability)
patrick-ogrady marked this conversation as resolved.
Show resolved Hide resolved
if bloomFilter.bloom.Count() < maxEntries {
patrick-ogrady marked this conversation as resolved.
Show resolved Hide resolved
return false, nil
}

newBloom, err := bloom.New(bloomFilter.bloom.Parameters())
newBloom, err := bloom.New(numSeeds, numBytes)
if err != nil {
return false, err
}
Expand Down
14 changes: 8 additions & 6 deletions pubsub/bloom/bloom_filter.go → pubsub/bloom/filter.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ import (
"github.com/ava-labs/avalanchego/utils/bloom"
)

const bytesPerSeed = 8

var errMaxBytes = errors.New("too large")

type Filter interface {
StephenButtolph marked this conversation as resolved.
Show resolved Hide resolved
Expand All @@ -19,27 +21,27 @@ type Filter interface {
Check([]byte) bool
}

func New(maxN uint64, p float64, maxBytes int) (Filter, error) {
func New(maxN int, p float64, maxBytes int) (Filter, error) {
numSeeds, numBytes := bloom.OptimalParameters(maxN, p)
if neededBytes := 1 + numSeeds*8 + numBytes; neededBytes > maxBytes {
if neededBytes := 1 + numSeeds*bytesPerSeed + numBytes; neededBytes > maxBytes {
return nil, errMaxBytes
}
f, err := bloom.New(numSeeds, numBytes)
return &bloomFitler{
return &fitler{
filter: f,
}, err
}

type bloomFitler struct {
type fitler struct {
StephenButtolph marked this conversation as resolved.
Show resolved Hide resolved
filter *bloom.Filter
}

func (f *bloomFitler) Add(bl ...[]byte) {
func (f *fitler) Add(bl ...[]byte) {
for _, b := range bl {
bloom.Add(f.filter, b, nil)
}
}

func (f *bloomFitler) Check(b []byte) bool {
func (f *fitler) Check(b []byte) bool {
return bloom.Contains(f.filter, b, nil)
}
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@ import (

func TestNew(t *testing.T) {
var (
require = require.New(t)
maxN uint64 = 10000
p = 0.1
maxBytes = 1 * units.MiB // 1 MiB
require = require.New(t)
maxN = 10000
p = 0.1
maxBytes = 1 * units.MiB // 1 MiB
)
f, err := New(maxN, p, maxBytes)
require.NoError(err)
Expand Down
2 changes: 1 addition & 1 deletion pubsub/connection.go
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ func (c *connection) handleNewBloom(cmd *NewBloom) error {
if !cmd.IsParamsValid() {
return ErrInvalidFilterParam
}
filter, err := bloom.New(uint64(cmd.MaxElements), float64(cmd.CollisionProb), MaxBytes)
filter, err := bloom.New(int(cmd.MaxElements), float64(cmd.CollisionProb), MaxBytes)
dhrubabasu marked this conversation as resolved.
Show resolved Hide resolved
if err != nil {
return fmt.Errorf("bloom filter creation failed %w", err)
}
Expand Down
38 changes: 8 additions & 30 deletions utils/bloom/filter.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ import (
"encoding/binary"
"errors"
"fmt"
"math"
"math/bits"
"sync"
)
Expand All @@ -27,7 +26,6 @@ var (
errInvalidNumSeeds = errors.New("invalid num seeds")
errTooFewSeeds = errors.New("too few seeds")
errTooManySeeds = errors.New("too many seeds")
errPaddedNumSeeds = errors.New("number of seeds unnecessarily padded")
errTooFewEntries = errors.New("too few entries")
)

Expand Down Expand Up @@ -80,18 +78,13 @@ func (f *Filter) Add(hash uint64) {
f.count++
}

// FalsePositiveProbability is a lower-bound on the probability of false
// positives. For values where numBits >> numSeeds, the predicted probability is
// fairly accurate.
func (f *Filter) FalsePositiveProbability() float64 {
numSeeds := float64(len(f.seeds))
numBits := float64(f.numBits)

// Count returns the number of elements that have been added to the bloom
// filter.
func (f *Filter) Count() int {
f.lock.RLock()
numAdded := float64(f.count)
f.lock.RUnlock()
defer f.lock.RUnlock()

return falsePositiveProbability(numSeeds, numBits, numAdded)
return f.count
}

func (f *Filter) Contains(hash uint64) bool {
Expand Down Expand Up @@ -128,12 +121,6 @@ func newSeeds(numSeeds int) ([]uint64, error) {
return seeds, nil
}

// falsePositiveProbability estimates the probability that a lookup reports a
// false positive for a filter with [numSeeds] seeds and [numBits] bits after
// [numAdded] elements have been added.
//
// ref: https://tsapps.nist.gov/publication/get_pdf.cfm?pub_id=903775
func falsePositiveProbability(numSeeds, numBits, numAdded float64) float64 {
	// Exponent of the estimated fraction of bits that are still unset.
	exponent := -numSeeds * numAdded / numBits
	// Estimated probability that any single probed bit is already set.
	bitSetProbability := 1. - math.Exp(exponent)
	// A false positive requires every one of the numSeeds probes to hit a
	// set bit.
	return math.Pow(bitSetProbability, numSeeds)
}

func contains(seeds []uint64, entries []byte, hash uint64) bool {
var (
numBits = bitsPerByte * uint64(len(entries))
Expand All @@ -151,22 +138,13 @@ func contains(seeds []uint64, entries []byte, hash uint64) bool {

func marshal(seeds []uint64, entries []byte) []byte {
numSeeds := len(seeds)
numSeedsUint64 := uint64(numSeeds)
seedsOffset := uintSize(numSeedsUint64)
entriesOffset := seedsOffset + numSeeds*bytesPerUint64
entriesOffset := 1 + numSeeds*bytesPerUint64

bytes := make([]byte, entriesOffset+len(entries))
binary.PutUvarint(bytes, numSeedsUint64)
bytes[0] = byte(numSeeds)
for i, seed := range seeds {
binary.BigEndian.PutUint64(bytes[seedsOffset+i*bytesPerUint64:], seed)
binary.BigEndian.PutUint64(bytes[1+i*bytesPerUint64:], seed)
}
copy(bytes[entriesOffset:], entries)
return bytes
}

// uintSize returns the number of 7-bit groups needed to hold [value], with a
// minimum of one group for zero.
func uintSize(value uint64) int {
	// OR-ing in the lowest bit leaves the bit length of every non-zero value
	// unchanged while making zero report a length of 1, so zero still needs
	// one group.
	significantBits := bits.Len64(value | 1)
	// Round up to a whole number of 7-bit groups.
	return (significantBits + 6) / 7
}
61 changes: 1 addition & 60 deletions utils/bloom/filter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,6 @@
package bloom

import (
"encoding/binary"
"fmt"
"math/rand"
"testing"

Expand Down Expand Up @@ -63,7 +61,7 @@ func TestNormalUsage(t *testing.T) {
}
}

require.InDelta(filter.FalsePositiveProbability(), 0.01, 1e-4)
require.Equal(len(toAdd), filter.Count())

numSeeds, numBytes := filter.Parameters()
require.Equal(initialNumSeeds, numSeeds)
Expand All @@ -81,51 +79,6 @@ func TestNormalUsage(t *testing.T) {
require.Equal(filterBytes, parsedFilterBytes)
}

// TestFalsePositiveProbability checks falsePositiveProbability against a table
// of filter parameters, each with its own allowed deviation from the expected
// probability.
func TestFalsePositiveProbability(t *testing.T) {
	type testCase struct {
		numSeeds      float64
		numBits       float64
		numAdded      float64
		expected      float64
		allowedDelta  float64
	}
	cases := []testCase{
		// An empty filter never reports a false positive.
		{numSeeds: 8, numBits: 10_000, numAdded: 0, expected: 0, allowedDelta: 0},
		// params from OptimalParameters(10_000, .01)
		{numSeeds: 7, numBits: 11_982 * 8, numAdded: 10_000, expected: .01, allowedDelta: 1e-4},
		// params from OptimalParameters(100_000, .001)
		{numSeeds: 10, numBits: 179_720 * 8, numAdded: 100_000, expected: .001, allowedDelta: 1e-7},
		// params from OptimalParameters(10_000, .01)
		{numSeeds: 7, numBits: 11_982 * 8, numAdded: 15_000, expected: .05, allowedDelta: .01},
	}
	for _, tc := range cases {
		name := fmt.Sprintf("%f_%f_%f", tc.numSeeds, tc.numBits, tc.numAdded)
		t.Run(name, func(t *testing.T) {
			got := falsePositiveProbability(tc.numSeeds, tc.numBits, tc.numAdded)
			require.InDelta(t, tc.expected, got, tc.allowedDelta)
		})
	}
}

func BenchmarkAdd(b *testing.B) {
f, err := New(8, 16*units.KiB)
require.NoError(b, err)
Expand All @@ -145,15 +98,3 @@ func BenchmarkMarshal(b *testing.B) {
f.Marshal()
}
}

// FuzzUintSize checks that uintSize agrees with the number of bytes the
// standard library uses to uvarint-encode the same value.
func FuzzUintSize(f *testing.F) {
	// Seed the corpus with zero and every power of two.
	f.Add(uint64(0))
	for shift := 0; shift < 64; shift++ {
		f.Add(uint64(1) << shift)
	}
	f.Fuzz(func(t *testing.T, value uint64) {
		// PutUvarint reports how many bytes the encoding occupies.
		var scratch [binary.MaxVarintLen64]byte
		want := binary.PutUvarint(scratch[:], value)
		got := uintSize(value)
		require.Equal(t, want, got)
	})
}
31 changes: 24 additions & 7 deletions utils/bloom/optimal.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,22 +4,39 @@
package bloom

import (
"math"

bloomfilter "github.com/holiman/bloomfilter/v2"

"github.com/ava-labs/avalanchego/utils/math"
safemath "github.com/ava-labs/avalanchego/utils/math"
)

// OptimalParameters calculates the optimal [numSeeds] and [numBytes] that
// should be allocated for a bloom filter which will contain [maxEntries]
// entries and target a false positive probability of
// [falsePositiveProbability].
func OptimalParameters(maxEntries uint64, falsePositiveProbability float64) (int, int) {
optimalNumBits := bloomfilter.OptimalM(maxEntries, falsePositiveProbability)
func OptimalParameters(maxEntries int, falsePositiveProbability float64) (int, int) {
optimalNumBits := bloomfilter.OptimalM(uint64(maxEntries), falsePositiveProbability)
numBytes := (optimalNumBits + bitsPerByte - 1) / bitsPerByte
numBytes = math.Max(numBytes, minEntries)
numBytes = safemath.Max(numBytes, minEntries)
numBits := numBytes * bitsPerByte

numSeeds := bloomfilter.OptimalK(numBits, maxEntries)
numSeeds = math.Max(numSeeds, minSeeds)
numSeeds = math.Min(numSeeds, maxSeeds)
numSeeds := bloomfilter.OptimalK(numBits, uint64(maxEntries))
numSeeds = safemath.Max(numSeeds, minSeeds)
numSeeds = safemath.Min(numSeeds, maxSeeds)
return int(numSeeds), int(numBytes)
}

// EstimateEntries estimates the number of entries that must be added to a bloom
// filter with [numSeeds] and [numBytes] to reach [falsePositiveProbability].
// This is derived by inverting a lower-bound on the probability of false
// positives. For values where numBits >> numSeeds, the predicted probability is
// fairly accurate.
//
// The result is clamped so that it is well defined on every platform:
//   - If the estimate is not a number (for example when [numSeeds] is 0 or
//     [falsePositiveProbability] is greater than 1), 0 is returned.
//   - If the estimate does not fit into an int (for example when
//     [falsePositiveProbability] is 1 and the estimate is +Inf), math.MaxInt
//     is returned.
//
// ref: https://tsapps.nist.gov/publication/get_pdf.cfm?pub_id=903775
func EstimateEntries(numSeeds, numBytes int, falsePositiveProbability float64) int {
	invNumSeeds := 1 / float64(numSeeds)
	numBits := float64(numBytes * 8)
	exp := 1 - math.Pow(falsePositiveProbability, invNumSeeds)
	entries := math.Ceil(-math.Log(exp) * numBits * invNumSeeds)

	// Converting NaN or an out-of-range float to an int is
	// implementation-dependent in Go, so those cases are handled explicitly
	// rather than relying on the conversion.
	switch {
	case math.IsNaN(entries):
		return 0
	case entries >= math.MaxInt:
		return math.MaxInt
	default:
		return int(entries)
	}
}
58 changes: 58 additions & 0 deletions utils/bloom/optimal_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
// Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved.
// See the file LICENSE for licensing terms.

package bloom

import (
"fmt"
"math"
"testing"

"github.com/stretchr/testify/require"
)

// TestEstimateEntries checks EstimateEntries against a table of filter
// parameters and target false positive probabilities.
func TestEstimateEntries(t *testing.T) {
	type testCase struct {
		numSeeds                 int
		numBytes                 int
		falsePositiveProbability float64
		expectedEntries          int
	}
	cases := []testCase{
		{8, 2_048, 0, 0},
		// params from OptimalParameters(10_000, .01)
		{7, 11_982, .01, 9_993},
		// params from OptimalParameters(100_000, .001)
		{10, 179_720, .001, 100_000},
		// params from OptimalParameters(10_000, .01)
		{7, 11_982, .05, 14_449},
		// params from OptimalParameters(10_000, .01)
		{7, 11_982, 1, math.MaxInt},
	}
	for _, tc := range cases {
		name := fmt.Sprintf("%d_%d_%f", tc.numSeeds, tc.numBytes, tc.falsePositiveProbability)
		t.Run(name, func(t *testing.T) {
			got := EstimateEntries(tc.numSeeds, tc.numBytes, tc.falsePositiveProbability)
			require.Equal(t, tc.expectedEntries, got)
		})
	}
}
Loading
Loading