diff --git a/src/bloom.cpp b/src/bloom.cpp index b2f7f1171a6c2..3a234f21f4c70 100644 --- a/src/bloom.cpp +++ b/src/bloom.cpp @@ -24,22 +24,33 @@ CBloomFilter::CBloomFilter(unsigned int nElements, double nFPRate, unsigned int nTweakIn, unsigned char nFlagsIn) : - /** - * The ideal size for a bloom filter with a given number of elements and false positive rate is: - * - nElements * log(fp rate) / ln(2)^2 - * We ignore filter parameters which will create a bloom filter larger than the protocol limits - */ + /** + * The ideal size for a bloom filter with a given number of elements and false positive rate is: + * - nElements * log(fp rate) / ln(2)^2 + * We ignore filter parameters which will create a bloom filter larger than the protocol limits + */ vData(std::min((unsigned int)(-1 / LN2SQUARED * nElements * log(nFPRate)), MAX_BLOOM_FILTER_SIZE * 8) / 8), - /** - * The ideal number of hash functions is filter size * ln(2) / number of elements - * Again, we ignore filter parameters which will create a bloom filter with more hash functions than the protocol limits - * See https://en.wikipedia.org/wiki/Bloom_filter for an explanation of these formulas - */ + /** + * The ideal number of hash functions is filter size * ln(2) / number of elements + * Again, we ignore filter parameters which will create a bloom filter with more hash functions than the protocol limits + * See https://en.wikipedia.org/wiki/Bloom_filter for an explanation of these formulas + */ isFull(false), isEmpty(false), - nHashFuncs(std::min((unsigned int)(vData.size() * 8 / nElements * LN2), MAX_HASH_FUNCS)), - nTweak(nTweakIn), - nFlags(nFlagsIn) + nHashFuncs(std::min((unsigned int)(vData.size() * 8 / nElements * LN2), MAX_HASH_FUNCS)), + nTweak(nTweakIn), + nFlags(nFlagsIn) +{ +} + +// Private constructor used by CRollingBloomFilter +CBloomFilter::CBloomFilter(unsigned int nElements, double nFPRate, unsigned int nTweakIn) : + vData((unsigned int)(-1 / LN2SQUARED * nElements * log(nFPRate)) / 8), + 
isFull(false), + isEmpty(true), + nHashFuncs((unsigned int)(vData.size() * 8 / nElements * LN2)), + nTweak(nTweakIn), + nFlags(BLOOM_UPDATE_NONE) { } @@ -243,3 +254,43 @@ void CBloomFilter::UpdateEmptyFull() isFull = full; isEmpty = empty; } + +CRollingBloomFilter::CRollingBloomFilter(unsigned int nElements, double fpRate, unsigned int nTweak) : + b1(nElements * 2, fpRate, nTweak), b2(nElements * 2, fpRate, nTweak) +{ + // Implemented using two bloom filters of 2 * nElements each. + // We fill them up, and clear them, staggered, every nElements + // inserted, so at least one always contains the last nElements + // inserted. + nBloomSize = nElements * 2; + nInsertions = 0; +} + +void CRollingBloomFilter::insert(const std::vector& vKey) +{ + if (nInsertions == 0) { + b1.clear(); + } else if (nInsertions == nBloomSize / 2) { + b2.clear(); + } + b1.insert(vKey); + b2.insert(vKey); + if (++nInsertions == nBloomSize) { + nInsertions = 0; + } +} + +bool CRollingBloomFilter::contains(const std::vector& vKey) const +{ + if (nInsertions < nBloomSize / 2) { + return b2.contains(vKey); + } + return b1.contains(vKey); +} + +void CRollingBloomFilter::clear() +{ + b1.clear(); + b2.clear(); + nInsertions = 0; +} diff --git a/src/bloom.h b/src/bloom.h index d7297665c479a..1d285306968ff 100644 --- a/src/bloom.h +++ b/src/bloom.h @@ -54,6 +54,10 @@ class CBloomFilter unsigned int Hash(unsigned int nHashNum, const std::vector& vDataToHash) const; + // Private constructor for CRollingBloomFilter, no restrictions on size + CBloomFilter(unsigned int nElements, double nFPRate, unsigned int nTweak); + friend class CRollingBloomFilter; + public: /** * Creates a new bloom filter which will provide the given fp rate when filled with the given number of elements @@ -104,4 +108,28 @@ class CBloomFilter void UpdateEmptyFull(); }; +/** + * RollingBloomFilter is a probabilistic "keep track of most recently inserted" set. 
+ * Construct it with the number of items to keep track of, and a false-positive rate. + * + * contains(item) will always return true if item was one of the last N things + * insert()'ed ... but may also return true for items that were not inserted. + */ +class CRollingBloomFilter +{ +public: + CRollingBloomFilter(unsigned int nElements, double nFPRate, unsigned int nTweak); + + void insert(const std::vector& vKey); + bool contains(const std::vector& vKey) const; + + void clear(); + +private: + unsigned int nBloomSize; + unsigned int nInsertions; + CBloomFilter b1, b2; +}; + + #endif // BITCOIN_BLOOM_H diff --git a/src/test/bloom_tests.cpp b/src/test/bloom_tests.cpp index 555a22297529d..00c943fef4260 100644 --- a/src/test/bloom_tests.cpp +++ b/src/test/bloom_tests.cpp @@ -9,6 +9,7 @@ #include "clientversion.h" #include "key.h" #include "merkleblock.h" +#include "random.h" #include "serialize.h" #include "streams.h" #include "uint256.h" @@ -459,4 +460,81 @@ BOOST_AUTO_TEST_CASE(merkle_block_4_test_update_none) BOOST_CHECK(!filter.contains(COutPoint(uint256("0x02981fa052f0481dbc5868f4fc2166035a10f27a03cfd2de67326471df5bc041"), 0))); } +static std::vector RandomData() +{ + uint256 r = GetRandHash(); + return std::vector(r.begin(), r.end()); +} + +BOOST_AUTO_TEST_CASE(rolling_bloom) +{ + // last-100-entry, 1% false positive: + CRollingBloomFilter rb1(100, 0.01, 0); + + // Overfill: + static const int DATASIZE = 399; + std::vector data[DATASIZE]; + for (int i = 0; i < DATASIZE; i++) { + data[i] = RandomData(); + rb1.insert(data[i]); + } + // Last 100 guaranteed to be remembered: + for (int i = 299; i < DATASIZE; i++) { + BOOST_CHECK(rb1.contains(data[i])); + } + + // false positive rate is 1%, so we should get about 100 hits if + // testing 10,000 random keys. We get worst-case false positive + // behavior when the filter is as full as possible, which is + // when we've inserted one less than an integer multiple of nElements*2. 
+ unsigned int nHits = 0; + for (int i = 0; i < 10000; i++) { + if (rb1.contains(RandomData())) + ++nHits; + } + // Run test_bitcoin with --log_level=message to see BOOST_TEST_MESSAGEs: + BOOST_TEST_MESSAGE("RollingBloomFilter got " << nHits << " false positives (~100 expected)"); + + // Insanely unlikely to get a fp count outside this range: + BOOST_CHECK(nHits > 25); + BOOST_CHECK(nHits < 175); + + BOOST_CHECK(rb1.contains(data[DATASIZE - 1])); + rb1.clear(); + BOOST_CHECK(!rb1.contains(data[DATASIZE - 1])); + + // Now roll through data, make sure last 100 entries + // are always remembered: + for (int i = 0; i < DATASIZE; i++) { + if (i >= 100) + BOOST_CHECK(rb1.contains(data[i - 100])); + rb1.insert(data[i]); + } + + // Insert 999 more random entries: + for (int i = 0; i < 999; i++) { + rb1.insert(RandomData()); + } + // Sanity check to make sure the filter isn't just filling up: + nHits = 0; + for (int i = 0; i < DATASIZE; i++) { + if (rb1.contains(data[i])) + ++nHits; + } + // Expect about 5 false positives, more than 100 means + // something is definitely broken. + BOOST_TEST_MESSAGE("RollingBloomFilter got " << nHits << " false positives (~5 expected)"); + BOOST_CHECK(nHits < 100); + + // last-1000-entry, 0.1% false positive: + CRollingBloomFilter rb2(1000, 0.001, 0); + for (int i = 0; i < DATASIZE; i++) { + rb2.insert(data[i]); + } + // ... room for all of them: + for (int i = 0; i < DATASIZE; i++) { + BOOST_CHECK(rb2.contains(data[i])); + } +} + BOOST_AUTO_TEST_SUITE_END()