From b16fe672cced44429c56f297351e3cbeab592fa4 Mon Sep 17 00:00:00 2001 From: Alexandr Guzhva Date: Wed, 14 Aug 2024 17:10:13 -0700 Subject: [PATCH] introduce options for reducing the overhead for a clustering procedure (#3731) Summary: Several changes: 1. Introduce `ClusteringParameters::check_input_data_for_NaNs`, which may suppress checks for NaN values in the input data 2. Introduce `ClusteringParameters::use_faster_subsampling`, which uses a newly added SplitMix64-based rng (`SplitMix64RandomGenerator`) and also may pick duplicate points from the original input dataset. Surprisingly, `rand_perm()` may involve noticeable non-zero costs for certain scenarios. 3. Negative values for `ClusteringParameters::seed` initialize internal clustering rng with high-resolution clock each time, making clustering procedure to pick different subsamples each time. I've decided not to use `std::random_device` in order to avoid possible negative effects. Useful for future `ProductResidualQuantizer` improvements. Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3731 Reviewed By: asadoughi Differential Revision: D61106105 Pulled By: mnorris11 fbshipit-source-id: 072ab2f5ce4f82f9cf49d678122f65d1c08ce596 --- faiss/Clustering.cpp | 39 ++++++++++++++++++++++++++++++++++---- faiss/Clustering.h | 11 ++++++++++- faiss/utils/random.cpp | 43 ++++++++++++++++++++++++++++++++++++++++++ faiss/utils/random.h | 25 ++++++++++++++++++++++++ 4 files changed, 113 insertions(+), 5 deletions(-) diff --git a/faiss/Clustering.cpp b/faiss/Clustering.cpp index 31955bd531..16aefe8e83 100644 --- a/faiss/Clustering.cpp +++ b/faiss/Clustering.cpp @@ -11,6 +11,7 @@ #include <faiss/VectorTransform.h> #include <faiss/impl/AuxIndexStructures.h> +#include <chrono> #include <cinttypes> #include <cmath> #include <cstdio> @@ -74,6 +75,14 @@ void Clustering::train( namespace { +uint64_t get_actual_rng_seed(const int seed) { + return (seed >= 0) + ? 
seed + : static_cast<uint64_t>(std::chrono::high_resolution_clock::now() + .time_since_epoch() + .count()); +} + idx_t subsample_training_set( const Clustering& clus, idx_t nx, @@ -87,11 +96,30 @@ idx_t subsample_training_set( clus.k * clus.max_points_per_centroid, nx); } - std::vector<int> perm(nx); - rand_perm(perm.data(), nx, clus.seed); + + const uint64_t actual_seed = get_actual_rng_seed(clus.seed); + + std::vector<int> perm; + if (clus.use_faster_subsampling) { + // use subsampling with splitmix64 rng + SplitMix64RandomGenerator rng(actual_seed); + + const idx_t new_nx = clus.k * clus.max_points_per_centroid; + perm.resize(new_nx); + for (idx_t i = 0; i < new_nx; i++) { + perm[i] = rng.rand_int(nx); + } + } else { + // use subsampling with a default std rng + perm.resize(nx); + rand_perm(perm.data(), nx, actual_seed); + } + nx = clus.k * clus.max_points_per_centroid; uint8_t* x_new = new uint8_t[nx * line_size]; *x_out = x_new; + + // might be worth omp-ing as well for (idx_t i = 0; i < nx; i++) { memcpy(x_new + i * line_size, x + perm[i] * line_size, line_size); } @@ -280,7 +308,7 @@ void Clustering::train_encoded( double t0 = getmillisecs(); - if (!codec) { + if (!codec && check_input_data_for_NaNs) { // Check for NaNs in input data. Normally it is the user's // responsibility, but it may spare us some hard-to-debug // reports. @@ -383,6 +411,9 @@ void Clustering::train_encoded( } t0 = getmillisecs(); + // initialize seed + const uint64_t actual_seed = get_actual_rng_seed(seed); + // temporary buffer to decode vectors during the optimization std::vector<float> decode_buffer(codec ? 
d * decode_block_size : 0); @@ -395,7 +426,7 @@ void Clustering::train_encoded( centroids.resize(d * k); std::vector<int> perm(nx); - rand_perm(perm.data(), nx, seed + 1 + redo * 15486557L); + rand_perm(perm.data(), nx, actual_seed + 1 + redo * 15486557L); if (!codec) { for (int i = n_input_centroids; i < k; i++) { diff --git a/faiss/Clustering.h b/faiss/Clustering.h index ef1f00adcd..520225904a 100644 --- a/faiss/Clustering.h +++ b/faiss/Clustering.h @@ -43,11 +43,20 @@ struct ClusteringParameters { int min_points_per_centroid = 39; /// to limit size of dataset, otherwise the training set is subsampled int max_points_per_centroid = 256; - /// seed for the random number generator + /// seed for the random number generator. + /// negative values lead to seeding an internal rng with + /// std::high_resolution_clock. int seed = 1234; /// when the training set is encoded, batch size of the codec decoder size_t decode_block_size = 32768; + + /// whether to check for NaNs in an input data + bool check_input_data_for_NaNs = true; + + /// Whether to use splitmix64-based random number generator for subsampling, + /// which is faster, but may pick duplicate points. 
+ bool use_faster_subsampling = false; }; struct ClusteringIterationStats { diff --git a/faiss/utils/random.cpp b/faiss/utils/random.cpp index 9ab8d0adbe..877a7c2526 100644 --- a/faiss/utils/random.cpp +++ b/faiss/utils/random.cpp @@ -54,6 +54,37 @@ double RandomGenerator::rand_double() { return mt() / double(mt.max()); } +SplitMix64RandomGenerator::SplitMix64RandomGenerator(int64_t seed) + : state{static_cast<uint64_t>(seed)} {} + +int SplitMix64RandomGenerator::rand_int() { + return next() & 0x7fffffff; +} + +int64_t SplitMix64RandomGenerator::rand_int64() { + uint64_t value = next(); + return static_cast<int64_t>(value & 0x7fffffffffffffffULL); +} + +int SplitMix64RandomGenerator::rand_int(int max) { + return next() % max; +} + +float SplitMix64RandomGenerator::rand_float() { + return next() / float(std::numeric_limits<uint64_t>::max()); +} + +double SplitMix64RandomGenerator::rand_double() { + return next() / double(std::numeric_limits<uint64_t>::max()); +} + +uint64_t SplitMix64RandomGenerator::next() { + uint64_t z = (state += 0x9e3779b97f4a7c15ULL); + z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9ULL; + z = (z ^ (z >> 27)) * 0x94d049bb133111ebULL; + return z ^ (z >> 31); +} + /*********************************************************************** * Random functions in this C file only exist because Torch * counterparts are slow and not multi-threaded. Typical use is for @@ -162,6 +193,18 @@ void rand_perm(int* perm, size_t n, int64_t seed) { } } +void rand_perm_splitmix64(int* perm, size_t n, int64_t seed) { + for (size_t i = 0; i < n; i++) + perm[i] = i; + + SplitMix64RandomGenerator rng(seed); + + for (size_t i = 0; i + 1 < n; i++) { + int i2 = i + rng.rand_int(n - i); + std::swap(perm[i], perm[i2]); + } +} + void byte_rand(uint8_t* x, size_t n, int64_t seed) { // only try to parallelize on large enough arrays const size_t nblock = n < 1024 ? 
1 : 1024; diff --git a/faiss/utils/random.h b/faiss/utils/random.h index 8b4286894a..ac985d69b7 100644 --- a/faiss/utils/random.h +++ b/faiss/utils/random.h @@ -43,6 +43,30 @@ struct RandomGenerator { explicit RandomGenerator(int64_t seed = 1234); }; +/// fast random generator that cannot be used in multithreaded contexts. +/// based on https://prng.di.unimi.it/ +struct SplitMix64RandomGenerator { + uint64_t state; + + /// random positive integer + int rand_int(); + + /// random int64_t + int64_t rand_int64(); + + /// generate random integer between 0 and max-1 + int rand_int(int max); + + /// between 0 and 1 + float rand_float(); + + double rand_double(); + + explicit SplitMix64RandomGenerator(int64_t seed = 1234); + + uint64_t next(); +}; + /* Generate an array of uniform random floats / multi-threaded implementation */ void float_rand(float* x, size_t n, int64_t seed); void float_randn(float* x, size_t n, int64_t seed); @@ -53,6 +77,7 @@ void int64_rand_max(int64_t* x, size_t n, uint64_t max, int64_t seed); /* random permutation */ void rand_perm(int* perm, size_t n, int64_t seed); +void rand_perm_splitmix64(int* perm, size_t n, int64_t seed); /* Random set of vectors with intrinsic dimensionality 10 that is harder to * index than a subspace of dim 10 but easier than uniform data in dimension d