introduce options for reducing the overhead for a clustering procedure (facebookresearch#3731)

Summary:
Several changes:
1. Introduce `ClusteringParameters::check_input_data_for_NaNs`, which can be set to false to skip the check for NaN values in the input data
2. Introduce `ClusteringParameters::use_faster_subsampling`, which uses a newly added SplitMix64-based RNG (`SplitMix64RandomGenerator`) and may pick duplicate points from the original input dataset. Surprisingly, `rand_perm()` may involve noticeable costs in certain scenarios.
3. Negative values for `ClusteringParameters::seed` seed the internal clustering RNG from the high-resolution clock on every call, so the clustering procedure picks different subsamples each time. I decided not to use `std::random_device` in order to avoid possible negative side effects.

Useful for future `ProductResidualQuantizer` improvements.
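
A minimal usage sketch of the new options (not part of this PR; it assumes the usual `faiss::Clustering` + `faiss::IndexFlatL2` training flow and synthetic data generated on the spot):

```cpp
#include <faiss/Clustering.h>
#include <faiss/IndexFlat.h>

#include <random>
#include <vector>

int main() {
    const int d = 64;        // vector dimensionality
    const int k = 256;       // number of centroids
    const size_t n = 100000; // number of training vectors

    // synthetic training data, just for the sketch
    std::vector<float> x(n * d);
    std::mt19937 gen(123);
    std::uniform_real_distribution<float> dis(0.f, 1.f);
    for (float& v : x) {
        v = dis(gen);
    }

    faiss::Clustering clus(d, k);
    clus.check_input_data_for_NaNs = false; // skip the NaN scan of the input
    clus.use_faster_subsampling = true;     // SplitMix64-based subsampling,
                                            // may pick duplicate points
    clus.seed = -1;                         // negative: clock-based seeding,
                                            // different subsample every run

    faiss::IndexFlatL2 index(d); // index used for the assignment step
    clus.train(n, x.data(), index);
    return 0;
}
```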

Pull Request resolved: facebookresearch#3731

Reviewed By: asadoughi

Differential Revision: D61106105

Pulled By: mnorris11

fbshipit-source-id: 072ab2f5ce4f82f9cf49d678122f65d1c08ce596
alexanderguzhva authored and ketor committed Aug 20, 2024
1 parent 0084be4 commit b16fe67
Showing 4 changed files with 113 additions and 5 deletions.
39 changes: 35 additions & 4 deletions faiss/Clustering.cpp
@@ -11,6 +11,7 @@
#include <faiss/VectorTransform.h>
#include <faiss/impl/AuxIndexStructures.h>

#include <chrono>
#include <cinttypes>
#include <cmath>
#include <cstdio>
@@ -74,6 +75,14 @@ void Clustering::train(

namespace {

uint64_t get_actual_rng_seed(const int seed) {
    return (seed >= 0)
            ? seed
            : static_cast<uint64_t>(std::chrono::high_resolution_clock::now()
                                            .time_since_epoch()
                                            .count());
}

idx_t subsample_training_set(
        const Clustering& clus,
        idx_t nx,
@@ -87,11 +96,30 @@ idx_t subsample_training_set(
               clus.k * clus.max_points_per_centroid,
               nx);
    }
    std::vector<int> perm(nx);
    rand_perm(perm.data(), nx, clus.seed);

    const uint64_t actual_seed = get_actual_rng_seed(clus.seed);

    std::vector<int> perm;
    if (clus.use_faster_subsampling) {
        // use subsampling with splitmix64 rng
        SplitMix64RandomGenerator rng(actual_seed);

        const idx_t new_nx = clus.k * clus.max_points_per_centroid;
        perm.resize(new_nx);
        for (idx_t i = 0; i < new_nx; i++) {
            perm[i] = rng.rand_int(nx);
        }
    } else {
        // use subsampling with a default std rng
        perm.resize(nx);
        rand_perm(perm.data(), nx, actual_seed);
    }

    nx = clus.k * clus.max_points_per_centroid;
    uint8_t* x_new = new uint8_t[nx * line_size];
    *x_out = x_new;

    // might be worth omp-ing as well
    for (idx_t i = 0; i < nx; i++) {
        memcpy(x_new + i * line_size, x + perm[i] * line_size, line_size);
    }
@@ -280,7 +308,7 @@ void Clustering::train_encoded(

    double t0 = getmillisecs();

    if (!codec) {
    if (!codec && check_input_data_for_NaNs) {
        // Check for NaNs in input data. Normally it is the user's
        // responsibility, but it may spare us some hard-to-debug
        // reports.
@@ -383,6 +411,9 @@ void Clustering::train_encoded(
    }
    t0 = getmillisecs();

    // initialize seed
    const uint64_t actual_seed = get_actual_rng_seed(seed);

    // temporary buffer to decode vectors during the optimization
    std::vector<float> decode_buffer(codec ? d * decode_block_size : 0);

@@ -395,7 +426,7 @@
        centroids.resize(d * k);
        std::vector<int> perm(nx);

        rand_perm(perm.data(), nx, seed + 1 + redo * 15486557L);
        rand_perm(perm.data(), nx, actual_seed + 1 + redo * 15486557L);

        if (!codec) {
            for (int i = n_input_centroids; i < k; i++) {
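
To make the subsampling tradeoff above concrete, here is a small standalone sketch (not Faiss code; the sizes and the `std::mt19937_64` stand-in for the SplitMix64 RNG are illustrative assumptions) contrasting a full permutation with sampling with replacement:

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <numeric>
#include <random>
#include <vector>

int main() {
    const int64_t nx = 1000000;  // full training set size
    const int64_t new_nx = 4096; // subsample size (k * max_points_per_centroid)
    std::mt19937_64 rng(1234);   // stand-in for the SplitMix64 rng

    // option 1: permute all nx indices, then keep the first new_nx
    // (no duplicates, but O(nx) work and memory, as in the rand_perm() path)
    std::vector<int> perm(nx);
    std::iota(perm.begin(), perm.end(), 0);
    std::shuffle(perm.begin(), perm.end(), rng);

    // option 2: draw new_nx indices with replacement
    // (O(new_nx) work, duplicates possible), matching use_faster_subsampling
    std::vector<int> picks(new_nx);
    for (int64_t i = 0; i < new_nx; i++) {
        picks[i] = static_cast<int>(rng() % nx);
    }

    std::printf("first picks: %d vs %d\n", perm[0], picks[0]);
    return 0;
}
```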
11 changes: 10 additions & 1 deletion faiss/Clustering.h
@@ -43,11 +43,20 @@ struct ClusteringParameters {
    int min_points_per_centroid = 39;
    /// to limit size of dataset, otherwise the training set is subsampled
    int max_points_per_centroid = 256;
    /// seed for the random number generator
    /// seed for the random number generator.
    /// negative values lead to seeding the internal RNG with
    /// std::chrono::high_resolution_clock.
    int seed = 1234;

    /// when the training set is encoded, batch size of the codec decoder
    size_t decode_block_size = 32768;

    /// whether to check for NaNs in the input data
    bool check_input_data_for_NaNs = true;

    /// whether to use a SplitMix64-based random number generator for
    /// subsampling, which is faster, but may pick duplicate points
    bool use_faster_subsampling = false;
};

struct ClusteringIterationStats {
43 changes: 43 additions & 0 deletions faiss/utils/random.cpp
@@ -54,6 +54,37 @@ double RandomGenerator::rand_double() {
    return mt() / double(mt.max());
}

SplitMix64RandomGenerator::SplitMix64RandomGenerator(int64_t seed)
        : state{static_cast<uint64_t>(seed)} {}

int SplitMix64RandomGenerator::rand_int() {
    return next() & 0x7fffffff;
}

int64_t SplitMix64RandomGenerator::rand_int64() {
    uint64_t value = next();
    return static_cast<int64_t>(value & 0x7fffffffffffffffULL);
}

int SplitMix64RandomGenerator::rand_int(int max) {
    return next() % max;
}

float SplitMix64RandomGenerator::rand_float() {
    return next() / float(std::numeric_limits<uint64_t>::max());
}

double SplitMix64RandomGenerator::rand_double() {
    return next() / double(std::numeric_limits<uint64_t>::max());
}

uint64_t SplitMix64RandomGenerator::next() {
    uint64_t z = (state += 0x9e3779b97f4a7c15ULL);
    z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9ULL;
    z = (z ^ (z >> 27)) * 0x94d049bb133111ebULL;
    return z ^ (z >> 31);
}

/***********************************************************************
* Random functions in this C file only exist because Torch
* counterparts are slow and not multi-threaded. Typical use is for
@@ -162,6 +193,18 @@ void rand_perm(int* perm, size_t n, int64_t seed) {
    }
}

void rand_perm_splitmix64(int* perm, size_t n, int64_t seed) {
    for (size_t i = 0; i < n; i++)
        perm[i] = i;

    SplitMix64RandomGenerator rng(seed);

    for (size_t i = 0; i + 1 < n; i++) {
        int i2 = i + rng.rand_int(n - i);
        std::swap(perm[i], perm[i2]);
    }
}

void byte_rand(uint8_t* x, size_t n, int64_t seed) {
    // only try to parallelize on large enough arrays
    const size_t nblock = n < 1024 ? 1 : 1024;
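
For readers who want to try the generator outside Faiss, here is a standalone restatement of the SplitMix64 step added above (same constants as in the diff; the surrounding `main()` is just a demo, not Faiss code):

```cpp
#include <cstdint>
#include <cstdio>

// same update and mix steps as SplitMix64RandomGenerator::next() above
struct SplitMix64 {
    uint64_t state;
    explicit SplitMix64(uint64_t seed) : state(seed) {}
    uint64_t next() {
        uint64_t z = (state += 0x9e3779b97f4a7c15ULL);
        z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9ULL;
        z = (z ^ (z >> 27)) * 0x94d049bb133111ebULL;
        return z ^ (z >> 31);
    }
};

int main() {
    SplitMix64 rng(1234);
    // the same seed always reproduces the same stream
    for (int i = 0; i < 4; i++) {
        std::printf("%016llx\n", (unsigned long long)rng.next());
    }
    return 0;
}
```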
25 changes: 25 additions & 0 deletions faiss/utils/random.h
@@ -43,6 +43,30 @@ struct RandomGenerator {
    explicit RandomGenerator(int64_t seed = 1234);
};

/// fast random generator that cannot be used in multithreaded contexts.
/// based on https://prng.di.unimi.it/
struct SplitMix64RandomGenerator {
    uint64_t state;

    /// random positive integer
    int rand_int();

    /// random int64_t
    int64_t rand_int64();

    /// generate random integer between 0 and max-1
    int rand_int(int max);

    /// between 0 and 1
    float rand_float();

    double rand_double();

    explicit SplitMix64RandomGenerator(int64_t seed = 1234);

    uint64_t next();
};

/* Generate an array of uniform random floats / multi-threaded implementation */
void float_rand(float* x, size_t n, int64_t seed);
void float_randn(float* x, size_t n, int64_t seed);
@@ -53,6 +77,7 @@ void int64_rand_max(int64_t* x, size_t n, uint64_t max, int64_t seed);

/* random permutation */
void rand_perm(int* perm, size_t n, int64_t seed);
void rand_perm_splitmix64(int* perm, size_t n, int64_t seed);

/* Random set of vectors with intrinsic dimensionality 10 that is harder to
* index than a subspace of dim 10 but easier than uniform data in dimension d
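
A short usage sketch of the two new public helpers declared above (assuming the Faiss headers are on the include path):

```cpp
#include <faiss/utils/random.h>

#include <cstdio>
#include <vector>

int main() {
    // draw indices with replacement, as the use_faster_subsampling path does
    faiss::SplitMix64RandomGenerator rng(42);
    for (int i = 0; i < 5; i++) {
        std::printf("%d ", rng.rand_int(1000)); // in [0, 999]
    }
    std::printf("\n");

    // full permutation of 0..n-1 using the SplitMix64-based shuffle
    std::vector<int> perm(16);
    faiss::rand_perm_splitmix64(perm.data(), perm.size(), /*seed=*/42);
    for (int p : perm) {
        std::printf("%d ", p);
    }
    std::printf("\n");
    return 0;
}
```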
