forked from shiwendai/Faiss
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Clustering.h
118 lines (94 loc) · 3.74 KB
/
Clustering.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
/**
* Copyright (c) 2015-present, Facebook, Inc.
* All rights reserved.
*
* This source code is licensed under the BSD+Patents license found in the
* LICENSE file in the root directory of this source tree.
*/
// -*- c++ -*-
#ifndef FAISS_CLUSTERING_H
#define FAISS_CLUSTERING_H
#include "Index.h"
#include <vector>
namespace faiss {
/** Class for the clustering parameters. Can be passed to the
* constructor of the Clustering object.
* 聚类参数类,
*/
struct ClusteringParameters {
//聚类迭代次数(迭代次数是指:在一次聚类中,不断的聚类迭代以获取质心的过程)
int niter; ///< clustering iterations
//聚类次数(聚类次数是指:每次聚类都会得到一个质心,最终从这nredo中去最优的质心的过程)
int nredo; ///< redo clustering this many times and keep best
//是否打印日志
bool verbose;
//是否需要标准化的质心
bool spherical; ///< do we want normalized centroids?
//在每次迭代中更新索引
bool update_index; ///< update index after each iteration?
//用提供的质心作为输入,并且在迭代中不改变质心
bool frozen_centroids; ///< use the centroids provided as input and do not change them during iterations
//每个聚类至少包含数据集大小
int min_points_per_centroid; ///< otherwise you get a warning
//每个聚类最多包含的数据集大小
int max_points_per_centroid; ///< to limit size of dataset
//产生随机数的种子
int seed; ///< seed for the random number generator
/// sets reasonable defaults
ClusteringParameters ();
};
/** clustering based on assignment - centroid update iterations
*
* The clustering is based on an Index object that assigns training
* points to the centroids. Therefore, at each iteration the centroids
* are added to the index.
*
* On output, the centoids table is set to the latest version
* of the centroids and they are also added to the index. If the
* centroids table it is not empty on input, it is also used for
* initialization.
*
* To do several clusterings, just call train() several times on
* different training sets, clearing the centroid table in between.
*/
struct Clustering: ClusteringParameters {
/// typedef long idx_t; 所有索引都是这个类型
typedef Index::idx_t idx_t;
/// 向量维数
size_t d; ///< dimension of the vectors
/// 质心的数量
size_t k; ///< nb of centroids
/// 存储所有质心数据大小为 k * d
std::vector<float> centroids; /// vector centroids (k * d)
/// objective values (sum of distances reported by index) over
/// iterations
/// 聚类时,每次迭代的距离值的和(所有距离之和)
std::vector<float> obj;
/// the only mandatory parameters are k and d
/// 构造函数强制入参只有 k 和 d
Clustering (int d, int k);
Clustering (int d, int k, const ClusteringParameters &cp);
/// Index is used during the assignment stage
/** simplified interface
*
* @param n 待训练的数据集数量
* @param x 训练集 (大小n * d)
* @param index 被用来在训练阶段计算点到质心的距离
*/
virtual void train (idx_t n, const float * x, faiss::Index & index);
virtual ~Clustering() {}
};
/** simplified interface
*
* @param d dimension of the data
* @param n nb of training vectors
* @param k nb of output centroids
* @param x training set (size n * d)
* @param centroids output centroids (size k * d)
* @return final quantization error
*/
float kmeans_clustering (size_t d, size_t n, size_t k,
const float *x,
float *centroids);
}
#endif