From 8b76eb2eef2594e09a26c6dfca2095e0bfb3143c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Thu, 11 Jun 2015 11:19:41 +0200 Subject: [PATCH 1/5] use BLAS in brute force (via NumPy) --- ann_benchmarks.py | 26 +++++++++++++++++--------- install.sh | 2 +- install/bruteforce.sh | 3 +++ 3 files changed, 21 insertions(+), 10 deletions(-) create mode 100644 install/bruteforce.sh diff --git a/ann_benchmarks.py b/ann_benchmarks.py index 217178e6a..24d0c2422 100644 --- a/ann_benchmarks.py +++ b/ann_benchmarks.py @@ -22,7 +22,7 @@ class BaseANN(object): pass - + class LSHF(BaseANN): def __init__(self, metric, n_estimators=10, n_candidates=50): self.name = 'LSHF(n_est=%d, n_cand=%d)' % (n_estimators, n_candidates) @@ -118,7 +118,7 @@ def __init__(self, metric, n_trees, n_candidates): self._n_trees = n_trees self._n_candidates = n_candidates self._metric = metric - self.name = 'PANNS(n_trees=%d, n_cand=%d)' % (n_trees, n_candidates) + self.name = 'PANNS(n_trees=%d, n_cand=%d)' % (n_trees, n_candidates) def fit(self, X): self._panns = panns.PannsIndex(X.shape[1], metric=self._metric) @@ -175,17 +175,24 @@ def query(self, v, n): class BruteForce(BaseANN): + """kNN search that uses a linear scan = brute force.""" def __init__(self, metric): + if metric not in ('angular', ): + raise NotImplementedError("BruteForce doesn't support metric %s" % metric) self._metric = metric self.name = 'BruteForce()' def fit(self, X): - metric = {'angular': 'cosine', 'euclidean': 'l2'}[self._metric] - self._nbrs = sklearn.neighbors.NearestNeighbors(algorithm='brute', metric=metric) - self._nbrs.fit(X) + """Initialize the search index.""" + # normalize vectors to unit length + self.index = X / numpy.sqrt((X ** 2).sum(-1))[..., numpy.newaxis] def query(self, v, n): - return list(self._nbrs.kneighbors(v, return_distance=False, n_neighbors=n)[0]) + """Find indices of `n` most similar vectors from the index to query vector `v`.""" + query = v / numpy.sqrt((v ** 2).sum()) # normalize query to unit length + cossims = numpy.dot(self.index, query) # cossim = dot product over normalized vectors + indices = numpy.argsort(cossims)[::-1] # sort by cossim, highest first + return indices[:n] # return top `n` most similar def get_dataset(which='glove', limit=-1): @@ -246,7 +253,8 @@ def get_queries(args): print len(queries), '...' return queries - + + def get_algos(m): return { 'lshf': [LSHF(m, 5, 10), LSHF(m, 5, 20), LSHF(m, 10, 20), LSHF(m, 10, 50), LSHF(m, 20, 100)], @@ -305,7 +313,7 @@ def get_fn(base, args): if os.path.exists(results_fn): for line in open(results_fn): algos_already_ran.add(line.strip().split('\t')[1]) - + algos = get_algos(args.distance) algos_flat = [] @@ -313,7 +321,7 @@ def get_fn(base, args): for algo in algos[library]: if algo.name not in algos_already_ran: algos_flat.append((library, algo)) - + random.shuffle(algos_flat) print 'order:', algos_flat diff --git a/install.sh b/install.sh index 59068bcb5..9dbffd901 100644 --- a/install.sh +++ b/install.sh @@ -1,6 +1,6 @@ sudo apt-get install -y python-numpy python-scipy cd install -for fn in annoy.sh panns.sh nearpy.sh sklearn.sh flann.sh kgraph.sh glove.sh sift.sh +for fn in bruteforce.sh annoy.sh panns.sh nearpy.sh sklearn.sh flann.sh kgraph.sh glove.sh sift.sh do source $fn done diff --git a/install/bruteforce.sh b/install/bruteforce.sh new file mode 100644 index 000000000..19c649d30 --- /dev/null +++ b/install/bruteforce.sh @@ -0,0 +1,3 @@ +sudo apt-get install -y python-pip python-dev +sudo apt-get install -y libatlas-dev libatlas3gf-base +sudo apt-get install -y python-numpy From 6af3c21c03ce12b0c2e85c83090b55cebb6fefc3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Sat, 13 Jun 2015 15:06:22 +0200 Subject: [PATCH 2/5] add euclidean distance to BruteForce --- ann_benchmarks.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/ann_benchmarks.py b/ann_benchmarks.py index 24d0c2422..73d4bb021 100644 --- a/ann_benchmarks.py +++ b/ann_benchmarks.py @@ -177,21 +177,35 @@ def query(self, v, n): class BruteForce(BaseANN): """kNN search that uses a linear scan = brute force.""" def __init__(self, metric): - if metric not in ('angular', ): + if metric not in ('angular', 'euclidean'): raise NotImplementedError("BruteForce doesn't support metric %s" % metric) self._metric = metric self.name = 'BruteForce()' def fit(self, X): """Initialize the search index.""" - # normalize vectors to unit length - self.index = X / numpy.sqrt((X ** 2).sum(-1))[..., numpy.newaxis] + self.lengths = (X ** 2).sum(-1) # record (squared) length of each vector + if self._metric == 'angular': + # for cossim, normalize index vectors to unit length + self.index = numpy.ascontiguousarray(X / numpy.sqrt(self.lengths)[..., numpy.newaxis]) + elif self._metric == 'euclidean': + self.index = numpy.ascontiguousarray(X) + else: + assert False, "invalid metric" # shouldn't get past the constructor! def query(self, v, n): """Find indices of `n` most similar vectors from the index to query vector `v`.""" - query = v / numpy.sqrt((v ** 2).sum()) # normalize query to unit length - cossims = numpy.dot(self.index, query) # cossim = dot product over normalized vectors - indices = numpy.argsort(cossims)[::-1] # sort by cossim, highest first + if self._metric == 'angular': + query = v / numpy.sqrt((v ** 2).sum()) # normalize query to unit length + cossims = numpy.dot(self.index, query) # cossim = dot product over normalized vectors + indices = numpy.argsort(cossims)[::-1] # sort by cossim, highest first + elif self._metric == 'euclidean': + # HACK we ignore query length as that's a constant not affecting the final ordering: + # argmax_a (a - b)^2 = argmax_a a^2 - 2ab + b^2 = argmax_a a^2 - 2ab + squared_dists = self.lengths - 2 * numpy.dot(self.index, v) + indices = numpy.argsort(squared_dists) # sort by l2 distance, lowest first + else: + assert False, "invalid metric" # shouldn't get past the constructor! return indices[:n] # return top `n` most similar From db1c2a29994c41032ab9e8d16d57337d6fba1f9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Sat, 13 Jun 2015 15:55:14 +0200 Subject: [PATCH 3/5] use partial sort to speed up BruteForce --- ann_benchmarks.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ann_benchmarks.py b/ann_benchmarks.py index 73d4bb021..c7e1630d9 100644 --- a/ann_benchmarks.py +++ b/ann_benchmarks.py @@ -184,7 +184,7 @@ def __init__(self, metric): def fit(self, X): """Initialize the search index.""" - self.lengths = (X ** 2).sum(-1) # record (squared) length of each vector + self.lengths = (X ** 2).sum(-1) # precompute (squared) length of each vector if self._metric == 'angular': # for cossim, normalize index vectors to unit length self.index = numpy.ascontiguousarray(X / numpy.sqrt(self.lengths)[..., numpy.newaxis]) @@ -198,15 +198,15 @@ def query(self, v, n): if self._metric == 'angular': query = v / numpy.sqrt((v ** 2).sum()) # normalize query to unit length cossims = numpy.dot(self.index, query) # cossim = dot product over normalized vectors - indices = numpy.argsort(cossims)[::-1] # sort by cossim, highest first + dists = -cossims # just for convenience, so that lowest = best elif self._metric == 'euclidean': # HACK we ignore query length as that's a constant not affecting the final ordering: # argmax_a (a - b)^2 = argmax_a a^2 - 2ab + b^2 = argmax_a a^2 - 2ab - squared_dists = self.lengths - 2 * numpy.dot(self.index, v) - indices = numpy.argsort(squared_dists) # sort by l2 distance, lowest first + dists = self.lengths - 2 * numpy.dot(self.index, v) else: assert False, "invalid metric" # shouldn't get past the constructor! - return indices[:n] # return top `n` most similar + indices = numpy.argpartition(dists, n)[:n] # partition-sort by distance, get `n` closest + return sorted(indices, key=lambda index: dists[index]) # resort `n` closest into final order def get_dataset(which='glove', limit=-1): From cf2b2d959cf8b77bc8fc6e569ab557833d60a030 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Sat, 13 Jun 2015 21:35:03 +0200 Subject: [PATCH 4/5] simplify & improve BruteForce docs --- ann_benchmarks.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/ann_benchmarks.py b/ann_benchmarks.py index c7e1630d9..3da038f38 100644 --- a/ann_benchmarks.py +++ b/ann_benchmarks.py @@ -195,13 +195,12 @@ def fit(self, X): def query(self, v, n): """Find indices of `n` most similar vectors from the index to query vector `v`.""" + # HACK we ignore query length as that's a constant not affecting the final ordering if self._metric == 'angular': - query = v / numpy.sqrt((v ** 2).sum()) # normalize query to unit length - cossims = numpy.dot(self.index, query) # cossim = dot product over normalized vectors - dists = -cossims # just for convenience, so that lowest = best + # argmax_a cossim(a, b) = argmax_a dot(a, b) / |a||b| = argmin_a -dot(a, b) + dists = -numpy.dot(self.index, v) elif self._metric == 'euclidean': - # HACK we ignore query length as that's a constant not affecting the final ordering: - # argmax_a (a - b)^2 = argmax_a a^2 - 2ab + b^2 = argmax_a a^2 - 2ab + # argmin_a (a - b)^2 = argmin_a a^2 - 2ab + b^2 = argmin_a a^2 - 2ab dists = self.lengths - 2 * numpy.dot(self.index, v) else: assert False, "invalid metric" # shouldn't get past the constructor! From b72f4526bdcf243b5606ad5a43b3191082bc0d5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Sat, 13 Jun 2015 22:30:22 +0200 Subject: [PATCH 5/5] switch BruteForce to single precision math --- ann_benchmarks.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/ann_benchmarks.py b/ann_benchmarks.py index 3da038f38..1402bab6c 100644 --- a/ann_benchmarks.py +++ b/ann_benchmarks.py @@ -176,25 +176,28 @@ def query(self, v, n): class BruteForce(BaseANN): """kNN search that uses a linear scan = brute force.""" - def __init__(self, metric): + def __init__(self, metric, precision=numpy.float32): if metric not in ('angular', 'euclidean'): raise NotImplementedError("BruteForce doesn't support metric %s" % metric) self._metric = metric + self._precision = precision self.name = 'BruteForce()' def fit(self, X): """Initialize the search index.""" - self.lengths = (X ** 2).sum(-1) # precompute (squared) length of each vector + lens = (X ** 2).sum(-1) # precompute (squared) length of each vector if self._metric == 'angular': - # for cossim, normalize index vectors to unit length - self.index = numpy.ascontiguousarray(X / numpy.sqrt(self.lengths)[..., numpy.newaxis]) + X /= numpy.sqrt(lens)[..., numpy.newaxis] # normalize index vectors to unit length + self.index = numpy.ascontiguousarray(X, dtype=self._precision) elif self._metric == 'euclidean': - self.index = numpy.ascontiguousarray(X) + self.index = numpy.ascontiguousarray(X, dtype=self._precision) + self.lengths = numpy.ascontiguousarray(lens, dtype=self._precision) else: assert False, "invalid metric" # shouldn't get past the constructor! def query(self, v, n): """Find indices of `n` most similar vectors from the index to query vector `v`.""" + v = numpy.ascontiguousarray(v, dtype=self._precision) # use same precision for query as for index # HACK we ignore query length as that's a constant not affecting the final ordering if self._metric == 'angular': # argmax_a cossim(a, b) = argmax_a dot(a, b) / |a||b| = argmin_a -dot(a, b) @@ -205,7 +208,7 @@ def query(self, v, n): else: assert False, "invalid metric" # shouldn't get past the constructor! indices = numpy.argpartition(dists, n)[:n] # partition-sort by distance, get `n` closest - return sorted(indices, key=lambda index: dists[index]) # resort `n` closest into final order + return sorted(indices, key=lambda index: dists[index]) # sort `n` closest into correct order def get_dataset(which='glove', limit=-1):