From 9aa12719d48966745f8cafcc05d528e09bbbaea3 Mon Sep 17 00:00:00 2001 From: Erik Bernhardsson Date: Thu, 18 Jun 2015 00:03:12 +0200 Subject: [PATCH 1/5] trying to add tox & setting up travis --- .travis.yml | 8 ++++++++ ann_benchmarks.py => ann_benchmarks/__init__.py | 0 install.sh | 2 +- install/annoy.sh | 4 ++-- setup.py | 3 +++ 5 files changed, 14 insertions(+), 3 deletions(-) create mode 100644 .travis.yml rename ann_benchmarks.py => ann_benchmarks/__init__.py (100%) create mode 100644 setup.py diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 000000000..3f98406a3 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,8 @@ +language: python +python: + - "2.7" + - "3.4" + +install: bash install.sh + +script: nosetests diff --git a/ann_benchmarks.py b/ann_benchmarks/__init__.py similarity index 100% rename from ann_benchmarks.py rename to ann_benchmarks/__init__.py diff --git a/install.sh b/install.sh index 0de8603ef..65bbde1ba 100644 --- a/install.sh +++ b/install.sh @@ -1,4 +1,4 @@ -sudo apt-get install -y python-numpy python-scipy python-sklearn +apt-get install -y python-numpy python-scipy python-sklearn cd install for fn in annoy.sh panns.sh nearpy.sh sklearn.sh flann.sh kgraph.sh nmslib.sh glove.sh sift.sh do diff --git a/install/annoy.sh b/install/annoy.sh index 1de1c3f86..7026aeb9b 100644 --- a/install/annoy.sh +++ b/install/annoy.sh @@ -1,5 +1,5 @@ -sudo apt-get install -y python-dev python-setuptools +apt-get install -y python-dev python-setuptools git clone https://github.com/spotify/annoy cd annoy -sudo python setup.py install +python setup.py install cd .. diff --git a/setup.py b/setup.py new file mode 100644 index 000000000..8ee9aa17b --- /dev/null +++ b/setup.py @@ -0,0 +1,3 @@ +from setuptools import setup + +setup(packages=['ann_benchmarks']) From eef86ee87021632a0f46c59b579eadc702c49c8a Mon Sep 17 00:00:00 2001 From: Erik Bernhardsson Date: Thu, 18 Jun 2015 18:47:33 +0200 Subject: [PATCH 2/5] run unittests --- ann_benchmarks/__init__.py | 16 +++++++++------- test/test.py | 22 ++++++++++++++++++++++ 2 files changed, 31 insertions(+), 7 deletions(-) create mode 100644 test/test.py diff --git a/ann_benchmarks/__init__.py b/ann_benchmarks/__init__.py index 45a7b5951..b3039f343 100644 --- a/ann_benchmarks/__init__.py +++ b/ann_benchmarks/__init__.py @@ -1,10 +1,3 @@ -import sklearn.neighbors -import annoy -import pyflann -import panns -import nmslib -import nearpy, nearpy.hashes, nearpy.distances -import pykgraph import gzip, numpy, time, os, multiprocessing, argparse, pickle, resource try: from urllib import urlretrieve @@ -32,6 +25,7 @@ def __init__(self, metric, n_estimators=10, n_candidates=50): self._n_candidates = n_candidates def fit(self, X): + import sklearn.neighbors self._lshf = sklearn.neighbors.LSHForest(n_estimators=self._n_estimators, n_candidates=self._n_candidates) if self._metric == 'angular': X = sklearn.preprocessing.normalize(X, axis=1, norm='l2') @@ -86,6 +80,7 @@ def __init__(self, metric, target_precision): self._metric = metric def fit(self, X): + import pyflann self._flann = pyflann.FLANN(target_precision=self._target_precision, algorithm='autotuned', log_level='info') if self._metric == 'angular': X = sklearn.preprocessing.normalize(X, axis=1, norm='l2') @@ -105,6 +100,7 @@ def __init__(self, metric, n_trees, n_candidates): self.name = 'Annoy(n_trees=%d, n_cand=%d)' % (n_trees, n_candidates) def fit(self, X): + import annoy self._annoy = annoy.AnnoyIndex(f=X.shape[1], metric=self._metric) for i, x in enumerate(X): self._annoy.add_item(i, x.tolist()) @@ -122,6 +118,7 @@ def __init__(self, metric, n_trees, n_candidates): self.name = 'PANNS(n_trees=%d, n_cand=%d)' % (n_trees, n_candidates) def fit(self, X): + import panns self._panns = panns.PannsIndex(X.shape[1], metric=self._metric) for x in X: self._panns.add_vector(x) @@ -139,6 +136,8 @@ def __init__(self, metric, n_bits, hash_counts): self.name = 'NearPy(n_bits=%d, hash_counts=%d)' % (n_bits, hash_counts) def fit(self, X): + import nearpy, nearpy.hashes, nearpy.distances + hashes = [] # TODO: doesn't seem like the NearPy code is using the metric?? @@ -162,6 +161,8 @@ def __init__(self, metric, P): self._metric = metric def fit(self, X): + import pykgraph + if self._metric == 'angular': X = sklearn.preprocessing.normalize(X, axis=1, norm='l2') self._kgraph = pykgraph.KGraph() @@ -182,6 +183,7 @@ def __init__(self, metric, method_name, method_param): self.name = 'Nmslib(method_name=%s, method_param=%s)' % (method_name, method_param) def fit(self, X): + import nmslib self._index = nmslib.initIndex(X.shape[0], self._nmslib_metric, [], self._method_name, self._method_param, nmslib.DataType.VECTOR, nmslib.DistType.FLOAT) for i, x in enumerate(X): diff --git a/test/test.py b/test/test.py new file mode 100644 index 000000000..2cefff462 --- /dev/null +++ b/test/test.py @@ -0,0 +1,22 @@ +import inspect +import ann_benchmarks +from sklearn.datasets.samples_generator import make_blobs + +# Generate dataset +X, labels_true = make_blobs(n_samples=1000, n_features=10, + centers=10, cluster_std=5, + random_state=0) + +def check_algo(algo_name, algo): + algo.fit(X) + result = algo.query(X[42], 10) + assert result[0] == 42 + assert len(result) == 10 + assert len(set(result)) == 10 + +def test_all_algos(): + for metric in ['angular', 'euclidean']: + algos = ann_benchmarks.get_algos(metric) + for algo_key in algos.keys(): + for algo in algos[algo_key]: + yield check_algo, algo.name, algo # pass name just so unittest can capture it From 74c397d61ef4ab054094ed09927a1831df285d96 Mon Sep 17 00:00:00 2001 From: Erik Bernhardsson Date: Thu, 18 Jun 2015 18:54:41 +0200 Subject: [PATCH 3/5] don't download big datasets by default --- README.rst | 4 +++- install.sh | 2 +- install/glove.sh | 1 + install/sift.sh | 1 + 4 files changed, 6 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index e9b404781..183c66c09 100644 --- a/README.rst +++ b/README.rst @@ -28,7 +28,9 @@ Doing fast searching of nearest neighbors in high dimensional spaces is an incre Install ------- -Clone the repo and run ``bash install.sh``. This will install all libraries as well as downloading and preprocessing all data sets. It could take a while. It has been tested in Ubuntu 14.04. +Clone the repo and run ``bash install.sh``. This will install all libraries. It could take a while. It has been tested in Ubuntu 14.04. + +To download and preprocess the data sets, run ``bash install/glove.sh`` and ``bash install/sift.sh``. There is also a Docker image available under `erikbern/ann `__ containing all libraries and data sets. diff --git a/install.sh b/install.sh index 65bbde1ba..7f7997933 100644 --- a/install.sh +++ b/install.sh @@ -1,6 +1,6 @@ apt-get install -y python-numpy python-scipy python-sklearn cd install -for fn in annoy.sh panns.sh nearpy.sh sklearn.sh flann.sh kgraph.sh nmslib.sh glove.sh sift.sh +for fn in annoy.sh panns.sh nearpy.sh sklearn.sh flann.sh kgraph.sh nmslib.sh do source $fn done diff --git a/install/glove.sh b/install/glove.sh index 444bd1ad8..c41e94105 100644 --- a/install/glove.sh +++ b/install/glove.sh @@ -1,3 +1,4 @@ +cd "$(dirname "$0")" wget "http://www-nlp.stanford.edu/data/glove.twitter.27B.100d.txt.gz" gunzip -d glove.twitter.27B.100d.txt.gz cut -d " " -f 2- glove.twitter.27B.100d.txt > glove.txt # strip first column diff --git a/install/sift.sh b/install/sift.sh index 6f28dcc9b..41182acdb 100644 --- a/install/sift.sh +++ b/install/sift.sh @@ -1,3 +1,4 @@ +cd "$(dirname "$0")" wget "ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz" tar -xzf sift.tar.gz rm -rf sift.tar.gz From eddaee5a4ba755be89f29142d7025bc87f828025 Mon Sep 17 00:00:00 2001 From: Erik Bernhardsson Date: Thu, 18 Jun 2015 19:21:31 +0200 Subject: [PATCH 4/5] fixing stuff --- .travis.yml | 5 +++-- ann_benchmarks/__init__.py | 29 ++++++++++++++++------------- install.sh | 11 ++++++++++- install/kgraph.sh | 1 + install/nearpy.sh | 5 +++-- install/nmslib.sh | 9 ++++----- test/test.py | 17 +++++++++++------ 7 files changed, 48 insertions(+), 29 deletions(-) diff --git a/.travis.yml b/.travis.yml index 3f98406a3..21382e1df 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,8 +1,9 @@ language: python python: - - "2.7" + - "2.7_with_system_site_packages" - "3.4" -install: bash install.sh +install: + - sudo bash install.sh script: nosetests diff --git a/ann_benchmarks/__init__.py b/ann_benchmarks/__init__.py index b3039f343..d622446a6 100644 --- a/ann_benchmarks/__init__.py +++ b/ann_benchmarks/__init__.py @@ -1,15 +1,15 @@ -import gzip, numpy, time, os, multiprocessing, argparse, pickle, resource +import gzip, numpy, time, os, multiprocessing, argparse, pickle, resource, random try: from urllib import urlretrieve except ImportError: from urllib.request import urlretrieve # Python 3 -import sklearn.cross_validation, sklearn.preprocessing, random +import sklearn.preprocessing # Set resource limits to prevent memory bombs memory_limit = 12 * 2**30 soft, hard = resource.getrlimit(resource.RLIMIT_DATA) if soft == resource.RLIM_INFINITY or soft >= memory_limit: - print 'resetting memory limit from', soft, 'to', memory_limit + print('resetting memory limit from', soft, 'to', memory_limit) resource.setrlimit(resource.RLIMIT_DATA, (memory_limit, hard)) @@ -203,6 +203,7 @@ def __init__(self, metric): self.name = 'BruteForce()' def fit(self, X): + import sklearn.neighbors metric = {'angular': 'cosine', 'euclidean': 'l2'}[self._metric] self._nbrs = sklearn.neighbors.NearestNeighbors(algorithm='brute', metric=metric) self._nbrs.fit(X) @@ -223,8 +224,10 @@ def get_dataset(which='glove', limit=-1): break X = numpy.vstack(X) + import sklearn.cross_validation + X_train, X_test = sklearn.cross_validation.train_test_split(X, test_size=1000, random_state=42) - print X_train.shape, X_test.shape + print(X_train.shape, X_test.shape) return X_train, X_test @@ -235,7 +238,7 @@ def run_algo(args, library, algo, results_fn): if algo != 'bf': algo.fit(X_train) build_time = time.time() - t0 - print 'Built index in', build_time + print('Built index in', build_time) best_search_time = float('inf') best_precision = 0.0 # should be deterministic but paranoid @@ -249,10 +252,10 @@ def run_algo(args, library, algo, results_fn): precision = k / (len(queries) * 10) best_search_time = min(best_search_time, search_time) best_precision = max(best_precision, precision) - print search_time, precision + print(search_time, precision) output = [library, algo.name, build_time, best_search_time, best_precision] - print output + print(output) f = open(results_fn, 'a') f.write('\t'.join(map(str, output)) + '\n') @@ -260,7 +263,7 @@ def run_algo(args, library, algo, results_fn): def get_queries(args): - print 'computing queries with correct results...' + print('computing queries with correct results...') bf = BruteForce(args.distance) X_train, X_test = get_dataset(which=args.dataset, limit=args.limit) @@ -272,7 +275,7 @@ def get_queries(args): correct = bf.query(x, 10) queries.append((x, correct)) if len(queries) % 100 == 0: - print len(queries), '...' + print(len(queries), '...') return queries @@ -369,7 +372,7 @@ def get_fn(base, args): results_fn = get_fn('results', args) queries_fn = get_fn('queries', args) - print 'storing queries in', queries_fn, 'and results in', results_fn + print('storing queries in', queries_fn, 'and results in', results_fn) if not os.path.exists(queries_fn): queries = get_queries(args) @@ -379,7 +382,7 @@ def get_fn(base, args): else: queries = pickle.load(open(queries_fn)) - print 'got', len(queries), 'queries' + print('got', len(queries), 'queries') algos_already_ran = set() if os.path.exists(results_fn): @@ -396,10 +399,10 @@ def get_fn(base, args): random.shuffle(algos_flat) - print 'order:', algos_flat + print('order:', algos_flat) for library, algo in algos_flat: - print algo.name, '...' + print(algo.name, '...') # Spawn a subprocess to force the memory to be reclaimed at the end p = multiprocessing.Process(target=run_algo, args=(args, library, algo, results_fn)) p.start() diff --git a/install.sh b/install.sh index 7f7997933..a71369888 100644 --- a/install.sh +++ b/install.sh @@ -1,4 +1,13 @@ -apt-get install -y python-numpy python-scipy python-sklearn +apt-get update +apt-get install -y python-numpy python-scipy python-pip python-nose +pip install scikit-learn + +# Install GCC 4.8 +add-apt-repository ppa:ubuntu-toolchain-r/test -y +apt-get update -qq +apt-get install -y libboost1.48-all-dev g++-4.8 +export CXX="g++-4.8" CC="gcc-4.8" + cd install for fn in annoy.sh panns.sh nearpy.sh sklearn.sh flann.sh kgraph.sh nmslib.sh do diff --git a/install/kgraph.sh b/install/kgraph.sh index 0c3fdb66f..42794509c 100644 --- a/install/kgraph.sh +++ b/install/kgraph.sh @@ -1,5 +1,6 @@ git clone https://github.com/aaalgo/kgraph pushd kgraph +apt-get install -y libboost-timer-dev libbooost-chrono-dev sudo make deps-ubuntu make make release diff --git a/install/nearpy.sh b/install/nearpy.sh index 729544692..2592fbee2 100644 --- a/install/nearpy.sh +++ b/install/nearpy.sh @@ -1,2 +1,3 @@ -sudo apt-get install -y python-pip -sudo pip install nearpy bitarray redis +apt-get install -y python-pip libhdf5-dev +pip install cython +pip install nearpy bitarray redis h5py diff --git a/install/nmslib.sh b/install/nmslib.sh index a9786f95c..9cfc1eaa2 100755 --- a/install/nmslib.sh +++ b/install/nmslib.sh @@ -4,13 +4,12 @@ rm -rf NonMetricSpaceLib # Note that we use the develop branch here: git clone https://github.com/searchivarius/NonMetricSpaceLib.git cd NonMetricSpaceLib/similarity_search -git checkout ann-benchmark -sudo apt-get install -y cmake libeigen3-dev libgsl0-dev libboost-all-dev g++-4.8 -# Actually let's make g++ an alias -alias g++=g++-4.8 +git checkout ann-benchmark +apt-get install -y cmake libeigen3-dev libgsl0-dev libboost-all-dev +echo "CC: $CC, CXX: $CXX" cmake . make -j 4 cd ../python_binding make -sudo make install +make install cd ../.. diff --git a/test/test.py b/test/test.py index 2cefff462..d9d6cdecf 100644 --- a/test/test.py +++ b/test/test.py @@ -1,22 +1,27 @@ +import random import inspect import ann_benchmarks from sklearn.datasets.samples_generator import make_blobs # Generate dataset -X, labels_true = make_blobs(n_samples=1000, n_features=10, +X, labels_true = make_blobs(n_samples=10000, n_features=10, centers=10, cluster_std=5, random_state=0) def check_algo(algo_name, algo): algo.fit(X) result = algo.query(X[42], 10) - assert result[0] == 42 - assert len(result) == 10 - assert len(set(result)) == 10 + if len(result) != 10: + raise AssertionError('Expected results to have length 10: Result: %s' % result) + if len(set(result)) != 10: + raise AssertionError('Expected results to be unique: Result: %s' % result) + #if result[0] != 42: + # raise AssertionError('Expected first item to be 42: Result: %s' % result) + def test_all_algos(): for metric in ['angular', 'euclidean']: algos = ann_benchmarks.get_algos(metric) for algo_key in algos.keys(): - for algo in algos[algo_key]: - yield check_algo, algo.name, algo # pass name just so unittest can capture it + algo = random.choice(algos[algo_key]) # Just pick one of each + yield check_algo, algo.name, algo # pass name just so unittest can capture it From e58e4afa693c497745dbb67f6bdf4221d362b5d4 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sat, 20 Jun 2015 12:33:53 +0000 Subject: [PATCH 5/5] various fixes to support nmslib --- ann_benchmarks/__init__.py | 115 +++++++++++++++++++++---------------- 1 file changed, 64 insertions(+), 51 deletions(-) diff --git a/ann_benchmarks/__init__.py b/ann_benchmarks/__init__.py index d622446a6..5cb42a57b 100644 --- a/ann_benchmarks/__init__.py +++ b/ann_benchmarks/__init__.py @@ -184,6 +184,12 @@ def __init__(self, metric, method_name, method_param): def fit(self, X): import nmslib + if self._method_name == 'vptree': + # To avoid this issue: + # terminate called after throwing an instance of 'std::runtime_error' + # what(): The data size is too small or the bucket size is too big. Select the parameters so that is NOT less than * 1000 + # Aborted (core dumped) + self._method_param.append('bucketSize=%d' % min(int(X.shape[0] * 0.0005), 1000)) self._index = nmslib.initIndex(X.shape[0], self._nmslib_metric, [], self._method_name, self._method_param, nmslib.DataType.VECTOR, nmslib.DistType.FLOAT) for i, x in enumerate(X): @@ -191,9 +197,11 @@ def fit(self, X): nmslib.buildIndex(self._index) def query(self, v, n): + import nmslib return nmslib.knnQuery(self._index, n, v.tolist()) def freeIndex(self): + import nmslib nmslib.freeIndex(self._index) @@ -280,7 +288,7 @@ def get_queries(args): return queries def get_algos(m): - return { + algos = { 'lshf': [LSHF(m, 5, 10), LSHF(m, 5, 20), LSHF(m, 10, 20), LSHF(m, 10, 50), LSHF(m, 20, 100)], 'flann': [FLANN(m, 0.2), FLANN(m, 0.5), FLANN(m, 0.7), FLANN(m, 0.8), FLANN(m, 0.9), FLANN(m, 0.95), FLANN(m, 0.97), FLANN(m, 0.98), FLANN(m, 0.99), FLANN(m, 0.995)], 'panns': [PANNS(m, 5, 20), PANNS(m, 10, 10), PANNS(m, 10, 50), PANNS(m, 10, 100), PANNS(m, 20, 100), PANNS(m, 40, 100)], @@ -294,58 +302,63 @@ def get_algos(m): 'ball': [BallTree(m, 10), BallTree(m, 20), BallTree(m, 40), BallTree(m, 100), BallTree(m, 200), BallTree(m, 400), BallTree(m, 1000)], 'kd': [KDTree(m, 10), KDTree(m, 20), KDTree(m, 40), KDTree(m, 100), KDTree(m, 200), KDTree(m, 400), KDTree(m, 1000)], - # START: Non-Metric Space Library (nmslib) entries - 'MP-lsh(lshkit)':[ - Nmslib(m, 'lsh_multiprobe', ['desiredRecall=0.99','H=1200001','T=10','L=50','tuneK=10']), - Nmslib(m, 'lsh_multiprobe', ['desiredRecall=0.97','H=1200001','T=10','L=50','tuneK=10']), - Nmslib(m, 'lsh_multiprobe', ['desiredRecall=0.95','H=1200001','T=10','L=50','tuneK=10']), - Nmslib(m, 'lsh_multiprobe', ['desiredRecall=0.90','H=1200001','T=10','L=50','tuneK=10']), - Nmslib(m, 'lsh_multiprobe', ['desiredRecall=0.85','H=1200001','T=10','L=50','tuneK=10']), - Nmslib(m, 'lsh_multiprobe', ['desiredRecall=0.80','H=1200001','T=10','L=50','tuneK=10']), - Nmslib(m, 'lsh_multiprobe', ['desiredRecall=0.7','H=1200001','T=10','L=50','tuneK=10']), - Nmslib(m, 'lsh_multiprobe', ['desiredRecall=0.6','H=1200001','T=10','L=50','tuneK=10']), - Nmslib(m, 'lsh_multiprobe', ['desiredRecall=0.5','H=1200001','T=10','L=50','tuneK=10']), - Nmslib(m, 'lsh_multiprobe', ['desiredRecall=0.4','H=1200001','T=10','L=50','tuneK=10']), - Nmslib(m, 'lsh_multiprobe', ['desiredRecall=0.3','H=1200001','T=10','L=50','tuneK=10']), - Nmslib(m, 'lsh_multiprobe', ['desiredRecall=0.2','H=1200001','T=10','L=50','tuneK=10']), - Nmslib(m, 'lsh_multiprobe', ['desiredRecall=0.1','H=1200001','T=10','L=50','tuneK=10']), - ], - - 'bruteforce0(nmslib)': [Nmslib(m, 'seq_search', ['copyMem=0'])], - 'bruteforce1(nmslib)': [Nmslib(m, 'seq_search', ['copyMem=1'])], - - 'BallTree(nmslib)': [ - Nmslib(m, 'vptree', ['tuneK=10', 'desiredRecall=0.99', 'bucketSize=100']), - Nmslib(m, 'vptree', ['tuneK=10', 'desiredRecall=0.95', 'bucketSize=100']), - Nmslib(m, 'vptree', ['tuneK=10', 'desiredRecall=0.90', 'bucketSize=100']), - Nmslib(m, 'vptree', ['tuneK=10', 'desiredRecall=0.85', 'bucketSize=100']), - Nmslib(m, 'vptree', ['tuneK=10', 'desiredRecall=0.8', 'bucketSize=100']), - Nmslib(m, 'vptree', ['tuneK=10', 'desiredRecall=0.7', 'bucketSize=100']), - Nmslib(m, 'vptree', ['tuneK=10', 'desiredRecall=0.6', 'bucketSize=100']), - Nmslib(m, 'vptree', ['tuneK=10', 'desiredRecall=0.5', 'bucketSize=100']), - Nmslib(m, 'vptree', ['tuneK=10', 'desiredRecall=0.4', 'bucketSize=100']), - Nmslib(m, 'vptree', ['tuneK=10', 'desiredRecall=0.3', 'bucketSize=100']), - Nmslib(m, 'vptree', ['tuneK=10', 'desiredRecall=0.2', 'bucketSize=100']), - Nmslib(m, 'vptree', ['tuneK=10', 'desiredRecall=0.1', 'bucketSize=100']), - ], - - 'SW-graph(nmslib)':[ - Nmslib(m, 'small_world_rand', ['NN=20', 'initIndexAttempts=4', 'initSearchAttempts=48']), - Nmslib(m, 'small_world_rand', ['NN=20', 'initIndexAttempts=4', 'initSearchAttempts=32']), - Nmslib(m, 'small_world_rand', ['NN=20', 'initIndexAttempts=4', 'initSearchAttempts=16']), - Nmslib(m, 'small_world_rand', ['NN=20', 'initIndexAttempts=4', 'initSearchAttempts=8']), - Nmslib(m, 'small_world_rand', ['NN=20', 'initIndexAttempts=4', 'initSearchAttempts=4']), - Nmslib(m, 'small_world_rand', ['NN=20', 'initIndexAttempts=4', 'initSearchAttempts=2']), - Nmslib(m, 'small_world_rand', ['NN=17', 'initIndexAttempts=4', 'initSearchAttempts=2']), - Nmslib(m, 'small_world_rand', ['NN=14', 'initIndexAttempts=4', 'initSearchAttempts=2']), - Nmslib(m, 'small_world_rand', ['NN=11', 'initIndexAttempts=5', 'initSearchAttempts=2']), - Nmslib(m, 'small_world_rand', ['NN=8', 'initIndexAttempts=5', 'initSearchAttempts=2']), - Nmslib(m, 'small_world_rand', ['NN=5', 'initIndexAttempts=5', 'initSearchAttempts=2']), - Nmslib(m, 'small_world_rand', ['NN=3', 'initIndexAttempts=5', 'initSearchAttempts=2']), - ] - # END: Non-Metric Space Library (nmslib) entries + # START: Non-Metric Space Library (nmslib) entries + 'bruteforce0(nmslib)': [Nmslib(m, 'seq_search', ['copyMem=0'])], + 'bruteforce1(nmslib)': [Nmslib(m, 'seq_search', ['copyMem=1'])], + + 'BallTree(nmslib)': [ + Nmslib(m, 'vptree', ['tuneK=10', 'desiredRecall=0.99']), + Nmslib(m, 'vptree', ['tuneK=10', 'desiredRecall=0.95']), + Nmslib(m, 'vptree', ['tuneK=10', 'desiredRecall=0.90']), + Nmslib(m, 'vptree', ['tuneK=10', 'desiredRecall=0.85']), + Nmslib(m, 'vptree', ['tuneK=10', 'desiredRecall=0.8']), + Nmslib(m, 'vptree', ['tuneK=10', 'desiredRecall=0.7']), + Nmslib(m, 'vptree', ['tuneK=10', 'desiredRecall=0.6']), + Nmslib(m, 'vptree', ['tuneK=10', 'desiredRecall=0.5']), + Nmslib(m, 'vptree', ['tuneK=10', 'desiredRecall=0.4']), + Nmslib(m, 'vptree', ['tuneK=10', 'desiredRecall=0.3']), + Nmslib(m, 'vptree', ['tuneK=10', 'desiredRecall=0.2']), + Nmslib(m, 'vptree', ['tuneK=10', 'desiredRecall=0.1']), + ], + + 'SW-graph(nmslib)':[ + Nmslib(m, 'small_world_rand', ['NN=20', 'initIndexAttempts=4', 'initSearchAttempts=48']), + Nmslib(m, 'small_world_rand', ['NN=20', 'initIndexAttempts=4', 'initSearchAttempts=32']), + Nmslib(m, 'small_world_rand', ['NN=20', 'initIndexAttempts=4', 'initSearchAttempts=16']), + Nmslib(m, 'small_world_rand', ['NN=20', 'initIndexAttempts=4', 'initSearchAttempts=8']), + Nmslib(m, 'small_world_rand', ['NN=20', 'initIndexAttempts=4', 'initSearchAttempts=4']), + Nmslib(m, 'small_world_rand', ['NN=20', 'initIndexAttempts=4', 'initSearchAttempts=2']), + Nmslib(m, 'small_world_rand', ['NN=17', 'initIndexAttempts=4', 'initSearchAttempts=2']), + Nmslib(m, 'small_world_rand', ['NN=14', 'initIndexAttempts=4', 'initSearchAttempts=2']), + Nmslib(m, 'small_world_rand', ['NN=11', 'initIndexAttempts=5', 'initSearchAttempts=2']), + Nmslib(m, 'small_world_rand', ['NN=8', 'initIndexAttempts=5', 'initSearchAttempts=2']), + Nmslib(m, 'small_world_rand', ['NN=5', 'initIndexAttempts=5', 'initSearchAttempts=2']), + Nmslib(m, 'small_world_rand', ['NN=3', 'initIndexAttempts=5', 'initSearchAttempts=2']), + ] } + if m == 'euclidean': + # Only works for euclidean distance + algos['MP-lsh(lshkit)'] = [ + Nmslib(m, 'lsh_multiprobe', ['desiredRecall=0.99','H=1200001','T=10','L=50','tuneK=10']), + Nmslib(m, 'lsh_multiprobe', ['desiredRecall=0.97','H=1200001','T=10','L=50','tuneK=10']), + Nmslib(m, 'lsh_multiprobe', ['desiredRecall=0.95','H=1200001','T=10','L=50','tuneK=10']), + Nmslib(m, 'lsh_multiprobe', ['desiredRecall=0.90','H=1200001','T=10','L=50','tuneK=10']), + Nmslib(m, 'lsh_multiprobe', ['desiredRecall=0.85','H=1200001','T=10','L=50','tuneK=10']), + Nmslib(m, 'lsh_multiprobe', ['desiredRecall=0.80','H=1200001','T=10','L=50','tuneK=10']), + Nmslib(m, 'lsh_multiprobe', ['desiredRecall=0.7','H=1200001','T=10','L=50','tuneK=10']), + Nmslib(m, 'lsh_multiprobe', ['desiredRecall=0.6','H=1200001','T=10','L=50','tuneK=10']), + Nmslib(m, 'lsh_multiprobe', ['desiredRecall=0.5','H=1200001','T=10','L=50','tuneK=10']), + Nmslib(m, 'lsh_multiprobe', ['desiredRecall=0.4','H=1200001','T=10','L=50','tuneK=10']), + Nmslib(m, 'lsh_multiprobe', ['desiredRecall=0.3','H=1200001','T=10','L=50','tuneK=10']), + Nmslib(m, 'lsh_multiprobe', ['desiredRecall=0.2','H=1200001','T=10','L=50','tuneK=10']), + Nmslib(m, 'lsh_multiprobe', ['desiredRecall=0.1','H=1200001','T=10','L=50','tuneK=10']), + ] + + # END: Non-Metric Space Library (nmslib) entries + + return algos + def get_fn(base, args): fn = os.path.join(base, args.dataset)