Skip to content

Commit

Permalink
fixing stuff
Browse files Browse the repository at this point in the history
  • Loading branch information
Erik Bernhardsson committed Jun 20, 2015
1 parent 74c397d commit 292436c
Show file tree
Hide file tree
Showing 7 changed files with 45 additions and 28 deletions.
5 changes: 3 additions & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
language: python
python:
- "2.7"
- "2.7_with_system_site_packages"
- "3.4"

install: bash install.sh
install:
- sudo bash install.sh

script: nosetests
29 changes: 16 additions & 13 deletions ann_benchmarks/__init__.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
import gzip, numpy, time, os, multiprocessing, argparse, pickle, resource
import gzip, numpy, time, os, multiprocessing, argparse, pickle, resource, random
try:
from urllib import urlretrieve
except ImportError:
from urllib.request import urlretrieve # Python 3
import sklearn.cross_validation, sklearn.preprocessing, random
import sklearn.preprocessing

# Set resource limits to prevent memory bombs
memory_limit = 12 * 2**30
soft, hard = resource.getrlimit(resource.RLIMIT_DATA)
if soft == resource.RLIM_INFINITY or soft >= memory_limit:
print 'resetting memory limit from', soft, 'to', memory_limit
print('resetting memory limit from', soft, 'to', memory_limit)
resource.setrlimit(resource.RLIMIT_DATA, (memory_limit, hard))


Expand Down Expand Up @@ -203,6 +203,7 @@ def __init__(self, metric):
self.name = 'BruteForce()'

def fit(self, X):
import sklearn.neighbors
metric = {'angular': 'cosine', 'euclidean': 'l2'}[self._metric]
self._nbrs = sklearn.neighbors.NearestNeighbors(algorithm='brute', metric=metric)
self._nbrs.fit(X)
Expand All @@ -223,8 +224,10 @@ def get_dataset(which='glove', limit=-1):
break

X = numpy.vstack(X)
import sklearn.cross_validation

X_train, X_test = sklearn.cross_validation.train_test_split(X, test_size=1000, random_state=42)
print X_train.shape, X_test.shape
print(X_train.shape, X_test.shape)
return X_train, X_test


Expand All @@ -235,7 +238,7 @@ def run_algo(args, library, algo, results_fn):
if algo != 'bf':
algo.fit(X_train)
build_time = time.time() - t0
print 'Built index in', build_time
print('Built index in', build_time)

best_search_time = float('inf')
best_precision = 0.0 # should be deterministic but paranoid
Expand All @@ -249,18 +252,18 @@ def run_algo(args, library, algo, results_fn):
precision = k / (len(queries) * 10)
best_search_time = min(best_search_time, search_time)
best_precision = max(best_precision, precision)
print search_time, precision
print(search_time, precision)

output = [library, algo.name, build_time, best_search_time, best_precision]
print output
print(output)

f = open(results_fn, 'a')
f.write('\t'.join(map(str, output)) + '\n')
f.close()


def get_queries(args):
print 'computing queries with correct results...'
print('computing queries with correct results...')

bf = BruteForce(args.distance)
X_train, X_test = get_dataset(which=args.dataset, limit=args.limit)
Expand All @@ -272,7 +275,7 @@ def get_queries(args):
correct = bf.query(x, 10)
queries.append((x, correct))
if len(queries) % 100 == 0:
print len(queries), '...'
print(len(queries), '...')

return queries

Expand Down Expand Up @@ -369,7 +372,7 @@ def get_fn(base, args):
results_fn = get_fn('results', args)
queries_fn = get_fn('queries', args)

print 'storing queries in', queries_fn, 'and results in', results_fn
print('storing queries in', queries_fn, 'and results in', results_fn)

if not os.path.exists(queries_fn):
queries = get_queries(args)
Expand All @@ -379,7 +382,7 @@ def get_fn(base, args):
else:
queries = pickle.load(open(queries_fn))

print 'got', len(queries), 'queries'
print('got', len(queries), 'queries')

algos_already_ran = set()
if os.path.exists(results_fn):
Expand All @@ -396,10 +399,10 @@ def get_fn(base, args):

random.shuffle(algos_flat)

print 'order:', algos_flat
print('order:', algos_flat)

for library, algo in algos_flat:
print algo.name, '...'
print(algo.name, '...')
# Spawn a subprocess to force the memory to be reclaimed at the end
p = multiprocessing.Process(target=run_algo, args=(args, library, algo, results_fn))
p.start()
Expand Down
11 changes: 10 additions & 1 deletion install.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,13 @@
apt-get install -y python-numpy python-scipy python-sklearn
apt-get update
apt-get install -y python-numpy python-scipy python-pip
pip install scikit-learn

# Install GCC 4.8
add-apt-repository ppa:ubuntu-toolchain-r/test -y
apt-get update -qq
apt-get install -y libboost1.48-all-dev g++-4.8
export CXX="g++-4.8" CC="gcc-4.8"

cd install
for fn in annoy.sh panns.sh nearpy.sh sklearn.sh flann.sh kgraph.sh nmslib.sh
do
Expand Down
1 change: 1 addition & 0 deletions install/kgraph.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
git clone https://github.com/aaalgo/kgraph
pushd kgraph
apt-get install -y libboost-timer-dev libboost-chrono-dev
sudo make deps-ubuntu
make
make release
Expand Down
4 changes: 2 additions & 2 deletions install/nearpy.sh
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
sudo apt-get install -y python-pip
sudo pip install nearpy bitarray redis
apt-get install -y python-pip
pip install nearpy bitarray redis h5py
9 changes: 4 additions & 5 deletions install/nmslib.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,12 @@ rm -rf NonMetricSpaceLib
# Note that we use the develop branch here:
git clone https://github.com/searchivarius/NonMetricSpaceLib.git
cd NonMetricSpaceLib/similarity_search
git checkout ann-benchmark
sudo apt-get install -y cmake libeigen3-dev libgsl0-dev libboost-all-dev g++-4.8
# Actually let's make g++ an alias
alias g++=g++-4.8
git checkout ann-benchmark
apt-get install -y cmake libeigen3-dev libgsl0-dev libboost-all-dev
echo "CC: $CC, CXX: $CXX"
cmake .
make -j 4
cd ../python_binding
make
sudo make install
make install
cd ../..
14 changes: 9 additions & 5 deletions test/test.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import random
import inspect
import ann_benchmarks
from sklearn.datasets.samples_generator import make_blobs
Expand All @@ -10,13 +11,16 @@
def check_algo(algo_name, algo):
algo.fit(X)
result = algo.query(X[42], 10)
assert result[0] == 42
assert len(result) == 10
assert len(set(result)) == 10
if result[0] != 42:
raise AssertionError('Expected first item to be 42: Result: %s' % result)
if len(result) != 10:
raise AssertionError('Expected results to have length 10: Result: %s' % result)
if len(set(result)) != 10:
raise AssertionError('Expected results to be unique: Result: %s' % result)

def test_all_algos():
for metric in ['angular', 'euclidean']:
algos = ann_benchmarks.get_algos(metric)
for algo_key in algos.keys():
for algo in algos[algo_key]:
yield check_algo, algo.name, algo # pass name just so unittest can capture it
algo = random.choice(algos[algo_key]) # Just pick one of each
yield check_algo, algo.name, algo # pass name just so unittest can capture it

0 comments on commit 292436c

Please sign in to comment.