diff --git a/ann_benchmarks/datasets.py b/ann_benchmarks/datasets.py
index 19b766937..693db60a9 100644
--- a/ann_benchmarks/datasets.py
+++ b/ann_benchmarks/datasets.py
@@ -40,8 +40,8 @@ def write_output(train, test, fn, distance, count=100):
     f.attrs['distance'] = distance
     print('train size: %9d * %4d' % train.shape)
     print('test size: %9d * %4d' % test.shape)
-    f.create_dataset('train', (len(train), len(train[0])), dtype='f')[:] = train
-    f.create_dataset('test', (len(test), len(test[0])), dtype='f')[:] = test
+    f.create_dataset('train', (len(train), len(train[0])), dtype=train.dtype)[:] = train
+    f.create_dataset('test', (len(test), len(test[0])), dtype=test.dtype)[:] = test
     neighbors = f.create_dataset('neighbors', (len(test), count), dtype='i')
     distances = f.create_dataset('distances', (len(test), count), dtype='f')
     bf = BruteForceBLAS(distance, precision=numpy.float32)
@@ -57,8 +57,13 @@ def write_output(train, test, fn, distance, count=100):
     f.close()
 
 
-def glove(out_fn, d):
+def train_test_split(X, test_size=10000):
     import sklearn.model_selection
+    print('Splitting %d*%d into train/test' % X.shape)
+    return sklearn.model_selection.train_test_split(X, test_size=test_size, random_state=1)
+
+
+def glove(out_fn, d):
     import zipfile
 
     url = 'http://nlp.stanford.edu/data/glove.twitter.27B.zip'
@@ -71,9 +76,7 @@ def glove(out_fn, d):
         for line in z.open(z_fn):
             v = [float(x) for x in line.strip().split()[1:]]
             X.append(numpy.array(v))
-        print('splitting output...')
-        X_train, X_test = sklearn.model_selection.train_test_split(X, test_size=10000, random_state=1)
-        print('writing output...')
+        X_train, X_test = train_test_split(numpy.array(X))  # convert: the helper reads X.shape, which a plain list lacks
         write_output(numpy.array(X_train), numpy.array(X_test), out_fn, 'angular')
 
 
@@ -170,7 +173,6 @@ def fashion_mnist(out_fn):
 
 def transform_bag_of_words(filename, n_dimensions, out_fn):
     import gzip
-    import sklearn.model_selection
     from scipy.sparse import lil_matrix
     from sklearn.feature_extraction.text import TfidfTransformer
     from sklearn import random_projection
@@ -188,24 +190,41 @@ def transform_bag_of_words(filename, n_dimensions, out_fn):
     B = TfidfTransformer().fit_transform(A)
     print("reducing dimensionality...")
     C = random_projection.GaussianRandomProjection(n_components = n_dimensions).fit_transform(B)
-    X_train, X_test = sklearn.model_selection.train_test_split(C, test_size=10000, random_state=1)
-    print('writing output...')
+    X_train, X_test = train_test_split(C)
     write_output(numpy.array(X_train), numpy.array(X_test), out_fn, 'angular')
 
 
 def nytimes(out_fn, n_dimensions):
-    download('https://archive.ics.uci.edu/ml/machine-learning-databases/bag-of-words/docword.nytimes.txt.gz', 'nytimes.txt.gz')
-    transform_bag_of_words('nytimes.txt.gz', n_dimensions, out_fn)
+    fn = 'nytimes_%s.txt.gz' % n_dimensions
+    download('https://archive.ics.uci.edu/ml/machine-learning-databases/bag-of-words/docword.nytimes.txt.gz', fn)
+    transform_bag_of_words(fn, n_dimensions, out_fn)
 
 
 def random(out_fn, n_dims, n_samples, centers, distance):
-    import sklearn.model_selection
     import sklearn.datasets
 
     X, _ = sklearn.datasets.make_blobs(n_samples=n_samples, n_features=n_dims, centers=centers, random_state=1)
-    X_train, X_test = sklearn.model_selection.train_test_split(X, test_size=0.1, random_state=1)
+    X_train, X_test = train_test_split(X, test_size=0.1)
     write_output(X_train, X_test, out_fn, distance)
 
 
+
+def word2bits(out_fn, path, fn):
+    import tarfile
+    local_fn = fn + '.tar.gz'
+    url = 'http://web.stanford.edu/~maxlam/word_vectors/compressed/%s/%s.tar.gz' % (path, fn)
+    download(url, local_fn)
+    print('parsing vectors in %s...' % local_fn)
+    with tarfile.open(local_fn, 'r:gz') as t:
+        f = t.extractfile(fn)
+        n_words, k = [int(z) for z in next(f).strip().split()]  # header line: vocab size and dimensionality
+        X = numpy.zeros((n_words, k), dtype=numpy.bool_)  # numpy.bool_: the bare numpy.bool alias is deprecated/removed
+        for i in range(n_words):
+            X[i] = [float(z) > 0 for z in next(f).strip().split()[1:]]  # drop the word token, binarize the bits
+
+        X_train, X_test = train_test_split(X)
+        write_output(X_train, X_test, out_fn, 'euclidean')  # TODO: use hamming
+
+
 DATASETS = {
     'fashion-mnist-784-euclidean': fashion_mnist,
     'gist-960-euclidean': gist,
@@ -221,4 +240,5 @@ def random(out_fn, n_dims, n_samples, centers, distance):
     'sift-128-euclidean': sift,
     'nytimes-256-angular': lambda out_fn: nytimes(out_fn, 256),
     'nytimes-16-angular': lambda out_fn: nytimes(out_fn, 16),
+    'word2bits-800-hamming': lambda out_fn: word2bits(out_fn, '400K', 'w2b_bitlevel1_size800_vocab400K'),
 }
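A quick sanity check on the dtype=train.dtype change in write_output: h5py supports numpy boolean arrays (stored as an HDF5 enum), so the word2bits bit vectors now round-trip intact instead of being coerced to float32 by the old hard-coded dtype='f'. A minimal sketch, with an illustrative file name:

    import h5py
    import numpy

    # Stand-in for word2bits vectors: a random boolean matrix.
    X = numpy.random.rand(100, 800) > 0.5
    with h5py.File('demo.hdf5', 'w') as f:
        f.create_dataset('train', X.shape, dtype=X.dtype)[:] = X
    with h5py.File('demo.hdf5', 'r') as f:
        assert f['train'].dtype == numpy.bool_  # bools survive; dtype='f' would have given float32
        assert (f['train'][:] == X).all()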
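On the `# TODO: use hamming` stopgap: for binary vectors, squared Euclidean distance equals Hamming distance (each differing bit contributes exactly 1), so Euclidean is monotone in Hamming and the brute-force ground truth ranks neighbors identically; only the reported distance values differ by a square root. A small check:

    import numpy

    u = numpy.array([1, 0, 1, 1, 0], dtype=bool)
    v = numpy.array([1, 1, 0, 1, 0], dtype=bool)

    hamming = numpy.count_nonzero(u != v)                        # bits that differ: 2
    euclidean_sq = ((u.astype(int) - v.astype(int)) ** 2).sum()  # also 2
    assert hamming == euclidean_sq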
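For reference, each DATASETS value is a callable taking only the output filename; it handles download, parsing, splitting, and writing. A hypothetical driver sketch (the repo's real entry point may differ, and get_dataset_fn here is illustrative):

    import os

    def get_dataset_fn(name):  # illustrative helper, not part of this patch
        if not os.path.exists('data'):
            os.mkdir('data')
        return os.path.join('data', '%s.hdf5' % name)

    name = 'word2bits-800-hamming'
    fn = get_dataset_fn(name)
    if not os.path.exists(fn):
        DATASETS[name](fn)  # downloads the tarball, parses, splits, writes HDF5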