Merge pull request #69 from erikbern/hamming
Use word2bits to create a Hamming dataset
erikbern authored Mar 31, 2018
2 parents d8afeff + 48c1a06 · commit ed5b030
Showing 1 changed file with 33 additions and 13 deletions.
ann_benchmarks/datasets.py:
@@ -40,8 +40,8 @@ def write_output(train, test, fn, distance, count=100):
     f.attrs['distance'] = distance
     print('train size: %9d * %4d' % train.shape)
     print('test size: %9d * %4d' % test.shape)
-    f.create_dataset('train', (len(train), len(train[0])), dtype='f')[:] = train
-    f.create_dataset('test', (len(test), len(test[0])), dtype='f')[:] = test
+    f.create_dataset('train', (len(train), len(train[0])), dtype=train.dtype)[:] = train
+    f.create_dataset('test', (len(test), len(test[0])), dtype=test.dtype)[:] = test
     neighbors = f.create_dataset('neighbors', (len(test), count), dtype='i')
     distances = f.create_dataset('distances', (len(test), count), dtype='f')
     bf = BruteForceBLAS(distance, precision=numpy.float32)
@@ -57,8 +57,13 @@ def write_output(train, test, fn, distance, count=100):
     f.close()
 
 
-def glove(out_fn, d):
+def train_test_split(X, test_size=10000):
+    import sklearn.model_selection
+    print('Splitting %d*%d into train/test' % X.shape)
+    return sklearn.model_selection.train_test_split(X, test_size=test_size, random_state=1)
+
+
+def glove(out_fn, d):
     import zipfile
 
     url = 'http://nlp.stanford.edu/data/glove.twitter.27B.zip'
@@ -71,9 +76,7 @@ def glove(out_fn, d):
         for line in z.open(z_fn):
             v = [float(x) for x in line.strip().split()[1:]]
             X.append(numpy.array(v))
-    print('splitting output...')
-    X_train, X_test = sklearn.model_selection.train_test_split(X, test_size=10000, random_state=1)
-    print('writing output...')
+    X_train, X_test = train_test_split(X)
     write_output(numpy.array(X_train), numpy.array(X_test), out_fn, 'angular')
 
 
@@ -170,7 +173,6 @@ def fashion_mnist(out_fn):
 
 def transform_bag_of_words(filename, n_dimensions, out_fn):
     import gzip
-    import sklearn.model_selection
     from scipy.sparse import lil_matrix
     from sklearn.feature_extraction.text import TfidfTransformer
     from sklearn import random_projection
@@ -188,24 +190,41 @@ def transform_bag_of_words(filename, n_dimensions, out_fn):
     B = TfidfTransformer().fit_transform(A)
     print("reducing dimensionality...")
     C = random_projection.GaussianRandomProjection(n_components = n_dimensions).fit_transform(B)
-    X_train, X_test = sklearn.model_selection.train_test_split(C, test_size=10000, random_state=1)
-    print('writing output...')
+    X_train, X_test = train_test_split(C)
     write_output(numpy.array(X_train), numpy.array(X_test), out_fn, 'angular')
 
 
 def nytimes(out_fn, n_dimensions):
-    download('https://archive.ics.uci.edu/ml/machine-learning-databases/bag-of-words/docword.nytimes.txt.gz', 'nytimes.txt.gz')
-    transform_bag_of_words('nytimes.txt.gz', n_dimensions, out_fn)
+    fn = 'nytimes_%s.txt.gz' % n_dimensions
+    download('https://archive.ics.uci.edu/ml/machine-learning-databases/bag-of-words/docword.nytimes.txt.gz', fn)
+    transform_bag_of_words(fn, n_dimensions, out_fn)
 
 
 def random(out_fn, n_dims, n_samples, centers, distance):
-    import sklearn.model_selection
     import sklearn.datasets
 
     X, _ = sklearn.datasets.make_blobs(n_samples=n_samples, n_features=n_dims, centers=centers, random_state=1)
-    X_train, X_test = sklearn.model_selection.train_test_split(X, test_size=0.1, random_state=1)
+    X_train, X_test = train_test_split(X, test_size=0.1)
     write_output(X_train, X_test, out_fn, distance)
 
 
+def word2bits(out_fn, path, fn):
+    import tarfile
+    local_fn = fn + '.tar.gz'
+    url = 'http://web.stanford.edu/~maxlam/word_vectors/compressed/%s/%s.tar.gz' % (path, fn)
+    download(url, local_fn)
+    print('parsing vectors in %s...' % local_fn)
+    with tarfile.open(local_fn, 'r:gz') as t:
+        f = t.extractfile(fn)
+        n_words, k = [int(z) for z in next(f).strip().split()]
+        X = numpy.zeros((n_words, k), dtype=numpy.bool)
+        for i in range(n_words):
+            X[i] = [float(z) > 0 for z in next(f).strip().split()[1:]]
+
+    X_train, X_test = train_test_split(X)
+    write_output(X_train, X_test, out_fn, 'euclidean') # TODO: use hamming
+
+
 DATASETS = {
     'fashion-mnist-784-euclidean': fashion_mnist,
     'gist-960-euclidean': gist,
@@ -221,4 +240,5 @@ def random(out_fn, n_dims, n_samples, centers, distance):
     'sift-128-euclidean': sift,
     'nytimes-256-angular': lambda out_fn: nytimes(out_fn, 256),
     'nytimes-16-angular': lambda out_fn: nytimes(out_fn, 16),
+    'word2bits-800-hamming': lambda out_fn: word2bits(out_fn, '400K', 'w2b_bitlevel1_size800_vocab400K'),
 }
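
A note on the trailing "# TODO: use hamming": the word2bits vectors are boolean, yet the dataset is written with 'euclidean' as a stand-in distance. For 0/1 vectors this is a safe stopgap, because squared Euclidean distance counts exactly the positions where two vectors differ, i.e. their Hamming distance, and Euclidean proper is a monotone (square-root) transform of that, so the brute-force ground truth ranks neighbors identically either way. The dtype=train.dtype change in write_output is what lets these boolean vectors reach the HDF5 file as bools instead of being coerced to 'f'. A minimal sketch of the equivalence, using made-up toy vectors that are not part of this commit:

import numpy

# Two hypothetical 8-bit vectors standing in for word2bits rows.
a = numpy.array([1, 0, 1, 1, 0, 0, 1, 0], dtype=bool)
b = numpy.array([1, 1, 0, 1, 0, 1, 1, 0], dtype=bool)

# Hamming distance: the number of positions where the bits differ.
hamming = numpy.count_nonzero(a != b)

# Squared Euclidean distance after casting to float: each mismatched
# position contributes (1 - 0)^2 = 1, each matching position 0.
diff = a.astype(numpy.float32) - b.astype(numpy.float32)
euclidean_sq = float((diff ** 2).sum())

assert hamming == int(euclidean_sq)  # both equal 3 for these vectors

With the new DATASETS entry, the file is built as DATASETS['word2bits-800-hamming'](out_fn): this downloads the 400K-word, 800-bit word2bits vectors, holds out 10000 test points via the new train_test_split helper, and writes train, test, and brute-force ground truth to HDF5.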
