Use word2bits to create a Hamming dataset #69

Merged: 2 commits, Mar 31, 2018
46 changes: 33 additions & 13 deletions ann_benchmarks/datasets.py
@@ -40,8 +40,8 @@ def write_output(train, test, fn, distance, count=100):
     f.attrs['distance'] = distance
     print('train size: %9d * %4d' % train.shape)
     print('test size: %9d * %4d' % test.shape)
-    f.create_dataset('train', (len(train), len(train[0])), dtype='f')[:] = train
-    f.create_dataset('test', (len(test), len(test[0])), dtype='f')[:] = test
+    f.create_dataset('train', (len(train), len(train[0])), dtype=train.dtype)[:] = train
+    f.create_dataset('test', (len(test), len(test[0])), dtype=test.dtype)[:] = test
     neighbors = f.create_dataset('neighbors', (len(test), count), dtype='i')
     distances = f.create_dataset('distances', (len(test), count), dtype='f')
     bf = BruteForceBLAS(distance, precision=numpy.float32)
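This dtype change is what makes a compact Hamming dataset possible: write_output now stores arrays with whatever dtype they arrive in instead of forcing float32. A minimal sketch of the effect, assuming h5py (which write_output's f.create_dataset/f.attrs calls appear to use) and a hypothetical bit-vector array:

    import h5py
    import numpy

    X = numpy.random.rand(1000, 800) > 0.5   # hypothetical 800-bit vectors, dtype bool
    f = h5py.File('example.hdf5', 'w')
    # stored as bool, roughly 1 byte per element
    f.create_dataset('train', X.shape, dtype=X.dtype)[:] = X
    # with the old dtype='f', the same bits would be widened to 4-byte floats
    f.close()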
@@ -57,8 +57,13 @@ def write_output(train, test, fn, distance, count=100):
     f.close()
 
 
-def glove(out_fn, d):
+def train_test_split(X, test_size=10000):
+    import sklearn.model_selection
+    print('Splitting %d*%d into train/test' % X.shape)
+    return sklearn.model_selection.train_test_split(X, test_size=test_size, random_state=1)
+
+
+def glove(out_fn, d):
     import zipfile
 
     url = 'http://nlp.stanford.edu/data/glove.twitter.27B.zip'
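The new train_test_split helper centralizes the split each dataset builder previously spelled out inline; the fixed random_state=1 keeps every split reproducible across runs and datasets. A hypothetical call (the array contents are stand-ins, not real data):

    import numpy

    X = numpy.random.rand(50000, 100)       # stand-in for parsed vectors
    X_train, X_test = train_test_split(X)   # prints 'Splitting 50000*100 into train/test'
    print(X_train.shape, X_test.shape)      # (40000, 100) (10000, 100)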
@@ -71,9 +76,7 @@ def glove(out_fn, d):
         for line in z.open(z_fn):
             v = [float(x) for x in line.strip().split()[1:]]
             X.append(numpy.array(v))
-        print('splitting output...')
-        X_train, X_test = sklearn.model_selection.train_test_split(X, test_size=10000, random_state=1)
-        print('writing output...')
+        X_train, X_test = train_test_split(X)
         write_output(numpy.array(X_train), numpy.array(X_test), out_fn, 'angular')


@@ -170,7 +173,6 @@ def fashion_mnist(out_fn):
 
 def transform_bag_of_words(filename, n_dimensions, out_fn):
     import gzip
-    import sklearn.model_selection
     from scipy.sparse import lil_matrix
     from sklearn.feature_extraction.text import TfidfTransformer
     from sklearn import random_projection
@@ -188,24 +190,41 @@ def transform_bag_of_words(filename, n_dimensions, out_fn):
     B = TfidfTransformer().fit_transform(A)
     print("reducing dimensionality...")
     C = random_projection.GaussianRandomProjection(n_components = n_dimensions).fit_transform(B)
-    X_train, X_test = sklearn.model_selection.train_test_split(C, test_size=10000, random_state=1)
-    print('writing output...')
+    X_train, X_test = train_test_split(C)
     write_output(numpy.array(X_train), numpy.array(X_test), out_fn, 'angular')
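For context, the unchanged lines above are the bag-of-words pipeline the new split plugs into: sparse counts are TF-IDF-weighted, then projected down with a Gaussian random projection. A self-contained sketch with made-up shapes (not the real NYTimes data):

    from scipy.sparse import lil_matrix
    from sklearn.feature_extraction.text import TfidfTransformer
    from sklearn import random_projection

    A = lil_matrix((1000, 5000))    # hypothetical document-word count matrix
    A[0, 42] = 3                    # e.g. word 42 occurs 3 times in document 0
    B = TfidfTransformer().fit_transform(A)    # reweight raw counts
    C = random_projection.GaussianRandomProjection(
        n_components=256).fit_transform(B)     # dense 256-d vectors
    print(C.shape)                  # (1000, 256)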


 def nytimes(out_fn, n_dimensions):
-    download('https://archive.ics.uci.edu/ml/machine-learning-databases/bag-of-words/docword.nytimes.txt.gz', 'nytimes.txt.gz')
-    transform_bag_of_words('nytimes.txt.gz', n_dimensions, out_fn)
+    fn = 'nytimes_%s.txt.gz' % n_dimensions
+    download('https://archive.ics.uci.edu/ml/machine-learning-databases/bag-of-words/docword.nytimes.txt.gz', fn)
+    transform_bag_of_words(fn, n_dimensions, out_fn)


 def random(out_fn, n_dims, n_samples, centers, distance):
-    import sklearn.model_selection
     import sklearn.datasets
 
     X, _ = sklearn.datasets.make_blobs(n_samples=n_samples, n_features=n_dims, centers=centers, random_state=1)
-    X_train, X_test = sklearn.model_selection.train_test_split(X, test_size=0.1, random_state=1)
+    X_train, X_test = train_test_split(X, test_size=0.1)
     write_output(X_train, X_test, out_fn, distance)


+def word2bits(out_fn, path, fn):
+    import tarfile
+    local_fn = fn + '.tar.gz'
+    url = 'http://web.stanford.edu/~maxlam/word_vectors/compressed/%s/%s.tar.gz' % (path, fn)
+    download(url, local_fn)
+    print('parsing vectors in %s...' % local_fn)
+    with tarfile.open(local_fn, 'r:gz') as t:
+        f = t.extractfile(fn)
+        n_words, k = [int(z) for z in next(f).strip().split()]
+        X = numpy.zeros((n_words, k), dtype=numpy.bool)
+        for i in range(n_words):
+            X[i] = [float(z) > 0 for z in next(f).strip().split()[1:]]
+
+    X_train, X_test = train_test_split(X)
+    write_output(X_train, X_test, out_fn, 'euclidean')  # TODO: use hamming
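The TODO is less of a blocker than it looks: on {0,1}-valued vectors the squared Euclidean distance equals the Hamming distance, and since the square root is monotonic, Euclidean nearest-neighbor rankings are identical to Hamming rankings. A quick check:

    import numpy

    a = numpy.array([1, 0, 1, 1, 0], dtype=bool)
    b = numpy.array([1, 1, 0, 1, 0], dtype=bool)

    hamming = numpy.count_nonzero(a != b)                           # 2 differing bits
    sq_euclidean = numpy.sum((a.astype(int) - b.astype(int)) ** 2)  # also 2
    assert hamming == sq_euclidean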


 DATASETS = {
     'fashion-mnist-784-euclidean': fashion_mnist,
     'gist-960-euclidean': gist,
@@ -221,4 +240,5 @@ def random(out_fn, n_dims, n_samples, centers, distance):
     'sift-128-euclidean': sift,
     'nytimes-256-angular': lambda out_fn: nytimes(out_fn, 256),
     'nytimes-16-angular': lambda out_fn: nytimes(out_fn, 16),
+    'word2bits-800-hamming': lambda out_fn: word2bits(out_fn, '400K', 'w2b_bitlevel1_size800_vocab400K'),
 }
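Each DATASETS entry maps a dataset name to a callable taking only an output filename, so building the new dataset presumably reduces to something like the following (the .hdf5 filename is an assumed convention, not from this diff):

    fn = 'word2bits-800-hamming.hdf5'        # hypothetical output path
    DATASETS['word2bits-800-hamming'](fn)    # downloads, parses, splits, writes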