Split build_vocab to scan, scale, finalize; train() loop/locking refactor; downsampling into cython #380

Merged
13 commits, merged Jul 5, 2015
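The title above summarizes the change: build_vocab() is split into three phases, the train() loop and vector locking are refactored, and the frequent-word downsampling decision moves from Python into the Cython routines. A minimal sketch of the three-phase split, using the phase names from the title (the class and signatures here are illustrative only, not the exact gensim API):

    class VocabBuilderSketch:
        """Illustrative outline only; the real methods live on the gensim model classes."""

        def scan_vocab(self, documents):
            pass  # one pass over the corpus, counting raw word frequencies

        def scale_vocab(self):
            pass  # apply min_count and sample; precompute per-word sample_int thresholds

        def finalize_vocab(self):
            pass  # build cum_table / Huffman tree and allocate the weight matrices

        def build_vocab(self, documents):
            self.scan_vocab(documents)
            self.scale_vocab()
            self.finalize_vocab()

One plausible motivation for the split (an inference from the phase names, not stated in this diff) is that a corpus can be scanned once and then re-scaled with different min_count/sample settings without another full pass.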
2 changes: 2 additions & 0 deletions .gitignore
@@ -29,6 +29,7 @@
*.pkl
*.bak
*.npy
*.npz

# OS generated files #
######################
@@ -44,6 +45,7 @@ Thumbs.db
.ropeproject
.settings/
.eggs
cython_debug
docs/src/_build/
docs/_static
dedan_gensim.tmproj
215 changes: 116 additions & 99 deletions gensim/models/doc2vec.py

Large diffs are not rendered by default.

3,686 changes: 2,068 additions & 1,618 deletions gensim/models/doc2vec_inner.c

Large diffs are not rendered by default.

128 changes: 69 additions & 59 deletions gensim/models/doc2vec_inner.pyx
@@ -17,7 +17,7 @@ from libc.string cimport memset, memcpy

from scipy.linalg.blas import fblas

from word2vec_inner cimport bisect_left, \
from word2vec_inner cimport bisect_left, random_int32, \
scopy, saxpy, sdot, dsdot, snrm2, sscal, \
REAL_t, EXP_TABLE, \
our_dot, our_saxpy, \
@@ -219,11 +219,12 @@ cdef unsigned long long fast_document_dmc_neg(
return next_random


def train_document_dbow(model, word_vocabs, doctag_indexes, alpha, work=None,
def train_document_dbow(model, doc_words, doctag_indexes, alpha, work=None,
train_words=False, learn_doctags=True, learn_words=True, learn_hidden=True,
word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None):
cdef int hs = model.hs
cdef int negative = model.negative
cdef int sample = (model.sample != 0)
cdef int _train_words = train_words
cdef int _learn_words = learn_words
cdef int _learn_hidden = learn_hidden
@@ -246,6 +247,7 @@ def train_document_dbow(model, word_vocabs, doctag_indexes, alpha, work=None,
cdef int window = model.window

cdef int i, j
cdef unsigned long long r
cdef long result = 0

# For hierarchical softmax
@@ -280,45 +282,46 @@ def train_document_dbow(model, word_vocabs, doctag_indexes, alpha, work=None,
syn1neg = <REAL_t *>(np.PyArray_DATA(model.syn1neg))
cum_table = <np.uint32_t *>(np.PyArray_DATA(model.cum_table))
cum_table_len = len(model.cum_table)
if negative or sample:
next_random = (2**24) * model.random.randint(0, 2**24) + model.random.randint(0, 2**24)

# convert Python structures to primitive types, so we can release the GIL
if work is None:
work = zeros(model.layer1_size, dtype=REAL)
_work = <REAL_t *>np.PyArray_DATA(work)
document_len = <int>min(MAX_DOCUMENT_LEN, len(word_vocabs))
doctag_len = <int>min(MAX_DOCUMENT_LEN, len(doctag_indexes))

for i in range(document_len):
predict_word = word_vocabs[i]
if predict_word is None:
# shrink document to leave out word
document_len = document_len - 1
continue # leaving j unchanged
else:
indexes[i] = predict_word.index
if hs:
codelens[i] = <int>len(predict_word.code)
codes[i] = <np.uint8_t *>np.PyArray_DATA(predict_word.code)
points[i] = <np.uint32_t *>np.PyArray_DATA(predict_word.point)
else:
codelens[i] = 1
result += 1
vlookup = model.vocab
i = 0
for token in doc_words:
predict_word = vlookup[token] if token in vlookup else None
if predict_word is None: # shrink document to leave out word
continue # leaving i unchanged
if sample and predict_word.sample_int < random_int32(&next_random):
continue
indexes[i] = predict_word.index
if hs:
codelens[i] = <int>len(predict_word.code)
codes[i] = <np.uint8_t *>np.PyArray_DATA(predict_word.code)
points[i] = <np.uint32_t *>np.PyArray_DATA(predict_word.point)
result += 1
i += 1
if i == MAX_DOCUMENT_LEN:
break # TODO: log warning, tally overflow?
document_len = i

if _train_words:
# single randint() call avoids a big thread-synchronization slowdown
for i, item in enumerate(model.random.randint(0, window, document_len)):
reduced_windows[i] = item

doctag_len = <int>min(MAX_DOCUMENT_LEN, len(doctag_indexes))
for i in range(doctag_len):
_doctag_indexes[i] = doctag_indexes[i]
result += 1

# release GIL & train on the document
with nogil:
for i in range(document_len):
if codelens[i] == 0:
continue
if _train_words: # simultaneous skip-gram wordvec-training
j = i - window + reduced_windows[i]
if j < 0:
@@ -327,7 +330,7 @@ def train_document_dbow(model, word_vocabs, doctag_indexes, alpha, work=None,
if k > document_len:
k = document_len
for j in range(j, k):
if j == i or codelens[j] == 0:
if j == i:
continue
if hs:
# we reuse the DBOW function, as it is equivalent to skip-gram for this purpose
@@ -352,11 +355,12 @@ def train_document_dbow(model, word_vocabs, doctag_indexes, alpha, work=None,
return result


def train_document_dm(model, word_vocabs, doctag_indexes, alpha, work=None, neu1=None,
def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=None,
learn_doctags=True, learn_words=True, learn_hidden=True,
word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None):
cdef int hs = model.hs
cdef int negative = model.negative
cdef int sample = (model.sample != 0)
cdef int _learn_doctags = learn_doctags
cdef int _learn_words = learn_words
cdef int _learn_hidden = learn_hidden
@@ -415,6 +419,7 @@ def train_document_dm(model, word_vocabs, doctag_indexes, alpha, work=None, neu1
syn1neg = <REAL_t *>(np.PyArray_DATA(model.syn1neg))
cum_table = <np.uint32_t *>(np.PyArray_DATA(model.cum_table))
cum_table_len = len(model.cum_table)
if negative or sample:
next_random = (2**24) * model.random.randint(0, 2**24) + model.random.randint(0, 2**24)

# convert Python structures to primitive types, so we can release the GIL
@@ -425,22 +430,25 @@ def train_document_dm(model, word_vocabs, doctag_indexes, alpha, work=None, neu1
neu1 = zeros(model.layer1_size, dtype=REAL)
_neu1 = <REAL_t *>np.PyArray_DATA(neu1)

document_len = <int>min(MAX_DOCUMENT_LEN, len(word_vocabs))
j = 0
for i in range(document_len):
word = word_vocabs[i]
if word is None:
# shrink document to leave out word
document_len = document_len - 1
continue # leaving j unchanged
else:
indexes[j] = word.index
if hs:
codelens[j] = <int>len(word.code)
codes[j] = <np.uint8_t *>np.PyArray_DATA(word.code)
points[j] = <np.uint32_t *>np.PyArray_DATA(word.point)
result += 1
j = j + 1
vlookup = model.vocab
i = 0
for token in doc_words:
predict_word = vlookup[token] if token in vlookup else None
if predict_word is None: # shrink document to leave out word
continue # leaving i unchanged
if sample and predict_word.sample_int < random_int32(&next_random):
continue
indexes[i] = predict_word.index
if hs:
codelens[i] = <int>len(predict_word.code)
codes[i] = <np.uint8_t *>np.PyArray_DATA(predict_word.code)
points[i] = <np.uint32_t *>np.PyArray_DATA(predict_word.point)
result += 1
i += 1
if i == MAX_DOCUMENT_LEN:
break # TODO: log warning, tally overflow?
document_len = i

# single randint() call avoids a big thread-sync slowdown
for i, item in enumerate(model.random.randint(0, window, document_len)):
reduced_windows[i] = item
@@ -504,11 +512,12 @@ def train_document_dm(model, word_vocabs, doctag_indexes, alpha, work=None, neu1
return result


def train_document_dm_concat(model, word_vocabs, doctag_indexes, alpha, work=None, neu1=None,
def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None, neu1=None,
learn_doctags=True, learn_words=True, learn_hidden=True,
word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None):
cdef int hs = model.hs
cdef int negative = model.negative
cdef int sample = (model.sample != 0)
cdef int _learn_doctags = learn_doctags
cdef int _learn_words = learn_words
cdef int _learn_hidden = learn_hidden
@@ -549,7 +558,7 @@ def train_document_dm_concat(model, word_vocabs, doctag_indexes, alpha, work=Non

doctag_len = <int>min(MAX_DOCUMENT_LEN, len(doctag_indexes))
if doctag_len != expected_doctag_len:
return 0 # skip doc without expected nmber of tags
return 0 # skip doc without expected number of tags

# default vectors, locks from syn0/doctag_syn0
if word_vectors is None:
@@ -572,6 +581,7 @@ def train_document_dm_concat(model, word_vocabs, doctag_indexes, alpha, work=Non
syn1neg = <REAL_t *>(np.PyArray_DATA(model.syn1neg))
cum_table = <np.uint32_t *>(np.PyArray_DATA(model.cum_table))
cum_table_len = len(model.cum_table)
if negative or sample:
next_random = (2**24) * model.random.randint(0, 2**24) + model.random.randint(0, 2**24)

# convert Python structures to primitive types, so we can release the GIL
@@ -582,24 +592,24 @@ def train_document_dm_concat(model, word_vocabs, doctag_indexes, alpha, work=Non
neu1 = zeros(model.layer1_size, dtype=REAL)
_neu1 = <REAL_t *>np.PyArray_DATA(neu1)

document_len = <int>min(MAX_DOCUMENT_LEN, len(word_vocabs))
j = 0
for i in range(document_len):
word = word_vocabs[i]
if word is None:
# shrink document to leave out word
document_len = document_len - 1
continue # leaving j unchanged
else:
indexes[j] = word.index
if hs:
codelens[j] = <int>len(word.code)
codes[j] = <np.uint8_t *>np.PyArray_DATA(word.code)
points[j] = <np.uint32_t *>np.PyArray_DATA(word.point)
else:
codelens[j] = 1
result += 1
j = j + 1
vlookup = model.vocab
i = 0
for token in doc_words:
predict_word = vlookup[token] if token in vlookup else None
if predict_word is None: # shrink document to leave out word
continue # leaving i unchanged
if sample and predict_word.sample_int < random_int32(&next_random):
continue
indexes[i] = predict_word.index
if hs:
codelens[i] = <int>len(predict_word.code)
codes[i] = <np.uint8_t *>np.PyArray_DATA(predict_word.code)
points[i] = <np.uint32_t *>np.PyArray_DATA(predict_word.point)
result += 1
i += 1
if i == MAX_DOCUMENT_LEN:
break # TODO: log warning, tally overflow?
document_len = i

for i in range(doctag_len):
_doctag_indexes[i] = doctag_indexes[i]
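The change shared by train_document_dbow, train_document_dm and train_document_dm_concat in the hunks above: each now receives raw doc_words tokens, looks them up in model.vocab inside the Cython routine, and applies frequent-word downsampling there via the predict_word.sample_int < random_int32(&next_random) test. A rough Python-level equivalent of that per-word decision, for intuition only (the real code keeps its own next_random state and runs without the GIL):

    import random

    def keep_token(vocab, token, sample):
        word = vocab.get(token)
        if word is None:
            return None  # unknown token: dropped from the document
        if sample and word.sample_int < random.getrandbits(32):
            return None  # downsampled: skip this occurrence
        return word      # otherwise train on this word

Here sample_int acts as a per-word keep threshold scaled to the full 32-bit range, presumably precomputed during the new scale_vocab phase, so the most frequent words are skipped with the highest probability.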