[WIP GSOC 2018]: Multistream API, Part 1 #2048

Closed

Changes from all commits (51 commits)
2724812 - Add wikipedia parsing script (May 14, 2018)
f893487 - Track performance metrics in base_any2vec.py (May 14, 2018)
f03d9e6 - reset performance metrics in beginning of epoch (May 14, 2018)
55517fd - add tracking CPU load + benchmarking script (May 15, 2018)
8ae3248 - Some bug fixes (May 15, 2018)
29d2dba - prettify logging results in benchmark script (May 15, 2018)
5e47dfa - More prettifying in benchmark script (May 15, 2018)
389293f - add SUM cpu load (May 21, 2018)
b1765e7 - remove sent2vec from script (May 21, 2018)
4d50cff - First approach to multistream, only for word2vec right now (May 21, 2018)
48f498c - adapted benchmarking script to multistream (May 21, 2018)
a2a6e4f - fix (May 21, 2018)
b9668ee - fix bench script (May 22, 2018)
2765207 - Measure vocabulary building time (May 28, 2018)
d110f26 - fix (May 28, 2018)
c9e507f - multiprocessing multistream (May 30, 2018)
44bc8f8 - add w2v benchmarking script (May 30, 2018)
99d0fc0 - multiprocessinng for scan_vocab (May 30, 2018)
ffd5204 - fixes (May 30, 2018)
8a0badd - without progress_per at all (May 31, 2018)
f21b3a2 - Merge branch 'develop' into feature/gsoc-multistream-api-1 (Jun 15, 2018)
75cac9d - Merge branch 'feature/gsoc-multistream-api-1' of https://github.com/p… (Jun 15, 2018)
2472b2b - get rid of job_producer, make batches in _worker_loop (Jun 15, 2018)
4e0c103 - fix (Jun 15, 2018)
3dd8a64 - fix (Jun 15, 2018)
d389847 - make cythonlinesentence. not working, but at least compiles now (Jun 20, 2018)
4c1d3a6 - add operator>> (Jun 21, 2018)
36882a0 - change ifstream to ifstream* (Jun 21, 2018)
37b55f3 - fastlinesentence in c++ (Jun 21, 2018)
97f834d - almost working version; works on large files, but one bug is to be fixed (Jun 21, 2018)
944e3dc - remove batch iterator from pyx (Jun 21, 2018)
0081f01 - working code (Jun 22, 2018)
fe66246 - remove build_vocab changes (Jun 23, 2018)
491a087 - approaching to fully nogil cython _worker_loop (Jun 27, 2018)
15e07ae - wrapper fix (Jun 27, 2018)
5cad26b - one more fix (Jun 27, 2018)
495c4dc - more fixes (Jun 27, 2018)
8b29df8 - upd (Jun 27, 2018)
2119c3a - try to cythonize batch preparation (Jun 27, 2018)
3506ec9 - it compiles (Jun 27, 2018)
62f71ee - prepare batch inside nogil section in a while loop (Jun 28, 2018)
8924af5 - compiles (Jun 28, 2018)
53fedfa - some bugfixes (Jun 29, 2018)
c679bc6 - add cpu_distribution script (Jun 29, 2018)
921ff38 - accept CythonLineSentence into _worker_loop, not filename (Jul 4, 2018)
9e4ed0e - make CythonLineSentence iterable (Jul 4, 2018)
f9ea23b - fix (Jul 4, 2018)
cb8bb71 - python iterators without gil (Jul 5, 2018)
6162b50 - fix (Jul 5, 2018)
c14fca1 - fixes (Jul 5, 2018)
440c6df - last changes (Jul 9, 2018)
193 changes: 94 additions & 99 deletions gensim/models/base_any2vec.py

Large diffs are not rendered by default.
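The base_any2vec.py diff is collapsed here, but the commit log ("get rid of job_producer, make batches in _worker_loop") and the _worker_loop added to word2vec.py below indicate the shape of the change: the single job-producer thread goes away, and each worker consumes one input stream end to end, reporting progress through a shared queue. A minimal Python sketch of that coordination under those assumptions (train_multistream and the exact report layout are illustrative, not the PR's actual code):

import threading
from queue import Queue

def train_multistream(model, input_streams):
    # One worker per stream replaces the old single job producer;
    # each worker trains on its entire stream and reports once.
    progress_queue = Queue()
    workers = [
        threading.Thread(target=model._worker_loop, args=(stream, progress_queue))
        for stream in input_streams
    ]
    for t in workers:
        t.start()

    examples = tally = raw_tally = 0
    finished = 0
    while finished < len(workers):
        report = progress_queue.get()   # (examples, tally, raw_tally) or None
        if report is None:              # a worker puts None when its stream is done
            finished += 1
            continue
        examples += report[0]
        tally += report[1]
        raw_tally += report[2]

    for t in workers:
        t.join()
    return examples, tally, raw_tally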

25 changes: 25 additions & 0 deletions gensim/models/linesentence.cpp
@@ -0,0 +1,25 @@
#include <stdexcept>
#include "linesentence.h"


FastLineSentence::FastLineSentence() : is_eof_(false) { }
FastLineSentence::FastLineSentence(const std::string& filename)
    : filename_(filename), fs_(filename), is_eof_(false) { }

std::vector<std::string> FastLineSentence::ReadSentence() {
    if (fs_.eof()) {
        is_eof_ = true;
        return {};
    }
    std::string line, word;
    std::getline(fs_, line);
    std::vector<std::string> res;

    std::istringstream iss(line);
    while (iss >> word) {
        res.push_back(word);
    }

    return res;
}
20 changes: 20 additions & 0 deletions gensim/models/linesentence.h
@@ -0,0 +1,20 @@
#pragma once

#include <fstream>
#include <sstream>
#include <string>
#include <vector>


class FastLineSentence {
public:
    FastLineSentence();
    explicit FastLineSentence(const std::string& filename);

    std::vector<std::string> ReadSentence();
    inline bool IsEof() const { return is_eof_; }
    inline void Reset() { fs_.close(); fs_ = std::ifstream(filename_); is_eof_ = false; }

private:
    std::string filename_;
    std::ifstream fs_;
    bool is_eof_;
};
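ReadSentence() returns one whitespace-tokenized line per call and flips an EOF flag once the stream is exhausted; Reset() reopens the file so the same object can be replayed across epochs. A hedged Python mirror of that protocol (illustrative only; the PR itself wraps the C++ class from Cython as CythonLineSentence, per the commit log):

class PyFastLineSentence:
    """Python rendering of FastLineSentence's read/reset protocol (illustrative)."""

    def __init__(self, filename):
        self.filename = filename
        self.fs = open(filename, encoding='utf8')
        self.is_eof = False

    def read_sentence(self):
        line = self.fs.readline()
        if line == '':           # mirrors fs_.eof(): stream exhausted
            self.is_eof = True
            return []
        return line.split()      # mirrors `iss >> word` tokenization

    def reset(self):             # reopen for the next epoch
        self.fs.close()
        self.fs = open(self.filename, encoding='utf8')
        self.is_eof = False

A worker drives it with a read-until-EOF loop (while not ls.is_eof: ls.read_sentence()), calling reset() between epochs.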
50 changes: 30 additions & 20 deletions gensim/models/word2vec.py
@@ -110,13 +110,15 @@
 from copy import deepcopy
 from collections import defaultdict
 import threading
+import multiprocessing as mp
 import itertools
 import warnings

 from gensim.utils import keep_vocab_item, call_on_class_only
 from gensim.models.keyedvectors import Vocab, Word2VecKeyedVectors
 from gensim.models.base_any2vec import BaseWordEmbeddingsModel

+
 try:
     from queue import Queue, Empty
 except ImportError:
@@ -136,7 +138,7 @@
 logger = logging.getLogger(__name__)

 try:
-    from gensim.models.word2vec_inner import train_batch_sg, train_batch_cbow
+    from gensim.models.word2vec_inner import train_batch_sg, train_epoch_cbow, train_epoch_cbow_pystream
     from gensim.models.word2vec_inner import score_sentence_sg, score_sentence_cbow
     from gensim.models.word2vec_inner import FAST_VERSION, MAX_WORDS_IN_BATCH

@@ -423,7 +425,7 @@ class Word2Vec(BaseWordEmbeddingsModel):

     """

-    def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5,
+    def __init__(self, sentences=None, input_streams=None, size=100, alpha=0.025, window=5, min_count=5,
                  max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
                  sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0,
                  trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False, callbacks=(),
@@ -528,23 +530,29 @@ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5,
         self.trainables = Word2VecTrainables(seed=seed, vector_size=size, hashfxn=hashfxn)

         super(Word2Vec, self).__init__(
-            sentences=sentences, workers=workers, vector_size=size, epochs=iter, callbacks=callbacks,
-            batch_words=batch_words, trim_rule=trim_rule, sg=sg, alpha=alpha, window=window, seed=seed,
-            hs=hs, negative=negative, cbow_mean=cbow_mean, min_alpha=min_alpha, compute_loss=compute_loss,
+            sentences=sentences, input_streams=input_streams, workers=workers, vector_size=size, epochs=iter,
+            callbacks=callbacks, batch_words=batch_words, trim_rule=trim_rule, sg=sg, alpha=alpha, window=window,
+            seed=seed, hs=hs, negative=negative, cbow_mean=cbow_mean, min_alpha=min_alpha, compute_loss=compute_loss,
             fast_version=FAST_VERSION)

-    def _do_train_job(self, sentences, alpha, inits):
-        """
-        Train a single batch of sentences. Return 2-tuple `(effective word count after
-        ignoring unknown words and sentence length trimming, total word count)`.
-        """
-        work, neu1 = inits
-        tally = 0
-        if self.sg:
-            tally += train_batch_sg(self, sentences, alpha, work, self.compute_loss)
-        else:
-            tally += train_batch_cbow(self, sentences, alpha, work, neu1, self.compute_loss)
-        return tally, self._raw_word_count(sentences)
+    # def _do_train_job(self, sentences, alpha, inits):
+    #     """
+    #     Train a single batch of sentences. Return 2-tuple `(effective word count after
+    #     ignoring unknown words and sentence length trimming, total word count)`.
+    #     """
+    #     work, neu1 = inits
+    #     tally = train_batch_cbow(self, sentences, alpha, work, neu1, self.compute_loss)
+    #     return tally, self._raw_word_count(sentences)
+
+    def _worker_loop(self, input_stream, progress_queue):
+        work, neu1 = self._get_thread_working_mem()
+        jobs_processed = 0
+        alpha = self._get_job_params(0)
+
+        examples, tally, raw_tally = train_epoch_cbow_pystream(self, input_stream, alpha, work, neu1, False)
+        progress_queue.put((examples, tally, raw_tally))
+        progress_queue.put(None)
+        # logger.debug("worker exiting, processed %i jobs", jobs_processed)

     def _clear_post_train(self):
         """Resets certain properties of the model, post training."""
@@ -555,7 +563,7 @@ def _set_train_params(self, **kwargs):
         self.compute_loss = kwargs['compute_loss']
         self.running_training_loss = 0

-    def train(self, sentences, total_examples=None, total_words=None,
+    def train(self, input_streams, total_examples=None, total_words=None,
               epochs=None, start_alpha=None, end_alpha=None, word_count=0,
               queue_factor=2, report_delay=1.0, compute_loss=False, callbacks=()):
         """Update the model's neural weights from a sequence of sentences (can be a once-only generator stream).
@@ -613,7 +621,7 @@ def train(self, sentences, total_examples=None, total_words=None,
         """

         return super(Word2Vec, self).train(
-            sentences, total_examples=total_examples, total_words=total_words,
+            input_streams, total_examples=total_examples, total_words=total_words,
             epochs=epochs, start_alpha=start_alpha, end_alpha=end_alpha, word_count=word_count,
             queue_factor=queue_factor, report_delay=report_delay, compute_loss=compute_loss, callbacks=callbacks)

@@ -1156,8 +1164,10 @@ def __init__(self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=T
         self.raw_vocab = None
         self.max_final_vocab = max_final_vocab

-    def scan_vocab(self, sentences, progress_per=10000, trim_rule=None):
+    def scan_vocab(self, input_streams, progress_per=10000, trim_rule=None):
         """Do an initial scan of all words appearing in sentences."""
+        sentences = itertools.chain(*input_streams)
+
         logger.info("collecting all words and their counts")
         sentence_no = -1
         total_words = 0
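Taken together, the word2vec.py changes sketch the intended calling convention: instead of a single sentences iterable, the caller passes several independent streams, vocabulary scanning chains them, and training assigns one worker per stream. A hedged usage sketch (the shard file names and query word are hypothetical; LineSentence is gensim's existing helper):

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

# One independent stream per worker; each shard is a plain
# whitespace-delimited text file produced by splitting the corpus.
input_streams = [
    LineSentence('corpus_shard_0.txt'),   # hypothetical shard files
    LineSentence('corpus_shard_1.txt'),
    LineSentence('corpus_shard_2.txt'),
]

# scan_vocab chains the streams for vocabulary building (itertools.chain),
# and training runs one _worker_loop per stream.
model = Word2Vec(input_streams=input_streams, size=100, window=5, min_count=5, workers=3)

print(model.wv.most_similar('anarchism', topn=5))  # assumes the word survived min_count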