[WIP GSOC 2018]: Multistream API, Part 1 #2048

Closed

Changes from all commits (51 commits)
2724812 - Add wikipedia parsing script (May 14, 2018)
f893487 - Track performance metrics in base_any2vec.py (May 14, 2018)
f03d9e6 - reset performance metrics in beginning of epoch (May 14, 2018)
55517fd - add tracking CPU load + benchmarking script (May 15, 2018)
8ae3248 - Some bug fixes (May 15, 2018)
29d2dba - prettify logging results in benchmark script (May 15, 2018)
5e47dfa - More prettifying in benchmark script (May 15, 2018)
389293f - add SUM cpu load (May 21, 2018)
b1765e7 - remove sent2vec from script (May 21, 2018)
4d50cff - First approach to multistream, only for word2vec right now (May 21, 2018)
48f498c - adapted benchmarking script to multistream (May 21, 2018)
a2a6e4f - fix (May 21, 2018)
b9668ee - fix bench script (May 22, 2018)
2765207 - Measure vocabulary building time (May 28, 2018)
d110f26 - fix (May 28, 2018)
c9e507f - multiprocessing multistream (May 30, 2018)
44bc8f8 - add w2v benchmarking script (May 30, 2018)
99d0fc0 - multiprocessinng for scan_vocab (May 30, 2018)
ffd5204 - fixes (May 30, 2018)
8a0badd - without progress_per at all (May 31, 2018)
f21b3a2 - Merge branch 'develop' into feature/gsoc-multistream-api-1 (Jun 15, 2018)
75cac9d - Merge branch 'feature/gsoc-multistream-api-1' of https://github.com/p… (Jun 15, 2018)
2472b2b - get rid of job_producer, make batches in _worker_loop (Jun 15, 2018)
4e0c103 - fix (Jun 15, 2018)
3dd8a64 - fix (Jun 15, 2018)
d389847 - make cythonlinesentence. not working, but at least compiles now (Jun 20, 2018)
4c1d3a6 - add operator>> (Jun 21, 2018)
36882a0 - change ifstream to ifstream* (Jun 21, 2018)
37b55f3 - fastlinesentence in c++ (Jun 21, 2018)
97f834d - almost working version; works on large files, but one bug is to be fixed (Jun 21, 2018)
944e3dc - remove batch iterator from pyx (Jun 21, 2018)
0081f01 - working code (Jun 22, 2018)
fe66246 - remove build_vocab changes (Jun 23, 2018)
491a087 - approaching to fully nogil cython _worker_loop (Jun 27, 2018)
15e07ae - wrapper fix (Jun 27, 2018)
5cad26b - one more fix (Jun 27, 2018)
495c4dc - more fixes (Jun 27, 2018)
8b29df8 - upd (Jun 27, 2018)
2119c3a - try to cythonize batch preparation (Jun 27, 2018)
3506ec9 - it compiles (Jun 27, 2018)
62f71ee - prepare batch inside nogil section in a while loop (Jun 28, 2018)
8924af5 - compiles (Jun 28, 2018)
53fedfa - some bugfixes (Jun 29, 2018)
c679bc6 - add cpu_distribution script (Jun 29, 2018)
921ff38 - accept CythonLineSentence into _worker_loop, not filename (Jul 4, 2018)
9e4ed0e - make CythonLineSentence iterable (Jul 4, 2018)
f9ea23b - fix (Jul 4, 2018)
cb8bb71 - python iterators without gil (Jul 5, 2018)
6162b50 - fix (Jul 5, 2018)
c14fca1 - fixes (Jul 5, 2018)
440c6df - last changes (Jul 9, 2018)
193 changes: 94 additions & 99 deletions gensim/models/base_any2vec.py

Large diffs are not rendered by default.
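The base_any2vec.py diff is collapsed here, but the commit log ("get rid of job_producer, make batches in _worker_loop") and the _worker_loop added to word2vec.py below indicate the shape of the change: the single job-producer thread goes away, and each worker consumes one input stream end to end, reporting progress through a shared queue. A minimal Python sketch of that coordination under those assumptions (train_multistream and the exact report layout are illustrative, not the PR's actual code):

import threading
from queue import Queue

def train_multistream(model, input_streams):
    # One worker per stream replaces the old single job producer;
    # each worker trains on its entire stream and reports once.
    progress_queue = Queue()
    workers = [
        threading.Thread(target=model._worker_loop, args=(stream, progress_queue))
        for stream in input_streams
    ]
    for t in workers:
        t.start()

    examples = tally = raw_tally = 0
    finished = 0
    while finished < len(workers):
        report = progress_queue.get()   # (examples, tally, raw_tally) or None
        if report is None:              # a worker puts None when its stream is done
            finished += 1
            continue
        examples += report[0]
        tally += report[1]
        raw_tally += report[2]

    for t in workers:
        t.join()
    return examples, tally, raw_tally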

25 changes: 25 additions & 0 deletions gensim/models/linesentence.cpp
@@ -0,0 +1,25 @@
#include <stdexcept>
#include "linesentence.h"


FastLineSentence::FastLineSentence() : is_eof_(false) { }
FastLineSentence::FastLineSentence(const std::string& filename)
    : filename_(filename), fs_(filename), is_eof_(false) { }

std::vector<std::string> FastLineSentence::ReadSentence() {
    if (fs_.eof()) {
        is_eof_ = true;
        return {};
    }
    std::string line, word;
    std::getline(fs_, line);
    std::vector<std::string> res;

    std::istringstream iss(line);
    while (iss >> word) {
        res.push_back(word);
    }

    return res;
}
20 changes: 20 additions & 0 deletions gensim/models/linesentence.h
@@ -0,0 +1,20 @@
#pragma once

#include <fstream>
#include <sstream>
#include <string>
#include <vector>


class FastLineSentence {
public:
    FastLineSentence();
    explicit FastLineSentence(const std::string& filename);

    std::vector<std::string> ReadSentence();
    inline bool IsEof() const { return is_eof_; }
    inline void Reset() { fs_.close(); fs_ = std::ifstream(filename_); is_eof_ = false; }

private:
    std::string filename_;
    std::ifstream fs_;
    bool is_eof_;
};
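ReadSentence() returns one whitespace-tokenized line per call and flips an EOF flag once the stream is exhausted; Reset() reopens the file so the same object can be replayed across epochs. A hedged Python mirror of that protocol (illustrative only; the PR itself wraps the C++ class from Cython as CythonLineSentence, per the commit log):

class PyFastLineSentence:
    """Python rendering of FastLineSentence's read/reset protocol (illustrative)."""

    def __init__(self, filename):
        self.filename = filename
        self.fs = open(filename, encoding='utf8')
        self.is_eof = False

    def read_sentence(self):
        line = self.fs.readline()
        if line == '':           # mirrors fs_.eof(): stream exhausted
            self.is_eof = True
            return []
        return line.split()      # mirrors `iss >> word` tokenization

    def reset(self):             # reopen for the next epoch
        self.fs.close()
        self.fs = open(self.filename, encoding='utf8')
        self.is_eof = False

A worker drives it with a read-until-EOF loop (while not ls.is_eof: ls.read_sentence()), calling reset() between epochs.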
50 changes: 30 additions & 20 deletions gensim/models/word2vec.py
@@ -110,13 +110,15 @@
 from copy import deepcopy
 from collections import defaultdict
 import threading
+import multiprocessing as mp
 import itertools
 import warnings

 from gensim.utils import keep_vocab_item, call_on_class_only
 from gensim.models.keyedvectors import Vocab, Word2VecKeyedVectors
 from gensim.models.base_any2vec import BaseWordEmbeddingsModel

+
 try:
     from queue import Queue, Empty
 except ImportError:
@@ -136,7 +138,7 @@
 logger = logging.getLogger(__name__)

 try:
-    from gensim.models.word2vec_inner import train_batch_sg, train_batch_cbow
+    from gensim.models.word2vec_inner import train_batch_sg, train_epoch_cbow, train_epoch_cbow_pystream
     from gensim.models.word2vec_inner import score_sentence_sg, score_sentence_cbow
     from gensim.models.word2vec_inner import FAST_VERSION, MAX_WORDS_IN_BATCH

@@ -423,7 +425,7 @@ class Word2Vec(BaseWordEmbeddingsModel):

     """

-    def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5,
+    def __init__(self, sentences=None, input_streams=None, size=100, alpha=0.025, window=5, min_count=5,
                  max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
                  sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0,
                  trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False, callbacks=(),
@@ -528,23 +530,29 @@ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5,
         self.trainables = Word2VecTrainables(seed=seed, vector_size=size, hashfxn=hashfxn)

         super(Word2Vec, self).__init__(
-            sentences=sentences, workers=workers, vector_size=size, epochs=iter, callbacks=callbacks,
-            batch_words=batch_words, trim_rule=trim_rule, sg=sg, alpha=alpha, window=window, seed=seed,
-            hs=hs, negative=negative, cbow_mean=cbow_mean, min_alpha=min_alpha, compute_loss=compute_loss,
+            sentences=sentences, input_streams=input_streams, workers=workers, vector_size=size, epochs=iter,
+            callbacks=callbacks, batch_words=batch_words, trim_rule=trim_rule, sg=sg, alpha=alpha, window=window,
+            seed=seed, hs=hs, negative=negative, cbow_mean=cbow_mean, min_alpha=min_alpha, compute_loss=compute_loss,
             fast_version=FAST_VERSION)

-    def _do_train_job(self, sentences, alpha, inits):
-        """
-        Train a single batch of sentences. Return 2-tuple `(effective word count after
-        ignoring unknown words and sentence length trimming, total word count)`.
-        """
-        work, neu1 = inits
-        tally = 0
-        if self.sg:
-            tally += train_batch_sg(self, sentences, alpha, work, self.compute_loss)
-        else:
-            tally += train_batch_cbow(self, sentences, alpha, work, neu1, self.compute_loss)
-        return tally, self._raw_word_count(sentences)
+    # def _do_train_job(self, sentences, alpha, inits):
+    #     """
+    #     Train a single batch of sentences. Return 2-tuple `(effective word count after
+    #     ignoring unknown words and sentence length trimming, total word count)`.
+    #     """
+    #     work, neu1 = inits
+    #     tally = train_batch_cbow(self, sentences, alpha, work, neu1, self.compute_loss)
+    #     return tally, self._raw_word_count(sentences)
+
+    def _worker_loop(self, input_stream, progress_queue):
+        work, neu1 = self._get_thread_working_mem()
+        jobs_processed = 0
+        alpha = self._get_job_params(0)
+
+        examples, tally, raw_tally = train_epoch_cbow_pystream(self, input_stream, alpha, work, neu1, False)
+        progress_queue.put((examples, tally, raw_tally))
+        progress_queue.put(None)
+        # logger.debug("worker exiting, processed %i jobs", jobs_processed)

     def _clear_post_train(self):
         """Resets certain properties of the model, post training."""
@@ -555,7 +563,7 @@ def _set_train_params(self, **kwargs):
         self.compute_loss = kwargs['compute_loss']
         self.running_training_loss = 0

-    def train(self, sentences, total_examples=None, total_words=None,
+    def train(self, input_streams, total_examples=None, total_words=None,
               epochs=None, start_alpha=None, end_alpha=None, word_count=0,
               queue_factor=2, report_delay=1.0, compute_loss=False, callbacks=()):
         """Update the model's neural weights from a sequence of sentences (can be a once-only generator stream).
@@ -613,7 +621,7 @@ def train(self, sentences, total_examples=None, total_words=None,
         """

         return super(Word2Vec, self).train(
-            sentences, total_examples=total_examples, total_words=total_words,
+            input_streams, total_examples=total_examples, total_words=total_words,
             epochs=epochs, start_alpha=start_alpha, end_alpha=end_alpha, word_count=word_count,
             queue_factor=queue_factor, report_delay=report_delay, compute_loss=compute_loss, callbacks=callbacks)

@@ -1156,8 +1164,10 @@ def __init__(self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=T
         self.raw_vocab = None
         self.max_final_vocab = max_final_vocab

-    def scan_vocab(self, sentences, progress_per=10000, trim_rule=None):
+    def scan_vocab(self, input_streams, progress_per=10000, trim_rule=None):
         """Do an initial scan of all words appearing in sentences."""
+        sentences = itertools.chain(*input_streams)
+
         logger.info("collecting all words and their counts")
         sentence_no = -1
         total_words = 0
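Taken together, the word2vec.py changes sketch the intended calling convention: instead of a single sentences iterable, the caller passes several independent streams, vocabulary scanning chains them, and training assigns one worker per stream. A hedged usage sketch (the shard file names and query word are hypothetical; LineSentence is gensim's existing helper):

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

# One independent stream per worker; each shard is a plain
# whitespace-delimited text file produced by splitting the corpus.
input_streams = [
    LineSentence('corpus_shard_0.txt'),   # hypothetical shard files
    LineSentence('corpus_shard_1.txt'),
    LineSentence('corpus_shard_2.txt'),
]

# scan_vocab chains the streams for vocabulary building (itertools.chain),
# and training runs one _worker_loop per stream.
model = Word2Vec(input_streams=input_streams, size=100, window=5, min_count=5, workers=3)

print(model.wv.most_similar('anarchism', topn=5))  # assumes the word survived min_count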