From 27248122815141c43abb35aa080639b3bb2af876 Mon Sep 17 00:00:00 2001
From: persiyanov
Date: Mon, 14 May 2018 15:27:23 +0300
Subject: [PATCH 01/49] Add wikipedia parsing script

---
 wikipedia_to_txt.py | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 wikipedia_to_txt.py

diff --git a/wikipedia_to_txt.py b/wikipedia_to_txt.py
new file mode 100644
index 0000000000..d05e2cabf1
--- /dev/null
+++ b/wikipedia_to_txt.py
@@ -0,0 +1,29 @@
+from __future__ import unicode_literals
+from __future__ import print_function
+
+import logging
+import codecs
+
+import gensim.downloader as api
+from gensim.parsing.preprocessing import preprocess_string
+
+
+logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
+
+logger = logging.getLogger(__name__)
+
+
+if __name__ == '__main__':
+    data = api.load('wiki-english-20171001')
+
+    fout = codecs.open('gensim-enwiki.txt', 'w', encoding='utf8')
+    for i, article in enumerate(data):
+        for section in article['section_texts']:
+            fout.write(' '.join(preprocess_string(section)) + '\n')
+
+        if (i + 1) % 10000 == 0:
+            logger.info('Processed {} articles.'.format(i + 1))
+
+        i += 1
+
+    fout.close()
\ No newline at end of file

From f893487e2dab8c59387537bf1ae71f961ed40c23 Mon Sep 17 00:00:00 2001
From: persiyanov
Date: Mon, 14 May 2018 15:54:08 +0300
Subject: [PATCH 02/49] Track performance metrics in base_any2vec.py

---
 gensim/models/base_any2vec.py                 | 43 ++++++++++++++++++-
 .../scripts/wikipedia_to_txt.py               |  0
 2 files changed, 42 insertions(+), 1 deletion(-)
 rename wikipedia_to_txt.py => gensim/scripts/wikipedia_to_txt.py (100%)

diff --git a/gensim/models/base_any2vec.py b/gensim/models/base_any2vec.py
index e6a31263ec..d8f7898c12 100644
--- a/gensim/models/base_any2vec.py
+++ b/gensim/models/base_any2vec.py
@@ -26,6 +26,40 @@
 
 logger = logging.getLogger(__name__)
 
+PERFORMANCE_METRICS = {
+    'total_time': 0.0,  # Total training time for 1 epoch in seconds.
+    'queue_size': 0.0,  # Average job queue size.
+    'words_sec': 0.0  # Average speed in words per second.
+}
+
+_QUEUE_SIZE_SUM = 0.0
+_QUEUE_SIZE_TIMES = 0.0
+
+
+def _reset_performance_metrics():
+    global PERFORMANCE_METRICS, _QUEUE_SIZE_TIMES, _QUEUE_SIZE_SUM
+    PERFORMANCE_METRICS = {
+        'total_time': 0.0,
+        'queue_size': 0.0,
+        'words_sec': 0.0
+    }
+
+    _QUEUE_SIZE_SUM = 0.0
+    _QUEUE_SIZE_TIMES = 0.0
+
+
+def _update_queue_stats(qsize):
+    global _QUEUE_SIZE_SUM, _QUEUE_SIZE_TIMES
+    _QUEUE_SIZE_SUM += qsize
+    _QUEUE_SIZE_TIMES += 1
+
+
+def _finalize_performance_metrics(elapsed, words_sec):
+    PERFORMANCE_METRICS['total_time'] = elapsed
+    PERFORMANCE_METRICS['words_sec'] = words_sec
+    PERFORMANCE_METRICS['queue_size'] = _QUEUE_SIZE_SUM / _QUEUE_SIZE_TIMES
+
+
 class BaseAny2VecModel(utils.SaveLoad):
     """Base class for training, using and evaluating any2vec model.
     Contains implementation for multi-threaded training.
@@ -644,11 +678,16 @@ def _log_progress(self, job_queue, progress_queue, cur_epoch, example_count, tot raw_word_count, total_words, trained_word_count, elapsed): if total_examples: # examples-based progress % + + job_queue_size = utils.qsize(job_queue) + logger.info( "EPOCH %i - PROGRESS: at %.2f%% examples, %.0f words/s, in_qsize %i, out_qsize %i", cur_epoch + 1, 100.0 * example_count / total_examples, trained_word_count / elapsed, - utils.qsize(job_queue), utils.qsize(progress_queue) + job_queue_size, utils.qsize(progress_queue) ) + + _update_queue_stats(job_queue_size) else: # words-based progress % logger.info( @@ -664,6 +703,8 @@ def _log_epoch_end(self, cur_epoch, example_count, total_examples, raw_word_coun cur_epoch + 1, raw_word_count, trained_word_count, elapsed, trained_word_count / elapsed ) + _finalize_performance_metrics(elapsed, trained_word_count / elapsed) + # check that the input corpus hasn't changed during iteration if total_examples and total_examples != example_count: logger.warning( diff --git a/wikipedia_to_txt.py b/gensim/scripts/wikipedia_to_txt.py similarity index 100% rename from wikipedia_to_txt.py rename to gensim/scripts/wikipedia_to_txt.py From f03d9e66d25003b854a21e3cec89c5e9358f159d Mon Sep 17 00:00:00 2001 From: persiyanov Date: Mon, 14 May 2018 16:32:27 +0300 Subject: [PATCH 03/49] reset performance metrics in beginning of epoch --- gensim/models/base_any2vec.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/gensim/models/base_any2vec.py b/gensim/models/base_any2vec.py index d8f7898c12..df87dfa0b5 100644 --- a/gensim/models/base_any2vec.py +++ b/gensim/models/base_any2vec.py @@ -237,6 +237,8 @@ def _log_epoch_progress(self, progress_queue, job_queue, cur_epoch=0, total_exam def _train_epoch(self, data_iterable, cur_epoch=0, total_examples=None, total_words=None, queue_factor=2, report_delay=1.0): """Train one epoch.""" + _reset_performance_metrics() + job_queue = Queue(maxsize=queue_factor * self.workers) progress_queue = Queue(maxsize=(queue_factor + 1) * self.workers) From 55517fd5d1e471319fa7db28c6e6eddfcaf3b292 Mon Sep 17 00:00:00 2001 From: persiyanov Date: Tue, 15 May 2018 14:43:58 +0300 Subject: [PATCH 04/49] add tracking CPU load + benchmarking script --- gensim/models/base_any2vec.py | 43 +++++------ gensim/scripts/benchmark_any2vec_speed.py | 89 +++++++++++++++++++++++ 2 files changed, 111 insertions(+), 21 deletions(-) create mode 100644 gensim/scripts/benchmark_any2vec_speed.py diff --git a/gensim/models/base_any2vec.py b/gensim/models/base_any2vec.py index df87dfa0b5..a0e0f298a0 100644 --- a/gensim/models/base_any2vec.py +++ b/gensim/models/base_any2vec.py @@ -13,10 +13,11 @@ from six.moves import xrange from six import itervalues from gensim import matutils -from numpy import float32 as REAL, ones, random, dtype, zeros +from numpy import float32 as REAL, ones, random, dtype, zeros, array from types import GeneratorType from gensim.utils import deprecated import warnings +import psutil try: from queue import Queue @@ -26,38 +27,38 @@ logger = logging.getLogger(__name__) -PERFORMANCE_METRICS = { - 'total_time': 0.0, # Total training time for 1 epoch in seconds. - 'queue_size': 0.0, # Average job queue size. - 'words_sec': 0.0 # Average speed in words per second. 
-} - -_QUEUE_SIZE_SUM = 0.0 -_QUEUE_SIZE_TIMES = 0.0 +PERFORMANCE_METRICS = None +_NUM_STATS_UPDATES = None def _reset_performance_metrics(): - global PERFORMANCE_METRICS, _QUEUE_SIZE_TIMES, _QUEUE_SIZE_SUM + global PERFORMANCE_METRICS, _NUM_STATS_UPDATES, _QUEUE_SIZE_SUM PERFORMANCE_METRICS = { - 'total_time': 0.0, - 'queue_size': 0.0, - 'words_sec': 0.0 + 'total_time': 0.0, # Total training time for 1 epoch in seconds. + 'queue_size': 0.0, # Average job queue size. + 'words_sec': 0.0, # Average speed in words per second. + 'cpu_load': zeros(psutil.cpu_count(), dtype=REAL) } - _QUEUE_SIZE_SUM = 0.0 - _QUEUE_SIZE_TIMES = 0.0 + _NUM_STATS_UPDATES = 0.0 + + # Stub call in order to obtain correct results for subsequent calls. + psutil.cpu_percent(interval=None, percpu=True) + +def _update_queue_and_cpu_stats(qsize): + global _NUM_STATS_UPDATES + PERFORMANCE_METRICS['queue_size'] += qsize + PERFORMANCE_METRICS['cpu_load'] += array(psutil.cpu_percent(interval=None, percpu=True), dtype=REAL) -def _update_queue_stats(qsize): - global _QUEUE_SIZE_SUM, _QUEUE_SIZE_TIMES - _QUEUE_SIZE_SUM += qsize - _QUEUE_SIZE_TIMES += 1 + _NUM_STATS_UPDATES += 1 def _finalize_performance_metrics(elapsed, words_sec): PERFORMANCE_METRICS['total_time'] = elapsed PERFORMANCE_METRICS['words_sec'] = words_sec - PERFORMANCE_METRICS['queue_size'] = _QUEUE_SIZE_SUM / _QUEUE_SIZE_TIMES + PERFORMANCE_METRICS['queue_size'] = PERFORMANCE_METRICS['queue_size'] / _NUM_STATS_UPDATES + PERFORMANCE_METRICS['cpu_load'] = list(PERFORMANCE_METRICS['cpu_load'] / _NUM_STATS_UPDATES) class BaseAny2VecModel(utils.SaveLoad): @@ -689,7 +690,7 @@ def _log_progress(self, job_queue, progress_queue, cur_epoch, example_count, tot job_queue_size, utils.qsize(progress_queue) ) - _update_queue_stats(job_queue_size) + _update_queue_and_cpu_stats(job_queue_size) else: # words-based progress % logger.info( diff --git a/gensim/scripts/benchmark_any2vec_speed.py b/gensim/scripts/benchmark_any2vec_speed.py new file mode 100644 index 0000000000..d12efe6cdc --- /dev/null +++ b/gensim/scripts/benchmark_any2vec_speed.py @@ -0,0 +1,89 @@ +from __future__ import unicode_literals +from __future__ import print_function + +import logging +import argparse +import json +import copy + +from gensim.models.base_any2vec import PERFORMANCE_METRICS +from gensim.models.fasttext import FastText +from gensim.models.word2vec import Word2Vec +from gensim.models.doc2vec import Doc2Vec +from gensim.models.word2vec import LineSentence + + +logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) + +logger = logging.getLogger(__name__) + + +SUPPORTED_MODELS = { + 'fasttext': FastText, + 'word2vec': Word2Vec, + 'doc2vec': Doc2Vec +} + + +def benchmark_model(input, model, window, workers, vector_size): + if model == 'doc2vec': + kwargs = { + 'documents': LineSentence(input) + } + else: + kwargs = { + 'sentences': LineSentence(input) + } + + kwargs['size'] = vector_size + kwargs['window'] = window + kwargs['workers'] = workers + kwargs['iter'] = 1 + + logger.info('Creating model with kwargs={}'.format(kwargs)) + + # Training model for 1 epoch. 
+ SUPPORTED_MODELS[model](**kwargs) + + return copy.deepcopy(PERFORMANCE_METRICS) + + +def do_benchmarks(input, models_grid, vector_size, workers_grid, windows_grid, label): + report = {} + + for model in models_grid: + for window in windows_grid: + for workers in workers_grid: + model_str = '{}-{}-window{}-workers{}-size{}'.format(label, model, window, workers, vector_size) + + logger.info('Start benchmarking {}.'.format(model_str)) + results = benchmark_model(input, model, window, workers, vector_size) + + logger.info('--- MODEL {} RESULTS ---'.format(model_str).center(30)) + logger.info('* Total time: {} sec.'.format(results['total_time'])) + logger.info('* Avg queue size: {} elems.'.format(results['queue_size'])) + logger.info('* Processing speed: {} words/sec'.format(results['words_sec'])) + logger.info('* Avg CPU loads: {}'.format(results['cpu_load'])) + + report[model_str] = results + + fout_name = '{}-results.json'.format(label) + with open(fout_name, 'w') as fout: + json.dump(report, fout) + + logger.info('Saved metrics report to {}.'.format(fout_name)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='GSOC Multistream-API: evaluate performance ' + 'metrics for any2vec models') + parser.add_argument('--input', type=str) + parser.add_argument('--models-grid', nargs='+', type=str, default=SUPPORTED_MODELS.keys()) + parser.add_argument('--size', type=int, default=300) + parser.add_argument('--workers-grid', nargs='+', type=int, default=[1, 4, 8, 10, 12, 14]) + parser.add_argument('--windows-grid', nargs='+', type=int, default=[10]) + parser.add_argument('--label', type=str, default='untitled') + + args = parser.parse_args() + + do_benchmarks(args.input, args.models_grid, args.size, args.workers_grid, args.windows_grid, args.label) From 8ae3248b3550b30471f01b57447674f5d87160e7 Mon Sep 17 00:00:00 2001 From: persiyanov Date: Tue, 15 May 2018 15:00:10 +0300 Subject: [PATCH 05/49] Some bug fixes --- gensim/models/base_any2vec.py | 6 ++++-- gensim/scripts/benchmark_any2vec_speed.py | 16 ++++++++-------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/gensim/models/base_any2vec.py b/gensim/models/base_any2vec.py index a0e0f298a0..6623bb5115 100644 --- a/gensim/models/base_any2vec.py +++ b/gensim/models/base_any2vec.py @@ -57,8 +57,10 @@ def _update_queue_and_cpu_stats(qsize): def _finalize_performance_metrics(elapsed, words_sec): PERFORMANCE_METRICS['total_time'] = elapsed PERFORMANCE_METRICS['words_sec'] = words_sec - PERFORMANCE_METRICS['queue_size'] = PERFORMANCE_METRICS['queue_size'] / _NUM_STATS_UPDATES - PERFORMANCE_METRICS['cpu_load'] = list(PERFORMANCE_METRICS['cpu_load'] / _NUM_STATS_UPDATES) + + if _NUM_STATS_UPDATES: + PERFORMANCE_METRICS['queue_size'] = PERFORMANCE_METRICS['queue_size'] / _NUM_STATS_UPDATES + PERFORMANCE_METRICS['cpu_load'] = list(PERFORMANCE_METRICS['cpu_load'] / _NUM_STATS_UPDATES) class BaseAny2VecModel(utils.SaveLoad): diff --git a/gensim/scripts/benchmark_any2vec_speed.py b/gensim/scripts/benchmark_any2vec_speed.py index d12efe6cdc..01a4b98ee1 100644 --- a/gensim/scripts/benchmark_any2vec_speed.py +++ b/gensim/scripts/benchmark_any2vec_speed.py @@ -6,7 +6,7 @@ import json import copy -from gensim.models.base_any2vec import PERFORMANCE_METRICS +from gensim.models import base_any2vec from gensim.models.fasttext import FastText from gensim.models.word2vec import Word2Vec from gensim.models.doc2vec import Doc2Vec @@ -45,7 +45,7 @@ def benchmark_model(input, model, window, workers, vector_size): # Training model for 
1 epoch. SUPPORTED_MODELS[model](**kwargs) - return copy.deepcopy(PERFORMANCE_METRICS) + return copy.deepcopy(base_any2vec.PERFORMANCE_METRICS) def do_benchmarks(input, models_grid, vector_size, workers_grid, windows_grid, label): @@ -59,12 +59,12 @@ def do_benchmarks(input, models_grid, vector_size, workers_grid, windows_grid, l logger.info('Start benchmarking {}.'.format(model_str)) results = benchmark_model(input, model, window, workers, vector_size) - logger.info('--- MODEL {} RESULTS ---'.format(model_str).center(30)) - logger.info('* Total time: {} sec.'.format(results['total_time'])) - logger.info('* Avg queue size: {} elems.'.format(results['queue_size'])) - logger.info('* Processing speed: {} words/sec'.format(results['words_sec'])) - logger.info('* Avg CPU loads: {}'.format(results['cpu_load'])) - + logger.info('----- MODEL {} RESULTS -----'.format(model_str).center(50)) + logger.info('\t* Total time: {} sec.'.format(results['total_time'])) + logger.info('\t* Avg queue size: {} elems.'.format(results['queue_size'])) + logger.info('\t* Processing speed: {} words/sec'.format(results['words_sec'])) + logger.info('\t* Avg CPU loads: {}'.format(results['cpu_load'])) + report[model_str] = results fout_name = '{}-results.json'.format(label) From 29d2dbab9786c4d26fbbc2e7bc48f0e4dbd14430 Mon Sep 17 00:00:00 2001 From: persiyanov Date: Tue, 15 May 2018 15:58:57 +0300 Subject: [PATCH 06/49] prettify logging results in benchmark script --- gensim/scripts/benchmark_any2vec_speed.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/gensim/scripts/benchmark_any2vec_speed.py b/gensim/scripts/benchmark_any2vec_speed.py index 01a4b98ee1..1a038aed11 100644 --- a/gensim/scripts/benchmark_any2vec_speed.py +++ b/gensim/scripts/benchmark_any2vec_speed.py @@ -9,7 +9,7 @@ from gensim.models import base_any2vec from gensim.models.fasttext import FastText from gensim.models.word2vec import Word2Vec -from gensim.models.doc2vec import Doc2Vec +from gensim.models.doc2vec import Doc2Vec, TaggedLineDocument from gensim.models.word2vec import LineSentence @@ -25,10 +25,18 @@ } +def print_results(model_str, results): + logger.info('----- MODEL {} RESULTS -----'.format(model_str).center(50)) + logger.info('\t* Total time: {} sec.'.format(results['total_time'])) + logger.info('\t* Avg queue size: {} elems.'.format(results['queue_size'])) + logger.info('\t* Processing speed: {} words/sec'.format(results['words_sec'])) + logger.info('\t* Avg CPU loads: {}'.format(results['cpu_load'])) + + def benchmark_model(input, model, window, workers, vector_size): if model == 'doc2vec': kwargs = { - 'documents': LineSentence(input) + 'documents': TaggedLineDocument(input) } else: kwargs = { @@ -59,14 +67,14 @@ def do_benchmarks(input, models_grid, vector_size, workers_grid, windows_grid, l logger.info('Start benchmarking {}.'.format(model_str)) results = benchmark_model(input, model, window, workers, vector_size) - logger.info('----- MODEL {} RESULTS -----'.format(model_str).center(50)) - logger.info('\t* Total time: {} sec.'.format(results['total_time'])) - logger.info('\t* Avg queue size: {} elems.'.format(results['queue_size'])) - logger.info('\t* Processing speed: {} words/sec'.format(results['words_sec'])) - logger.info('\t* Avg CPU loads: {}'.format(results['cpu_load'])) + print_results(model_str, results) report[model_str] = results + logger.info('Benchmarking completed. 
Here are the results:') + for model_str, results in report.iteritems(): + print_results(model_str, results) + fout_name = '{}-results.json'.format(label) with open(fout_name, 'w') as fout: json.dump(report, fout) From 5e47dfa31b5d7c50aed252018c0753df9ae1ca58 Mon Sep 17 00:00:00 2001 From: persiyanov Date: Tue, 15 May 2018 16:44:45 +0300 Subject: [PATCH 07/49] More prettifying in benchmark script --- gensim/models/base_any2vec.py | 5 ++++- gensim/scripts/benchmark_any2vec_speed.py | 16 ++++++++-------- gensim/scripts/wikipedia_to_txt.py | 2 +- 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/gensim/models/base_any2vec.py b/gensim/models/base_any2vec.py index 6623bb5115..3f6f315976 100644 --- a/gensim/models/base_any2vec.py +++ b/gensim/models/base_any2vec.py @@ -60,7 +60,10 @@ def _finalize_performance_metrics(elapsed, words_sec): if _NUM_STATS_UPDATES: PERFORMANCE_METRICS['queue_size'] = PERFORMANCE_METRICS['queue_size'] / _NUM_STATS_UPDATES - PERFORMANCE_METRICS['cpu_load'] = list(PERFORMANCE_METRICS['cpu_load'] / _NUM_STATS_UPDATES) + PERFORMANCE_METRICS['cpu_load'] = PERFORMANCE_METRICS['cpu_load'] / _NUM_STATS_UPDATES + + # Explicitly format to string because floats are not serializable by json + PERFORMANCE_METRICS['cpu_load'] = ', '.join('{:.2f}'.format(x) for x in PERFORMANCE_METRICS['cpu_load']) class BaseAny2VecModel(utils.SaveLoad): diff --git a/gensim/scripts/benchmark_any2vec_speed.py b/gensim/scripts/benchmark_any2vec_speed.py index 1a038aed11..b831d5ed44 100644 --- a/gensim/scripts/benchmark_any2vec_speed.py +++ b/gensim/scripts/benchmark_any2vec_speed.py @@ -26,7 +26,7 @@ def print_results(model_str, results): - logger.info('----- MODEL {} RESULTS -----'.format(model_str).center(50)) + logger.info('----- MODEL "{}" RESULTS -----'.format(model_str).center(50)) logger.info('\t* Total time: {} sec.'.format(results['total_time'])) logger.info('\t* Avg queue size: {} elems.'.format(results['queue_size'])) logger.info('\t* Processing speed: {} words/sec'.format(results['words_sec'])) @@ -57,27 +57,27 @@ def benchmark_model(input, model, window, workers, vector_size): def do_benchmarks(input, models_grid, vector_size, workers_grid, windows_grid, label): - report = {} + full_report = {} for model in models_grid: for window in windows_grid: for workers in workers_grid: - model_str = '{}-{}-window{}-workers{}-size{}'.format(label, model, window, workers, vector_size) + model_str = '{}-{}-window-{:02d}-workers-{:02d}-size-{}'.format(label, model, window, workers, vector_size) logger.info('Start benchmarking {}.'.format(model_str)) results = benchmark_model(input, model, window, workers, vector_size) print_results(model_str, results) - report[model_str] = results + full_report[model_str] = results logger.info('Benchmarking completed. 
Here are the results:') - for model_str, results in report.iteritems(): - print_results(model_str, results) + for model_str in sorted(full_report.keys()): + print_results(model_str, full_report[model_str]) - fout_name = '{}-results.json'.format(label) + fout_name = '{}-report.json'.format(label) with open(fout_name, 'w') as fout: - json.dump(report, fout) + json.dump(full_report, fout) logger.info('Saved metrics report to {}.'.format(fout_name)) diff --git a/gensim/scripts/wikipedia_to_txt.py b/gensim/scripts/wikipedia_to_txt.py index d05e2cabf1..fda2853da4 100644 --- a/gensim/scripts/wikipedia_to_txt.py +++ b/gensim/scripts/wikipedia_to_txt.py @@ -26,4 +26,4 @@ i += 1 - fout.close() \ No newline at end of file + fout.close() From 389293f7b71eb738349826683a60418177561c8d Mon Sep 17 00:00:00 2001 From: persiyanov Date: Mon, 21 May 2018 11:35:48 +0300 Subject: [PATCH 08/49] add SUM cpu load --- gensim/models/base_any2vec.py | 4 +++- gensim/scripts/benchmark_any2vec_speed.py | 15 ++++++++++++--- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/gensim/models/base_any2vec.py b/gensim/models/base_any2vec.py index 3f6f315976..46c6c93f0d 100644 --- a/gensim/models/base_any2vec.py +++ b/gensim/models/base_any2vec.py @@ -37,7 +37,8 @@ def _reset_performance_metrics(): 'total_time': 0.0, # Total training time for 1 epoch in seconds. 'queue_size': 0.0, # Average job queue size. 'words_sec': 0.0, # Average speed in words per second. - 'cpu_load': zeros(psutil.cpu_count(), dtype=REAL) + 'cpu_load': zeros(psutil.cpu_count(), dtype=REAL), + 'cpu_load_sum': 0.0 } _NUM_STATS_UPDATES = 0.0 @@ -61,6 +62,7 @@ def _finalize_performance_metrics(elapsed, words_sec): if _NUM_STATS_UPDATES: PERFORMANCE_METRICS['queue_size'] = PERFORMANCE_METRICS['queue_size'] / _NUM_STATS_UPDATES PERFORMANCE_METRICS['cpu_load'] = PERFORMANCE_METRICS['cpu_load'] / _NUM_STATS_UPDATES + PERFORMANCE_METRICS['cpu_load_sum'] = PERFORMANCE_METRICS['cpu_load'].sum() # Explicitly format to string because floats are not serializable by json PERFORMANCE_METRICS['cpu_load'] = ', '.join('{:.2f}'.format(x) for x in PERFORMANCE_METRICS['cpu_load']) diff --git a/gensim/scripts/benchmark_any2vec_speed.py b/gensim/scripts/benchmark_any2vec_speed.py index b831d5ed44..7d6f38a03f 100644 --- a/gensim/scripts/benchmark_any2vec_speed.py +++ b/gensim/scripts/benchmark_any2vec_speed.py @@ -5,11 +5,13 @@ import argparse import json import copy +import yappi from gensim.models import base_any2vec from gensim.models.fasttext import FastText from gensim.models.word2vec import Word2Vec from gensim.models.doc2vec import Doc2Vec, TaggedLineDocument +from gensim.models.sent2vec import Sent2Vec from gensim.models.word2vec import LineSentence @@ -21,7 +23,8 @@ SUPPORTED_MODELS = { 'fasttext': FastText, 'word2vec': Word2Vec, - 'doc2vec': Doc2Vec + 'doc2vec': Doc2Vec, + 'sent2vec': Sent2Vec } @@ -31,6 +34,7 @@ def print_results(model_str, results): logger.info('\t* Avg queue size: {} elems.'.format(results['queue_size'])) logger.info('\t* Processing speed: {} words/sec'.format(results['words_sec'])) logger.info('\t* Avg CPU loads: {}'.format(results['cpu_load'])) + logger.info('\t* Sum CPU loads: {}'.format(results['cpu_load_sum'])) def benchmark_model(input, model, window, workers, vector_size): @@ -44,14 +48,19 @@ def benchmark_model(input, model, window, workers, vector_size): } kwargs['size'] = vector_size - kwargs['window'] = window + + if model != 'sent2vec': + kwargs['window'] = window kwargs['workers'] = workers - kwargs['iter'] = 1 + 
kwargs['epochs'] = 1 logger.info('Creating model with kwargs={}'.format(kwargs)) # Training model for 1 epoch. + yappi.start() SUPPORTED_MODELS[model](**kwargs) + yappi.get_func_stats().print_all() + yappi.get_thread_stats().print_all() return copy.deepcopy(base_any2vec.PERFORMANCE_METRICS) From b1765e79a80d818d1fb3d618983b7fef333410f1 Mon Sep 17 00:00:00 2001 From: persiyanov Date: Mon, 21 May 2018 13:24:50 +0300 Subject: [PATCH 09/49] remove sent2vec from script --- gensim/scripts/benchmark_any2vec_speed.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/gensim/scripts/benchmark_any2vec_speed.py b/gensim/scripts/benchmark_any2vec_speed.py index 7d6f38a03f..5c62a8fe7d 100644 --- a/gensim/scripts/benchmark_any2vec_speed.py +++ b/gensim/scripts/benchmark_any2vec_speed.py @@ -11,7 +11,6 @@ from gensim.models.fasttext import FastText from gensim.models.word2vec import Word2Vec from gensim.models.doc2vec import Doc2Vec, TaggedLineDocument -from gensim.models.sent2vec import Sent2Vec from gensim.models.word2vec import LineSentence @@ -24,7 +23,6 @@ 'fasttext': FastText, 'word2vec': Word2Vec, 'doc2vec': Doc2Vec, - 'sent2vec': Sent2Vec } From 4d50cffc43548d08d9dc2d73860ff9fbf2222191 Mon Sep 17 00:00:00 2001 From: persiyanov Date: Mon, 21 May 2018 16:49:41 +0300 Subject: [PATCH 10/49] First approach to multistream, only for word2vec right now --- gensim/models/base_any2vec.py | 36 +++++++++++++++-------- gensim/models/word2vec.py | 12 ++++---- gensim/scripts/benchmark_any2vec_speed.py | 1 + 3 files changed, 31 insertions(+), 18 deletions(-) diff --git a/gensim/models/base_any2vec.py b/gensim/models/base_any2vec.py index 46c6c93f0d..5e362fbdfa 100644 --- a/gensim/models/base_any2vec.py +++ b/gensim/models/base_any2vec.py @@ -16,6 +16,7 @@ from numpy import float32 as REAL, ones, random, dtype, zeros, array from types import GeneratorType from gensim.utils import deprecated +import itertools import warnings import psutil @@ -242,7 +243,7 @@ def _log_epoch_progress(self, progress_queue, job_queue, cur_epoch=0, total_exam self.total_train_time += elapsed return trained_word_count, raw_word_count, job_tally - def _train_epoch(self, data_iterable, cur_epoch=0, total_examples=None, + def _train_epoch(self, data_iterables, cur_epoch=0, total_examples=None, total_words=None, queue_factor=2, report_delay=1.0): """Train one epoch.""" _reset_performance_metrics() @@ -257,10 +258,13 @@ def _train_epoch(self, data_iterable, cur_epoch=0, total_examples=None, for _ in xrange(self.workers) ] - workers.append(threading.Thread( - target=self._job_producer, - args=(data_iterable, job_queue), - kwargs={'cur_epoch': cur_epoch, 'total_examples': total_examples, 'total_words': total_words})) + workers.extend( + threading.Thread( + target=self._job_producer, + args=(data_iterable, job_queue), + kwargs={'cur_epoch': cur_epoch, 'total_examples': total_examples, 'total_words': total_words} + ) for data_iterable in data_iterables + ) for thread in workers: thread.daemon = True # make interrupting the process with ctrl+c easier @@ -272,7 +276,7 @@ def _train_epoch(self, data_iterable, cur_epoch=0, total_examples=None, return trained_word_count, raw_word_count, job_tally - def train(self, data_iterable, epochs=None, total_examples=None, + def train(self, data_iterables, epochs=None, total_examples=None, total_words=None, queue_factor=2, report_delay=1.0, callbacks=(), **kwargs): """Handle multi-worker training.""" self._set_train_params(**kwargs) @@ -297,7 +301,7 @@ def train(self, data_iterable, epochs=None, 
total_examples=None,
+    def train(self, data_iterables, epochs=None, total_examples=None,
               total_words=None, queue_factor=2, report_delay=1.0, callbacks=(), **kwargs):
         """Handle multi-worker training."""
         self._set_train_params(**kwargs)
@@ -297,7 +301,7 @@ def train(self, data_iterable, epochs=None, total_examples=None,
                 callback.on_epoch_begin(self)
 
             trained_word_count_epoch, raw_word_count_epoch, job_tally_epoch = self._train_epoch(
-                data_iterable, cur_epoch=cur_epoch, total_examples=total_examples, total_words=total_words,
+                data_iterables, cur_epoch=cur_epoch, total_examples=total_examples, total_words=total_words,
                 queue_factor=queue_factor, report_delay=report_delay)
             trained_word_count += trained_word_count_epoch
             raw_word_count += raw_word_count_epoch
@@ -341,8 +345,8 @@ def _do_train_job(self, data_iterable, job_parameters, thread_private_mem):
     def _set_train_params(self, **kwargs):
         raise NotImplementedError()
 
-    def __init__(self, sentences=None, workers=3, vector_size=100, epochs=5, callbacks=(), batch_words=10000,
-                 trim_rule=None, sg=0, alpha=0.025, window=5, seed=1, hs=0, negative=5, cbow_mean=1,
+    def __init__(self, sentences=None, input_streams=None, workers=3, vector_size=100, epochs=5, callbacks=(),
+                 batch_words=10000, trim_rule=None, sg=0, alpha=0.025, window=5, seed=1, hs=0, negative=5, cbow_mean=1,
                  min_alpha=0.0001, compute_loss=False, fast_version=0, **kwargs):
         self.sg = int(sg)
         if vector_size % 4 != 0:
@@ -374,12 +378,20 @@ def __init__(self, sentences=None, workers=3, vector_size=100, epochs=5, callbac
         self.neg_labels[0] = 1.
 
         if sentences is not None:
+            assert input_streams is None, "You can't pass both `sentences` and `input_streams`."
             if isinstance(sentences, GeneratorType):
                 raise TypeError("You can't pass a generator as the sentences argument. Try an iterator.")
+
             self.build_vocab(sentences, trim_rule=trim_rule)
             self.train(
-                sentences, total_examples=self.corpus_count, epochs=self.epochs, start_alpha=self.alpha,
+                [sentences], total_examples=self.corpus_count, epochs=self.epochs, start_alpha=self.alpha,
                 end_alpha=self.min_alpha, compute_loss=compute_loss)
+        elif input_streams is not None:
+            assert len(input_streams) > 0
+
+            self.build_vocab(itertools.chain(input_streams), trim_rule=trim_rule)
+            self.train(input_streams, total_examples=self.corpus_count, epochs=self.epochs, start_alpha=self.alpha,
+                end_alpha=self.min_alpha, compute_loss=compute_loss)
         else:
             if trim_rule is not None:
                 logger.warning(
@@ -599,7 +611,7 @@ def estimate_memory(self, vocab_size=None, report=None):
         )
         return report
 
-    def train(self, sentences, total_examples=None, total_words=None,
+    def train(self, input_streams, total_examples=None, total_words=None,
               epochs=None, start_alpha=None, end_alpha=None, word_count=0,
               queue_factor=2, report_delay=1.0, compute_loss=False, callbacks=()):
 
@@ -608,7 +620,7 @@ def train(self, sentences, total_examples=None, total_words=None,
         self.compute_loss = compute_loss
         self.running_training_loss = 0.0
         return super(BaseWordEmbeddingsModel, self).train(
-            sentences, total_examples=total_examples, total_words=total_words,
+            input_streams, total_examples=total_examples, total_words=total_words,
             epochs=epochs, start_alpha=start_alpha, end_alpha=end_alpha, word_count=word_count,
             queue_factor=queue_factor, report_delay=report_delay, compute_loss=compute_loss,
             callbacks=callbacks)
diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 9539aa8d2c..a1fa610e14 100755
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -423,7 +423,7 @@ class Word2Vec(BaseWordEmbeddingsModel):
 
     """
 
-    def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5,
+    def __init__(self, sentences=None, input_streams=None, size=100, alpha=0.025, window=5, min_count=5,
                  max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
                  sg=0,
hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False, callbacks=(), @@ -528,9 +528,9 @@ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, self.trainables = Word2VecTrainables(seed=seed, vector_size=size, hashfxn=hashfxn) super(Word2Vec, self).__init__( - sentences=sentences, workers=workers, vector_size=size, epochs=iter, callbacks=callbacks, - batch_words=batch_words, trim_rule=trim_rule, sg=sg, alpha=alpha, window=window, seed=seed, - hs=hs, negative=negative, cbow_mean=cbow_mean, min_alpha=min_alpha, compute_loss=compute_loss, + sentences=sentences, input_streams=input_streams, workers=workers, vector_size=size, epochs=iter, + callbacks=callbacks, batch_words=batch_words, trim_rule=trim_rule, sg=sg, alpha=alpha, window=window, + seed=seed, hs=hs, negative=negative, cbow_mean=cbow_mean, min_alpha=min_alpha, compute_loss=compute_loss, fast_version=FAST_VERSION) def _do_train_job(self, sentences, alpha, inits): @@ -555,7 +555,7 @@ def _set_train_params(self, **kwargs): self.compute_loss = kwargs['compute_loss'] self.running_training_loss = 0 - def train(self, sentences, total_examples=None, total_words=None, + def train(self, input_streams, total_examples=None, total_words=None, epochs=None, start_alpha=None, end_alpha=None, word_count=0, queue_factor=2, report_delay=1.0, compute_loss=False, callbacks=()): """Update the model's neural weights from a sequence of sentences (can be a once-only generator stream). @@ -613,7 +613,7 @@ def train(self, sentences, total_examples=None, total_words=None, """ return super(Word2Vec, self).train( - sentences, total_examples=total_examples, total_words=total_words, + input_streams, total_examples=total_examples, total_words=total_words, epochs=epochs, start_alpha=start_alpha, end_alpha=end_alpha, word_count=word_count, queue_factor=queue_factor, report_delay=report_delay, compute_loss=compute_loss, callbacks=callbacks) diff --git a/gensim/scripts/benchmark_any2vec_speed.py b/gensim/scripts/benchmark_any2vec_speed.py index 5c62a8fe7d..af25f2bc40 100644 --- a/gensim/scripts/benchmark_any2vec_speed.py +++ b/gensim/scripts/benchmark_any2vec_speed.py @@ -49,6 +49,7 @@ def benchmark_model(input, model, window, workers, vector_size): if model != 'sent2vec': kwargs['window'] = window + kwargs['workers'] = workers kwargs['epochs'] = 1 From 48f498cedad51886f737c3173e5a58b00d95dfe2 Mon Sep 17 00:00:00 2001 From: persiyanov Date: Mon, 21 May 2018 17:23:44 +0300 Subject: [PATCH 11/49] adapted benchmarking script to multistream --- gensim/models/base_any2vec.py | 2 +- gensim/scripts/benchmark_any2vec_speed.py | 28 +++++++++++++++-------- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/gensim/models/base_any2vec.py b/gensim/models/base_any2vec.py index 5e362fbdfa..0236c3a6e5 100644 --- a/gensim/models/base_any2vec.py +++ b/gensim/models/base_any2vec.py @@ -389,7 +389,7 @@ def __init__(self, sentences=None, input_streams=None, workers=3, vector_size=10 elif input_streams is not None: assert len(input_streams) > 0 - self.build_vocab(itertools.chain(input_streams), trim_rule=trim_rule) + self.build_vocab(itertools.chain(*input_streams), trim_rule=trim_rule) self.train(input_streams, total_examples=self.corpus_count, epochs=self.epochs, start_alpha=self.alpha, end_alpha=self.min_alpha, compute_loss=compute_loss) else: diff --git a/gensim/scripts/benchmark_any2vec_speed.py b/gensim/scripts/benchmark_any2vec_speed.py index 
af25f2bc40..c7eea52dba 100644 --- a/gensim/scripts/benchmark_any2vec_speed.py +++ b/gensim/scripts/benchmark_any2vec_speed.py @@ -6,6 +6,8 @@ import json import copy import yappi +import os +import glob from gensim.models import base_any2vec from gensim.models.fasttext import FastText @@ -32,17 +34,17 @@ def print_results(model_str, results): logger.info('\t* Avg queue size: {} elems.'.format(results['queue_size'])) logger.info('\t* Processing speed: {} words/sec'.format(results['words_sec'])) logger.info('\t* Avg CPU loads: {}'.format(results['cpu_load'])) - logger.info('\t* Sum CPU loads: {}'.format(results['cpu_load_sum'])) + logger.info('\t* Sum CPU load: {}'.format(results['cpu_load_sum'])) -def benchmark_model(input, model, window, workers, vector_size): +def benchmark_model(input_streams, model, window, workers, vector_size): if model == 'doc2vec': kwargs = { - 'documents': TaggedLineDocument(input) + 'input_streams': [TaggedLineDocument(inp) for inp in input_streams] } else: kwargs = { - 'sentences': LineSentence(input) + 'input_streams': [LineSentence(inp) for inp in input_streams] } kwargs['size'] = vector_size @@ -51,7 +53,7 @@ def benchmark_model(input, model, window, workers, vector_size): kwargs['window'] = window kwargs['workers'] = workers - kwargs['epochs'] = 1 + kwargs['iter'] = 1 logger.info('Creating model with kwargs={}'.format(kwargs)) @@ -64,7 +66,7 @@ def benchmark_model(input, model, window, workers, vector_size): return copy.deepcopy(base_any2vec.PERFORMANCE_METRICS) -def do_benchmarks(input, models_grid, vector_size, workers_grid, windows_grid, label): +def do_benchmarks(input_streams, models_grid, vector_size, workers_grid, windows_grid, label): full_report = {} for model in models_grid: @@ -73,7 +75,7 @@ def do_benchmarks(input, models_grid, vector_size, workers_grid, windows_grid, l model_str = '{}-{}-window-{:02d}-workers-{:02d}-size-{}'.format(label, model, window, workers, vector_size) logger.info('Start benchmarking {}.'.format(model_str)) - results = benchmark_model(input, model, window, workers, vector_size) + results = benchmark_model(input_streams, model, window, workers, vector_size) print_results(model_str, results) @@ -93,7 +95,8 @@ def do_benchmarks(input, models_grid, vector_size, workers_grid, windows_grid, l if __name__ == '__main__': parser = argparse.ArgumentParser(description='GSOC Multistream-API: evaluate performance ' 'metrics for any2vec models') - parser.add_argument('--input', type=str) + parser.add_argument('--input', type=str, help='Input file or regexp if `multistream` mode is on.') + parser.add_argument('--multistream', action='store_true') parser.add_argument('--models-grid', nargs='+', type=str, default=SUPPORTED_MODELS.keys()) parser.add_argument('--size', type=int, default=300) parser.add_argument('--workers-grid', nargs='+', type=int, default=[1, 4, 8, 10, 12, 14]) @@ -102,4 +105,11 @@ def do_benchmarks(input, models_grid, vector_size, workers_grid, windows_grid, l args = parser.parse_args() - do_benchmarks(args.input, args.models_grid, args.size, args.workers_grid, args.windows_grid, args.label) + input_ = os.path.expanduser(args.input) + if args.multistream: + input_streams = glob.glob(input_) + logger.info('Glob found {} input streams. 
List: {}'.format(len(input_streams), input_streams)) + else: + input_streams = [input_] + + do_benchmarks(input_streams, args.models_grid, args.size, args.workers_grid, args.windows_grid, args.label) From a2a6e4f349a320041c8620ab64b6d55a79c4baa4 Mon Sep 17 00:00:00 2001 From: persiyanov Date: Mon, 21 May 2018 19:14:38 +0300 Subject: [PATCH 12/49] fix --- gensim/models/base_any2vec.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gensim/models/base_any2vec.py b/gensim/models/base_any2vec.py index 0236c3a6e5..022164aaed 100644 --- a/gensim/models/base_any2vec.py +++ b/gensim/models/base_any2vec.py @@ -67,6 +67,7 @@ def _finalize_performance_metrics(elapsed, words_sec): # Explicitly format to string because floats are not serializable by json PERFORMANCE_METRICS['cpu_load'] = ', '.join('{:.2f}'.format(x) for x in PERFORMANCE_METRICS['cpu_load']) + PERFORMANCE_METRICS['cpu_load_sum'] = str(PERFORMANCE_METRICS['cpu_load_sum']) class BaseAny2VecModel(utils.SaveLoad): From b9668ee99233d9d6fd095d42055519318c4f876b Mon Sep 17 00:00:00 2001 From: persiyanov Date: Tue, 22 May 2018 23:13:28 +0300 Subject: [PATCH 13/49] fix bench script --- gensim/scripts/benchmark_any2vec_speed.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/gensim/scripts/benchmark_any2vec_speed.py b/gensim/scripts/benchmark_any2vec_speed.py index c7eea52dba..a0424a0a11 100644 --- a/gensim/scripts/benchmark_any2vec_speed.py +++ b/gensim/scripts/benchmark_any2vec_speed.py @@ -5,7 +5,7 @@ import argparse import json import copy -import yappi +# import yappi import os import glob @@ -58,10 +58,10 @@ def benchmark_model(input_streams, model, window, workers, vector_size): logger.info('Creating model with kwargs={}'.format(kwargs)) # Training model for 1 epoch. - yappi.start() + # yappi.start() SUPPORTED_MODELS[model](**kwargs) - yappi.get_func_stats().print_all() - yappi.get_thread_stats().print_all() + # yappi.get_func_stats().print_all() + # yappi.get_thread_stats().print_all() return copy.deepcopy(base_any2vec.PERFORMANCE_METRICS) @@ -96,7 +96,6 @@ def do_benchmarks(input_streams, models_grid, vector_size, workers_grid, windows parser = argparse.ArgumentParser(description='GSOC Multistream-API: evaluate performance ' 'metrics for any2vec models') parser.add_argument('--input', type=str, help='Input file or regexp if `multistream` mode is on.') - parser.add_argument('--multistream', action='store_true') parser.add_argument('--models-grid', nargs='+', type=str, default=SUPPORTED_MODELS.keys()) parser.add_argument('--size', type=int, default=300) parser.add_argument('--workers-grid', nargs='+', type=int, default=[1, 4, 8, 10, 12, 14]) @@ -106,10 +105,7 @@ def do_benchmarks(input_streams, models_grid, vector_size, workers_grid, windows args = parser.parse_args() input_ = os.path.expanduser(args.input) - if args.multistream: - input_streams = glob.glob(input_) - logger.info('Glob found {} input streams. List: {}'.format(len(input_streams), input_streams)) - else: - input_streams = [input_] + input_streams = glob.glob(input_) + logger.info('Glob found {} input streams. 
List: {}'.format(len(input_streams), input_streams)) do_benchmarks(input_streams, args.models_grid, args.size, args.workers_grid, args.windows_grid, args.label) From 2765207ec7c4427fddd41b3eeb74d5bd00797f98 Mon Sep 17 00:00:00 2001 From: persiyanov Date: Mon, 28 May 2018 13:11:11 +0300 Subject: [PATCH 14/49] Measure vocabulary building time --- gensim/models/base_any2vec.py | 17 +++++++++++++---- gensim/scripts/benchmark_any2vec_speed.py | 3 ++- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/gensim/models/base_any2vec.py b/gensim/models/base_any2vec.py index 022164aaed..d0b4e134c6 100644 --- a/gensim/models/base_any2vec.py +++ b/gensim/models/base_any2vec.py @@ -19,6 +19,7 @@ import itertools import warnings import psutil +import time try: from queue import Queue @@ -39,7 +40,8 @@ def _reset_performance_metrics(): 'queue_size': 0.0, # Average job queue size. 'words_sec': 0.0, # Average speed in words per second. 'cpu_load': zeros(psutil.cpu_count(), dtype=REAL), - 'cpu_load_sum': 0.0 + 'cpu_load_sum': 0.0, + 'vocab_time': 0.0 } _NUM_STATS_UPDATES = 0.0 @@ -56,6 +58,10 @@ def _update_queue_and_cpu_stats(qsize): _NUM_STATS_UPDATES += 1 +def _set_vocab_time(elapsed): + PERFORMANCE_METRICS['vocab_time'] = elapsed + + def _finalize_performance_metrics(elapsed, words_sec): PERFORMANCE_METRICS['total_time'] = elapsed PERFORMANCE_METRICS['words_sec'] = words_sec @@ -192,9 +198,6 @@ def _job_producer(self, data_iterator, job_queue, cur_epoch=0, total_examples=No "be sure to provide a corpus that offers restartable iteration = an iterable)." ) - # give the workers heads up that they can finish -- no more work! - for _ in xrange(self.workers): - job_queue.put(None) logger.debug("job loop exiting, total %i jobs", job_no) def _log_progress(self, job_queue, progress_queue, cur_epoch, example_count, total_examples, @@ -533,6 +536,8 @@ def build_vocab(self, sentences, update=False, progress_per=10000, keep_raw_voca Indicates how many words to process before showing/updating the progress. """ + start_time = time.time() + total_words, corpus_count = self.vocabulary.scan_vocab( sentences, progress_per=progress_per, trim_rule=trim_rule) self.corpus_count = corpus_count @@ -542,6 +547,10 @@ def build_vocab(self, sentences, update=False, progress_per=10000, keep_raw_voca report_values['memory'] = self.estimate_memory(vocab_size=report_values['num_retained_words']) self.trainables.prepare_weights(self.hs, self.negative, self.wv, update=update, vocabulary=self.vocabulary) + end_time = time.time() + _set_vocab_time(end_time - start_time) + logger.info('Vocabulary was built in {:.2f} seconds.'.format(end_time - start_time)) + def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=None, trim_rule=None, update=False): """Build vocabulary from a dictionary of word frequencies. Build model vocabulary from a passed dictionary that contains (word,word count). 
diff --git a/gensim/scripts/benchmark_any2vec_speed.py b/gensim/scripts/benchmark_any2vec_speed.py index a0424a0a11..bda130cd8c 100644 --- a/gensim/scripts/benchmark_any2vec_speed.py +++ b/gensim/scripts/benchmark_any2vec_speed.py @@ -30,7 +30,8 @@ def print_results(model_str, results): logger.info('----- MODEL "{}" RESULTS -----'.format(model_str).center(50)) - logger.info('\t* Total time: {} sec.'.format(results['total_time'])) + logger.info('\t* Vocab time: {} sec.'.format(results['vocab_time'])) + logger.info('\t* Total epoch time: {} sec.'.format(results['total_time'])) logger.info('\t* Avg queue size: {} elems.'.format(results['queue_size'])) logger.info('\t* Processing speed: {} words/sec'.format(results['words_sec'])) logger.info('\t* Avg CPU loads: {}'.format(results['cpu_load'])) From d110f2683d9f156840e6d8fc9ed9204c46e4a7a0 Mon Sep 17 00:00:00 2001 From: persiyanov Date: Mon, 28 May 2018 16:49:11 +0300 Subject: [PATCH 15/49] fix --- gensim/models/base_any2vec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/base_any2vec.py b/gensim/models/base_any2vec.py index d0b4e134c6..4ab2cc0ed8 100644 --- a/gensim/models/base_any2vec.py +++ b/gensim/models/base_any2vec.py @@ -91,6 +91,7 @@ def __init__(self, workers=3, vector_size=100, epochs=5, callbacks=(), batch_wor - self.trainables (instance of concrete implementation of `BaseTrainables` abstract class) """ + _reset_performance_metrics() self.vector_size = int(vector_size) self.workers = int(workers) self.epochs = epochs @@ -250,7 +251,6 @@ def _log_epoch_progress(self, progress_queue, job_queue, cur_epoch=0, total_exam def _train_epoch(self, data_iterables, cur_epoch=0, total_examples=None, total_words=None, queue_factor=2, report_delay=1.0): """Train one epoch.""" - _reset_performance_metrics() job_queue = Queue(maxsize=queue_factor * self.workers) progress_queue = Queue(maxsize=(queue_factor + 1) * self.workers) From c9e507f34b9f7305b387dccf78443651e4281224 Mon Sep 17 00:00:00 2001 From: persiyanov Date: Wed, 30 May 2018 14:27:15 +0300 Subject: [PATCH 16/49] multiprocessing multistream --- gensim/models/base_any2vec.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/gensim/models/base_any2vec.py b/gensim/models/base_any2vec.py index 4ab2cc0ed8..31fbefe0ce 100644 --- a/gensim/models/base_any2vec.py +++ b/gensim/models/base_any2vec.py @@ -10,6 +10,7 @@ import logging from timeit import default_timer import threading +import multiprocessing as mp from six.moves import xrange from six import itervalues from gensim import matutils @@ -199,6 +200,9 @@ def _job_producer(self, data_iterator, job_queue, cur_epoch=0, total_examples=No "be sure to provide a corpus that offers restartable iteration = an iterable)." ) + # give the workers heads up that they can finish -- no more work! 
+ for _ in xrange(self.workers): + job_queue.put(None) logger.debug("job loop exiting, total %i jobs", job_no) def _log_progress(self, job_queue, progress_queue, cur_epoch, example_count, total_examples, @@ -252,7 +256,7 @@ def _train_epoch(self, data_iterables, cur_epoch=0, total_examples=None, total_words=None, queue_factor=2, report_delay=1.0): """Train one epoch.""" - job_queue = Queue(maxsize=queue_factor * self.workers) + job_queue = mp.Queue(maxsize=queue_factor * self.workers) progress_queue = Queue(maxsize=(queue_factor + 1) * self.workers) workers = [ @@ -262,13 +266,24 @@ def _train_epoch(self, data_iterables, cur_epoch=0, total_examples=None, for _ in xrange(self.workers) ] - workers.extend( - threading.Thread( + processes = [ + mp.Process( target=self._job_producer, args=(data_iterable, job_queue), kwargs={'cur_epoch': cur_epoch, 'total_examples': total_examples, 'total_words': total_words} ) for data_iterable in data_iterables - ) + ] + # workers.extend( + # threading.Thread( + # target=self._job_producer, + # args=(data_iterable, job_queue), + # kwargs={'cur_epoch': cur_epoch, 'total_examples': total_examples, 'total_words': total_words} + # ) for data_iterable in data_iterables + # ) + + for process in processes: + process.daemon = True + process.start() for thread in workers: thread.daemon = True # make interrupting the process with ctrl+c easier From 44bc8f8da7cc6172469a4d1000c898c41394ff5d Mon Sep 17 00:00:00 2001 From: persiyanov Date: Wed, 30 May 2018 21:30:11 +0300 Subject: [PATCH 17/49] add w2v benchmarking script --- gensim/scripts/benchmark_w2v_vocab.py | 38 +++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 gensim/scripts/benchmark_w2v_vocab.py diff --git a/gensim/scripts/benchmark_w2v_vocab.py b/gensim/scripts/benchmark_w2v_vocab.py new file mode 100644 index 0000000000..27c9e07b22 --- /dev/null +++ b/gensim/scripts/benchmark_w2v_vocab.py @@ -0,0 +1,38 @@ +from __future__ import unicode_literals +from __future__ import print_function + +import logging +import argparse +# import yappi +import os +import glob + +from gensim.models import base_any2vec +from gensim.models.word2vec import Word2Vec, LineSentence + + +logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) + +logger = logging.getLogger(__name__) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='GSOC Multistream-API: evaluate vocab performance ' + 'for word2vec') + parser.add_argument('--input', type=str, help='Input file or regexp for multistream.') + parser.add_argument('--size', type=int, default=300) + parser.add_argument('--workers-grid', nargs='+', type=int, default=[1, 2, 3, 4, 5, 8, 10, 12, 14]) + parser.add_argument('--label', type=str, default='untitled') + + args = parser.parse_args() + + input_ = os.path.expanduser(args.input) + input_streams = glob.glob(input_) + logger.info('Glob found {} input streams. 
List: {}'.format(len(input_streams), input_streams))
+
+    input_streams = [LineSentence(_) for _ in input_streams]
+    for workers in args.workers_grid:
+        model = Word2Vec()
+        model.build_vocab(input_streams, workers=workers)
+        logger.info('Workers = {}\tVocab time = {:.2f} secs'.format(workers,
+                    base_any2vec.PERFORMANCE_METRICS['vocab_time']))

From 99d0fc0cccade7b29845143996adc2858e8b7cc3 Mon Sep 17 00:00:00 2001
From: persiyanov
Date: Wed, 30 May 2018 21:44:49 +0300
Subject: [PATCH 18/49] multiprocessing for scan_vocab

---
 gensim/models/base_any2vec.py |  6 ++--
 gensim/models/word2vec.py     | 64 ++++++++++++++++++++++-----------
 gensim/utils.py               | 25 ++++++++++++++
 3 files changed, 73 insertions(+), 22 deletions(-)

diff --git a/gensim/models/base_any2vec.py b/gensim/models/base_any2vec.py
index 31fbefe0ce..2a38b5e439 100644
--- a/gensim/models/base_any2vec.py
+++ b/gensim/models/base_any2vec.py
@@ -408,7 +408,7 @@ def __init__(self, sentences=None, input_streams=None, workers=3, vector_size=10
         elif input_streams is not None:
             assert len(input_streams) > 0
 
-            self.build_vocab(itertools.chain(*input_streams), trim_rule=trim_rule)
+            self.build_vocab(input_streams, trim_rule=trim_rule)
             self.train(input_streams, total_examples=self.corpus_count, epochs=self.epochs, start_alpha=self.alpha,
                 end_alpha=self.min_alpha, compute_loss=compute_loss)
         else:
diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index a1fa610e14..a6ff16a9e6 100755
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -110,6 +110,7 @@
 from copy import deepcopy
 from collections import defaultdict
 import threading
+import multiprocessing as mp
 import itertools
 import warnings
 
@@ -1156,42 +1157,67 @@ def __init__(self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=T
         self.raw_vocab = None
         self.max_final_vocab = max_final_vocab
 
-    def scan_vocab(self, sentences, progress_per=10000, trim_rule=None):
-        """Do an initial scan of all words appearing in sentences."""
-        logger.info("collecting all words and their counts")
-        sentence_no = -1
-        total_words = 0
+    def _scan_vocab_worker(self, stream, progress_queue, trim_rule=None):
+        """Do an initial scan of all words appearing in stream."""
         min_reduce = 1
         vocab = defaultdict(int)
         checked_string_types = 0
-        for sentence_no, sentence in enumerate(sentences):
+        for sentence in stream:
             if not checked_string_types:
                 if isinstance(sentence, string_types):
-                    logger.warning(
-                        "Each 'sentences' item should be a list of words (usually unicode strings). 
" - "First item here is instead plain %s.", - type(sentence) - ) + log_msg = "Each 'sentences' item should be a list of words (usually unicode strings). " \ + "First item here is instead plain %s." % type(sentence) + progress_queue.put(log_msg) + checked_string_types += 1 - if sentence_no % progress_per == 0: - logger.info( - "PROGRESS: at sentence #%i, processed %i words, keeping %i word types", - sentence_no, total_words, len(vocab) - ) + for word in sentence: vocab[word] += 1 - total_words += len(sentence) if self.max_vocab_size and len(vocab) > self.max_vocab_size: utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule) min_reduce += 1 + progress_queue.put((len(sentence), 1)) + + progress_queue.put(None) + return vocab + + def scan_vocab(self, input_streams, progress_per=10000, workers=1, trim_rule=None): + progress_queue = mp.Queue() + pool = mp.Pool(processes=min(workers, len(input_streams))) + + results = [ + pool.apply_async(self._scan_vocab_worker, (stream, progress_queue, trim_rule)) for stream in input_streams + ] + + logger.info("collecting all words and their counts") + unfinished_tasks = len(results) + total_words = 0 + sentence_no = -1 + while unfinished_tasks > 0: + report = progress_queue.get() + if report is None: + unfinished_tasks -= 1 + logger.info("scan vocab worker process finished; awaiting finish of %i more procs", unfinished_tasks) + elif isinstance(report, string_types): + logger.warning(report) + else: + num_words, num_sentences = report + total_words += num_words + sentence_no += num_sentences + + if sentence_no % progress_per == 0: + logger.info("PROGRESS: at sentence #%i, processed %i words", sentence_no, total_words) + + self.raw_vocab = reduce(utils.merge_dicts, [res.get() for res in results]) logger.info( "collected %i word types from a corpus of %i raw words and %i sentences", - len(vocab), total_words, sentence_no + 1 + len(self.raw_vocab), total_words, sentence_no + 1 ) + corpus_count = sentence_no + 1 - self.raw_vocab = vocab + return total_words, corpus_count def sort_vocab(self, wv): diff --git a/gensim/utils.py b/gensim/utils.py index 6d2823c652..2bf433ceb8 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -1709,6 +1709,31 @@ def prune_vocab(vocab, min_reduce, trim_rule=None): return result +def merge_dicts(dict1, dict2): + """Merge `dict1` of (word, freq1) and `dict2` of (word, freq2) into `dict1` of (word, freq1+freq2). + + Parameters + ---------- + dict1 : dict + First dictionary. + dict2 : dict + Second dictionary. + + Returns + ------- + result : dict + Merged dictionary with sum of frequencies as values. + + """ + for word, freq in dict2: + if word in dict1: + dict1[word] += freq + else: + dict1[word] = freq + + return dict1 + + def qsize(queue): """Get the (approximate) queue size where available. 
From ffd520488a7439ae872b2f2fb5af6726a57f308c Mon Sep 17 00:00:00 2001 From: persiyanov Date: Wed, 30 May 2018 22:14:26 +0300 Subject: [PATCH 19/49] fixes --- gensim/models/word2vec.py | 60 +++++++++++++++++++++------------------ gensim/utils.py | 2 +- 2 files changed, 33 insertions(+), 29 deletions(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index a6ff16a9e6..7ac0c59090 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -1145,6 +1145,33 @@ def __iter__(self): i += self.max_sentence_length +def _scan_vocab_worker(stream, progress_queue, max_vocab_size=None, trim_rule=None): + """Do an initial scan of all words appearing in stream.""" + min_reduce = 1 + vocab = defaultdict(int) + checked_string_types = 0 + for sentence in stream: + if not checked_string_types: + if isinstance(sentence, string_types): + log_msg = "Each 'sentences' item should be a list of words (usually unicode strings). " \ + "First item here is instead plain %s." % type(sentence) + progress_queue.put(log_msg) + + checked_string_types += 1 + + for word in sentence: + vocab[word] += 1 + + if max_vocab_size and len(vocab) > max_vocab_size: + utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule) + min_reduce += 1 + + progress_queue.put((len(sentence), 1)) + + progress_queue.put(None) + return vocab + + class Word2VecVocab(utils.SaveLoad): def __init__(self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=True, null_word=0, max_final_vocab=None): @@ -1157,38 +1184,15 @@ def __init__(self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=T self.raw_vocab = None self.max_final_vocab = max_final_vocab - def _scan_vocab_worker(self, stream, progress_queue, trim_rule=None): - """Do an initial scan of all words appearing in stream.""" - min_reduce = 1 - vocab = defaultdict(int) - checked_string_types = 0 - for sentence in stream: - if not checked_string_types: - if isinstance(sentence, string_types): - log_msg = "Each 'sentences' item should be a list of words (usually unicode strings). " \ - "First item here is instead plain %s." % type(sentence) - progress_queue.put(log_msg) - - checked_string_types += 1 - - for word in sentence: - vocab[word] += 1 - - if self.max_vocab_size and len(vocab) > self.max_vocab_size: - utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule) - min_reduce += 1 - - progress_queue.put((len(sentence), 1)) - - progress_queue.put(None) - return vocab - def scan_vocab(self, input_streams, progress_per=10000, workers=1, trim_rule=None): - progress_queue = mp.Queue() + manager = mp.Manager() + progress_queue = manager.Queue() pool = mp.Pool(processes=min(workers, len(input_streams))) results = [ - pool.apply_async(self._scan_vocab_worker, (stream, progress_queue, trim_rule)) for stream in input_streams + pool.apply_async(_scan_vocab_worker, + (stream, progress_queue, self.max_vocab_size, trim_rule) + ) for stream in input_streams ] logger.info("collecting all words and their counts") diff --git a/gensim/utils.py b/gensim/utils.py index 2bf433ceb8..a612c59407 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -1725,7 +1725,7 @@ def merge_dicts(dict1, dict2): Merged dictionary with sum of frequencies as values. 
""" - for word, freq in dict2: + for word, freq in dict2.iteritems(): if word in dict1: dict1[word] += freq else: From 8a0badd5849013842712ed364aed9c91c83dd201 Mon Sep 17 00:00:00 2001 From: persiyanov Date: Thu, 31 May 2018 10:52:06 +0300 Subject: [PATCH 20/49] without progress_per at all --- gensim/models/word2vec.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 7ac0c59090..85cade6bf3 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -1145,11 +1145,13 @@ def __iter__(self): i += self.max_sentence_length -def _scan_vocab_worker(stream, progress_queue, max_vocab_size=None, trim_rule=None): +def _scan_vocab_worker(stream, progress_queue, progress_per=10000, max_vocab_size=None, trim_rule=None): """Do an initial scan of all words appearing in stream.""" min_reduce = 1 vocab = defaultdict(int) checked_string_types = 0 + sentence_no = -1 + total_words = 0 for sentence in stream: if not checked_string_types: if isinstance(sentence, string_types): @@ -1166,8 +1168,15 @@ def _scan_vocab_worker(stream, progress_queue, max_vocab_size=None, trim_rule=No utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule) min_reduce += 1 - progress_queue.put((len(sentence), 1)) + total_words += len(sentence) + sentence_no += 1 + # if sentence_no % progress_per == 0: + # progress_queue.put((total_words, sentence_no + 1)) + # sentence_no = -1 + # total_words = 0 + + progress_queue.put((total_words, sentence_no + 1)) progress_queue.put(None) return vocab @@ -1191,9 +1200,10 @@ def scan_vocab(self, input_streams, progress_per=10000, workers=1, trim_rule=Non results = [ pool.apply_async(_scan_vocab_worker, - (stream, progress_queue, self.max_vocab_size, trim_rule) + (stream, progress_queue, progress_per, self.max_vocab_size, trim_rule) ) for stream in input_streams ] + pool.close() logger.info("collecting all words and their counts") unfinished_tasks = len(results) @@ -1203,7 +1213,7 @@ def scan_vocab(self, input_streams, progress_per=10000, workers=1, trim_rule=Non report = progress_queue.get() if report is None: unfinished_tasks -= 1 - logger.info("scan vocab worker process finished; awaiting finish of %i more procs", unfinished_tasks) + logger.info("scan vocab task finished; awaiting finish of %i more tasks", unfinished_tasks) elif isinstance(report, string_types): logger.warning(report) else: From 2472b2bade8816f114d40482b8752076be9f922f Mon Sep 17 00:00:00 2001 From: persiyanov Date: Fri, 15 Jun 2018 17:19:40 +0300 Subject: [PATCH 21/49] get rid of job_producer, make batches in _worker_loop --- gensim/models/base_any2vec.py | 74 ++++++++++++----------------------- 1 file changed, 24 insertions(+), 50 deletions(-) diff --git a/gensim/models/base_any2vec.py b/gensim/models/base_any2vec.py index 2a38b5e439..a256085d1b 100644 --- a/gensim/models/base_any2vec.py +++ b/gensim/models/base_any2vec.py @@ -134,16 +134,12 @@ def _check_training_sanity(self, epochs=None, total_examples=None, total_words=N """Check that the training parameters provided make sense. e.g. 
raise error if `epochs` not provided.""" raise NotImplementedError() - def _worker_loop(self, job_queue, progress_queue): + def _worker_loop(self, input_stream, progress_queue): """Train the model, lifting lists of data from the job_queue.""" thread_private_mem = self._get_thread_working_mem() jobs_processed = 0 - while True: - job = job_queue.get() - if job is None: - progress_queue.put(None) - break # no more jobs => quit this worker - data_iterable, job_parameters = job + for batch in self._batch_iterator(input_stream): + data_iterable, job_parameters = batch for callback in self.callbacks: callback.on_batch_begin(self) @@ -155,16 +151,17 @@ def _worker_loop(self, job_queue, progress_queue): progress_queue.put((len(data_iterable), tally, raw_tally)) # report back progress jobs_processed += 1 + + progress_queue.put(None) logger.debug("worker exiting, processed %i jobs", jobs_processed) - def _job_producer(self, data_iterator, job_queue, cur_epoch=0, total_examples=None, total_words=None): - """Fill jobs queue using the input `data_iterator`.""" + def _batch_iterator(self, input_stream, cur_epoch=0, total_examples=None, total_words=None): job_batch, batch_size = [], 0 pushed_words, pushed_examples = 0, 0 next_job_params = self._get_job_params(cur_epoch) job_no = 0 - for data_idx, data in enumerate(data_iterator): + for data_idx, data in enumerate(input_stream): data_length = self._raw_word_count([data]) # can we fit this sentence into the existing job batch? @@ -174,7 +171,8 @@ def _job_producer(self, data_iterator, job_queue, cur_epoch=0, total_examples=No batch_size += data_length else: job_no += 1 - job_queue.put((job_batch, next_job_params)) + + yield job_batch, next_job_params # update the learning rate for the next job if total_examples: @@ -192,7 +190,7 @@ def _job_producer(self, data_iterator, job_queue, cur_epoch=0, total_examples=No # add the last job too (may be significantly smaller than batch_words) if job_batch: job_no += 1 - job_queue.put((job_batch, next_job_params)) + yield job_batch, next_job_params if job_no == 0 and self.train_count == 0: logger.warning( @@ -200,12 +198,9 @@ def _job_producer(self, data_iterator, job_queue, cur_epoch=0, total_examples=No "be sure to provide a corpus that offers restartable iteration = an iterable)." ) - # give the workers heads up that they can finish -- no more work! 
- for _ in xrange(self.workers): - job_queue.put(None) - logger.debug("job loop exiting, total %i jobs", job_no) + logger.debug("batch iterator loop exiting, total %i jobs", job_no) - def _log_progress(self, job_queue, progress_queue, cur_epoch, example_count, total_examples, + def _log_progress(self, progress_queue, cur_epoch, example_count, total_examples, raw_word_count, total_words, trained_word_count, elapsed): raise NotImplementedError() @@ -216,7 +211,7 @@ def _log_epoch_end(self, cur_epoch, example_count, total_examples, raw_word_coun def _log_train_end(self, raw_word_count, trained_word_count, total_elapsed, job_tally): raise NotImplementedError() - def _log_epoch_progress(self, progress_queue, job_queue, cur_epoch=0, total_examples=None, total_words=None, + def _log_epoch_progress(self, progress_queue, cur_epoch=0, total_examples=None, total_words=None, report_delay=1.0): example_count, trained_word_count, raw_word_count = 0, 0, 0 start, next_report = default_timer() - 0.00001, 1.0 @@ -241,7 +236,7 @@ def _log_epoch_progress(self, progress_queue, job_queue, cur_epoch=0, total_exam elapsed = default_timer() - start if elapsed >= next_report: self._log_progress( - job_queue, progress_queue, cur_epoch, example_count, total_examples, + progress_queue, cur_epoch, example_count, total_examples, raw_word_count, total_words, trained_word_count, elapsed) next_report = elapsed + report_delay # all done; report the final stats @@ -255,42 +250,23 @@ def _log_epoch_progress(self, progress_queue, job_queue, cur_epoch=0, total_exam def _train_epoch(self, data_iterables, cur_epoch=0, total_examples=None, total_words=None, queue_factor=2, report_delay=1.0): """Train one epoch.""" + assert len(data_iterables) == self.workers - job_queue = mp.Queue(maxsize=queue_factor * self.workers) progress_queue = Queue(maxsize=(queue_factor + 1) * self.workers) workers = [ threading.Thread( target=self._worker_loop, - args=(job_queue, progress_queue,)) - for _ in xrange(self.workers) + args=(input_stream, progress_queue,)) + for input_stream in data_iterables ] - processes = [ - mp.Process( - target=self._job_producer, - args=(data_iterable, job_queue), - kwargs={'cur_epoch': cur_epoch, 'total_examples': total_examples, 'total_words': total_words} - ) for data_iterable in data_iterables - ] - # workers.extend( - # threading.Thread( - # target=self._job_producer, - # args=(data_iterable, job_queue), - # kwargs={'cur_epoch': cur_epoch, 'total_examples': total_examples, 'total_words': total_words} - # ) for data_iterable in data_iterables - # ) - - for process in processes: - process.daemon = True - process.start() - for thread in workers: thread.daemon = True # make interrupting the process with ctrl+c easier thread.start() trained_word_count, raw_word_count, job_tally = self._log_epoch_progress( - progress_queue, job_queue, cur_epoch=cur_epoch, total_examples=total_examples, total_words=total_words, + progress_queue, cur_epoch=cur_epoch, total_examples=total_examples, total_words=total_words, report_delay=report_delay) return trained_word_count, raw_word_count, job_tally @@ -721,26 +697,24 @@ def load(cls, *args, **kwargs): model.total_train_time = 0 return model - def _log_progress(self, job_queue, progress_queue, cur_epoch, example_count, total_examples, + def _log_progress(self, progress_queue, cur_epoch, example_count, total_examples, raw_word_count, total_words, trained_word_count, elapsed): if total_examples: # examples-based progress % - job_queue_size = utils.qsize(job_queue) - logger.info( - "EPOCH %i 
- PROGRESS: at %.2f%% examples, %.0f words/s, in_qsize %i, out_qsize %i", + "EPOCH %i - PROGRESS: at %.2f%% examples, %.0f words/s, out_qsize %i", cur_epoch + 1, 100.0 * example_count / total_examples, trained_word_count / elapsed, - job_queue_size, utils.qsize(progress_queue) + utils.qsize(progress_queue) ) - _update_queue_and_cpu_stats(job_queue_size) + # _update_queue_and_cpu_stats(job_queue_size) else: # words-based progress % logger.info( - "EPOCH %i - PROGRESS: at %.2f%% words, %.0f words/s, in_qsize %i, out_qsize %i", + "EPOCH %i - PROGRESS: at %.2f%% words, %.0f words/s, out_qsize %i", cur_epoch + 1, 100.0 * raw_word_count / total_words, trained_word_count / elapsed, - utils.qsize(job_queue), utils.qsize(progress_queue) + utils.qsize(progress_queue) ) def _log_epoch_end(self, cur_epoch, example_count, total_examples, raw_word_count, total_words, From 4e0c1031672b1d746b35fc2621c15aca60de0bfc Mon Sep 17 00:00:00 2001 From: persiyanov Date: Fri, 15 Jun 2018 17:40:13 +0300 Subject: [PATCH 22/49] fix --- gensim/models/base_any2vec.py | 16 ++++++++-------- gensim/scripts/benchmark_any2vec_speed.py | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/gensim/models/base_any2vec.py b/gensim/models/base_any2vec.py index a256085d1b..dce53c2ff0 100644 --- a/gensim/models/base_any2vec.py +++ b/gensim/models/base_any2vec.py @@ -157,7 +157,7 @@ def _worker_loop(self, input_stream, progress_queue): def _batch_iterator(self, input_stream, cur_epoch=0, total_examples=None, total_words=None): job_batch, batch_size = [], 0 - pushed_words, pushed_examples = 0, 0 + # pushed_words, pushed_examples = 0, 0 next_job_params = self._get_job_params(cur_epoch) job_no = 0 @@ -175,15 +175,15 @@ def _batch_iterator(self, input_stream, cur_epoch=0, total_examples=None, total_ yield job_batch, next_job_params # update the learning rate for the next job - if total_examples: + # if total_examples: # examples-based decay - pushed_examples += len(job_batch) - epoch_progress = 1.0 * pushed_examples / total_examples - else: + # pushed_examples += len(job_batch) + # epoch_progress = 1.0 * pushed_examples / total_examples + # else: # words-based decay - pushed_words += self._raw_word_count(job_batch) - epoch_progress = 1.0 * pushed_words / total_words - next_job_params = self._update_job_params(next_job_params, epoch_progress, cur_epoch) + # pushed_words += self._raw_word_count(job_batch) + # epoch_progress = 1.0 * pushed_words / total_words + # next_job_params = self._update_job_params(next_job_params, epoch_progress, cur_epoch) # add the sentence that didn't fit as the first item of a new job job_batch, batch_size = [data], data_length diff --git a/gensim/scripts/benchmark_any2vec_speed.py b/gensim/scripts/benchmark_any2vec_speed.py index bda130cd8c..c14639a1c8 100644 --- a/gensim/scripts/benchmark_any2vec_speed.py +++ b/gensim/scripts/benchmark_any2vec_speed.py @@ -32,7 +32,7 @@ def print_results(model_str, results): logger.info('----- MODEL "{}" RESULTS -----'.format(model_str).center(50)) logger.info('\t* Vocab time: {} sec.'.format(results['vocab_time'])) logger.info('\t* Total epoch time: {} sec.'.format(results['total_time'])) - logger.info('\t* Avg queue size: {} elems.'.format(results['queue_size'])) + # logger.info('\t* Avg queue size: {} elems.'.format(results['queue_size'])) logger.info('\t* Processing speed: {} words/sec'.format(results['words_sec'])) logger.info('\t* Avg CPU loads: {}'.format(results['cpu_load'])) logger.info('\t* Sum CPU load: {}'.format(results['cpu_load_sum'])) From 
3dd8a6401043e961543c2388c448a88b16f7127e Mon Sep 17 00:00:00 2001 From: persiyanov Date: Fri, 15 Jun 2018 17:41:42 +0300 Subject: [PATCH 23/49] fix --- gensim/models/base_any2vec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/base_any2vec.py b/gensim/models/base_any2vec.py index dce53c2ff0..8b927a5144 100644 --- a/gensim/models/base_any2vec.py +++ b/gensim/models/base_any2vec.py @@ -708,7 +708,7 @@ def _log_progress(self, progress_queue, cur_epoch, example_count, total_examples utils.qsize(progress_queue) ) - # _update_queue_and_cpu_stats(job_queue_size) + _update_queue_and_cpu_stats(0) else: # words-based progress % logger.info( From d389847bbb0627c89048f6d073dd75a1786d3cb9 Mon Sep 17 00:00:00 2001 From: persiyanov Date: Thu, 21 Jun 2018 02:29:39 +0300 Subject: [PATCH 24/49] make cythonlinesentence. not working, but at least compiles now --- gensim/models/base_any2vec.py | 54 +++++++++++++++++++++----------- gensim/models/word2vec_inner.pxd | 29 +++++++++++++++++ gensim/models/word2vec_inner.pyx | 37 +++++++++++++++++++++- 3 files changed, 100 insertions(+), 20 deletions(-) diff --git a/gensim/models/base_any2vec.py b/gensim/models/base_any2vec.py index 8b927a5144..12329f77e4 100644 --- a/gensim/models/base_any2vec.py +++ b/gensim/models/base_any2vec.py @@ -22,6 +22,8 @@ import psutil import time +from gensim.models.word2vec_inner import CythonLineSentence + try: from queue import Queue except ImportError: @@ -134,16 +136,43 @@ def _check_training_sanity(self, epochs=None, total_examples=None, total_words=N """Check that the training parameters provided make sense. e.g. raise error if `epochs` not provided.""" raise NotImplementedError() - def _worker_loop(self, input_stream, progress_queue): - """Train the model, lifting lists of data from the job_queue.""" + # def _worker_loop(self, input_stream, progress_queue): + # """Train the model, lifting lists of data from the job_queue.""" + # thread_private_mem = self._get_thread_working_mem() + # jobs_processed = 0 + # for batch in self._batch_iterator(input_stream): + # data_iterable, job_parameters = batch + # + # for callback in self.callbacks: + # callback.on_batch_begin(self) + # + # tally, raw_tally = self._do_train_job(data_iterable, job_parameters, thread_private_mem) + # + # for callback in self.callbacks: + # callback.on_batch_end(self) + # + # progress_queue.put((len(data_iterable), tally, raw_tally)) # report back progress + # jobs_processed += 1 + # + # progress_queue.put(None) + # logger.debug("worker exiting, processed %i jobs", jobs_processed) + + def _worker_loop(self, fname, progress_queue): thread_private_mem = self._get_thread_working_mem() jobs_processed = 0 - for batch in self._batch_iterator(input_stream): - data_iterable, job_parameters = batch + job_parameters = self._get_job_params(0) + input_stream = CythonLineSentence(fname) + while True: + try: + # Prepare batch with NO GIL + data_iterable = input_stream.next_batch() + except: + break for callback in self.callbacks: callback.on_batch_begin(self) + # No GIL tally, raw_tally = self._do_train_job(data_iterable, job_parameters, thread_private_mem) for callback in self.callbacks: @@ -157,8 +186,6 @@ def _worker_loop(self, input_stream, progress_queue): def _batch_iterator(self, input_stream, cur_epoch=0, total_examples=None, total_words=None): job_batch, batch_size = [], 0 - # pushed_words, pushed_examples = 0, 0 - next_job_params = self._get_job_params(cur_epoch) job_no = 0 for data_idx, data in enumerate(input_stream): @@ -172,25 
+199,14 @@ def _batch_iterator(self, input_stream, cur_epoch=0, total_examples=None, total_
         else:
             job_no += 1
 
-            yield job_batch, next_job_params
-
-            # update the learning rate for the next job
-            # if total_examples:
-                # examples-based decay
-                # pushed_examples += len(job_batch)
-                # epoch_progress = 1.0 * pushed_examples / total_examples
-            # else:
-                # words-based decay
-                # pushed_words += self._raw_word_count(job_batch)
-                # epoch_progress = 1.0 * pushed_words / total_words
-            # next_job_params = self._update_job_params(next_job_params, epoch_progress, cur_epoch)
+            yield job_batch
 
             # add the sentence that didn't fit as the first item of a new job
             job_batch, batch_size = [data], data_length
     # add the last job too (may be significantly smaller than batch_words)
     if job_batch:
         job_no += 1
-        yield job_batch, next_job_params
+        yield job_batch
 
     if job_no == 0 and self.train_count == 0:
         logger.warning(
diff --git a/gensim/models/word2vec_inner.pxd b/gensim/models/word2vec_inner.pxd
index 04cca9e887..df4f63fe28 100644
--- a/gensim/models/word2vec_inner.pxd
+++ b/gensim/models/word2vec_inner.pxd
@@ -5,6 +5,9 @@
 # Copyright (C) 2013 Radim Rehurek
 # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
 
 cimport numpy as np
+from libcpp.string cimport string
+from libcpp.vector cimport vector
+
 cdef extern from "voidptr.h":
     void* PyCObject_AsVoidPtr(object obj)
@@ -52,3 +55,29 @@ cdef void our_saxpy_noblas(const int *N, const float *alpha, const float *X, con
 cdef unsigned long long bisect_left(np.uint32_t *a, unsigned long long x, unsigned long long lo, unsigned long long hi) nogil
 
 cdef unsigned long long random_int32(unsigned long long *next_random) nogil
+
+
+cdef extern from "<istream>" namespace "std":
+    cdef cppclass istream:
+        istream& read(const char*, int) except+
+
+cdef extern from "<fstream>" namespace "std::ios_base":
+    cdef cppclass open_mode:
+        pass
+    cdef open_mode binary
+    # you can define other constants as needed
+
+cdef extern from "<fstream>" namespace "std":
+    cdef cppclass ifstream(istream):
+        # constructors
+        ifstream(const char*) except +
+        ifstream(const char*, open_mode) except+
+
+cdef class CythonLineSentence:
+    """Simple format: one sentence = one line; words already preprocessed and separated by whitespace.
+    """
+    cdef public char* source
+    cdef public int max_sentence_length
+
+    cdef string read_line(self, ifstream*) nogil
+    cdef vector[string] next_batch(self) nogil
diff --git a/gensim/models/word2vec_inner.pyx b/gensim/models/word2vec_inner.pyx
index 98e719c6d4..cc049d4d6b 100755
--- a/gensim/models/word2vec_inner.pyx
+++ b/gensim/models/word2vec_inner.pyx
@@ -1,4 +1,5 @@
 #!/usr/bin/env cython
+# distutils: language = c++
 # cython: boundscheck=False
 # cython: wraparound=False
 # cython: cdivision=True
@@ -13,7 +14,11 @@ cimport numpy as np
 
 from libc.math cimport exp
 from libc.math cimport log
-from libc.string cimport memset
+from libc.string cimport memset, strtok
+# from libc.stdio cimport FILE, fopen, fscanf, fclose
+from libcpp.string cimport string#, getline
+from libcpp.vector cimport vector
+# from libcpp.sstream cimport istringstream
 
 # scipy <= 0.15
 try:
@@ -42,6 +47,36 @@ cdef REAL_t[EXP_TABLE_SIZE] LOG_TABLE
 cdef int ONE = 1
 cdef REAL_t ONEF = 1.0
 
+@cython.final
+cdef class CythonLineSentence:
+    """Simple format: one sentence = one line; words already preprocessed and separated by whitespace.
+    """
+    def __init__(self, source, max_sentence_length=MAX_SENTENCE_LEN):
+        """
+        `source` can be either a string or a file object.
Clip the file to the first + `limit` lines (or not clipped if limit is None, the default). + + Example:: + + sentences = LineSentence('myfile.txt') + + Or for compressed files:: + + sentences = LineSentence('compressed_text.txt.bz2') + sentences = LineSentence('compressed_text.txt.gz') + + """ + self.source = source + self.max_sentence_length = max_sentence_length + + cdef string read_line(self, ifstream* fd) nogil: + + return string() + + cdef vector[string] next_batch(self) nogil: + return vector[string]() + + # for when fblas.sdot returns a double cdef REAL_t our_dot_double(const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil: return dsdot(N, X, incX, Y, incY) From 4c1d3a603ffa8185266becac3a3b7e1400f4814c Mon Sep 17 00:00:00 2001 From: persiyanov Date: Thu, 21 Jun 2018 16:22:16 +0300 Subject: [PATCH 25/49] add operator>> --- gensim/models/base_any2vec.py | 2 +- gensim/models/word2vec_inner.pxd | 20 ++++++-------------- gensim/models/word2vec_inner.pyx | 10 ++++++---- setup.py | 2 +- 4 files changed, 14 insertions(+), 20 deletions(-) diff --git a/gensim/models/base_any2vec.py b/gensim/models/base_any2vec.py index 12329f77e4..ede6db810a 100644 --- a/gensim/models/base_any2vec.py +++ b/gensim/models/base_any2vec.py @@ -172,7 +172,7 @@ def _worker_loop(self, fname, progress_queue): for callback in self.callbacks: callback.on_batch_begin(self) - # No GIL + # No GIL (almost) tally, raw_tally = self._do_train_job(data_iterable, job_parameters, thread_private_mem) for callback in self.callbacks: diff --git a/gensim/models/word2vec_inner.pxd b/gensim/models/word2vec_inner.pxd index df4f63fe28..2be97ae220 100644 --- a/gensim/models/word2vec_inner.pxd +++ b/gensim/models/word2vec_inner.pxd @@ -57,27 +57,19 @@ cdef unsigned long long bisect_left(np.uint32_t *a, unsigned long long x, unsign cdef unsigned long long random_int32(unsigned long long *next_random) nogil -cdef extern from "" namespace "std": - cdef cppclass istream: - istream& read(const char*, int) except+ - -cdef extern from "" namespace "std::ios_base": - cdef cppclass open_mode: - pass - cdef open_mode binary - # you can define other constants as needed - cdef extern from "" namespace "std": - cdef cppclass ifstream(istream): + cdef cppclass ifstream: # constructors + ifstream() except + ifstream(const char*) except + - ifstream(const char*, open_mode) except+ + ifstream& operator>>(string&) except + cdef class CythonLineSentence: """Simple format: one sentence = one line; words already preprocessed and separated by whitespace. 
""" cdef public char* source cdef public int max_sentence_length + cdef ifstream fd - cdef string read_line(self, ifstream*) nogil - cdef vector[string] next_batch(self) nogil + cpdef string read_line(self) nogil + cpdef vector[string] next_batch(self) nogil diff --git a/gensim/models/word2vec_inner.pyx b/gensim/models/word2vec_inner.pyx index cc049d4d6b..47c2a68bd2 100755 --- a/gensim/models/word2vec_inner.pyx +++ b/gensim/models/word2vec_inner.pyx @@ -68,12 +68,14 @@ cdef class CythonLineSentence: """ self.source = source self.max_sentence_length = max_sentence_length + self.fd = ifstream(source) - cdef string read_line(self, ifstream* fd) nogil: + cpdef string read_line(self) nogil: + cdef string val + self.fd >> val + return val - return string() - - cdef vector[string] next_batch(self) nogil: + cpdef vector[string] next_batch(self) nogil: return vector[string]() diff --git a/setup.py b/setup.py index 132eb925c5..8ae898e50f 100644 --- a/setup.py +++ b/setup.py @@ -250,7 +250,7 @@ def finalize_options(self): ext_modules=[ Extension('gensim.models.word2vec_inner', - sources=['./gensim/models/word2vec_inner.c'], + sources=['./gensim/models/word2vec_inner.cpp'], include_dirs=[model_dir]), Extension('gensim.models.doc2vec_inner', sources=['./gensim/models/doc2vec_inner.c'], From 36882a0c7a708d3cf9a77f6c1570d22e6cdc2b35 Mon Sep 17 00:00:00 2001 From: persiyanov Date: Thu, 21 Jun 2018 16:30:28 +0300 Subject: [PATCH 26/49] change ifstream to ifstream* --- gensim/models/word2vec_inner.pxd | 2 +- gensim/models/word2vec_inner.pyx | 12 ++++++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/gensim/models/word2vec_inner.pxd b/gensim/models/word2vec_inner.pxd index 2be97ae220..1f6e12c3ef 100644 --- a/gensim/models/word2vec_inner.pxd +++ b/gensim/models/word2vec_inner.pxd @@ -69,7 +69,7 @@ cdef class CythonLineSentence: """ cdef public char* source cdef public int max_sentence_length - cdef ifstream fd + cdef ifstream* fd cpdef string read_line(self) nogil cpdef vector[string] next_batch(self) nogil diff --git a/gensim/models/word2vec_inner.pyx b/gensim/models/word2vec_inner.pyx index 47c2a68bd2..c82c13f198 100755 --- a/gensim/models/word2vec_inner.pyx +++ b/gensim/models/word2vec_inner.pyx @@ -20,6 +20,8 @@ from libcpp.string cimport string#, getline from libcpp.vector cimport vector # from libcpp.sstream cimport istringstream +from cython.operator import dereference as deref + # scipy <= 0.15 try: from scipy.linalg.blas import fblas @@ -51,6 +53,9 @@ cdef REAL_t ONEF = 1.0 cdef class CythonLineSentence: """Simple format: one sentence = one line; words already preprocessed and separated by whitespace. """ + def __cinit__(self, source, max_sentence_length): + self.fd = new ifstream(source) + def __init__(self, source, max_sentence_length=MAX_SENTENCE_LEN): """ `source` can be either a string or a file object. 
Clip the file to the first
@@ -68,12 +73,14 @@ cdef class CythonLineSentence:
         """
         self.source = source
         self.max_sentence_length = max_sentence_length
-        self.fd = ifstream(source)
+
+    def __dealloc__(self):
+        if self.fd != NULL:
+            del self.fd
 
     cpdef string read_line(self) nogil:
         cdef string val
-        self.fd >> val
+        deref(self.fd) >> val
         return val
 
     cpdef vector[string] next_batch(self) nogil:
From 37b55f3532a986002a3ccc121790bc71565e1503 Mon Sep 17 00:00:00 2001
From: persiyanov
Date: Thu, 21 Jun 2018 17:47:27 +0300
Subject: [PATCH 27/49] fastlinesentence in c++

---
 gensim/models/linesentence.cpp   | 17 +++++++++++++++++
 gensim/models/linesentence.h     | 15 +++++++++++++++
 gensim/models/word2vec_inner.pxd | 21 ---------------------
 gensim/models/word2vec_inner.pyx | 26 ++++++++++++++----------
 setup.py                         |  3 ++-
 5 files changed, 50 insertions(+), 32 deletions(-)
 create mode 100644 gensim/models/linesentence.cpp
 create mode 100644 gensim/models/linesentence.h

diff --git a/gensim/models/linesentence.cpp b/gensim/models/linesentence.cpp
new file mode 100644
index 0000000000..c7ef3051c5
--- /dev/null
+++ b/gensim/models/linesentence.cpp
@@ -0,0 +1,17 @@
+#include "linesentence.h"
+
+
+FastLineSentence::FastLineSentence(const std::string& filename) : fs_(filename) { }
+
+std::vector<std::string> FastLineSentence::ReadSentence() {
+    std::string line, word;
+    std::getline(fs_, line);
+    std::vector<std::string> res;
+
+    std::istringstream iss(line);
+    while (iss >> word) {
+        res.push_back(word);
+    }
+
+    return res;
+} \ No newline at end of file
diff --git a/gensim/models/linesentence.h b/gensim/models/linesentence.h
new file mode 100644
index 0000000000..95bb82add2
--- /dev/null
+++ b/gensim/models/linesentence.h
@@ -0,0 +1,15 @@
+#pragma once
+
+#include <fstream>
+#include <sstream>
+#include <vector>
+
+
+class FastLineSentence {
+public:
+    explicit FastLineSentence(const std::string& filename);
+
+    std::vector<std::string> ReadSentence();
+private:
+    std::ifstream fs_;
+}; \ No newline at end of file
diff --git a/gensim/models/word2vec_inner.pxd b/gensim/models/word2vec_inner.pxd
index 1f6e12c3ef..04cca9e887 100644
--- a/gensim/models/word2vec_inner.pxd
+++ b/gensim/models/word2vec_inner.pxd
@@ -5,9 +5,6 @@
 # Copyright (C) 2013 Radim Rehurek
 # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
 
 cimport numpy as np
-from libcpp.string cimport string
-from libcpp.vector cimport vector
-
 cdef extern from "voidptr.h":
     void* PyCObject_AsVoidPtr(object obj)
@@ -55,21 +52,3 @@ cdef void our_saxpy_noblas(const int *N, const float *alpha, const float *X, con
 cdef unsigned long long bisect_left(np.uint32_t *a, unsigned long long x, unsigned long long lo, unsigned long long hi) nogil
 
 cdef unsigned long long random_int32(unsigned long long *next_random) nogil
-
-
-cdef extern from "<fstream>" namespace "std":
-    cdef cppclass ifstream:
-        # constructors
-        ifstream() except +
-        ifstream(const char*) except +
-        ifstream& operator>>(string&) except +
-
-cdef class CythonLineSentence:
-    """Simple format: one sentence = one line; words already preprocessed and separated by whitespace.
- """ - cdef public char* source - cdef public int max_sentence_length - cdef ifstream* fd - - cpdef string read_line(self) nogil - cpdef vector[string] next_batch(self) nogil diff --git a/gensim/models/word2vec_inner.pyx b/gensim/models/word2vec_inner.pyx index c82c13f198..aaedc34207 100755 --- a/gensim/models/word2vec_inner.pyx +++ b/gensim/models/word2vec_inner.pyx @@ -1,5 +1,6 @@ #!/usr/bin/env cython # distutils: language = c++ +# distutils: sources = linesentence.cpp # cython: boundscheck=False # cython: wraparound=False # cython: cdivision=True @@ -49,12 +50,19 @@ cdef REAL_t[EXP_TABLE_SIZE] LOG_TABLE cdef int ONE = 1 cdef REAL_t ONEF = 1.0 +cdef extern from "linesentence.h": + cdef cppclass FastLineSentence: + FastLineSentence(string&) except + + vector[string] ReadSentence() nogil except + + @cython.final cdef class CythonLineSentence: - """Simple format: one sentence = one line; words already preprocessed and separated by whitespace. - """ - def __cinit__(self, source, max_sentence_length): - self.fd = new ifstream(source) + cdef FastLineSentence* _thisptr + cdef public string source + cdef public int max_sentence_length + + def __cinit__(self, source, max_sentence_length=MAX_SENTENCE_LEN): + self._thisptr = new FastLineSentence(source) def __init__(self, source, max_sentence_length=MAX_SENTENCE_LEN): """ @@ -75,13 +83,11 @@ cdef class CythonLineSentence: self.max_sentence_length = max_sentence_length def __dealloc__(self): - if self.fd != NULL: - del self.fd + if self._thisptr != NULL: + del self._thisptr - cpdef string read_line(self) nogil: - cdef string val - deref(self.fd) >> val - return val + cpdef vector[string] read_sentence(self) nogil: + return self._thisptr.ReadSentence() cpdef vector[string] next_batch(self) nogil: return vector[string]() diff --git a/setup.py b/setup.py index 8ae898e50f..6e45b4de7c 100644 --- a/setup.py +++ b/setup.py @@ -250,7 +250,8 @@ def finalize_options(self): ext_modules=[ Extension('gensim.models.word2vec_inner', - sources=['./gensim/models/word2vec_inner.cpp'], + sources=['./gensim/models/word2vec_inner.cpp', './gensim/models/linesentence.cpp'], + language="c++", include_dirs=[model_dir]), Extension('gensim.models.doc2vec_inner', sources=['./gensim/models/doc2vec_inner.c'], From 97f834d903a6a870014758c376d18519b7f39349 Mon Sep 17 00:00:00 2001 From: persiyanov Date: Thu, 21 Jun 2018 21:01:40 +0300 Subject: [PATCH 28/49] almost working version; works on large files, but one bug is to be fixed --- gensim/models/linesentence.cpp | 6 +++ gensim/models/word2vec_inner.pyx | 89 +++++++++++++++++++++++--------- 2 files changed, 72 insertions(+), 23 deletions(-) diff --git a/gensim/models/linesentence.cpp b/gensim/models/linesentence.cpp index c7ef3051c5..ae7cfda533 100644 --- a/gensim/models/linesentence.cpp +++ b/gensim/models/linesentence.cpp @@ -1,9 +1,15 @@ +#pragma once + +#include #include "linesentence.h" FastLineSentence::FastLineSentence(const std::string& filename) : fs_(filename) { } std::vector FastLineSentence::ReadSentence() { + if (fs_.eof()) { + throw std::runtime_error("EOF occured in C++!"); + } std::string line, word; std::getline(fs_, line); std::vector res; diff --git a/gensim/models/word2vec_inner.pyx b/gensim/models/word2vec_inner.pyx index aaedc34207..f7bf50ab82 100755 --- a/gensim/models/word2vec_inner.pyx +++ b/gensim/models/word2vec_inner.pyx @@ -16,12 +16,10 @@ cimport numpy as np from libc.math cimport exp from libc.math cimport log from libc.string cimport memset, strtok -# from libc.stdio cimport FILE, fopen, fscanf, 
fclose -from libcpp.string cimport string#, getline +from libcpp.string cimport string from libcpp.vector cimport vector -# from libcpp.sstream cimport istringstream +from libcpp cimport bool as bool_t -from cython.operator import dereference as deref # scipy <= 0.15 try: @@ -50,47 +48,92 @@ cdef REAL_t[EXP_TABLE_SIZE] LOG_TABLE cdef int ONE = 1 cdef REAL_t ONEF = 1.0 + cdef extern from "linesentence.h": cdef cppclass FastLineSentence: FastLineSentence(string&) except + vector[string] ReadSentence() nogil except + + +def _batch_iterator(self, input_stream, cur_epoch=0, total_examples=None, total_words=None): + job_batch, batch_size = [], 0 + job_no = 0 + + for data_idx, data in enumerate(input_stream): + data_length = self._raw_word_count([data]) + + # can we fit this sentence into the existing job batch? + if batch_size + data_length <= self.batch_words: + # yes => add it to the current job + job_batch.append(data) + batch_size += data_length + else: + job_no += 1 + + yield job_batch + + # add the sentence that didn't fit as the first item of a new job + job_batch, batch_size = [data], data_length + # add the last job too (may be significantly smaller than batch_words) + if job_batch: + job_no += 1 + yield job_batch + + + @cython.final cdef class CythonLineSentence: cdef FastLineSentence* _thisptr cdef public string source - cdef public int max_sentence_length + cdef public int max_sentence_length, max_words_in_batch + cdef vector[string] buf_data def __cinit__(self, source, max_sentence_length=MAX_SENTENCE_LEN): self._thisptr = new FastLineSentence(source) def __init__(self, source, max_sentence_length=MAX_SENTENCE_LEN): - """ - `source` can be either a string or a file object. Clip the file to the first - `limit` lines (or not clipped if limit is None, the default). - - Example:: - - sentences = LineSentence('myfile.txt') - - Or for compressed files:: - - sentences = LineSentence('compressed_text.txt.bz2') - sentences = LineSentence('compressed_text.txt.gz') - - """ self.source = source - self.max_sentence_length = max_sentence_length + self.max_sentence_length = max_sentence_length # isn't used in this hacky prototype + self.max_words_in_batch = MAX_SENTENCE_LEN def __dealloc__(self): if self._thisptr != NULL: del self._thisptr - cpdef vector[string] read_sentence(self) nogil: + cpdef vector[string] read_sentence(self) nogil except *: return self._thisptr.ReadSentence() - cpdef vector[string] next_batch(self) nogil: - return vector[string]() + cpdef vector[vector[string]] next_batch(self) except *: + with nogil: + return self._next_batch() + + cpdef vector[vector[string]] _next_batch(self) nogil except *: + cdef: + vector[vector[string]] job_batch + vector[string] data + int batch_size = 0 + int data_length = 0 + + # Try to read data from previous calls which was not returned + if self.buf_data.size() > 0: + data = self.buf_data + self.buf_data.clear() + else: + data = self.read_sentence() + + data_length = data.size() + while batch_size + data_length <= self.max_words_in_batch: + job_batch.push_back(data) + batch_size += data_length + + # TODO: if it raises an exception, we will not return a batch we read up to this moment + data = self.read_sentence() + data_length = data.size() + + # Save data which doesn't fit in batch in order to return it later. 
+ buf_data = data + + return job_batch # for when fblas.sdot returns a double From 944e3dc434439886e69d78cd06ad78de15acee69 Mon Sep 17 00:00:00 2001 From: persiyanov Date: Thu, 21 Jun 2018 21:09:19 +0300 Subject: [PATCH 29/49] remove batch iterator from pyx --- gensim/models/word2vec_inner.pyx | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/gensim/models/word2vec_inner.pyx b/gensim/models/word2vec_inner.pyx index f7bf50ab82..c48e179497 100755 --- a/gensim/models/word2vec_inner.pyx +++ b/gensim/models/word2vec_inner.pyx @@ -55,32 +55,6 @@ cdef extern from "linesentence.h": vector[string] ReadSentence() nogil except + -def _batch_iterator(self, input_stream, cur_epoch=0, total_examples=None, total_words=None): - job_batch, batch_size = [], 0 - job_no = 0 - - for data_idx, data in enumerate(input_stream): - data_length = self._raw_word_count([data]) - - # can we fit this sentence into the existing job batch? - if batch_size + data_length <= self.batch_words: - # yes => add it to the current job - job_batch.append(data) - batch_size += data_length - else: - job_no += 1 - - yield job_batch - - # add the sentence that didn't fit as the first item of a new job - job_batch, batch_size = [data], data_length - # add the last job too (may be significantly smaller than batch_words) - if job_batch: - job_no += 1 - yield job_batch - - - @cython.final cdef class CythonLineSentence: cdef FastLineSentence* _thisptr From 0081f012c42b9853ec7bcfe7a857ca8a14c3c3cf Mon Sep 17 00:00:00 2001 From: persiyanov Date: Fri, 22 Jun 2018 14:36:38 +0300 Subject: [PATCH 30/49] working code --- gensim/models/base_any2vec.py | 64 +++++++++++----------- gensim/models/word2vec.py | 66 ++++++++++++----------- gensim/scripts/benchmark_any2vec_speed.py | 2 +- 3 files changed, 69 insertions(+), 63 deletions(-) diff --git a/gensim/models/base_any2vec.py b/gensim/models/base_any2vec.py index ede6db810a..63faa34585 100644 --- a/gensim/models/base_any2vec.py +++ b/gensim/models/base_any2vec.py @@ -184,37 +184,37 @@ def _worker_loop(self, fname, progress_queue): progress_queue.put(None) logger.debug("worker exiting, processed %i jobs", jobs_processed) - def _batch_iterator(self, input_stream, cur_epoch=0, total_examples=None, total_words=None): - job_batch, batch_size = [], 0 - job_no = 0 - - for data_idx, data in enumerate(input_stream): - data_length = self._raw_word_count([data]) - - # can we fit this sentence into the existing job batch? - if batch_size + data_length <= self.batch_words: - # yes => add it to the current job - job_batch.append(data) - batch_size += data_length - else: - job_no += 1 - - yield job_batch - - # add the sentence that didn't fit as the first item of a new job - job_batch, batch_size = [data], data_length - # add the last job too (may be significantly smaller than batch_words) - if job_batch: - job_no += 1 - yield job_batch - - if job_no == 0 and self.train_count == 0: - logger.warning( - "train() called with an empty iterator (if not intended, " - "be sure to provide a corpus that offers restartable iteration = an iterable)." - ) - - logger.debug("batch iterator loop exiting, total %i jobs", job_no) + # def _batch_iterator(self, input_stream, cur_epoch=0, total_examples=None, total_words=None): + # job_batch, batch_size = [], 0 + # job_no = 0 + # + # for data_idx, data in enumerate(input_stream): + # data_length = self._raw_word_count([data]) + # + # # can we fit this sentence into the existing job batch? 
+ # if batch_size + data_length <= self.batch_words: + # # yes => add it to the current job + # job_batch.append(data) + # batch_size += data_length + # else: + # job_no += 1 + # + # yield job_batch + # + # # add the sentence that didn't fit as the first item of a new job + # job_batch, batch_size = [data], data_length + # # add the last job too (may be significantly smaller than batch_words) + # if job_batch: + # job_no += 1 + # yield job_batch + # + # if job_no == 0 and self.train_count == 0: + # logger.warning( + # "train() called with an empty iterator (if not intended, " + # "be sure to provide a corpus that offers restartable iteration = an iterable)." + # ) + # + # logger.debug("batch iterator loop exiting, total %i jobs", job_no) def _log_progress(self, progress_queue, cur_epoch, example_count, total_examples, raw_word_count, total_words, trained_word_count, elapsed): @@ -546,7 +546,7 @@ def build_vocab(self, input_streams, update=False, progress_per=10000, workers=1 start_time = time.time() total_words, corpus_count = self.vocabulary.scan_vocab( - input_streams, progress_per=progress_per, workers=workers, trim_rule=trim_rule) + input_streams, progress_per=progress_per, trim_rule=trim_rule) self.corpus_count = corpus_count report_values = self.vocabulary.prepare_vocab( self.hs, self.negative, self.wv, update=update, keep_raw_vocab=keep_raw_vocab, diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 85cade6bf3..41af223f5e 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -1193,45 +1193,51 @@ def __init__(self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=T self.raw_vocab = None self.max_final_vocab = max_final_vocab - def scan_vocab(self, input_streams, progress_per=10000, workers=1, trim_rule=None): - manager = mp.Manager() - progress_queue = manager.Queue() - pool = mp.Pool(processes=min(workers, len(input_streams))) - - results = [ - pool.apply_async(_scan_vocab_worker, - (stream, progress_queue, progress_per, self.max_vocab_size, trim_rule) - ) for stream in input_streams - ] - pool.close() + def scan_vocab(self, input_streams, progress_per=10000, trim_rule=None): + """Do an initial scan of all words appearing in sentences.""" + from itertools import chain + line_sentences = [] + for st in input_streams: + if isinstance(st, string_types): + line_sentences.append(LineSentence(st)) + else: + raise RuntimeError("error!!!!!!!!") + sentences = chain(*line_sentences) logger.info("collecting all words and their counts") - unfinished_tasks = len(results) - total_words = 0 sentence_no = -1 - while unfinished_tasks > 0: - report = progress_queue.get() - if report is None: - unfinished_tasks -= 1 - logger.info("scan vocab task finished; awaiting finish of %i more tasks", unfinished_tasks) - elif isinstance(report, string_types): - logger.warning(report) - else: - num_words, num_sentences = report - total_words += num_words - sentence_no += num_sentences + total_words = 0 + min_reduce = 1 + vocab = defaultdict(int) + checked_string_types = 0 + for sentence_no, sentence in enumerate(sentences): + if not checked_string_types: + if isinstance(sentence, string_types): + logger.warning( + "Each 'sentences' item should be a list of words (usually unicode strings). 
" + "First item here is instead plain %s.", + type(sentence) + ) + checked_string_types += 1 + if sentence_no % progress_per == 0: + logger.info( + "PROGRESS: at sentence #%i, processed %i words, keeping %i word types", + sentence_no, total_words, len(vocab) + ) + for word in sentence: + vocab[word] += 1 + total_words += len(sentence) - if sentence_no % progress_per == 0: - logger.info("PROGRESS: at sentence #%i, processed %i words", sentence_no, total_words) + if self.max_vocab_size and len(vocab) > self.max_vocab_size: + utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule) + min_reduce += 1 - self.raw_vocab = reduce(utils.merge_dicts, [res.get() for res in results]) logger.info( "collected %i word types from a corpus of %i raw words and %i sentences", - len(self.raw_vocab), total_words, sentence_no + 1 + len(vocab), total_words, sentence_no + 1 ) - corpus_count = sentence_no + 1 - + self.raw_vocab = vocab return total_words, corpus_count def sort_vocab(self, wv): diff --git a/gensim/scripts/benchmark_any2vec_speed.py b/gensim/scripts/benchmark_any2vec_speed.py index c14639a1c8..5b2bbccc48 100644 --- a/gensim/scripts/benchmark_any2vec_speed.py +++ b/gensim/scripts/benchmark_any2vec_speed.py @@ -45,7 +45,7 @@ def benchmark_model(input_streams, model, window, workers, vector_size): } else: kwargs = { - 'input_streams': [LineSentence(inp) for inp in input_streams] + 'input_streams': [inp for inp in input_streams] # hack for CythonLineSentence } kwargs['size'] = vector_size From fe662469adc4b0cf80c9ca3c75e5398bf0ee3206 Mon Sep 17 00:00:00 2001 From: persiyanov Date: Sat, 23 Jun 2018 15:24:02 +0300 Subject: [PATCH 31/49] remove build_vocab changes --- gensim/models/base_any2vec.py | 21 -------------------- gensim/models/word2vec.py | 36 ----------------------------------- 2 files changed, 57 deletions(-) diff --git a/gensim/models/base_any2vec.py b/gensim/models/base_any2vec.py index 63faa34585..babfa18241 100644 --- a/gensim/models/base_any2vec.py +++ b/gensim/models/base_any2vec.py @@ -136,27 +136,6 @@ def _check_training_sanity(self, epochs=None, total_examples=None, total_words=N """Check that the training parameters provided make sense. e.g. 
raise error if `epochs` not provided.""" raise NotImplementedError() - # def _worker_loop(self, input_stream, progress_queue): - # """Train the model, lifting lists of data from the job_queue.""" - # thread_private_mem = self._get_thread_working_mem() - # jobs_processed = 0 - # for batch in self._batch_iterator(input_stream): - # data_iterable, job_parameters = batch - # - # for callback in self.callbacks: - # callback.on_batch_begin(self) - # - # tally, raw_tally = self._do_train_job(data_iterable, job_parameters, thread_private_mem) - # - # for callback in self.callbacks: - # callback.on_batch_end(self) - # - # progress_queue.put((len(data_iterable), tally, raw_tally)) # report back progress - # jobs_processed += 1 - # - # progress_queue.put(None) - # logger.debug("worker exiting, processed %i jobs", jobs_processed) - def _worker_loop(self, fname, progress_queue): thread_private_mem = self._get_thread_working_mem() jobs_processed = 0 diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 41af223f5e..d831fcade9 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -1145,42 +1145,6 @@ def __iter__(self): i += self.max_sentence_length -def _scan_vocab_worker(stream, progress_queue, progress_per=10000, max_vocab_size=None, trim_rule=None): - """Do an initial scan of all words appearing in stream.""" - min_reduce = 1 - vocab = defaultdict(int) - checked_string_types = 0 - sentence_no = -1 - total_words = 0 - for sentence in stream: - if not checked_string_types: - if isinstance(sentence, string_types): - log_msg = "Each 'sentences' item should be a list of words (usually unicode strings). " \ - "First item here is instead plain %s." % type(sentence) - progress_queue.put(log_msg) - - checked_string_types += 1 - - for word in sentence: - vocab[word] += 1 - - if max_vocab_size and len(vocab) > max_vocab_size: - utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule) - min_reduce += 1 - - total_words += len(sentence) - sentence_no += 1 - - # if sentence_no % progress_per == 0: - # progress_queue.put((total_words, sentence_no + 1)) - # sentence_no = -1 - # total_words = 0 - - progress_queue.put((total_words, sentence_no + 1)) - progress_queue.put(None) - return vocab - - class Word2VecVocab(utils.SaveLoad): def __init__(self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=True, null_word=0, max_final_vocab=None): From 491a08743cbfd432b5e1146ec2d5b221f59b1e86 Mon Sep 17 00:00:00 2001 From: persiyanov Date: Wed, 27 Jun 2018 13:36:28 +0300 Subject: [PATCH 32/49] approaching to fully nogil cython _worker_loop --- gensim/models/linesentence.cpp | 7 +++-- gensim/models/linesentence.h | 4 ++- gensim/models/word2vec.py | 50 ++++++++++++++++++++++++-------- gensim/models/word2vec_inner.pyx | 5 ++++ 4 files changed, 50 insertions(+), 16 deletions(-) diff --git a/gensim/models/linesentence.cpp b/gensim/models/linesentence.cpp index ae7cfda533..1f5c4312f8 100644 --- a/gensim/models/linesentence.cpp +++ b/gensim/models/linesentence.cpp @@ -4,11 +4,12 @@ #include "linesentence.h" -FastLineSentence::FastLineSentence(const std::string& filename) : fs_(filename) { } +FastLineSentence::FastLineSentence(const std::string& filename) : fs_(filename), is_eof_(false) { } std::vector FastLineSentence::ReadSentence() { if (fs_.eof()) { - throw std::runtime_error("EOF occured in C++!"); + if_eof_ = true; + return {} } std::string line, word; std::getline(fs_, line); @@ -20,4 +21,4 @@ std::vector FastLineSentence::ReadSentence() { } return res; -} \ No newline at end of 
file +} diff --git a/gensim/models/linesentence.h b/gensim/models/linesentence.h index 95bb82add2..f23dad965e 100644 --- a/gensim/models/linesentence.h +++ b/gensim/models/linesentence.h @@ -10,6 +10,8 @@ class FastLineSentence { explicit FastLineSentence(const std::string& filename); std::vector ReadSentence(); + inline bool IsEof() const { return is_eof_; } private: std::ifstream fs_; -}; \ No newline at end of file + bool is_eof_; +}; diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index d831fcade9..d10cb3b07c 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -118,6 +118,8 @@ from gensim.models.keyedvectors import Vocab, Word2VecKeyedVectors from gensim.models.base_any2vec import BaseWordEmbeddingsModel +from gensim.models.word2vec_inner import CythonLineSentence + try: from queue import Queue, Empty except ImportError: @@ -534,18 +536,42 @@ def __init__(self, sentences=None, input_streams=None, size=100, alpha=0.025, wi seed=seed, hs=hs, negative=negative, cbow_mean=cbow_mean, min_alpha=min_alpha, compute_loss=compute_loss, fast_version=FAST_VERSION) - def _do_train_job(self, sentences, alpha, inits): - """ - Train a single batch of sentences. Return 2-tuple `(effective word count after - ignoring unknown words and sentence length trimming, total word count)`. - """ - work, neu1 = inits - tally = 0 - if self.sg: - tally += train_batch_sg(self, sentences, alpha, work, self.compute_loss) - else: - tally += train_batch_cbow(self, sentences, alpha, work, neu1, self.compute_loss) - return tally, self._raw_word_count(sentences) + # def _do_train_job(self, sentences, alpha, inits): + # """ + # Train a single batch of sentences. Return 2-tuple `(effective word count after + # ignoring unknown words and sentence length trimming, total word count)`. 
+ # """ + # work, neu1 = inits + # tally = train_batch_cbow(self, sentences, alpha, work, neu1, self.compute_loss) + # return tally, self._raw_word_count(sentences) + + def _worker_loop(self, fname, progress_queue): + work, neu1 = self._get_thread_working_mem() + jobs_processed = 0 + alpha = self._get_job_params(0) + input_stream = CythonLineSentence(fname) + while True: + if not input_stream.is_eof: + # Prepare batch with NO GIL + data_iterable = input_stream.next_batch() + else: + break + + for callback in self.callbacks: + callback.on_batch_begin(self) + + # No GIL (almost) (_do_train_job) + tally = train_batch_cbow(self, data_iterable, alpha, work, neu1, False) + raw_tally = self._raw_word_count(data_iterable) + + for callback in self.callbacks: + callback.on_batch_end(self) + + progress_queue.put((len(data_iterable), tally, raw_tally)) # report back progress + jobs_processed += 1 + + progress_queue.put(None) + logger.debug("worker exiting, processed %i jobs", jobs_processed) def _clear_post_train(self): """Resets certain properties of the model, post training.""" diff --git a/gensim/models/word2vec_inner.pyx b/gensim/models/word2vec_inner.pyx index c48e179497..88879525d9 100755 --- a/gensim/models/word2vec_inner.pyx +++ b/gensim/models/word2vec_inner.pyx @@ -61,6 +61,7 @@ cdef class CythonLineSentence: cdef public string source cdef public int max_sentence_length, max_words_in_batch cdef vector[string] buf_data + cdef public bool_t is_eof def __cinit__(self, source, max_sentence_length=MAX_SENTENCE_LEN): self._thisptr = new FastLineSentence(source) @@ -69,6 +70,7 @@ cdef class CythonLineSentence: self.source = source self.max_sentence_length = max_sentence_length # isn't used in this hacky prototype self.max_words_in_batch = MAX_SENTENCE_LEN + self.is_eof = False def __dealloc__(self): if self._thisptr != NULL: @@ -107,6 +109,9 @@ cdef class CythonLineSentence: # Save data which doesn't fit in batch in order to return it later. 
buf_data = data + if self._thisptr.IsEof(): + self.is_eof = True + return job_batch From 15e07ae884c2432a108241bea0f0190f0ad06298 Mon Sep 17 00:00:00 2001 From: persiyanov Date: Wed, 27 Jun 2018 13:45:04 +0300 Subject: [PATCH 33/49] wrapper fix --- gensim/models/word2vec_inner.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/gensim/models/word2vec_inner.pyx b/gensim/models/word2vec_inner.pyx index 88879525d9..fd18418114 100755 --- a/gensim/models/word2vec_inner.pyx +++ b/gensim/models/word2vec_inner.pyx @@ -53,6 +53,7 @@ cdef extern from "linesentence.h": cdef cppclass FastLineSentence: FastLineSentence(string&) except + vector[string] ReadSentence() nogil except + + bool_t IsEof() nogil @cython.final From 5cad26baa40c4a3757f7225317d1b817526c11f3 Mon Sep 17 00:00:00 2001 From: persiyanov Date: Wed, 27 Jun 2018 13:46:44 +0300 Subject: [PATCH 34/49] one more fix --- gensim/models/linesentence.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/models/linesentence.cpp b/gensim/models/linesentence.cpp index 1f5c4312f8..ff19f6fe7f 100644 --- a/gensim/models/linesentence.cpp +++ b/gensim/models/linesentence.cpp @@ -8,8 +8,8 @@ FastLineSentence::FastLineSentence(const std::string& filename) : fs_(filename), std::vector FastLineSentence::ReadSentence() { if (fs_.eof()) { - if_eof_ = true; - return {} + is_eof_ = true; + return {}; } std::string line, word; std::getline(fs_, line); From 495c4dce8c3a5f20aa8915da3a1476895b5b1742 Mon Sep 17 00:00:00 2001 From: persiyanov Date: Wed, 27 Jun 2018 14:20:22 +0300 Subject: [PATCH 35/49] more fixes --- gensim/models/base_any2vec.py | 50 ++++++++++++++++---------------- gensim/models/word2vec.py | 2 +- gensim/models/word2vec_inner.pyx | 23 ++++++++++----- 3 files changed, 41 insertions(+), 34 deletions(-) diff --git a/gensim/models/base_any2vec.py b/gensim/models/base_any2vec.py index babfa18241..b008f73234 100644 --- a/gensim/models/base_any2vec.py +++ b/gensim/models/base_any2vec.py @@ -136,32 +136,32 @@ def _check_training_sanity(self, epochs=None, total_examples=None, total_words=N """Check that the training parameters provided make sense. e.g. 
raise error if `epochs` not provided.""" raise NotImplementedError() - def _worker_loop(self, fname, progress_queue): - thread_private_mem = self._get_thread_working_mem() - jobs_processed = 0 - job_parameters = self._get_job_params(0) - input_stream = CythonLineSentence(fname) - while True: - try: - # Prepare batch with NO GIL - data_iterable = input_stream.next_batch() - except: - break - - for callback in self.callbacks: - callback.on_batch_begin(self) - + # def _worker_loop(self, fname, progress_queue): + # thread_private_mem = self._get_thread_working_mem() + # jobs_processed = 0 + # job_parameters = self._get_job_params(0) + # input_stream = CythonLineSentence(fname) + # while True: + # try: + # Prepare batch with NO GIL + # data_iterable = input_stream.next_batch() + # except: + # break + # + # for callback in self.callbacks: + # callback.on_batch_begin(self) + # # No GIL (almost) - tally, raw_tally = self._do_train_job(data_iterable, job_parameters, thread_private_mem) - - for callback in self.callbacks: - callback.on_batch_end(self) - - progress_queue.put((len(data_iterable), tally, raw_tally)) # report back progress - jobs_processed += 1 - - progress_queue.put(None) - logger.debug("worker exiting, processed %i jobs", jobs_processed) + # tally, raw_tally = self._do_train_job(data_iterable, job_parameters, thread_private_mem) + # + # for callback in self.callbacks: + # callback.on_batch_end(self) + # + # progress_queue.put((len(data_iterable), tally, raw_tally)) # report back progress + # jobs_processed += 1 + # + # progress_queue.put(None) + # logger.debug("worker exiting, processed %i jobs", jobs_processed) # def _batch_iterator(self, input_stream, cur_epoch=0, total_examples=None, total_words=None): # job_batch, batch_size = [], 0 diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index d10cb3b07c..a3dd5b0f67 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -551,7 +551,7 @@ def _worker_loop(self, fname, progress_queue): alpha = self._get_job_params(0) input_stream = CythonLineSentence(fname) while True: - if not input_stream.is_eof: + if not input_stream.is_eof(): # Prepare batch with NO GIL data_iterable = input_stream.next_batch() else: diff --git a/gensim/models/word2vec_inner.pyx b/gensim/models/word2vec_inner.pyx index fd18418114..fc2bad1c45 100755 --- a/gensim/models/word2vec_inner.pyx +++ b/gensim/models/word2vec_inner.pyx @@ -62,7 +62,6 @@ cdef class CythonLineSentence: cdef public string source cdef public int max_sentence_length, max_words_in_batch cdef vector[string] buf_data - cdef public bool_t is_eof def __cinit__(self, source, max_sentence_length=MAX_SENTENCE_LEN): self._thisptr = new FastLineSentence(source) @@ -71,12 +70,14 @@ cdef class CythonLineSentence: self.source = source self.max_sentence_length = max_sentence_length # isn't used in this hacky prototype self.max_words_in_batch = MAX_SENTENCE_LEN - self.is_eof = False def __dealloc__(self): if self._thisptr != NULL: del self._thisptr + cpdef bool_t is_eof(self) nogil: + return self._thisptr.IsEof() + cpdef vector[string] read_sentence(self) nogil except *: return self._thisptr.ReadSentence() @@ -91,6 +92,13 @@ cdef class CythonLineSentence: int batch_size = 0 int data_length = 0 + if self.is_eof() and self.buf_data.size() == 0: + return job_batch + elif self.is_eof(): + job_batch.push_back(self.buf_data) + self.buf_data.clear() + return job_batch + # Try to read data from previous calls which was not returned if self.buf_data.size() > 0: data = self.buf_data @@ 
-103,15 +111,14 @@ cdef class CythonLineSentence:
             job_batch.push_back(data)
             batch_size += data_length

-            # TODO: if it raises an exception, we will not return a batch we read up to this moment
+            if self.is_eof():
+                break
             data = self.read_sentence()
             data_length = data.size()

-        # Save data which doesn't fit in batch in order to return it later.
-        buf_data = data
-
-        if self._thisptr.IsEof():
-            self.is_eof = True
+        if not self.is_eof():
+            # Save data which doesn't fit in the batch on the attribute (not a
+            # local), so the next call can return it.
+            self.buf_data = data

         return job_batch

From 8b29df8768399024a4955796dd16942c1ce9bcd7 Mon Sep 17 00:00:00 2001
From: persiyanov
Date: Wed, 27 Jun 2018 21:03:05 +0300
Subject: [PATCH 36/49] upd

---
 gensim/models/word2vec.py        | 25 ++++---------------------
 gensim/models/word2vec_inner.pyx |  3 +--
 2 files changed, 5 insertions(+), 23 deletions(-)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index a3dd5b0f67..91722f0d4b 100755
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -139,7 +139,7 @@ logger = logging.getLogger(__name__)

 try:
-    from gensim.models.word2vec_inner import train_batch_sg, train_batch_cbow
+    from gensim.models.word2vec_inner import train_batch_sg, train_epoch_cbow
     from gensim.models.word2vec_inner import score_sentence_sg, score_sentence_cbow
     from gensim.models.word2vec_inner import FAST_VERSION, MAX_WORDS_IN_BATCH

@@ -550,28 +550,11 @@ def _worker_loop(self, fname, progress_queue):
         jobs_processed = 0
         alpha = self._get_job_params(0)
         input_stream = CythonLineSentence(fname)
-        while True:
-            if not input_stream.is_eof():
-                # Prepare batch with NO GIL
-                data_iterable = input_stream.next_batch()
-            else:
-                break
-
-            for callback in self.callbacks:
-                callback.on_batch_begin(self)
-
-            # No GIL (almost) (_do_train_job)
-            tally = train_batch_cbow(self, data_iterable, alpha, work, neu1, False)
-            raw_tally = self._raw_word_count(data_iterable)
-
-            for callback in self.callbacks:
-                callback.on_batch_end(self)
-
-            progress_queue.put((len(data_iterable), tally, raw_tally))  # report back progress
-            jobs_processed += 1
+        tally, raw_tally = train_epoch_cbow(self, input_stream, alpha, work, neu1, False)
+        progress_queue.put((0, tally, raw_tally))

         progress_queue.put(None)
-        logger.debug("worker exiting, processed %i jobs", jobs_processed)
+        # logger.debug("worker exiting, processed %i jobs", jobs_processed)

     def _clear_post_train(self):
         """Resets certain properties of the model, post training."""

diff --git a/gensim/models/word2vec_inner.pyx b/gensim/models/word2vec_inner.pyx
index fc2bad1c45..255d4dd70c 100755
--- a/gensim/models/word2vec_inner.pyx
+++ b/gensim/models/word2vec_inner.pyx
@@ -481,7 +481,7 @@ def train_batch_sg(model, sentences, alpha, _work, compute_loss):

     return effective_words

-def train_batch_cbow(model, sentences, alpha, _work, _neu1, compute_loss):
+cpdef train_epoch_cbow(model, input_stream, alpha, _work, _neu1, compute_loss):
     cdef int hs = model.hs
     cdef int negative = model.negative
     cdef int sample = (model.vocabulary.sample != 0)
@@ -583,7 +583,6 @@ cpdef train_epoch_cbow(model, input_stream, alpha, _work, _neu1, compute_loss):
             if negative:
                 next_random = fast_sentence_cbow_neg(negative, cum_table, cum_table_len, codelens, neu1, syn0, syn1neg, size, indexes, _alpha, work, i, j, k, cbow_mean, next_random, word_locks, _compute_loss, &_running_training_loss)

-    model.running_training_loss = _running_training_loss
     return effective_words

From 2119c3ad8b349af963b1df26db0c9db6eb272418 Mon Sep 17 00:00:00 2001
From: persiyanov
Date: Wed, 27 Jun 2018
21:24:55 +0300 Subject: [PATCH 37/49] try to cythonize batch preparation --- gensim/models/word2vec_inner.pyx | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/gensim/models/word2vec_inner.pyx b/gensim/models/word2vec_inner.pyx index 255d4dd70c..a5eff66cff 100755 --- a/gensim/models/word2vec_inner.pyx +++ b/gensim/models/word2vec_inner.pyx @@ -19,7 +19,8 @@ from libc.string cimport memset, strtok from libcpp.string cimport string from libcpp.vector cimport vector from libcpp cimport bool as bool_t - +from libcpp.unordered_map cimport unordered_map +from libcpp.pair import pair # scipy <= 0.15 try: @@ -518,6 +519,9 @@ cpdef train_epoch_cbow(model, input_stream, alpha, _work, _neu1, compute_loss): # for sampling (negative and frequent-word downsampling) cdef unsigned long long next_random + # for preparing batches without Python GIL + cdef unordered_map[string, pair[unsigned long long, unsigned long long]] vocab + if hs: syn1 = (np.PyArray_DATA(model.trainables.syn1)) @@ -533,22 +537,25 @@ cpdef train_epoch_cbow(model, input_stream, alpha, _work, _neu1, compute_loss): neu1 = np.PyArray_DATA(_neu1) # prepare C structures so we can go "full C" and release the Python GIL - vlookup = model.wv.vocab + + for word in model.wv.vocab: + vocab[word] = (model.wv.vocab[word].index, model.wv.vocab[word].sample_int) + + sentences = input_stream.next_batch() + + cdef pair[unsigned long long, unsigned long long] word sentence_idx[0] = 0 # indices of the first sentence always start at 0 for sent in sentences: if not sent: continue # ignore empty sentences; leave effective_sentences unchanged for token in sent: - word = vlookup[token] if token in vlookup else None - if word is None: - continue # leaving `effective_words` unchanged = shortening the sentence = expanding the window - if sample and word.sample_int < random_int32(&next_random): + if token not in vocab: + continue # leaving `effective_words` unchanged = shortening the sentence = expanding the window + word = vocab[token] + if sample and word.second < random_int32(&next_random): continue - indexes[effective_words] = word.index - if hs: - codelens[effective_words] = len(word.code) - codes[effective_words] = np.PyArray_DATA(word.code) - points[effective_words] = np.PyArray_DATA(word.point) + indexes[effective_words] = word.first + effective_words += 1 if effective_words == MAX_SENTENCE_LEN: break # TODO: log warning, tally overflow? 
@@ -583,7 +590,7 @@ cpdef train_epoch_cbow(model, input_stream, alpha, _work, _neu1, compute_loss): if negative: next_random = fast_sentence_cbow_neg(negative, cum_table, cum_table_len, codelens, neu1, syn0, syn1neg, size, indexes, _alpha, work, i, j, k, cbow_mean, next_random, word_locks, _compute_loss, &_running_training_loss) - return effective_words + return effective_words, effective_words # return properly raw_tally as a second value (not tally) # Score is only implemented for hierarchical softmax From 3506ec960805ee62e136c5c24d3b8ab92059cae9 Mon Sep 17 00:00:00 2001 From: persiyanov Date: Wed, 27 Jun 2018 21:36:48 +0300 Subject: [PATCH 38/49] it compiles --- gensim/models/word2vec_inner.pyx | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/gensim/models/word2vec_inner.pyx b/gensim/models/word2vec_inner.pyx index a5eff66cff..7dd451a9ef 100755 --- a/gensim/models/word2vec_inner.pyx +++ b/gensim/models/word2vec_inner.pyx @@ -20,7 +20,7 @@ from libcpp.string cimport string from libcpp.vector cimport vector from libcpp cimport bool as bool_t from libcpp.unordered_map cimport unordered_map -from libcpp.pair import pair +from libcpp.pair cimport pair # scipy <= 0.15 try: @@ -49,6 +49,8 @@ cdef REAL_t[EXP_TABLE_SIZE] LOG_TABLE cdef int ONE = 1 cdef REAL_t ONEF = 1.0 +ctypedef unsigned long long ULongLong + cdef extern from "linesentence.h": cdef cppclass FastLineSentence: @@ -520,7 +522,10 @@ cpdef train_epoch_cbow(model, input_stream, alpha, _work, _neu1, compute_loss): cdef unsigned long long next_random # for preparing batches without Python GIL - cdef unordered_map[string, pair[unsigned long long, unsigned long long]] vocab + cdef unordered_map[string, pair[ULongLong, ULongLong]] vocab + cdef vector[vector[string]] sentences + cdef string token + cdef vector[string] sent if hs: syn1 = (np.PyArray_DATA(model.trainables.syn1)) @@ -538,18 +543,18 @@ cpdef train_epoch_cbow(model, input_stream, alpha, _work, _neu1, compute_loss): # prepare C structures so we can go "full C" and release the Python GIL - for word in model.wv.vocab: - vocab[word] = (model.wv.vocab[word].index, model.wv.vocab[word].sample_int) + for token in model.wv.vocab: + vocab[token] = (model.wv.vocab[token].index, model.wv.vocab[token].sample_int) sentences = input_stream.next_batch() - cdef pair[unsigned long long, unsigned long long] word + cdef pair[ULongLong, ULongLong] word sentence_idx[0] = 0 # indices of the first sentence always start at 0 for sent in sentences: - if not sent: + if sent.empty(): continue # ignore empty sentences; leave effective_sentences unchanged for token in sent: - if token not in vocab: + if vocab.find(token) == vocab.end(): continue # leaving `effective_words` unchanged = shortening the sentence = expanding the window word = vocab[token] if sample and word.second < random_int32(&next_random): From 62f71ee61269459daa4fbe85bcddff990aa58401 Mon Sep 17 00:00:00 2001 From: persiyanov Date: Thu, 28 Jun 2018 12:50:26 +0300 Subject: [PATCH 39/49] prepare batch inside nogil section in a while loop --- gensim/models/word2vec_inner.pyx | 105 +++++++++++++++++-------------- 1 file changed, 59 insertions(+), 46 deletions(-) diff --git a/gensim/models/word2vec_inner.pyx b/gensim/models/word2vec_inner.pyx index 7dd451a9ef..b91687b7eb 100755 --- a/gensim/models/word2vec_inner.pyx +++ b/gensim/models/word2vec_inner.pyx @@ -546,56 +546,69 @@ cpdef train_epoch_cbow(model, input_stream, alpha, _work, _neu1, compute_loss): for token in model.wv.vocab: vocab[token] = 
(model.wv.vocab[token].index, model.wv.vocab[token].sample_int) - sentences = input_stream.next_batch() - - cdef pair[ULongLong, ULongLong] word - sentence_idx[0] = 0 # indices of the first sentence always start at 0 - for sent in sentences: - if sent.empty(): - continue # ignore empty sentences; leave effective_sentences unchanged - for token in sent: - if vocab.find(token) == vocab.end(): - continue # leaving `effective_words` unchanged = shortening the sentence = expanding the window - word = vocab[token] - if sample and word.second < random_int32(&next_random): - continue - indexes[effective_words] = word.first - - effective_words += 1 - if effective_words == MAX_SENTENCE_LEN: - break # TODO: log warning, tally overflow? - - # keep track of which words go into which sentence, so we don't train - # across sentence boundaries. - # indices of sentence number X are between idx_end: + k = idx_end + if hs: + fast_sentence_cbow_hs(points[i], codes[i], codelens, neu1, syn0, syn1, size, indexes, _alpha, work, i, j, k, cbow_mean, word_locks, _compute_loss, &_running_training_loss) + if negative: + next_random = fast_sentence_cbow_neg(negative, cum_table, cum_table_len, codelens, neu1, syn0, syn1neg, size, indexes, _alpha, work, i, j, k, cbow_mean, next_random, word_locks, _compute_loss, &_running_training_loss) - # precompute "reduced window" offsets in a single randint() call - for i, item in enumerate(model.random.randint(0, window, effective_words)): - reduced_windows[i] = item + total_effective_sentences += effective_sentences + total_effective_words += total_effective_words - # release GIL & train on all sentences - with nogil: - for sent_idx in range(effective_sentences): - idx_start = sentence_idx[sent_idx] - idx_end = sentence_idx[sent_idx + 1] - for i in range(idx_start, idx_end): - j = i - window + reduced_windows[i] - if j < idx_start: - j = idx_start - k = i + window + 1 - reduced_windows[i] - if k > idx_end: - k = idx_end - if hs: - fast_sentence_cbow_hs(points[i], codes[i], codelens, neu1, syn0, syn1, size, indexes, _alpha, work, i, j, k, cbow_mean, word_locks, _compute_loss, &_running_training_loss) - if negative: - next_random = fast_sentence_cbow_neg(negative, cum_table, cum_table_len, codelens, neu1, syn0, syn1neg, size, indexes, _alpha, work, i, j, k, cbow_mean, next_random, word_locks, _compute_loss, &_running_training_loss) - return effective_words, effective_words # return properly raw_tally as a second value (not tally) + return total_effective_words, total_words # return properly raw_tally as a second value (not tally) # Score is only implemented for hierarchical softmax From 8924af5ecd24b6c47c1413d20730dd09485ee6aa Mon Sep 17 00:00:00 2001 From: persiyanov Date: Thu, 28 Jun 2018 13:25:27 +0300 Subject: [PATCH 40/49] compiles --- gensim/models/word2vec_inner.pyx | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/gensim/models/word2vec_inner.pyx b/gensim/models/word2vec_inner.pyx index b91687b7eb..f49dea8752 100755 --- a/gensim/models/word2vec_inner.pyx +++ b/gensim/models/word2vec_inner.pyx @@ -84,11 +84,11 @@ cdef class CythonLineSentence: cpdef vector[string] read_sentence(self) nogil except *: return self._thisptr.ReadSentence() - cpdef vector[vector[string]] next_batch(self) except *: - with nogil: - return self._next_batch() + # cpdef vector[vector[string]] next_batch(self) except *: + # with nogil: + # return self._next_batch() - cpdef vector[vector[string]] _next_batch(self) nogil except *: + cpdef vector[vector[string]] 
next_batch(self) nogil except *: cdef: vector[vector[string]] job_batch vector[string] data @@ -484,11 +484,12 @@ def train_batch_sg(model, sentences, alpha, _work, compute_loss): return effective_words -cpdef train_epoch_cbow(model, input_stream, alpha, _work, _neu1, compute_loss): +cpdef train_epoch_cbow(model, _input_stream, alpha, _work, _neu1, compute_loss): cdef int hs = model.hs cdef int negative = model.negative cdef int sample = (model.vocabulary.sample != 0) cdef int cbow_mean = model.cbow_mean + cdef CythonLineSentence input_stream = _input_stream cdef int _compute_loss = (1 if compute_loss == True else 0) cdef REAL_t _running_training_loss = model.running_training_loss @@ -548,6 +549,8 @@ cpdef train_epoch_cbow(model, input_stream, alpha, _work, _neu1, compute_loss): # release GIL & train on all sentences cdef int total_effective_words = 0, total_effective_sentences = 0, total_words = 0 + cdef pair[ULongLong, ULongLong] word + cdef ULongLong random_number with nogil: while not input_stream.is_eof(): @@ -555,8 +558,6 @@ cpdef train_epoch_cbow(model, input_stream, alpha, _work, _neu1, compute_loss): effective_words = 0 sentences = input_stream.next_batch() - - cdef pair[ULongLong, ULongLong] word sentence_idx[0] = 0 # indices of the first sentence always start at 0 for sent in sentences: total_words += sent.size() @@ -585,9 +586,8 @@ cpdef train_epoch_cbow(model, input_stream, alpha, _work, _neu1, compute_loss): break # TODO: log warning, tally overflow? # precompute "reduced window" offsets in a single randint() call - for i, item in enumerate(model.random.randint(0, window, effective_words)): - reduced_windows[i] = item - + for i in range(effective_words): + reduced_windows[i] = random_int32(&next_random) % window for sent_idx in range(effective_sentences): idx_start = sentence_idx[sent_idx] From 53fedfa9d3e593dd7d5c73d6ae71793972192eaf Mon Sep 17 00:00:00 2001 From: persiyanov Date: Fri, 29 Jun 2018 12:51:04 +0300 Subject: [PATCH 41/49] some bugfixes --- gensim/models/word2vec_inner.pyx | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/gensim/models/word2vec_inner.pyx b/gensim/models/word2vec_inner.pyx index f49dea8752..1e01466d8e 100755 --- a/gensim/models/word2vec_inner.pyx +++ b/gensim/models/word2vec_inner.pyx @@ -21,6 +21,7 @@ from libcpp.vector cimport vector from libcpp cimport bool as bool_t from libcpp.unordered_map cimport unordered_map from libcpp.pair cimport pair +from libc.stdio cimport printf # scipy <= 0.15 try: @@ -544,8 +545,9 @@ cpdef train_epoch_cbow(model, _input_stream, alpha, _work, _neu1, compute_loss): # prepare C structures so we can go "full C" and release the Python GIL - for token in model.wv.vocab: - vocab[token] = (model.wv.vocab[token].index, model.wv.vocab[token].sample_int) + for py_token in model.wv.vocab: + token = py_token.encode('utf8') + vocab[token] = (model.wv.vocab[py_token].index, model.wv.vocab[py_token].sample_int) # release GIL & train on all sentences cdef int total_effective_words = 0, total_effective_sentences = 0, total_words = 0 @@ -605,7 +607,7 @@ cpdef train_epoch_cbow(model, _input_stream, alpha, _work, _neu1, compute_loss): next_random = fast_sentence_cbow_neg(negative, cum_table, cum_table_len, codelens, neu1, syn0, syn1neg, size, indexes, _alpha, work, i, j, k, cbow_mean, next_random, word_locks, _compute_loss, &_running_training_loss) total_effective_sentences += effective_sentences - total_effective_words += total_effective_words + total_effective_words += effective_words return 
total_effective_words, total_words # return properly raw_tally as a second value (not tally) From c679bc6942a8ab3db3c0dbb026863e046c297164 Mon Sep 17 00:00:00 2001 From: persiyanov Date: Fri, 29 Jun 2018 13:51:55 +0300 Subject: [PATCH 42/49] add cpu_distribution script --- gensim/models/base_any2vec.py | 1 - gensim/scripts/cpu_distribution.py | 57 ++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 1 deletion(-) create mode 100644 gensim/scripts/cpu_distribution.py diff --git a/gensim/models/base_any2vec.py b/gensim/models/base_any2vec.py index b008f73234..1aaa6a7e94 100644 --- a/gensim/models/base_any2vec.py +++ b/gensim/models/base_any2vec.py @@ -10,7 +10,6 @@ import logging from timeit import default_timer import threading -import multiprocessing as mp from six.moves import xrange from six import itervalues from gensim import matutils diff --git a/gensim/scripts/cpu_distribution.py b/gensim/scripts/cpu_distribution.py new file mode 100644 index 0000000000..6d2ecfa8dc --- /dev/null +++ b/gensim/scripts/cpu_distribution.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python + +# Copyright (c) 2009, Giampaolo Rodola'. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + + +from __future__ import print_function +import collections +import os +import sys +import time +import numpy as np + +import psutil + + +if not hasattr(psutil.Process, "cpu_num"): + sys.exit("platform not supported") + + +def clean_screen(): + if psutil.POSIX: + os.system('clear') + else: + os.system('cls') + + +def main(): + loads = [] + while True: + # header + clean_screen() + cpus_percent = psutil.cpu_percent(percpu=True) + loads.append(sum(cpus_percent)) + + perc25 = np.percentile(loads, 25) + perc50 = np.median(loads) + perc75 = np.percentile(loads, 75) + perc90 = np.percentile(loads, 90) + perc95 = np.percentile(loads, 95) + perc99 = np.percentile(loads, 99) + avg = np.mean(loads) + + print("25% perc : {:.2f}".format(perc25)) + print("50% perc : {:.2f}".format(perc50)) + print("75% perc : {:.2f}".format(perc75)) + print("90% perc : {:.2f}".format(perc90)) + print("95% perc : {:.2f}".format(perc95)) + print("99% perc : {:.2f}".format(perc99)) + print("avg perc : {:.2f}".format(avg)) + + time.sleep(1) + + +if __name__ == '__main__': + main() \ No newline at end of file From 921ff3895aa48ea32c35bf4d0eb7ea21c4e29227 Mon Sep 17 00:00:00 2001 From: persiyanov Date: Wed, 4 Jul 2018 16:02:47 +0300 Subject: [PATCH 43/49] accept CythonLineSentence into _worker_loop, not filename --- gensim/models/base_any2vec.py | 61 ----------------------- gensim/models/word2vec.py | 4 +- gensim/scripts/benchmark_any2vec_speed.py | 3 +- 3 files changed, 3 insertions(+), 65 deletions(-) diff --git a/gensim/models/base_any2vec.py b/gensim/models/base_any2vec.py index 1aaa6a7e94..f732c5f031 100644 --- a/gensim/models/base_any2vec.py +++ b/gensim/models/base_any2vec.py @@ -21,8 +21,6 @@ import psutil import time -from gensim.models.word2vec_inner import CythonLineSentence - try: from queue import Queue except ImportError: @@ -135,65 +133,6 @@ def _check_training_sanity(self, epochs=None, total_examples=None, total_words=N """Check that the training parameters provided make sense. e.g. 
raise error if `epochs` not provided.""" raise NotImplementedError() - # def _worker_loop(self, fname, progress_queue): - # thread_private_mem = self._get_thread_working_mem() - # jobs_processed = 0 - # job_parameters = self._get_job_params(0) - # input_stream = CythonLineSentence(fname) - # while True: - # try: - # Prepare batch with NO GIL - # data_iterable = input_stream.next_batch() - # except: - # break - # - # for callback in self.callbacks: - # callback.on_batch_begin(self) - # - # No GIL (almost) - # tally, raw_tally = self._do_train_job(data_iterable, job_parameters, thread_private_mem) - # - # for callback in self.callbacks: - # callback.on_batch_end(self) - # - # progress_queue.put((len(data_iterable), tally, raw_tally)) # report back progress - # jobs_processed += 1 - # - # progress_queue.put(None) - # logger.debug("worker exiting, processed %i jobs", jobs_processed) - - # def _batch_iterator(self, input_stream, cur_epoch=0, total_examples=None, total_words=None): - # job_batch, batch_size = [], 0 - # job_no = 0 - # - # for data_idx, data in enumerate(input_stream): - # data_length = self._raw_word_count([data]) - # - # # can we fit this sentence into the existing job batch? - # if batch_size + data_length <= self.batch_words: - # # yes => add it to the current job - # job_batch.append(data) - # batch_size += data_length - # else: - # job_no += 1 - # - # yield job_batch - # - # # add the sentence that didn't fit as the first item of a new job - # job_batch, batch_size = [data], data_length - # # add the last job too (may be significantly smaller than batch_words) - # if job_batch: - # job_no += 1 - # yield job_batch - # - # if job_no == 0 and self.train_count == 0: - # logger.warning( - # "train() called with an empty iterator (if not intended, " - # "be sure to provide a corpus that offers restartable iteration = an iterable)." 
- # ) - # - # logger.debug("batch iterator loop exiting, total %i jobs", job_no) - def _log_progress(self, progress_queue, cur_epoch, example_count, total_examples, raw_word_count, total_words, trained_word_count, elapsed): raise NotImplementedError() diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 91722f0d4b..ba2963a009 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -118,7 +118,6 @@ from gensim.models.keyedvectors import Vocab, Word2VecKeyedVectors from gensim.models.base_any2vec import BaseWordEmbeddingsModel -from gensim.models.word2vec_inner import CythonLineSentence try: from queue import Queue, Empty @@ -545,11 +544,10 @@ def __init__(self, sentences=None, input_streams=None, size=100, alpha=0.025, wi # tally = train_batch_cbow(self, sentences, alpha, work, neu1, self.compute_loss) # return tally, self._raw_word_count(sentences) - def _worker_loop(self, fname, progress_queue): + def _worker_loop(self, input_stream, progress_queue): work, neu1 = self._get_thread_working_mem() jobs_processed = 0 alpha = self._get_job_params(0) - input_stream = CythonLineSentence(fname) tally, raw_tally = train_epoch_cbow(self, input_stream, alpha, work, neu1, False) progress_queue.put((0, tally, raw_tally)) diff --git a/gensim/scripts/benchmark_any2vec_speed.py b/gensim/scripts/benchmark_any2vec_speed.py index 5b2bbccc48..bdf11689fa 100644 --- a/gensim/scripts/benchmark_any2vec_speed.py +++ b/gensim/scripts/benchmark_any2vec_speed.py @@ -14,6 +14,7 @@ from gensim.models.word2vec import Word2Vec from gensim.models.doc2vec import Doc2Vec, TaggedLineDocument from gensim.models.word2vec import LineSentence +from gensim.models.word2vec_inner import CythonLineSentence logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) @@ -45,7 +46,7 @@ def benchmark_model(input_streams, model, window, workers, vector_size): } else: kwargs = { - 'input_streams': [inp for inp in input_streams] # hack for CythonLineSentence + 'input_streams': [CythonLineSentence(inp) for inp in input_streams] } kwargs['size'] = vector_size From 9e4ed0e0564e9cce5b93a433ee2e47a8fb8b7edc Mon Sep 17 00:00:00 2001 From: persiyanov Date: Wed, 4 Jul 2018 18:34:09 +0300 Subject: [PATCH 44/49] make CythonLineSentence iterable --- gensim/models/word2vec.py | 9 +-------- gensim/models/word2vec_inner.pyx | 10 +++++++--- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index ba2963a009..d00589c189 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -1166,14 +1166,7 @@ def __init__(self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=T def scan_vocab(self, input_streams, progress_per=10000, trim_rule=None): """Do an initial scan of all words appearing in sentences.""" - from itertools import chain - line_sentences = [] - for st in input_streams: - if isinstance(st, string_types): - line_sentences.append(LineSentence(st)) - else: - raise RuntimeError("error!!!!!!!!") - sentences = chain(*line_sentences) + sentences = itertools.chain(*input_streams) logger.info("collecting all words and their counts") sentence_no = -1 diff --git a/gensim/models/word2vec_inner.pyx b/gensim/models/word2vec_inner.pyx index 1e01466d8e..b3b0691801 100755 --- a/gensim/models/word2vec_inner.pyx +++ b/gensim/models/word2vec_inner.pyx @@ -85,9 +85,13 @@ cdef class CythonLineSentence: cpdef vector[string] read_sentence(self) nogil except *: return self._thisptr.ReadSentence() - # cpdef 
vector[vector[string]] next_batch(self) except *: - # with nogil: - # return self._next_batch() + cpdef vector[string] read_sentence_gil(self): + return self._thisptr.ReadSentence() + + def __iter__(self): + while not self.is_eof(): + sent = self.read_sentence_gil() + yield sent cpdef vector[vector[string]] next_batch(self) nogil except *: cdef: From f9ea23b4f08619856e5a2c273ba3523ca153a08e Mon Sep 17 00:00:00 2001 From: persiyanov Date: Wed, 4 Jul 2018 20:07:12 +0300 Subject: [PATCH 45/49] fix --- gensim/models/linesentence.cpp | 2 +- gensim/models/linesentence.h | 2 ++ gensim/models/word2vec_inner.pyx | 15 ++++++++++----- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/gensim/models/linesentence.cpp b/gensim/models/linesentence.cpp index ff19f6fe7f..fcc23713f4 100644 --- a/gensim/models/linesentence.cpp +++ b/gensim/models/linesentence.cpp @@ -4,7 +4,7 @@ #include "linesentence.h" -FastLineSentence::FastLineSentence(const std::string& filename) : fs_(filename), is_eof_(false) { } +FastLineSentence::FastLineSentence(const std::string& filename) : filename_(filename), fs_(filename), is_eof_(false) { } std::vector FastLineSentence::ReadSentence() { if (fs_.eof()) { diff --git a/gensim/models/linesentence.h b/gensim/models/linesentence.h index f23dad965e..da4579161d 100644 --- a/gensim/models/linesentence.h +++ b/gensim/models/linesentence.h @@ -11,7 +11,9 @@ class FastLineSentence { std::vector ReadSentence(); inline bool IsEof() const { return is_eof_; } + inline void Reset() { fs_.close(); fs_ = std::ifstream(filename_); is_eof_ = false; } private: + std::string filename_; std::ifstream fs_; bool is_eof_; }; diff --git a/gensim/models/word2vec_inner.pyx b/gensim/models/word2vec_inner.pyx index b3b0691801..b94c88bb71 100755 --- a/gensim/models/word2vec_inner.pyx +++ b/gensim/models/word2vec_inner.pyx @@ -58,6 +58,7 @@ cdef extern from "linesentence.h": FastLineSentence(string&) except + vector[string] ReadSentence() nogil except + bool_t IsEof() nogil + void Reset() nogil @cython.final @@ -85,12 +86,13 @@ cdef class CythonLineSentence: cpdef vector[string] read_sentence(self) nogil except *: return self._thisptr.ReadSentence() - cpdef vector[string] read_sentence_gil(self): - return self._thisptr.ReadSentence() + cpdef void reset(self) nogil: + self._thisptr.Reset() def __iter__(self): + self.reset() while not self.is_eof(): - sent = self.read_sentence_gil() + sent = self.read_sentence() yield sent cpdef vector[vector[string]] next_batch(self) nogil except *: @@ -548,9 +550,11 @@ cpdef train_epoch_cbow(model, _input_stream, alpha, _work, _neu1, compute_loss): neu1 = np.PyArray_DATA(_neu1) # prepare C structures so we can go "full C" and release the Python GIL - for py_token in model.wv.vocab: - token = py_token.encode('utf8') + try: + token = py_token.encode('utf8') + except: + token = py_token vocab[token] = (model.wv.vocab[py_token].index, model.wv.vocab[py_token].sample_int) # release GIL & train on all sentences @@ -559,6 +563,7 @@ cpdef train_epoch_cbow(model, _input_stream, alpha, _work, _neu1, compute_loss): cdef ULongLong random_number with nogil: + input_stream.reset() while not input_stream.is_eof(): effective_sentences = 0 effective_words = 0 From cb8bb7105a734ec0d40e96681820b580cbf5aa56 Mon Sep 17 00:00:00 2001 From: persiyanov Date: Thu, 5 Jul 2018 17:36:19 +0300 Subject: [PATCH 46/49] python iterators without gil --- gensim/models/linesentence.cpp | 1 + gensim/models/linesentence.h | 1 + gensim/models/word2vec_inner.pyx | 156 +++++++++++++++++++++++++++++++ 3 
files changed, 158 insertions(+) diff --git a/gensim/models/linesentence.cpp b/gensim/models/linesentence.cpp index fcc23713f4..4f003345bc 100644 --- a/gensim/models/linesentence.cpp +++ b/gensim/models/linesentence.cpp @@ -4,6 +4,7 @@ #include "linesentence.h" +FastLineSentence::FastLineSentence() : is_eof_(false) { } FastLineSentence::FastLineSentence(const std::string& filename) : filename_(filename), fs_(filename), is_eof_(false) { } std::vector FastLineSentence::ReadSentence() { diff --git a/gensim/models/linesentence.h b/gensim/models/linesentence.h index da4579161d..625220d032 100644 --- a/gensim/models/linesentence.h +++ b/gensim/models/linesentence.h @@ -7,6 +7,7 @@ class FastLineSentence { public: + explicit FastLineSentence(); explicit FastLineSentence(const std::string& filename); std::vector ReadSentence(); diff --git a/gensim/models/word2vec_inner.pyx b/gensim/models/word2vec_inner.pyx index b94c88bb71..983ff39491 100755 --- a/gensim/models/word2vec_inner.pyx +++ b/gensim/models/word2vec_inner.pyx @@ -622,6 +622,162 @@ cpdef train_epoch_cbow(model, _input_stream, alpha, _work, _neu1, compute_loss): return total_effective_words, total_words # return properly raw_tally as a second value (not tally) +cdef vector[vector[string]] iterate_batches_from_pystream(input_stream): + cdef vector[vector[string]] job_batch + cdef vector[string] data + + cdef int batch_size = 0 + cdef int data_length = 0 + + for data in input_stream: + data_length = data.size() + + # can we fit this sentence into the existing job batch? + if batch_size + data_length <= MAX_SENTENCE_LEN: + # yes => add it to the current job + job_batch.push_back(data) + batch_size += data_length + else: + yield job_batch + + job_batch.clear() + + job_batch.push_back(data) + batch_size = data_length + + # add the last job too (may be significantly smaller than batch_words) + if not job_batch.empty(): + yield job_batch + + +cpdef train_epoch_cbow_pystream(model, input_stream, alpha, _work, _neu1, compute_loss): + cdef int hs = model.hs + cdef int negative = model.negative + cdef int sample = (model.vocabulary.sample != 0) + cdef int cbow_mean = model.cbow_mean + + cdef int _compute_loss = (1 if compute_loss == True else 0) + cdef REAL_t _running_training_loss = model.running_training_loss + + cdef REAL_t *syn0 = (np.PyArray_DATA(model.wv.vectors)) + cdef REAL_t *word_locks = (np.PyArray_DATA(model.trainables.vectors_lockf)) + cdef REAL_t *work + cdef REAL_t _alpha = alpha + cdef int size = model.wv.vector_size + + cdef int codelens[MAX_SENTENCE_LEN] + cdef np.uint32_t indexes[MAX_SENTENCE_LEN] + cdef np.uint32_t reduced_windows[MAX_SENTENCE_LEN] + cdef int sentence_idx[MAX_SENTENCE_LEN + 1] + cdef int window = model.window + + cdef int i, j, k + cdef int effective_words = 0, effective_sentences = 0 + cdef int sent_idx, idx_start, idx_end + + # For hierarchical softmax + cdef REAL_t *syn1 + cdef np.uint32_t *points[MAX_SENTENCE_LEN] + cdef np.uint8_t *codes[MAX_SENTENCE_LEN] + + # For negative sampling + cdef REAL_t *syn1neg + cdef np.uint32_t *cum_table + cdef unsigned long long cum_table_len + # for sampling (negative and frequent-word downsampling) + cdef unsigned long long next_random + + # for preparing batches without Python GIL + cdef unordered_map[string, pair[ULongLong, ULongLong]] vocab + cdef vector[vector[string]] sentences + cdef string token + cdef vector[string] sent + + if hs: + syn1 = (np.PyArray_DATA(model.trainables.syn1)) + + if negative: + syn1neg = (np.PyArray_DATA(model.trainables.syn1neg)) + cum_table = 
(np.PyArray_DATA(model.vocabulary.cum_table)) + cum_table_len = len(model.vocabulary.cum_table) + if negative or sample: + next_random = (2**24) * model.random.randint(0, 2**24) + model.random.randint(0, 2**24) + + # convert Python structures to primitive types, so we can release the GIL + work = np.PyArray_DATA(_work) + neu1 = np.PyArray_DATA(_neu1) + + # prepare C structures so we can go "full C" and release the Python GIL + for py_token in model.wv.vocab: + try: + token = py_token.encode('utf8') + except: + token = py_token + vocab[token] = (model.wv.vocab[py_token].index, model.wv.vocab[py_token].sample_int) + + # release GIL & train on all sentences + cdef int total_effective_words = 0, total_effective_sentences = 0, total_words = 0 + cdef pair[ULongLong, ULongLong] word + cdef ULongLong random_number + + while sentences in iterate_batches_from_pystream(input_stream): + with nogil: + effective_sentences = 0 + effective_words = 0 + + sentence_idx[0] = 0 # indices of the first sentence always start at 0 + for sent in sentences: + total_words += sent.size() + + if sent.empty(): + continue # ignore empty sentences; leave effective_sentences unchanged + for token in sent: + if vocab.find(token) == vocab.end(): + continue # leaving `effective_words` unchanged = shortening the sentence = expanding the window + word = vocab[token] + if sample and word.second < random_int32(&next_random): + continue + indexes[effective_words] = word.first + + effective_words += 1 + if effective_words == MAX_SENTENCE_LEN: + break # TODO: log warning, tally overflow? + + # keep track of which words go into which sentence, so we don't train + # across sentence boundaries. + # indices of sentence number X are between idx_end: + k = idx_end + if hs: + fast_sentence_cbow_hs(points[i], codes[i], codelens, neu1, syn0, syn1, size, indexes, _alpha, work, i, j, k, cbow_mean, word_locks, _compute_loss, &_running_training_loss) + if negative: + next_random = fast_sentence_cbow_neg(negative, cum_table, cum_table_len, codelens, neu1, syn0, syn1neg, size, indexes, _alpha, work, i, j, k, cbow_mean, next_random, word_locks, _compute_loss, &_running_training_loss) + + total_effective_sentences += effective_sentences + total_effective_words += effective_words + + + return total_effective_words, total_words # return properly raw_tally as a second value (not tally) + + # Score is only implemented for hierarchical softmax def score_sentence_sg(model, sentence, _work): From 6162b502186ad6fb9bfd968cf1b798fbf41abd16 Mon Sep 17 00:00:00 2001 From: persiyanov Date: Thu, 5 Jul 2018 17:39:54 +0300 Subject: [PATCH 47/49] fix --- gensim/models/word2vec_inner.pyx | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/gensim/models/word2vec_inner.pyx b/gensim/models/word2vec_inner.pyx index 983ff39491..2940b04d24 100755 --- a/gensim/models/word2vec_inner.pyx +++ b/gensim/models/word2vec_inner.pyx @@ -622,31 +622,29 @@ cpdef train_epoch_cbow(model, _input_stream, alpha, _work, _neu1, compute_loss): return total_effective_words, total_words # return properly raw_tally as a second value (not tally) -cdef vector[vector[string]] iterate_batches_from_pystream(input_stream): - cdef vector[vector[string]] job_batch - cdef vector[string] data - - cdef int batch_size = 0 - cdef int data_length = 0 +def iterate_batches_from_pystream(input_stream): + job_batch = [] + data = None + batch_size = 0 + data_length = 0 for data in input_stream: - data_length = data.size() + data_length = len(data) # can we fit this sentence 
into the existing job batch? if batch_size + data_length <= MAX_SENTENCE_LEN: # yes => add it to the current job - job_batch.push_back(data) + job_batch.append(data) batch_size += data_length else: yield job_batch - job_batch.clear() + job_batch = [data] - job_batch.push_back(data) batch_size = data_length # add the last job too (may be significantly smaller than batch_words) - if not job_batch.empty(): + if job_batch: yield job_batch @@ -720,7 +718,7 @@ cpdef train_epoch_cbow_pystream(model, input_stream, alpha, _work, _neu1, comput cdef pair[ULongLong, ULongLong] word cdef ULongLong random_number - while sentences in iterate_batches_from_pystream(input_stream): + for sentences in iterate_batches_from_pystream(input_stream): with nogil: effective_sentences = 0 effective_words = 0 From c14fca196910d1075326941cbf4d470bc960a1ad Mon Sep 17 00:00:00 2001 From: persiyanov Date: Thu, 5 Jul 2018 19:06:21 +0300 Subject: [PATCH 48/49] fixes --- gensim/models/word2vec.py | 4 +- gensim/models/word2vec_inner.pyx | 1 + gensim/scripts/benchmark_any2vec_speed.py | 2 +- gensim/scripts/benchmark_line_sentence.py | 37 ++++++++++++ gensim/scripts/benchmark_vocab.py | 69 +++++++++++++++++++++++ 5 files changed, 110 insertions(+), 3 deletions(-) create mode 100644 gensim/scripts/benchmark_line_sentence.py create mode 100644 gensim/scripts/benchmark_vocab.py diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index d00589c189..e5021f7e36 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -138,7 +138,7 @@ logger = logging.getLogger(__name__) try: - from gensim.models.word2vec_inner import train_batch_sg, train_epoch_cbow + from gensim.models.word2vec_inner import train_batch_sg, train_epoch_cbow, train_epoch_cbow_pystream from gensim.models.word2vec_inner import score_sentence_sg, score_sentence_cbow from gensim.models.word2vec_inner import FAST_VERSION, MAX_WORDS_IN_BATCH @@ -549,7 +549,7 @@ def _worker_loop(self, input_stream, progress_queue): jobs_processed = 0 alpha = self._get_job_params(0) - tally, raw_tally = train_epoch_cbow(self, input_stream, alpha, work, neu1, False) + tally, raw_tally = train_epoch_cbow_pystream(self, input_stream, alpha, work, neu1, False) progress_queue.put((0, tally, raw_tally)) progress_queue.put(None) # logger.debug("worker exiting, processed %i jobs", jobs_processed) diff --git a/gensim/models/word2vec_inner.pyx b/gensim/models/word2vec_inner.pyx index 2940b04d24..2c0daf2b5b 100755 --- a/gensim/models/word2vec_inner.pyx +++ b/gensim/models/word2vec_inner.pyx @@ -629,6 +629,7 @@ def iterate_batches_from_pystream(input_stream): data_length = 0 for data in input_stream: + data = [x.encode('utf8') for x in data] data_length = len(data) # can we fit this sentence into the existing job batch? 
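The greedy packing rule implemented by both CythonLineSentence.next_batch() and iterate_batches_from_pystream() is easiest to see in isolation. The following standalone Python sketch is illustrative only (it is not part of any patch in this series; the function name and the toy budget of 6 words stand in for MAX_SENTENCE_LEN): sentences are appended while they fit the word budget, and the first sentence that would overflow it starts the next batch.

    def iterate_batches(sentences, max_words=6):
        batch, size = [], 0
        for sent in sentences:
            if size + len(sent) <= max_words:
                # fits => extend the current batch
                batch.append(sent)
                size += len(sent)
            else:
                # would overflow => emit the batch, carry the sentence over
                yield batch
                batch, size = [sent], len(sent)
        if batch:  # flush the final, possibly smaller, batch
            yield batch

    print(list(iterate_batches([['a', 'b'], ['c', 'd', 'e'], ['f', 'g'], ['h']])))
    # -> [[['a', 'b'], ['c', 'd', 'e']], [['f', 'g'], ['h']]]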
diff --git a/gensim/scripts/benchmark_any2vec_speed.py b/gensim/scripts/benchmark_any2vec_speed.py
index bdf11689fa..83f12311ba 100644
--- a/gensim/scripts/benchmark_any2vec_speed.py
+++ b/gensim/scripts/benchmark_any2vec_speed.py
@@ -46,7 +46,7 @@ def benchmark_model(input_streams, model, window, workers, vector_size):
         }
     else:
         kwargs = {
-            'input_streams': [CythonLineSentence(inp) for inp in input_streams]
+            'input_streams': [LineSentence(inp) for inp in input_streams]
         }

     kwargs['size'] = vector_size

diff --git a/gensim/scripts/benchmark_line_sentence.py b/gensim/scripts/benchmark_line_sentence.py
new file mode 100644
index 0000000000..736b279e5a
--- /dev/null
+++ b/gensim/scripts/benchmark_line_sentence.py
@@ -0,0 +1,37 @@
+from __future__ import unicode_literals
+from __future__ import print_function
+
+import logging
+import argparse
+import time
+import os
+
+from gensim.models.word2vec import LineSentence
+
+
+logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
+
+logger = logging.getLogger(__name__)
+
+
+def do_benchmark(input_):
+    sentences = LineSentence(input_)
+
+    start_time = time.time()
+    for _ in sentences:
+        pass
+    end_time = time.time()
+
+    logger.info('Finished benchmarking. Time elapsed: {:.2f} s.'.format(end_time - start_time))
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='GSoC Multistream API: benchmark raw LineSentence '
+                                                 'iteration speed')
+    parser.add_argument('--input', type=str, help='Input text file to iterate over.')
+
+    args = parser.parse_args()
+
+    input_ = os.path.expanduser(args.input)
+
+    do_benchmark(input_)

diff --git a/gensim/scripts/benchmark_vocab.py b/gensim/scripts/benchmark_vocab.py
new file mode 100644
index 0000000000..37783a27b5
--- /dev/null
+++ b/gensim/scripts/benchmark_vocab.py
@@ -0,0 +1,69 @@
+from __future__ import unicode_literals
+from __future__ import print_function
+
+import logging
+import argparse
+import time
+import os
+import glob
+import itertools
+
+from gensim.models.word2vec import Word2Vec, LineSentence
+from gensim.models.doc2vec import Doc2Vec, TaggedLineDocument
+from gensim.models.fasttext import FastText
+from gensim.parsing.preprocessing import preprocess_string
+
+
+logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
+
+logger = logging.getLogger(__name__)
+
+
+class MyLineSentence:
+    def __init__(self, filename):
+        self.linesentence = LineSentence(filename)
+
+    def __iter__(self):
+        for sent in self.linesentence:
+            yield preprocess_string(' '.join(sent))
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='GSoC Multistream API: evaluate vocab-building performance '
+                                                 'for word2vec, doc2vec and fasttext')
+    parser.add_argument('--input', type=str, help='Input file or glob pattern for multistream.')
+    parser.add_argument('--size', type=int, default=300)
+    parser.add_argument('--workers-grid', nargs='+', type=int, default=[1, 2, 3, 4, 5, 8, 10])
+    parser.add_argument('--model', type=str, default='word2vec')
+    parser.add_argument('--label', type=str, default='untitled')
+
+    args = parser.parse_args()
+
+    input_ = os.path.expanduser(args.input)
+    input_files = glob.glob(input_)
+    logger.info('Glob found {} input files. List: {}'.format(len(input_files), input_files))
+
+    for workers in args.workers_grid:
+        if args.model == 'word2vec':
+            input_streams = [MyLineSentence(_) for _ in input_files]
+            model = Word2Vec()
+        elif args.model == 'doc2vec':
+            input_streams = [TaggedLineDocument(_) for _ in input_files]
+            model = Doc2Vec()
+        elif args.model == 'fasttext':
+            input_streams = [LineSentence(_) for _ in input_files]
+            model = FastText()
+        else:
+            raise NotImplementedError("Model '{}' is not supported".format(args.model))
+
+        if workers == 1:
+            sentences = itertools.chain(*input_streams)
+            input_streams = None
+        else:
+            sentences = None
+
+        logger.info('Start building vocab with model={}, workers={}'.format(args.model, workers))
+        start_time = time.time()
+        model.build_vocab(sentences=sentences, input_streams=input_streams, workers=workers)
+        end_time = time.time()
+        logger.info('Model = {}\tWorkers = {}\tVocab time = {:.2f} secs'.format(args.model, workers, end_time - start_time))
\ No newline at end of file

From 440c6dfcbe2b5b6e578893bc1847ce350a28d154 Mon Sep 17 00:00:00 2001
From: persiyanov
Date: Mon, 9 Jul 2018 17:03:58 +0300
Subject: [PATCH 49/49] last changes

---
 gensim/models/word2vec.py        |  4 ++--
 gensim/models/word2vec_inner.pyx | 12 ++++++------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index e5021f7e36..e7a181b982 100755
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -549,8 +549,8 @@ def _worker_loop(self, input_stream, progress_queue):
         jobs_processed = 0
         alpha = self._get_job_params(0)

-        tally, raw_tally = train_epoch_cbow_pystream(self, input_stream, alpha, work, neu1, False)
-        progress_queue.put((0, tally, raw_tally))
+        examples, tally, raw_tally = train_epoch_cbow_pystream(self, input_stream, alpha, work, neu1, False)
+        progress_queue.put((examples, tally, raw_tally))
         progress_queue.put(None)
         # logger.debug("worker exiting, processed %i jobs", jobs_processed)

diff --git a/gensim/models/word2vec_inner.pyx b/gensim/models/word2vec_inner.pyx
index 2c0daf2b5b..3b69568984 100755
--- a/gensim/models/word2vec_inner.pyx
+++ b/gensim/models/word2vec_inner.pyx
@@ -558,7 +558,7 @@ cpdef train_epoch_cbow(model, _input_stream, alpha, _work, _neu1, compute_loss):
         vocab[token] = (model.wv.vocab[py_token].index, model.wv.vocab[py_token].sample_int)

     # release GIL & train on all sentences
-    cdef int total_effective_words = 0, total_effective_sentences = 0, total_words = 0
+    cdef int total_effective_words = 0, total_sentences = 0, total_words = 0
     cdef pair[ULongLong, ULongLong] word
     cdef ULongLong random_number

@@ -615,11 +615,11 @@ cpdef train_epoch_cbow(model, _input_stream, alpha, _work, _neu1, compute_loss):
                     if negative:
                         next_random = fast_sentence_cbow_neg(negative, cum_table, cum_table_len, codelens, neu1, syn0, syn1neg, size, indexes, _alpha, work, i, j, k, cbow_mean, next_random, word_locks, _compute_loss, &_running_training_loss)

-            total_effective_sentences += effective_sentences
+            total_sentences += sentences.size()
             total_effective_words += effective_words

-    return total_effective_words, total_words  # return properly raw_tally as a second value (not tally)
+    return total_sentences, total_effective_words, total_words  # (examples, tally, raw_tally)

 def iterate_batches_from_pystream(input_stream):

@@ -715,7 +715,7 @@ cpdef train_epoch_cbow_pystream(model, input_stream, alpha, _work, _neu1, comput
         vocab[token] = (model.wv.vocab[py_token].index, model.wv.vocab[py_token].sample_int)

     # release GIL & train on all sentences
-    cdef int total_effective_words = 0, total_effective_sentences = 0, total_words = 0
+    cdef int total_effective_words = 0, total_sentences = 0, total_words = 0
     cdef pair[ULongLong, ULongLong] word
     cdef ULongLong random_number

@@ -770,11 +770,11 @@ cpdef train_epoch_cbow_pystream(model, input_stream, alpha, _work, _neu1, comput
                     if negative:
                         next_random = fast_sentence_cbow_neg(negative, cum_table, cum_table_len, codelens, neu1, syn0, syn1neg, size, indexes, _alpha, work, i, j, k, cbow_mean, next_random, word_locks, _compute_loss, &_running_training_loss)

-            total_effective_sentences += effective_sentences
+            total_sentences += sentences.size()
             total_effective_words += effective_words

-    return total_effective_words, total_words  # return properly raw_tally as a second value (not tally)
+    return total_sentences, total_effective_words, total_words  # (examples, tally, raw_tally)

 # Score is only implemented for hierarchical softmax
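Taken together, the series leaves Word2Vec._worker_loop() consuming one sentence stream per worker thread and training a full epoch per call through train_epoch_cbow_pystream(), which reports (examples, tally, raw_tally) back on the progress queue. A minimal end-to-end sketch of how this branch is meant to be driven, following the pattern in benchmark_any2vec_speed.py; the shard paths are placeholders, and the input_streams keyword exists only on this branch, not in released gensim:

    from gensim.models.word2vec import Word2Vec, LineSentence

    # One preprocessed text shard per worker; each worker trains an entire
    # epoch on its own stream with (almost) no GIL contention.
    input_streams = [
        LineSentence('gensim-enwiki-shard-0.txt'),  # placeholder paths
        LineSentence('gensim-enwiki-shard-1.txt'),
    ]

    model = Word2Vec(input_streams=input_streams, size=300, workers=len(input_streams))
    print(model.wv.most_similar('language'))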
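For the measurement side, illustrative invocations of the benchmark scripts added in this series (paths and the worker grid are placeholders; the authoritative flags are the argparse blocks in each script):

    # Time raw LineSentence iteration over a preprocessed corpus:
    python gensim/scripts/benchmark_line_sentence.py --input ~/gensim-enwiki.txt

    # Time build_vocab() across several worker counts, one stream per shard:
    python gensim/scripts/benchmark_vocab.py --input '~/shards/enwiki-*.txt' \
        --model word2vec --workers-grid 1 2 4 8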