diff --git a/docs/notebooks/doc2vec-IMDB.ipynb b/docs/notebooks/doc2vec-IMDB.ipynb
new file mode 100644
index 0000000000..0b0f721be1
--- /dev/null
+++ b/docs/notebooks/doc2vec-IMDB.ipynb
@@ -0,0 +1,1843 @@
+{
+ "metadata": {
+ "name": "",
+ "signature": "sha256:26971c428490c5b0358c2d98666355be414831a09bf6cf3c50b03d39bd186505"
+ },
+ "nbformat": 3,
+ "nbformat_minor": 0,
+ "worksheets": [
+ {
+ "cells": [
+ {
+ "cell_type": "heading",
+ "level": 1,
+ "metadata": {},
+ "source": [
+ "gensim doc2vec & IMDB sentiment dataset"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "TODO: section on introduction & motivation\n",
+ "\n",
+ "TODO: prerequisites + dependencies (statsmodels, patsy, ?)"
+ ]
+ },
+ {
+ "cell_type": "heading",
+ "level": 2,
+ "metadata": {},
+ "source": [
+ "Load corpus"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Fetch and prep exactly as in Mikolov's go.sh shell script. (Note this cell tests for existence of required files, so steps won't repeat once the final summary file (`aclImdb/alldata-id.txt`) is available alongside this notebook.)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "%%bash\n",
+ "# adapted from Mikolov's example go.sh script:\n",
+ "if [ ! -f \"aclImdb/alldata-id.txt\" ]\n",
+ "then\n",
+ "    if [ ! -d \"aclImdb\" ]\n",
+ "    then\n",
+ "        if [ ! -f \"aclImdb_v1.tar.gz\" ]\n",
+ "        then\n",
+ "            wget --quiet http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\n",
+ "        fi\n",
+ "        tar xf aclImdb_v1.tar.gz\n",
+ "    fi\n",
+ "\n",
+ "    # lowercase the text, replace HTML <br /> tags with a space, and pad punctuation/special symbols with spaces\n",
+ "    function normalize_text {\n",
+ "        awk '{print tolower($0);}' < $1 | sed -e 's/\\./ \\. /g' -e 's/<br \\/>/ /g' -e 's/\"/ \" /g' \\\n",
+ "            -e 's/,/ , /g' -e 's/(/ ( /g' -e 's/)/ ) /g' -e 's/\\!/ \\! /g' -e 's/\\?/ \\? /g' \\\n",
+ "            -e 's/\\;/ \\; /g' -e 's/\\:/ \\: /g' > $1-norm\n",
+ "    }\n",
+ "\n",
+ "    export LC_ALL=C\n",
+ "    for j in train/pos train/neg test/pos test/neg train/unsup; do\n",
+ "        rm -f temp  # -f: stay quiet on the first pass, when temp does not exist yet\n",
+ "        for i in `ls aclImdb/$j`; do cat aclImdb/$j/$i >> temp; awk 'BEGIN{print;}' >> temp; done\n",
+ "        normalize_text temp\n",
+ "        mv temp-norm aclImdb/$j/norm.txt\n",
+ "    done\n",
+ "    mv aclImdb/train/pos/norm.txt aclImdb/train-pos.txt\n",
+ "    mv aclImdb/train/neg/norm.txt aclImdb/train-neg.txt\n",
+ "    mv aclImdb/test/pos/norm.txt aclImdb/test-pos.txt\n",
+ "    mv aclImdb/test/neg/norm.txt aclImdb/test-neg.txt\n",
+ "    mv aclImdb/train/unsup/norm.txt aclImdb/train-unsup.txt\n",
+ "\n",
+ "    cat aclImdb/train-pos.txt aclImdb/train-neg.txt aclImdb/test-pos.txt aclImdb/test-neg.txt aclImdb/train-unsup.txt > aclImdb/alldata.txt\n",
+ "    awk 'BEGIN{a=0;}{print \"_*\" a \" \" $0; a++;}' < aclImdb/alldata.txt > aclImdb/alldata-id.txt\n",
+ "fi"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "stream",
+ "stream": "stderr",
+ "text": [
+ "rm: temp: No such file or directory\n"
+ ]
+ }
+ ],
+ "prompt_number": 1
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "import os.path\n",
+ "assert os.path.isfile(\"aclImdb/alldata-id.txt\"), \"alldata-id.txt unavailable\""
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 2
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The data is small enough to be read into memory. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "import gensim\n",
+ "from gensim.models.doc2vec import TaggedDocument\n",
+ "from collections import namedtuple\n",
+ "\n",
+ "SentimentDocument = namedtuple('SentimentDocument', 'words tags split sentiment')\n",
+ "\n",
+ "alldocs = []  # will hold all docs in original order\n",
+ "with open('aclImdb/alldata-id.txt') as alldata:\n",
+ "    for line_no, line in enumerate(alldata):\n",
+ "        tokens = gensim.utils.to_unicode(line).split()\n",
+ "        words = tokens[1:]  # tokens[0] is the `_*N` id prefix written by the prep cell\n",
+ "        tags = [line_no]  # `tags = [tokens[0]]` would also work at extra memory cost\n",
+ "        split = ['train','test','extra','extra'][line_no//25000]  # 25k train, 25k test, 50k extra (unlabeled)\n",
+ "        sentiment = [1.0, 0.0, 1.0, 0.0, None, None, None, None][line_no//12500]  # [12.5K pos, 12.5K neg]*2 then unknown\n",
+ "        alldocs.append(SentimentDocument(words, tags, split, sentiment))\n",
+ "\n",
+ "train_docs = [doc for doc in alldocs if doc.split == 'train']\n",
+ "test_docs = [doc for doc in alldocs if doc.split == 'test']\n",
+ "doc_list = alldocs[:]  # for reshuffling per pass\n",
+ "\n",
+ "print('%d docs: %d train-sentiment, %d test-sentiment' % (len(doc_list), len(train_docs), len(test_docs)))"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "100000 docs: 25000 train-sentiment, 25000 test-sentiment\n"
+ ]
+ }
+ ],
+ "prompt_number": 3
+ },
+ {
+ "cell_type": "heading",
+ "level": 2,
+ "metadata": {},
+ "source": [
+ "Set-up Doc2Vec Training & Evaluation Models"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Approximating experiment of Le & Mikolov [\"Distributed Representations of Sentences and Documents\"](http://cs.stanford.edu/~quocle/paragraph_vector.pdf), also with guidance from Mikolov's [example go.sh](https://groups.google.com/d/msg/word2vec-toolkit/Q49FIrNOQRo/J6KG8mUj45sJ):\n",
+ "\n",
+ "`./word2vec -train ../alldata-id.txt -output vectors.txt -cbow 0 -size 100 -window 10 -negative 5 -hs 0 -sample 1e-4 -threads 40 -binary 0 -iter 20 -min-count 1 -sentence-vectors 1`\n",
+ "\n",
+ "Parameter choices below vary:\n",
+ "\n",
+ "* 100-dimensional vectors, as the 400d vectors of the paper don't seem to offer much benefit on this task\n",
+ "* similarly, frequent word subsampling seems to decrease sentiment-prediction accuracy, so it's left out\n",
+ "* `cbow=0` means skip-gram which is equivalent to the paper's 'PV-DBOW' mode, matched in gensim with `dm=0`\n",
+ "* added to that DBOW model are two DM models, one which averages context vectors (`dm_mean`) and one which concatenates them (`dm_concat`, resulting in a much larger, slower, more data-hungry model)\n",
+ "* a `min_count=2` saves quite a bit of model memory, discarding only words that appear in a single doc (and are thus no more expressive than the unique-to-each doc vectors themselves)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "from gensim.models import Doc2Vec\n",
+ "import gensim.models.doc2vec\n",
+ "from collections import OrderedDict\n",
+ "import multiprocessing\n",
+ "\n",
+ "cores = multiprocessing.cpu_count()\n",
+ "assert gensim.models.doc2vec.FAST_VERSION > -1, \"this will be painfully slow otherwise\"\n",
+ "\n",
+ "simple_models = [\n",
+ " # PV-DM w/concatenation - window=5 (both sides) approximates paper's 10-word total window size\n",
+ " Doc2Vec(dm=1, dm_concat=1, size=100, window=5, negative=5, hs=0, min_count=2, workers=cores),\n",
+ " # PV-DBOW \n",
+ " Doc2Vec(dm=0, size=100, negative=5, hs=0, min_count=2, workers=cores),\n",
+ " # PV-DM w/average\n",
+ " Doc2Vec(dm=1, dm_mean=1, size=100, window=10, negative=5, hs=0, min_count=2, workers=cores),\n",
+ "]\n",
+ "\n",
+ "# speed setup by sharing results of 1st model's vocabulary scan\n",
+ "simple_models[0].build_vocab(alldocs) # PV-DM/concat requires one special NULL word so it serves as template\n",
+ "print(simple_models[0])\n",
+ "for model in simple_models[1:]:\n",
+ " model.reset_from(simple_models[0])\n",
+ " print(model)\n",
+ "\n",
+ "models_by_name = OrderedDict((str(model), model) for model in simple_models)"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "Doc2Vec(dm/c,d100,n5,w5,mc2,t8)\n",
+ "Doc2Vec(dbow,d100,n5,mc2,t8)"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "Doc2Vec(dm/m,d100,n5,w10,mc2,t8)"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n"
+ ]
+ }
+ ],
+ "prompt_number": 4
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Following the paper, we also evaluate models in pairs. These wrappers return the concatenation of the vectors from each model. (Only the singular models are trained.)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "from gensim.test.test_doc2vec import ConcatenatedDoc2Vec\n",
+ "models_by_name['dbow+dmm'] = ConcatenatedDoc2Vec([simple_models[1], simple_models[2]])\n",
+ "models_by_name['dbow+dmc'] = ConcatenatedDoc2Vec([simple_models[1], simple_models[0]])"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 5
+ },
+ {
+ "cell_type": "heading",
+ "level": 2,
+ "metadata": {},
+ "source": [
+ "Predictive Evaluation Methods"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Helper methods for evaluating error rate."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "import numpy as np\n",
+ "import statsmodels.api as sm\n",
+ "from random import sample\n",
+ "\n",
+ "# for timing\n",
+ "from contextlib import contextmanager\n",
+ "from timeit import default_timer\n",
+ "import time \n",
+ "\n",
+ "@contextmanager\n",
+ "def elapsed_timer():\n",
+ "    \"\"\"Yield a callable giving seconds elapsed so far; once the block exits it returns the final total.\"\"\"\n",
+ "    began = default_timer()\n",
+ "    reading = lambda: default_timer() - began\n",
+ "    yield lambda: reading()  # looks up `reading` at call time, so the rebinding below takes effect\n",
+ "    reading = lambda total=default_timer() - began: total\n",
+ " \n",
+ "def logistic_predictor_from_data(train_targets, train_regressors):\n",
+ "    \"\"\"Fit a statsmodels Logit model on the given targets and regressors; return the fitted results.\"\"\"\n",
+ "    fitted = sm.Logit(train_targets, train_regressors).fit(disp=0)  # disp=0: no convergence printout\n",
+ "    # print(fitted.summary())  # uncomment to inspect the regression\n",
+ "    return fitted\n",
+ "\n",
+ "def error_rate_for_model(test_model, train_set, test_set, infer=False, infer_steps=3, infer_alpha=0.1, infer_subsample=0.1):\n",
+ "    \"\"\"Return (error_rate, error_count, test_count, predictor) for sentiment prediction on test_set.\n",
+ "    With infer=True, test vectors are re-inferred for a random `infer_subsample` fraction of test_set.\"\"\"\n",
+ "    train_targets, train_regressors = zip(*[(doc.sentiment, test_model.docvecs[doc.tags[0]]) for doc in train_set])\n",
+ "    train_regressors = sm.add_constant(train_regressors)\n",
+ "    predictor = logistic_predictor_from_data(train_targets, train_regressors)\n",
+ "\n",
+ "    test_data = test_set\n",
+ "    if infer:\n",
+ "        if infer_subsample < 1.0:\n",
+ "            test_data = sample(test_data, int(infer_subsample * len(test_data)))\n",
+ "        test_regressors = [test_model.infer_vector(doc.words, steps=infer_steps, alpha=infer_alpha) for doc in test_data]\n",
+ "    else:\n",
+ "        test_regressors = [test_model.docvecs[doc.tags[0]] for doc in test_data]  # test_data, not the global test_docs\n",
+ "    test_regressors = sm.add_constant(test_regressors)\n",
+ "\n",
+ "    # predict & evaluate\n",
+ "    test_predictions = predictor.predict(test_regressors)\n",
+ "    corrects = sum(np.rint(test_predictions) == [doc.sentiment for doc in test_data])\n",
+ "    errors = len(test_predictions) - corrects\n",
+ "    error_rate = float(errors) / len(test_predictions)\n",
+ "    return (error_rate, errors, len(test_predictions), predictor)\n"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 8
+ },
+ {
+ "cell_type": "heading",
+ "level": 2,
+ "metadata": {},
+ "source": [
+ "Bulk Training"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Using explicit multiple-pass, alpha-reduction approach as sketched in [gensim doc2vec blog post](http://radimrehurek.com/2014/12/doc2vec-tutorial/) \u2013 with added shuffling of corpus on each pass.\n",
+ "\n",
+ "Note that vector training is occurring on *all* documents of the dataset, which includes all TRAIN/TEST/DEV docs.\n",
+ "\n",
+ "Evaluation of each model's sentiment-predictive power is repeated after each pass, as an error rate (lower is better), to see the rates-of-relative-improvement. The base numbers reuse the TRAIN and TEST vectors stored in the models for the logistic regression, while the _inferred_ results use newly-inferred vectors for a random 10% sample of the TEST docs and are computed only on every fifth pass. \n",
+ "\n",
+ "(On a 4-core 2.6GHz Intel Core i7, training and evaluating the 3 main models over these 20 passes takes about an hour.)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "from collections import defaultdict\n",
+ "best_error = defaultdict(lambda :1.0) # to selectively-print only best errors achieved"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 9
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "from random import shuffle\n",
+ "import datetime\n",
+ "\n",
+ "alpha, min_alpha, passes = (0.025, 0.001, 20)\n",
+ "alpha_delta = (alpha - min_alpha) / passes\n",
+ "\n",
+ "print(\"START %s\" % datetime.datetime.now())\n",
+ "\n",
+ "for epoch in range(passes):\n",
+ "    shuffle(doc_list)  # shuffling gets best results\n",
+ "\n",
+ "    for name, train_model in models_by_name.items():\n",
+ "        # train\n",
+ "        duration = 'na'\n",
+ "        train_model.alpha, train_model.min_alpha = alpha, alpha  # fixed rate within a pass; decay is applied manually below\n",
+ "        with elapsed_timer() as elapsed:\n",
+ "            train_model.train(doc_list)\n",
+ "        duration = '%.1f' % elapsed()\n",
+ "\n",
+ "        # evaluate\n",
+ "        eval_duration = ''\n",
+ "        with elapsed_timer() as eval_elapsed:\n",
+ "            err, err_count, test_count, predictor = error_rate_for_model(train_model, train_docs, test_docs)\n",
+ "        eval_duration = '%.1f' % eval_elapsed()\n",
+ "        best_indicator = ' '\n",
+ "        if err <= best_error[name]:\n",
+ "            best_error[name] = err\n",
+ "            best_indicator = '*'\n",
+ "        print(\"%s%f : %i passes : %s %ss %ss\" % (best_indicator, err, epoch + 1, name, duration, eval_duration))\n",
+ "\n",
+ "        if (epoch % 5) == 0:  # inferred-vector evaluation only on passes 1, 6, 11, 16\n",
+ "            eval_duration = ''\n",
+ "            with elapsed_timer() as eval_elapsed:\n",
+ "                infer_err, err_count, test_count, predictor = error_rate_for_model(train_model, train_docs, test_docs, infer=True)\n",
+ "            eval_duration = '%.1f' % eval_elapsed()\n",
+ "            best_indicator = ' '\n",
+ "            if infer_err < best_error[name + '_inferred']:\n",
+ "                best_error[name + '_inferred'] = infer_err\n",
+ "                best_indicator = '*'\n",
+ "            print(\"%s%f : %i passes : %s %ss %ss\" % (best_indicator, infer_err, epoch + 1, name + '_inferred', duration, eval_duration))\n",
+ "\n",
+ "    print('completed pass %i at alpha %f' % (epoch + 1, alpha))\n",
+ "    alpha -= alpha_delta\n",
+ "\n",
+ "print(\"END %s\" % str(datetime.datetime.now()))"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "START 2015-06-28 20:34:29.500839\n",
+ "*0.417080 : 1 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,t8) 84.5s 1.0s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.363200 : 1 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,t8)_inferred 84.5s 14.9s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.219520 : 1 passes : Doc2Vec(dbow,d100,n5,mc2,t8) 19.0s 0.6s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.184000 : 1 passes : Doc2Vec(dbow,d100,n5,mc2,t8)_inferred 19.0s 4.6s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.277080 : 1 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,t8) 35.0s 0.6s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.230800 : 1 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,t8)_inferred 35.0s 6.4s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.207840 : 1 passes : dbow+dmm 0.0s 1.5s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.185200 : 1 passes : dbow+dmm_inferred 0.0s 11.2s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.220720 : 1 passes : dbow+dmc 0.0s 1.1s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.189200 : 1 passes : dbow+dmc_inferred 0.0s 19.3s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "completed pass 1 at alpha 0.025000\n",
+ "*0.357120 : 2 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,t8) 73.1s 0.6s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.144360 : 2 passes : Doc2Vec(dbow,d100,n5,mc2,t8) 19.8s 0.6s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.225640 : 2 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,t8) 36.2s 1.0s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.141160 : 2 passes : dbow+dmm 0.0s 1.1s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.144800 : 2 passes : dbow+dmc 0.0s 1.2s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "completed pass 2 at alpha 0.023800\n",
+ "*0.326840 : 3 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,t8) 73.6s 0.6s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.125880 : 3 passes : Doc2Vec(dbow,d100,n5,mc2,t8) 20.1s 0.7s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.202680 : 3 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,t8) 36.0s 0.6s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.123280 : 3 passes : dbow+dmm 0.0s 1.6s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.126040 : 3 passes : dbow+dmc 0.0s 1.2s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "completed pass 3 at alpha 0.022600\n",
+ "*0.302360 : 4 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,t8) 72.6s 0.6s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.113640 : 4 passes : Doc2Vec(dbow,d100,n5,mc2,t8) 19.9s 0.7s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.189880 : 4 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,t8) 35.8s 0.6s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.114200 : 4 passes : dbow+dmm 0.0s 1.2s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.115640 : 4 passes : dbow+dmc 0.0s 1.6s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "completed pass 4 at alpha 0.021400\n",
+ "*0.281480 : 5 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,t8) 72.7s 0.7s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.109720 : 5 passes : Doc2Vec(dbow,d100,n5,mc2,t8) 21.5s 0.7s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.181360 : 5 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,t8) 37.8s 0.7s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.109760 : 5 passes : dbow+dmm 0.0s 1.3s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.110400 : 5 passes : dbow+dmc 0.0s 1.6s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "completed pass 5 at alpha 0.020200\n",
+ "*0.264640 : 6 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,t8) 72.0s 0.7s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.292000 : 6 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,t8)_inferred 72.0s 13.3s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.107440 : 6 passes : Doc2Vec(dbow,d100,n5,mc2,t8) 21.6s 0.7s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.116000 : 6 passes : Doc2Vec(dbow,d100,n5,mc2,t8)_inferred 21.6s 4.7s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.176040 : 6 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,t8) 37.4s 1.1s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.213600 : 6 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,t8)_inferred 37.4s 6.4s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.107000 : 6 passes : dbow+dmm 0.0s 1.2s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.108000 : 6 passes : dbow+dmm_inferred 0.0s 11.2s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.107880 : 6 passes : dbow+dmc 0.0s 1.2s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.124400 : 6 passes : dbow+dmc_inferred 0.0s 18.3s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "completed pass 6 at alpha 0.019000\n",
+ "*0.254200 : 7 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,t8) 65.7s 1.1s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.106720 : 7 passes : Doc2Vec(dbow,d100,n5,mc2,t8) 19.5s 0.7s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.172880 : 7 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,t8) 35.6s 0.7s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.106080 : 7 passes : dbow+dmm 0.0s 1.2s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.106320 : 7 passes : dbow+dmc 0.0s 1.2s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "completed pass 7 at alpha 0.017800\n",
+ "*0.245880 : 8 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,t8) 68.6s 0.7s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.104920 : 8 passes : Doc2Vec(dbow,d100,n5,mc2,t8) 20.0s 1.0s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.171000 : 8 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,t8) 35.4s 0.7s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.104760 : 8 passes : dbow+dmm 0.0s 1.3s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.105600 : 8 passes : dbow+dmc 0.0s 1.3s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "completed pass 8 at alpha 0.016600\n",
+ "*0.238400 : 9 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,t8) 66.1s 0.6s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.104520 : 9 passes : Doc2Vec(dbow,d100,n5,mc2,t8) 21.2s 1.1s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.167600 : 9 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,t8) 37.5s 0.7s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.103680 : 9 passes : dbow+dmm 0.0s 1.2s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.103480 : 9 passes : dbow+dmc 0.0s 1.2s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "completed pass 9 at alpha 0.015400\n",
+ "*0.232160 : 10 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,t8) 69.0s 0.7s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.103680 : 10 passes : Doc2Vec(dbow,d100,n5,mc2,t8) 21.8s 0.7s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.166000 : 10 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,t8) 35.4s 1.1s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.101920 : 10 passes : dbow+dmm 0.0s 1.2s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ " 0.103560 : 10 passes : dbow+dmc 0.0s 1.2s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "completed pass 10 at alpha 0.014200\n",
+ "*0.227760 : 11 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,t8) 66.4s 0.7s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.242400 : 11 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,t8)_inferred 66.4s 13.0s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.102160 : 11 passes : Doc2Vec(dbow,d100,n5,mc2,t8) 19.7s 0.6s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.113200 : 11 passes : Doc2Vec(dbow,d100,n5,mc2,t8)_inferred 19.7s 5.0s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.163480 : 11 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,t8) 35.4s 0.6s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.208800 : 11 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,t8)_inferred 35.4s 6.2s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.101560 : 11 passes : dbow+dmm 0.0s 1.2s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.102000 : 11 passes : dbow+dmm_inferred 0.0s 11.4s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.101920 : 11 passes : dbow+dmc 0.0s 1.6s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.109600 : 11 passes : dbow+dmc_inferred 0.0s 17.4s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "completed pass 11 at alpha 0.013000\n",
+ "*0.225960 : 12 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,t8) 61.8s 0.7s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.101720 : 12 passes : Doc2Vec(dbow,d100,n5,mc2,t8) 20.2s 0.7s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.163000 : 12 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,t8) 35.5s 0.7s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.100840 : 12 passes : dbow+dmm 0.0s 1.2s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.101920 : 12 passes : dbow+dmc 0.0s 1.7s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "completed pass 12 at alpha 0.011800\n",
+ "*0.222360 : 13 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,t8) 65.2s 0.7s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ " 0.103120 : 13 passes : Doc2Vec(dbow,d100,n5,mc2,t8) 20.0s 0.7s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.161960 : 13 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,t8) 35.2s 0.6s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ " 0.101640 : 13 passes : dbow+dmm 0.0s 1.2s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ " 0.102600 : 13 passes : dbow+dmc 0.0s 1.2s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "completed pass 13 at alpha 0.010600\n",
+ "*0.220960 : 14 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,t8) 65.3s 1.1s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ " 0.102920 : 14 passes : Doc2Vec(dbow,d100,n5,mc2,t8) 19.9s 0.7s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.160160 : 14 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,t8) 36.0s 0.7s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ " 0.101720 : 14 passes : dbow+dmm 0.0s 1.2s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ " 0.102560 : 14 passes : dbow+dmc 0.0s 1.2s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "completed pass 14 at alpha 0.009400\n",
+ "*0.219400 : 15 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,t8) 64.0s 1.0s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.101440 : 15 passes : Doc2Vec(dbow,d100,n5,mc2,t8) 19.5s 0.7s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ " 0.160640 : 15 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,t8) 38.6s 0.7s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.100160 : 15 passes : dbow+dmm 0.0s 1.2s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.101880 : 15 passes : dbow+dmc 0.0s 1.3s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "completed pass 15 at alpha 0.008200\n",
+ "*0.216880 : 16 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,t8) 64.1s 1.1s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.232400 : 16 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,t8)_inferred 64.1s 12.8s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ " 0.101760 : 16 passes : Doc2Vec(dbow,d100,n5,mc2,t8) 19.1s 0.7s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.111600 : 16 passes : Doc2Vec(dbow,d100,n5,mc2,t8)_inferred 19.1s 4.7s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.159800 : 16 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,t8) 34.9s 0.6s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.184000 : 16 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,t8)_inferred 34.9s 6.5s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ " 0.100640 : 16 passes : dbow+dmm 0.0s 1.6s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.094800 : 16 passes : dbow+dmm_inferred 0.0s 11.7s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.101320 : 16 passes : dbow+dmc 0.0s 1.2s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ " 0.109600 : 16 passes : dbow+dmc_inferred 0.0s 17.5s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "completed pass 16 at alpha 0.007000\n",
+ " 0.217160 : 17 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,t8) 58.6s 0.6s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ " 0.101760 : 17 passes : Doc2Vec(dbow,d100,n5,mc2,t8) 19.5s 0.7s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.159640 : 17 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,t8) 37.0s 1.1s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ " 0.100760 : 17 passes : dbow+dmm 0.0s 1.3s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ " 0.101480 : 17 passes : dbow+dmc 0.0s 1.3s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "completed pass 17 at alpha 0.005800\n",
+ "*0.216080 : 18 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,t8) 60.7s 0.6s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ " 0.101520 : 18 passes : Doc2Vec(dbow,d100,n5,mc2,t8) 19.6s 0.6s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.158760 : 18 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,t8) 34.9s 1.0s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ " 0.100800 : 18 passes : dbow+dmm 0.0s 1.2s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ " 0.101760 : 18 passes : dbow+dmc 0.0s 1.2s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "completed pass 18 at alpha 0.004600\n",
+ "*0.215560 : 19 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,t8) 62.6s 0.7s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.101000 : 19 passes : Doc2Vec(dbow,d100,n5,mc2,t8) 20.6s 0.7s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ " 0.159080 : 19 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,t8) 35.9s 0.7s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "*0.099920 : 19 passes : dbow+dmm 0.0s 1.7s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ " 0.102280 : 19 passes : dbow+dmc 0.0s 1.2s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "completed pass 19 at alpha 0.003400\n",
+ "*0.215160 : 20 passes : Doc2Vec(dm/c,d100,n5,w5,mc2,t8) 58.3s 0.6s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ " 0.101360 : 20 passes : Doc2Vec(dbow,d100,n5,mc2,t8) 19.5s 0.7s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ " 0.158920 : 20 passes : Doc2Vec(dm/m,d100,n5,w10,mc2,t8) 33.6s 0.6s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ " 0.100480 : 20 passes : dbow+dmm 0.0s 1.5s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ " 0.102160 : 20 passes : dbow+dmc 0.0s 1.1s"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "completed pass 20 at alpha 0.002200\n",
+ "END 2015-06-28 21:20:48.994706\n"
+ ]
+ }
+ ],
+ "prompt_number": 10
+ },
+ {
+ "cell_type": "heading",
+ "level": 2,
+ "metadata": {},
+ "source": [
+ "Achieved Sentiment-Prediction Accuracy"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": true,
+ "input": [
+ "# print best error rates achieved\n",
+ "for rate, name in sorted((rate, name) for name, rate in best_error.items()):\n",
+ " print(\"%f %s\" % (rate, name))"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "0.094800 dbow+dmm_inferred\n",
+ "0.099920 dbow+dmm\n",
+ "0.101000 Doc2Vec(dbow,d100,n5,mc2,t8)\n",
+ "0.101320 dbow+dmc\n",
+ "0.109600 dbow+dmc_inferred\n",
+ "0.111600 Doc2Vec(dbow,d100,n5,mc2,t8)_inferred\n",
+ "0.158760 Doc2Vec(dm/m,d100,n5,w10,mc2,t8)\n",
+ "0.184000 Doc2Vec(dm/m,d100,n5,w10,mc2,t8)_inferred\n",
+ "0.215160 Doc2Vec(dm/c,d100,n5,w5,mc2,t8)\n",
+ "0.232400 Doc2Vec(dm/c,d100,n5,w5,mc2,t8)_inferred\n"
+ ]
+ }
+ ],
+ "prompt_number": 12
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In my testing, unlike the paper's report, DBOW performs best. Concatenating vectors from different models only offers a small predictive improvement. The best results I've seen are still just under 10% error rate, still a ways from the paper's 7.42%.\n"
+ ]
+ },
+ {
+ "cell_type": "heading",
+ "level": 2,
+ "metadata": {},
+ "source": [
+ "Examining Results"
+ ]
+ },
+ {
+ "cell_type": "heading",
+ "level": 3,
+ "metadata": {},
+ "source": [
+ "Are inferred vectors close to the precalculated ones?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "doc_id = np.random.randint(simple_models[0].docvecs.count) # pick random doc; re-run cell for more examples\n",
+ "print('for doc %d...' % doc_id)\n",
+ "for model in simple_models:\n",
+ " inferred_docvec = model.infer_vector(alldocs[doc_id].words)\n",
+ " print('%s:\\n %s' % (model, model.docvecs.most_similar([inferred_docvec], topn=3)))"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "for doc 25430...\n",
+ "Doc2Vec(dm/c,d100,n5,w5,mc2,t8):\n",
+ " [(25430, 0.6583491563796997), (27314, 0.4142411947250366), (16479, 0.40846431255340576)]"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "Doc2Vec(dbow,d100,n5,mc2,t8):\n",
+ " [(25430, 0.9325973987579346), (49281, 0.5766637921333313), (79679, 0.5634804964065552)]"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "Doc2Vec(dm/m,d100,n5,w10,mc2,t8):\n",
+ " [(25430, 0.7970066666603088), (97818, 0.6925815343856812), (230, 0.690807580947876)]"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n"
+ ]
+ }
+ ],
+ "prompt_number": 13
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "(Yes, here the stored vector from 20 epochs of training is usually one of the closest to a freshly-inferred vector for the same words. Note the defaults for inference are very abbreviated \u2013 just 3 steps starting at a high alpha \u2013 and likely need tuning for other applications.)"
+ ]
+ },
+ {
+ "cell_type": "heading",
+ "level": 3,
+ "metadata": {},
+ "source": [
+ "Do close documents seem more related than distant ones?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "import random\n",
+ "\n",
+ "doc_id = np.random.randint(simple_models[0].docvecs.count) # pick random doc, re-run cell for more examples\n",
+ "model = random.choice(simple_models) # and a random model\n",
+ "sims = model.docvecs.most_similar(doc_id, topn=model.docvecs.count) # get *all* similar documents\n",
+ "print('TARGET (%d): \u00ab%s\u00bb\\n' % (doc_id, ' '.join(alldocs[doc_id].words)))\n",
+ "print('SIMILAR/DISSIMILAR DOCS PER MODEL %s:\\n' % model)\n",
+ "for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:\n",
+ " print('%s %s: \u00ab%s\u00bb\\n' % (label, sims[index], ' '.join(alldocs[sims[index][0]].words)))\n"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "TARGET (72927): \u00abthis is one of the best films of this year . for a year that was fueled by controversy and crap , it was nice to finally see a film that had a true heart to it . from the opening scene to the end , i was so moved by the love that will smith has for his son . basically , if you see this movie and walk out of it feeling nothing , there is something that is very wrong with you . loved this movie , it's the perfect movie to end the year with . the best part was after the movie , my friends and i all got up and realized that this movie had actually made the four of us tear up ! it's an amazing film and if will smith doesn't get at least an oscar nom , then the oscars will just suck . in fact will smith should actually just win an oscar for this role . ! ! ! i loved this movie ! ! ! ! everybody needs to see especially the people in this world that take everything for granted , watch this movie , it will change you !\u00bb\n",
+ "\n",
+ "SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d100,n5,w10,mc2,t8):\n",
+ "\n",
+ "MOST (2046, 0.7372332215309143): \u00abi thought this movie would be dumb , but i really liked it . people i know hate it because spirit was the only horse that talked . well , so what ? the songs were good , and the horses didn't need to talk to seem human . i wouldn't care to own the movie , and i would love to see it again . 8/10\u00bb\n",
+ "\n",
+ "MEDIAN (6999, 0.4129640758037567): \u00abokay , the recent history of star trek has not been good . the next generation faded in its last few seasons , ds9 boldly stayed where no one had stayed before , and voyager started very bad and never really lived up to its promise . so , when they announced a new star trek series , i did not have high expectations . and , the first episode , broken bow , did have some problems . but , overall it was solid trek material and a good romp . i'll get the nits out of the way first . the opening theme is dull and i don't look forward to sitting through it regularly , but that's what remotes are for . what was really bad was the completely gratuitous lotion rubbing scene that just about drove my wife out of the room . they need to cut that nonsense out . but , the plot was strong and moved along well . the characters , though still new , seem to be well rounded and not always what you would expect . the vulcans are clearly being presented very differently than before , with a slightly ominous theme . i particularly liked the linguist , who is the first star trek character to not be able to stand proud in the face of death , but rather has to deal with her phobias and fears . they seemed to stay true to trek lore , something that has been a significant problem in past series , though they have plenty of time to bring us things like shooting through shields , the instant invention of technology that can fix anything , and the inevitable plethora of time-travel stories . anyone want to start a pool on how long before the borg show up ? all in all , the series has enormous potential . they are seeing the universe with fresh eyes . we have the chance to learn how things got the way they were in the later series . how did the klingons go from just insulting to war ? how did we meet the romulans ? how did the federation form and just who put earth in charge . why is the prime directive so important ? 
if they address these things rather than spitting out time travel episodes , this will be an interesting series . my favorite line : zephram cochran saying \" where no man has gone before \" ( not \" no one \" )\u00bb\n",
+ "\n",
+ "LEAST (16617, 0.015464222989976406): \u00abi saw this movie during a tolkien-themed interim class during my sophomore year of college . i was seated unfortunately close to the screen and my professor chose me to serve as a whipping boy- everyone else was laughing , but they weren't within constant eyesight . let's get it out of the way : the peter jackson 'lord of the rings' films do owe something to the bakshi film . in jackson's version of the fellowship of the ring , for instance , the scene in which the black riders assault the empty inn beds is almost a complete carbon copy of the scene in bakshi's film , shot by shot . you could call this plagiarism or homage , depending on your agenda . i'm sure the similarities don't stop there . i'm not going to do any research to find out what they are , because that would imply i have some mote of respect for this film . i'm sure others have outlined the similarities- look around . this movie is a complete train wreck in every sense of the metaphor , and many , many people died in the accident . i've decided to list what i can remember in a more or less chronological fashion- if i've left out anything else that offended me it's because i'm completely overwhelmed , confronted with a wealth of failure ( and , at high points , mediocrity ) . *due to heavy use of rotoscoping , gandalf is no longer a gentle , wise wizard but a wildly flailing prophet of doom ( whose hat inexplicably changes color once or twice during the course of the film ) . *saruman the white is sometimes referred to as 'aruman' during the film , without explanation . he wears purple and red for some mysterious reason . *sam is flat out hideous . the portrayal of his friendship with frodo is strangely childlike and unsatisfying . yes , hobbits are small like children , but they are not children . *merry and pippin are never introduced--they simply appear during a scene change with a one-sentence explanation . 
the film is filled with sloppy editing like this . *frodo , sam , pippin and merry are singing merrily as they skip through along the road . one of the hobbits procures a lute at least twice as large as he is from behind his back--which was not visible before--and begins strumming in typical fantasy bard fashion as they all break into \" la-la-la \" s . awful . *aragorn , apparently , is a native american dressed in an extremely stereotypical fantasy tunic ( no pants ) , complete with huge , square pilgrim belt buckle . he is arguably the worst swordsman in the entire movie--oftentimes he gets one wobbly swing in before being knocked flat on his ass . *the black riders appear more like lepers than menacing instruments of evil . they limp everywhere they go at a painfully slow pace . this is disturbing to be sure , but not frightening . *the scene before the black riders attempt to cross the ford of bruinen ( in which they stare at frodo , who is on the other side on horseback ) goes on forever , during which time the riders rear their horses in a vaguely threatening manner and . . . do nothing else . the scene was probably intended to illustrate frodo's hallucinatory decline as he succumbs to his wound . it turns out to be more plodding than anything else . *gimli the dwarf is just as tall as legolas the elf . he's a dwarf . there is simply no excuse for that . he also looks like a bastardized david the gnome . it's a crude but accurate description . *boromir appears to have pilfered elmer fudd's golden viking armor from that bugs bunny opera episode . he looks ridiculous . *despite the similarity to tolkien's illustration , the balrog is howl inducing and the least-threatening villain in the entire film . it looks like someone wearing pink bedroom slippers , and it's barely taller than gandalf . \" purists \" may prefer this balrog , but i'll take jackson's version any day . *the battle scenes are awkward and embarrassing . 
almost none of the characters display any level of competency with their armaments . i'm not asking for action-packed scenes like those in jackson's film , but they are supposed to be fighting . *treebeard makes a very short appearance , and i was sorry he bothered to show up at all . watch the film , you'll see what i mean . alright , now for the good parts of the film . *some of the voice acting is pretty good . it isn't that aragorn sounds bad , he just looks kind of like the jolly green giant . *galadriel is somewhat interesting in this portrayal ; like tom bombadil , she seems immune to the ring's powers of temptation , and her voice actress isn't horrible either . *boromir's death isn't as heart wrenching as in jackson's portrayal of the same scene , but it's still appropriately dramatic ( and more true to his death in the book , though i don't believe jackson made a mistake shooting it the way he did ) . *as my professor pointed out ( between whispered threats ) , the orcs ( mainly at helm's deep , if i'm correct ) resemble the war-ravaged corpses of soldiers , a political statement that works pretty well if you realize what's being attempted . *while this isn't really a positive point about the film , bakshi can't be blamed for the majority of the failures in this movie , or so i've been told--the project was on a tight budget , and late in its production he lost creative control to some of the higher-ups ( who i'm sure hadn't read the books ) . let me be clear : i respect bakshi for even attempting something of this magnitude . i simply have a hard time believing he was happy with the final product . overall , i cannot in any way recommend this blasphemous adaptation of tolkien's classic trilogy even for laughs , unless you've already read the books and have your own visualizations of the characters , places and events . 
i'm sure somebody , somewhere , will pick a copy of this up in confusion ; if you do , keep an open mind and glean what good you can from it .\u00bb\n",
+ "\n"
+ ]
+ }
+ ],
+ "prompt_number": 14
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "(Somewhat, in terms of reviewer tone, movie genre, etc... the MOST cosine-similar docs usually seem more like the TARGET than the MEDIAN or LEAST.)"
+ ]
+ },
+ {
+ "cell_type": "heading",
+ "level": 3,
+ "metadata": {},
+ "source": [
+ "Do the word vectors show useful similarities?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "word_models = simple_models[:]"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [],
+ "prompt_number": 15
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "import random\n",
+ "from IPython.display import HTML\n",
+ "# pick a random word with a suitable number of occurrences\n",
+ "while True:\n",
+ " word = random.choice(word_models[0].index2word)\n",
+ " if word_models[0].vocab[word].count > 10:\n",
+ " break\n",
+ "# or just pick a word from the relevant domain:\n",
+ "word = 'comedy/drama'\n",
+ "similars_per_model = [str(model.most_similar(word, topn=20)).replace('), ','),<br>\\n') for model in word_models]\n",
+ "similar_table = (\"<table><tr><th>\" +\n",
+ " \"</th><th>\".join([str(model) for model in word_models]) + \n",
+ " \"</th></tr><tr><td>\" +\n",
+ " \"</td><td>\".join(similars_per_model) +\n",
+ " \"</td></tr></table>\")\n",
+ "print(\"most similar words for '%s' (%d occurences)\" % (word, simple_models[0].vocab[word].count))\n",
+ "HTML(similar_table)"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "most similar words for 'comedy/drama' (38 occurences)\n"
+ ]
+ },
+ {
+ "html": [
+ "<table><tr><th>Doc2Vec(dm/c,d100,n5,w5,mc2,t8)</th><th>Doc2Vec(dbow,d100,n5,mc2,t8)</th><th>Doc2Vec(dm/m,d100,n5,w10,mc2,t8)</th></tr><tr><td>[('comedy', 0.7255545258522034),<br>\n",
+ "('thriller', 0.6946465969085693),<br>\n",
+ "('drama', 0.6763534545898438),<br>\n",
+ "('romance', 0.6251884698867798),<br>\n",
+ "('dramedy', 0.6217159032821655),<br>\n",
+ "('melodrama', 0.6156137585639954),<br>\n",
+ "('adventure', 0.6091135740280151),<br>\n",
+ "('farce', 0.6034293174743652),<br>\n",
+ "('chiller', 0.5948368906974792),<br>\n",
+ "('romantic-comedy', 0.5876704454421997),<br>\n",
+ "('fantasy', 0.5863304138183594),<br>\n",
+ "('mystery/comedy', 0.577541708946228),<br>\n",
+ "('whodunit', 0.572147011756897),<br>\n",
+ "('biopic', 0.5679721832275391),<br>\n",
+ "('thriller/drama', 0.5630226731300354),<br>\n",
+ "('sitcom', 0.5574496984481812),<br>\n",
+ "('slash-fest', 0.5573585033416748),<br>\n",
+ "('mystery', 0.5542301535606384),<br>\n",
+ "('potboiler', 0.5519827604293823),<br>\n",
+ "('mockumentary', 0.5490710139274597)]</td><td>[('1000%', 0.42290645837783813),<br>\n",
+ "(\"gymnast's\", 0.4180164337158203),<br>\n",
+ "('hollywoodland', 0.3898555636405945),<br>\n",
+ "('cultures', 0.3857914209365845),<br>\n",
+ "('hooda', 0.3851744532585144),<br>\n",
+ "('cites', 0.38047513365745544),<br>\n",
+ "(\"78's\", 0.3792475461959839),<br>\n",
+ "(\"dormael's\", 0.3775535225868225),<br>\n",
+ "('jokester', 0.3725704252719879),<br>\n",
+ "('impelled', 0.36853262782096863),<br>\n",
+ "('lia', 0.3684236407279968),<br>\n",
+ "('snivelling', 0.3683513104915619),<br>\n",
+ "('astral', 0.36715900897979736),<br>\n",
+ "('euro-exploitation', 0.35853487253189087),<br>\n",
+ "(\"serra's\", 0.3578598201274872),<br>\n",
+ "('down-on-their-luck', 0.3576606214046478),<br>\n",
+ "('rowles', 0.3567575514316559),<br>\n",
+ "('romantica', 0.3549702763557434),<br>\n",
+ "('bonham-carter', 0.354231059551239),<br>\n",
+ "('1877', 0.3541453182697296)]</td><td>[('comedy-drama', 0.6274900436401367),<br>\n",
+ "('comedy', 0.5986765623092651),<br>\n",
+ "('thriller', 0.5765297412872314),<br>\n",
+ "('road-movie', 0.5615973472595215),<br>\n",
+ "('dramedy', 0.5580120086669922),<br>\n",
+ "('time-killer', 0.5497636795043945),<br>\n",
+ "('potboiler', 0.5456510782241821),<br>\n",
+ "('comedy/', 0.5439876317977905),<br>\n",
+ "('actioner', 0.5423712134361267),<br>\n",
+ "('diversion', 0.541743278503418),<br>\n",
+ "('romcom', 0.5402226448059082),<br>\n",
+ "('rom-com', 0.5358527302742004),<br>\n",
+ "('drama', 0.5320745706558228),<br>\n",
+ "('chiller', 0.5229591727256775),<br>\n",
+ "('romp', 0.5228806734085083),<br>\n",
+ "('horror/comedy', 0.5219299793243408),<br>\n",
+ "('weeper', 0.5195824503898621),<br>\n",
+ "('mockumentary', 0.5149033069610596),<br>\n",
+ "('camp-fest', 0.5122634768486023),<br>\n",
+ "('mystery/comedy', 0.5020694732666016)]</td></tr></table>"
+ ],
+ "metadata": {},
+ "output_type": "pyout",
+ "prompt_number": 17,
+ "text": [
+ ""
+ ]
+ }
+ ],
+ "prompt_number": 17
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Do the DBOW words look meaningless? That's because the gensim DBOW model doesn't train word vectors \u2013 they remain at their randomly initialized values \u2013 unless you request it with the `dbow_words=1` initialization parameter. Concurrent word-training slows DBOW mode significantly, and offers little improvement (and sometimes a little worsening) of the error rate on this IMDB sentiment-prediction task. \n",
+ "\n",
+ "Words from DM models tend to show meaningfully similar words when there are many examples in the training data (as with 'plot' or 'actor'). (All DM modes inherently involve word vector training concurrent with doc vector training.)"
+ ]
+ },
+ {
+ "cell_type": "heading",
+ "level": 3,
+ "metadata": {},
+ "source": [
+ "Are the word vectors from this dataset any good at analogies?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "# assuming something like\n",
+ "# https://word2vec.googlecode.com/svn/trunk/questions-words.txt \n",
+ "# is in local directory\n",
+ "# note: this takes many minutes\n",
+ "for model in word_models:\n",
+ " sections = model.accuracy('questions-words.txt')\n",
+ " correct, incorrect = (len(sum((s['correct'] for s in sections), [])), len(sum((s['incorrect'] for s in sections),[])))\n",
+ " print('%s: %0.2f%% correct (%d of %d)' % (model, float(correct*100)/(correct+incorrect), correct, correct+incorrect))"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": [
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "Doc2Vec(dm/c,d100,n5,w5,mc2,t8): 28.70% correct (5746 of 20024)\n",
+ "Doc2Vec(dbow,d100,n5,mc2,t8): 0.01% correct (2 of 20024)"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n",
+ "Doc2Vec(dm/m,d100,n5,w10,mc2,t8): 27.24% correct (5454 of 20024)"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "stream": "stdout",
+ "text": [
+ "\n"
+ ]
+ }
+ ],
+ "prompt_number": 26
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Even though this is a tiny, domain-specific dataset, it shows some meager capability on the general word analogies \u2013 at least for the DM/concat and DM/mean models which actually train word vectors. (The untrained random-initialized words of the DBOW model of course fail miserably.)"
+ ]
+ },
+ {
+ "cell_type": "heading",
+ "level": 2,
+ "metadata": {},
+ "source": [
+ "Slop"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "This cell left intentionally erroneous. "
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "To mix the Google dataset (if locally available) into the word tests..."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "from gensim.models import Word2Vec\n",
+ "w2v_g100b = Word2Vec.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)\n",
+ "w2v_g100b.compact_name = 'w2v_g100b'\n",
+ "word_models.append(w2v_g100b)"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "To get copious logging output from above steps..."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "import logging\n",
+ "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)\n",
+ "rootLogger = logging.getLogger()\n",
+ "rootLogger.setLevel(logging.INFO)"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "To auto-reload python code while developing..."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "collapsed": false,
+ "input": [
+ "%load_ext autoreload\n",
+ "%autoreload 2"
+ ],
+ "language": "python",
+ "metadata": {},
+ "outputs": []
+ }
+ ],
+ "metadata": {}
+ }
+ ]
+}
\ No newline at end of file
diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py
index 97696b8974..895546b107 100644
--- a/gensim/models/doc2vec.py
+++ b/gensim/models/doc2vec.py
@@ -14,7 +14,7 @@
Initialize a model with e.g.::
->>> model = Doc2Vec(sentences, size=100, window=8, min_count=5, workers=4)
+>>> model = Doc2Vec(documents, size=100, window=8, min_count=5, workers=4)
Persist a model to disk with::
@@ -42,131 +42,446 @@
except ImportError:
from Queue import Queue
-from numpy import zeros, random, sum as np_sum
-from six import string_types
+from collections import namedtuple
+
+from numpy import zeros, random, sum as np_sum, add as np_add, concatenate, \
+ repeat as np_repeat, array, float32 as REAL, empty, ones, memmap as np_memmap, \
+ sqrt, newaxis, ndarray, dot, argsort, vstack
logger = logging.getLogger(__name__)
-from gensim import utils # utility fnc for pickling, common scipy operations etc
-from gensim.models.word2vec import Word2Vec, Vocab, train_cbow_pair, train_sg_pair
+from gensim import utils, matutils # utility fnc for pickling, common scipy operations etc
+from gensim.models.word2vec import Word2Vec, Vocab, train_cbow_pair, train_sg_pair, train_sentence_sg
+from six.moves import xrange
+from six import string_types, integer_types
try:
- from gensim.models.doc2vec_inner import train_sentence_dbow, train_sentence_dm, FAST_VERSION
+ from gensim.models.doc2vec_inner import train_document_dbow, train_document_dm, train_document_dm_concat,\
+ FAST_VERSION
except:
# failed... fall back to plain numpy (20-80x slower training than the above)
FAST_VERSION = -1
- def train_sentence_dbow(model, sentence, lbls, alpha, work=None, train_words=True, train_lbls=True):
+ def train_document_dbow(model, word_vocabs, doctag_indexes, alpha, work=None,
+ train_words=False, learn_doctags=True, learn_words=True, learn_hidden=True,
+ word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None):
"""
- Update distributed bag of words model by training on a single sentence.
+ Update distributed bag of words model ("PV-DBOW") by training on a single document.
+
+ Called internally from `Doc2Vec.train()` and `Doc2Vec.infer_vector()`.
+
+ The document is provided as `word_vocabs`, a list of Vocab objects which provide
+ indexes into the word_vector array, and `doctag_indexes`, which provide indexes
+ into the doctag_vectors array. (See `_prepare_items()`.)
- The sentence is a list of Vocab objects (or None, where the corresponding
- word is not in the vocabulary. Called internally from `Doc2Vec.train()`.
+ If `train_words` is True, simultaneously train word-to-word (not just doc-to-word)
+ examples, exactly as per Word2Vec skip-gram training. (Without this option,
+ word vectors are neither consulted nor updated during DBOW doc vector training.)
+
+ Any of `learn_doctags`, `learn_words`, and `learn_hidden` may be set False to
+ prevent learning-updates to those respective model weights, as if using the
+ (partially-)frozen model to infer other compatible vectors.
+
+ `doctag_vectors` and `doctag_locks` default to the model's own
+ `docvecs.doctag_syn0` and `docvecs.doctag_syn0_lockf` arrays when None.
+ `word_vectors` and `word_locks` are accepted but not yet used by this Python
+ version (see the TODO on the `train_sentence_sg` call).
+
+ Returns the number of entries in `word_vocabs` that are not None.
+
This is the non-optimized, Python version. If you have cython installed, gensim
will use the optimized version from doc2vec_inner instead.
"""
- neg_labels = []
- if model.negative:
- # precompute negative labels
- neg_labels = zeros(model.negative + 1)
- neg_labels[0] = 1.0
-
- for label in lbls:
- if label is None:
- continue # OOV word in the input sentence => skip
- for word in sentence:
+ # fall back to the model's own doctag arrays when the caller supplies none
+ if doctag_vectors is None:
+ doctag_vectors = model.docvecs.doctag_syn0
+ if doctag_locks is None:
+ doctag_locks = model.docvecs.doctag_syn0_lockf
+
+ # optional word-to-word skip-gram pass; runs only when both flags are set
+ if train_words and learn_words:
+ train_sentence_sg(model, word_vocabs, alpha, work) # TODO: adapt for word_vectors/word_locks
+ # one skip-gram pair per (doctag, in-vocabulary word) combination
+ for doctag_index in doctag_indexes:
+ for word in word_vocabs:
if word is None:
- continue # OOV word in the input sentence => skip
- train_sg_pair(model, word, label, alpha, neg_labels, train_words, train_lbls)
+ continue # OOV word in the input document => skip
+ train_sg_pair(model, word, doctag_index, alpha, learn_vectors=learn_doctags,
+ learn_hidden=learn_hidden, context_vectors=doctag_vectors,
+ context_locks=doctag_locks)
- return len([word for word in sentence if word is not None])
+ return len([word for word in word_vocabs if word is not None])
- def train_sentence_dm(model, sentence, lbls, alpha, work=None, neu1=None, train_words=True, train_lbls=True):
+ def train_document_dm(model, word_vocabs, doctag_indexes, alpha, work=None, neu1=None,
+ learn_doctags=True, learn_words=True, learn_hidden=True,
+ word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None):
"""
- Update distributed memory model by training on a single sentence.
+ Update distributed memory model ("PV-DM") by training on a single document.
+
+ Called internally from `Doc2Vec.train()` and `Doc2Vec.infer_vector()`. This
+ method implements the DM model with a projection (input) layer that is
+ either the sum or mean of the context vectors, depending on the model's
+ `dm_mean` configuration field. See `train_document_dm_concat()` for the DM model
+ with a concatenated input layer.
- The sentence is a list of Vocab objects (or None, where the corresponding
- word is not in the vocabulary. Called internally from `Doc2Vec.train()`.
+ The document is provided as `word_vocabs`, a list of Vocab objects which provide
+ indexes into the word_vector array, and `doctag_indexes`, which provide indexes
+ into the doctag_vectors array. (See `_prepare_items()`.)
+
+ Any of `learn_doctags`, `learn_words`, and `learn_hidden` may be set False to
+ prevent learning-updates to those respective model weights, as if using the
+ (partially-)frozen model to infer other compatible vectors.
+
+ `word_vectors`/`word_locks` default to `model.syn0`/`model.syn0_lockf`, and
+ `doctag_vectors`/`doctag_locks` default to `model.docvecs.doctag_syn0`/
+ `model.docvecs.doctag_syn0_lockf`, when None. `work` and `neu1` are not used
+ by this Python version.
+
+ Returns the number of entries in `word_vocabs` that are not None.
+
This is the non-optimized, Python version. If you have a C compiler, gensim
will use the optimized version from doc2vec_inner instead.
"""
- lbl_indices = [lbl.index for lbl in lbls if lbl is not None]
- lbl_sum = np_sum(model.syn0[lbl_indices], axis=0)
- lbl_len = len(lbl_indices)
- neg_labels = []
- if model.negative:
- # precompute negative labels
- neg_labels = zeros(model.negative + 1)
- neg_labels[0] = 1.
-
- for pos, word in enumerate(sentence):
+ # fall back to the model's own arrays when the caller supplies none
+ if word_vectors is None:
+ word_vectors = model.syn0
+ if word_locks is None:
+ word_locks = model.syn0_lockf
+ if doctag_vectors is None:
+ doctag_vectors = model.docvecs.doctag_syn0
+ if doctag_locks is None:
+ doctag_locks = model.docvecs.doctag_syn0_lockf
+
+ # computed once, before the loop, and reused for every target word
+ doctag_sum = np_sum(doctag_vectors[doctag_indexes], axis=0)
+ doctag_len = len(doctag_indexes)
+
+ for pos, word in enumerate(word_vocabs):
if word is None:
- continue # OOV word in the input sentence => skip
+ continue # OOV word in the input document => skip
reduced_window = random.randint(model.window) # `b` in the original doc2vec code
start = max(0, pos - model.window + reduced_window)
- window_pos = enumerate(sentence[start : pos + model.window + 1 - reduced_window], start)
- word2_indices = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)]
- l1 = np_sum(model.syn0[word2_indices], axis=0) + lbl_sum # 1 x layer1_size
- if word2_indices and model.cbow_mean:
- l1 /= (len(word2_indices) + lbl_len)
- neu1e = train_cbow_pair(model, word, word2_indices, l1, alpha, neg_labels, train_words, train_words)
- if train_lbls:
- model.syn0[lbl_indices] += neu1e
+ window_pos = enumerate(word_vocabs[start : pos + model.window + 1 - reduced_window], start)
+ word2_indexes = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)]
+ l1 = np_sum(word_vectors[word2_indexes], axis=0) + doctag_sum # 1 x layer1_size
+ if word2_indexes and model.cbow_mean:
+ l1 /= (len(word2_indexes) + doctag_len)
+ # forward the caller's learn_hidden flag (it was previously hard-coded to True, so
+ # the hidden layer was updated even when the caller asked for it to stay frozen);
+ # learn_vectors=False because input-vector updates are applied below, lock-scaled
+ neu1e = train_cbow_pair(model, word, word2_indexes, l1, alpha, learn_vectors=False, learn_hidden=learn_hidden)
+ # sum mode: scale the error by the number of contributing vectors before applying it
+ if word2_indexes and not model.cbow_mean:
+ neu1e /= (len(word2_indexes) + doctag_len)
+ # each update is multiplied by the per-vector lock factor (a factor of 0.0 blocks it)
+ if learn_doctags:
+ doctag_vectors[doctag_indexes] += \
+ neu1e * np_repeat(doctag_locks[doctag_indexes],model.vector_size).reshape(-1,model.vector_size)
+ if learn_words:
+ word_vectors[word2_indexes] += \
+ neu1e * np_repeat(word_locks[word2_indexes],model.vector_size).reshape(-1,model.vector_size)
+
+ return len([word for word in word_vocabs if word is not None])
+
+
+ def train_document_dm_concat(model, word_vocabs, doctag_indexes, alpha, work=None, neu1=None,
+ learn_doctags=True, learn_words=True, learn_hidden=True,
+ word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None):
+ """
+ Update distributed memory model ("PV-DM") by training on a single document, using a
+ concatenation of the context window word vectors (rather than a sum or average).
- return len([word for word in sentence if word is not None])
+ Called internally from `Doc2Vec.train()` and `Doc2Vec.infer_vector()`.
+ The document is provided as `word_vocabs`, a list of Vocab objects which provide
+ indexes into the word_vector array, and `doctag_indexes`, which provide indexes
+ into the doctag_vectors array. (See `_prepare_items()`.)
-class LabeledSentence(object):
+ Any of `learn_doctags`, `learn_words`, and `learn_hidden` may be set False to
+ prevent learning-updates to those respective model weights, as if using the
+ (partially-)frozen model to infer other compatible vectors.
+
+ `word_vectors`/`word_locks` default to `model.syn0`/`model.syn0_lockf`, and
+ `doctag_vectors`/`doctag_locks` default to `model.docvecs.doctag_syn0`/
+ `model.docvecs.doctag_syn0_lockf`, when None. `work` and `neu1` are not used
+ by this Python version.
+
+ Returns 0 without training when the number of doctags differs from
+ `model.dm_tag_count`; otherwise returns the number of in-vocabulary words
+ (the padded length minus both pads).
+
+ This is the non-optimized, Python version. If you have a C compiler, gensim
+ will use the optimized version from doc2vec_inner instead.
+
+ """
+ # fall back to the model's own arrays when the caller supplies none
+ if word_vectors is None:
+ word_vectors = model.syn0
+ if word_locks is None:
+ word_locks = model.syn0_lockf
+ if doctag_vectors is None:
+ doctag_vectors = model.docvecs.doctag_syn0
+ if doctag_locks is None:
+ doctag_locks = model.docvecs.doctag_syn0_lockf
+
+ doctag_len = len(doctag_indexes)
+ if doctag_len != model.dm_tag_count:
+ return 0 # skip doc without expected doctag(s)
+
+ # pad both ends with the '\0' vocabulary entry so every position has
+ # exactly `model.window` context slots on each side
+ null_word = model.vocab['\0']
+ pre_pad_count = model.window
+ post_pad_count = model.window
+ padded_document_indexes = (
+ (pre_pad_count * [null_word.index]) # pre-padding
+ + [word.index for word in word_vocabs if word is not None] # elide out-of-Vocabulary words
+ + (post_pad_count * [null_word.index]) # post-padding
+ )
+
+ # visit only the real (non-pad) positions
+ for pos in range(pre_pad_count, len(padded_document_indexes) - post_pad_count):
+ word_context_indexes = (
+ padded_document_indexes[pos - pre_pad_count : pos] # preceding words
+ + padded_document_indexes[pos + 1 : pos + 1 + post_pad_count] # following words
+ )
+ word_context_len = len(word_context_indexes) # NOTE(review): not read again in this function
+ predict_word = model.vocab[model.index2word[padded_document_indexes[pos]]]
+ # numpy advanced-indexing copies; concatenate, flatten to 1d
+ l1 = concatenate((doctag_vectors[doctag_indexes], word_vectors[word_context_indexes])).ravel()
+ neu1e = train_cbow_pair(model, predict_word, None, l1, alpha, learn_hidden=learn_hidden, learn_vectors=False)
+
+ # filter by locks and shape for addition to source vectors
+ e_locks = concatenate((doctag_locks[doctag_indexes], word_locks[word_context_indexes]))
+ neu1e_r = (neu1e.reshape(-1,model.vector_size)
+ * np_repeat(e_locks,model.vector_size).reshape(-1,model.vector_size))
+
+ # the first doctag_len rows of neu1e_r go to the doctags, the rest to the context words
+ if learn_doctags:
+ np_add.at(doctag_vectors, doctag_indexes, neu1e_r[:doctag_len])
+ if learn_words:
+ np_add.at(word_vectors, word_context_indexes, neu1e_r[doctag_len:])
+
+ return len(padded_document_indexes) - pre_pad_count - post_pad_count
+
+
+class TaggedDocument(namedtuple('TaggedDocument','words tags')):
"""
- A single labeled sentence = text item.
+    One training document: `words` is a list of unicode string tokens and
+    `tags` is a list of tags for that document. Tags may be unicode strings,
+    but the typical (and most memory-efficient) practice is for the list to
+    hold a single unique integer id.
+
Replaces "sentence as a list of words" from Word2Vec.
"""
- def __init__(self, words, labels):
+    def __str__(self):
+        # the tuple itself is (words, tags), so it can be spliced straight into the format args
+        return '%s(%s, %s)' % ((type(self).__name__,) + tuple(self))
+
+
+class DocvecsArray(utils.SaveLoad):
+ """
+ Default storage of doc vectors during/after training, in a numpy array.
+
+ As the 'docvecs' property of a Doc2Vec model, allows access and
+ comparison of document vectors.
+
+ >>> docvec = d2v_model.docvecs[99]
+ >>> docvec = d2v_model.docvecs['SENT_99'] # if string tag used in training
+ >>> sims = d2v_model.docvecs.most_similar(99)
+ >>> sims = d2v_model.docvecs.most_similar('SENT_99')
+ >>> sims = d2v_model.docvecs.most_similar([docvec])
+
+ If only plain int tags are presented during training, the dict (of
+ string tag -> index) and list (of index -> string tag) stay empty,
+ saving memory.
+
+ Supplying a mapfile_path (as by initializing a Doc2Vec model with a
+ 'docvecs_mapfile' value) will use a pair of memory-mapped
+ files as the array backing for doctag_syn0/doctag_syn0_lockf values.
+
+ The Doc2Vec model automatically uses this class, but a future alternative
+ implementation, based on another persistence mechanism like LMDB, LevelDB,
+ or SQLite, should also be possible.
+ """
+ def __init__(self, mapfile_path=None):
+     """Start with no known tags; the vector arrays are created later by `reset_weights()`.
+
+     `mapfile_path`, when given, is the filename prefix `reset_weights()` uses for
+     memory-mapped backing files instead of in-memory arrays.
+     """
+     self.mapfile_path = mapfile_path
+     self.count = -1
+     # both stay empty unless string tags are seen (see note_doctag)
+     self.doctags, self.index2doctag = {}, []
+
+ def note_doctag(self, key, document_no, document_length):
+     """Note a document tag during initial corpus scan, for structure sizing.
+
+     Plain int keys are used directly as row numbers, so only `count` grows to
+     cover them. Any other key is a string tag: the first sighting appends it to
+     `index2doctag` and records a `Doctag`; later sightings add to its word and
+     document tallies. `document_no` is accepted for interface compatibility but
+     is not used to place the tag.
+     """
+     if isinstance(key, int):
+         self.count = max(self.count, key+1)
+     elif key in self.doctags:
+         self.doctags[key] = self.doctags[key].repeat(document_length)
+     else:
+         # The tag's row must be its position in index2doctag, which is what
+         # _key_index() and reset_weights() assume. Using the document number
+         # gives two tags of one document the same row, and points past the
+         # array once any tag repeats across documents.
+         self.doctags[key] = Doctag(len(self.index2doctag), document_length, 1)
+         self.index2doctag.append(key)
+         self.count = max(self.count, len(self.index2doctag))
+
+ def indexed_doctags(self, doctag_tokens):
+     """Return indexes and backing-arrays used in training examples.
+
+     The result is `(indexes, doctag_syn0, doctag_syn0_lockf, doctag_tokens)`;
+     tokens that resolve to no index (or to a negative one) are left out of `indexes`.
+     """
+     known_indexes = []
+     for token in doctag_tokens:
+         row = self._int_index(token, -1)
+         if row > -1:
+             known_indexes.append(row)
+     return (known_indexes, self.doctag_syn0, self.doctag_syn0_lockf, doctag_tokens)
+
+ def trained_items(self, indexed_tuples):
+     """Persist any changes made to the given indexes (tuples previously returned
+     by `indexed_doctags()`).
+
+     Nothing needs doing in this implementation, so this is a no-op.
+     """
+     return None
+
+ def _int_index(self, index, missing=None):
+     """Return int index for either string or int index.
+
+     Ints are returned unchanged; any other key is looked up in `self.doctags`,
+     with `missing` returned when it is not there.
+     """
+     if not isinstance(index, int):
+         known = self.doctags
+         return known[index].index if index in known else missing
+     return index
+
+ def _key_index(self, i_index, missing=None):
+     """Return string index for given int index, if available.
+
+     When `i_index` is beyond the known string tags, `i_index` itself is returned.
+     `missing` is accepted but not used.
+     """
+     tags = self.index2doctag
+     return tags[i_index] if i_index < len(tags) else i_index
+
+ def __getitem__(self, index):
+     """Return the vector for an int index or a string tag.
+
+     Raises KeyError for a string tag that was never noted during the corpus scan.
+     """
+     row = self._int_index(index)
+     if row is None:
+         # numpy reads a None index as `newaxis`, so without this check an unknown
+         # tag would silently return the whole array with an extra leading axis
+         raise KeyError("doc '%s' not in trained set" % index)
+     return self.doctag_syn0[row]
+
+ def __contains__(self, index):
+     """Return True for a noted string tag, or for an int index in `[0, count)`."""
+     if isinstance(index, int):
+         # negative ints are not document indexes; same bound as doesnt_match() applies
+         return 0 <= index < self.count
+     return index in self.doctags
+
+ def borrow_from(self, other_docvecs):
+     """Share `other_docvecs`' tag bookkeeping (count, tag dict, tag list) by reference."""
+     for shared in ('count', 'doctags', 'index2doctag'):
+         setattr(self, shared, getattr(other_docvecs, shared))
+
+ def clear_sims(self):
+     """Forget the cached L2-normalized vectors; `init_sims()` rebuilds them when it finds none."""
+     setattr(self, 'doctag_syn0norm', None)
+
+ def reset_weights(self, model):
+     """Allocate the doctag vector and lock arrays and seed every vector deterministically.
+
+     With `mapfile_path` set, both arrays are memory-mapped files created under that
+     prefix; otherwise they are plain in-memory arrays. Every lock starts at 1.0.
+     Each row is filled with `model.seeded_vector()` of a seed built from
+     `model.seed` and the row's string tag, or the row number where no tag exists.
+     """
+     length = max(len(self.doctags),self.count)
+     if self.mapfile_path:
+         self.doctag_syn0 = np_memmap(self.mapfile_path+'.doctag_syn0',dtype=REAL,mode='w+',shape=(length,model.vector_size))
+         self.doctag_syn0_lockf = np_memmap(self.mapfile_path+'.doctag_syn0_lockf',dtype=REAL,mode='w+',shape=(length,))
+         self.doctag_syn0_lockf.fill(1.0)
+     else:
+         self.doctag_syn0 = empty((length, model.vector_size), dtype=REAL)
+         self.doctag_syn0_lockf = ones((length,), dtype=REAL) # zeros suppress learning
+
+     named = len(self.index2doctag)
+     for i in xrange(length):
+         # construct deterministic seed from index AND model seed; when int tags
+         # reach past the string tags there is no name for the row, so use its
+         # number instead of indexing beyond the end of index2doctag
+         tag = self.index2doctag[i] if i < named else str(i)
+         seed = "%d %s" % (model.seed, tag)
+         self.doctag_syn0[i] = model.seeded_vector(seed)
+
+ def init_sims(self, replace=False):
"""
- `words` is a list of tokens (unicode strings),
- `labels` a list of text labels associated with this text
- or a single string label.
+     Precompute L2-normalized doc vectors into `doctag_syn0norm`.
+     Does nothing when they are already cached, unless `replace` is set. With
+     `replace`, the original vectors are normalized in place and only the
+     normalized ones are kept = saves lots of memory!
+
+     Note that you **cannot continue training** after doing a replace. The model becomes
+     effectively read-only = you can call `most_similar`, `similarity` etc., but not `train`.
+
+     """
+     if getattr(self, 'doctag_syn0norm', None) is not None and not replace:
+         return
+     logger.info("precomputing L2-norms of doc weight vectors")
+     if not replace:
+         norms = sqrt((self.doctag_syn0 ** 2).sum(-1))[..., newaxis]
+         self.doctag_syn0norm = (self.doctag_syn0 / norms).astype(REAL)
+         return
+     # row by row, so no second full-size array is built
+     for row in xrange(self.doctag_syn0.shape[0]):
+         self.doctag_syn0[row, :] /= sqrt((self.doctag_syn0[row, :] ** 2).sum(-1))
+     self.doctag_syn0norm = self.doctag_syn0
+
+ def most_similar(self, positive=[], negative=[], topn=10):
"""
- if isinstance(labels, string_types):
- labels = (labels,)
- self.words = words
- self.labels = labels
+     Find the top-N most similar docvecs known from training. Positive docs contribute
+     positively towards the similarity, negative docs negatively.
+
+     This method computes cosine similarity between a simple mean of the projection
+     weight vectors of the given docs. Docs may be specified as vectors, integer indexes
+     of trained docvecs, or if the documents were originally presented with string tags,
+     by the corresponding tags. Each entry may also be a `(doc, weight)` pair.
+
+     A single tag, index or 1-d float vector may be passed as `positive` on its own
+     when `negative` is empty. Returns a list of `(tag or index, similarity)` pairs
+     that excludes the input docs, or the raw similarity array when `topn` is falsy.
+     Raises KeyError for a doc that is not in the trained set and ValueError when
+     no docs are given. The list defaults are never mutated.
+     """
+     self.init_sims()
+
+     # allow calls like most_similar('dog') or most_similar(vector), as a shorthand for
+     # most_similar(['dog']); only float arrays count as one vector, so an integer
+     # array is still treated as a collection of indexes
+     is_single = (isinstance(positive, string_types + integer_types)
+                  or (isinstance(positive, ndarray) and positive.ndim == 1 and positive.dtype.kind == 'f'))
+     if is_single and not negative:
+         positive = [positive]
+
+     # add weights for each doc, if not already present; default to 1.0 for positive and -1.0 for negative docs
+     positive = [(doc, 1.0) if isinstance(doc, string_types + (ndarray,) + integer_types)
+                 else doc for doc in positive]
+     negative = [(doc, -1.0) if isinstance(doc, string_types + (ndarray,) + integer_types)
+                 else doc for doc in negative]
+
+     # compute the weighted average of all docs
+     all_docs, mean = set(), []
+     for doc, weight in positive + negative:
+         if isinstance(doc, ndarray):
+             mean.append(weight * doc)
+         # strings are only ever looked up as tags: comparing an unknown tag with the int
+         # count raises TypeError on Python 3 instead of the KeyError below. Negative
+         # ints are rejected because they could not be excluded from the results.
+         elif doc in self.doctags or (not isinstance(doc, string_types) and 0 <= doc < self.count):
+             mean.append(weight * self.doctag_syn0norm[self._int_index(doc)])
+             all_docs.add(self._int_index(doc))
+         else:
+             raise KeyError("doc '%s' not in trained set" % doc)
+     if not mean:
+         raise ValueError("cannot compute similarity with no input")
+     mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL)
+
+     dists = dot(self.doctag_syn0norm, mean)
+     if not topn:
+         return dists
+     # take extra candidates so topn remain after the input docs are dropped
+     best = argsort(dists)[::-1][:topn + len(all_docs)]
+     # ignore (don't return) docs from the input
+     result = [(self._key_index(sim), float(dists[sim])) for sim in best if sim not in all_docs]
+     return result[:topn]
+
+ def doesnt_match(self, docs):
+     """
+     Which doc from the given list doesn't go with the others?
+
+     `docs` holds int indexes and/or string tags; entries unknown to the trained
+     set are ignored. Returns the entry whose normalized vector is least similar
+     to the mean of all of them. Raises ValueError if no known entry remains.
+
+     (TODO: Accept vectors of out-of-training-set docs, as if from inference.)
+
+     """
+     self.init_sims()
+
+     # filter out unknowns; strings are only looked up as tags, since comparing an
+     # unknown tag with the int count raises TypeError on Python 3
+     docs = [doc for doc in docs
+             if doc in self.doctags or (not isinstance(doc, string_types) and 0 <= doc < self.count)]
+     logger.debug("using docs %s", docs)
+     if not docs:
+         raise ValueError("cannot select a doc from an empty list")
+     # vstack expects a sequence of arrays; passing a generator is deprecated in numpy
+     vectors = vstack([self.doctag_syn0norm[self._int_index(doc)] for doc in docs]).astype(REAL)
+     mean = matutils.unitvec(vectors.mean(axis=0)).astype(REAL)
+     dists = dot(vectors, mean)
+     return sorted(zip(dists, docs))[0][1]
+
+ def similarity(self, d1, d2):
+     """
+     Cosine similarity between two docvecs of the trained set, each given as an int
+     index or a string tag. (TODO: Accept vectors of out-of-training-set docs, as if
+     from inference.)
+
+     """
+     first, second = self[d1], self[d2]
+     return dot(matutils.unitvec(first), matutils.unitvec(second))
+
+ def n_similarity(self, ds1, ds2):
+     """
+     Cosine similarity between the mean vectors of two sets of docvecs from the trained
+     set, each doc given as an int index or a string tag. (TODO: Accept vectors of
+     out-of-training-set docs, as if from inference.)
+
+     """
+     mean1 = array([self[doc] for doc in ds1]).mean(axis=0)
+     mean2 = array([self[doc] for doc in ds2]).mean(axis=0)
+     return dot(matutils.unitvec(mean1), matutils.unitvec(mean2))
- def __str__(self):
- return '%s(%s, %s)' % (self.__class__.__name__, self.words, self.labels)
+
+class Doctag(namedtuple('Doctag', 'index, word_count, doc_count')):
+    """Record kept for a string document tag found during the initial
+    vocabulary scan: its index plus running word and document tallies.
+    (The document-vector equivalent of a Vocab object.)
+
+    Will not be used if all presented document tags are ints.
+    """
+    __slots__ = ()
+    def repeat(self, word_count):
+        """Return a copy counting one more document that adds `word_count` words."""
+        return self._make((self.index, self.word_count + word_count, self.doc_count + 1))
class Doc2Vec(Word2Vec):
"""Class for training, using and evaluating neural networks described in http://arxiv.org/pdf/1405.4053v2.pdf"""
- def __init__(self, sentences=None, size=300, alpha=0.025, window=8, min_count=5,
+ def __init__(self, documents=None, size=300, alpha=0.025, window=8, min_count=5,
sample=0, seed=1, workers=1, min_alpha=0.0001, dm=1, hs=1, negative=0,
- dm_mean=0, train_words=True, train_lbls=True, **kwargs):
+ dbow_words=0, dm_mean=0, dm_concat=0, dm_tag_count=1,
+ docvecs=None, docvecs_mapfile=None, comment=None, **kwargs):
"""
- Initialize the model from an iterable of `sentences`. Each sentence is a
- LabeledSentence object that will be used for training.
+ Initialize the model from an iterable of `documents`. Each document is a
+ TaggedDocument object that will be used for training.
- The `sentences` iterable can be simply a list of LabeledSentence elements, but for larger corpora,
- consider an iterable that streams the sentences directly from disk/network.
+ The `documents` iterable can be simply a list of TaggedDocument elements, but for larger corpora,
+ consider an iterable that streams the documents directly from disk/network.
- If you don't supply `sentences`, the model is left uninitialized -- use if
+ If you don't supply `documents`, the model is left uninitialized -- use if
you plan to initialize it in some other way.
- `dm` defines the training algorithm. By default (`dm=1`), distributed memory is used.
- Otherwise, `dbow` is employed.
+ `dm` defines the training algorithm. By default (`dm=1`), 'distributed memory' (PV-DM) is used.
+ Otherwise, `distributed bag of words` (PV-DBOW) is employed.
`size` is the dimensionality of the feature vectors.
- `window` is the maximum distance between the current and predicted word within a sentence.
+ `window` is the maximum distance between the predicted word and context words used for prediction
+ within a document.
`alpha` is the initial learning rate (will linearly drop to zero as training progresses).
- `seed` = for the random number generator.
+ `seed` = for the random number generator. Only runs with a single worker will be
+ deterministically reproducible because of the ordering randomness in multi-threaded runs.
`min_count` = ignore all words with total frequency lower than this.
@@ -181,68 +496,175 @@ def __init__(self, sentences=None, size=300, alpha=0.025, window=8, min_count=5,
specifies how many "noise words" should be drawn (usually between 5-20).
`dm_mean` = if 0 (default), use the sum of the context word vectors. If 1, use the mean.
- Only applies when dm is used.
+ Only applies when dm is used in non-concatenative mode.
+
+ `dm_concat` = if 1, use concatenation of context vectors rather than sum/average;
+ default is 0 (off). Note concatenation results in a much-larger model, as the input
+ is no longer the size of one (sampled or arithmetically combined) word vector, but the
+ size of the tag(s) and all words in the context strung together.
+
+ `dm_tag_count` = expected constant number of document tags per document, when using
+ dm_concat mode; default is 1.
+
+ `dbow_words` if set to 1 trains word-vectors (in skip-gram fashion) simultaneous with DBOW
+ doc-vector training; default is 0 (faster training of doc-vectors only).
"""
Word2Vec.__init__(self, size=size, alpha=alpha, window=window, min_count=min_count,
sample=sample, seed=seed, workers=workers, min_alpha=min_alpha,
- sg=(1+dm) % 2, hs=hs, negative=negative, cbow_mean=dm_mean, **kwargs)
- self.train_words = train_words
- self.train_lbls = train_lbls
- if sentences is not None:
- self.build_vocab(sentences)
- self.train(sentences)
-
- @staticmethod
- def _vocab_from(sentences):
- sentence_no, vocab = -1, {}
+ sg=(1+dm) % 2, hs=hs, negative=negative, cbow_mean=dm_mean,
+ null_word=dm_concat, **kwargs)
+ self.dbow_words = dbow_words
+ self.dm_concat = dm_concat
+ self.dm_tag_count = dm_tag_count
+ self.docvecs = docvecs
+ if not self.docvecs:
+ self.docvecs = DocvecsArray(docvecs_mapfile)
+ self.comment = comment
+ if documents is not None:
+ self.build_vocab(documents)
+ self.train(documents)
+
+ def clear_sims(self):
+     """Clear cached similarity data: first the Word2Vec part, then the doc vectors' cache."""
+     super(Doc2Vec, self).clear_sims()
+     self.docvecs.clear_sims()
+
+ def reset_weights(self):
+     """Reset the word weights via Word2Vec, then the doc vectors via `self.docvecs`.
+
+     In `dm_concat` mode `layer1_size` is first widened to hold the concatenation
+     of the doctag vector(s) and the `2 * window` context word vectors.
+     """
+     if self.dm_concat:
+         # expand l1 size to match concatenated tags+words length
+         slots = self.dm_tag_count + (2 * self.window)
+         self.layer1_size = slots * self.vector_size
+         logger.info("using concatenative %d-dimensional layer1", self.layer1_size)
+     super(Doc2Vec, self).reset_weights()
+     self.docvecs.reset_weights(self)
+
+ def reset_from(self, other_model):
+     """Reuse shareable structures from other_model: its doctag bookkeeping first,
+     then whatever `Word2Vec.reset_from()` shares."""
+     donor_docvecs = other_model.docvecs
+     self.docvecs.borrow_from(donor_docvecs)
+     super(Doc2Vec, self).reset_from(other_model)
+
+ def _vocab_from(self, documents):
+ """Scan `documents` once: count word frequencies into a new vocab dict (returned) and register every document tag with `self.docvecs`."""
+ # document_no starts at -1 so that an empty corpus logs 0 documents at the end
+ document_no, vocab = -1, {}
total_words = 0
- for sentence_no, sentence in enumerate(sentences):
- if sentence_no % 10000 == 0:
- logger.info("PROGRESS: at item #%i, processed %i words and %i word types" %
- (sentence_no, total_words, len(vocab)))
- sentence_length = len(sentence.words)
- for label in sentence.labels:
- total_words += 1
- if label in vocab:
- vocab[label].count += sentence_length
- else:
- vocab[label] = Vocab(count=sentence_length)
- for word in sentence.words:
+ for document_no, document in enumerate(documents):
+ if document_no % 10000 == 0:
+ logger.info("PROGRESS: at document #%i, processed %i words and %i word types" %
+ (document_no, total_words, len(vocab)))
+ document_length = len(document.words)
+ # tags are not added to the word vocab; docvecs sizes its own structures from them
+ for tag in document.tags:
+ self.docvecs.note_doctag(tag, document_no, document_length)
+ for word in document.words:
total_words += 1
if word in vocab:
vocab[word].count += 1
else:
vocab[word] = Vocab(count=1)
- logger.info("collected %i word types from a corpus of %i words and %i items" %
- (len(vocab), total_words, sentence_no + 1))
+ logger.info("collected %i word types from a corpus of %i words and %i documents" %
+ (len(vocab), total_words, document_no + 1))
return vocab
- def _prepare_sentences(self, sentences):
- for sentence in sentences:
- # avoid calling random_sample() where prob >= 1, to speed things up a little:
- sampled = [self.vocab[word] for word in sentence.words
- if word in self.vocab and (self.vocab[word].sample_probability >= 1.0 or
- self.vocab[word].sample_probability >= random.random_sample())]
- yield (sampled, [self.vocab[word] for word in sentence.labels if word in self.vocab])
+ def _prepare_items(self, documents):
+     """Lazily turn each document into a training item: the Vocab objects for its
+     words, paired with the tuple `self.docvecs.indexed_doctags()` returns for its tags."""
+     for doc in documents:
+         word_vocabs = self._tokens_to_vocabs(doc.words)
+         doctag_info = self.docvecs.indexed_doctags(doc.tags)
+         yield word_vocabs, doctag_info
+
+ def _tokens_to_vocabs(self, tokens, sample=True, source_dict=None):
+     """Convert list of tokens to items (Vocabs) from source_dict.
+
+     `source_dict` defaults to `self.vocab`; tokens absent from it are dropped.
+     With `sample` set, an item is kept only if its `sample_probability` is at
+     least 1.0 or at least a freshly drawn random number.
+     """
+     lookup = self.vocab if source_dict is None else source_dict
+     known = (lookup[token] for token in tokens if token in lookup)
+     if not sample:
+         return list(known)
+     kept = []
+     for item in known:
+         keep_prob = item.sample_probability
+         # short-circuit: a random number is drawn only when keep_prob is below 1.0
+         if keep_prob >= 1.0 or keep_prob >= random.random_sample():
+             kept.append(item)
+     return kept
def _get_job_words(self, alpha, work, job, neu1):
+ """Train on every item in `job` with the routine matching the model's mode and return the sum of the per-document results."""
+ # each job item is (word_vocabs, tuple from docvecs.indexed_doctags()); the tuple's last member is not needed here
if self.sg:
- return sum(train_sentence_dbow(self, sentence, lbls, alpha, work, self.train_words, self.train_lbls) for sentence, lbls in job)
+ tally = sum(train_document_dbow(self, word_vocabs, doctag_indexes, alpha, work, train_words=self.dbow_words,
+ doctag_vectors=doctag_vectors, doctag_locks=doctag_locks)
+ for word_vocabs, (doctag_indexes, doctag_vectors, doctag_locks, ignored) in job)
+ elif self.dm_concat:
+ tally = sum(train_document_dm_concat(self, word_vocabs, doctag_indexes, alpha, work, neu1,
+ doctag_vectors=doctag_vectors, doctag_locks=doctag_locks)
+ for word_vocabs, (doctag_indexes, doctag_vectors, doctag_locks, ignored) in job)
else:
- return sum(train_sentence_dm(self, sentence, lbls, alpha, work, neu1, self.train_words, self.train_lbls) for sentence, lbls in job)
-
- def __str__(self):
- return "Doc2Vec(vocab=%s, size=%s, alpha=%s)" % (len(self.index2word), self.layer1_size, self.alpha)
+ tally = sum(train_document_dm(self, word_vocabs, doctag_indexes, alpha, work, neu1,
+ doctag_vectors=doctag_vectors, doctag_locks=doctag_locks)
+ for word_vocabs, (doctag_indexes, doctag_vectors, doctag_locks, ignored) in job)
+ # hand the doctag tuples back to docvecs so it can persist any changes made to them
+ # NOTE(review): job is iterated a second time here, so it is presumably a list rather than a one-shot iterator; verify against caller
+ self.docvecs.trained_items(item for s, item in job)
+ return tally
- def save(self, *args, **kwargs):
- kwargs['ignore'] = kwargs.get('ignore', ['syn0norm']) # don't bother storing the cached normalized vectors
- super(Doc2Vec, self).save(*args, **kwargs)
+ def infer_vector(self, document, alpha=0.1, min_alpha=0.0001, steps=5):
+     """
+     Infer a vector for given post-bulk training document.
+     Document should be a list of (word) tokens.
+
+     A fresh doc vector, seeded from the document's text, is trained for `steps`
+     passes with word and hidden weights frozen, so only that vector changes.
+     The learning rate falls linearly from `alpha` on the first pass to
+     `min_alpha` on the last. Returns the inferred vector.
+     """
+     doctag_vectors = empty((1, self.vector_size), dtype=REAL)
+     doctag_vectors[0] = self.seeded_vector(' '.join(document))
+     doctag_locks = ones(1, dtype=REAL)
+     doctag_indexes = [0]
+     word_vocabs = self._tokens_to_vocabs(document)
+
+     work = zeros(self.layer1_size, dtype=REAL)
+     # neu1 is only used by the PV-DM routines; bind it in every mode so it is never undefined
+     neu1 = None if self.sg else matutils.zeros_aligned(self.layer1_size, dtype=REAL)
+
+     # Fixed per-pass decrement. The previous update divided the remaining gap by
+     # (steps - i), which collapses the rate after the first pass and never reaches
+     # min_alpha. max() guards the single-pass case against division by zero.
+     alpha_delta = (alpha - min_alpha) / max(steps - 1, 1)
+
+     for i in range(steps):
+         if self.sg:
+             train_document_dbow(self, word_vocabs, doctag_indexes, alpha, work,
+                                 learn_words=False, learn_hidden=False,
+                                 doctag_vectors=doctag_vectors, doctag_locks=doctag_locks)
+         elif self.dm_concat:
+             train_document_dm_concat(self, word_vocabs, doctag_indexes, alpha, work, neu1,
+                                      learn_words=False, learn_hidden=False,
+                                      doctag_vectors=doctag_vectors, doctag_locks=doctag_locks)
+         else:
+             train_document_dm(self, word_vocabs, doctag_indexes, alpha, work, neu1,
+                               learn_words=False, learn_hidden=False,
+                               doctag_vectors=doctag_vectors, doctag_locks=doctag_locks)
+         alpha -= alpha_delta
+
+     return doctag_vectors[0]
-class LabeledBrownCorpus(object):
- """Iterate over sentences from the Brown corpus (part of NLTK data), yielding
- each sentence out as a LabeledSentence object."""
+ def __str__(self):
+     """Abbreviated name reflecting major configuration parameters."""
+     if self.sg:
+         # PV-DBOW (skip-gram-style), optionally also training words
+         mode = 'dbow+w' if self.dbow_words else 'dbow'
+     elif self.dm_concat:
+         mode = 'dm/c'  # PV-DM with concatenative context layer
+     else:
+         mode = 'dm/m' if self.cbow_mean else 'dm/s'
+
+     parts = ['"%s"' % self.comment] if self.comment else []
+     parts += [mode, 'd%d' % self.vector_size]  # dimensions
+     if self.negative:
+         parts.append('n%d' % self.negative)  # negative samples
+     if self.hs:
+         parts.append('hs')
+     if not self.sg or self.dbow_words:
+         parts.append('w%d' % self.window)  # window size, when relevant
+     if self.min_count > 1:
+         parts.append('mc%d' % self.min_count)
+     if self.sample > 0:
+         parts.append('s%E' % self.sample)
+     if self.workers > 1:
+         parts.append('t%d' % self.workers)
+     return 'Doc2Vec(%s)' % ','.join(parts)
+
+
+class TaggedBrownCorpus(object):
+ """Iterate over documents from the Brown corpus (part of NLTK data), yielding
+ each document out as a TaggedDocument object."""
def __init__(self, dirname):
self.dirname = dirname
@@ -253,33 +675,33 @@ def __iter__(self):
continue
for item_no, line in enumerate(utils.smart_open(fname)):
line = utils.to_unicode(line)
- # each file line is a single sentence in the Brown corpus
+ # each file line is a single document in the Brown corpus
# each token is WORD/POS_TAG
token_tags = [t.split('/') for t in line.split() if len(t.split('/')) == 2]
# ignore words with non-alphabetic tags like ",", "!" etc (punctuation, weird stuff)
words = ["%s/%s" % (token.lower(), tag[:2]) for token, tag in token_tags if tag[:2].isalpha()]
- if not words: # don't bother sending out empty sentences
+ if not words: # don't bother sending out empty documents
continue
- yield LabeledSentence(words, ['%s_SENT_%s' % (fname, item_no)])
+ yield TaggedDocument(words, ['%s_SENT_%s' % (fname, item_no)])
-class LabeledLineSentence(object):
- """Simple format: one sentence = one line = one LabeledSentence object.
+class TaggedLineDocument(object):
+ """Simple format: one document = one line = one TaggedDocument object.
Words are expected to be already preprocessed and separated by whitespace,
- labels are constructed automatically from the sentence line number."""
+ tags are constructed automatically from the document line number."""
def __init__(self, source):
"""
`source` can be either a string (filename) or a file object.
Example::
- sentences = LineSentence('myfile.txt')
+ documents = TaggedLineDocument('myfile.txt')
Or for compressed files::
- sentences = LineSentence('compressed_text.txt.bz2')
- sentences = LineSentence('compressed_text.txt.gz')
+ documents = TaggedLineDocument('compressed_text.txt.bz2')
+ documents = TaggedLineDocument('compressed_text.txt.gz')
"""
self.source = source
@@ -291,9 +713,9 @@ def __iter__(self):
# Things that don't have seek will trigger an exception
self.source.seek(0)
for item_no, line in enumerate(self.source):
- yield LabeledSentence(utils.to_unicode(line).split(), ['SENT_%s' % item_no])
+ yield TaggedDocument(utils.to_unicode(line).split(), [item_no])
except AttributeError:
# If it didn't work like a file, use it as a string filename
with utils.smart_open(self.source) as fin:
for item_no, line in enumerate(fin):
- yield LabeledSentence(utils.to_unicode(line).split(), ['SENT_%s' % item_no])
+ yield TaggedDocument(utils.to_unicode(line).split(), [item_no])
diff --git a/gensim/models/doc2vec_inner.c b/gensim/models/doc2vec_inner.c
index 48bdfbaf75..65a888fc56 100644
--- a/gensim/models/doc2vec_inner.c
+++ b/gensim/models/doc2vec_inner.c
@@ -648,12 +648,12 @@ typedef npy_double __pyx_t_5numpy_double_t;
*/
typedef npy_longdouble __pyx_t_5numpy_longdouble_t;
-/* "trunk/gensim/models/doc2vec_inner.pyx":23
+/* "trunk/gensim/models/doc2vec_inner.pyx":24
*
* REAL = np.float32
* ctypedef np.float32_t REAL_t # <<<<<<<<<<<<<<
*
- * DEF MAX_SENTENCE_LEN = 10000
+ * DEF MAX_DOCUMENT_LEN = 10000
*/
typedef __pyx_t_5numpy_float32_t __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t;
#if CYTHON_CCOMPLEX
@@ -715,8 +715,8 @@ typedef npy_clongdouble __pyx_t_5numpy_clongdouble_t;
*/
typedef npy_cdouble __pyx_t_5numpy_complex_t;
-/* "trunk/gensim/models/doc2vec_inner.pyx":27
- * DEF MAX_SENTENCE_LEN = 10000
+/* "trunk/gensim/models/doc2vec_inner.pyx":28
+ * DEF MAX_DOCUMENT_LEN = 10000
*
* ctypedef void (*scopy_ptr) (const int *N, const float *X, const int *incX, float *Y, const int *incY) nogil # <<<<<<<<<<<<<<
* ctypedef void (*saxpy_ptr) (const int *N, const float *alpha, const float *X, const int *incX, float *Y, const int *incY) nogil
@@ -724,7 +724,7 @@ typedef npy_cdouble __pyx_t_5numpy_complex_t;
*/
typedef void (*__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_scopy_ptr)(int const *, float const *, int const *, float *, int const *);
-/* "trunk/gensim/models/doc2vec_inner.pyx":28
+/* "trunk/gensim/models/doc2vec_inner.pyx":29
*
* ctypedef void (*scopy_ptr) (const int *N, const float *X, const int *incX, float *Y, const int *incY) nogil
* ctypedef void (*saxpy_ptr) (const int *N, const float *alpha, const float *X, const int *incX, float *Y, const int *incY) nogil # <<<<<<<<<<<<<<
@@ -733,7 +733,7 @@ typedef void (*__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_scopy_ptr)(int con
*/
typedef void (*__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_saxpy_ptr)(int const *, float const *, float const *, int const *, float *, int const *);
-/* "trunk/gensim/models/doc2vec_inner.pyx":29
+/* "trunk/gensim/models/doc2vec_inner.pyx":30
* ctypedef void (*scopy_ptr) (const int *N, const float *X, const int *incX, float *Y, const int *incY) nogil
* ctypedef void (*saxpy_ptr) (const int *N, const float *alpha, const float *X, const int *incX, float *Y, const int *incY) nogil
* ctypedef float (*sdot_ptr) (const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil # <<<<<<<<<<<<<<
@@ -742,7 +742,7 @@ typedef void (*__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_saxpy_ptr)(int con
*/
typedef float (*__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_sdot_ptr)(int const *, float const *, int const *, float const *, int const *);
-/* "trunk/gensim/models/doc2vec_inner.pyx":30
+/* "trunk/gensim/models/doc2vec_inner.pyx":31
* ctypedef void (*saxpy_ptr) (const int *N, const float *alpha, const float *X, const int *incX, float *Y, const int *incY) nogil
* ctypedef float (*sdot_ptr) (const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil
* ctypedef double (*dsdot_ptr) (const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil # <<<<<<<<<<<<<<
@@ -751,7 +751,7 @@ typedef float (*__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_sdot_ptr)(int con
*/
typedef double (*__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_dsdot_ptr)(int const *, float const *, int const *, float const *, int const *);
-/* "trunk/gensim/models/doc2vec_inner.pyx":31
+/* "trunk/gensim/models/doc2vec_inner.pyx":32
* ctypedef float (*sdot_ptr) (const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil
* ctypedef double (*dsdot_ptr) (const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil
* ctypedef double (*snrm2_ptr) (const int *N, const float *X, const int *incX) nogil # <<<<<<<<<<<<<<
@@ -760,50 +760,32 @@ typedef double (*__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_dsdot_ptr)(int c
*/
typedef double (*__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_snrm2_ptr)(int const *, float const *, int const *);
-/* "trunk/gensim/models/doc2vec_inner.pyx":32
+/* "trunk/gensim/models/doc2vec_inner.pyx":33
* ctypedef double (*dsdot_ptr) (const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil
* ctypedef double (*snrm2_ptr) (const int *N, const float *X, const int *incX) nogil
* ctypedef void (*sscal_ptr) (const int *N, const float *alpha, const float *X, const int *incX) nogil # <<<<<<<<<<<<<<
*
- * ctypedef void (*fast_sentence_dbow_hs_ptr) (
+ * cdef scopy_ptr scopy=PyCObject_AsVoidPtr(fblas.scopy._cpointer) # y = x
*/
typedef void (*__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_sscal_ptr)(int const *, float const *, float const *, int const *);
-/* "trunk/gensim/models/doc2vec_inner.pyx":34
- * ctypedef void (*sscal_ptr) (const int *N, const float *alpha, const float *X, const int *incX) nogil
- *
- * ctypedef void (*fast_sentence_dbow_hs_ptr) ( # <<<<<<<<<<<<<<
- * const np.uint32_t *word_point, const np.uint8_t *word_code, const int codelen,
- * REAL_t *syn0, REAL_t *syn1, const int size,
- */
-typedef void (*__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_fast_sentence_dbow_hs_ptr)(__pyx_t_5numpy_uint32_t const *, __pyx_t_5numpy_uint8_t const *, int const , __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, int const , __pyx_t_5numpy_uint32_t const , __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t const , __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, int, int);
-
-/* "trunk/gensim/models/doc2vec_inner.pyx":39
- * const np.uint32_t word2_index, const REAL_t alpha, REAL_t *work, int tw, int tl) nogil
+/* "trunk/gensim/models/doc2vec_inner.pyx":51
*
- * ctypedef unsigned long long (*fast_sentence_dbow_neg_ptr) ( # <<<<<<<<<<<<<<
- * const int negative, np.uint32_t *table, unsigned long long table_len,
- * REAL_t *syn0, REAL_t *syn1neg, const int size, const np.uint32_t word_index,
- */
-typedef unsigned PY_LONG_LONG (*__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_fast_sentence_dbow_neg_ptr)(int const , __pyx_t_5numpy_uint32_t *, unsigned PY_LONG_LONG, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, int const , __pyx_t_5numpy_uint32_t const , __pyx_t_5numpy_uint32_t const , __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t const , __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, unsigned PY_LONG_LONG, int, int);
-
-/* "trunk/gensim/models/doc2vec_inner.pyx":45
- * unsigned long long next_random, int tw, int tl) nogil
+ * # function implementations swapped based on BLAS detected
+ * ctypedef REAL_t (*our_dot_ptr) (const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil # <<<<<<<<<<<<<<
+ * ctypedef void (*our_saxpy_ptr) (const int *N, const float *alpha, const float *X, const int *incX, float *Y, const int *incY) nogil
*
- * ctypedef void (*fast_sentence_dm_hs_ptr) ( # <<<<<<<<<<<<<<
- * const np.uint32_t *word_point, const np.uint8_t *word_code, int codelens[MAX_SENTENCE_LEN],
- * int lbl_codelens[MAX_SENTENCE_LEN], REAL_t *neu1, REAL_t *syn0, REAL_t *syn1, const int size,
*/
-typedef void (*__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_fast_sentence_dm_hs_ptr)(__pyx_t_5numpy_uint32_t const *, __pyx_t_5numpy_uint8_t const *, int *, int *, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, int const , __pyx_t_5numpy_uint32_t *, __pyx_t_5numpy_uint32_t *, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t const , __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, int, int, int, int, int, int, int);
+typedef __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t (*__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_our_dot_ptr)(int const *, float const *, int const *, float const *, int const *);
-/* "trunk/gensim/models/doc2vec_inner.pyx":51
- * REAL_t *work, int i, int j, int k, int cbow_mean, int lbl_length, int tw, int tl) nogil
+/* "trunk/gensim/models/doc2vec_inner.pyx":52
+ * # function implementations swapped based on BLAS detected
+ * ctypedef REAL_t (*our_dot_ptr) (const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil
+ * ctypedef void (*our_saxpy_ptr) (const int *N, const float *alpha, const float *X, const int *incX, float *Y, const int *incY) nogil # <<<<<<<<<<<<<<
*
- * ctypedef unsigned long long (*fast_sentence_dm_neg_ptr) ( # <<<<<<<<<<<<<<
- * const int negative, np.uint32_t *table, unsigned long long table_len, int codelens[MAX_SENTENCE_LEN],
- * int lbl_codelens[MAX_SENTENCE_LEN], REAL_t *neu1, REAL_t *syn0, REAL_t *syn1neg, const int size,
+ * cdef our_dot_ptr our_dot
*/
-typedef unsigned PY_LONG_LONG (*__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_fast_sentence_dm_neg_ptr)(int const , __pyx_t_5numpy_uint32_t *, unsigned PY_LONG_LONG, int *, int *, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, int const , __pyx_t_5numpy_uint32_t *, __pyx_t_5numpy_uint32_t *, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t const , __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, int, int, int, int, unsigned PY_LONG_LONG, int, int, int);
+typedef void (*__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_our_saxpy_ptr)(int const *, float const *, float const *, int const *, float *, int const *);
/* --- Runtime support code (head) --- */
#ifndef CYTHON_REFNANNY
@@ -1160,25 +1142,21 @@ static __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_sdot_ptr __pyx_v_5trunk_6g
static __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_dsdot_ptr __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_dsdot;
static __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_snrm2_ptr __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_snrm2;
static __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_sscal_ptr __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_sscal;
-static __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_fast_sentence_dbow_hs_ptr __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_fast_sentence_dbow_hs;
-static __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_fast_sentence_dbow_neg_ptr __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_fast_sentence_dbow_neg;
-static __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_fast_sentence_dm_hs_ptr __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_fast_sentence_dm_hs;
-static __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_fast_sentence_dm_neg_ptr __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_fast_sentence_dm_neg;
static __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_EXP_TABLE[1000];
static int __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE;
static __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONEF;
-static void __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_fast_sentence0_dbow_hs(__pyx_t_5numpy_uint32_t const *, __pyx_t_5numpy_uint8_t const *, int const , __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, int const , __pyx_t_5numpy_uint32_t const , __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t const , __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, int, int); /*proto*/
-static void __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_fast_sentence1_dbow_hs(__pyx_t_5numpy_uint32_t const *, __pyx_t_5numpy_uint8_t const *, int const , __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, int const , __pyx_t_5numpy_uint32_t const , __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t const , __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, int, int); /*proto*/
-static void __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_fast_sentence2_dbow_hs(__pyx_t_5numpy_uint32_t const *, __pyx_t_5numpy_uint8_t const *, int const , __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, int const , __pyx_t_5numpy_uint32_t const , __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t const , __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, int, int); /*proto*/
-static unsigned PY_LONG_LONG __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_fast_sentence0_dbow_neg(int const , __pyx_t_5numpy_uint32_t *, unsigned PY_LONG_LONG, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, int const , __pyx_t_5numpy_uint32_t const , __pyx_t_5numpy_uint32_t const , __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t const , __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, unsigned PY_LONG_LONG, int, int); /*proto*/
-static unsigned PY_LONG_LONG __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_fast_sentence1_dbow_neg(int const , __pyx_t_5numpy_uint32_t *, unsigned PY_LONG_LONG, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, int const , __pyx_t_5numpy_uint32_t const , __pyx_t_5numpy_uint32_t const , __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t const , __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, unsigned PY_LONG_LONG, int, int); /*proto*/
-static unsigned PY_LONG_LONG __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_fast_sentence2_dbow_neg(int const , __pyx_t_5numpy_uint32_t *, unsigned PY_LONG_LONG, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, int const , __pyx_t_5numpy_uint32_t const , __pyx_t_5numpy_uint32_t const , __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t const , __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, unsigned PY_LONG_LONG, int, int); /*proto*/
-static void __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_fast_sentence0_dm_hs(__pyx_t_5numpy_uint32_t const *, __pyx_t_5numpy_uint8_t const *, int *, int *, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, int const , __pyx_t_5numpy_uint32_t const *, __pyx_t_5numpy_uint32_t const *, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t const , __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, int, int, int, int, int, int, int); /*proto*/
-static void __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_fast_sentence1_dm_hs(__pyx_t_5numpy_uint32_t const *, __pyx_t_5numpy_uint8_t const *, int *, int *, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, int const , __pyx_t_5numpy_uint32_t const *, __pyx_t_5numpy_uint32_t const *, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t const , __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, int, int, int, int, int, int, int); /*proto*/
-static void __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_fast_sentence2_dm_hs(__pyx_t_5numpy_uint32_t const *, __pyx_t_5numpy_uint8_t const *, int *, int *, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, int const , __pyx_t_5numpy_uint32_t const *, __pyx_t_5numpy_uint32_t const *, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t const , __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, int, int, int, int, int, int, int); /*proto*/
-static unsigned PY_LONG_LONG __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_fast_sentence0_dm_neg(int const , __pyx_t_5numpy_uint32_t *, unsigned PY_LONG_LONG, int *, int *, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, int const , __pyx_t_5numpy_uint32_t *, __pyx_t_5numpy_uint32_t *, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t const , __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, int, int, int, int, unsigned PY_LONG_LONG, int, int, int); /*proto*/
-static unsigned PY_LONG_LONG __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_fast_sentence1_dm_neg(int const , __pyx_t_5numpy_uint32_t *, unsigned PY_LONG_LONG, int *, int *, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, int const , __pyx_t_5numpy_uint32_t *, __pyx_t_5numpy_uint32_t *, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t const , __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, int, int, int, int, unsigned PY_LONG_LONG, int, int, int); /*proto*/
-static unsigned PY_LONG_LONG __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_fast_sentence2_dm_neg(int const , __pyx_t_5numpy_uint32_t *, unsigned PY_LONG_LONG, int *, int *, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, int const , __pyx_t_5numpy_uint32_t *, __pyx_t_5numpy_uint32_t *, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t const , __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, int, int, int, int, unsigned PY_LONG_LONG, int, int, int); /*proto*/
+static __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_our_dot_ptr __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_our_dot;
+static __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_our_saxpy_ptr __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_our_saxpy;
+static __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_our_dot_double(int const *, float const *, int const *, float const *, int const *); /*proto*/
+static __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_our_dot_float(int const *, float const *, int const *, float const *, int const *); /*proto*/
+static __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_our_dot_noblas(int const *, float const *, int const *, float const *, int const *); /*proto*/
+static void __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_our_saxpy_noblas(int const *, float const *, float const *, int const *, float *, int const *); /*proto*/
+static void __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_fast_document_dbow_hs(__pyx_t_5numpy_uint32_t const *, __pyx_t_5numpy_uint8_t const *, int const , __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, int const , __pyx_t_5numpy_uint32_t const , __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t const , __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, int, int, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *); /*proto*/
+static unsigned PY_LONG_LONG __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_fast_document_dbow_neg(int const , __pyx_t_5numpy_uint32_t *, unsigned PY_LONG_LONG, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, int const , __pyx_t_5numpy_uint32_t const , __pyx_t_5numpy_uint32_t const , __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t const , __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, unsigned PY_LONG_LONG, int, int, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *); /*proto*/
+static void __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_fast_document_dm_hs(__pyx_t_5numpy_uint32_t const *, __pyx_t_5numpy_uint8_t const *, int, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t const , __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, int const , int); /*proto*/
+static unsigned PY_LONG_LONG __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_fast_document_dm_neg(int const , __pyx_t_5numpy_uint32_t *, unsigned PY_LONG_LONG, unsigned PY_LONG_LONG, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, int const , __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t const , __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, int const , int); /*proto*/
+static void __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_fast_document_dmc_hs(__pyx_t_5numpy_uint32_t const *, __pyx_t_5numpy_uint8_t const *, int, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t const , __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, int const , int const , int); /*proto*/
+static unsigned PY_LONG_LONG __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_fast_document_dmc_neg(int const , __pyx_t_5numpy_uint32_t *, unsigned PY_LONG_LONG, unsigned PY_LONG_LONG, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, int const , __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t const , __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *, int const , int const , int); /*proto*/
#define __Pyx_MODULE_NAME "trunk.gensim.models.doc2vec_inner"
int __pyx_module_is_main_trunk__gensim__models__doc2vec_inner = 0;
@@ -1187,9 +1165,10 @@ static PyObject *__pyx_builtin_range;
static PyObject *__pyx_builtin_enumerate;
static PyObject *__pyx_builtin_ValueError;
static PyObject *__pyx_builtin_RuntimeError;
-static PyObject *__pyx_pf_5trunk_6gensim_6models_13doc2vec_inner_train_sentence_dbow(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_model, PyObject *__pyx_v_sentence, PyObject *__pyx_v_lbls, PyObject *__pyx_v_alpha, PyObject *__pyx_v__work, PyObject *__pyx_v_train_words, PyObject *__pyx_v_train_lbls); /* proto */
-static PyObject *__pyx_pf_5trunk_6gensim_6models_13doc2vec_inner_2train_sentence_dm(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_model, PyObject *__pyx_v_sentence, PyObject *__pyx_v_lbls, PyObject *__pyx_v_alpha, PyObject *__pyx_v__work, PyObject *__pyx_v__neu1, PyObject *__pyx_v_train_words, PyObject *__pyx_v_train_lbls); /* proto */
-static PyObject *__pyx_pf_5trunk_6gensim_6models_13doc2vec_inner_4init(CYTHON_UNUSED PyObject *__pyx_self); /* proto */
+static PyObject *__pyx_pf_5trunk_6gensim_6models_13doc2vec_inner_train_document_dbow(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_model, PyObject *__pyx_v_word_vocabs, PyObject *__pyx_v_doctag_indexes, PyObject *__pyx_v_alpha, PyObject *__pyx_v_work, PyObject *__pyx_v_train_words, PyObject *__pyx_v_learn_doctags, PyObject *__pyx_v_learn_words, PyObject *__pyx_v_learn_hidden, PyObject *__pyx_v_word_vectors, PyObject *__pyx_v_word_locks, PyObject *__pyx_v_doctag_vectors, PyObject *__pyx_v_doctag_locks); /* proto */
+static PyObject *__pyx_pf_5trunk_6gensim_6models_13doc2vec_inner_2train_document_dm(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_model, PyObject *__pyx_v_word_vocabs, PyObject *__pyx_v_doctag_indexes, PyObject *__pyx_v_alpha, PyObject *__pyx_v_work, PyObject *__pyx_v_neu1, PyObject *__pyx_v_learn_doctags, PyObject *__pyx_v_learn_words, PyObject *__pyx_v_learn_hidden, PyObject *__pyx_v_word_vectors, PyObject *__pyx_v_word_locks, PyObject *__pyx_v_doctag_vectors, PyObject *__pyx_v_doctag_locks); /* proto */
+static PyObject *__pyx_pf_5trunk_6gensim_6models_13doc2vec_inner_4train_document_dm_concat(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_model, PyObject *__pyx_v_word_vocabs, PyObject *__pyx_v_doctag_indexes, PyObject *__pyx_v_alpha, PyObject *__pyx_v_work, PyObject *__pyx_v_neu1, PyObject *__pyx_v_learn_doctags, PyObject *__pyx_v_learn_words, PyObject *__pyx_v_learn_hidden, PyObject *__pyx_v_word_vectors, PyObject *__pyx_v_word_locks, PyObject *__pyx_v_doctag_vectors, PyObject *__pyx_v_doctag_locks); /* proto */
+static PyObject *__pyx_pf_5trunk_6gensim_6models_13doc2vec_inner_6init(CYTHON_UNUSED PyObject *__pyx_self); /* proto */
static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags); /* proto */
static void __pyx_pf_5numpy_7ndarray_2__releasebuffer__(PyArrayObject *__pyx_v_self, Py_buffer *__pyx_v_info); /* proto */
static char __pyx_k_B[] = "B";
@@ -1207,33 +1186,35 @@ static char __pyx_k_i[] = "i";
static char __pyx_k_j[] = "j";
static char __pyx_k_k[] = "k";
static char __pyx_k_l[] = "l";
+static char __pyx_k_m[] = "m";
+static char __pyx_k_n[] = "n";
static char __pyx_k_q[] = "q";
static char __pyx_k_x[] = "x";
static char __pyx_k_y[] = "y";
static char __pyx_k_Zd[] = "Zd";
static char __pyx_k_Zf[] = "Zf";
static char __pyx_k_Zg[] = "Zg";
+static char __pyx_k__5[] = "\000";
static char __pyx_k_hs[] = "hs";
static char __pyx_k_np[] = "np";
-static char __pyx_k_tl[] = "tl";
-static char __pyx_k_tw[] = "tw";
static char __pyx_k_REAL[] = "REAL";
static char __pyx_k_code[] = "code";
static char __pyx_k_init[] = "init";
static char __pyx_k_item[] = "item";
-static char __pyx_k_lbls[] = "lbls";
static char __pyx_k_main[] = "__main__";
-static char __pyx_k_neu1[] = "_neu1";
+static char __pyx_k_neu1[] = "neu1";
static char __pyx_k_sdot[] = "sdot";
static char __pyx_k_size[] = "size";
static char __pyx_k_syn0[] = "syn0";
static char __pyx_k_syn1[] = "syn1";
static char __pyx_k_test[] = "__test__";
static char __pyx_k_word[] = "word";
-static char __pyx_k_work[] = "_work";
+static char __pyx_k_work[] = "work";
static char __pyx_k_alpha[] = "alpha";
static char __pyx_k_codes[] = "codes";
+static char __pyx_k_count[] = "count";
static char __pyx_k_d_res[] = "d_res";
+static char __pyx_k_dtype[] = "dtype";
static char __pyx_k_fblas[] = "fblas";
static char __pyx_k_index[] = "index";
static char __pyx_k_model[] = "model";
@@ -1246,14 +1227,17 @@ static char __pyx_k_scopy[] = "scopy";
static char __pyx_k_snrm2[] = "snrm2";
static char __pyx_k_sscal[] = "sscal";
static char __pyx_k_table[] = "table";
+static char __pyx_k_vocab[] = "vocab";
+static char __pyx_k_zeros[] = "zeros";
static char __pyx_k_import[] = "__import__";
-static char __pyx_k_neu1_2[] = "neu1";
+static char __pyx_k_neu1_2[] = "_neu1";
static char __pyx_k_points[] = "points";
static char __pyx_k_random[] = "random";
static char __pyx_k_result[] = "result";
static char __pyx_k_window[] = "window";
-static char __pyx_k_work_2[] = "work";
+static char __pyx_k_work_2[] = "_work";
static char __pyx_k_alpha_2[] = "_alpha";
+static char __pyx_k_docvecs[] = "docvecs";
static char __pyx_k_float32[] = "float32";
static char __pyx_k_indexes[] = "indexes";
static char __pyx_k_randint[] = "randint";
@@ -1262,27 +1246,50 @@ static char __pyx_k_codelens[] = "codelens";
static char __pyx_k_cpointer[] = "_cpointer";
static char __pyx_k_expected[] = "expected";
static char __pyx_k_negative[] = "negative";
-static char __pyx_k_sentence[] = "sentence";
static char __pyx_k_cbow_mean[] = "cbow_mean";
static char __pyx_k_enumerate[] = "enumerate";
-static char __pyx_k_lbl_codes[] = "lbl_codes";
+static char __pyx_k_inv_count[] = "inv_count";
static char __pyx_k_table_len[] = "table_len";
static char __pyx_k_ValueError[] = "ValueError";
-static char __pyx_k_lbl_length[] = "lbl_length";
-static char __pyx_k_lbl_points[] = "lbl_points";
-static char __pyx_k_train_lbls[] = "train_lbls";
+static char __pyx_k_doctag_len[] = "doctag_len";
+static char __pyx_k_syn0_lockf[] = "syn0_lockf";
+static char __pyx_k_word_locks[] = "word_locks";
+static char __pyx_k_doctag_syn0[] = "doctag_syn0";
static char __pyx_k_layer1_size[] = "layer1_size";
-static char __pyx_k_lbl_indexes[] = "lbl_indexes";
+static char __pyx_k_learn_words[] = "learn_words";
static char __pyx_k_next_random[] = "next_random";
static char __pyx_k_train_words[] = "train_words";
+static char __pyx_k_vector_size[] = "vector_size";
+static char __pyx_k_word_vocabs[] = "word_vocabs";
static char __pyx_k_FAST_VERSION[] = "FAST_VERSION";
static char __pyx_k_RuntimeError[] = "RuntimeError";
-static char __pyx_k_lbl_codelens[] = "lbl_codelens";
-static char __pyx_k_sentence_len[] = "sentence_len";
+static char __pyx_k_dm_tag_count[] = "dm_tag_count";
+static char __pyx_k_doctag_locks[] = "doctag_locks";
+static char __pyx_k_document_len[] = "document_len";
+static char __pyx_k_learn_hidden[] = "learn_hidden";
+static char __pyx_k_predict_word[] = "predict_word";
+static char __pyx_k_word_locks_2[] = "_word_locks";
+static char __pyx_k_word_vectors[] = "word_vectors";
+static char __pyx_k_learn_doctags[] = "learn_doctags";
+static char __pyx_k_learn_words_2[] = "_learn_words";
+static char __pyx_k_train_words_2[] = "_train_words";
+static char __pyx_k_doctag_indexes[] = "doctag_indexes";
+static char __pyx_k_doctag_locks_2[] = "_doctag_locks";
+static char __pyx_k_doctag_vectors[] = "doctag_vectors";
+static char __pyx_k_learn_hidden_2[] = "_learn_hidden";
+static char __pyx_k_window_indexes[] = "window_indexes";
+static char __pyx_k_word_vectors_2[] = "_word_vectors";
+static char __pyx_k_learn_doctags_2[] = "_learn_doctags";
+static char __pyx_k_null_word_index[] = "null_word_index";
static char __pyx_k_reduced_windows[] = "reduced_windows";
+static char __pyx_k_doctag_indexes_2[] = "_doctag_indexes";
+static char __pyx_k_doctag_vectors_2[] = "_doctag_vectors";
+static char __pyx_k_doctag_syn0_lockf[] = "doctag_syn0_lockf";
static char __pyx_k_scipy_linalg_blas[] = "scipy.linalg.blas";
-static char __pyx_k_train_sentence_dm[] = "train_sentence_dm";
-static char __pyx_k_train_sentence_dbow[] = "train_sentence_dbow";
+static char __pyx_k_train_document_dm[] = "train_document_dm";
+static char __pyx_k_expected_doctag_len[] = "expected_doctag_len";
+static char __pyx_k_train_document_dbow[] = "train_document_dbow";
+static char __pyx_k_train_document_dm_concat[] = "train_document_dm_concat";
static char __pyx_k_ndarray_is_not_C_contiguous[] = "ndarray is not C contiguous";
static char __pyx_k_Users_scratch_Documents_dev2015[] = "/Users/scratch/Documents/dev2015/gensim_venv/src/trunk/gensim/models/doc2vec_inner.pyx";
static char __pyx_k_unknown_dtype_code_in_numpy_pxd[] = "unknown dtype code in numpy.pxd (%d)";
@@ -1299,16 +1306,32 @@ static PyObject *__pyx_n_s_REAL;
static PyObject *__pyx_n_s_RuntimeError;
static PyObject *__pyx_kp_s_Users_scratch_Documents_dev2015;
static PyObject *__pyx_n_s_ValueError;
+static PyObject *__pyx_kp_s__5;
static PyObject *__pyx_n_s_alpha;
static PyObject *__pyx_n_s_alpha_2;
static PyObject *__pyx_n_s_cbow_mean;
static PyObject *__pyx_n_s_code;
static PyObject *__pyx_n_s_codelens;
static PyObject *__pyx_n_s_codes;
+static PyObject *__pyx_n_s_count;
static PyObject *__pyx_n_s_cpointer;
static PyObject *__pyx_n_s_d_res;
+static PyObject *__pyx_n_s_dm_tag_count;
+static PyObject *__pyx_n_s_doctag_indexes;
+static PyObject *__pyx_n_s_doctag_indexes_2;
+static PyObject *__pyx_n_s_doctag_len;
+static PyObject *__pyx_n_s_doctag_locks;
+static PyObject *__pyx_n_s_doctag_locks_2;
+static PyObject *__pyx_n_s_doctag_syn0;
+static PyObject *__pyx_n_s_doctag_syn0_lockf;
+static PyObject *__pyx_n_s_doctag_vectors;
+static PyObject *__pyx_n_s_doctag_vectors_2;
+static PyObject *__pyx_n_s_document_len;
+static PyObject *__pyx_n_s_docvecs;
+static PyObject *__pyx_n_s_dtype;
static PyObject *__pyx_n_s_enumerate;
static PyObject *__pyx_n_s_expected;
+static PyObject *__pyx_n_s_expected_doctag_len;
static PyObject *__pyx_n_s_fblas;
static PyObject *__pyx_n_s_float32;
static PyObject *__pyx_n_s_hs;
@@ -1317,18 +1340,21 @@ static PyObject *__pyx_n_s_import;
static PyObject *__pyx_n_s_index;
static PyObject *__pyx_n_s_indexes;
static PyObject *__pyx_n_s_init;
+static PyObject *__pyx_n_s_inv_count;
static PyObject *__pyx_n_s_item;
static PyObject *__pyx_n_s_j;
static PyObject *__pyx_n_s_k;
static PyObject *__pyx_n_s_layer1_size;
-static PyObject *__pyx_n_s_lbl_codelens;
-static PyObject *__pyx_n_s_lbl_codes;
-static PyObject *__pyx_n_s_lbl_indexes;
-static PyObject *__pyx_n_s_lbl_length;
-static PyObject *__pyx_n_s_lbl_points;
-static PyObject *__pyx_n_s_lbls;
+static PyObject *__pyx_n_s_learn_doctags;
+static PyObject *__pyx_n_s_learn_doctags_2;
+static PyObject *__pyx_n_s_learn_hidden;
+static PyObject *__pyx_n_s_learn_hidden_2;
+static PyObject *__pyx_n_s_learn_words;
+static PyObject *__pyx_n_s_learn_words_2;
+static PyObject *__pyx_n_s_m;
static PyObject *__pyx_n_s_main;
static PyObject *__pyx_n_s_model;
+static PyObject *__pyx_n_s_n;
static PyObject *__pyx_kp_u_ndarray_is_not_C_contiguous;
static PyObject *__pyx_kp_u_ndarray_is_not_Fortran_contiguou;
static PyObject *__pyx_n_s_negative;
@@ -1336,10 +1362,12 @@ static PyObject *__pyx_n_s_neu1;
static PyObject *__pyx_n_s_neu1_2;
static PyObject *__pyx_n_s_next_random;
static PyObject *__pyx_n_s_np;
+static PyObject *__pyx_n_s_null_word_index;
static PyObject *__pyx_n_s_numpy;
static PyObject *__pyx_n_s_p_res;
static PyObject *__pyx_n_s_point;
static PyObject *__pyx_n_s_points;
+static PyObject *__pyx_n_s_predict_word;
static PyObject *__pyx_n_s_randint;
static PyObject *__pyx_n_s_random;
static PyObject *__pyx_n_s_range;
@@ -1349,31 +1377,38 @@ static PyObject *__pyx_n_s_saxpy;
static PyObject *__pyx_n_s_scipy_linalg_blas;
static PyObject *__pyx_n_s_scopy;
static PyObject *__pyx_n_s_sdot;
-static PyObject *__pyx_n_s_sentence;
-static PyObject *__pyx_n_s_sentence_len;
static PyObject *__pyx_n_s_size;
static PyObject *__pyx_n_s_snrm2;
static PyObject *__pyx_n_s_sscal;
static PyObject *__pyx_n_s_syn0;
+static PyObject *__pyx_n_s_syn0_lockf;
static PyObject *__pyx_n_s_syn1;
static PyObject *__pyx_n_s_syn1neg;
static PyObject *__pyx_n_s_table;
static PyObject *__pyx_n_s_table_len;
static PyObject *__pyx_n_s_test;
-static PyObject *__pyx_n_s_tl;
-static PyObject *__pyx_n_s_train_lbls;
-static PyObject *__pyx_n_s_train_sentence_dbow;
-static PyObject *__pyx_n_s_train_sentence_dm;
+static PyObject *__pyx_n_s_train_document_dbow;
+static PyObject *__pyx_n_s_train_document_dm;
+static PyObject *__pyx_n_s_train_document_dm_concat;
static PyObject *__pyx_n_s_train_words;
+static PyObject *__pyx_n_s_train_words_2;
static PyObject *__pyx_n_s_trunk_gensim_models_doc2vec_inne;
-static PyObject *__pyx_n_s_tw;
static PyObject *__pyx_kp_u_unknown_dtype_code_in_numpy_pxd;
+static PyObject *__pyx_n_s_vector_size;
+static PyObject *__pyx_n_s_vocab;
static PyObject *__pyx_n_s_window;
+static PyObject *__pyx_n_s_window_indexes;
static PyObject *__pyx_n_s_word;
+static PyObject *__pyx_n_s_word_locks;
+static PyObject *__pyx_n_s_word_locks_2;
+static PyObject *__pyx_n_s_word_vectors;
+static PyObject *__pyx_n_s_word_vectors_2;
+static PyObject *__pyx_n_s_word_vocabs;
static PyObject *__pyx_n_s_work;
static PyObject *__pyx_n_s_work_2;
static PyObject *__pyx_n_s_x;
static PyObject *__pyx_n_s_y;
+static PyObject *__pyx_n_s_zeros;
static PyObject *__pyx_int_0;
static PyObject *__pyx_int_1;
static PyObject *__pyx_int_2;
@@ -1382,206 +1417,209 @@ static PyObject *__pyx_tuple_;
static PyObject *__pyx_tuple__2;
static PyObject *__pyx_tuple__3;
static PyObject *__pyx_tuple__4;
-static PyObject *__pyx_tuple__5;
static PyObject *__pyx_tuple__6;
static PyObject *__pyx_tuple__7;
static PyObject *__pyx_tuple__8;
static PyObject *__pyx_tuple__9;
static PyObject *__pyx_tuple__10;
static PyObject *__pyx_tuple__11;
+static PyObject *__pyx_tuple__12;
static PyObject *__pyx_tuple__13;
-static PyObject *__pyx_tuple__15;
-static PyObject *__pyx_codeobj__12;
-static PyObject *__pyx_codeobj__14;
-static PyObject *__pyx_codeobj__16;
+static PyObject *__pyx_tuple__14;
+static PyObject *__pyx_tuple__16;
+static PyObject *__pyx_tuple__18;
+static PyObject *__pyx_tuple__20;
+static PyObject *__pyx_codeobj__15;
+static PyObject *__pyx_codeobj__17;
+static PyObject *__pyx_codeobj__19;
+static PyObject *__pyx_codeobj__21;
-/* "trunk/gensim/models/doc2vec_inner.pyx":76
- * cdef REAL_t ONEF = 1.0
+/* "trunk/gensim/models/doc2vec_inner.pyx":58
+ *
+ * # for when fblas.sdot returns a double
+ * cdef REAL_t our_dot_double(const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil: # <<<<<<<<<<<<<<
+ * return dsdot(N, X, incX, Y, incY)
*
- * cdef void fast_sentence0_dbow_hs( # <<<<<<<<<<<<<<
- * const np.uint32_t *word_point, const np.uint8_t *word_code, const int codelen,
- * REAL_t *syn0, REAL_t *syn1, const int size,
*/
-static void __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_fast_sentence0_dbow_hs(__pyx_t_5numpy_uint32_t const *__pyx_v_word_point, __pyx_t_5numpy_uint8_t const *__pyx_v_word_code, int const __pyx_v_codelen, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_syn0, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_syn1, int const __pyx_v_size, __pyx_t_5numpy_uint32_t const __pyx_v_word2_index, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t const __pyx_v_alpha, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_work, int __pyx_v_tw, int __pyx_v_tl) {
- PY_LONG_LONG __pyx_v_b;
- PY_LONG_LONG __pyx_v_row1;
- PY_LONG_LONG __pyx_v_row2;
- __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t __pyx_v_f;
- __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t __pyx_v_g;
- int __pyx_t_1;
- PY_LONG_LONG __pyx_t_2;
- int __pyx_t_3;
- int __pyx_t_4;
+static __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_our_dot_double(int const *__pyx_v_N, float const *__pyx_v_X, int const *__pyx_v_incX, float const *__pyx_v_Y, int const *__pyx_v_incY) {
+ __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t __pyx_r;
- /* "trunk/gensim/models/doc2vec_inner.pyx":82
- *
- * cdef long long a, b
- * cdef long long row1 = word2_index * size, row2 # <<<<<<<<<<<<<<
- * cdef REAL_t f, g
+ /* "trunk/gensim/models/doc2vec_inner.pyx":59
+ * # for when fblas.sdot returns a double
+ * cdef REAL_t our_dot_double(const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil:
+ * return dsdot(N, X, incX, Y, incY) # <<<<<<<<<<<<<<
*
+ * # for when fblas.sdot returns a float
*/
- __pyx_v_row1 = (__pyx_v_word2_index * __pyx_v_size);
+ __pyx_r = ((__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t)__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_dsdot(__pyx_v_N, __pyx_v_X, __pyx_v_incX, __pyx_v_Y, __pyx_v_incY));
+ goto __pyx_L0;
- /* "trunk/gensim/models/doc2vec_inner.pyx":85
- * cdef REAL_t f, g
+ /* "trunk/gensim/models/doc2vec_inner.pyx":58
+ *
+ * # for when fblas.sdot returns a double
+ * cdef REAL_t our_dot_double(const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil: # <<<<<<<<<<<<<<
+ * return dsdot(N, X, incX, Y, incY)
*
- * memset(work, 0, size * cython.sizeof(REAL_t)) # <<<<<<<<<<<<<<
- * for b in range(codelen):
- * row2 = word_point[b] * size
*/
- memset(__pyx_v_work, 0, (__pyx_v_size * (sizeof(__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t))));
- /* "trunk/gensim/models/doc2vec_inner.pyx":86
+ /* function exit code */
+ __pyx_L0:;
+ return __pyx_r;
+}
+
+/* "trunk/gensim/models/doc2vec_inner.pyx":62
+ *
+ * # for when fblas.sdot returns a float
+ * cdef REAL_t our_dot_float(const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil: # <<<<<<<<<<<<<<
+ * return sdot(N, X, incX, Y, incY)
*
- * memset(work, 0, size * cython.sizeof(REAL_t))
- * for b in range(codelen): # <<<<<<<<<<<<<<
- * row2 = word_point[b] * size
- * f = dsdot(&size, &syn0[row1], &ONE, &syn1[row2], &ONE)
*/
- __pyx_t_1 = __pyx_v_codelen;
- for (__pyx_t_2 = 0; __pyx_t_2 < __pyx_t_1; __pyx_t_2+=1) {
- __pyx_v_b = __pyx_t_2;
- /* "trunk/gensim/models/doc2vec_inner.pyx":87
- * memset(work, 0, size * cython.sizeof(REAL_t))
- * for b in range(codelen):
- * row2 = word_point[b] * size # <<<<<<<<<<<<<<
- * f = dsdot(&size, &syn0[row1], &ONE, &syn1[row2], &ONE)
- * if f <= -MAX_EXP or f >= MAX_EXP:
+static __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_our_dot_float(int const *__pyx_v_N, float const *__pyx_v_X, int const *__pyx_v_incX, float const *__pyx_v_Y, int const *__pyx_v_incY) {
+ __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t __pyx_r;
+
+ /* "trunk/gensim/models/doc2vec_inner.pyx":63
+ * # for when fblas.sdot returns a float
+ * cdef REAL_t our_dot_float(const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil:
+ * return sdot(N, X, incX, Y, incY) # <<<<<<<<<<<<<<
+ *
+ * # for when no blas available
*/
- __pyx_v_row2 = ((__pyx_v_word_point[__pyx_v_b]) * __pyx_v_size);
+ __pyx_r = ((__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t)__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_sdot(__pyx_v_N, __pyx_v_X, __pyx_v_incX, __pyx_v_Y, __pyx_v_incY));
+ goto __pyx_L0;
- /* "trunk/gensim/models/doc2vec_inner.pyx":88
- * for b in range(codelen):
- * row2 = word_point[b] * size
- * f = dsdot(&size, &syn0[row1], &ONE, &syn1[row2], &ONE) # <<<<<<<<<<<<<<
- * if f <= -MAX_EXP or f >= MAX_EXP:
- * continue
+ /* "trunk/gensim/models/doc2vec_inner.pyx":62
+ *
+ * # for when fblas.sdot returns a float
+ * cdef REAL_t our_dot_float(const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil: # <<<<<<<<<<<<<<
+ * return sdot(N, X, incX, Y, incY)
+ *
*/
- __pyx_v_f = ((__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t)__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_dsdot((&__pyx_v_size), (&(__pyx_v_syn0[__pyx_v_row1])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), (&(__pyx_v_syn1[__pyx_v_row2])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE)));
- /* "trunk/gensim/models/doc2vec_inner.pyx":89
- * row2 = word_point[b] * size
- * f = dsdot(&size, &syn0[row1], &ONE, &syn1[row2], &ONE)
- * if f <= -MAX_EXP or f >= MAX_EXP: # <<<<<<<<<<<<<<
- * continue
- * f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
+ /* function exit code */
+ __pyx_L0:;
+ return __pyx_r;
+}
+
+/* "trunk/gensim/models/doc2vec_inner.pyx":66
+ *
+ * # for when no blas available
+ * cdef REAL_t our_dot_noblas(const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil: # <<<<<<<<<<<<<<
+ * # not a true full dot()-implementation: just enough for our cases
+ * cdef int i
*/
- __pyx_t_4 = ((__pyx_v_f <= -6.0) != 0);
- if (!__pyx_t_4) {
- } else {
- __pyx_t_3 = __pyx_t_4;
- goto __pyx_L6_bool_binop_done;
- }
- __pyx_t_4 = ((__pyx_v_f >= 6.0) != 0);
- __pyx_t_3 = __pyx_t_4;
- __pyx_L6_bool_binop_done:;
- if (__pyx_t_3) {
- /* "trunk/gensim/models/doc2vec_inner.pyx":90
- * f = dsdot(&size, &syn0[row1], &ONE, &syn1[row2], &ONE)
- * if f <= -MAX_EXP or f >= MAX_EXP:
- * continue # <<<<<<<<<<<<<<
- * f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
- * g = (1 - word_code[b] - f) * alpha
+static __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_our_dot_noblas(int const *__pyx_v_N, float const *__pyx_v_X, CYTHON_UNUSED int const *__pyx_v_incX, float const *__pyx_v_Y, CYTHON_UNUSED int const *__pyx_v_incY) {
+ int __pyx_v_i;
+ __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t __pyx_v_a;
+ __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t __pyx_r;
+ int __pyx_t_1;
+
+ /* "trunk/gensim/models/doc2vec_inner.pyx":70
+ * cdef int i
+ * cdef REAL_t a
+ * a = 0.0 # <<<<<<<<<<<<<<
+ * for i from 0 <= i < N[0] by 1:
+ * a += X[i] * Y[i]
*/
- goto __pyx_L3_continue;
- }
+ __pyx_v_a = ((__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t)0.0);
- /* "trunk/gensim/models/doc2vec_inner.pyx":91
- * if f <= -MAX_EXP or f >= MAX_EXP:
- * continue
- * f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))] # <<<<<<<<<<<<<<
- * g = (1 - word_code[b] - f) * alpha
- * saxpy(&size, &g, &syn1[row2], &ONE, work, &ONE)
+ /* "trunk/gensim/models/doc2vec_inner.pyx":71
+ * cdef REAL_t a
+ * a = 0.0
+ * for i from 0 <= i < N[0] by 1: # <<<<<<<<<<<<<<
+ * a += X[i] * Y[i]
+ * return a
*/
- __pyx_v_f = (__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_EXP_TABLE[((int)((__pyx_v_f + 6.0) * 83.0))]);
+ __pyx_t_1 = (__pyx_v_N[0]);
+ for (__pyx_v_i = 0; __pyx_v_i < __pyx_t_1; __pyx_v_i+=1) {
- /* "trunk/gensim/models/doc2vec_inner.pyx":92
- * continue
- * f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
- * g = (1 - word_code[b] - f) * alpha # <<<<<<<<<<<<<<
- * saxpy(&size, &g, &syn1[row2], &ONE, work, &ONE)
- * if tw:
+ /* "trunk/gensim/models/doc2vec_inner.pyx":72
+ * a = 0.0
+ * for i from 0 <= i < N[0] by 1:
+ * a += X[i] * Y[i] # <<<<<<<<<<<<<<
+ * return a
+ *
*/
- __pyx_v_g = (((1 - (__pyx_v_word_code[__pyx_v_b])) - __pyx_v_f) * __pyx_v_alpha);
+ __pyx_v_a = (__pyx_v_a + ((__pyx_v_X[__pyx_v_i]) * (__pyx_v_Y[__pyx_v_i])));
+ }
- /* "trunk/gensim/models/doc2vec_inner.pyx":93
- * f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
- * g = (1 - word_code[b] - f) * alpha
- * saxpy(&size, &g, &syn1[row2], &ONE, work, &ONE) # <<<<<<<<<<<<<<
- * if tw:
- * saxpy(&size, &g, &syn0[row1], &ONE, &syn1[row2], &ONE)
+ /* "trunk/gensim/models/doc2vec_inner.pyx":73
+ * for i from 0 <= i < N[0] by 1:
+ * a += X[i] * Y[i]
+ * return a # <<<<<<<<<<<<<<
+ *
+ * # for when no blas available
*/
- __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_saxpy((&__pyx_v_size), (&__pyx_v_g), (&(__pyx_v_syn1[__pyx_v_row2])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), __pyx_v_work, (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
+ __pyx_r = __pyx_v_a;
+ goto __pyx_L0;
- /* "trunk/gensim/models/doc2vec_inner.pyx":94
- * g = (1 - word_code[b] - f) * alpha
- * saxpy(&size, &g, &syn1[row2], &ONE, work, &ONE)
- * if tw: # <<<<<<<<<<<<<<
- * saxpy(&size, &g, &syn0[row1], &ONE, &syn1[row2], &ONE)
- * if tl:
+ /* "trunk/gensim/models/doc2vec_inner.pyx":66
+ *
+ * # for when no blas available
+ * cdef REAL_t our_dot_noblas(const int *N, const float *X, const int *incX, const float *Y, const int *incY) nogil: # <<<<<<<<<<<<<<
+ * # not a true full dot()-implementation: just enough for our cases
+ * cdef int i
*/
- __pyx_t_3 = (__pyx_v_tw != 0);
- if (__pyx_t_3) {
- /* "trunk/gensim/models/doc2vec_inner.pyx":95
- * saxpy(&size, &g, &syn1[row2], &ONE, work, &ONE)
- * if tw:
- * saxpy(&size, &g, &syn0[row1], &ONE, &syn1[row2], &ONE) # <<<<<<<<<<<<<<
- * if tl:
- * saxpy(&size, &ONEF, work, &ONE, &syn0[row1], &ONE)
+ /* function exit code */
+ __pyx_L0:;
+ return __pyx_r;
+}
+
+/* "trunk/gensim/models/doc2vec_inner.pyx":76
+ *
+ * # for when no blas available
+ * cdef void our_saxpy_noblas(const int *N, const float *alpha, const float *X, const int *incX, float *Y, const int *incY) nogil: # <<<<<<<<<<<<<<
+ * cdef int i
+ * for i from 0 <= i < N[0] by 1:
*/
- __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_saxpy((&__pyx_v_size), (&__pyx_v_g), (&(__pyx_v_syn0[__pyx_v_row1])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), (&(__pyx_v_syn1[__pyx_v_row2])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
- goto __pyx_L8;
- }
- __pyx_L8:;
- __pyx_L3_continue:;
- }
- /* "trunk/gensim/models/doc2vec_inner.pyx":96
- * if tw:
- * saxpy(&size, &g, &syn0[row1], &ONE, &syn1[row2], &ONE)
- * if tl: # <<<<<<<<<<<<<<
- * saxpy(&size, &ONEF, work, &ONE, &syn0[row1], &ONE)
+static void __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_our_saxpy_noblas(int const *__pyx_v_N, float const *__pyx_v_alpha, float const *__pyx_v_X, int const *__pyx_v_incX, float *__pyx_v_Y, int const *__pyx_v_incY) {
+ int __pyx_v_i;
+ int __pyx_t_1;
+
+ /* "trunk/gensim/models/doc2vec_inner.pyx":78
+ * cdef void our_saxpy_noblas(const int *N, const float *alpha, const float *X, const int *incX, float *Y, const int *incY) nogil:
+ * cdef int i
+ * for i from 0 <= i < N[0] by 1: # <<<<<<<<<<<<<<
+ * Y[i * (incY[0])] = (alpha[0]) * X[i * (incX[0])] + Y[i * (incY[0])]
*
*/
- __pyx_t_3 = (__pyx_v_tl != 0);
- if (__pyx_t_3) {
+ __pyx_t_1 = (__pyx_v_N[0]);
+ for (__pyx_v_i = 0; __pyx_v_i < __pyx_t_1; __pyx_v_i+=1) {
- /* "trunk/gensim/models/doc2vec_inner.pyx":97
- * saxpy(&size, &g, &syn0[row1], &ONE, &syn1[row2], &ONE)
- * if tl:
- * saxpy(&size, &ONEF, work, &ONE, &syn0[row1], &ONE) # <<<<<<<<<<<<<<
+ /* "trunk/gensim/models/doc2vec_inner.pyx":79
+ * cdef int i
+ * for i from 0 <= i < N[0] by 1:
+ * Y[i * (incY[0])] = (alpha[0]) * X[i * (incX[0])] + Y[i * (incY[0])] # <<<<<<<<<<<<<<
*
*
*/
- __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_saxpy((&__pyx_v_size), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONEF), __pyx_v_work, (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), (&(__pyx_v_syn0[__pyx_v_row1])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
- goto __pyx_L9;
+ (__pyx_v_Y[(__pyx_v_i * (__pyx_v_incY[0]))]) = (((__pyx_v_alpha[0]) * (__pyx_v_X[(__pyx_v_i * (__pyx_v_incX[0]))])) + (__pyx_v_Y[(__pyx_v_i * (__pyx_v_incY[0]))]));
}
- __pyx_L9:;
/* "trunk/gensim/models/doc2vec_inner.pyx":76
- * cdef REAL_t ONEF = 1.0
*
- * cdef void fast_sentence0_dbow_hs( # <<<<<<<<<<<<<<
- * const np.uint32_t *word_point, const np.uint8_t *word_code, const int codelen,
- * REAL_t *syn0, REAL_t *syn1, const int size,
+ * # for when no blas available
+ * cdef void our_saxpy_noblas(const int *N, const float *alpha, const float *X, const int *incX, float *Y, const int *incY) nogil: # <<<<<<<<<<<<<<
+ * cdef int i
+ * for i from 0 <= i < N[0] by 1:
*/
/* function exit code */
}
-/* "trunk/gensim/models/doc2vec_inner.pyx":100
+/* "trunk/gensim/models/doc2vec_inner.pyx":82
*
*
- * cdef void fast_sentence1_dbow_hs( # <<<<<<<<<<<<<<
+ * cdef void fast_document_dbow_hs( # <<<<<<<<<<<<<<
* const np.uint32_t *word_point, const np.uint8_t *word_code, const int codelen,
- * REAL_t *syn0, REAL_t *syn1, const int size,
+ * REAL_t *context_vectors, REAL_t *syn1, const int size,
*/
-static void __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_fast_sentence1_dbow_hs(__pyx_t_5numpy_uint32_t const *__pyx_v_word_point, __pyx_t_5numpy_uint8_t const *__pyx_v_word_code, int const __pyx_v_codelen, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_syn0, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_syn1, int const __pyx_v_size, __pyx_t_5numpy_uint32_t const __pyx_v_word2_index, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t const __pyx_v_alpha, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_work, int __pyx_v_tw, int __pyx_v_tl) {
+static void __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_fast_document_dbow_hs(__pyx_t_5numpy_uint32_t const *__pyx_v_word_point, __pyx_t_5numpy_uint8_t const *__pyx_v_word_code, int const __pyx_v_codelen, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_context_vectors, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_syn1, int const __pyx_v_size, __pyx_t_5numpy_uint32_t const __pyx_v_context_index, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t const __pyx_v_alpha, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_work, int __pyx_v_learn_context, int __pyx_v_learn_hidden, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_context_locks) {
PY_LONG_LONG __pyx_v_b;
PY_LONG_LONG __pyx_v_row1;
PY_LONG_LONG __pyx_v_row2;
@@ -1592,16 +1630,16 @@ static void __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_fast_sentence1_dbow_h
int __pyx_t_3;
int __pyx_t_4;
- /* "trunk/gensim/models/doc2vec_inner.pyx":106
+ /* "trunk/gensim/models/doc2vec_inner.pyx":89
*
* cdef long long a, b
- * cdef long long row1 = word2_index * size, row2 # <<<<<<<<<<<<<<
+ * cdef long long row1 = context_index * size, row2 # <<<<<<<<<<<<<<
* cdef REAL_t f, g
*
*/
- __pyx_v_row1 = (__pyx_v_word2_index * __pyx_v_size);
+ __pyx_v_row1 = (__pyx_v_context_index * __pyx_v_size);
- /* "trunk/gensim/models/doc2vec_inner.pyx":109
+ /* "trunk/gensim/models/doc2vec_inner.pyx":92
* cdef REAL_t f, g
*
* memset(work, 0, size * cython.sizeof(REAL_t)) # <<<<<<<<<<<<<<
@@ -1610,38 +1648,38 @@ static void __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_fast_sentence1_dbow_h
*/
memset(__pyx_v_work, 0, (__pyx_v_size * (sizeof(__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t))));
- /* "trunk/gensim/models/doc2vec_inner.pyx":110
+ /* "trunk/gensim/models/doc2vec_inner.pyx":93
*
* memset(work, 0, size * cython.sizeof(REAL_t))
* for b in range(codelen): # <<<<<<<<<<<<<<
* row2 = word_point[b] * size
- * f = sdot(&size, &syn0[row1], &ONE, &syn1[row2], &ONE)
+ * f = our_dot(&size, &context_vectors[row1], &ONE, &syn1[row2], &ONE)
*/
__pyx_t_1 = __pyx_v_codelen;
for (__pyx_t_2 = 0; __pyx_t_2 < __pyx_t_1; __pyx_t_2+=1) {
__pyx_v_b = __pyx_t_2;
- /* "trunk/gensim/models/doc2vec_inner.pyx":111
+ /* "trunk/gensim/models/doc2vec_inner.pyx":94
* memset(work, 0, size * cython.sizeof(REAL_t))
* for b in range(codelen):
* row2 = word_point[b] * size # <<<<<<<<<<<<<<
- * f = sdot(&size, &syn0[row1], &ONE, &syn1[row2], &ONE)
+ * f = our_dot(&size, &context_vectors[row1], &ONE, &syn1[row2], &ONE)
* if f <= -MAX_EXP or f >= MAX_EXP:
*/
__pyx_v_row2 = ((__pyx_v_word_point[__pyx_v_b]) * __pyx_v_size);
- /* "trunk/gensim/models/doc2vec_inner.pyx":112
+ /* "trunk/gensim/models/doc2vec_inner.pyx":95
* for b in range(codelen):
* row2 = word_point[b] * size
- * f = sdot(&size, &syn0[row1], &ONE, &syn1[row2], &ONE) # <<<<<<<<<<<<<<
+ * f = our_dot(&size, &context_vectors[row1], &ONE, &syn1[row2], &ONE) # <<<<<<<<<<<<<<
* if f <= -MAX_EXP or f >= MAX_EXP:
* continue
*/
- __pyx_v_f = ((__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t)__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_sdot((&__pyx_v_size), (&(__pyx_v_syn0[__pyx_v_row1])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), (&(__pyx_v_syn1[__pyx_v_row2])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE)));
+ __pyx_v_f = __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_our_dot((&__pyx_v_size), (&(__pyx_v_context_vectors[__pyx_v_row1])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), (&(__pyx_v_syn1[__pyx_v_row2])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
- /* "trunk/gensim/models/doc2vec_inner.pyx":113
+ /* "trunk/gensim/models/doc2vec_inner.pyx":96
* row2 = word_point[b] * size
- * f = sdot(&size, &syn0[row1], &ONE, &syn1[row2], &ONE)
+ * f = our_dot(&size, &context_vectors[row1], &ONE, &syn1[row2], &ONE)
* if f <= -MAX_EXP or f >= MAX_EXP: # <<<<<<<<<<<<<<
* continue
* f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
@@ -1657,8 +1695,8 @@ static void __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_fast_sentence1_dbow_h
__pyx_L6_bool_binop_done:;
if (__pyx_t_3) {
- /* "trunk/gensim/models/doc2vec_inner.pyx":114
- * f = sdot(&size, &syn0[row1], &ONE, &syn1[row2], &ONE)
+ /* "trunk/gensim/models/doc2vec_inner.pyx":97
+ * f = our_dot(&size, &context_vectors[row1], &ONE, &syn1[row2], &ONE)
* if f <= -MAX_EXP or f >= MAX_EXP:
* continue # <<<<<<<<<<<<<<
* f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
@@ -1667,354 +1705,519 @@ static void __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_fast_sentence1_dbow_h
goto __pyx_L3_continue;
}
- /* "trunk/gensim/models/doc2vec_inner.pyx":115
+ /* "trunk/gensim/models/doc2vec_inner.pyx":98
* if f <= -MAX_EXP or f >= MAX_EXP:
* continue
* f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))] # <<<<<<<<<<<<<<
* g = (1 - word_code[b] - f) * alpha
- * saxpy(&size, &g, &syn1[row2], &ONE, work, &ONE)
+ * our_saxpy(&size, &g, &syn1[row2], &ONE, work, &ONE)
*/
__pyx_v_f = (__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_EXP_TABLE[((int)((__pyx_v_f + 6.0) * 83.0))]);
- /* "trunk/gensim/models/doc2vec_inner.pyx":116
+ /* "trunk/gensim/models/doc2vec_inner.pyx":99
* continue
* f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
* g = (1 - word_code[b] - f) * alpha # <<<<<<<<<<<<<<
- * saxpy(&size, &g, &syn1[row2], &ONE, work, &ONE)
- * if tw:
+ * our_saxpy(&size, &g, &syn1[row2], &ONE, work, &ONE)
+ * if learn_hidden:
*/
__pyx_v_g = (((1 - (__pyx_v_word_code[__pyx_v_b])) - __pyx_v_f) * __pyx_v_alpha);
- /* "trunk/gensim/models/doc2vec_inner.pyx":117
+ /* "trunk/gensim/models/doc2vec_inner.pyx":100
* f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
* g = (1 - word_code[b] - f) * alpha
- * saxpy(&size, &g, &syn1[row2], &ONE, work, &ONE) # <<<<<<<<<<<<<<
- * if tw:
- * saxpy(&size, &g, &syn0[row1], &ONE, &syn1[row2], &ONE)
+ * our_saxpy(&size, &g, &syn1[row2], &ONE, work, &ONE) # <<<<<<<<<<<<<<
+ * if learn_hidden:
+ * our_saxpy(&size, &g, &context_vectors[row1], &ONE, &syn1[row2], &ONE)
*/
- __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_saxpy((&__pyx_v_size), (&__pyx_v_g), (&(__pyx_v_syn1[__pyx_v_row2])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), __pyx_v_work, (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
+ __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_our_saxpy((&__pyx_v_size), (&__pyx_v_g), (&(__pyx_v_syn1[__pyx_v_row2])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), __pyx_v_work, (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
- /* "trunk/gensim/models/doc2vec_inner.pyx":118
+ /* "trunk/gensim/models/doc2vec_inner.pyx":101
* g = (1 - word_code[b] - f) * alpha
- * saxpy(&size, &g, &syn1[row2], &ONE, work, &ONE)
- * if tw: # <<<<<<<<<<<<<<
- * saxpy(&size, &g, &syn0[row1], &ONE, &syn1[row2], &ONE)
- * if tl:
+ * our_saxpy(&size, &g, &syn1[row2], &ONE, work, &ONE)
+ * if learn_hidden: # <<<<<<<<<<<<<<
+ * our_saxpy(&size, &g, &context_vectors[row1], &ONE, &syn1[row2], &ONE)
+ * if learn_context:
*/
- __pyx_t_3 = (__pyx_v_tw != 0);
+ __pyx_t_3 = (__pyx_v_learn_hidden != 0);
if (__pyx_t_3) {
- /* "trunk/gensim/models/doc2vec_inner.pyx":119
- * saxpy(&size, &g, &syn1[row2], &ONE, work, &ONE)
- * if tw:
- * saxpy(&size, &g, &syn0[row1], &ONE, &syn1[row2], &ONE) # <<<<<<<<<<<<<<
- * if tl:
- * saxpy(&size, &ONEF, work, &ONE, &syn0[row1], &ONE)
+ /* "trunk/gensim/models/doc2vec_inner.pyx":102
+ * our_saxpy(&size, &g, &syn1[row2], &ONE, work, &ONE)
+ * if learn_hidden:
+ * our_saxpy(&size, &g, &context_vectors[row1], &ONE, &syn1[row2], &ONE) # <<<<<<<<<<<<<<
+ * if learn_context:
+ * our_saxpy(&size, &context_locks[context_index], work, &ONE, &context_vectors[row1], &ONE)
*/
- __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_saxpy((&__pyx_v_size), (&__pyx_v_g), (&(__pyx_v_syn0[__pyx_v_row1])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), (&(__pyx_v_syn1[__pyx_v_row2])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
+ __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_our_saxpy((&__pyx_v_size), (&__pyx_v_g), (&(__pyx_v_context_vectors[__pyx_v_row1])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), (&(__pyx_v_syn1[__pyx_v_row2])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
goto __pyx_L8;
}
__pyx_L8:;
__pyx_L3_continue:;
}
- /* "trunk/gensim/models/doc2vec_inner.pyx":120
- * if tw:
- * saxpy(&size, &g, &syn0[row1], &ONE, &syn1[row2], &ONE)
- * if tl: # <<<<<<<<<<<<<<
- * saxpy(&size, &ONEF, work, &ONE, &syn0[row1], &ONE)
+ /* "trunk/gensim/models/doc2vec_inner.pyx":103
+ * if learn_hidden:
+ * our_saxpy(&size, &g, &context_vectors[row1], &ONE, &syn1[row2], &ONE)
+ * if learn_context: # <<<<<<<<<<<<<<
+ * our_saxpy(&size, &context_locks[context_index], work, &ONE, &context_vectors[row1], &ONE)
*
*/
- __pyx_t_3 = (__pyx_v_tl != 0);
+ __pyx_t_3 = (__pyx_v_learn_context != 0);
if (__pyx_t_3) {
- /* "trunk/gensim/models/doc2vec_inner.pyx":121
- * saxpy(&size, &g, &syn0[row1], &ONE, &syn1[row2], &ONE)
- * if tl:
- * saxpy(&size, &ONEF, work, &ONE, &syn0[row1], &ONE) # <<<<<<<<<<<<<<
+ /* "trunk/gensim/models/doc2vec_inner.pyx":104
+ * our_saxpy(&size, &g, &context_vectors[row1], &ONE, &syn1[row2], &ONE)
+ * if learn_context:
+ * our_saxpy(&size, &context_locks[context_index], work, &ONE, &context_vectors[row1], &ONE) # <<<<<<<<<<<<<<
*
*
*/
- __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_saxpy((&__pyx_v_size), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONEF), __pyx_v_work, (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), (&(__pyx_v_syn0[__pyx_v_row1])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
+ __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_our_saxpy((&__pyx_v_size), (&(__pyx_v_context_locks[__pyx_v_context_index])), __pyx_v_work, (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), (&(__pyx_v_context_vectors[__pyx_v_row1])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
goto __pyx_L9;
}
__pyx_L9:;
- /* "trunk/gensim/models/doc2vec_inner.pyx":100
+ /* "trunk/gensim/models/doc2vec_inner.pyx":82
*
*
- * cdef void fast_sentence1_dbow_hs( # <<<<<<<<<<<<<<
+ * cdef void fast_document_dbow_hs( # <<<<<<<<<<<<<<
* const np.uint32_t *word_point, const np.uint8_t *word_code, const int codelen,
- * REAL_t *syn0, REAL_t *syn1, const int size,
+ * REAL_t *context_vectors, REAL_t *syn1, const int size,
*/
/* function exit code */
}
-/* "trunk/gensim/models/doc2vec_inner.pyx":124
+/* "trunk/gensim/models/doc2vec_inner.pyx":107
*
*
- * cdef void fast_sentence2_dbow_hs( # <<<<<<<<<<<<<<
- * const np.uint32_t *word_point, const np.uint8_t *word_code, const int codelen,
- * REAL_t *syn0, REAL_t *syn1, const int size,
+ * cdef unsigned long long fast_document_dbow_neg( # <<<<<<<<<<<<<<
+ * const int negative, np.uint32_t *table, unsigned long long table_len,
+ * REAL_t *context_vectors, REAL_t *syn1neg, const int size, const np.uint32_t word_index,
*/
-static void __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_fast_sentence2_dbow_hs(__pyx_t_5numpy_uint32_t const *__pyx_v_word_point, __pyx_t_5numpy_uint8_t const *__pyx_v_word_code, int const __pyx_v_codelen, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_syn0, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_syn1, int const __pyx_v_size, __pyx_t_5numpy_uint32_t const __pyx_v_word2_index, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t const __pyx_v_alpha, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_work, int __pyx_v_tw, int __pyx_v_tl) {
- PY_LONG_LONG __pyx_v_a;
- PY_LONG_LONG __pyx_v_b;
+static unsigned PY_LONG_LONG __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_fast_document_dbow_neg(int const __pyx_v_negative, __pyx_t_5numpy_uint32_t *__pyx_v_table, unsigned PY_LONG_LONG __pyx_v_table_len, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_context_vectors, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_syn1neg, int const __pyx_v_size, __pyx_t_5numpy_uint32_t const __pyx_v_word_index, __pyx_t_5numpy_uint32_t const __pyx_v_context_index, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t const __pyx_v_alpha, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_work, unsigned PY_LONG_LONG __pyx_v_next_random, int __pyx_v_learn_context, int __pyx_v_learn_hidden, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_context_locks) {
PY_LONG_LONG __pyx_v_row1;
PY_LONG_LONG __pyx_v_row2;
+ unsigned PY_LONG_LONG __pyx_v_modulo;
__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t __pyx_v_f;
__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t __pyx_v_g;
- int __pyx_t_1;
- PY_LONG_LONG __pyx_t_2;
+ __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t __pyx_v_label;
+ __pyx_t_5numpy_uint32_t __pyx_v_target_index;
+ int __pyx_v_d;
+ unsigned PY_LONG_LONG __pyx_r;
+ long __pyx_t_1;
+ int __pyx_t_2;
int __pyx_t_3;
- PY_LONG_LONG __pyx_t_4;
- int __pyx_t_5;
- int __pyx_t_6;
- PY_LONG_LONG __pyx_t_7;
+ int __pyx_t_4;
- /* "trunk/gensim/models/doc2vec_inner.pyx":130
- *
- * cdef long long a, b
- * cdef long long row1 = word2_index * size, row2 # <<<<<<<<<<<<<<
- * cdef REAL_t f, g
+ /* "trunk/gensim/models/doc2vec_inner.pyx":114
*
+ * cdef long long a
+ * cdef long long row1 = context_index * size, row2 # <<<<<<<<<<<<<<
+ * cdef unsigned long long modulo = 281474976710655ULL
+ * cdef REAL_t f, g, label
*/
- __pyx_v_row1 = (__pyx_v_word2_index * __pyx_v_size);
+ __pyx_v_row1 = (__pyx_v_context_index * __pyx_v_size);
- /* "trunk/gensim/models/doc2vec_inner.pyx":133
- * cdef REAL_t f, g
- *
- * for a in range(size): # <<<<<<<<<<<<<<
- * work[a] = 0.0
- * for b in range(codelen):
+ /* "trunk/gensim/models/doc2vec_inner.pyx":115
+ * cdef long long a
+ * cdef long long row1 = context_index * size, row2
+ * cdef unsigned long long modulo = 281474976710655ULL # <<<<<<<<<<<<<<
+ * cdef REAL_t f, g, label
+ * cdef np.uint32_t target_index
*/
- __pyx_t_1 = __pyx_v_size;
- for (__pyx_t_2 = 0; __pyx_t_2 < __pyx_t_1; __pyx_t_2+=1) {
- __pyx_v_a = __pyx_t_2;
+ __pyx_v_modulo = 281474976710655ULL;
- /* "trunk/gensim/models/doc2vec_inner.pyx":134
+ /* "trunk/gensim/models/doc2vec_inner.pyx":120
+ * cdef int d
*
- * for a in range(size):
- * work[a] = 0.0 # <<<<<<<<<<<<<<
- * for b in range(codelen):
- * row2 = word_point[b] * size
+ * memset(work, 0, size * cython.sizeof(REAL_t)) # <<<<<<<<<<<<<<
+ *
+ * for d in range(negative+1):
*/
- (__pyx_v_work[__pyx_v_a]) = ((__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t)0.0);
- }
+ memset(__pyx_v_work, 0, (__pyx_v_size * (sizeof(__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t))));
- /* "trunk/gensim/models/doc2vec_inner.pyx":135
- * for a in range(size):
- * work[a] = 0.0
- * for b in range(codelen): # <<<<<<<<<<<<<<
- * row2 = word_point[b] * size
- * f = 0.0
+ /* "trunk/gensim/models/doc2vec_inner.pyx":122
+ * memset(work, 0, size * cython.sizeof(REAL_t))
+ *
+ * for d in range(negative+1): # <<<<<<<<<<<<<<
+ * if d == 0:
+ * target_index = word_index
*/
- __pyx_t_1 = __pyx_v_codelen;
+ __pyx_t_1 = (__pyx_v_negative + 1);
for (__pyx_t_2 = 0; __pyx_t_2 < __pyx_t_1; __pyx_t_2+=1) {
- __pyx_v_b = __pyx_t_2;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":136
- * work[a] = 0.0
- * for b in range(codelen):
- * row2 = word_point[b] * size # <<<<<<<<<<<<<<
- * f = 0.0
- * for a in range(size):
- */
- __pyx_v_row2 = ((__pyx_v_word_point[__pyx_v_b]) * __pyx_v_size);
+ __pyx_v_d = __pyx_t_2;
- /* "trunk/gensim/models/doc2vec_inner.pyx":137
- * for b in range(codelen):
- * row2 = word_point[b] * size
- * f = 0.0 # <<<<<<<<<<<<<<
- * for a in range(size):
- * f += syn0[row1 + a] * syn1[row2 + a]
+ /* "trunk/gensim/models/doc2vec_inner.pyx":123
+ *
+ * for d in range(negative+1):
+ * if d == 0: # <<<<<<<<<<<<<<
+ * target_index = word_index
+ * label = ONEF
*/
- __pyx_v_f = ((__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t)0.0);
+ __pyx_t_3 = ((__pyx_v_d == 0) != 0);
+ if (__pyx_t_3) {
- /* "trunk/gensim/models/doc2vec_inner.pyx":138
- * row2 = word_point[b] * size
- * f = 0.0
- * for a in range(size): # <<<<<<<<<<<<<<
- * f += syn0[row1 + a] * syn1[row2 + a]
- * if f <= -MAX_EXP or f >= MAX_EXP:
+ /* "trunk/gensim/models/doc2vec_inner.pyx":124
+ * for d in range(negative+1):
+ * if d == 0:
+ * target_index = word_index # <<<<<<<<<<<<<<
+ * label = ONEF
+ * else:
*/
- __pyx_t_3 = __pyx_v_size;
- for (__pyx_t_4 = 0; __pyx_t_4 < __pyx_t_3; __pyx_t_4+=1) {
- __pyx_v_a = __pyx_t_4;
+ __pyx_v_target_index = __pyx_v_word_index;
- /* "trunk/gensim/models/doc2vec_inner.pyx":139
- * f = 0.0
- * for a in range(size):
- * f += syn0[row1 + a] * syn1[row2 + a] # <<<<<<<<<<<<<<
- * if f <= -MAX_EXP or f >= MAX_EXP:
- * continue
+ /* "trunk/gensim/models/doc2vec_inner.pyx":125
+ * if d == 0:
+ * target_index = word_index
+ * label = ONEF # <<<<<<<<<<<<<<
+ * else:
+ * target_index = table[(next_random >> 16) % table_len]
*/
- __pyx_v_f = (__pyx_v_f + ((__pyx_v_syn0[(__pyx_v_row1 + __pyx_v_a)]) * (__pyx_v_syn1[(__pyx_v_row2 + __pyx_v_a)])));
+ __pyx_v_label = __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONEF;
+ goto __pyx_L5;
}
+ /*else*/ {
- /* "trunk/gensim/models/doc2vec_inner.pyx":140
- * for a in range(size):
- * f += syn0[row1 + a] * syn1[row2 + a]
- * if f <= -MAX_EXP or f >= MAX_EXP: # <<<<<<<<<<<<<<
- * continue
- * f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
+ /* "trunk/gensim/models/doc2vec_inner.pyx":127
+ * label = ONEF
+ * else:
+ * target_index = table[(next_random >> 16) % table_len] # <<<<<<<<<<<<<<
+ * next_random = (next_random * 25214903917ULL + 11) & modulo
+ * if target_index == word_index:
*/
- __pyx_t_6 = ((__pyx_v_f <= -6.0) != 0);
- if (!__pyx_t_6) {
- } else {
- __pyx_t_5 = __pyx_t_6;
- goto __pyx_L10_bool_binop_done;
- }
- __pyx_t_6 = ((__pyx_v_f >= 6.0) != 0);
- __pyx_t_5 = __pyx_t_6;
- __pyx_L10_bool_binop_done:;
- if (__pyx_t_5) {
+ __pyx_v_target_index = (__pyx_v_table[((__pyx_v_next_random >> 16) % __pyx_v_table_len)]);
- /* "trunk/gensim/models/doc2vec_inner.pyx":141
- * f += syn0[row1 + a] * syn1[row2 + a]
- * if f <= -MAX_EXP or f >= MAX_EXP:
- * continue # <<<<<<<<<<<<<<
- * f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
- * g = (1 - word_code[b] - f) * alpha
+ /* "trunk/gensim/models/doc2vec_inner.pyx":128
+ * else:
+ * target_index = table[(next_random >> 16) % table_len]
+ * next_random = (next_random * 25214903917ULL + 11) & modulo # <<<<<<<<<<<<<<
+ * if target_index == word_index:
+ * continue
+ */
+ __pyx_v_next_random = (((__pyx_v_next_random * ((unsigned PY_LONG_LONG)25214903917ULL)) + 11) & __pyx_v_modulo);
+
+ /* "trunk/gensim/models/doc2vec_inner.pyx":129
+ * target_index = table[(next_random >> 16) % table_len]
+ * next_random = (next_random * 25214903917ULL + 11) & modulo
+ * if target_index == word_index: # <<<<<<<<<<<<<<
+ * continue
+ * label = 0.0
+ */
+ __pyx_t_3 = ((__pyx_v_target_index == __pyx_v_word_index) != 0);
+ if (__pyx_t_3) {
+
+ /* "trunk/gensim/models/doc2vec_inner.pyx":130
+ * next_random = (next_random * 25214903917ULL + 11) & modulo
+ * if target_index == word_index:
+ * continue # <<<<<<<<<<<<<<
+ * label = 0.0
+ * row2 = target_index * size
+ */
+ goto __pyx_L3_continue;
+ }
+
+ /* "trunk/gensim/models/doc2vec_inner.pyx":131
+ * if target_index == word_index:
+ * continue
+ * label = 0.0 # <<<<<<<<<<<<<<
+ * row2 = target_index * size
+ * f = our_dot(&size, &context_vectors[row1], &ONE, &syn1neg[row2], &ONE)
+ */
+ __pyx_v_label = ((__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t)0.0);
+ }
+ __pyx_L5:;
+
+ /* "trunk/gensim/models/doc2vec_inner.pyx":132
+ * continue
+ * label = 0.0
+ * row2 = target_index * size # <<<<<<<<<<<<<<
+ * f = our_dot(&size, &context_vectors[row1], &ONE, &syn1neg[row2], &ONE)
+ * if f <= -MAX_EXP or f >= MAX_EXP:
+ */
+ __pyx_v_row2 = (__pyx_v_target_index * __pyx_v_size);
+
+ /* "trunk/gensim/models/doc2vec_inner.pyx":133
+ * label = 0.0
+ * row2 = target_index * size
+ * f = our_dot(&size, &context_vectors[row1], &ONE, &syn1neg[row2], &ONE) # <<<<<<<<<<<<<<
+ * if f <= -MAX_EXP or f >= MAX_EXP:
+ * continue
+ */
+ __pyx_v_f = __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_our_dot((&__pyx_v_size), (&(__pyx_v_context_vectors[__pyx_v_row1])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), (&(__pyx_v_syn1neg[__pyx_v_row2])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
+
+ /* "trunk/gensim/models/doc2vec_inner.pyx":134
+ * row2 = target_index * size
+ * f = our_dot(&size, &context_vectors[row1], &ONE, &syn1neg[row2], &ONE)
+ * if f <= -MAX_EXP or f >= MAX_EXP: # <<<<<<<<<<<<<<
+ * continue
+ * f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
*/
- goto __pyx_L5_continue;
+ __pyx_t_4 = ((__pyx_v_f <= -6.0) != 0);
+ if (!__pyx_t_4) {
+ } else {
+ __pyx_t_3 = __pyx_t_4;
+ goto __pyx_L8_bool_binop_done;
}
+ __pyx_t_4 = ((__pyx_v_f >= 6.0) != 0);
+ __pyx_t_3 = __pyx_t_4;
+ __pyx_L8_bool_binop_done:;
+ if (__pyx_t_3) {
- /* "trunk/gensim/models/doc2vec_inner.pyx":142
+ /* "trunk/gensim/models/doc2vec_inner.pyx":135
+ * f = our_dot(&size, &context_vectors[row1], &ONE, &syn1neg[row2], &ONE)
+ * if f <= -MAX_EXP or f >= MAX_EXP:
+ * continue # <<<<<<<<<<<<<<
+ * f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
+ * g = (label - f) * alpha
+ */
+ goto __pyx_L3_continue;
+ }
+
+ /* "trunk/gensim/models/doc2vec_inner.pyx":136
* if f <= -MAX_EXP or f >= MAX_EXP:
* continue
* f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))] # <<<<<<<<<<<<<<
- * g = (1 - word_code[b] - f) * alpha
- * for a in range(size):
+ * g = (label - f) * alpha
+ * our_saxpy(&size, &g, &syn1neg[row2], &ONE, work, &ONE)
*/
__pyx_v_f = (__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_EXP_TABLE[((int)((__pyx_v_f + 6.0) * 83.0))]);
- /* "trunk/gensim/models/doc2vec_inner.pyx":143
+ /* "trunk/gensim/models/doc2vec_inner.pyx":137
* continue
* f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
- * g = (1 - word_code[b] - f) * alpha # <<<<<<<<<<<<<<
- * for a in range(size):
- * work[a] += g * syn1[row2 + a]
+ * g = (label - f) * alpha # <<<<<<<<<<<<<<
+ * our_saxpy(&size, &g, &syn1neg[row2], &ONE, work, &ONE)
+ * if learn_hidden:
*/
- __pyx_v_g = (((1 - (__pyx_v_word_code[__pyx_v_b])) - __pyx_v_f) * __pyx_v_alpha);
+ __pyx_v_g = ((__pyx_v_label - __pyx_v_f) * __pyx_v_alpha);
- /* "trunk/gensim/models/doc2vec_inner.pyx":144
+ /* "trunk/gensim/models/doc2vec_inner.pyx":138
* f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
- * g = (1 - word_code[b] - f) * alpha
- * for a in range(size): # <<<<<<<<<<<<<<
- * work[a] += g * syn1[row2 + a]
- * if tw:
+ * g = (label - f) * alpha
+ * our_saxpy(&size, &g, &syn1neg[row2], &ONE, work, &ONE) # <<<<<<<<<<<<<<
+ * if learn_hidden:
+ * our_saxpy(&size, &g, &context_vectors[row1], &ONE, &syn1neg[row2], &ONE)
*/
- __pyx_t_3 = __pyx_v_size;
- for (__pyx_t_4 = 0; __pyx_t_4 < __pyx_t_3; __pyx_t_4+=1) {
- __pyx_v_a = __pyx_t_4;
+ __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_our_saxpy((&__pyx_v_size), (&__pyx_v_g), (&(__pyx_v_syn1neg[__pyx_v_row2])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), __pyx_v_work, (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
- /* "trunk/gensim/models/doc2vec_inner.pyx":145
- * g = (1 - word_code[b] - f) * alpha
- * for a in range(size):
- * work[a] += g * syn1[row2 + a] # <<<<<<<<<<<<<<
- * if tw:
- * for a in range(size):
+ /* "trunk/gensim/models/doc2vec_inner.pyx":139
+ * g = (label - f) * alpha
+ * our_saxpy(&size, &g, &syn1neg[row2], &ONE, work, &ONE)
+ * if learn_hidden: # <<<<<<<<<<<<<<
+ * our_saxpy(&size, &g, &context_vectors[row1], &ONE, &syn1neg[row2], &ONE)
+ * if learn_context:
*/
- __pyx_t_7 = __pyx_v_a;
- (__pyx_v_work[__pyx_t_7]) = ((__pyx_v_work[__pyx_t_7]) + (__pyx_v_g * (__pyx_v_syn1[(__pyx_v_row2 + __pyx_v_a)])));
+ __pyx_t_3 = (__pyx_v_learn_hidden != 0);
+ if (__pyx_t_3) {
+
+ /* "trunk/gensim/models/doc2vec_inner.pyx":140
+ * our_saxpy(&size, &g, &syn1neg[row2], &ONE, work, &ONE)
+ * if learn_hidden:
+ * our_saxpy(&size, &g, &context_vectors[row1], &ONE, &syn1neg[row2], &ONE) # <<<<<<<<<<<<<<
+ * if learn_context:
+ * our_saxpy(&size, &context_locks[context_index], work, &ONE, &context_vectors[row1], &ONE)
+ */
+ __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_our_saxpy((&__pyx_v_size), (&__pyx_v_g), (&(__pyx_v_context_vectors[__pyx_v_row1])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), (&(__pyx_v_syn1neg[__pyx_v_row2])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
+ goto __pyx_L10;
}
+ __pyx_L10:;
+ __pyx_L3_continue:;
+ }
- /* "trunk/gensim/models/doc2vec_inner.pyx":146
- * for a in range(size):
- * work[a] += g * syn1[row2 + a]
- * if tw: # <<<<<<<<<<<<<<
- * for a in range(size):
- * syn1[row2 + a] += g * syn0[row1 + a]
+ /* "trunk/gensim/models/doc2vec_inner.pyx":141
+ * if learn_hidden:
+ * our_saxpy(&size, &g, &context_vectors[row1], &ONE, &syn1neg[row2], &ONE)
+ * if learn_context: # <<<<<<<<<<<<<<
+ * our_saxpy(&size, &context_locks[context_index], work, &ONE, &context_vectors[row1], &ONE)
+ *
*/
- __pyx_t_5 = (__pyx_v_tw != 0);
- if (__pyx_t_5) {
+ __pyx_t_3 = (__pyx_v_learn_context != 0);
+ if (__pyx_t_3) {
- /* "trunk/gensim/models/doc2vec_inner.pyx":147
- * work[a] += g * syn1[row2 + a]
- * if tw:
- * for a in range(size): # <<<<<<<<<<<<<<
- * syn1[row2 + a] += g * syn0[row1 + a]
- * if tl:
- */
- __pyx_t_3 = __pyx_v_size;
- for (__pyx_t_4 = 0; __pyx_t_4 < __pyx_t_3; __pyx_t_4+=1) {
- __pyx_v_a = __pyx_t_4;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":148
- * if tw:
- * for a in range(size):
- * syn1[row2 + a] += g * syn0[row1 + a] # <<<<<<<<<<<<<<
- * if tl:
- * for a in range(size):
- */
- __pyx_t_7 = (__pyx_v_row2 + __pyx_v_a);
- (__pyx_v_syn1[__pyx_t_7]) = ((__pyx_v_syn1[__pyx_t_7]) + (__pyx_v_g * (__pyx_v_syn0[(__pyx_v_row1 + __pyx_v_a)])));
- }
- goto __pyx_L14;
- }
- __pyx_L14:;
- __pyx_L5_continue:;
+ /* "trunk/gensim/models/doc2vec_inner.pyx":142
+ * our_saxpy(&size, &g, &context_vectors[row1], &ONE, &syn1neg[row2], &ONE)
+ * if learn_context:
+ * our_saxpy(&size, &context_locks[context_index], work, &ONE, &context_vectors[row1], &ONE) # <<<<<<<<<<<<<<
+ *
+ * return next_random
+ */
+ __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_our_saxpy((&__pyx_v_size), (&(__pyx_v_context_locks[__pyx_v_context_index])), __pyx_v_work, (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), (&(__pyx_v_context_vectors[__pyx_v_row1])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
+ goto __pyx_L11;
}
+ __pyx_L11:;
- /* "trunk/gensim/models/doc2vec_inner.pyx":149
- * for a in range(size):
- * syn1[row2 + a] += g * syn0[row1 + a]
- * if tl: # <<<<<<<<<<<<<<
- * for a in range(size):
- * syn0[row1 + a] += work[a]
+ /* "trunk/gensim/models/doc2vec_inner.pyx":144
+ * our_saxpy(&size, &context_locks[context_index], work, &ONE, &context_vectors[row1], &ONE)
+ *
+ * return next_random # <<<<<<<<<<<<<<
+ *
+ *
*/
- __pyx_t_5 = (__pyx_v_tl != 0);
- if (__pyx_t_5) {
+ __pyx_r = __pyx_v_next_random;
+ goto __pyx_L0;
+
+ /* "trunk/gensim/models/doc2vec_inner.pyx":107
+ *
+ *
+ * cdef unsigned long long fast_document_dbow_neg( # <<<<<<<<<<<<<<
+ * const int negative, np.uint32_t *table, unsigned long long table_len,
+ * REAL_t *context_vectors, REAL_t *syn1neg, const int size, const np.uint32_t word_index,
+ */
+
+ /* function exit code */
+ __pyx_L0:;
+ return __pyx_r;
+}
+
+/* "trunk/gensim/models/doc2vec_inner.pyx":147
+ *
+ *
+ * cdef void fast_document_dm_hs( # <<<<<<<<<<<<<<
+ * const np.uint32_t *word_point, const np.uint8_t *word_code, int word_code_len,
+ * REAL_t *neu1, REAL_t *syn1, const REAL_t alpha, REAL_t *work,
+ */
+
+static void __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_fast_document_dm_hs(__pyx_t_5numpy_uint32_t const *__pyx_v_word_point, __pyx_t_5numpy_uint8_t const *__pyx_v_word_code, int __pyx_v_word_code_len, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_neu1, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_syn1, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t const __pyx_v_alpha, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_work, int const __pyx_v_size, int __pyx_v_learn_hidden) {
+ PY_LONG_LONG __pyx_v_b;
+ PY_LONG_LONG __pyx_v_row2;
+ __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t __pyx_v_f;
+ __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t __pyx_v_g;
+ int __pyx_t_1;
+ PY_LONG_LONG __pyx_t_2;
+ int __pyx_t_3;
+ int __pyx_t_4;
+
+ /* "trunk/gensim/models/doc2vec_inner.pyx":158
+ * # l1 already composed by caller, passed in as neu1
+ * # work (also passed in) will accumulate l1 error
+ * for b in range(word_code_len): # <<<<<<<<<<<<<<
+ * row2 = word_point[b] * size
+ * f = our_dot(&size, neu1, &ONE, &syn1[row2], &ONE)
+ */
+ __pyx_t_1 = __pyx_v_word_code_len;
+ for (__pyx_t_2 = 0; __pyx_t_2 < __pyx_t_1; __pyx_t_2+=1) {
+ __pyx_v_b = __pyx_t_2;
+
+ /* "trunk/gensim/models/doc2vec_inner.pyx":159
+ * # work (also passed in) will accumulate l1 error
+ * for b in range(word_code_len):
+ * row2 = word_point[b] * size # <<<<<<<<<<<<<<
+ * f = our_dot(&size, neu1, &ONE, &syn1[row2], &ONE)
+ * if f <= -MAX_EXP or f >= MAX_EXP:
+ */
+ __pyx_v_row2 = ((__pyx_v_word_point[__pyx_v_b]) * __pyx_v_size);
+
+ /* "trunk/gensim/models/doc2vec_inner.pyx":160
+ * for b in range(word_code_len):
+ * row2 = word_point[b] * size
+ * f = our_dot(&size, neu1, &ONE, &syn1[row2], &ONE) # <<<<<<<<<<<<<<
+ * if f <= -MAX_EXP or f >= MAX_EXP:
+ * continue
+ */
+ __pyx_v_f = __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_our_dot((&__pyx_v_size), __pyx_v_neu1, (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), (&(__pyx_v_syn1[__pyx_v_row2])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
+
+ /* "trunk/gensim/models/doc2vec_inner.pyx":161
+ * row2 = word_point[b] * size
+ * f = our_dot(&size, neu1, &ONE, &syn1[row2], &ONE)
+ * if f <= -MAX_EXP or f >= MAX_EXP: # <<<<<<<<<<<<<<
+ * continue
+ * f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
+ */
+ __pyx_t_4 = ((__pyx_v_f <= -6.0) != 0);
+ if (!__pyx_t_4) {
+ } else {
+ __pyx_t_3 = __pyx_t_4;
+ goto __pyx_L6_bool_binop_done;
+ }
+ __pyx_t_4 = ((__pyx_v_f >= 6.0) != 0);
+ __pyx_t_3 = __pyx_t_4;
+ __pyx_L6_bool_binop_done:;
+ if (__pyx_t_3) {
+
+ /* "trunk/gensim/models/doc2vec_inner.pyx":162
+ * f = our_dot(&size, neu1, &ONE, &syn1[row2], &ONE)
+ * if f <= -MAX_EXP or f >= MAX_EXP:
+ * continue # <<<<<<<<<<<<<<
+ * f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
+ * g = (1 - word_code[b] - f) * alpha
+ */
+ goto __pyx_L3_continue;
+ }
+
+ /* "trunk/gensim/models/doc2vec_inner.pyx":163
+ * if f <= -MAX_EXP or f >= MAX_EXP:
+ * continue
+ * f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))] # <<<<<<<<<<<<<<
+ * g = (1 - word_code[b] - f) * alpha
+ * our_saxpy(&size, &g, &syn1[row2], &ONE, work, &ONE)
+ */
+ __pyx_v_f = (__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_EXP_TABLE[((int)((__pyx_v_f + 6.0) * 83.0))]);
+
+ /* "trunk/gensim/models/doc2vec_inner.pyx":164
+ * continue
+ * f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
+ * g = (1 - word_code[b] - f) * alpha # <<<<<<<<<<<<<<
+ * our_saxpy(&size, &g, &syn1[row2], &ONE, work, &ONE)
+ * if learn_hidden:
+ */
+ __pyx_v_g = (((1 - (__pyx_v_word_code[__pyx_v_b])) - __pyx_v_f) * __pyx_v_alpha);
- /* "trunk/gensim/models/doc2vec_inner.pyx":150
- * syn1[row2 + a] += g * syn0[row1 + a]
- * if tl:
- * for a in range(size): # <<<<<<<<<<<<<<
- * syn0[row1 + a] += work[a]
+ /* "trunk/gensim/models/doc2vec_inner.pyx":165
+ * f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
+ * g = (1 - word_code[b] - f) * alpha
+ * our_saxpy(&size, &g, &syn1[row2], &ONE, work, &ONE) # <<<<<<<<<<<<<<
+ * if learn_hidden:
+ * our_saxpy(&size, &g, neu1, &ONE, &syn1[row2], &ONE)
+ */
+ __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_our_saxpy((&__pyx_v_size), (&__pyx_v_g), (&(__pyx_v_syn1[__pyx_v_row2])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), __pyx_v_work, (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
+
+ /* "trunk/gensim/models/doc2vec_inner.pyx":166
+ * g = (1 - word_code[b] - f) * alpha
+ * our_saxpy(&size, &g, &syn1[row2], &ONE, work, &ONE)
+ * if learn_hidden: # <<<<<<<<<<<<<<
+ * our_saxpy(&size, &g, neu1, &ONE, &syn1[row2], &ONE)
*
*/
- __pyx_t_1 = __pyx_v_size;
- for (__pyx_t_2 = 0; __pyx_t_2 < __pyx_t_1; __pyx_t_2+=1) {
- __pyx_v_a = __pyx_t_2;
+ __pyx_t_3 = (__pyx_v_learn_hidden != 0);
+ if (__pyx_t_3) {
- /* "trunk/gensim/models/doc2vec_inner.pyx":151
- * if tl:
- * for a in range(size):
- * syn0[row1 + a] += work[a] # <<<<<<<<<<<<<<
+ /* "trunk/gensim/models/doc2vec_inner.pyx":167
+ * our_saxpy(&size, &g, &syn1[row2], &ONE, work, &ONE)
+ * if learn_hidden:
+ * our_saxpy(&size, &g, neu1, &ONE, &syn1[row2], &ONE) # <<<<<<<<<<<<<<
*
*
*/
- __pyx_t_4 = (__pyx_v_row1 + __pyx_v_a);
- (__pyx_v_syn0[__pyx_t_4]) = ((__pyx_v_syn0[__pyx_t_4]) + (__pyx_v_work[__pyx_v_a]));
+ __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_our_saxpy((&__pyx_v_size), (&__pyx_v_g), __pyx_v_neu1, (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), (&(__pyx_v_syn1[__pyx_v_row2])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
+ goto __pyx_L8;
}
- goto __pyx_L17;
+ __pyx_L8:;
+ __pyx_L3_continue:;
}
- __pyx_L17:;
- /* "trunk/gensim/models/doc2vec_inner.pyx":124
+ /* "trunk/gensim/models/doc2vec_inner.pyx":147
*
*
- * cdef void fast_sentence2_dbow_hs( # <<<<<<<<<<<<<<
- * const np.uint32_t *word_point, const np.uint8_t *word_code, const int codelen,
- * REAL_t *syn0, REAL_t *syn1, const int size,
+ * cdef void fast_document_dm_hs( # <<<<<<<<<<<<<<
+ * const np.uint32_t *word_point, const np.uint8_t *word_code, int word_code_len,
+ * REAL_t *neu1, REAL_t *syn1, const REAL_t alpha, REAL_t *work,
*/
/* function exit code */
}
-/* "trunk/gensim/models/doc2vec_inner.pyx":154
+/* "trunk/gensim/models/doc2vec_inner.pyx":170
*
*
- * cdef unsigned long long fast_sentence0_dbow_neg( # <<<<<<<<<<<<<<
- * const int negative, np.uint32_t *table, unsigned long long table_len,
- * REAL_t *syn0, REAL_t *syn1neg, const int size, const np.uint32_t word_index,
+ * cdef unsigned long long fast_document_dm_neg( # <<<<<<<<<<<<<<
+ * const int negative, np.uint32_t *table, unsigned long long table_len, unsigned long long next_random,
+ * REAL_t *neu1, REAL_t *syn1neg, const int predict_word_index, const REAL_t alpha, REAL_t *work,
*/
-static unsigned PY_LONG_LONG __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_fast_sentence0_dbow_neg(int const __pyx_v_negative, __pyx_t_5numpy_uint32_t *__pyx_v_table, unsigned PY_LONG_LONG __pyx_v_table_len, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_syn0, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_syn1neg, int const __pyx_v_size, __pyx_t_5numpy_uint32_t const __pyx_v_word_index, __pyx_t_5numpy_uint32_t const __pyx_v_word2_index, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t const __pyx_v_alpha, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_work, unsigned PY_LONG_LONG __pyx_v_next_random, int __pyx_v_tw, int __pyx_v_tl) {
- PY_LONG_LONG __pyx_v_row1;
+static unsigned PY_LONG_LONG __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_fast_document_dm_neg(int const __pyx_v_negative, __pyx_t_5numpy_uint32_t *__pyx_v_table, unsigned PY_LONG_LONG __pyx_v_table_len, unsigned PY_LONG_LONG __pyx_v_next_random, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_neu1, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_syn1neg, int const __pyx_v_predict_word_index, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t const __pyx_v_alpha, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_work, int const __pyx_v_size, int __pyx_v_learn_hidden) {
PY_LONG_LONG __pyx_v_row2;
unsigned PY_LONG_LONG __pyx_v_modulo;
__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t __pyx_v_f;
@@ -2028,66 +2231,48 @@ static unsigned PY_LONG_LONG __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_fast
int __pyx_t_3;
int __pyx_t_4;
- /* "trunk/gensim/models/doc2vec_inner.pyx":161
+ /* "trunk/gensim/models/doc2vec_inner.pyx":176
*
- * cdef long long a
- * cdef long long row1 = word2_index * size, row2 # <<<<<<<<<<<<<<
- * cdef unsigned long long modulo = 281474976710655ULL
- * cdef REAL_t f, g, label
- */
- __pyx_v_row1 = (__pyx_v_word2_index * __pyx_v_size);
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":162
- * cdef long long a
- * cdef long long row1 = word2_index * size, row2
+ * cdef long long row2
* cdef unsigned long long modulo = 281474976710655ULL # <<<<<<<<<<<<<<
* cdef REAL_t f, g, label
* cdef np.uint32_t target_index
*/
__pyx_v_modulo = 281474976710655ULL;
- /* "trunk/gensim/models/doc2vec_inner.pyx":167
- * cdef int d
- *
- * memset(work, 0, size * cython.sizeof(REAL_t)) # <<<<<<<<<<<<<<
- *
- * for d in range(negative+1):
- */
- memset(__pyx_v_work, 0, (__pyx_v_size * (sizeof(__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t))));
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":169
- * memset(work, 0, size * cython.sizeof(REAL_t))
- *
+ /* "trunk/gensim/models/doc2vec_inner.pyx":183
+ * # l1 already composed by caller, passed in as neu1
+ * # work (also passsed in) will accumulate l1 error for outside application
* for d in range(negative+1): # <<<<<<<<<<<<<<
* if d == 0:
- * target_index = word_index
+ * target_index = predict_word_index
*/
__pyx_t_1 = (__pyx_v_negative + 1);
for (__pyx_t_2 = 0; __pyx_t_2 < __pyx_t_1; __pyx_t_2+=1) {
__pyx_v_d = __pyx_t_2;
- /* "trunk/gensim/models/doc2vec_inner.pyx":170
- *
+ /* "trunk/gensim/models/doc2vec_inner.pyx":184
+ * # work (also passsed in) will accumulate l1 error for outside application
* for d in range(negative+1):
* if d == 0: # <<<<<<<<<<<<<<
- * target_index = word_index
+ * target_index = predict_word_index
* label = ONEF
*/
__pyx_t_3 = ((__pyx_v_d == 0) != 0);
if (__pyx_t_3) {
- /* "trunk/gensim/models/doc2vec_inner.pyx":171
+ /* "trunk/gensim/models/doc2vec_inner.pyx":185
* for d in range(negative+1):
* if d == 0:
- * target_index = word_index # <<<<<<<<<<<<<<
+ * target_index = predict_word_index # <<<<<<<<<<<<<<
* label = ONEF
* else:
*/
- __pyx_v_target_index = __pyx_v_word_index;
+ __pyx_v_target_index = __pyx_v_predict_word_index;
- /* "trunk/gensim/models/doc2vec_inner.pyx":172
+ /* "trunk/gensim/models/doc2vec_inner.pyx":186
* if d == 0:
- * target_index = word_index
+ * target_index = predict_word_index
* label = ONEF # <<<<<<<<<<<<<<
* else:
* target_index = table[(next_random >> 16) % table_len]
@@ -2097,37 +2282,37 @@ static unsigned PY_LONG_LONG __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_fast
}
/*else*/ {
- /* "trunk/gensim/models/doc2vec_inner.pyx":174
+ /* "trunk/gensim/models/doc2vec_inner.pyx":188
* label = ONEF
* else:
* target_index = table[(next_random >> 16) % table_len] # <<<<<<<<<<<<<<
* next_random = (next_random * 25214903917ULL + 11) & modulo
- * if target_index == word_index:
+ * if target_index == predict_word_index:
*/
__pyx_v_target_index = (__pyx_v_table[((__pyx_v_next_random >> 16) % __pyx_v_table_len)]);
- /* "trunk/gensim/models/doc2vec_inner.pyx":175
+ /* "trunk/gensim/models/doc2vec_inner.pyx":189
* else:
* target_index = table[(next_random >> 16) % table_len]
* next_random = (next_random * 25214903917ULL + 11) & modulo # <<<<<<<<<<<<<<
- * if target_index == word_index:
+ * if target_index == predict_word_index:
* continue
*/
__pyx_v_next_random = (((__pyx_v_next_random * ((unsigned PY_LONG_LONG)25214903917ULL)) + 11) & __pyx_v_modulo);
- /* "trunk/gensim/models/doc2vec_inner.pyx":176
+ /* "trunk/gensim/models/doc2vec_inner.pyx":190
* target_index = table[(next_random >> 16) % table_len]
* next_random = (next_random * 25214903917ULL + 11) & modulo
- * if target_index == word_index: # <<<<<<<<<<<<<<
+ * if target_index == predict_word_index: # <<<<<<<<<<<<<<
* continue
* label = 0.0
*/
- __pyx_t_3 = ((__pyx_v_target_index == __pyx_v_word_index) != 0);
+ __pyx_t_3 = ((__pyx_v_target_index == __pyx_v_predict_word_index) != 0);
if (__pyx_t_3) {
- /* "trunk/gensim/models/doc2vec_inner.pyx":177
+ /* "trunk/gensim/models/doc2vec_inner.pyx":191
* next_random = (next_random * 25214903917ULL + 11) & modulo
- * if target_index == word_index:
+ * if target_index == predict_word_index:
* continue # <<<<<<<<<<<<<<
* label = 0.0
*
@@ -2135,8 +2320,8 @@ static unsigned PY_LONG_LONG __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_fast
goto __pyx_L3_continue;
}
- /* "trunk/gensim/models/doc2vec_inner.pyx":178
- * if target_index == word_index:
+ /* "trunk/gensim/models/doc2vec_inner.pyx":192
+ * if target_index == predict_word_index:
* continue
* label = 0.0 # <<<<<<<<<<<<<<
*
@@ -2146,27 +2331,27 @@ static unsigned PY_LONG_LONG __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_fast
}
__pyx_L5:;
- /* "trunk/gensim/models/doc2vec_inner.pyx":180
+ /* "trunk/gensim/models/doc2vec_inner.pyx":194
* label = 0.0
*
* row2 = target_index * size # <<<<<<<<<<<<<<
- * f = dsdot(&size, &syn0[row1], &ONE, &syn1neg[row2], &ONE)
+ * f = our_dot(&size, neu1, &ONE, &syn1neg[row2], &ONE)
* if f <= -MAX_EXP or f >= MAX_EXP:
*/
__pyx_v_row2 = (__pyx_v_target_index * __pyx_v_size);
- /* "trunk/gensim/models/doc2vec_inner.pyx":181
+ /* "trunk/gensim/models/doc2vec_inner.pyx":195
*
* row2 = target_index * size
- * f = dsdot(&size, &syn0[row1], &ONE, &syn1neg[row2], &ONE) # <<<<<<<<<<<<<<
+ * f = our_dot(&size, neu1, &ONE, &syn1neg[row2], &ONE) # <<<<<<<<<<<<<<
* if f <= -MAX_EXP or f >= MAX_EXP:
* continue
*/
- __pyx_v_f = ((__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t)__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_dsdot((&__pyx_v_size), (&(__pyx_v_syn0[__pyx_v_row1])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), (&(__pyx_v_syn1neg[__pyx_v_row2])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE)));
+ __pyx_v_f = __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_our_dot((&__pyx_v_size), __pyx_v_neu1, (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), (&(__pyx_v_syn1neg[__pyx_v_row2])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
- /* "trunk/gensim/models/doc2vec_inner.pyx":182
+ /* "trunk/gensim/models/doc2vec_inner.pyx":196
* row2 = target_index * size
- * f = dsdot(&size, &syn0[row1], &ONE, &syn1neg[row2], &ONE)
+ * f = our_dot(&size, neu1, &ONE, &syn1neg[row2], &ONE)
* if f <= -MAX_EXP or f >= MAX_EXP: # <<<<<<<<<<<<<<
* continue
* f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
@@ -2182,8 +2367,8 @@ static unsigned PY_LONG_LONG __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_fast
__pyx_L8_bool_binop_done:;
if (__pyx_t_3) {
- /* "trunk/gensim/models/doc2vec_inner.pyx":183
- * f = dsdot(&size, &syn0[row1], &ONE, &syn1neg[row2], &ONE)
+ /* "trunk/gensim/models/doc2vec_inner.pyx":197
+ * f = our_dot(&size, neu1, &ONE, &syn1neg[row2], &ONE)
* if f <= -MAX_EXP or f >= MAX_EXP:
* continue # <<<<<<<<<<<<<<
* f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
@@ -2192,95 +2377,73 @@ static unsigned PY_LONG_LONG __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_fast
goto __pyx_L3_continue;
}
- /* "trunk/gensim/models/doc2vec_inner.pyx":184
+ /* "trunk/gensim/models/doc2vec_inner.pyx":198
* if f <= -MAX_EXP or f >= MAX_EXP:
* continue
* f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))] # <<<<<<<<<<<<<<
* g = (label - f) * alpha
- * saxpy(&size, &g, &syn1neg[row2], &ONE, work, &ONE)
+ * our_saxpy(&size, &g, &syn1neg[row2], &ONE, work, &ONE)
*/
__pyx_v_f = (__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_EXP_TABLE[((int)((__pyx_v_f + 6.0) * 83.0))]);
- /* "trunk/gensim/models/doc2vec_inner.pyx":185
+ /* "trunk/gensim/models/doc2vec_inner.pyx":199
* continue
* f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
* g = (label - f) * alpha # <<<<<<<<<<<<<<
- * saxpy(&size, &g, &syn1neg[row2], &ONE, work, &ONE)
- * if tw:
+ * our_saxpy(&size, &g, &syn1neg[row2], &ONE, work, &ONE)
+ * if learn_hidden:
*/
__pyx_v_g = ((__pyx_v_label - __pyx_v_f) * __pyx_v_alpha);
- /* "trunk/gensim/models/doc2vec_inner.pyx":186
+ /* "trunk/gensim/models/doc2vec_inner.pyx":200
* f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
* g = (label - f) * alpha
- * saxpy(&size, &g, &syn1neg[row2], &ONE, work, &ONE) # <<<<<<<<<<<<<<
- * if tw:
- * saxpy(&size, &g, &syn0[row1], &ONE, &syn1neg[row2], &ONE)
+ * our_saxpy(&size, &g, &syn1neg[row2], &ONE, work, &ONE) # <<<<<<<<<<<<<<
+ * if learn_hidden:
+ * our_saxpy(&size, &g, neu1, &ONE, &syn1neg[row2], &ONE)
*/
- __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_saxpy((&__pyx_v_size), (&__pyx_v_g), (&(__pyx_v_syn1neg[__pyx_v_row2])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), __pyx_v_work, (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
+ __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_our_saxpy((&__pyx_v_size), (&__pyx_v_g), (&(__pyx_v_syn1neg[__pyx_v_row2])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), __pyx_v_work, (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
- /* "trunk/gensim/models/doc2vec_inner.pyx":187
+ /* "trunk/gensim/models/doc2vec_inner.pyx":201
* g = (label - f) * alpha
- * saxpy(&size, &g, &syn1neg[row2], &ONE, work, &ONE)
- * if tw: # <<<<<<<<<<<<<<
- * saxpy(&size, &g, &syn0[row1], &ONE, &syn1neg[row2], &ONE)
- * if tl:
+ * our_saxpy(&size, &g, &syn1neg[row2], &ONE, work, &ONE)
+ * if learn_hidden: # <<<<<<<<<<<<<<
+ * our_saxpy(&size, &g, neu1, &ONE, &syn1neg[row2], &ONE)
+ *
*/
- __pyx_t_3 = (__pyx_v_tw != 0);
+ __pyx_t_3 = (__pyx_v_learn_hidden != 0);
if (__pyx_t_3) {
- /* "trunk/gensim/models/doc2vec_inner.pyx":188
- * saxpy(&size, &g, &syn1neg[row2], &ONE, work, &ONE)
- * if tw:
- * saxpy(&size, &g, &syn0[row1], &ONE, &syn1neg[row2], &ONE) # <<<<<<<<<<<<<<
- * if tl:
- * saxpy(&size, &ONEF, work, &ONE, &syn0[row1], &ONE)
+ /* "trunk/gensim/models/doc2vec_inner.pyx":202
+ * our_saxpy(&size, &g, &syn1neg[row2], &ONE, work, &ONE)
+ * if learn_hidden:
+ * our_saxpy(&size, &g, neu1, &ONE, &syn1neg[row2], &ONE) # <<<<<<<<<<<<<<
+ *
+ * return next_random
*/
- __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_saxpy((&__pyx_v_size), (&__pyx_v_g), (&(__pyx_v_syn0[__pyx_v_row1])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), (&(__pyx_v_syn1neg[__pyx_v_row2])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
+ __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_our_saxpy((&__pyx_v_size), (&__pyx_v_g), __pyx_v_neu1, (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), (&(__pyx_v_syn1neg[__pyx_v_row2])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
goto __pyx_L10;
}
__pyx_L10:;
__pyx_L3_continue:;
}
- /* "trunk/gensim/models/doc2vec_inner.pyx":189
- * if tw:
- * saxpy(&size, &g, &syn0[row1], &ONE, &syn1neg[row2], &ONE)
- * if tl: # <<<<<<<<<<<<<<
- * saxpy(&size, &ONEF, work, &ONE, &syn0[row1], &ONE)
- *
- */
- __pyx_t_3 = (__pyx_v_tl != 0);
- if (__pyx_t_3) {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":190
- * saxpy(&size, &g, &syn0[row1], &ONE, &syn1neg[row2], &ONE)
- * if tl:
- * saxpy(&size, &ONEF, work, &ONE, &syn0[row1], &ONE) # <<<<<<<<<<<<<<
- *
- * return next_random
- */
- __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_saxpy((&__pyx_v_size), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONEF), __pyx_v_work, (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), (&(__pyx_v_syn0[__pyx_v_row1])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
- goto __pyx_L11;
- }
- __pyx_L11:;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":192
- * saxpy(&size, &ONEF, work, &ONE, &syn0[row1], &ONE)
+ /* "trunk/gensim/models/doc2vec_inner.pyx":204
+ * our_saxpy(&size, &g, neu1, &ONE, &syn1neg[row2], &ONE)
*
* return next_random # <<<<<<<<<<<<<<
*
- * cdef unsigned long long fast_sentence1_dbow_neg(
+ * cdef void fast_document_dmc_hs(
*/
__pyx_r = __pyx_v_next_random;
goto __pyx_L0;
- /* "trunk/gensim/models/doc2vec_inner.pyx":154
+ /* "trunk/gensim/models/doc2vec_inner.pyx":170
*
*
- * cdef unsigned long long fast_sentence0_dbow_neg( # <<<<<<<<<<<<<<
- * const int negative, np.uint32_t *table, unsigned long long table_len,
- * REAL_t *syn0, REAL_t *syn1neg, const int size, const np.uint32_t word_index,
+ * cdef unsigned long long fast_document_dm_neg( # <<<<<<<<<<<<<<
+ * const int negative, np.uint32_t *table, unsigned long long table_len, unsigned long long next_random,
+ * REAL_t *neu1, REAL_t *syn1neg, const int predict_word_index, const REAL_t alpha, REAL_t *work,
*/
/* function exit code */
@@ -2288,168 +2451,56 @@ static unsigned PY_LONG_LONG __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_fast
return __pyx_r;
}
-/* "trunk/gensim/models/doc2vec_inner.pyx":194
+/* "trunk/gensim/models/doc2vec_inner.pyx":206
* return next_random
*
- * cdef unsigned long long fast_sentence1_dbow_neg( # <<<<<<<<<<<<<<
- * const int negative, np.uint32_t *table, unsigned long long table_len,
- * REAL_t *syn0, REAL_t *syn1neg, const int size, const np.uint32_t word_index,
+ * cdef void fast_document_dmc_hs( # <<<<<<<<<<<<<<
+ * const np.uint32_t *word_point, const np.uint8_t *word_code, int word_code_len,
+ * REAL_t *neu1, REAL_t *syn1, const REAL_t alpha, REAL_t *work,
*/
-static unsigned PY_LONG_LONG __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_fast_sentence1_dbow_neg(int const __pyx_v_negative, __pyx_t_5numpy_uint32_t *__pyx_v_table, unsigned PY_LONG_LONG __pyx_v_table_len, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_syn0, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_syn1neg, int const __pyx_v_size, __pyx_t_5numpy_uint32_t const __pyx_v_word_index, __pyx_t_5numpy_uint32_t const __pyx_v_word2_index, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t const __pyx_v_alpha, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_work, unsigned PY_LONG_LONG __pyx_v_next_random, int __pyx_v_tw, int __pyx_v_tl) {
- PY_LONG_LONG __pyx_v_row1;
+static void __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_fast_document_dmc_hs(__pyx_t_5numpy_uint32_t const *__pyx_v_word_point, __pyx_t_5numpy_uint8_t const *__pyx_v_word_code, int __pyx_v_word_code_len, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_neu1, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_syn1, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t const __pyx_v_alpha, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_work, int const __pyx_v_layer1_size, CYTHON_UNUSED int const __pyx_v_vector_size, int __pyx_v_learn_hidden) {
+ PY_LONG_LONG __pyx_v_b;
PY_LONG_LONG __pyx_v_row2;
- unsigned PY_LONG_LONG __pyx_v_modulo;
__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t __pyx_v_f;
__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t __pyx_v_g;
- __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t __pyx_v_label;
- __pyx_t_5numpy_uint32_t __pyx_v_target_index;
- int __pyx_v_d;
- unsigned PY_LONG_LONG __pyx_r;
- long __pyx_t_1;
- int __pyx_t_2;
+ int __pyx_t_1;
+ PY_LONG_LONG __pyx_t_2;
int __pyx_t_3;
int __pyx_t_4;
- /* "trunk/gensim/models/doc2vec_inner.pyx":201
- *
- * cdef long long a
- * cdef long long row1 = word2_index * size, row2 # <<<<<<<<<<<<<<
- * cdef unsigned long long modulo = 281474976710655ULL
- * cdef REAL_t f, g, label
- */
- __pyx_v_row1 = (__pyx_v_word2_index * __pyx_v_size);
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":202
- * cdef long long a
- * cdef long long row1 = word2_index * size, row2
- * cdef unsigned long long modulo = 281474976710655ULL # <<<<<<<<<<<<<<
- * cdef REAL_t f, g, label
- * cdef np.uint32_t target_index
- */
- __pyx_v_modulo = 281474976710655ULL;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":207
- * cdef int d
- *
- * memset(work, 0, size * cython.sizeof(REAL_t)) # <<<<<<<<<<<<<<
- *
- * for d in range(negative+1):
- */
- memset(__pyx_v_work, 0, (__pyx_v_size * (sizeof(__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t))));
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":209
- * memset(work, 0, size * cython.sizeof(REAL_t))
- *
- * for d in range(negative+1): # <<<<<<<<<<<<<<
- *
- * if d == 0:
+ /* "trunk/gensim/models/doc2vec_inner.pyx":218
+ * # l1 already composed by caller, passed in as neu1
+ * # work accumulates net l1 error; eventually applied by caller
+ * for b in range(word_code_len): # <<<<<<<<<<<<<<
+ * row2 = word_point[b] * layer1_size
+ * f = our_dot(&layer1_size, neu1, &ONE, &syn1[row2], &ONE)
*/
- __pyx_t_1 = (__pyx_v_negative + 1);
+ __pyx_t_1 = __pyx_v_word_code_len;
for (__pyx_t_2 = 0; __pyx_t_2 < __pyx_t_1; __pyx_t_2+=1) {
- __pyx_v_d = __pyx_t_2;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":211
- * for d in range(negative+1):
- *
- * if d == 0: # <<<<<<<<<<<<<<
- * target_index = word_index
- * label = ONEF
- */
- __pyx_t_3 = ((__pyx_v_d == 0) != 0);
- if (__pyx_t_3) {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":212
- *
- * if d == 0:
- * target_index = word_index # <<<<<<<<<<<<<<
- * label = ONEF
- * else:
- */
- __pyx_v_target_index = __pyx_v_word_index;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":213
- * if d == 0:
- * target_index = word_index
- * label = ONEF # <<<<<<<<<<<<<<
- * else:
- * target_index = table[(next_random >> 16) % table_len]
- */
- __pyx_v_label = __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONEF;
- goto __pyx_L5;
- }
- /*else*/ {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":215
- * label = ONEF
- * else:
- * target_index = table[(next_random >> 16) % table_len] # <<<<<<<<<<<<<<
- * next_random = (next_random * 25214903917ULL + 11) & modulo
- * if target_index == word_index:
- */
- __pyx_v_target_index = (__pyx_v_table[((__pyx_v_next_random >> 16) % __pyx_v_table_len)]);
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":216
- * else:
- * target_index = table[(next_random >> 16) % table_len]
- * next_random = (next_random * 25214903917ULL + 11) & modulo # <<<<<<<<<<<<<<
- * if target_index == word_index:
- * continue
- */
- __pyx_v_next_random = (((__pyx_v_next_random * ((unsigned PY_LONG_LONG)25214903917ULL)) + 11) & __pyx_v_modulo);
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":217
- * target_index = table[(next_random >> 16) % table_len]
- * next_random = (next_random * 25214903917ULL + 11) & modulo
- * if target_index == word_index: # <<<<<<<<<<<<<<
- * continue
- * label = 0.0
- */
- __pyx_t_3 = ((__pyx_v_target_index == __pyx_v_word_index) != 0);
- if (__pyx_t_3) {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":218
- * next_random = (next_random * 25214903917ULL + 11) & modulo
- * if target_index == word_index:
- * continue # <<<<<<<<<<<<<<
- * label = 0.0
- *
- */
- goto __pyx_L3_continue;
- }
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":219
- * if target_index == word_index:
- * continue
- * label = 0.0 # <<<<<<<<<<<<<<
- *
- * row2 = target_index * size
- */
- __pyx_v_label = ((__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t)0.0);
- }
- __pyx_L5:;
+ __pyx_v_b = __pyx_t_2;
- /* "trunk/gensim/models/doc2vec_inner.pyx":221
- * label = 0.0
- *
- * row2 = target_index * size # <<<<<<<<<<<<<<
- * f = sdot(&size, &syn0[row1], &ONE, &syn1neg[row2], &ONE)
+ /* "trunk/gensim/models/doc2vec_inner.pyx":219
+ * # work accumulates net l1 error; eventually applied by caller
+ * for b in range(word_code_len):
+ * row2 = word_point[b] * layer1_size # <<<<<<<<<<<<<<
+ * f = our_dot(&layer1_size, neu1, &ONE, &syn1[row2], &ONE)
* if f <= -MAX_EXP or f >= MAX_EXP:
*/
- __pyx_v_row2 = (__pyx_v_target_index * __pyx_v_size);
+ __pyx_v_row2 = ((__pyx_v_word_point[__pyx_v_b]) * __pyx_v_layer1_size);
- /* "trunk/gensim/models/doc2vec_inner.pyx":222
- *
- * row2 = target_index * size
- * f = sdot(&size, &syn0[row1], &ONE, &syn1neg[row2], &ONE) # <<<<<<<<<<<<<<
+ /* "trunk/gensim/models/doc2vec_inner.pyx":220
+ * for b in range(word_code_len):
+ * row2 = word_point[b] * layer1_size
+ * f = our_dot(&layer1_size, neu1, &ONE, &syn1[row2], &ONE) # <<<<<<<<<<<<<<
* if f <= -MAX_EXP or f >= MAX_EXP:
* continue
*/
- __pyx_v_f = ((__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t)__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_sdot((&__pyx_v_size), (&(__pyx_v_syn0[__pyx_v_row1])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), (&(__pyx_v_syn1neg[__pyx_v_row2])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE)));
+ __pyx_v_f = __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_our_dot((&__pyx_v_layer1_size), __pyx_v_neu1, (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), (&(__pyx_v_syn1[__pyx_v_row2])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
- /* "trunk/gensim/models/doc2vec_inner.pyx":223
- * row2 = target_index * size
- * f = sdot(&size, &syn0[row1], &ONE, &syn1neg[row2], &ONE)
+ /* "trunk/gensim/models/doc2vec_inner.pyx":221
+ * row2 = word_point[b] * layer1_size
+ * f = our_dot(&layer1_size, neu1, &ONE, &syn1[row2], &ONE)
* if f <= -MAX_EXP or f >= MAX_EXP: # <<<<<<<<<<<<<<
* continue
* f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
@@ -2458,130 +2509,94 @@ static unsigned PY_LONG_LONG __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_fast
if (!__pyx_t_4) {
} else {
__pyx_t_3 = __pyx_t_4;
- goto __pyx_L8_bool_binop_done;
+ goto __pyx_L6_bool_binop_done;
}
__pyx_t_4 = ((__pyx_v_f >= 6.0) != 0);
__pyx_t_3 = __pyx_t_4;
- __pyx_L8_bool_binop_done:;
+ __pyx_L6_bool_binop_done:;
if (__pyx_t_3) {
- /* "trunk/gensim/models/doc2vec_inner.pyx":224
- * f = sdot(&size, &syn0[row1], &ONE, &syn1neg[row2], &ONE)
+ /* "trunk/gensim/models/doc2vec_inner.pyx":222
+ * f = our_dot(&layer1_size, neu1, &ONE, &syn1[row2], &ONE)
* if f <= -MAX_EXP or f >= MAX_EXP:
* continue # <<<<<<<<<<<<<<
* f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
- * g = (label - f) * alpha
+ * g = (1 - word_code[b] - f) * alpha
*/
goto __pyx_L3_continue;
}
- /* "trunk/gensim/models/doc2vec_inner.pyx":225
+ /* "trunk/gensim/models/doc2vec_inner.pyx":223
* if f <= -MAX_EXP or f >= MAX_EXP:
* continue
* f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))] # <<<<<<<<<<<<<<
- * g = (label - f) * alpha
- * saxpy(&size, &g, &syn1neg[row2], &ONE, work, &ONE)
+ * g = (1 - word_code[b] - f) * alpha
+ * our_saxpy(&layer1_size, &g, &syn1[row2], &ONE, work, &ONE)
*/
__pyx_v_f = (__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_EXP_TABLE[((int)((__pyx_v_f + 6.0) * 83.0))]);
- /* "trunk/gensim/models/doc2vec_inner.pyx":226
+ /* "trunk/gensim/models/doc2vec_inner.pyx":224
* continue
* f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
- * g = (label - f) * alpha # <<<<<<<<<<<<<<
- * saxpy(&size, &g, &syn1neg[row2], &ONE, work, &ONE)
- * if tw:
+ * g = (1 - word_code[b] - f) * alpha # <<<<<<<<<<<<<<
+ * our_saxpy(&layer1_size, &g, &syn1[row2], &ONE, work, &ONE)
+ * if learn_hidden:
*/
- __pyx_v_g = ((__pyx_v_label - __pyx_v_f) * __pyx_v_alpha);
+ __pyx_v_g = (((1 - (__pyx_v_word_code[__pyx_v_b])) - __pyx_v_f) * __pyx_v_alpha);
- /* "trunk/gensim/models/doc2vec_inner.pyx":227
+ /* "trunk/gensim/models/doc2vec_inner.pyx":225
* f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
- * g = (label - f) * alpha
- * saxpy(&size, &g, &syn1neg[row2], &ONE, work, &ONE) # <<<<<<<<<<<<<<
- * if tw:
- * saxpy(&size, &g, &syn0[row1], &ONE, &syn1neg[row2], &ONE)
+ * g = (1 - word_code[b] - f) * alpha
+ * our_saxpy(&layer1_size, &g, &syn1[row2], &ONE, work, &ONE) # <<<<<<<<<<<<<<
+ * if learn_hidden:
+ * our_saxpy(&layer1_size, &g, neu1, &ONE, &syn1[row2], &ONE)
*/
- __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_saxpy((&__pyx_v_size), (&__pyx_v_g), (&(__pyx_v_syn1neg[__pyx_v_row2])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), __pyx_v_work, (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
+ __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_our_saxpy((&__pyx_v_layer1_size), (&__pyx_v_g), (&(__pyx_v_syn1[__pyx_v_row2])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), __pyx_v_work, (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
- /* "trunk/gensim/models/doc2vec_inner.pyx":228
- * g = (label - f) * alpha
- * saxpy(&size, &g, &syn1neg[row2], &ONE, work, &ONE)
- * if tw: # <<<<<<<<<<<<<<
- * saxpy(&size, &g, &syn0[row1], &ONE, &syn1neg[row2], &ONE)
- * if tl:
+ /* "trunk/gensim/models/doc2vec_inner.pyx":226
+ * g = (1 - word_code[b] - f) * alpha
+ * our_saxpy(&layer1_size, &g, &syn1[row2], &ONE, work, &ONE)
+ * if learn_hidden: # <<<<<<<<<<<<<<
+ * our_saxpy(&layer1_size, &g, neu1, &ONE, &syn1[row2], &ONE)
+ *
*/
- __pyx_t_3 = (__pyx_v_tw != 0);
+ __pyx_t_3 = (__pyx_v_learn_hidden != 0);
if (__pyx_t_3) {
- /* "trunk/gensim/models/doc2vec_inner.pyx":229
- * saxpy(&size, &g, &syn1neg[row2], &ONE, work, &ONE)
- * if tw:
- * saxpy(&size, &g, &syn0[row1], &ONE, &syn1neg[row2], &ONE) # <<<<<<<<<<<<<<
- * if tl:
- * saxpy(&size, &ONEF, work, &ONE, &syn0[row1], &ONE)
- */
- __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_saxpy((&__pyx_v_size), (&__pyx_v_g), (&(__pyx_v_syn0[__pyx_v_row1])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), (&(__pyx_v_syn1neg[__pyx_v_row2])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
- goto __pyx_L10;
- }
- __pyx_L10:;
- __pyx_L3_continue:;
- }
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":230
- * if tw:
- * saxpy(&size, &g, &syn0[row1], &ONE, &syn1neg[row2], &ONE)
- * if tl: # <<<<<<<<<<<<<<
- * saxpy(&size, &ONEF, work, &ONE, &syn0[row1], &ONE)
+ /* "trunk/gensim/models/doc2vec_inner.pyx":227
+ * our_saxpy(&layer1_size, &g, &syn1[row2], &ONE, work, &ONE)
+ * if learn_hidden:
+ * our_saxpy(&layer1_size, &g, neu1, &ONE, &syn1[row2], &ONE) # <<<<<<<<<<<<<<
*
- */
- __pyx_t_3 = (__pyx_v_tl != 0);
- if (__pyx_t_3) {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":231
- * saxpy(&size, &g, &syn0[row1], &ONE, &syn1neg[row2], &ONE)
- * if tl:
- * saxpy(&size, &ONEF, work, &ONE, &syn0[row1], &ONE) # <<<<<<<<<<<<<<
*
- * return next_random
*/
- __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_saxpy((&__pyx_v_size), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONEF), __pyx_v_work, (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), (&(__pyx_v_syn0[__pyx_v_row1])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
- goto __pyx_L11;
+ __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_our_saxpy((&__pyx_v_layer1_size), (&__pyx_v_g), __pyx_v_neu1, (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), (&(__pyx_v_syn1[__pyx_v_row2])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
+ goto __pyx_L8;
+ }
+ __pyx_L8:;
+ __pyx_L3_continue:;
}
- __pyx_L11:;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":233
- * saxpy(&size, &ONEF, work, &ONE, &syn0[row1], &ONE)
- *
- * return next_random # <<<<<<<<<<<<<<
- *
- * cdef unsigned long long fast_sentence2_dbow_neg(
- */
- __pyx_r = __pyx_v_next_random;
- goto __pyx_L0;
- /* "trunk/gensim/models/doc2vec_inner.pyx":194
+ /* "trunk/gensim/models/doc2vec_inner.pyx":206
* return next_random
*
- * cdef unsigned long long fast_sentence1_dbow_neg( # <<<<<<<<<<<<<<
- * const int negative, np.uint32_t *table, unsigned long long table_len,
- * REAL_t *syn0, REAL_t *syn1neg, const int size, const np.uint32_t word_index,
+ * cdef void fast_document_dmc_hs( # <<<<<<<<<<<<<<
+ * const np.uint32_t *word_point, const np.uint8_t *word_code, int word_code_len,
+ * REAL_t *neu1, REAL_t *syn1, const REAL_t alpha, REAL_t *work,
*/
/* function exit code */
- __pyx_L0:;
- return __pyx_r;
}
-/* "trunk/gensim/models/doc2vec_inner.pyx":235
- * return next_random
+/* "trunk/gensim/models/doc2vec_inner.pyx":230
*
- * cdef unsigned long long fast_sentence2_dbow_neg( # <<<<<<<<<<<<<<
- * const int negative, np.uint32_t *table, unsigned long long table_len,
- * REAL_t *syn0, REAL_t *syn1neg, const int size, const np.uint32_t word_index,
+ *
+ * cdef unsigned long long fast_document_dmc_neg( # <<<<<<<<<<<<<<
+ * const int negative, np.uint32_t *table, unsigned long long table_len, unsigned long long next_random,
+ * REAL_t *neu1, REAL_t *syn1neg, const int predict_word_index, const REAL_t alpha, REAL_t *work,
*/
-static unsigned PY_LONG_LONG __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_fast_sentence2_dbow_neg(int const __pyx_v_negative, __pyx_t_5numpy_uint32_t *__pyx_v_table, unsigned PY_LONG_LONG __pyx_v_table_len, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_syn0, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_syn1neg, int const __pyx_v_size, __pyx_t_5numpy_uint32_t const __pyx_v_word_index, __pyx_t_5numpy_uint32_t const __pyx_v_word2_index, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t const __pyx_v_alpha, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_work, unsigned PY_LONG_LONG __pyx_v_next_random, int __pyx_v_tw, int __pyx_v_tl) {
- PY_LONG_LONG __pyx_v_a;
- PY_LONG_LONG __pyx_v_row1;
+static unsigned PY_LONG_LONG __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_fast_document_dmc_neg(int const __pyx_v_negative, __pyx_t_5numpy_uint32_t *__pyx_v_table, unsigned PY_LONG_LONG __pyx_v_table_len, unsigned PY_LONG_LONG __pyx_v_next_random, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_neu1, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_syn1neg, int const __pyx_v_predict_word_index, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t const __pyx_v_alpha, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_work, int const __pyx_v_layer1_size, CYTHON_UNUSED int const __pyx_v_vector_size, int __pyx_v_learn_hidden) {
PY_LONG_LONG __pyx_v_row2;
unsigned PY_LONG_LONG __pyx_v_modulo;
__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t __pyx_v_f;
@@ -2590,339 +2605,224 @@ static unsigned PY_LONG_LONG __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_fast
__pyx_t_5numpy_uint32_t __pyx_v_target_index;
int __pyx_v_d;
unsigned PY_LONG_LONG __pyx_r;
- int __pyx_t_1;
- PY_LONG_LONG __pyx_t_2;
- long __pyx_t_3;
+ long __pyx_t_1;
+ int __pyx_t_2;
+ int __pyx_t_3;
int __pyx_t_4;
- int __pyx_t_5;
- int __pyx_t_6;
- PY_LONG_LONG __pyx_t_7;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":242
- *
- * cdef long long a
- * cdef long long row1 = word2_index * size, row2 # <<<<<<<<<<<<<<
- * cdef unsigned long long modulo = 281474976710655ULL
- * cdef REAL_t f, g, label
- */
- __pyx_v_row1 = (__pyx_v_word2_index * __pyx_v_size);
- /* "trunk/gensim/models/doc2vec_inner.pyx":243
+ /* "trunk/gensim/models/doc2vec_inner.pyx":237
* cdef long long a
- * cdef long long row1 = word2_index * size, row2
+ * cdef long long row2
* cdef unsigned long long modulo = 281474976710655ULL # <<<<<<<<<<<<<<
* cdef REAL_t f, g, label
* cdef np.uint32_t target_index
*/
__pyx_v_modulo = 281474976710655ULL;
- /* "trunk/gensim/models/doc2vec_inner.pyx":248
- * cdef int d
- *
- * for a in range(size): # <<<<<<<<<<<<<<
- * work[a] = 0.0
- *
- */
- __pyx_t_1 = __pyx_v_size;
- for (__pyx_t_2 = 0; __pyx_t_2 < __pyx_t_1; __pyx_t_2+=1) {
- __pyx_v_a = __pyx_t_2;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":249
- *
- * for a in range(size):
- * work[a] = 0.0 # <<<<<<<<<<<<<<
- *
- * for d in range(negative+1):
- */
- (__pyx_v_work[__pyx_v_a]) = ((__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t)0.0);
- }
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":251
- * work[a] = 0.0
- *
+ /* "trunk/gensim/models/doc2vec_inner.pyx":244
+ * # l1 already composed by caller, passed in as neu1
+ * # work accumulates net l1 error; eventually applied by caller
* for d in range(negative+1): # <<<<<<<<<<<<<<
- *
* if d == 0:
+ * target_index = predict_word_index
*/
- __pyx_t_3 = (__pyx_v_negative + 1);
- for (__pyx_t_1 = 0; __pyx_t_1 < __pyx_t_3; __pyx_t_1+=1) {
- __pyx_v_d = __pyx_t_1;
+ __pyx_t_1 = (__pyx_v_negative + 1);
+ for (__pyx_t_2 = 0; __pyx_t_2 < __pyx_t_1; __pyx_t_2+=1) {
+ __pyx_v_d = __pyx_t_2;
- /* "trunk/gensim/models/doc2vec_inner.pyx":253
+ /* "trunk/gensim/models/doc2vec_inner.pyx":245
+ * # work accumulates net l1 error; eventually applied by caller
* for d in range(negative+1):
- *
* if d == 0: # <<<<<<<<<<<<<<
- * target_index = word_index
+ * target_index = predict_word_index
* label = ONEF
*/
- __pyx_t_4 = ((__pyx_v_d == 0) != 0);
- if (__pyx_t_4) {
+ __pyx_t_3 = ((__pyx_v_d == 0) != 0);
+ if (__pyx_t_3) {
- /* "trunk/gensim/models/doc2vec_inner.pyx":254
- *
+ /* "trunk/gensim/models/doc2vec_inner.pyx":246
+ * for d in range(negative+1):
* if d == 0:
- * target_index = word_index # <<<<<<<<<<<<<<
+ * target_index = predict_word_index # <<<<<<<<<<<<<<
* label = ONEF
* else:
*/
- __pyx_v_target_index = __pyx_v_word_index;
+ __pyx_v_target_index = __pyx_v_predict_word_index;
- /* "trunk/gensim/models/doc2vec_inner.pyx":255
+ /* "trunk/gensim/models/doc2vec_inner.pyx":247
* if d == 0:
- * target_index = word_index
+ * target_index = predict_word_index
* label = ONEF # <<<<<<<<<<<<<<
* else:
* target_index = table[(next_random >> 16) % table_len]
*/
__pyx_v_label = __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONEF;
- goto __pyx_L7;
+ goto __pyx_L5;
}
/*else*/ {
- /* "trunk/gensim/models/doc2vec_inner.pyx":257
+ /* "trunk/gensim/models/doc2vec_inner.pyx":249
* label = ONEF
* else:
* target_index = table[(next_random >> 16) % table_len] # <<<<<<<<<<<<<<
* next_random = (next_random * 25214903917ULL + 11) & modulo
- * if target_index == word_index:
+ * if target_index == predict_word_index:
*/
__pyx_v_target_index = (__pyx_v_table[((__pyx_v_next_random >> 16) % __pyx_v_table_len)]);
- /* "trunk/gensim/models/doc2vec_inner.pyx":258
+ /* "trunk/gensim/models/doc2vec_inner.pyx":250
* else:
* target_index = table[(next_random >> 16) % table_len]
* next_random = (next_random * 25214903917ULL + 11) & modulo # <<<<<<<<<<<<<<
- * if target_index == word_index:
+ * if target_index == predict_word_index:
* continue
*/
__pyx_v_next_random = (((__pyx_v_next_random * ((unsigned PY_LONG_LONG)25214903917ULL)) + 11) & __pyx_v_modulo);
- /* "trunk/gensim/models/doc2vec_inner.pyx":259
+ /* "trunk/gensim/models/doc2vec_inner.pyx":251
* target_index = table[(next_random >> 16) % table_len]
* next_random = (next_random * 25214903917ULL + 11) & modulo
- * if target_index == word_index: # <<<<<<<<<<<<<<
+ * if target_index == predict_word_index: # <<<<<<<<<<<<<<
* continue
* label = 0.0
*/
- __pyx_t_4 = ((__pyx_v_target_index == __pyx_v_word_index) != 0);
- if (__pyx_t_4) {
+ __pyx_t_3 = ((__pyx_v_target_index == __pyx_v_predict_word_index) != 0);
+ if (__pyx_t_3) {
- /* "trunk/gensim/models/doc2vec_inner.pyx":260
+ /* "trunk/gensim/models/doc2vec_inner.pyx":252
* next_random = (next_random * 25214903917ULL + 11) & modulo
- * if target_index == word_index:
+ * if target_index == predict_word_index:
* continue # <<<<<<<<<<<<<<
* label = 0.0
*
*/
- goto __pyx_L5_continue;
+ goto __pyx_L3_continue;
}
- /* "trunk/gensim/models/doc2vec_inner.pyx":261
- * if target_index == word_index:
+ /* "trunk/gensim/models/doc2vec_inner.pyx":253
+ * if target_index == predict_word_index:
* continue
* label = 0.0 # <<<<<<<<<<<<<<
*
- * row2 = target_index * size
+ * row2 = target_index * layer1_size
*/
__pyx_v_label = ((__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t)0.0);
}
- __pyx_L7:;
+ __pyx_L5:;
- /* "trunk/gensim/models/doc2vec_inner.pyx":263
+ /* "trunk/gensim/models/doc2vec_inner.pyx":255
* label = 0.0
*
- * row2 = target_index * size # <<<<<<<<<<<<<<
- * f = 0.0
- * for a in range(size):
- */
- __pyx_v_row2 = (__pyx_v_target_index * __pyx_v_size);
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":264
- *
- * row2 = target_index * size
- * f = 0.0 # <<<<<<<<<<<<<<
- * for a in range(size):
- * f += syn0[row1 + a] * syn1neg[row2 + a]
- */
- __pyx_v_f = ((__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t)0.0);
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":265
- * row2 = target_index * size
- * f = 0.0
- * for a in range(size): # <<<<<<<<<<<<<<
- * f += syn0[row1 + a] * syn1neg[row2 + a]
+ * row2 = target_index * layer1_size # <<<<<<<<<<<<<<
+ * f = our_dot(&layer1_size, neu1, &ONE, &syn1neg[row2], &ONE)
* if f <= -MAX_EXP or f >= MAX_EXP:
*/
- __pyx_t_5 = __pyx_v_size;
- for (__pyx_t_2 = 0; __pyx_t_2 < __pyx_t_5; __pyx_t_2+=1) {
- __pyx_v_a = __pyx_t_2;
+ __pyx_v_row2 = (__pyx_v_target_index * __pyx_v_layer1_size);
- /* "trunk/gensim/models/doc2vec_inner.pyx":266
- * f = 0.0
- * for a in range(size):
- * f += syn0[row1 + a] * syn1neg[row2 + a] # <<<<<<<<<<<<<<
+ /* "trunk/gensim/models/doc2vec_inner.pyx":256
+ *
+ * row2 = target_index * layer1_size
+ * f = our_dot(&layer1_size, neu1, &ONE, &syn1neg[row2], &ONE) # <<<<<<<<<<<<<<
* if f <= -MAX_EXP or f >= MAX_EXP:
* continue
*/
- __pyx_v_f = (__pyx_v_f + ((__pyx_v_syn0[(__pyx_v_row1 + __pyx_v_a)]) * (__pyx_v_syn1neg[(__pyx_v_row2 + __pyx_v_a)])));
- }
+ __pyx_v_f = __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_our_dot((&__pyx_v_layer1_size), __pyx_v_neu1, (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), (&(__pyx_v_syn1neg[__pyx_v_row2])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
- /* "trunk/gensim/models/doc2vec_inner.pyx":267
- * for a in range(size):
- * f += syn0[row1 + a] * syn1neg[row2 + a]
+ /* "trunk/gensim/models/doc2vec_inner.pyx":257
+ * row2 = target_index * layer1_size
+ * f = our_dot(&layer1_size, neu1, &ONE, &syn1neg[row2], &ONE)
* if f <= -MAX_EXP or f >= MAX_EXP: # <<<<<<<<<<<<<<
* continue
* f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
*/
- __pyx_t_6 = ((__pyx_v_f <= -6.0) != 0);
- if (!__pyx_t_6) {
+ __pyx_t_4 = ((__pyx_v_f <= -6.0) != 0);
+ if (!__pyx_t_4) {
} else {
- __pyx_t_4 = __pyx_t_6;
- goto __pyx_L12_bool_binop_done;
+ __pyx_t_3 = __pyx_t_4;
+ goto __pyx_L8_bool_binop_done;
}
- __pyx_t_6 = ((__pyx_v_f >= 6.0) != 0);
- __pyx_t_4 = __pyx_t_6;
- __pyx_L12_bool_binop_done:;
- if (__pyx_t_4) {
+ __pyx_t_4 = ((__pyx_v_f >= 6.0) != 0);
+ __pyx_t_3 = __pyx_t_4;
+ __pyx_L8_bool_binop_done:;
+ if (__pyx_t_3) {
- /* "trunk/gensim/models/doc2vec_inner.pyx":268
- * f += syn0[row1 + a] * syn1neg[row2 + a]
+ /* "trunk/gensim/models/doc2vec_inner.pyx":258
+ * f = our_dot(&layer1_size, neu1, &ONE, &syn1neg[row2], &ONE)
* if f <= -MAX_EXP or f >= MAX_EXP:
* continue # <<<<<<<<<<<<<<
* f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
* g = (label - f) * alpha
*/
- goto __pyx_L5_continue;
+ goto __pyx_L3_continue;
}
- /* "trunk/gensim/models/doc2vec_inner.pyx":269
+ /* "trunk/gensim/models/doc2vec_inner.pyx":259
* if f <= -MAX_EXP or f >= MAX_EXP:
* continue
* f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))] # <<<<<<<<<<<<<<
* g = (label - f) * alpha
- * for a in range(size):
+ * our_saxpy(&layer1_size, &g, &syn1neg[row2], &ONE, work, &ONE)
*/
__pyx_v_f = (__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_EXP_TABLE[((int)((__pyx_v_f + 6.0) * 83.0))]);
- /* "trunk/gensim/models/doc2vec_inner.pyx":270
+ /* "trunk/gensim/models/doc2vec_inner.pyx":260
* continue
* f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
* g = (label - f) * alpha # <<<<<<<<<<<<<<
- * for a in range(size):
- * work[a] += g * syn1neg[row2 + a]
+ * our_saxpy(&layer1_size, &g, &syn1neg[row2], &ONE, work, &ONE)
+ * if learn_hidden:
*/
__pyx_v_g = ((__pyx_v_label - __pyx_v_f) * __pyx_v_alpha);
- /* "trunk/gensim/models/doc2vec_inner.pyx":271
+ /* "trunk/gensim/models/doc2vec_inner.pyx":261
* f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
* g = (label - f) * alpha
- * for a in range(size): # <<<<<<<<<<<<<<
- * work[a] += g * syn1neg[row2 + a]
- * if tw:
+ * our_saxpy(&layer1_size, &g, &syn1neg[row2], &ONE, work, &ONE) # <<<<<<<<<<<<<<
+ * if learn_hidden:
+ * our_saxpy(&layer1_size, &g, neu1, &ONE, &syn1neg[row2], &ONE)
*/
- __pyx_t_5 = __pyx_v_size;
- for (__pyx_t_2 = 0; __pyx_t_2 < __pyx_t_5; __pyx_t_2+=1) {
- __pyx_v_a = __pyx_t_2;
+ __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_our_saxpy((&__pyx_v_layer1_size), (&__pyx_v_g), (&(__pyx_v_syn1neg[__pyx_v_row2])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), __pyx_v_work, (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
- /* "trunk/gensim/models/doc2vec_inner.pyx":272
+ /* "trunk/gensim/models/doc2vec_inner.pyx":262
* g = (label - f) * alpha
- * for a in range(size):
- * work[a] += g * syn1neg[row2 + a] # <<<<<<<<<<<<<<
- * if tw:
- * for a in range(size):
+ * our_saxpy(&layer1_size, &g, &syn1neg[row2], &ONE, work, &ONE)
+ * if learn_hidden: # <<<<<<<<<<<<<<
+ * our_saxpy(&layer1_size, &g, neu1, &ONE, &syn1neg[row2], &ONE)
+ *
*/
- __pyx_t_7 = __pyx_v_a;
- (__pyx_v_work[__pyx_t_7]) = ((__pyx_v_work[__pyx_t_7]) + (__pyx_v_g * (__pyx_v_syn1neg[(__pyx_v_row2 + __pyx_v_a)])));
- }
+ __pyx_t_3 = (__pyx_v_learn_hidden != 0);
+ if (__pyx_t_3) {
- /* "trunk/gensim/models/doc2vec_inner.pyx":273
- * for a in range(size):
- * work[a] += g * syn1neg[row2 + a]
- * if tw: # <<<<<<<<<<<<<<
- * for a in range(size):
- * syn1neg[row2 + a] += g * syn0[row1 + a]
+ /* "trunk/gensim/models/doc2vec_inner.pyx":263
+ * our_saxpy(&layer1_size, &g, &syn1neg[row2], &ONE, work, &ONE)
+ * if learn_hidden:
+ * our_saxpy(&layer1_size, &g, neu1, &ONE, &syn1neg[row2], &ONE) # <<<<<<<<<<<<<<
+ *
+ * return next_random
*/
- __pyx_t_4 = (__pyx_v_tw != 0);
- if (__pyx_t_4) {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":274
- * work[a] += g * syn1neg[row2 + a]
- * if tw:
- * for a in range(size): # <<<<<<<<<<<<<<
- * syn1neg[row2 + a] += g * syn0[row1 + a]
- * if tl:
- */
- __pyx_t_5 = __pyx_v_size;
- for (__pyx_t_2 = 0; __pyx_t_2 < __pyx_t_5; __pyx_t_2+=1) {
- __pyx_v_a = __pyx_t_2;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":275
- * if tw:
- * for a in range(size):
- * syn1neg[row2 + a] += g * syn0[row1 + a] # <<<<<<<<<<<<<<
- * if tl:
- * for a in range(size):
- */
- __pyx_t_7 = (__pyx_v_row2 + __pyx_v_a);
- (__pyx_v_syn1neg[__pyx_t_7]) = ((__pyx_v_syn1neg[__pyx_t_7]) + (__pyx_v_g * (__pyx_v_syn0[(__pyx_v_row1 + __pyx_v_a)])));
- }
- goto __pyx_L16;
+ __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_our_saxpy((&__pyx_v_layer1_size), (&__pyx_v_g), __pyx_v_neu1, (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), (&(__pyx_v_syn1neg[__pyx_v_row2])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
+ goto __pyx_L10;
}
- __pyx_L16:;
- __pyx_L5_continue:;
+ __pyx_L10:;
+ __pyx_L3_continue:;
}
- /* "trunk/gensim/models/doc2vec_inner.pyx":276
- * for a in range(size):
- * syn1neg[row2 + a] += g * syn0[row1 + a]
- * if tl: # <<<<<<<<<<<<<<
- * for a in range(size):
- * syn0[row1 + a] += work[a]
- */
- __pyx_t_4 = (__pyx_v_tl != 0);
- if (__pyx_t_4) {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":277
- * syn1neg[row2 + a] += g * syn0[row1 + a]
- * if tl:
- * for a in range(size): # <<<<<<<<<<<<<<
- * syn0[row1 + a] += work[a]
- *
- */
- __pyx_t_1 = __pyx_v_size;
- for (__pyx_t_2 = 0; __pyx_t_2 < __pyx_t_1; __pyx_t_2+=1) {
- __pyx_v_a = __pyx_t_2;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":278
- * if tl:
- * for a in range(size):
- * syn0[row1 + a] += work[a] # <<<<<<<<<<<<<<
- *
- * return next_random
- */
- __pyx_t_7 = (__pyx_v_row1 + __pyx_v_a);
- (__pyx_v_syn0[__pyx_t_7]) = ((__pyx_v_syn0[__pyx_t_7]) + (__pyx_v_work[__pyx_v_a]));
- }
- goto __pyx_L19;
- }
- __pyx_L19:;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":280
- * syn0[row1 + a] += work[a]
+ /* "trunk/gensim/models/doc2vec_inner.pyx":265
+ * our_saxpy(&layer1_size, &g, neu1, &ONE, &syn1neg[row2], &ONE)
*
* return next_random # <<<<<<<<<<<<<<
*
- * cdef void fast_sentence0_dm_hs(
+ *
*/
__pyx_r = __pyx_v_next_random;
goto __pyx_L0;
- /* "trunk/gensim/models/doc2vec_inner.pyx":235
- * return next_random
+ /* "trunk/gensim/models/doc2vec_inner.pyx":230
*
- * cdef unsigned long long fast_sentence2_dbow_neg( # <<<<<<<<<<<<<<
- * const int negative, np.uint32_t *table, unsigned long long table_len,
- * REAL_t *syn0, REAL_t *syn1neg, const int size, const np.uint32_t word_index,
+ *
+ * cdef unsigned long long fast_document_dmc_neg( # <<<<<<<<<<<<<<
+ * const int negative, np.uint32_t *table, unsigned long long table_len, unsigned long long next_random,
+ * REAL_t *neu1, REAL_t *syn1neg, const int predict_word_index, const REAL_t alpha, REAL_t *work,
*/
/* function exit code */
@@ -2930,3301 +2830,1438 @@ static unsigned PY_LONG_LONG __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_fast
return __pyx_r;
}
-/* "trunk/gensim/models/doc2vec_inner.pyx":282
- * return next_random
+/* "trunk/gensim/models/doc2vec_inner.pyx":268
*
- * cdef void fast_sentence0_dm_hs( # <<<<<<<<<<<<<<
- * const np.uint32_t *word_point, const np.uint8_t *word_code, int codelens[MAX_SENTENCE_LEN],
- * int lbl_codelens[MAX_SENTENCE_LEN], REAL_t *neu1, REAL_t *syn0, REAL_t *syn1, const int size,
- */
-
-static void __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_fast_sentence0_dm_hs(__pyx_t_5numpy_uint32_t const *__pyx_v_word_point, __pyx_t_5numpy_uint8_t const *__pyx_v_word_code, int *__pyx_v_codelens, int *__pyx_v_lbl_codelens, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_neu1, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_syn0, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_syn1, int const __pyx_v_size, __pyx_t_5numpy_uint32_t const *__pyx_v_indexes, __pyx_t_5numpy_uint32_t const *__pyx_v_lbl_indexes, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t const __pyx_v_alpha, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_work, int __pyx_v_i, int __pyx_v_j, int __pyx_v_k, int __pyx_v_cbow_mean, int __pyx_v_lbl_length, int __pyx_v_tw, int __pyx_v_tl) {
- PY_LONG_LONG __pyx_v_b;
- PY_LONG_LONG __pyx_v_row2;
- __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t __pyx_v_f;
- __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t __pyx_v_g;
- __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t __pyx_v_count;
- __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t __pyx_v_inv_count;
- int __pyx_v_m;
- int __pyx_t_1;
- int __pyx_t_2;
- int __pyx_t_3;
- int __pyx_t_4;
- PY_LONG_LONG __pyx_t_5;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":293
- * cdef int m
*
- * memset(neu1, 0, size * cython.sizeof(REAL_t)) # <<<<<<<<<<<<<<
- * count = 0.0
- * for m in range(j, k):
+ * def train_document_dbow(model, word_vocabs, doctag_indexes, alpha, work=None, # <<<<<<<<<<<<<<
+ * train_words=False, learn_doctags=True, learn_words=True, learn_hidden=True,
+ * word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None):
*/
- memset(__pyx_v_neu1, 0, (__pyx_v_size * (sizeof(__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t))));
- /* "trunk/gensim/models/doc2vec_inner.pyx":294
- *
- * memset(neu1, 0, size * cython.sizeof(REAL_t))
- * count = 0.0 # <<<<<<<<<<<<<<
- * for m in range(j, k):
- * if m == i or codelens[m] == 0:
- */
- __pyx_v_count = ((__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t)0.0);
+/* Python wrapper */
+static PyObject *__pyx_pw_5trunk_6gensim_6models_13doc2vec_inner_1train_document_dbow(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/
+static PyMethodDef __pyx_mdef_5trunk_6gensim_6models_13doc2vec_inner_1train_document_dbow = {"train_document_dbow", (PyCFunction)__pyx_pw_5trunk_6gensim_6models_13doc2vec_inner_1train_document_dbow, METH_VARARGS|METH_KEYWORDS, 0};
+static PyObject *__pyx_pw_5trunk_6gensim_6models_13doc2vec_inner_1train_document_dbow(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) {
+ PyObject *__pyx_v_model = 0;
+ PyObject *__pyx_v_word_vocabs = 0;
+ PyObject *__pyx_v_doctag_indexes = 0;
+ PyObject *__pyx_v_alpha = 0;
+ PyObject *__pyx_v_work = 0;
+ PyObject *__pyx_v_train_words = 0;
+ PyObject *__pyx_v_learn_doctags = 0;
+ PyObject *__pyx_v_learn_words = 0;
+ PyObject *__pyx_v_learn_hidden = 0;
+ PyObject *__pyx_v_word_vectors = 0;
+ PyObject *__pyx_v_word_locks = 0;
+ PyObject *__pyx_v_doctag_vectors = 0;
+ PyObject *__pyx_v_doctag_locks = 0;
+ int __pyx_lineno = 0;
+ const char *__pyx_filename = NULL;
+ int __pyx_clineno = 0;
+ PyObject *__pyx_r = 0;
+ __Pyx_RefNannyDeclarations
+ __Pyx_RefNannySetupContext("train_document_dbow (wrapper)", 0);
+ {
+ static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_model,&__pyx_n_s_word_vocabs,&__pyx_n_s_doctag_indexes,&__pyx_n_s_alpha,&__pyx_n_s_work,&__pyx_n_s_train_words,&__pyx_n_s_learn_doctags,&__pyx_n_s_learn_words,&__pyx_n_s_learn_hidden,&__pyx_n_s_word_vectors,&__pyx_n_s_word_locks,&__pyx_n_s_doctag_vectors,&__pyx_n_s_doctag_locks,0};
+ PyObject* values[13] = {0,0,0,0,0,0,0,0,0,0,0,0,0};
+ values[4] = ((PyObject *)Py_None);
- /* "trunk/gensim/models/doc2vec_inner.pyx":295
- * memset(neu1, 0, size * cython.sizeof(REAL_t))
- * count = 0.0
- * for m in range(j, k): # <<<<<<<<<<<<<<
- * if m == i or codelens[m] == 0:
- * continue
+ /* "trunk/gensim/models/doc2vec_inner.pyx":269
+ *
+ * def train_document_dbow(model, word_vocabs, doctag_indexes, alpha, work=None,
+ * train_words=False, learn_doctags=True, learn_words=True, learn_hidden=True, # <<<<<<<<<<<<<<
+ * word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None):
+ * cdef int hs = model.hs
*/
- __pyx_t_1 = __pyx_v_k;
- for (__pyx_t_2 = __pyx_v_j; __pyx_t_2 < __pyx_t_1; __pyx_t_2+=1) {
- __pyx_v_m = __pyx_t_2;
+ values[5] = ((PyObject *)Py_False);
+ values[6] = ((PyObject *)Py_True);
+ values[7] = ((PyObject *)Py_True);
+ values[8] = ((PyObject *)Py_True);
- /* "trunk/gensim/models/doc2vec_inner.pyx":296
- * count = 0.0
- * for m in range(j, k):
- * if m == i or codelens[m] == 0: # <<<<<<<<<<<<<<
- * continue
- * else:
+ /* "trunk/gensim/models/doc2vec_inner.pyx":270
+ * def train_document_dbow(model, word_vocabs, doctag_indexes, alpha, work=None,
+ * train_words=False, learn_doctags=True, learn_words=True, learn_hidden=True,
+ * word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None): # <<<<<<<<<<<<<<
+ * cdef int hs = model.hs
+ * cdef int negative = model.negative
*/
- __pyx_t_4 = ((__pyx_v_m == __pyx_v_i) != 0);
- if (!__pyx_t_4) {
+ values[9] = ((PyObject *)Py_None);
+ values[10] = ((PyObject *)Py_None);
+ values[11] = ((PyObject *)Py_None);
+ values[12] = ((PyObject *)Py_None);
+ if (unlikely(__pyx_kwds)) {
+ Py_ssize_t kw_args;
+ const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args);
+ switch (pos_args) {
+ case 13: values[12] = PyTuple_GET_ITEM(__pyx_args, 12);
+ case 12: values[11] = PyTuple_GET_ITEM(__pyx_args, 11);
+ case 11: values[10] = PyTuple_GET_ITEM(__pyx_args, 10);
+ case 10: values[9] = PyTuple_GET_ITEM(__pyx_args, 9);
+ case 9: values[8] = PyTuple_GET_ITEM(__pyx_args, 8);
+ case 8: values[7] = PyTuple_GET_ITEM(__pyx_args, 7);
+ case 7: values[6] = PyTuple_GET_ITEM(__pyx_args, 6);
+ case 6: values[5] = PyTuple_GET_ITEM(__pyx_args, 5);
+ case 5: values[4] = PyTuple_GET_ITEM(__pyx_args, 4);
+ case 4: values[3] = PyTuple_GET_ITEM(__pyx_args, 3);
+ case 3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2);
+ case 2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1);
+ case 1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0);
+ case 0: break;
+ default: goto __pyx_L5_argtuple_error;
+ }
+ kw_args = PyDict_Size(__pyx_kwds);
+ switch (pos_args) {
+ case 0:
+ if (likely((values[0] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_model)) != 0)) kw_args--;
+ else goto __pyx_L5_argtuple_error;
+ case 1:
+ if (likely((values[1] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_word_vocabs)) != 0)) kw_args--;
+ else {
+ __Pyx_RaiseArgtupleInvalid("train_document_dbow", 0, 4, 13, 1); {__pyx_filename = __pyx_f[0]; __pyx_lineno = 268; __pyx_clineno = __LINE__; goto __pyx_L3_error;}
+ }
+ case 2:
+ if (likely((values[2] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_doctag_indexes)) != 0)) kw_args--;
+ else {
+ __Pyx_RaiseArgtupleInvalid("train_document_dbow", 0, 4, 13, 2); {__pyx_filename = __pyx_f[0]; __pyx_lineno = 268; __pyx_clineno = __LINE__; goto __pyx_L3_error;}
+ }
+ case 3:
+ if (likely((values[3] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_alpha)) != 0)) kw_args--;
+ else {
+ __Pyx_RaiseArgtupleInvalid("train_document_dbow", 0, 4, 13, 3); {__pyx_filename = __pyx_f[0]; __pyx_lineno = 268; __pyx_clineno = __LINE__; goto __pyx_L3_error;}
+ }
+ case 4:
+ if (kw_args > 0) {
+ PyObject* value = PyDict_GetItem(__pyx_kwds, __pyx_n_s_work);
+ if (value) { values[4] = value; kw_args--; }
+ }
+ case 5:
+ if (kw_args > 0) {
+ PyObject* value = PyDict_GetItem(__pyx_kwds, __pyx_n_s_train_words);
+ if (value) { values[5] = value; kw_args--; }
+ }
+ case 6:
+ if (kw_args > 0) {
+ PyObject* value = PyDict_GetItem(__pyx_kwds, __pyx_n_s_learn_doctags);
+ if (value) { values[6] = value; kw_args--; }
+ }
+ case 7:
+ if (kw_args > 0) {
+ PyObject* value = PyDict_GetItem(__pyx_kwds, __pyx_n_s_learn_words);
+ if (value) { values[7] = value; kw_args--; }
+ }
+ case 8:
+ if (kw_args > 0) {
+ PyObject* value = PyDict_GetItem(__pyx_kwds, __pyx_n_s_learn_hidden);
+ if (value) { values[8] = value; kw_args--; }
+ }
+ case 9:
+ if (kw_args > 0) {
+ PyObject* value = PyDict_GetItem(__pyx_kwds, __pyx_n_s_word_vectors);
+ if (value) { values[9] = value; kw_args--; }
+ }
+ case 10:
+ if (kw_args > 0) {
+ PyObject* value = PyDict_GetItem(__pyx_kwds, __pyx_n_s_word_locks);
+ if (value) { values[10] = value; kw_args--; }
+ }
+ case 11:
+ if (kw_args > 0) {
+ PyObject* value = PyDict_GetItem(__pyx_kwds, __pyx_n_s_doctag_vectors);
+ if (value) { values[11] = value; kw_args--; }
+ }
+ case 12:
+ if (kw_args > 0) {
+ PyObject* value = PyDict_GetItem(__pyx_kwds, __pyx_n_s_doctag_locks);
+ if (value) { values[12] = value; kw_args--; }
+ }
+ }
+ if (unlikely(kw_args > 0)) {
+ if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "train_document_dbow") < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 268; __pyx_clineno = __LINE__; goto __pyx_L3_error;}
+ }
} else {
- __pyx_t_3 = __pyx_t_4;
- goto __pyx_L6_bool_binop_done;
+ switch (PyTuple_GET_SIZE(__pyx_args)) {
+ case 13: values[12] = PyTuple_GET_ITEM(__pyx_args, 12);
+ case 12: values[11] = PyTuple_GET_ITEM(__pyx_args, 11);
+ case 11: values[10] = PyTuple_GET_ITEM(__pyx_args, 10);
+ case 10: values[9] = PyTuple_GET_ITEM(__pyx_args, 9);
+ case 9: values[8] = PyTuple_GET_ITEM(__pyx_args, 8);
+ case 8: values[7] = PyTuple_GET_ITEM(__pyx_args, 7);
+ case 7: values[6] = PyTuple_GET_ITEM(__pyx_args, 6);
+ case 6: values[5] = PyTuple_GET_ITEM(__pyx_args, 5);
+ case 5: values[4] = PyTuple_GET_ITEM(__pyx_args, 4);
+ case 4: values[3] = PyTuple_GET_ITEM(__pyx_args, 3);
+ values[2] = PyTuple_GET_ITEM(__pyx_args, 2);
+ values[1] = PyTuple_GET_ITEM(__pyx_args, 1);
+ values[0] = PyTuple_GET_ITEM(__pyx_args, 0);
+ break;
+ default: goto __pyx_L5_argtuple_error;
+ }
}
- __pyx_t_4 = (((__pyx_v_codelens[__pyx_v_m]) == 0) != 0);
- __pyx_t_3 = __pyx_t_4;
- __pyx_L6_bool_binop_done:;
- if (__pyx_t_3) {
+ __pyx_v_model = values[0];
+ __pyx_v_word_vocabs = values[1];
+ __pyx_v_doctag_indexes = values[2];
+ __pyx_v_alpha = values[3];
+ __pyx_v_work = values[4];
+ __pyx_v_train_words = values[5];
+ __pyx_v_learn_doctags = values[6];
+ __pyx_v_learn_words = values[7];
+ __pyx_v_learn_hidden = values[8];
+ __pyx_v_word_vectors = values[9];
+ __pyx_v_word_locks = values[10];
+ __pyx_v_doctag_vectors = values[11];
+ __pyx_v_doctag_locks = values[12];
+ }
+ goto __pyx_L4_argument_unpacking_done;
+ __pyx_L5_argtuple_error:;
+ __Pyx_RaiseArgtupleInvalid("train_document_dbow", 0, 4, 13, PyTuple_GET_SIZE(__pyx_args)); {__pyx_filename = __pyx_f[0]; __pyx_lineno = 268; __pyx_clineno = __LINE__; goto __pyx_L3_error;}
+ __pyx_L3_error:;
+ __Pyx_AddTraceback("trunk.gensim.models.doc2vec_inner.train_document_dbow", __pyx_clineno, __pyx_lineno, __pyx_filename);
+ __Pyx_RefNannyFinishContext();
+ return NULL;
+ __pyx_L4_argument_unpacking_done:;
+ __pyx_r = __pyx_pf_5trunk_6gensim_6models_13doc2vec_inner_train_document_dbow(__pyx_self, __pyx_v_model, __pyx_v_word_vocabs, __pyx_v_doctag_indexes, __pyx_v_alpha, __pyx_v_work, __pyx_v_train_words, __pyx_v_learn_doctags, __pyx_v_learn_words, __pyx_v_learn_hidden, __pyx_v_word_vectors, __pyx_v_word_locks, __pyx_v_doctag_vectors, __pyx_v_doctag_locks);
- /* "trunk/gensim/models/doc2vec_inner.pyx":297
- * for m in range(j, k):
- * if m == i or codelens[m] == 0:
- * continue # <<<<<<<<<<<<<<
- * else:
- * count += ONEF
+ /* "trunk/gensim/models/doc2vec_inner.pyx":268
+ *
+ *
+ * def train_document_dbow(model, word_vocabs, doctag_indexes, alpha, work=None, # <<<<<<<<<<<<<<
+ * train_words=False, learn_doctags=True, learn_words=True, learn_hidden=True,
+ * word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None):
*/
- goto __pyx_L3_continue;
- }
- /*else*/ {
- /* "trunk/gensim/models/doc2vec_inner.pyx":299
- * continue
- * else:
- * count += ONEF # <<<<<<<<<<<<<<
- * saxpy(&size, &ONEF, &syn0[indexes[m] * size], &ONE, neu1, &ONE)
- * for m in range(lbl_length):
- */
- __pyx_v_count = (__pyx_v_count + __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONEF);
+ /* function exit code */
+ __Pyx_RefNannyFinishContext();
+ return __pyx_r;
+}
- /* "trunk/gensim/models/doc2vec_inner.pyx":300
- * else:
- * count += ONEF
- * saxpy(&size, &ONEF, &syn0[indexes[m] * size], &ONE, neu1, &ONE) # <<<<<<<<<<<<<<
- * for m in range(lbl_length):
- * if lbl_codelens[m] == 0:
+static PyObject *__pyx_pf_5trunk_6gensim_6models_13doc2vec_inner_train_document_dbow(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_model, PyObject *__pyx_v_word_vocabs, PyObject *__pyx_v_doctag_indexes, PyObject *__pyx_v_alpha, PyObject *__pyx_v_work, PyObject *__pyx_v_train_words, PyObject *__pyx_v_learn_doctags, PyObject *__pyx_v_learn_words, PyObject *__pyx_v_learn_hidden, PyObject *__pyx_v_word_vectors, PyObject *__pyx_v_word_locks, PyObject *__pyx_v_doctag_vectors, PyObject *__pyx_v_doctag_locks) {
+ int __pyx_v_hs;
+ int __pyx_v_negative;
+ int __pyx_v__train_words;
+ int __pyx_v__learn_words;
+ int __pyx_v__learn_hidden;
+ int __pyx_v__learn_doctags;
+ __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v__word_vectors;
+ __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v__doctag_vectors;
+ __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v__word_locks;
+ __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v__doctag_locks;
+ __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v__work;
+ __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t __pyx_v__alpha;
+ int __pyx_v_size;
+ int __pyx_v_codelens[10000];
+ __pyx_t_5numpy_uint32_t __pyx_v_indexes[10000];
+ __pyx_t_5numpy_uint32_t __pyx_v__doctag_indexes[10000];
+ __pyx_t_5numpy_uint32_t __pyx_v_reduced_windows[10000];
+ int __pyx_v_document_len;
+ int __pyx_v_doctag_len;
+ int __pyx_v_window;
+ int __pyx_v_i;
+ int __pyx_v_j;
+ long __pyx_v_result;
+ __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_syn1;
+ __pyx_t_5numpy_uint32_t *__pyx_v_points[10000];
+ __pyx_t_5numpy_uint8_t *__pyx_v_codes[10000];
+ __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_syn1neg;
+ __pyx_t_5numpy_uint32_t *__pyx_v_table;
+ unsigned PY_LONG_LONG __pyx_v_table_len;
+ unsigned PY_LONG_LONG __pyx_v_next_random;
+ PyObject *__pyx_v_predict_word = NULL;
+ PyObject *__pyx_v_item = NULL;
+ long __pyx_v_k;
+ PyObject *__pyx_r = NULL;
+ __Pyx_RefNannyDeclarations
+ PyObject *__pyx_t_1 = NULL;
+ int __pyx_t_2;
+ __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t __pyx_t_3;
+ int __pyx_t_4;
+ int __pyx_t_5;
+ PyObject *__pyx_t_6 = NULL;
+ Py_ssize_t __pyx_t_7;
+ PyObject *__pyx_t_8 = NULL;
+ unsigned PY_LONG_LONG __pyx_t_9;
+ PyObject *__pyx_t_10 = NULL;
+ long __pyx_t_11;
+ Py_ssize_t __pyx_t_12;
+ int __pyx_t_13;
+ __pyx_t_5numpy_uint32_t __pyx_t_14;
+ PyObject *__pyx_t_15 = NULL;
+ PyObject *__pyx_t_16 = NULL;
+ PyObject *(*__pyx_t_17)(PyObject *);
+ int __pyx_t_18;
+ int __pyx_t_19;
+ int __pyx_lineno = 0;
+ const char *__pyx_filename = NULL;
+ int __pyx_clineno = 0;
+ __Pyx_RefNannySetupContext("train_document_dbow", 0);
+ __Pyx_INCREF(__pyx_v_work);
+ __Pyx_INCREF(__pyx_v_word_vectors);
+ __Pyx_INCREF(__pyx_v_word_locks);
+ __Pyx_INCREF(__pyx_v_doctag_vectors);
+ __Pyx_INCREF(__pyx_v_doctag_locks);
+
+ /* "trunk/gensim/models/doc2vec_inner.pyx":271
+ * train_words=False, learn_doctags=True, learn_words=True, learn_hidden=True,
+ * word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None):
+ * cdef int hs = model.hs # <<<<<<<<<<<<<<
+ * cdef int negative = model.negative
+ * cdef int _train_words = train_words
*/
- __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_saxpy((&__pyx_v_size), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONEF), (&(__pyx_v_syn0[((__pyx_v_indexes[__pyx_v_m]) * __pyx_v_size)])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), __pyx_v_neu1, (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
- }
- __pyx_L3_continue:;
- }
+ __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_hs); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 271; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+ __Pyx_GOTREF(__pyx_t_1);
+ __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_t_1); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 271; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+ __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+ __pyx_v_hs = __pyx_t_2;
- /* "trunk/gensim/models/doc2vec_inner.pyx":301
- * count += ONEF
- * saxpy(&size, &ONEF, &syn0[indexes[m] * size], &ONE, neu1, &ONE)
- * for m in range(lbl_length): # <<<<<<<<<<<<<<
- * if lbl_codelens[m] == 0:
- * continue
+ /* "trunk/gensim/models/doc2vec_inner.pyx":272
+ * word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None):
+ * cdef int hs = model.hs
+ * cdef int negative = model.negative # <<<<<<<<<<<<<<
+ * cdef int _train_words = train_words
+ * cdef int _learn_words = learn_words
*/
- __pyx_t_1 = __pyx_v_lbl_length;
- for (__pyx_t_2 = 0; __pyx_t_2 < __pyx_t_1; __pyx_t_2+=1) {
- __pyx_v_m = __pyx_t_2;
+ __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_negative); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 272; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+ __Pyx_GOTREF(__pyx_t_1);
+ __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_t_1); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 272; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+ __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+ __pyx_v_negative = __pyx_t_2;
- /* "trunk/gensim/models/doc2vec_inner.pyx":302
- * saxpy(&size, &ONEF, &syn0[indexes[m] * size], &ONE, neu1, &ONE)
- * for m in range(lbl_length):
- * if lbl_codelens[m] == 0: # <<<<<<<<<<<<<<
- * continue
- * else:
+ /* "trunk/gensim/models/doc2vec_inner.pyx":273
+ * cdef int hs = model.hs
+ * cdef int negative = model.negative
+ * cdef int _train_words = train_words # <<<<<<<<<<<<<<
+ * cdef int _learn_words = learn_words
+ * cdef int _learn_hidden = learn_hidden
*/
- __pyx_t_3 = (((__pyx_v_lbl_codelens[__pyx_v_m]) == 0) != 0);
- if (__pyx_t_3) {
+ __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_v_train_words); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 273; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+ __pyx_v__train_words = __pyx_t_2;
- /* "trunk/gensim/models/doc2vec_inner.pyx":303
- * for m in range(lbl_length):
- * if lbl_codelens[m] == 0:
- * continue # <<<<<<<<<<<<<<
- * else:
- * count += ONEF
+ /* "trunk/gensim/models/doc2vec_inner.pyx":274
+ * cdef int negative = model.negative
+ * cdef int _train_words = train_words
+ * cdef int _learn_words = learn_words # <<<<<<<<<<<<<<
+ * cdef int _learn_hidden = learn_hidden
+ * cdef int _learn_doctags = learn_doctags
*/
- goto __pyx_L8_continue;
- }
- /*else*/ {
+ __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_v_learn_words); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 274; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+ __pyx_v__learn_words = __pyx_t_2;
- /* "trunk/gensim/models/doc2vec_inner.pyx":305
- * continue
- * else:
- * count += ONEF # <<<<<<<<<<<<<<
- * saxpy(&size, &ONEF, &syn0[lbl_indexes[m] * size], &ONE, neu1, &ONE)
+ /* "trunk/gensim/models/doc2vec_inner.pyx":275
+ * cdef int _train_words = train_words
+ * cdef int _learn_words = learn_words
+ * cdef int _learn_hidden = learn_hidden # <<<<<<<<<<<<<<
+ * cdef int _learn_doctags = learn_doctags
*
*/
- __pyx_v_count = (__pyx_v_count + __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONEF);
+ __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_v_learn_hidden); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 275; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+ __pyx_v__learn_hidden = __pyx_t_2;
- /* "trunk/gensim/models/doc2vec_inner.pyx":306
- * else:
- * count += ONEF
- * saxpy(&size, &ONEF, &syn0[lbl_indexes[m] * size], &ONE, neu1, &ONE) # <<<<<<<<<<<<<<
+ /* "trunk/gensim/models/doc2vec_inner.pyx":276
+ * cdef int _learn_words = learn_words
+ * cdef int _learn_hidden = learn_hidden
+ * cdef int _learn_doctags = learn_doctags # <<<<<<<<<<<<<<
*
- * if cbow_mean and count > (0.5):
+ * cdef REAL_t *_word_vectors
*/
- __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_saxpy((&__pyx_v_size), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONEF), (&(__pyx_v_syn0[((__pyx_v_lbl_indexes[__pyx_v_m]) * __pyx_v_size)])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), __pyx_v_neu1, (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
- }
- __pyx_L8_continue:;
- }
+ __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_v_learn_doctags); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 276; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+ __pyx_v__learn_doctags = __pyx_t_2;
- /* "trunk/gensim/models/doc2vec_inner.pyx":308
- * saxpy(&size, &ONEF, &syn0[lbl_indexes[m] * size], &ONE, neu1, &ONE)
+ /* "trunk/gensim/models/doc2vec_inner.pyx":283
+ * cdef REAL_t *_doctag_locks
+ * cdef REAL_t *_work
+ * cdef REAL_t _alpha = alpha # <<<<<<<<<<<<<<
+ * cdef int size = model.layer1_size
*
- * if cbow_mean and count > (0.5): # <<<<<<<<<<<<<<
- * inv_count = ONEF/count
- * sscal(&size, &inv_count, neu1, &ONE)
*/
- __pyx_t_4 = (__pyx_v_cbow_mean != 0);
- if (__pyx_t_4) {
- } else {
- __pyx_t_3 = __pyx_t_4;
- goto __pyx_L12_bool_binop_done;
- }
- __pyx_t_4 = ((__pyx_v_count > ((__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t)0.5)) != 0);
- __pyx_t_3 = __pyx_t_4;
- __pyx_L12_bool_binop_done:;
- if (__pyx_t_3) {
+ __pyx_t_3 = __pyx_PyFloat_AsFloat(__pyx_v_alpha); if (unlikely((__pyx_t_3 == (npy_float32)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 283; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+ __pyx_v__alpha = __pyx_t_3;
- /* "trunk/gensim/models/doc2vec_inner.pyx":309
- *
- * if cbow_mean and count > (0.5):
- * inv_count = ONEF/count # <<<<<<<<<<<<<<
- * sscal(&size, &inv_count, neu1, &ONE)
+ /* "trunk/gensim/models/doc2vec_inner.pyx":284
+ * cdef REAL_t *_work
+ * cdef REAL_t _alpha = alpha
+ * cdef int size = model.layer1_size # <<<<<<<<<<<<<<
*
+ * cdef int codelens[MAX_DOCUMENT_LEN]
*/
- __pyx_v_inv_count = (__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONEF / __pyx_v_count);
+ __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_layer1_size); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 284; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+ __Pyx_GOTREF(__pyx_t_1);
+ __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_t_1); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 284; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+ __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+ __pyx_v_size = __pyx_t_2;
- /* "trunk/gensim/models/doc2vec_inner.pyx":310
- * if cbow_mean and count > (0.5):
- * inv_count = ONEF/count
- * sscal(&size, &inv_count, neu1, &ONE) # <<<<<<<<<<<<<<
+ /* "trunk/gensim/models/doc2vec_inner.pyx":292
+ * cdef int document_len
+ * cdef int doctag_len
+ * cdef int window = model.window # <<<<<<<<<<<<<<
*
- * memset(work, 0, size * cython.sizeof(REAL_t))
+ * cdef int i, j
*/
- __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_sscal((&__pyx_v_size), (&__pyx_v_inv_count), __pyx_v_neu1, (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
- goto __pyx_L11;
- }
- __pyx_L11:;
+ __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_window); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 292; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+ __Pyx_GOTREF(__pyx_t_1);
+ __pyx_t_2 = __Pyx_PyInt_As_int(__pyx_t_1); if (unlikely((__pyx_t_2 == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 292; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+ __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+ __pyx_v_window = __pyx_t_2;
- /* "trunk/gensim/models/doc2vec_inner.pyx":312
- * sscal(&size, &inv_count, neu1, &ONE)
+ /* "trunk/gensim/models/doc2vec_inner.pyx":295
*
- * memset(work, 0, size * cython.sizeof(REAL_t)) # <<<<<<<<<<<<<<
- * for b in range(codelens[i]):
- * row2 = word_point[b] * size
+ * cdef int i, j
+ * cdef long result = 0 # <<<<<<<<<<<<<<
+ *
+ * # For hierarchical softmax
*/
- memset(__pyx_v_work, 0, (__pyx_v_size * (sizeof(__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t))));
+ __pyx_v_result = 0;
- /* "trunk/gensim/models/doc2vec_inner.pyx":313
+ /* "trunk/gensim/models/doc2vec_inner.pyx":309
*
- * memset(work, 0, size * cython.sizeof(REAL_t))
- * for b in range(codelens[i]): # <<<<<<<<<<<<<<
- * row2 = word_point[b] * size
- * f = dsdot(&size, neu1, &ONE, &syn1[row2], &ONE)
+ * # default vectors, locks from syn0/doctag_syn0
+ * if word_vectors is None: # <<<<<<<<<<<<<<
+ * word_vectors = model.syn0
+ * _word_vectors = (np.PyArray_DATA(word_vectors))
*/
- __pyx_t_1 = (__pyx_v_codelens[__pyx_v_i]);
- for (__pyx_t_5 = 0; __pyx_t_5 < __pyx_t_1; __pyx_t_5+=1) {
- __pyx_v_b = __pyx_t_5;
+ __pyx_t_4 = (__pyx_v_word_vectors == Py_None);
+ __pyx_t_5 = (__pyx_t_4 != 0);
+ if (__pyx_t_5) {
- /* "trunk/gensim/models/doc2vec_inner.pyx":314
- * memset(work, 0, size * cython.sizeof(REAL_t))
- * for b in range(codelens[i]):
- * row2 = word_point[b] * size # <<<<<<<<<<<<<<
- * f = dsdot(&size, neu1, &ONE, &syn1[row2], &ONE)
- * if f <= -MAX_EXP or f >= MAX_EXP:
+ /* "trunk/gensim/models/doc2vec_inner.pyx":310
+ * # default vectors, locks from syn0/doctag_syn0
+ * if word_vectors is None:
+ * word_vectors = model.syn0 # <<<<<<<<<<<<<<
+ * _word_vectors = (np.PyArray_DATA(word_vectors))
+ * if doctag_vectors is None:
*/
- __pyx_v_row2 = ((__pyx_v_word_point[__pyx_v_b]) * __pyx_v_size);
+ __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_syn0); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 310; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+ __Pyx_GOTREF(__pyx_t_1);
+ __Pyx_DECREF_SET(__pyx_v_word_vectors, __pyx_t_1);
+ __pyx_t_1 = 0;
+ goto __pyx_L3;
+ }
+ __pyx_L3:;
- /* "trunk/gensim/models/doc2vec_inner.pyx":315
- * for b in range(codelens[i]):
- * row2 = word_point[b] * size
- * f = dsdot(&size, neu1, &ONE, &syn1[row2], &ONE) # <<<<<<<<<<<<<<
- * if f <= -MAX_EXP or f >= MAX_EXP:
- * continue
+ /* "trunk/gensim/models/doc2vec_inner.pyx":311
+ * if word_vectors is None:
+ * word_vectors = model.syn0
+ * _word_vectors = (np.PyArray_DATA(word_vectors)) # <<<<<<<<<<<<<<
+ * if doctag_vectors is None:
+ * doctag_vectors = model.docvecs.doctag_syn0
*/
- __pyx_v_f = ((__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t)__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_dsdot((&__pyx_v_size), __pyx_v_neu1, (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), (&(__pyx_v_syn1[__pyx_v_row2])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE)));
+ if (!(likely(((__pyx_v_word_vectors) == Py_None) || likely(__Pyx_TypeTest(__pyx_v_word_vectors, __pyx_ptype_5numpy_ndarray))))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 311; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+ __pyx_v__word_vectors = ((__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *)PyArray_DATA(((PyArrayObject *)__pyx_v_word_vectors)));
- /* "trunk/gensim/models/doc2vec_inner.pyx":316
- * row2 = word_point[b] * size
- * f = dsdot(&size, neu1, &ONE, &syn1[row2], &ONE)
- * if f <= -MAX_EXP or f >= MAX_EXP: # <<<<<<<<<<<<<<
- * continue
- * f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
- */
- __pyx_t_4 = ((__pyx_v_f <= -6.0) != 0);
- if (!__pyx_t_4) {
- } else {
- __pyx_t_3 = __pyx_t_4;
- goto __pyx_L17_bool_binop_done;
- }
- __pyx_t_4 = ((__pyx_v_f >= 6.0) != 0);
- __pyx_t_3 = __pyx_t_4;
- __pyx_L17_bool_binop_done:;
- if (__pyx_t_3) {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":317
- * f = dsdot(&size, neu1, &ONE, &syn1[row2], &ONE)
- * if f <= -MAX_EXP or f >= MAX_EXP:
- * continue # <<<<<<<<<<<<<<
- * f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
- * g = (1 - word_code[b] - f) * alpha
- */
- goto __pyx_L14_continue;
- }
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":318
- * if f <= -MAX_EXP or f >= MAX_EXP:
- * continue
- * f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))] # <<<<<<<<<<<<<<
- * g = (1 - word_code[b] - f) * alpha
- * saxpy(&size, &g, &syn1[row2], &ONE, work, &ONE)
- */
- __pyx_v_f = (__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_EXP_TABLE[((int)((__pyx_v_f + 6.0) * 83.0))]);
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":319
- * continue
- * f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
- * g = (1 - word_code[b] - f) * alpha # <<<<<<<<<<<<<<
- * saxpy(&size, &g, &syn1[row2], &ONE, work, &ONE)
- * if tw:
- */
- __pyx_v_g = (((1 - (__pyx_v_word_code[__pyx_v_b])) - __pyx_v_f) * __pyx_v_alpha);
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":320
- * f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
- * g = (1 - word_code[b] - f) * alpha
- * saxpy(&size, &g, &syn1[row2], &ONE, work, &ONE) # <<<<<<<<<<<<<<
- * if tw:
- * saxpy(&size, &g, neu1, &ONE, &syn1[row2], &ONE)
- */
- __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_saxpy((&__pyx_v_size), (&__pyx_v_g), (&(__pyx_v_syn1[__pyx_v_row2])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), __pyx_v_work, (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":321
- * g = (1 - word_code[b] - f) * alpha
- * saxpy(&size, &g, &syn1[row2], &ONE, work, &ONE)
- * if tw: # <<<<<<<<<<<<<<
- * saxpy(&size, &g, neu1, &ONE, &syn1[row2], &ONE)
- * if tw:
- */
- __pyx_t_3 = (__pyx_v_tw != 0);
- if (__pyx_t_3) {
+ /* "trunk/gensim/models/doc2vec_inner.pyx":312
+ * word_vectors = model.syn0
+ * _word_vectors = (np.PyArray_DATA(word_vectors))
+ * if doctag_vectors is None: # <<<<<<<<<<<<<<
+ * doctag_vectors = model.docvecs.doctag_syn0
+ * _doctag_vectors = (np.PyArray_DATA(doctag_vectors))
+ */
+ __pyx_t_5 = (__pyx_v_doctag_vectors == Py_None);
+ __pyx_t_4 = (__pyx_t_5 != 0);
+ if (__pyx_t_4) {
- /* "trunk/gensim/models/doc2vec_inner.pyx":322
- * saxpy(&size, &g, &syn1[row2], &ONE, work, &ONE)
- * if tw:
- * saxpy(&size, &g, neu1, &ONE, &syn1[row2], &ONE) # <<<<<<<<<<<<<<
- * if tw:
- * for m in range(j, k):
+ /* "trunk/gensim/models/doc2vec_inner.pyx":313
+ * _word_vectors = (np.PyArray_DATA(word_vectors))
+ * if doctag_vectors is None:
+ * doctag_vectors = model.docvecs.doctag_syn0 # <<<<<<<<<<<<<<
+ * _doctag_vectors = (np.PyArray_DATA(doctag_vectors))
+ * if word_locks is None:
*/
- __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_saxpy((&__pyx_v_size), (&__pyx_v_g), __pyx_v_neu1, (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), (&(__pyx_v_syn1[__pyx_v_row2])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
- goto __pyx_L19;
- }
- __pyx_L19:;
- __pyx_L14_continue:;
+ __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_docvecs); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 313; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+ __Pyx_GOTREF(__pyx_t_1);
+ __pyx_t_6 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_doctag_syn0); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 313; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+ __Pyx_GOTREF(__pyx_t_6);
+ __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+ __Pyx_DECREF_SET(__pyx_v_doctag_vectors, __pyx_t_6);
+ __pyx_t_6 = 0;
+ goto __pyx_L4;
}
+ __pyx_L4:;
- /* "trunk/gensim/models/doc2vec_inner.pyx":323
- * if tw:
- * saxpy(&size, &g, neu1, &ONE, &syn1[row2], &ONE)
- * if tw: # <<<<<<<<<<<<<<
- * for m in range(j, k):
- * if m == i or codelens[m] == 0:
- */
- __pyx_t_3 = (__pyx_v_tw != 0);
- if (__pyx_t_3) {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":324
- * saxpy(&size, &g, neu1, &ONE, &syn1[row2], &ONE)
- * if tw:
- * for m in range(j, k): # <<<<<<<<<<<<<<
- * if m == i or codelens[m] == 0:
- * continue
- */
- __pyx_t_1 = __pyx_v_k;
- for (__pyx_t_2 = __pyx_v_j; __pyx_t_2 < __pyx_t_1; __pyx_t_2+=1) {
- __pyx_v_m = __pyx_t_2;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":325
- * if tw:
- * for m in range(j, k):
- * if m == i or codelens[m] == 0: # <<<<<<<<<<<<<<
- * continue
- * else:
- */
- __pyx_t_4 = ((__pyx_v_m == __pyx_v_i) != 0);
- if (!__pyx_t_4) {
- } else {
- __pyx_t_3 = __pyx_t_4;
- goto __pyx_L24_bool_binop_done;
- }
- __pyx_t_4 = (((__pyx_v_codelens[__pyx_v_m]) == 0) != 0);
- __pyx_t_3 = __pyx_t_4;
- __pyx_L24_bool_binop_done:;
- if (__pyx_t_3) {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":326
- * for m in range(j, k):
- * if m == i or codelens[m] == 0:
- * continue # <<<<<<<<<<<<<<
- * else:
- * saxpy(&size, &ONEF, work, &ONE, &syn0[indexes[m] * size], &ONE)
- */
- goto __pyx_L21_continue;
- }
- /*else*/ {
+ /* "trunk/gensim/models/doc2vec_inner.pyx":314
+ * if doctag_vectors is None:
+ * doctag_vectors = model.docvecs.doctag_syn0
+ * _doctag_vectors = (np.PyArray_DATA(doctag_vectors)) # <<<<<<<<<<<<<<
+ * if word_locks is None:
+ * word_locks = model.syn0_lockf
+ */
+ if (!(likely(((__pyx_v_doctag_vectors) == Py_None) || likely(__Pyx_TypeTest(__pyx_v_doctag_vectors, __pyx_ptype_5numpy_ndarray))))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 314; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+ __pyx_v__doctag_vectors = ((__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *)PyArray_DATA(((PyArrayObject *)__pyx_v_doctag_vectors)));
+
+ /* "trunk/gensim/models/doc2vec_inner.pyx":315
+ * doctag_vectors = model.docvecs.doctag_syn0
+ * _doctag_vectors = (np.PyArray_DATA(doctag_vectors))
+ * if word_locks is None: # <<<<<<<<<<<<<<
+ * word_locks = model.syn0_lockf
+ * _word_locks = (np.PyArray_DATA(word_locks))
+ */
+ __pyx_t_4 = (__pyx_v_word_locks == Py_None);
+ __pyx_t_5 = (__pyx_t_4 != 0);
+ if (__pyx_t_5) {
- /* "trunk/gensim/models/doc2vec_inner.pyx":328
- * continue
- * else:
- * saxpy(&size, &ONEF, work, &ONE, &syn0[indexes[m] * size], &ONE) # <<<<<<<<<<<<<<
- * if tl:
- * for m in range(lbl_length):
+ /* "trunk/gensim/models/doc2vec_inner.pyx":316
+ * _doctag_vectors = (np.PyArray_DATA(doctag_vectors))
+ * if word_locks is None:
+ * word_locks = model.syn0_lockf # <<<<<<<<<<<<<<
+ * _word_locks = (np.PyArray_DATA(word_locks))
+ * if doctag_locks is None:
*/
- __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_saxpy((&__pyx_v_size), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONEF), __pyx_v_work, (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), (&(__pyx_v_syn0[((__pyx_v_indexes[__pyx_v_m]) * __pyx_v_size)])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
- }
- __pyx_L21_continue:;
- }
- goto __pyx_L20;
+ __pyx_t_6 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_syn0_lockf); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 316; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+ __Pyx_GOTREF(__pyx_t_6);
+ __Pyx_DECREF_SET(__pyx_v_word_locks, __pyx_t_6);
+ __pyx_t_6 = 0;
+ goto __pyx_L5;
}
- __pyx_L20:;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":329
- * else:
- * saxpy(&size, &ONEF, work, &ONE, &syn0[indexes[m] * size], &ONE)
- * if tl: # <<<<<<<<<<<<<<
- * for m in range(lbl_length):
- * if lbl_codelens[m] == 0:
- */
- __pyx_t_3 = (__pyx_v_tl != 0);
- if (__pyx_t_3) {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":330
- * saxpy(&size, &ONEF, work, &ONE, &syn0[indexes[m] * size], &ONE)
- * if tl:
- * for m in range(lbl_length): # <<<<<<<<<<<<<<
- * if lbl_codelens[m] == 0:
- * continue
- */
- __pyx_t_1 = __pyx_v_lbl_length;
- for (__pyx_t_2 = 0; __pyx_t_2 < __pyx_t_1; __pyx_t_2+=1) {
- __pyx_v_m = __pyx_t_2;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":331
- * if tl:
- * for m in range(lbl_length):
- * if lbl_codelens[m] == 0: # <<<<<<<<<<<<<<
- * continue
- * else:
- */
- __pyx_t_3 = (((__pyx_v_lbl_codelens[__pyx_v_m]) == 0) != 0);
- if (__pyx_t_3) {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":332
- * for m in range(lbl_length):
- * if lbl_codelens[m] == 0:
- * continue # <<<<<<<<<<<<<<
- * else:
- * saxpy(&size, &ONEF, work, &ONE, &syn0[lbl_indexes[m]*size], &ONE)
- */
- goto __pyx_L27_continue;
- }
- /*else*/ {
+ __pyx_L5:;
+
+ /* "trunk/gensim/models/doc2vec_inner.pyx":317
+ * if word_locks is None:
+ * word_locks = model.syn0_lockf
+ * _word_locks = (np.PyArray_DATA(word_locks)) # <<<<<<<<<<<<<<
+ * if doctag_locks is None:
+ * doctag_locks = model.docvecs.doctag_syn0_lockf
+ */
+ if (!(likely(((__pyx_v_word_locks) == Py_None) || likely(__Pyx_TypeTest(__pyx_v_word_locks, __pyx_ptype_5numpy_ndarray))))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 317; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+ __pyx_v__word_locks = ((__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *)PyArray_DATA(((PyArrayObject *)__pyx_v_word_locks)));
+
+ /* "trunk/gensim/models/doc2vec_inner.pyx":318
+ * word_locks = model.syn0_lockf
+ * _word_locks = (np.PyArray_DATA(word_locks))
+ * if doctag_locks is None: # <<<<<<<<<<<<<<
+ * doctag_locks = model.docvecs.doctag_syn0_lockf
+ * _doctag_locks = (np.PyArray_DATA(doctag_locks))
+ */
+ __pyx_t_5 = (__pyx_v_doctag_locks == Py_None);
+ __pyx_t_4 = (__pyx_t_5 != 0);
+ if (__pyx_t_4) {
- /* "trunk/gensim/models/doc2vec_inner.pyx":334
- * continue
- * else:
- * saxpy(&size, &ONEF, work, &ONE, &syn0[lbl_indexes[m]*size], &ONE) # <<<<<<<<<<<<<<
+ /* "trunk/gensim/models/doc2vec_inner.pyx":319
+ * _word_locks = (np.PyArray_DATA(word_locks))
+ * if doctag_locks is None:
+ * doctag_locks = model.docvecs.doctag_syn0_lockf # <<<<<<<<<<<<<<
+ * _doctag_locks = (np.PyArray_DATA(doctag_locks))
*
- * cdef void fast_sentence1_dm_hs(
*/
- __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_saxpy((&__pyx_v_size), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONEF), __pyx_v_work, (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), (&(__pyx_v_syn0[((__pyx_v_lbl_indexes[__pyx_v_m]) * __pyx_v_size)])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
- }
- __pyx_L27_continue:;
- }
- goto __pyx_L26;
+ __pyx_t_6 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_docvecs); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 319; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+ __Pyx_GOTREF(__pyx_t_6);
+ __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_t_6, __pyx_n_s_doctag_syn0_lockf); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 319; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+ __Pyx_GOTREF(__pyx_t_1);
+ __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0;
+ __Pyx_DECREF_SET(__pyx_v_doctag_locks, __pyx_t_1);
+ __pyx_t_1 = 0;
+ goto __pyx_L6;
}
- __pyx_L26:;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":282
- * return next_random
- *
- * cdef void fast_sentence0_dm_hs( # <<<<<<<<<<<<<<
- * const np.uint32_t *word_point, const np.uint8_t *word_code, int codelens[MAX_SENTENCE_LEN],
- * int lbl_codelens[MAX_SENTENCE_LEN], REAL_t *neu1, REAL_t *syn0, REAL_t *syn1, const int size,
- */
-
- /* function exit code */
-}
+ __pyx_L6:;
-/* "trunk/gensim/models/doc2vec_inner.pyx":336
- * saxpy(&size, &ONEF, work, &ONE, &syn0[lbl_indexes[m]*size], &ONE)
+ /* "trunk/gensim/models/doc2vec_inner.pyx":320
+ * if doctag_locks is None:
+ * doctag_locks = model.docvecs.doctag_syn0_lockf
+ * _doctag_locks = (np.PyArray_DATA(doctag_locks)) # <<<<<<<<<<<<<<
*
- * cdef void fast_sentence1_dm_hs( # <<<<<<<<<<<<<<
- * const np.uint32_t *word_point, const np.uint8_t *word_code, int codelens[MAX_SENTENCE_LEN],
- * int lbl_codelens[MAX_SENTENCE_LEN], REAL_t *neu1, REAL_t *syn0, REAL_t *syn1, const int size,
+ * if hs:
*/
+ if (!(likely(((__pyx_v_doctag_locks) == Py_None) || likely(__Pyx_TypeTest(__pyx_v_doctag_locks, __pyx_ptype_5numpy_ndarray))))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 320; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+ __pyx_v__doctag_locks = ((__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *)PyArray_DATA(((PyArrayObject *)__pyx_v_doctag_locks)));
-static void __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_fast_sentence1_dm_hs(__pyx_t_5numpy_uint32_t const *__pyx_v_word_point, __pyx_t_5numpy_uint8_t const *__pyx_v_word_code, int *__pyx_v_codelens, int *__pyx_v_lbl_codelens, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_neu1, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_syn0, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_syn1, int const __pyx_v_size, __pyx_t_5numpy_uint32_t const *__pyx_v_indexes, __pyx_t_5numpy_uint32_t const *__pyx_v_lbl_indexes, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t const __pyx_v_alpha, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_work, int __pyx_v_i, int __pyx_v_j, int __pyx_v_k, int __pyx_v_cbow_mean, int __pyx_v_lbl_length, int __pyx_v_tw, int __pyx_v_tl) {
- PY_LONG_LONG __pyx_v_b;
- PY_LONG_LONG __pyx_v_row2;
- __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t __pyx_v_f;
- __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t __pyx_v_g;
- __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t __pyx_v_count;
- __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t __pyx_v_inv_count;
- int __pyx_v_m;
- int __pyx_t_1;
- int __pyx_t_2;
- int __pyx_t_3;
- int __pyx_t_4;
- PY_LONG_LONG __pyx_t_5;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":347
- * cdef int m
+ /* "trunk/gensim/models/doc2vec_inner.pyx":322
+ * _doctag_locks = (np.PyArray_DATA(doctag_locks))
*
- * memset(neu1, 0, size * cython.sizeof(REAL_t)) # <<<<<<<<<<<<<<
- * count = 0.0
- * for m in range(j, k):
- */
- memset(__pyx_v_neu1, 0, (__pyx_v_size * (sizeof(__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t))));
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":348
+ * if hs: # <<<<<<<<<<<<<<
+ * syn1 = (np.PyArray_DATA(model.syn1))
*
- * memset(neu1, 0, size * cython.sizeof(REAL_t))
- * count = 0.0 # <<<<<<<<<<<<<<
- * for m in range(j, k):
- * if m == i or codelens[m] == 0:
- */
- __pyx_v_count = ((__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t)0.0);
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":349
- * memset(neu1, 0, size * cython.sizeof(REAL_t))
- * count = 0.0
- * for m in range(j, k): # <<<<<<<<<<<<<<
- * if m == i or codelens[m] == 0:
- * continue
- */
- __pyx_t_1 = __pyx_v_k;
- for (__pyx_t_2 = __pyx_v_j; __pyx_t_2 < __pyx_t_1; __pyx_t_2+=1) {
- __pyx_v_m = __pyx_t_2;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":350
- * count = 0.0
- * for m in range(j, k):
- * if m == i or codelens[m] == 0: # <<<<<<<<<<<<<<
- * continue
- * else:
- */
- __pyx_t_4 = ((__pyx_v_m == __pyx_v_i) != 0);
- if (!__pyx_t_4) {
- } else {
- __pyx_t_3 = __pyx_t_4;
- goto __pyx_L6_bool_binop_done;
- }
- __pyx_t_4 = (((__pyx_v_codelens[__pyx_v_m]) == 0) != 0);
- __pyx_t_3 = __pyx_t_4;
- __pyx_L6_bool_binop_done:;
- if (__pyx_t_3) {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":351
- * for m in range(j, k):
- * if m == i or codelens[m] == 0:
- * continue # <<<<<<<<<<<<<<
- * else:
- * count += ONEF
- */
- goto __pyx_L3_continue;
- }
- /*else*/ {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":353
- * continue
- * else:
- * count += ONEF # <<<<<<<<<<<<<<
- * saxpy(&size, &ONEF, &syn0[indexes[m] * size], &ONE, neu1, &ONE)
- * for m in range(lbl_length):
- */
- __pyx_v_count = (__pyx_v_count + __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONEF);
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":354
- * else:
- * count += ONEF
- * saxpy(&size, &ONEF, &syn0[indexes[m] * size], &ONE, neu1, &ONE) # <<<<<<<<<<<<<<
- * for m in range(lbl_length):
- * if lbl_codelens[m] == 0:
- */
- __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_saxpy((&__pyx_v_size), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONEF), (&(__pyx_v_syn0[((__pyx_v_indexes[__pyx_v_m]) * __pyx_v_size)])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), __pyx_v_neu1, (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
- }
- __pyx_L3_continue:;
- }
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":355
- * count += ONEF
- * saxpy(&size, &ONEF, &syn0[indexes[m] * size], &ONE, neu1, &ONE)
- * for m in range(lbl_length): # <<<<<<<<<<<<<<
- * if lbl_codelens[m] == 0:
- * continue
- */
- __pyx_t_1 = __pyx_v_lbl_length;
- for (__pyx_t_2 = 0; __pyx_t_2 < __pyx_t_1; __pyx_t_2+=1) {
- __pyx_v_m = __pyx_t_2;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":356
- * saxpy(&size, &ONEF, &syn0[indexes[m] * size], &ONE, neu1, &ONE)
- * for m in range(lbl_length):
- * if lbl_codelens[m] == 0: # <<<<<<<<<<<<<<
- * continue
- * else:
- */
- __pyx_t_3 = (((__pyx_v_lbl_codelens[__pyx_v_m]) == 0) != 0);
- if (__pyx_t_3) {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":357
- * for m in range(lbl_length):
- * if lbl_codelens[m] == 0:
- * continue # <<<<<<<<<<<<<<
- * else:
- * count += ONEF
*/
- goto __pyx_L8_continue;
- }
- /*else*/ {
+ __pyx_t_4 = (__pyx_v_hs != 0);
+ if (__pyx_t_4) {
- /* "trunk/gensim/models/doc2vec_inner.pyx":359
- * continue
- * else:
- * count += ONEF # <<<<<<<<<<<<<<
- * saxpy(&size, &ONEF, &syn0[lbl_indexes[m] * size], &ONE, neu1, &ONE)
+ /* "trunk/gensim/models/doc2vec_inner.pyx":323
*
- */
- __pyx_v_count = (__pyx_v_count + __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONEF);
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":360
- * else:
- * count += ONEF
- * saxpy(&size, &ONEF, &syn0[lbl_indexes[m] * size], &ONE, neu1, &ONE) # <<<<<<<<<<<<<<
+ * if hs:
+ * syn1 = (np.PyArray_DATA(model.syn1)) # <<<<<<<<<<<<<<
*
- * if cbow_mean and count > (0.5):
+ * if negative:
*/
- __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_saxpy((&__pyx_v_size), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONEF), (&(__pyx_v_syn0[((__pyx_v_lbl_indexes[__pyx_v_m]) * __pyx_v_size)])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), __pyx_v_neu1, (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
- }
- __pyx_L8_continue:;
+ __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_syn1); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 323; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+ __Pyx_GOTREF(__pyx_t_1);
+ if (!(likely(((__pyx_t_1) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_1, __pyx_ptype_5numpy_ndarray))))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 323; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+ __pyx_v_syn1 = ((__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *)PyArray_DATA(((PyArrayObject *)__pyx_t_1)));
+ __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+ goto __pyx_L7;
}
+ __pyx_L7:;
- /* "trunk/gensim/models/doc2vec_inner.pyx":362
- * saxpy(&size, &ONEF, &syn0[lbl_indexes[m] * size], &ONE, neu1, &ONE)
+ /* "trunk/gensim/models/doc2vec_inner.pyx":325
+ * syn1 = (np.PyArray_DATA(model.syn1))
*
- * if cbow_mean and count > (0.5): # <<<<<<<<<<<<<<
- * inv_count = ONEF/count
- * sscal(&size, &inv_count , neu1, &ONE)
+ * if negative: # <<<<<<<<<<<<<<
+ * syn1neg = (np.PyArray_DATA(model.syn1neg))
+ * table = (np.PyArray_DATA(model.table))
*/
- __pyx_t_4 = (__pyx_v_cbow_mean != 0);
+ __pyx_t_4 = (__pyx_v_negative != 0);
if (__pyx_t_4) {
- } else {
- __pyx_t_3 = __pyx_t_4;
- goto __pyx_L12_bool_binop_done;
- }
- __pyx_t_4 = ((__pyx_v_count > ((__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t)0.5)) != 0);
- __pyx_t_3 = __pyx_t_4;
- __pyx_L12_bool_binop_done:;
- if (__pyx_t_3) {
- /* "trunk/gensim/models/doc2vec_inner.pyx":363
- *
- * if cbow_mean and count > (0.5):
- * inv_count = ONEF/count # <<<<<<<<<<<<<<
- * sscal(&size, &inv_count , neu1, &ONE)
+ /* "trunk/gensim/models/doc2vec_inner.pyx":326
*
+ * if negative:
+ * syn1neg = (np.PyArray_DATA(model.syn1neg)) # <<<<<<<<<<<<<<
+ * table = (np.PyArray_DATA(model.table))
+ * table_len = len(model.table)
*/
- __pyx_v_inv_count = (__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONEF / __pyx_v_count);
+ __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_syn1neg); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 326; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+ __Pyx_GOTREF(__pyx_t_1);
+ if (!(likely(((__pyx_t_1) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_1, __pyx_ptype_5numpy_ndarray))))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 326; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+ __pyx_v_syn1neg = ((__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *)PyArray_DATA(((PyArrayObject *)__pyx_t_1)));
+ __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
- /* "trunk/gensim/models/doc2vec_inner.pyx":364
- * if cbow_mean and count > (0.5):
- * inv_count = ONEF/count
- * sscal(&size, &inv_count , neu1, &ONE) # <<<<<<<<<<<<<<
- *
- * memset(work, 0, size * cython.sizeof(REAL_t))
+ /* "trunk/gensim/models/doc2vec_inner.pyx":327
+ * if negative:
+ * syn1neg = (np.PyArray_DATA(model.syn1neg))
+ * table = (np.PyArray_DATA(model.table)) # <<<<<<<<<<<<<<
+ * table_len = len(model.table)
+ * next_random = (2**24) * np.random.randint(0, 2**24) + np.random.randint(0, 2**24)
*/
- __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_sscal((&__pyx_v_size), (&__pyx_v_inv_count), __pyx_v_neu1, (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
- goto __pyx_L11;
- }
- __pyx_L11:;
+ __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_table); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 327; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+ __Pyx_GOTREF(__pyx_t_1);
+ if (!(likely(((__pyx_t_1) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_1, __pyx_ptype_5numpy_ndarray))))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 327; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+ __pyx_v_table = ((__pyx_t_5numpy_uint32_t *)PyArray_DATA(((PyArrayObject *)__pyx_t_1)));
+ __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
- /* "trunk/gensim/models/doc2vec_inner.pyx":366
- * sscal(&size, &inv_count , neu1, &ONE)
+ /* "trunk/gensim/models/doc2vec_inner.pyx":328
+ * syn1neg = (np.PyArray_DATA(model.syn1neg))
+ * table = (np.PyArray_DATA(model.table))
+ * table_len = len(model.table) # <<<<<<<<<<<<<<
+ * next_random = (2**24) * np.random.randint(0, 2**24) + np.random.randint(0, 2**24)
*
- * memset(work, 0, size * cython.sizeof(REAL_t)) # <<<<<<<<<<<<<<
- * for b in range(codelens[i]):
- * row2 = word_point[b] * size
*/
- memset(__pyx_v_work, 0, (__pyx_v_size * (sizeof(__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t))));
+ __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_model, __pyx_n_s_table); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 328; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+ __Pyx_GOTREF(__pyx_t_1);
+ __pyx_t_7 = PyObject_Length(__pyx_t_1); if (unlikely(__pyx_t_7 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 328; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+ __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+ __pyx_v_table_len = __pyx_t_7;
- /* "trunk/gensim/models/doc2vec_inner.pyx":367
+ /* "trunk/gensim/models/doc2vec_inner.pyx":329
+ * table = (np.PyArray_DATA(model.table))
+ * table_len = len(model.table)
+ * next_random = (2**24) * np.random.randint(0, 2**24) + np.random.randint(0, 2**24) # <<<<<<<<<<<<<<
*
- * memset(work, 0, size * cython.sizeof(REAL_t))
- * for b in range(codelens[i]): # <<<<<<<<<<<<<<
- * row2 = word_point[b] * size
- * f = sdot(&size, neu1, &ONE, &syn1[row2], &ONE)
- */
- __pyx_t_1 = (__pyx_v_codelens[__pyx_v_i]);
- for (__pyx_t_5 = 0; __pyx_t_5 < __pyx_t_1; __pyx_t_5+=1) {
- __pyx_v_b = __pyx_t_5;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":368
- * memset(work, 0, size * cython.sizeof(REAL_t))
- * for b in range(codelens[i]):
- * row2 = word_point[b] * size # <<<<<<<<<<<<<<
- * f = sdot(&size, neu1, &ONE, &syn1[row2], &ONE)
- * if f <= -MAX_EXP or f >= MAX_EXP:
- */
- __pyx_v_row2 = ((__pyx_v_word_point[__pyx_v_b]) * __pyx_v_size);
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":369
- * for b in range(codelens[i]):
- * row2 = word_point[b] * size
- * f = sdot(&size, neu1, &ONE, &syn1[row2], &ONE) # <<<<<<<<<<<<<<
- * if f <= -MAX_EXP or f >= MAX_EXP:
- * continue
- */
- __pyx_v_f = ((__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t)__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_sdot((&__pyx_v_size), __pyx_v_neu1, (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), (&(__pyx_v_syn1[__pyx_v_row2])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE)));
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":370
- * row2 = word_point[b] * size
- * f = sdot(&size, neu1, &ONE, &syn1[row2], &ONE)
- * if f <= -MAX_EXP or f >= MAX_EXP: # <<<<<<<<<<<<<<
- * continue
- * f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
- */
- __pyx_t_4 = ((__pyx_v_f <= -6.0) != 0);
- if (!__pyx_t_4) {
- } else {
- __pyx_t_3 = __pyx_t_4;
- goto __pyx_L17_bool_binop_done;
- }
- __pyx_t_4 = ((__pyx_v_f >= 6.0) != 0);
- __pyx_t_3 = __pyx_t_4;
- __pyx_L17_bool_binop_done:;
- if (__pyx_t_3) {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":371
- * f = sdot(&size, neu1, &ONE, &syn1[row2], &ONE)
- * if f <= -MAX_EXP or f >= MAX_EXP:
- * continue # <<<<<<<<<<<<<<
- * f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
- * g = (1 - word_code[b] - f) * alpha
- */
- goto __pyx_L14_continue;
- }
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":372
- * if f <= -MAX_EXP or f >= MAX_EXP:
- * continue
- * f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))] # <<<<<<<<<<<<<<
- * g = (1 - word_code[b] - f) * alpha
- * saxpy(&size, &g, &syn1[row2], &ONE, work, &ONE)
- */
- __pyx_v_f = (__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_EXP_TABLE[((int)((__pyx_v_f + 6.0) * 83.0))]);
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":373
- * continue
- * f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
- * g = (1 - word_code[b] - f) * alpha # <<<<<<<<<<<<<<
- * saxpy(&size, &g, &syn1[row2], &ONE, work, &ONE)
- * if tw:
- */
- __pyx_v_g = (((1 - (__pyx_v_word_code[__pyx_v_b])) - __pyx_v_f) * __pyx_v_alpha);
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":374
- * f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
- * g = (1 - word_code[b] - f) * alpha
- * saxpy(&size, &g, &syn1[row2], &ONE, work, &ONE) # <<<<<<<<<<<<<<
- * if tw:
- * saxpy(&size, &g, neu1, &ONE, &syn1[row2], &ONE)
- */
- __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_saxpy((&__pyx_v_size), (&__pyx_v_g), (&(__pyx_v_syn1[__pyx_v_row2])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), __pyx_v_work, (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":375
- * g = (1 - word_code[b] - f) * alpha
- * saxpy(&size, &g, &syn1[row2], &ONE, work, &ONE)
- * if tw: # <<<<<<<<<<<<<<
- * saxpy(&size, &g, neu1, &ONE, &syn1[row2], &ONE)
- * if tw:
- */
- __pyx_t_3 = (__pyx_v_tw != 0);
- if (__pyx_t_3) {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":376
- * saxpy(&size, &g, &syn1[row2], &ONE, work, &ONE)
- * if tw:
- * saxpy(&size, &g, neu1, &ONE, &syn1[row2], &ONE) # <<<<<<<<<<<<<<
- * if tw:
- * for m in range(j, k):
- */
- __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_saxpy((&__pyx_v_size), (&__pyx_v_g), __pyx_v_neu1, (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), (&(__pyx_v_syn1[__pyx_v_row2])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
- goto __pyx_L19;
- }
- __pyx_L19:;
- __pyx_L14_continue:;
- }
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":377
- * if tw:
- * saxpy(&size, &g, neu1, &ONE, &syn1[row2], &ONE)
- * if tw: # <<<<<<<<<<<<<<
- * for m in range(j, k):
- * if m == i or codelens[m] == 0:
- */
- __pyx_t_3 = (__pyx_v_tw != 0);
- if (__pyx_t_3) {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":378
- * saxpy(&size, &g, neu1, &ONE, &syn1[row2], &ONE)
- * if tw:
- * for m in range(j, k): # <<<<<<<<<<<<<<
- * if m == i or codelens[m] == 0:
- * continue
- */
- __pyx_t_1 = __pyx_v_k;
- for (__pyx_t_2 = __pyx_v_j; __pyx_t_2 < __pyx_t_1; __pyx_t_2+=1) {
- __pyx_v_m = __pyx_t_2;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":379
- * if tw:
- * for m in range(j, k):
- * if m == i or codelens[m] == 0: # <<<<<<<<<<<<<<
- * continue
- * else:
- */
- __pyx_t_4 = ((__pyx_v_m == __pyx_v_i) != 0);
- if (!__pyx_t_4) {
- } else {
- __pyx_t_3 = __pyx_t_4;
- goto __pyx_L24_bool_binop_done;
- }
- __pyx_t_4 = (((__pyx_v_codelens[__pyx_v_m]) == 0) != 0);
- __pyx_t_3 = __pyx_t_4;
- __pyx_L24_bool_binop_done:;
- if (__pyx_t_3) {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":380
- * for m in range(j, k):
- * if m == i or codelens[m] == 0:
- * continue # <<<<<<<<<<<<<<
- * else:
- * saxpy(&size, &ONEF, work, &ONE, &syn0[indexes[m]*size], &ONE)
- */
- goto __pyx_L21_continue;
- }
- /*else*/ {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":382
- * continue
- * else:
- * saxpy(&size, &ONEF, work, &ONE, &syn0[indexes[m]*size], &ONE) # <<<<<<<<<<<<<<
- * if tl:
- * for m in range(lbl_length):
- */
- __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_saxpy((&__pyx_v_size), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONEF), __pyx_v_work, (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), (&(__pyx_v_syn0[((__pyx_v_indexes[__pyx_v_m]) * __pyx_v_size)])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
- }
- __pyx_L21_continue:;
- }
- goto __pyx_L20;
- }
- __pyx_L20:;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":383
- * else:
- * saxpy(&size, &ONEF, work, &ONE, &syn0[indexes[m]*size], &ONE)
- * if tl: # <<<<<<<<<<<<<<
- * for m in range(lbl_length):
- * if lbl_codelens[m] == 0:
- */
- __pyx_t_3 = (__pyx_v_tl != 0);
- if (__pyx_t_3) {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":384
- * saxpy(&size, &ONEF, work, &ONE, &syn0[indexes[m]*size], &ONE)
- * if tl:
- * for m in range(lbl_length): # <<<<<<<<<<<<<<
- * if lbl_codelens[m] == 0:
- * continue
- */
- __pyx_t_1 = __pyx_v_lbl_length;
- for (__pyx_t_2 = 0; __pyx_t_2 < __pyx_t_1; __pyx_t_2+=1) {
- __pyx_v_m = __pyx_t_2;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":385
- * if tl:
- * for m in range(lbl_length):
- * if lbl_codelens[m] == 0: # <<<<<<<<<<<<<<
- * continue
- * else:
- */
- __pyx_t_3 = (((__pyx_v_lbl_codelens[__pyx_v_m]) == 0) != 0);
- if (__pyx_t_3) {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":386
- * for m in range(lbl_length):
- * if lbl_codelens[m] == 0:
- * continue # <<<<<<<<<<<<<<
- * else:
- * saxpy(&size, &ONEF, work, &ONE, &syn0[lbl_indexes[m]*size], &ONE)
- */
- goto __pyx_L27_continue;
- }
- /*else*/ {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":388
- * continue
- * else:
- * saxpy(&size, &ONEF, work, &ONE, &syn0[lbl_indexes[m]*size], &ONE) # <<<<<<<<<<<<<<
- *
- * cdef void fast_sentence2_dm_hs(
- */
- __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_saxpy((&__pyx_v_size), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONEF), __pyx_v_work, (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), (&(__pyx_v_syn0[((__pyx_v_lbl_indexes[__pyx_v_m]) * __pyx_v_size)])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
- }
- __pyx_L27_continue:;
- }
- goto __pyx_L26;
- }
- __pyx_L26:;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":336
- * saxpy(&size, &ONEF, work, &ONE, &syn0[lbl_indexes[m]*size], &ONE)
- *
- * cdef void fast_sentence1_dm_hs( # <<<<<<<<<<<<<<
- * const np.uint32_t *word_point, const np.uint8_t *word_code, int codelens[MAX_SENTENCE_LEN],
- * int lbl_codelens[MAX_SENTENCE_LEN], REAL_t *neu1, REAL_t *syn0, REAL_t *syn1, const int size,
- */
-
- /* function exit code */
-}
-
-/* "trunk/gensim/models/doc2vec_inner.pyx":390
- * saxpy(&size, &ONEF, work, &ONE, &syn0[lbl_indexes[m]*size], &ONE)
- *
- * cdef void fast_sentence2_dm_hs( # <<<<<<<<<<<<<<
- * const np.uint32_t *word_point, const np.uint8_t *word_code, int codelens[MAX_SENTENCE_LEN],
- * int lbl_codelens[MAX_SENTENCE_LEN], REAL_t *neu1, REAL_t *syn0, REAL_t *syn1, const int size,
- */
-
-static void __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_fast_sentence2_dm_hs(__pyx_t_5numpy_uint32_t const *__pyx_v_word_point, __pyx_t_5numpy_uint8_t const *__pyx_v_word_code, int *__pyx_v_codelens, int *__pyx_v_lbl_codelens, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_neu1, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_syn0, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_syn1, int const __pyx_v_size, __pyx_t_5numpy_uint32_t const *__pyx_v_indexes, __pyx_t_5numpy_uint32_t const *__pyx_v_lbl_indexes, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t const __pyx_v_alpha, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_work, int __pyx_v_i, int __pyx_v_j, int __pyx_v_k, int __pyx_v_cbow_mean, int __pyx_v_lbl_length, int __pyx_v_tw, int __pyx_v_tl) {
- PY_LONG_LONG __pyx_v_a;
- PY_LONG_LONG __pyx_v_b;
- PY_LONG_LONG __pyx_v_row2;
- __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t __pyx_v_f;
- __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t __pyx_v_g;
- __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t __pyx_v_count;
- int __pyx_v_m;
- int __pyx_t_1;
- PY_LONG_LONG __pyx_t_2;
- int __pyx_t_3;
- int __pyx_t_4;
- int __pyx_t_5;
- int __pyx_t_6;
- PY_LONG_LONG __pyx_t_7;
- PY_LONG_LONG __pyx_t_8;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":401
- * cdef int m
- *
- * for a in range(size): # <<<<<<<<<<<<<<
- * neu1[a] = 0.0
- * count = 0.0
- */
- __pyx_t_1 = __pyx_v_size;
- for (__pyx_t_2 = 0; __pyx_t_2 < __pyx_t_1; __pyx_t_2+=1) {
- __pyx_v_a = __pyx_t_2;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":402
- *
- * for a in range(size):
- * neu1[a] = 0.0 # <<<<<<<<<<<<<<
- * count = 0.0
- * for m in range(j, k):
- */
- (__pyx_v_neu1[__pyx_v_a]) = ((__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t)0.0);
- }
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":403
- * for a in range(size):
- * neu1[a] = 0.0
- * count = 0.0 # <<<<<<<<<<<<<<
- * for m in range(j, k):
- * if m == i or codelens[m] == 0:
- */
- __pyx_v_count = ((__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t)0.0);
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":404
- * neu1[a] = 0.0
- * count = 0.0
- * for m in range(j, k): # <<<<<<<<<<<<<<
- * if m == i or codelens[m] == 0:
- * continue
- */
- __pyx_t_1 = __pyx_v_k;
- for (__pyx_t_3 = __pyx_v_j; __pyx_t_3 < __pyx_t_1; __pyx_t_3+=1) {
- __pyx_v_m = __pyx_t_3;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":405
- * count = 0.0
- * for m in range(j, k):
- * if m == i or codelens[m] == 0: # <<<<<<<<<<<<<<
- * continue
- * else:
- */
- __pyx_t_5 = ((__pyx_v_m == __pyx_v_i) != 0);
- if (!__pyx_t_5) {
- } else {
- __pyx_t_4 = __pyx_t_5;
- goto __pyx_L8_bool_binop_done;
- }
- __pyx_t_5 = (((__pyx_v_codelens[__pyx_v_m]) == 0) != 0);
- __pyx_t_4 = __pyx_t_5;
- __pyx_L8_bool_binop_done:;
- if (__pyx_t_4) {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":406
- * for m in range(j, k):
- * if m == i or codelens[m] == 0:
- * continue # <<<<<<<<<<<<<<
- * else:
- * count += ONEF
- */
- goto __pyx_L5_continue;
- }
- /*else*/ {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":408
- * continue
- * else:
- * count += ONEF # <<<<<<<<<<<<<<
- * for a in range(size):
- * neu1[a] += syn0[indexes[m] * size + a]
- */
- __pyx_v_count = (__pyx_v_count + __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONEF);
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":409
- * else:
- * count += ONEF
- * for a in range(size): # <<<<<<<<<<<<<<
- * neu1[a] += syn0[indexes[m] * size + a]
- * for m in range(lbl_length):
- */
- __pyx_t_6 = __pyx_v_size;
- for (__pyx_t_2 = 0; __pyx_t_2 < __pyx_t_6; __pyx_t_2+=1) {
- __pyx_v_a = __pyx_t_2;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":410
- * count += ONEF
- * for a in range(size):
- * neu1[a] += syn0[indexes[m] * size + a] # <<<<<<<<<<<<<<
- * for m in range(lbl_length):
- * if lbl_codelens[m] == 0:
- */
- __pyx_t_7 = __pyx_v_a;
- (__pyx_v_neu1[__pyx_t_7]) = ((__pyx_v_neu1[__pyx_t_7]) + (__pyx_v_syn0[(((__pyx_v_indexes[__pyx_v_m]) * __pyx_v_size) + __pyx_v_a)]));
- }
- }
- __pyx_L5_continue:;
- }
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":411
- * for a in range(size):
- * neu1[a] += syn0[indexes[m] * size + a]
- * for m in range(lbl_length): # <<<<<<<<<<<<<<
- * if lbl_codelens[m] == 0:
- * continue
- */
- __pyx_t_1 = __pyx_v_lbl_length;
- for (__pyx_t_3 = 0; __pyx_t_3 < __pyx_t_1; __pyx_t_3+=1) {
- __pyx_v_m = __pyx_t_3;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":412
- * neu1[a] += syn0[indexes[m] * size + a]
- * for m in range(lbl_length):
- * if lbl_codelens[m] == 0: # <<<<<<<<<<<<<<
- * continue
- * else:
- */
- __pyx_t_4 = (((__pyx_v_lbl_codelens[__pyx_v_m]) == 0) != 0);
- if (__pyx_t_4) {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":413
- * for m in range(lbl_length):
- * if lbl_codelens[m] == 0:
- * continue # <<<<<<<<<<<<<<
- * else:
- * count += ONEF
- */
- goto __pyx_L12_continue;
- }
- /*else*/ {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":415
- * continue
- * else:
- * count += ONEF # <<<<<<<<<<<<<<
- * for a in range(size):
- * neu1[a] += syn0[lbl_indexes[m] * size + a]
- */
- __pyx_v_count = (__pyx_v_count + __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONEF);
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":416
- * else:
- * count += ONEF
- * for a in range(size): # <<<<<<<<<<<<<<
- * neu1[a] += syn0[lbl_indexes[m] * size + a]
- *
- */
- __pyx_t_6 = __pyx_v_size;
- for (__pyx_t_2 = 0; __pyx_t_2 < __pyx_t_6; __pyx_t_2+=1) {
- __pyx_v_a = __pyx_t_2;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":417
- * count += ONEF
- * for a in range(size):
- * neu1[a] += syn0[lbl_indexes[m] * size + a] # <<<<<<<<<<<<<<
- *
- * if cbow_mean and count > (0.5):
- */
- __pyx_t_7 = __pyx_v_a;
- (__pyx_v_neu1[__pyx_t_7]) = ((__pyx_v_neu1[__pyx_t_7]) + (__pyx_v_syn0[(((__pyx_v_lbl_indexes[__pyx_v_m]) * __pyx_v_size) + __pyx_v_a)]));
- }
- }
- __pyx_L12_continue:;
- }
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":419
- * neu1[a] += syn0[lbl_indexes[m] * size + a]
- *
- * if cbow_mean and count > (0.5): # <<<<<<<<<<<<<<
- * for a in range(size):
- * neu1[a] /= count
- */
- __pyx_t_5 = (__pyx_v_cbow_mean != 0);
- if (__pyx_t_5) {
- } else {
- __pyx_t_4 = __pyx_t_5;
- goto __pyx_L18_bool_binop_done;
- }
- __pyx_t_5 = ((__pyx_v_count > ((__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t)0.5)) != 0);
- __pyx_t_4 = __pyx_t_5;
- __pyx_L18_bool_binop_done:;
- if (__pyx_t_4) {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":420
- *
- * if cbow_mean and count > (0.5):
- * for a in range(size): # <<<<<<<<<<<<<<
- * neu1[a] /= count
- *
- */
- __pyx_t_1 = __pyx_v_size;
- for (__pyx_t_2 = 0; __pyx_t_2 < __pyx_t_1; __pyx_t_2+=1) {
- __pyx_v_a = __pyx_t_2;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":421
- * if cbow_mean and count > (0.5):
- * for a in range(size):
- * neu1[a] /= count # <<<<<<<<<<<<<<
- *
- * for a in range(size):
- */
- __pyx_t_7 = __pyx_v_a;
- (__pyx_v_neu1[__pyx_t_7]) = ((__pyx_v_neu1[__pyx_t_7]) / __pyx_v_count);
- }
- goto __pyx_L17;
- }
- __pyx_L17:;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":423
- * neu1[a] /= count
- *
- * for a in range(size): # <<<<<<<<<<<<<<
- * work[a] = 0.0
- * for b in range(codelens[i]):
- */
- __pyx_t_1 = __pyx_v_size;
- for (__pyx_t_2 = 0; __pyx_t_2 < __pyx_t_1; __pyx_t_2+=1) {
- __pyx_v_a = __pyx_t_2;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":424
- *
- * for a in range(size):
- * work[a] = 0.0 # <<<<<<<<<<<<<<
- * for b in range(codelens[i]):
- * row2 = word_point[b] * size
- */
- (__pyx_v_work[__pyx_v_a]) = ((__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t)0.0);
- }
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":425
- * for a in range(size):
- * work[a] = 0.0
- * for b in range(codelens[i]): # <<<<<<<<<<<<<<
- * row2 = word_point[b] * size
- * f = 0.0
- */
- __pyx_t_1 = (__pyx_v_codelens[__pyx_v_i]);
- for (__pyx_t_2 = 0; __pyx_t_2 < __pyx_t_1; __pyx_t_2+=1) {
- __pyx_v_b = __pyx_t_2;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":426
- * work[a] = 0.0
- * for b in range(codelens[i]):
- * row2 = word_point[b] * size # <<<<<<<<<<<<<<
- * f = 0.0
- * for a in range(size):
- */
- __pyx_v_row2 = ((__pyx_v_word_point[__pyx_v_b]) * __pyx_v_size);
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":427
- * for b in range(codelens[i]):
- * row2 = word_point[b] * size
- * f = 0.0 # <<<<<<<<<<<<<<
- * for a in range(size):
- * f += neu1[a] * syn1[row2 + a]
- */
- __pyx_v_f = ((__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t)0.0);
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":428
- * row2 = word_point[b] * size
- * f = 0.0
- * for a in range(size): # <<<<<<<<<<<<<<
- * f += neu1[a] * syn1[row2 + a]
- * if f <= -MAX_EXP or f >= MAX_EXP:
- */
- __pyx_t_3 = __pyx_v_size;
- for (__pyx_t_7 = 0; __pyx_t_7 < __pyx_t_3; __pyx_t_7+=1) {
- __pyx_v_a = __pyx_t_7;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":429
- * f = 0.0
- * for a in range(size):
- * f += neu1[a] * syn1[row2 + a] # <<<<<<<<<<<<<<
- * if f <= -MAX_EXP or f >= MAX_EXP:
- * continue
- */
- __pyx_v_f = (__pyx_v_f + ((__pyx_v_neu1[__pyx_v_a]) * (__pyx_v_syn1[(__pyx_v_row2 + __pyx_v_a)])));
- }
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":430
- * for a in range(size):
- * f += neu1[a] * syn1[row2 + a]
- * if f <= -MAX_EXP or f >= MAX_EXP: # <<<<<<<<<<<<<<
- * continue
- * f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
- */
- __pyx_t_5 = ((__pyx_v_f <= -6.0) != 0);
- if (!__pyx_t_5) {
- } else {
- __pyx_t_4 = __pyx_t_5;
- goto __pyx_L29_bool_binop_done;
- }
- __pyx_t_5 = ((__pyx_v_f >= 6.0) != 0);
- __pyx_t_4 = __pyx_t_5;
- __pyx_L29_bool_binop_done:;
- if (__pyx_t_4) {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":431
- * f += neu1[a] * syn1[row2 + a]
- * if f <= -MAX_EXP or f >= MAX_EXP:
- * continue # <<<<<<<<<<<<<<
- * f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
- * g = (1 - word_code[b] - f) * alpha
- */
- goto __pyx_L24_continue;
- }
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":432
- * if f <= -MAX_EXP or f >= MAX_EXP:
- * continue
- * f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))] # <<<<<<<<<<<<<<
- * g = (1 - word_code[b] - f) * alpha
- * for a in range(size):
- */
- __pyx_v_f = (__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_EXP_TABLE[((int)((__pyx_v_f + 6.0) * 83.0))]);
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":433
- * continue
- * f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
- * g = (1 - word_code[b] - f) * alpha # <<<<<<<<<<<<<<
- * for a in range(size):
- * work[a] += g * syn1[row2 + a]
- */
- __pyx_v_g = (((1 - (__pyx_v_word_code[__pyx_v_b])) - __pyx_v_f) * __pyx_v_alpha);
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":434
- * f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
- * g = (1 - word_code[b] - f) * alpha
- * for a in range(size): # <<<<<<<<<<<<<<
- * work[a] += g * syn1[row2 + a]
- * if tw:
- */
- __pyx_t_3 = __pyx_v_size;
- for (__pyx_t_7 = 0; __pyx_t_7 < __pyx_t_3; __pyx_t_7+=1) {
- __pyx_v_a = __pyx_t_7;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":435
- * g = (1 - word_code[b] - f) * alpha
- * for a in range(size):
- * work[a] += g * syn1[row2 + a] # <<<<<<<<<<<<<<
- * if tw:
- * for a in range(size):
- */
- __pyx_t_8 = __pyx_v_a;
- (__pyx_v_work[__pyx_t_8]) = ((__pyx_v_work[__pyx_t_8]) + (__pyx_v_g * (__pyx_v_syn1[(__pyx_v_row2 + __pyx_v_a)])));
- }
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":436
- * for a in range(size):
- * work[a] += g * syn1[row2 + a]
- * if tw: # <<<<<<<<<<<<<<
- * for a in range(size):
- * syn1[row2 + a] += g * neu1[a]
- */
- __pyx_t_4 = (__pyx_v_tw != 0);
- if (__pyx_t_4) {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":437
- * work[a] += g * syn1[row2 + a]
- * if tw:
- * for a in range(size): # <<<<<<<<<<<<<<
- * syn1[row2 + a] += g * neu1[a]
- * if tw:
- */
- __pyx_t_3 = __pyx_v_size;
- for (__pyx_t_7 = 0; __pyx_t_7 < __pyx_t_3; __pyx_t_7+=1) {
- __pyx_v_a = __pyx_t_7;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":438
- * if tw:
- * for a in range(size):
- * syn1[row2 + a] += g * neu1[a] # <<<<<<<<<<<<<<
- * if tw:
- * for m in range(j, k):
- */
- __pyx_t_8 = (__pyx_v_row2 + __pyx_v_a);
- (__pyx_v_syn1[__pyx_t_8]) = ((__pyx_v_syn1[__pyx_t_8]) + (__pyx_v_g * (__pyx_v_neu1[__pyx_v_a])));
- }
- goto __pyx_L33;
- }
- __pyx_L33:;
- __pyx_L24_continue:;
- }
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":439
- * for a in range(size):
- * syn1[row2 + a] += g * neu1[a]
- * if tw: # <<<<<<<<<<<<<<
- * for m in range(j, k):
- * if m == i or codelens[m] == 0:
- */
- __pyx_t_4 = (__pyx_v_tw != 0);
- if (__pyx_t_4) {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":440
- * syn1[row2 + a] += g * neu1[a]
- * if tw:
- * for m in range(j, k): # <<<<<<<<<<<<<<
- * if m == i or codelens[m] == 0:
- * continue
- */
- __pyx_t_1 = __pyx_v_k;
- for (__pyx_t_3 = __pyx_v_j; __pyx_t_3 < __pyx_t_1; __pyx_t_3+=1) {
- __pyx_v_m = __pyx_t_3;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":441
- * if tw:
- * for m in range(j, k):
- * if m == i or codelens[m] == 0: # <<<<<<<<<<<<<<
- * continue
- * else:
- */
- __pyx_t_5 = ((__pyx_v_m == __pyx_v_i) != 0);
- if (!__pyx_t_5) {
- } else {
- __pyx_t_4 = __pyx_t_5;
- goto __pyx_L40_bool_binop_done;
- }
- __pyx_t_5 = (((__pyx_v_codelens[__pyx_v_m]) == 0) != 0);
- __pyx_t_4 = __pyx_t_5;
- __pyx_L40_bool_binop_done:;
- if (__pyx_t_4) {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":442
- * for m in range(j, k):
- * if m == i or codelens[m] == 0:
- * continue # <<<<<<<<<<<<<<
- * else:
- * for a in range(size):
- */
- goto __pyx_L37_continue;
- }
- /*else*/ {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":444
- * continue
- * else:
- * for a in range(size): # <<<<<<<<<<<<<<
- * syn0[indexes[m] * size + a] += work[a]
- * if tl:
- */
- __pyx_t_6 = __pyx_v_size;
- for (__pyx_t_2 = 0; __pyx_t_2 < __pyx_t_6; __pyx_t_2+=1) {
- __pyx_v_a = __pyx_t_2;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":445
- * else:
- * for a in range(size):
- * syn0[indexes[m] * size + a] += work[a] # <<<<<<<<<<<<<<
- * if tl:
- * for m in range(lbl_length):
- */
- __pyx_t_7 = (((__pyx_v_indexes[__pyx_v_m]) * __pyx_v_size) + __pyx_v_a);
- (__pyx_v_syn0[__pyx_t_7]) = ((__pyx_v_syn0[__pyx_t_7]) + (__pyx_v_work[__pyx_v_a]));
- }
- }
- __pyx_L37_continue:;
- }
- goto __pyx_L36;
- }
- __pyx_L36:;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":446
- * for a in range(size):
- * syn0[indexes[m] * size + a] += work[a]
- * if tl: # <<<<<<<<<<<<<<
- * for m in range(lbl_length):
- * if lbl_codelens[m] == 0:
- */
- __pyx_t_4 = (__pyx_v_tl != 0);
- if (__pyx_t_4) {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":447
- * syn0[indexes[m] * size + a] += work[a]
- * if tl:
- * for m in range(lbl_length): # <<<<<<<<<<<<<<
- * if lbl_codelens[m] == 0:
- * continue
- */
- __pyx_t_1 = __pyx_v_lbl_length;
- for (__pyx_t_3 = 0; __pyx_t_3 < __pyx_t_1; __pyx_t_3+=1) {
- __pyx_v_m = __pyx_t_3;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":448
- * if tl:
- * for m in range(lbl_length):
- * if lbl_codelens[m] == 0: # <<<<<<<<<<<<<<
- * continue
- * else:
- */
- __pyx_t_4 = (((__pyx_v_lbl_codelens[__pyx_v_m]) == 0) != 0);
- if (__pyx_t_4) {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":449
- * for m in range(lbl_length):
- * if lbl_codelens[m] == 0:
- * continue # <<<<<<<<<<<<<<
- * else:
- * for a in range(size):
- */
- goto __pyx_L45_continue;
- }
- /*else*/ {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":451
- * continue
- * else:
- * for a in range(size): # <<<<<<<<<<<<<<
- * syn0[lbl_indexes[m] * size + a] += work[a]
- *
- */
- __pyx_t_6 = __pyx_v_size;
- for (__pyx_t_2 = 0; __pyx_t_2 < __pyx_t_6; __pyx_t_2+=1) {
- __pyx_v_a = __pyx_t_2;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":452
- * else:
- * for a in range(size):
- * syn0[lbl_indexes[m] * size + a] += work[a] # <<<<<<<<<<<<<<
- *
- * cdef unsigned long long fast_sentence0_dm_neg(
- */
- __pyx_t_7 = (((__pyx_v_lbl_indexes[__pyx_v_m]) * __pyx_v_size) + __pyx_v_a);
- (__pyx_v_syn0[__pyx_t_7]) = ((__pyx_v_syn0[__pyx_t_7]) + (__pyx_v_work[__pyx_v_a]));
- }
- }
- __pyx_L45_continue:;
- }
- goto __pyx_L44;
- }
- __pyx_L44:;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":390
- * saxpy(&size, &ONEF, work, &ONE, &syn0[lbl_indexes[m]*size], &ONE)
- *
- * cdef void fast_sentence2_dm_hs( # <<<<<<<<<<<<<<
- * const np.uint32_t *word_point, const np.uint8_t *word_code, int codelens[MAX_SENTENCE_LEN],
- * int lbl_codelens[MAX_SENTENCE_LEN], REAL_t *neu1, REAL_t *syn0, REAL_t *syn1, const int size,
- */
-
- /* function exit code */
-}
-
-/* "trunk/gensim/models/doc2vec_inner.pyx":454
- * syn0[lbl_indexes[m] * size + a] += work[a]
- *
- * cdef unsigned long long fast_sentence0_dm_neg( # <<<<<<<<<<<<<<
- * const int negative, np.uint32_t *table, unsigned long long table_len, int codelens[MAX_SENTENCE_LEN],
- * int lbl_codelens[MAX_SENTENCE_LEN], REAL_t *neu1, REAL_t *syn0, REAL_t *syn1neg, const int size,
- */
-
-static unsigned PY_LONG_LONG __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_fast_sentence0_dm_neg(int const __pyx_v_negative, __pyx_t_5numpy_uint32_t *__pyx_v_table, unsigned PY_LONG_LONG __pyx_v_table_len, int *__pyx_v_codelens, int *__pyx_v_lbl_codelens, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_neu1, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_syn0, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_syn1neg, int const __pyx_v_size, __pyx_t_5numpy_uint32_t *__pyx_v_indexes, __pyx_t_5numpy_uint32_t *__pyx_v_lbl_indexes, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t const __pyx_v_alpha, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_work, int __pyx_v_i, int __pyx_v_j, int __pyx_v_k, int __pyx_v_cbow_mean, unsigned PY_LONG_LONG __pyx_v_next_random, int __pyx_v_lbl_length, int __pyx_v_tw, int __pyx_v_tl) {
- PY_LONG_LONG __pyx_v_row2;
- unsigned PY_LONG_LONG __pyx_v_modulo;
- __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t __pyx_v_f;
- __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t __pyx_v_g;
- __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t __pyx_v_count;
- __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t __pyx_v_inv_count;
- __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t __pyx_v_label;
- __pyx_t_5numpy_uint32_t __pyx_v_target_index;
- __pyx_t_5numpy_uint32_t __pyx_v_word_index;
- int __pyx_v_d;
- int __pyx_v_m;
- unsigned PY_LONG_LONG __pyx_r;
- int __pyx_t_1;
- int __pyx_t_2;
- int __pyx_t_3;
- int __pyx_t_4;
- long __pyx_t_5;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":462
- * cdef long long a
- * cdef long long row2
- * cdef unsigned long long modulo = 281474976710655ULL # <<<<<<<<<<<<<<
- * cdef REAL_t f, g, count, inv_count, label
- * cdef np.uint32_t target_index, word_index
- */
- __pyx_v_modulo = 281474976710655ULL;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":467
- * cdef int d, m
- *
- * word_index = indexes[i] # <<<<<<<<<<<<<<
- *
- * memset(neu1, 0, size * cython.sizeof(REAL_t))
- */
- __pyx_v_word_index = (__pyx_v_indexes[__pyx_v_i]);
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":469
- * word_index = indexes[i]
- *
- * memset(neu1, 0, size * cython.sizeof(REAL_t)) # <<<<<<<<<<<<<<
- * count = 0.0
- * for m in range(j, k):
- */
- memset(__pyx_v_neu1, 0, (__pyx_v_size * (sizeof(__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t))));
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":470
- *
- * memset(neu1, 0, size * cython.sizeof(REAL_t))
- * count = 0.0 # <<<<<<<<<<<<<<
- * for m in range(j, k):
- * if m == i or codelens[m] == 0:
- */
- __pyx_v_count = ((__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t)0.0);
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":471
- * memset(neu1, 0, size * cython.sizeof(REAL_t))
- * count = 0.0
- * for m in range(j, k): # <<<<<<<<<<<<<<
- * if m == i or codelens[m] == 0:
- * continue
- */
- __pyx_t_1 = __pyx_v_k;
- for (__pyx_t_2 = __pyx_v_j; __pyx_t_2 < __pyx_t_1; __pyx_t_2+=1) {
- __pyx_v_m = __pyx_t_2;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":472
- * count = 0.0
- * for m in range(j, k):
- * if m == i or codelens[m] == 0: # <<<<<<<<<<<<<<
- * continue
- * else:
- */
- __pyx_t_4 = ((__pyx_v_m == __pyx_v_i) != 0);
- if (!__pyx_t_4) {
- } else {
- __pyx_t_3 = __pyx_t_4;
- goto __pyx_L6_bool_binop_done;
- }
- __pyx_t_4 = (((__pyx_v_codelens[__pyx_v_m]) == 0) != 0);
- __pyx_t_3 = __pyx_t_4;
- __pyx_L6_bool_binop_done:;
- if (__pyx_t_3) {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":473
- * for m in range(j, k):
- * if m == i or codelens[m] == 0:
- * continue # <<<<<<<<<<<<<<
- * else:
- * count += ONEF
- */
- goto __pyx_L3_continue;
- }
- /*else*/ {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":475
- * continue
- * else:
- * count += ONEF # <<<<<<<<<<<<<<
- * saxpy(&size, &ONEF, &syn0[indexes[m] * size], &ONE, neu1, &ONE)
- * for m in range(lbl_length):
- */
- __pyx_v_count = (__pyx_v_count + __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONEF);
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":476
- * else:
- * count += ONEF
- * saxpy(&size, &ONEF, &syn0[indexes[m] * size], &ONE, neu1, &ONE) # <<<<<<<<<<<<<<
- * for m in range(lbl_length):
- * if lbl_codelens[m] == 0:
- */
- __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_saxpy((&__pyx_v_size), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONEF), (&(__pyx_v_syn0[((__pyx_v_indexes[__pyx_v_m]) * __pyx_v_size)])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), __pyx_v_neu1, (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
- }
- __pyx_L3_continue:;
- }
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":477
- * count += ONEF
- * saxpy(&size, &ONEF, &syn0[indexes[m] * size], &ONE, neu1, &ONE)
- * for m in range(lbl_length): # <<<<<<<<<<<<<<
- * if lbl_codelens[m] == 0:
- * continue
- */
- __pyx_t_1 = __pyx_v_lbl_length;
- for (__pyx_t_2 = 0; __pyx_t_2 < __pyx_t_1; __pyx_t_2+=1) {
- __pyx_v_m = __pyx_t_2;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":478
- * saxpy(&size, &ONEF, &syn0[indexes[m] * size], &ONE, neu1, &ONE)
- * for m in range(lbl_length):
- * if lbl_codelens[m] == 0: # <<<<<<<<<<<<<<
- * continue
- * else:
- */
- __pyx_t_3 = (((__pyx_v_lbl_codelens[__pyx_v_m]) == 0) != 0);
- if (__pyx_t_3) {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":479
- * for m in range(lbl_length):
- * if lbl_codelens[m] == 0:
- * continue # <<<<<<<<<<<<<<
- * else:
- * count += ONEF
- */
- goto __pyx_L8_continue;
- }
- /*else*/ {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":481
- * continue
- * else:
- * count += ONEF # <<<<<<<<<<<<<<
- * saxpy(&size, &ONEF, &syn0[lbl_indexes[m] * size], &ONE, neu1, &ONE)
- * if cbow_mean and count > (0.5):
- */
- __pyx_v_count = (__pyx_v_count + __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONEF);
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":482
- * else:
- * count += ONEF
- * saxpy(&size, &ONEF, &syn0[lbl_indexes[m] * size], &ONE, neu1, &ONE) # <<<<<<<<<<<<<<
- * if cbow_mean and count > (0.5):
- * inv_count = ONEF/count
- */
- __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_saxpy((&__pyx_v_size), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONEF), (&(__pyx_v_syn0[((__pyx_v_lbl_indexes[__pyx_v_m]) * __pyx_v_size)])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), __pyx_v_neu1, (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
- }
- __pyx_L8_continue:;
- }
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":483
- * count += ONEF
- * saxpy(&size, &ONEF, &syn0[lbl_indexes[m] * size], &ONE, neu1, &ONE)
- * if cbow_mean and count > (0.5): # <<<<<<<<<<<<<<
- * inv_count = ONEF/count
- * sscal(&size, &inv_count, neu1, &ONE)
- */
- __pyx_t_4 = (__pyx_v_cbow_mean != 0);
- if (__pyx_t_4) {
- } else {
- __pyx_t_3 = __pyx_t_4;
- goto __pyx_L12_bool_binop_done;
- }
- __pyx_t_4 = ((__pyx_v_count > ((__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t)0.5)) != 0);
- __pyx_t_3 = __pyx_t_4;
- __pyx_L12_bool_binop_done:;
- if (__pyx_t_3) {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":484
- * saxpy(&size, &ONEF, &syn0[lbl_indexes[m] * size], &ONE, neu1, &ONE)
- * if cbow_mean and count > (0.5):
- * inv_count = ONEF/count # <<<<<<<<<<<<<<
- * sscal(&size, &inv_count, neu1, &ONE)
- *
- */
- __pyx_v_inv_count = (__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONEF / __pyx_v_count);
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":485
- * if cbow_mean and count > (0.5):
- * inv_count = ONEF/count
- * sscal(&size, &inv_count, neu1, &ONE) # <<<<<<<<<<<<<<
- *
- * memset(work, 0, size * cython.sizeof(REAL_t))
- */
- __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_sscal((&__pyx_v_size), (&__pyx_v_inv_count), __pyx_v_neu1, (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
- goto __pyx_L11;
- }
- __pyx_L11:;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":487
- * sscal(&size, &inv_count, neu1, &ONE)
- *
- * memset(work, 0, size * cython.sizeof(REAL_t)) # <<<<<<<<<<<<<<
- *
- * for d in range(negative+1):
- */
- memset(__pyx_v_work, 0, (__pyx_v_size * (sizeof(__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t))));
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":489
- * memset(work, 0, size * cython.sizeof(REAL_t))
- *
- * for d in range(negative+1): # <<<<<<<<<<<<<<
- * if d == 0:
- * target_index = word_index
- */
- __pyx_t_5 = (__pyx_v_negative + 1);
- for (__pyx_t_1 = 0; __pyx_t_1 < __pyx_t_5; __pyx_t_1+=1) {
- __pyx_v_d = __pyx_t_1;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":490
- *
- * for d in range(negative+1):
- * if d == 0: # <<<<<<<<<<<<<<
- * target_index = word_index
- * label = ONEF
- */
- __pyx_t_3 = ((__pyx_v_d == 0) != 0);
- if (__pyx_t_3) {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":491
- * for d in range(negative+1):
- * if d == 0:
- * target_index = word_index # <<<<<<<<<<<<<<
- * label = ONEF
- * else:
- */
- __pyx_v_target_index = __pyx_v_word_index;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":492
- * if d == 0:
- * target_index = word_index
- * label = ONEF # <<<<<<<<<<<<<<
- * else:
- * target_index = table[(next_random >> 16) % table_len]
- */
- __pyx_v_label = __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONEF;
- goto __pyx_L16;
- }
- /*else*/ {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":494
- * label = ONEF
- * else:
- * target_index = table[(next_random >> 16) % table_len] # <<<<<<<<<<<<<<
- * next_random = (next_random * 25214903917ULL + 11) & modulo
- * if target_index == word_index:
- */
- __pyx_v_target_index = (__pyx_v_table[((__pyx_v_next_random >> 16) % __pyx_v_table_len)]);
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":495
- * else:
- * target_index = table[(next_random >> 16) % table_len]
- * next_random = (next_random * 25214903917ULL + 11) & modulo # <<<<<<<<<<<<<<
- * if target_index == word_index:
- * continue
- */
- __pyx_v_next_random = (((__pyx_v_next_random * ((unsigned PY_LONG_LONG)25214903917ULL)) + 11) & __pyx_v_modulo);
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":496
- * target_index = table[(next_random >> 16) % table_len]
- * next_random = (next_random * 25214903917ULL + 11) & modulo
- * if target_index == word_index: # <<<<<<<<<<<<<<
- * continue
- * label = 0.0
- */
- __pyx_t_3 = ((__pyx_v_target_index == __pyx_v_word_index) != 0);
- if (__pyx_t_3) {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":497
- * next_random = (next_random * 25214903917ULL + 11) & modulo
- * if target_index == word_index:
- * continue # <<<<<<<<<<<<<<
- * label = 0.0
- *
- */
- goto __pyx_L14_continue;
- }
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":498
- * if target_index == word_index:
- * continue
- * label = 0.0 # <<<<<<<<<<<<<<
- *
- * row2 = target_index * size
- */
- __pyx_v_label = ((__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t)0.0);
- }
- __pyx_L16:;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":500
- * label = 0.0
- *
- * row2 = target_index * size # <<<<<<<<<<<<<<
- * f = dsdot(&size, neu1, &ONE, &syn1neg[row2], &ONE)
- * if f <= -MAX_EXP or f >= MAX_EXP:
- */
- __pyx_v_row2 = (__pyx_v_target_index * __pyx_v_size);
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":501
- *
- * row2 = target_index * size
- * f = dsdot(&size, neu1, &ONE, &syn1neg[row2], &ONE) # <<<<<<<<<<<<<<
- * if f <= -MAX_EXP or f >= MAX_EXP:
- * continue
- */
- __pyx_v_f = ((__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t)__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_dsdot((&__pyx_v_size), __pyx_v_neu1, (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), (&(__pyx_v_syn1neg[__pyx_v_row2])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE)));
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":502
- * row2 = target_index * size
- * f = dsdot(&size, neu1, &ONE, &syn1neg[row2], &ONE)
- * if f <= -MAX_EXP or f >= MAX_EXP: # <<<<<<<<<<<<<<
- * continue
- * f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
- */
- __pyx_t_4 = ((__pyx_v_f <= -6.0) != 0);
- if (!__pyx_t_4) {
- } else {
- __pyx_t_3 = __pyx_t_4;
- goto __pyx_L19_bool_binop_done;
- }
- __pyx_t_4 = ((__pyx_v_f >= 6.0) != 0);
- __pyx_t_3 = __pyx_t_4;
- __pyx_L19_bool_binop_done:;
- if (__pyx_t_3) {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":503
- * f = dsdot(&size, neu1, &ONE, &syn1neg[row2], &ONE)
- * if f <= -MAX_EXP or f >= MAX_EXP:
- * continue # <<<<<<<<<<<<<<
- * f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
- * g = (label - f) * alpha
- */
- goto __pyx_L14_continue;
- }
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":504
- * if f <= -MAX_EXP or f >= MAX_EXP:
- * continue
- * f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))] # <<<<<<<<<<<<<<
- * g = (label - f) * alpha
- * saxpy(&size, &g, &syn1neg[row2], &ONE, work, &ONE)
- */
- __pyx_v_f = (__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_EXP_TABLE[((int)((__pyx_v_f + 6.0) * 83.0))]);
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":505
- * continue
- * f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
- * g = (label - f) * alpha # <<<<<<<<<<<<<<
- * saxpy(&size, &g, &syn1neg[row2], &ONE, work, &ONE)
- * if tw:
- */
- __pyx_v_g = ((__pyx_v_label - __pyx_v_f) * __pyx_v_alpha);
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":506
- * f = EXP_TABLE[((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
- * g = (label - f) * alpha
- * saxpy(&size, &g, &syn1neg[row2], &ONE, work, &ONE) # <<<<<<<<<<<<<<
- * if tw:
- * saxpy(&size, &g, neu1, &ONE, &syn1neg[row2], &ONE)
- */
- __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_saxpy((&__pyx_v_size), (&__pyx_v_g), (&(__pyx_v_syn1neg[__pyx_v_row2])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), __pyx_v_work, (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":507
- * g = (label - f) * alpha
- * saxpy(&size, &g, &syn1neg[row2], &ONE, work, &ONE)
- * if tw: # <<<<<<<<<<<<<<
- * saxpy(&size, &g, neu1, &ONE, &syn1neg[row2], &ONE)
- * if tw:
- */
- __pyx_t_3 = (__pyx_v_tw != 0);
- if (__pyx_t_3) {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":508
- * saxpy(&size, &g, &syn1neg[row2], &ONE, work, &ONE)
- * if tw:
- * saxpy(&size, &g, neu1, &ONE, &syn1neg[row2], &ONE) # <<<<<<<<<<<<<<
- * if tw:
- * for m in range(j,k):
- */
- __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_saxpy((&__pyx_v_size), (&__pyx_v_g), __pyx_v_neu1, (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), (&(__pyx_v_syn1neg[__pyx_v_row2])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
- goto __pyx_L21;
- }
- __pyx_L21:;
- __pyx_L14_continue:;
- }
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":509
- * if tw:
- * saxpy(&size, &g, neu1, &ONE, &syn1neg[row2], &ONE)
- * if tw: # <<<<<<<<<<<<<<
- * for m in range(j,k):
- * if m == i or codelens[m] == 0:
- */
- __pyx_t_3 = (__pyx_v_tw != 0);
- if (__pyx_t_3) {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":510
- * saxpy(&size, &g, neu1, &ONE, &syn1neg[row2], &ONE)
- * if tw:
- * for m in range(j,k): # <<<<<<<<<<<<<<
- * if m == i or codelens[m] == 0:
- * continue
- */
- __pyx_t_1 = __pyx_v_k;
- for (__pyx_t_2 = __pyx_v_j; __pyx_t_2 < __pyx_t_1; __pyx_t_2+=1) {
- __pyx_v_m = __pyx_t_2;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":511
- * if tw:
- * for m in range(j,k):
- * if m == i or codelens[m] == 0: # <<<<<<<<<<<<<<
- * continue
- * else:
- */
- __pyx_t_4 = ((__pyx_v_m == __pyx_v_i) != 0);
- if (!__pyx_t_4) {
- } else {
- __pyx_t_3 = __pyx_t_4;
- goto __pyx_L26_bool_binop_done;
- }
- __pyx_t_4 = (((__pyx_v_codelens[__pyx_v_m]) == 0) != 0);
- __pyx_t_3 = __pyx_t_4;
- __pyx_L26_bool_binop_done:;
- if (__pyx_t_3) {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":512
- * for m in range(j,k):
- * if m == i or codelens[m] == 0:
- * continue # <<<<<<<<<<<<<<
- * else:
- * saxpy(&size, &ONEF, work, &ONE, &syn0[indexes[m]*size], &ONE)
- */
- goto __pyx_L23_continue;
- }
- /*else*/ {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":514
- * continue
- * else:
- * saxpy(&size, &ONEF, work, &ONE, &syn0[indexes[m]*size], &ONE) # <<<<<<<<<<<<<<
- * if tl:
- * for m in range(lbl_length):
- */
- __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_saxpy((&__pyx_v_size), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONEF), __pyx_v_work, (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), (&(__pyx_v_syn0[((__pyx_v_indexes[__pyx_v_m]) * __pyx_v_size)])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
- }
- __pyx_L23_continue:;
- }
- goto __pyx_L22;
- }
- __pyx_L22:;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":515
- * else:
- * saxpy(&size, &ONEF, work, &ONE, &syn0[indexes[m]*size], &ONE)
- * if tl: # <<<<<<<<<<<<<<
- * for m in range(lbl_length):
- * if lbl_codelens[m] == 0:
- */
- __pyx_t_3 = (__pyx_v_tl != 0);
- if (__pyx_t_3) {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":516
- * saxpy(&size, &ONEF, work, &ONE, &syn0[indexes[m]*size], &ONE)
- * if tl:
- * for m in range(lbl_length): # <<<<<<<<<<<<<<
- * if lbl_codelens[m] == 0:
- * continue
- */
- __pyx_t_1 = __pyx_v_lbl_length;
- for (__pyx_t_2 = 0; __pyx_t_2 < __pyx_t_1; __pyx_t_2+=1) {
- __pyx_v_m = __pyx_t_2;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":517
- * if tl:
- * for m in range(lbl_length):
- * if lbl_codelens[m] == 0: # <<<<<<<<<<<<<<
- * continue
- * else:
- */
- __pyx_t_3 = (((__pyx_v_lbl_codelens[__pyx_v_m]) == 0) != 0);
- if (__pyx_t_3) {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":518
- * for m in range(lbl_length):
- * if lbl_codelens[m] == 0:
- * continue # <<<<<<<<<<<<<<
- * else:
- * saxpy(&size, &ONEF, work, &ONE, &syn0[lbl_indexes[m]*size], &ONE)
- */
- goto __pyx_L29_continue;
- }
- /*else*/ {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":520
- * continue
- * else:
- * saxpy(&size, &ONEF, work, &ONE, &syn0[lbl_indexes[m]*size], &ONE) # <<<<<<<<<<<<<<
- *
- * return next_random
- */
- __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_saxpy((&__pyx_v_size), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONEF), __pyx_v_work, (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), (&(__pyx_v_syn0[((__pyx_v_lbl_indexes[__pyx_v_m]) * __pyx_v_size)])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
- }
- __pyx_L29_continue:;
- }
- goto __pyx_L28;
- }
- __pyx_L28:;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":522
- * saxpy(&size, &ONEF, work, &ONE, &syn0[lbl_indexes[m]*size], &ONE)
- *
- * return next_random # <<<<<<<<<<<<<<
- *
- * cdef unsigned long long fast_sentence1_dm_neg(
- */
- __pyx_r = __pyx_v_next_random;
- goto __pyx_L0;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":454
- * syn0[lbl_indexes[m] * size + a] += work[a]
- *
- * cdef unsigned long long fast_sentence0_dm_neg( # <<<<<<<<<<<<<<
- * const int negative, np.uint32_t *table, unsigned long long table_len, int codelens[MAX_SENTENCE_LEN],
- * int lbl_codelens[MAX_SENTENCE_LEN], REAL_t *neu1, REAL_t *syn0, REAL_t *syn1neg, const int size,
- */
-
- /* function exit code */
- __pyx_L0:;
- return __pyx_r;
-}
-
-/* "trunk/gensim/models/doc2vec_inner.pyx":524
- * return next_random
- *
- * cdef unsigned long long fast_sentence1_dm_neg( # <<<<<<<<<<<<<<
- * const int negative, np.uint32_t *table, unsigned long long table_len, int codelens[MAX_SENTENCE_LEN],
- * int lbl_codelens[MAX_SENTENCE_LEN], REAL_t *neu1, REAL_t *syn0, REAL_t *syn1neg, const int size,
- */
-
-static unsigned PY_LONG_LONG __pyx_f_5trunk_6gensim_6models_13doc2vec_inner_fast_sentence1_dm_neg(int const __pyx_v_negative, __pyx_t_5numpy_uint32_t *__pyx_v_table, unsigned PY_LONG_LONG __pyx_v_table_len, int *__pyx_v_codelens, int *__pyx_v_lbl_codelens, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_neu1, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_syn0, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_syn1neg, int const __pyx_v_size, __pyx_t_5numpy_uint32_t *__pyx_v_indexes, __pyx_t_5numpy_uint32_t *__pyx_v_lbl_indexes, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t const __pyx_v_alpha, __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t *__pyx_v_work, int __pyx_v_i, int __pyx_v_j, int __pyx_v_k, int __pyx_v_cbow_mean, unsigned PY_LONG_LONG __pyx_v_next_random, int __pyx_v_lbl_length, int __pyx_v_tw, int __pyx_v_tl) {
- PY_LONG_LONG __pyx_v_row2;
- unsigned PY_LONG_LONG __pyx_v_modulo;
- __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t __pyx_v_f;
- __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t __pyx_v_g;
- __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t __pyx_v_count;
- __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t __pyx_v_inv_count;
- __pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t __pyx_v_label;
- __pyx_t_5numpy_uint32_t __pyx_v_target_index;
- __pyx_t_5numpy_uint32_t __pyx_v_word_index;
- int __pyx_v_d;
- int __pyx_v_m;
- unsigned PY_LONG_LONG __pyx_r;
- int __pyx_t_1;
- int __pyx_t_2;
- int __pyx_t_3;
- int __pyx_t_4;
- long __pyx_t_5;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":532
- * cdef long long a
- * cdef long long row2
- * cdef unsigned long long modulo = 281474976710655ULL # <<<<<<<<<<<<<<
- * cdef REAL_t f, g, count, inv_count, label
- * cdef np.uint32_t target_index, word_index
- */
- __pyx_v_modulo = 281474976710655ULL;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":537
- * cdef int d, m
- *
- * word_index = indexes[i] # <<<<<<<<<<<<<<
- *
- * memset(neu1, 0, size * cython.sizeof(REAL_t))
- */
- __pyx_v_word_index = (__pyx_v_indexes[__pyx_v_i]);
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":539
- * word_index = indexes[i]
- *
- * memset(neu1, 0, size * cython.sizeof(REAL_t)) # <<<<<<<<<<<<<<
- * count = 0.0
- * for m in range(j, k):
- */
- memset(__pyx_v_neu1, 0, (__pyx_v_size * (sizeof(__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t))));
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":540
- *
- * memset(neu1, 0, size * cython.sizeof(REAL_t))
- * count = 0.0 # <<<<<<<<<<<<<<
- * for m in range(j, k):
- * if m == i or codelens[m] == 0:
- */
- __pyx_v_count = ((__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t)0.0);
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":541
- * memset(neu1, 0, size * cython.sizeof(REAL_t))
- * count = 0.0
- * for m in range(j, k): # <<<<<<<<<<<<<<
- * if m == i or codelens[m] == 0:
- * continue
- */
- __pyx_t_1 = __pyx_v_k;
- for (__pyx_t_2 = __pyx_v_j; __pyx_t_2 < __pyx_t_1; __pyx_t_2+=1) {
- __pyx_v_m = __pyx_t_2;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":542
- * count = 0.0
- * for m in range(j, k):
- * if m == i or codelens[m] == 0: # <<<<<<<<<<<<<<
- * continue
- * else:
- */
- __pyx_t_4 = ((__pyx_v_m == __pyx_v_i) != 0);
- if (!__pyx_t_4) {
- } else {
- __pyx_t_3 = __pyx_t_4;
- goto __pyx_L6_bool_binop_done;
- }
- __pyx_t_4 = (((__pyx_v_codelens[__pyx_v_m]) == 0) != 0);
- __pyx_t_3 = __pyx_t_4;
- __pyx_L6_bool_binop_done:;
- if (__pyx_t_3) {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":543
- * for m in range(j, k):
- * if m == i or codelens[m] == 0:
- * continue # <<<<<<<<<<<<<<
- * else:
- * count += ONEF
- */
- goto __pyx_L3_continue;
- }
- /*else*/ {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":545
- * continue
- * else:
- * count += ONEF # <<<<<<<<<<<<<<
- * saxpy(&size, &ONEF, &syn0[indexes[m] * size], &ONE, neu1, &ONE)
- * for m in range(lbl_length):
- */
- __pyx_v_count = (__pyx_v_count + __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONEF);
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":546
- * else:
- * count += ONEF
- * saxpy(&size, &ONEF, &syn0[indexes[m] * size], &ONE, neu1, &ONE) # <<<<<<<<<<<<<<
- * for m in range(lbl_length):
- * if lbl_codelens[m] == 0:
- */
- __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_saxpy((&__pyx_v_size), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONEF), (&(__pyx_v_syn0[((__pyx_v_indexes[__pyx_v_m]) * __pyx_v_size)])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), __pyx_v_neu1, (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
- }
- __pyx_L3_continue:;
- }
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":547
- * count += ONEF
- * saxpy(&size, &ONEF, &syn0[indexes[m] * size], &ONE, neu1, &ONE)
- * for m in range(lbl_length): # <<<<<<<<<<<<<<
- * if lbl_codelens[m] == 0:
- * continue
- */
- __pyx_t_1 = __pyx_v_lbl_length;
- for (__pyx_t_2 = 0; __pyx_t_2 < __pyx_t_1; __pyx_t_2+=1) {
- __pyx_v_m = __pyx_t_2;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":548
- * saxpy(&size, &ONEF, &syn0[indexes[m] * size], &ONE, neu1, &ONE)
- * for m in range(lbl_length):
- * if lbl_codelens[m] == 0: # <<<<<<<<<<<<<<
- * continue
- * else:
- */
- __pyx_t_3 = (((__pyx_v_lbl_codelens[__pyx_v_m]) == 0) != 0);
- if (__pyx_t_3) {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":549
- * for m in range(lbl_length):
- * if lbl_codelens[m] == 0:
- * continue # <<<<<<<<<<<<<<
- * else:
- * count += ONEF
- */
- goto __pyx_L8_continue;
- }
- /*else*/ {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":551
- * continue
- * else:
- * count += ONEF # <<<<<<<<<<<<<<
- * saxpy(&size, &ONEF, &syn0[lbl_indexes[m] * size], &ONE, neu1, &ONE)
- * if cbow_mean and count > (0.5):
- */
- __pyx_v_count = (__pyx_v_count + __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONEF);
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":552
- * else:
- * count += ONEF
- * saxpy(&size, &ONEF, &syn0[lbl_indexes[m] * size], &ONE, neu1, &ONE) # <<<<<<<<<<<<<<
- * if cbow_mean and count > (0.5):
- * inv_count = ONEF/count
- */
- __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_saxpy((&__pyx_v_size), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONEF), (&(__pyx_v_syn0[((__pyx_v_lbl_indexes[__pyx_v_m]) * __pyx_v_size)])), (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE), __pyx_v_neu1, (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
- }
- __pyx_L8_continue:;
- }
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":553
- * count += ONEF
- * saxpy(&size, &ONEF, &syn0[lbl_indexes[m] * size], &ONE, neu1, &ONE)
- * if cbow_mean and count > (0.5): # <<<<<<<<<<<<<<
- * inv_count = ONEF/count
- * sscal(&size, &inv_count, neu1, &ONE)
- */
- __pyx_t_4 = (__pyx_v_cbow_mean != 0);
- if (__pyx_t_4) {
- } else {
- __pyx_t_3 = __pyx_t_4;
- goto __pyx_L12_bool_binop_done;
- }
- __pyx_t_4 = ((__pyx_v_count > ((__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t)0.5)) != 0);
- __pyx_t_3 = __pyx_t_4;
- __pyx_L12_bool_binop_done:;
- if (__pyx_t_3) {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":554
- * saxpy(&size, &ONEF, &syn0[lbl_indexes[m] * size], &ONE, neu1, &ONE)
- * if cbow_mean and count > (0.5):
- * inv_count = ONEF/count # <<<<<<<<<<<<<<
- * sscal(&size, &inv_count, neu1, &ONE)
- *
- */
- __pyx_v_inv_count = (__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONEF / __pyx_v_count);
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":555
- * if cbow_mean and count > (0.5):
- * inv_count = ONEF/count
- * sscal(&size, &inv_count, neu1, &ONE) # <<<<<<<<<<<<<<
- *
- * memset(work, 0, size * cython.sizeof(REAL_t))
- */
- __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_sscal((&__pyx_v_size), (&__pyx_v_inv_count), __pyx_v_neu1, (&__pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONE));
- goto __pyx_L11;
- }
- __pyx_L11:;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":557
- * sscal(&size, &inv_count, neu1, &ONE)
- *
- * memset(work, 0, size * cython.sizeof(REAL_t)) # <<<<<<<<<<<<<<
- *
- * for d in range(negative+1):
- */
- memset(__pyx_v_work, 0, (__pyx_v_size * (sizeof(__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t))));
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":559
- * memset(work, 0, size * cython.sizeof(REAL_t))
- *
- * for d in range(negative+1): # <<<<<<<<<<<<<<
- * if d == 0:
- * target_index = word_index
- */
- __pyx_t_5 = (__pyx_v_negative + 1);
- for (__pyx_t_1 = 0; __pyx_t_1 < __pyx_t_5; __pyx_t_1+=1) {
- __pyx_v_d = __pyx_t_1;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":560
- *
- * for d in range(negative+1):
- * if d == 0: # <<<<<<<<<<<<<<
- * target_index = word_index
- * label = ONEF
- */
- __pyx_t_3 = ((__pyx_v_d == 0) != 0);
- if (__pyx_t_3) {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":561
- * for d in range(negative+1):
- * if d == 0:
- * target_index = word_index # <<<<<<<<<<<<<<
- * label = ONEF
- * else:
- */
- __pyx_v_target_index = __pyx_v_word_index;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":562
- * if d == 0:
- * target_index = word_index
- * label = ONEF # <<<<<<<<<<<<<<
- * else:
- * target_index = table[(next_random >> 16) % table_len]
- */
- __pyx_v_label = __pyx_v_5trunk_6gensim_6models_13doc2vec_inner_ONEF;
- goto __pyx_L16;
- }
- /*else*/ {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":564
- * label = ONEF
- * else:
- * target_index = table[(next_random >> 16) % table_len] # <<<<<<<<<<<<<<
- * next_random = (next_random * 25214903917ULL + 11) & modulo
- * if target_index == word_index:
- */
- __pyx_v_target_index = (__pyx_v_table[((__pyx_v_next_random >> 16) % __pyx_v_table_len)]);
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":565
- * else:
- * target_index = table[(next_random >> 16) % table_len]
- * next_random = (next_random * 25214903917ULL + 11) & modulo # <<<<<<<<<<<<<<
- * if target_index == word_index:
- * continue
- */
- __pyx_v_next_random = (((__pyx_v_next_random * ((unsigned PY_LONG_LONG)25214903917ULL)) + 11) & __pyx_v_modulo);
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":566
- * target_index = table[(next_random >> 16) % table_len]
- * next_random = (next_random * 25214903917ULL + 11) & modulo
- * if target_index == word_index: # <<<<<<<<<<<<<<
- * continue
- * label = 0.0
- */
- __pyx_t_3 = ((__pyx_v_target_index == __pyx_v_word_index) != 0);
- if (__pyx_t_3) {
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":567
- * next_random = (next_random * 25214903917ULL + 11) & modulo
- * if target_index == word_index:
- * continue # <<<<<<<<<<<<<<
- * label = 0.0
- *
- */
- goto __pyx_L14_continue;
- }
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":568
- * if target_index == word_index:
- * continue
- * label = 0.0 # <<<<<<<<<<<<<<
- *
- * row2 = target_index * size
- */
- __pyx_v_label = ((__pyx_t_5trunk_6gensim_6models_13doc2vec_inner_REAL_t)0.0);
- }
- __pyx_L16:;
-
- /* "trunk/gensim/models/doc2vec_inner.pyx":570
- * label =