diff --git a/docs/notebooks/wikinews-bigram-en.ipynb b/docs/notebooks/wikinews-bigram-en.ipynb new file mode 100644 index 0000000000..89ef9c3ec0 --- /dev/null +++ b/docs/notebooks/wikinews-bigram-en.ipynb @@ -0,0 +1,397 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Illustrating common terms usage using Wikinews in english" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## getting data\n", + "\n", + "We get the cirrussearch dump of wikinews (a dump meant for elastic-search indexation)." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "LANG=\"english\"" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "%%bash\n", + "\n", + "fdate=20170327\n", + "fname=enwikinews-$fdate-cirrussearch-content.json.gz\n", + "if [ ! -e $fname ]\n", + "then\n", + " wget \"https://dumps.wikimedia.org/other/cirrussearch/$fdate/$fname\"\n", + "fi\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# iterator\n", + "import gzip\n", + "import json\n", + "\n", + "FDATE = 20170327\n", + "FNAME = \"enwikinews-%s-cirrussearch-content.json.gz\" % FDATE\n", + "\n", + "def iter_texts(fpath=FNAME):\n", + " with gzip.open(fpath, \"rt\") as f:\n", + " for l in f:\n", + " data = json.loads(l)\n", + " if \"title\" in data:\n", + " yield data[\"title\"]\n", + " yield data[\"text\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...\n", + "[nltk_data] Package punkt is already up-to-date!\n", + "[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...\n", + "[nltk_data] Package stopwords is already 
up-to-date!\n" + ] + }, + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# also prepare nltk\n", + "import nltk\n", + "nltk.download(\"punkt\")\n", + "nltk.download(\"stopwords\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preparing data\n", + "\n", + "we arrange the corpus as required by gensim" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# make a custom tokenizer\n", + "import re\n", + "from nltk.tokenize import sent_tokenize\n", + "from nltk.tokenize import RegexpTokenizer\n", + "tokenizer = RegexpTokenizer('\\w[\\w-]*|\\d[\\d,]*')" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# prepare a text\n", + "def prepare(txt):\n", + " # lower case\n", + " txt = txt.lower()\n", + " return [tokenizer.tokenize(sent) \n", + " for sent in sent_tokenize(txt, language=LANG)]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# we put all data in ram, it's not so much\n", + "corpus = []\n", + "for txt in iter_texts():\n", + " corpus.extend(prepare(txt))" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Corpus has 1003521 words in 46159 sentences\n" + ] + } + ], + "source": [ + "# how many sentences and words ?\n", + "words_count = sum(len(s) for s in corpus)\n", + "print(\"Corpus has %d words in %d sentences\" % (words_count, len(corpus)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Testing bigram with and without common terms\n", + "\n", + "The `Phrases` model gives us the possiblity of handling common terms, that is words that appears 
many times in a text and are there only to link objects between them.\n", + "While you could remove them, you may lose information, for *\"the president is in america\"* is not the same as *\"the president of america\"*\n", + "\n", + "The common_terms parameter of Phrases can help you deal with them in a smarter way, keeping them around while preventing them from skewing frequency statistics." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using TensorFlow backend.\n" + ] + } + ], + "source": [ + "from gensim.models.phrases import Phrases" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'i me my myself we our ours ourselves you your yours yourself yourselves he him his himself she her hers herself it its itself they them their theirs themselves what which who whom this that these those am is are was were be been being have has had having do does did doing a an the and but if or because as until while of at by for with about against between into through during before after above below to from up down in out on off over under again further then once here there when where why how all any both each few more most other some such no nor not only own same so than too very s t can will just don should now d ll m o re ve y ain aren couldn didn doesn hadn hasn haven isn ma mightn mustn needn shan shouldn wasn weren won wouldn'" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# which are the stop words we will use\n", + "from nltk.corpus import stopwords\n", + "\" \".join(stopwords.words(LANG))" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# a version of corpus without stop words\n", + "stop_words = 
frozenset(stopwords.words(LANG))\n", + "def stopwords_filter(txt):\n", + " return [w for w in txt if w not in stop_words]\n", + "st_corpus = [stopwords_filter(txt) for txt in corpus]" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 1.33 s, sys: 16 ms, total: 1.34 s\n", + "Wall time: 1.34 s\n", + "CPU times: user 1.64 s, sys: 24 ms, total: 1.67 s\n", + "Wall time: 1.67 s\n" + ] + } + ], + "source": [ + "# bigram std\n", + "%time bigram = Phrases(st_corpus)\n", + "# bigram with common terms\n", + "%time bigram_ct = Phrases(corpus, common_terms=stopwords.words(LANG))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### bigram with common terms inside\n", + "\n", + "What are (some of) the bigram founds thanks to common terms" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "510 grams with common terms found\n" + ] + }, + { + "data": { + "text/plain": [ + "[(5339.47619047619, 'borussia m gladbach'),\n", + " (5460.194782608696, 'billboard in jakarta'),\n", + " (5606.450000000001, 'christ of latter-day'),\n", + " (5862.954248366013, 'skull and bones'),\n", + " (6006.910714285714, 'preserved in amber'),\n", + " (6129.452168746287, 'aisyah and doan'),\n", + " (6158.114416475973, 'funded by your generous'),\n", + " (6407.371428571429, 'restored as burkina'),\n", + " (7081.831578947369, 'click on the donate'),\n", + " (7234.129032258064, 'qatar of intervening'),\n", + " (7377.621673923561, 'sinks in suva'),\n", + " (8146.123931623933, 'lahm to hang'),\n", + " (8163.0819009100105, 'istanbul s ataturk'),\n", + " (8305.851851851852, 'derails in tabasco'),\n", + " (9060.929292929293, 'poet of apostasy'),\n", + " (9593.925133689841, 'creator of kinder'),\n", + " (10512.09375, 'consulate in irbil'),\n", + " 
(12176.904977375565, 'newsworthy and entertaining'),\n", + " (15829.976470588235, 'santos over nepotism'),\n", + " (16272.689342403628, 'hotness of bhut')]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# grams that have more than 2 terms, are those with common terms\n", + "ct_ngrams = set((g[1], g[0].decode(\"utf-8\"))\n", + " for g in bigram_ct.export_phrases(corpus) \n", + " if len(g[0].split()) > 2)\n", + "ct_ngrams = sorted(list(ct_ngrams))\n", + "print(len(ct_ngrams), \"grams with common terms found\")\n", + "# highest scores\n", + "ct_ngrams[-20:]" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "location-united : ['location of the united', 'location of united']\n", + "magnitude-6 : ['magnitude 6', 'magnitude of 6']\n", + "tuition-fees : ['tuition and fees', 'tuition fees']\n", + "pleaded-guilty : ['pleaded not guilty', 'pleaded guilty']\n", + "found-guilty : ['found not guilty', 'found guilty']\n", + "france-germany : ['france germany', 'france and germany']\n", + "earlier-week : ['earlier this week', 'earlier in the week']\n", + "since-2003 : ['since 2003', 'since the 2003']\n", + "contact-admissions : ['contact the admissions', 'contact admissions']\n", + "created-text : ['created from text', 'created from the text']\n", + "external-inter-wiki : ['external and inter-wiki', 'external inter-wiki']\n" + ] + } + ], + "source": [ + "# did we find any bigram with same words but different stopwords\n", + "import collections\n", + "by_terms = collections.defaultdict(set)\n", + "for ngram, score in bigram_ct.export_phrases(corpus):\n", + " grams = ngram.split()\n", + " by_terms[(grams[0], grams[-1])].add(ngram)\n", + "for k, v in by_terms.items():\n", + " if len(v) > 1:\n", + " print(b\"-\".join(k).decode(\"utf-8\"),\" : \", [w.decode(\"utf-8\") for w in v])\n" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index f7888e87f0..0c0ad58dd7 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -52,6 +52,22 @@ >>> print(trigram[bigram[sent]]) [u'the', u'new_york_times', u'is', u'a', u'newspaper'] +The common_terms parameter adds a way to give special treatment to common terms (aka stop words) +such that their presence between two words +won't prevent bigram detection. +It allows detecting expressions like "bank of america" or "eye of the beholder". + +>>> common_terms = ["of", "with", "without", "and", "or", "the", "a"] +>>> ct_phrases = Phrases(sentence_stream, common_terms=common_terms) + +The phraser will of course inherit the common_terms from Phrases. + +>>> ct_bigram = Phraser(ct_phrases) +>>> sent = [u'the', u'mayor', u'shows', u'his', u'lack', u'of', u'interest'] +>>> print(ct_bigram[sent]) +[u'the', u'mayor', u'shows', u'his', u'lack_of_interest'] + + .. [1] Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean. Distributed Representations of Words and Phrases and their Compositionality. In Proceedings of NIPS, 2013. @@ -63,6 +79,7 @@ import logging import warnings from collections import defaultdict +import functools as ft import itertools as it from math import log from inspect import getargspec @@ -75,6 +92,7 @@ logger = logging.getLogger(__name__) + def _is_single(obj): """ Check whether `obj` is a single document or an entire corpus. 
@@ -99,7 +117,67 @@ def _is_single(obj): return False, obj_iter -class Phrases(interfaces.TransformationABC): +class SentenceAnalyzer(object): + + def score_item(self, worda, wordb, components, scorer): + vocab = self.vocab + if worda in vocab and wordb in vocab: + bigram = self.delimiter.join(components) + if bigram in vocab: + return scorer( + worda_count=float(vocab[worda]), + wordb_count=float(vocab[wordb]), + bigram_count=float(vocab[bigram])) + return -1 + + def analyze_sentence(self, sentence, threshold, common_terms, scorer): + """Analyze a sentence + + `sentence` a token list representing the sentence to be analyzed. + + `threshold` the minimum score for a bigram to be taken into account + + `common_terms` the list of common terms, they have a special treatment + + `scorer` the scorer function, as given to Phrases + """ + s = [utils.any2utf8(w) for w in sentence] + last_uncommon = None + in_between = [] + # adding None is a trick that helps getting an automatic happy ending + # has it won't be a common_word, nor score + for word in s + [None]: + is_common = word in common_terms + if not is_common and last_uncommon: + chain = [last_uncommon] + in_between + [word] + # test between last_uncommon + score = self.score_item( + worda=last_uncommon, + wordb=word, + components=chain, + scorer=scorer, + ) + if score > threshold: + yield (chain, score) + last_uncommon = None + in_between = [] + else: + # release words individually + for w in it.chain([last_uncommon], in_between): + yield (w, None) + in_between = [] + last_uncommon = word + elif not is_common: + last_uncommon = word + else: # common term + if last_uncommon: + # wait for uncommon resolution + in_between.append(word) + else: + yield (word, None) + + +class Phrases(SentenceAnalyzer, interfaces.TransformationABC): """ Detect phrases, based on collected collocation counts. Adjacent words that appear together more frequently than expected are joined together with the `_` character. 
@@ -109,8 +187,9 @@ class Phrases(interfaces.TransformationABC): """ - def __init__(self, sentences=None, min_count=5, threshold=10.0, max_vocab_size=40000000, - delimiter=b'_', progress_per=10000, scoring='default'): + def __init__(self, sentences=None, min_count=5, threshold=10.0, + max_vocab_size=40000000, delimiter=b'_', progress_per=10000, + scoring='default', common_terms=frozenset()): """ Initialize the model from an iterable of `sentences`. Each sentence must be a list of words (unicode strings) that will be used for training. @@ -168,6 +247,8 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0, max_vocab_size=4 A scoring function without any of these parameters (even if the parameters are not used) will raise a ValueError on initialization of the Phrases class. The scoring function must be picklable. + `common_terms` is an optionnal list of "stop words" that won't affect frequency count + of expressions containing them. """ if min_count <= 0: raise ValueError("min_count should be at least 1") @@ -204,6 +285,7 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0, max_vocab_size=4 self.delimiter = delimiter self.progress_per = progress_per self.corpus_word_count = 0 + self.common_terms = frozenset(utils.any2utf8(w) for w in common_terms) # ensure picklability of custom scorer try: @@ -226,7 +308,8 @@ def __str__(self): ) @staticmethod - def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000): + def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000, + common_terms=frozenset()): """Collect unigram/bigram counts from the `sentences` iterable.""" sentence_no = -1 total_words = 0 @@ -237,17 +320,21 @@ def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000): if sentence_no % progress_per == 0: logger.info( "PROGRESS: at sentence #%i, processed %i words and %i word types", - sentence_no, total_words, len(vocab) + sentence_no, total_words, len(vocab), ) - sentence = 
[utils.any2utf8(w) for w in sentence] - for bigram in zip(sentence, sentence[1:]): - vocab[bigram[0]] += 1 - vocab[delimiter.join(bigram)] += 1 - total_words += 1 - - if sentence: # add last word skipped by previous loop - word = sentence[-1] - vocab[word] += 1 + s = [utils.any2utf8(w) for w in sentence] + last_uncommon = None + in_between = [] + for word in s: + if word not in common_terms: + vocab[word] += 1 + if last_uncommon is not None: + components = it.chain([last_uncommon], in_between, [word]) + vocab[delimiter.join(components)] += 1 + last_uncommon = word + in_between = [] + elif last_uncommon is not None: + in_between.append(word) total_words += 1 if len(vocab) > max_vocab_size: @@ -270,7 +357,8 @@ def add_vocab(self, sentences): # directly, but gives the new sentences a fighting chance to collect # sufficient counts, before being pruned out by the (large) accummulated # counts collected in previous learn_vocab runs. - min_reduce, vocab, total_words = self.learn_vocab(sentences, self.max_vocab_size, self.delimiter, self.progress_per) + min_reduce, vocab, total_words = self.learn_vocab( + sentences, self.max_vocab_size, self.delimiter, self.progress_per, self.common_terms) self.corpus_word_count += total_words if len(self.vocab) > 0: @@ -300,41 +388,26 @@ def export_phrases(self, sentences, out_delimiter=b' ', as_tuples=False): then you can debug the threshold with generated tsv """ - - vocab = self.vocab - threshold = self.threshold - delimiter = self.delimiter # delimiter used for lookup - min_count = self.min_count - scorer = self.scoring - # made floats for scoring function - len_vocab = float(len(vocab)) - scorer_min_count = float(min_count) - corpus_word_count = float(self.corpus_word_count) - + analyze_sentence = ft.partial( + self.analyze_sentence, + threshold=self.threshold, + common_terms=self.common_terms, + scorer=ft.partial( + self.scoring, + len_vocab=float(len(self.vocab)), + min_count=float(self.min_count), + 
corpus_word_count=float(self.corpus_word_count), + ), + ) for sentence in sentences: - s = [utils.any2utf8(w) for w in sentence] - last_bigram = False - - for word_a, word_b in zip(s, s[1:]): - # last bigram check was moved here to save a few CPU cycles - if word_a in vocab and word_b in vocab and not last_bigram: - bigram_word = delimiter.join((word_a, word_b)) - if bigram_word in vocab: - count_a = float(vocab[word_a]) - count_b = float(vocab[word_b]) - count_ab = float(vocab[bigram_word]) - # scoring MUST have all these parameters, even if they are not used - score = scorer(worda_count=count_a, wordb_count=count_b, bigram_count=count_ab, len_vocab=len_vocab, min_count=scorer_min_count, corpus_word_count=corpus_word_count) - # logger.debug("score for %s: (pab=%s - min_count=%s) / pa=%s / pb=%s * vocab_size=%s = %s", - # bigram_word, count_ab, scorer_min_count, count_a, count_ab, len_vocab, score) - if score > threshold and count_ab >= min_count: - if as_tuples: - yield ((word_a, word_b), score) - else: - yield (out_delimiter.join((word_a, word_b)), score) - last_bigram = True - continue - last_bigram = False + bigrams = analyze_sentence(sentence) + # keeps only not None scores + filtered = ((words, score) for words, score in bigrams if score is not None) + for words, score in filtered: + if as_tuples: + yield (tuple(words), score) + else: + yield (out_delimiter.join(words), score) def __getitem__(self, sentence): """ @@ -357,15 +430,7 @@ def __getitem__(self, sentence): """ warnings.warn("For a faster implementation, use the gensim.models.phrases.Phraser class") - vocab = self.vocab - threshold = self.threshold delimiter = self.delimiter # delimiter used for lookup - min_count = self.min_count - scorer = self.scoring - # made floats for scoring function - len_vocab = float(len(vocab)) - scorer_min_count = float(min_count) - corpus_word_count = float(self.corpus_word_count) is_single, sentence = _is_single(sentence) if not is_single: @@ -373,35 +438,23 @@ def 
__getitem__(self, sentence): # return an iterable stream. return self._apply(sentence) - s, new_s = [utils.any2utf8(w) for w in sentence], [] - last_bigram = False - vocab = self.vocab - - for word_a, word_b in zip(s, s[1:]): - # last bigram check was moved here to save a few CPU cycles - if word_a in vocab and word_b in vocab and not last_bigram: - bigram_word = delimiter.join((word_a, word_b)) - if bigram_word in vocab: - count_a = float(vocab[word_a]) - count_b = float(vocab[word_b]) - count_ab = float(vocab[bigram_word]) - # scoring MUST have all these parameters, even if they are not used - score = scorer(worda_count=count_a, wordb_count=count_b, bigram_count=count_ab, len_vocab=len_vocab, min_count=scorer_min_count, corpus_word_count=corpus_word_count) - # logger.debug("score for %s: (pab=%s - min_count=%s) / pa=%s / pb=%s * vocab_size=%s = %s", - # bigram_word, count_ab, scorer_min_count, count_a, count_ab, len_vocab, score) - if score > threshold and count_ab >= min_count: - new_s.append(bigram_word) - last_bigram = True - continue - - if not last_bigram: - new_s.append(word_a) - last_bigram = False - - if s: # add last word skipped by previous loop - last_token = s[-1] - if not last_bigram: - new_s.append(last_token) + delimiter = self.delimiter + bigrams = self.analyze_sentence( + sentence, + threshold=self.threshold, + common_terms=self.common_terms, + scorer=ft.partial( + self.scoring, + len_vocab=float(len(self.vocab)), + min_count=float(self.min_count), + corpus_word_count=float(self.corpus_word_count), + ), + ) + new_s = [] + for words, score in bigrams: + if score is not None: + words = delimiter.join(words) + new_s.append(words) return [utils.to_unicode(w) for w in new_s] @@ -457,17 +510,24 @@ def npmi_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, co return log(pab / (pa * pb)) / -log(pab) -def pseudocorpus(source_vocab, sep): +def pseudocorpus(source_vocab, sep, common_terms=frozenset()): """Feeds source_vocab's compound 
keys back to it, to discover phrases""" for k in source_vocab: if sep not in k: continue unigrams = k.split(sep) for i in range(1, len(unigrams)): - yield [sep.join(unigrams[:i]), sep.join(unigrams[i:])] + if unigrams[i - 1] not in common_terms: + # do not join common terms + cterms = list(it.takewhile(lambda w: w in common_terms, unigrams[i:])) + tail = unigrams[i + len(cterms):] + components = [sep.join(unigrams[:i])] + cterms + if tail: + components.append(sep.join(tail)) + yield components -class Phraser(interfaces.TransformationABC): +class Phraser(SentenceAnalyzer, interfaces.TransformationABC): """ Minimal state & functionality to apply results of a Phrases model to tokens. @@ -485,8 +545,9 @@ def __init__(self, phrases_model): self.min_count = phrases_model.min_count self.delimiter = phrases_model.delimiter self.scoring = phrases_model.scoring + self.common_terms = phrases_model.common_terms + corpus = self.pseudocorpus(phrases_model) self.phrasegrams = {} - corpus = pseudocorpus(phrases_model.vocab, phrases_model.delimiter) logger.info('source_vocab length %i', len(phrases_model.vocab)) count = 0 for bigram, score in phrases_model.export_phrases(corpus, self.delimiter, as_tuples=True): @@ -498,6 +559,18 @@ def __init__(self, phrases_model): logger.info('Phraser added %i phrasegrams', count) logger.info('Phraser built with %i %i phrasegrams', count, len(self.phrasegrams)) + def pseudocorpus(self, phrases_model): + return pseudocorpus(phrases_model.vocab, phrases_model.delimiter, + phrases_model.common_terms) + + def score_item(self, worda, wordb, components, scorer): + """score is retained from original dataset + """ + try: + return self.phrasegrams[tuple(components)][1] + except KeyError: + return -1 + def __getitem__(self, sentence): """ Convert the input tokens `sentence` (=list of unicode strings) into phrase @@ -515,27 +588,17 @@ def __getitem__(self, sentence): # return an iterable stream. 
return self._apply(sentence) - s, new_s = [utils.any2utf8(w) for w in sentence], [] - last_bigram = False - phrasegrams = self.phrasegrams delimiter = self.delimiter - for word_a, word_b in zip(s, s[1:]): - bigram_tuple = (word_a, word_b) - if phrasegrams.get(bigram_tuple, (-1, -1))[1] > self.threshold and not last_bigram: - bigram_word = delimiter.join((word_a, word_b)) - new_s.append(bigram_word) - last_bigram = True - continue - - if not last_bigram: - new_s.append(word_a) - last_bigram = False - - if s: # add last word skipped by previous loop - last_token = s[-1] - if not last_bigram: - new_s.append(last_token) - + bigrams = self.analyze_sentence( + sentence, + threshold=self.threshold, + common_terms=self.common_terms, + scorer=None) # we will use our score_item function redefinition + new_s = [] + for words, score in bigrams: + if score is not None: + words = delimiter.join(words) + new_s.append(words) return [utils.to_unicode(w) for w in new_s] diff --git a/gensim/test/test_phrases.py b/gensim/test/test_phrases.py index cf008f14cc..faf0127297 100644 --- a/gensim/test/test_phrases.py +++ b/gensim/test/test_phrases.py @@ -14,7 +14,7 @@ import sys from gensim import utils -from gensim.models.phrases import Phrases, Phraser +from gensim.models.phrases import SentenceAnalyzer, Phrases, Phraser, pseudocorpus if sys.version_info[0] >= 3: unicode = str @@ -23,33 +23,155 @@ datapath = lambda fname: os.path.join(module_path, 'test_data', fname) -sentences = [ - ['human', 'interface', 'computer'], - ['survey', 'user', 'computer', 'system', 'response', 'time'], - ['eps', 'user', 'interface', 'system'], - ['system', 'human', 'system', 'eps'], - ['user', 'response', 'time'], - ['trees'], - ['graph', 'trees'], - ['graph', 'minors', 'trees'], - ['graph', 'minors', 'survey'], - ['graph', 'minors', 'survey', 'human', 'interface'] # test bigrams within same sentence -] -unicode_sentences = [[utils.to_unicode(w) for w in sentence] for sentence in sentences] - - -def 
gen_sentences(): - return ((w for w in sentence) for sentence in sentences) - - -class TestPhrasesCommon(unittest.TestCase): +class TestUtils(unittest.TestCase): + + def test_pseudocorpus_no_common_terms(self): + vocab = [ + "prime_minister", + "gold", + "chief_technical_officer", + "effective"] + result = list(pseudocorpus(vocab, "_")) + self.assertEqual( + result, + [["prime", "minister"], + ["chief", "technical_officer"], + ["chief_technical", "officer"]]) + + def test_pseudocorpus_with_common_terms(self): + vocab = [ + "hall_of_fame", + "gold", + "chief_of_political_bureau", + "effective", + "beware_of_the_dog_in_the_yard"] + common_terms = frozenset(["in", "the", "of"]) + result = list(pseudocorpus(vocab, "_", common_terms=common_terms)) + self.assertEqual( + result, + [["hall", "of", "fame"], + ["chief", "of", "political_bureau"], + ["chief_of_political", "bureau"], + ["beware", "of", "the", "dog_in_the_yard"], + ["beware_of_the_dog", "in", "the", "yard"]]) + + +class TestPhraseAnalysis(unittest.TestCase): + + class AnalysisTester(SentenceAnalyzer): + + def __init__(self, scores): + self.scores = scores + + def score_item(self, worda, wordb, components, scorer): + """Override for test purpose""" + if worda is not None and wordb is not None: + bigram_word = b"_".join(components) + return self.scores.get(bigram_word, -1) + else: + return -1 + + def analyze(self, scores, sentence): + analyzer = self.AnalysisTester(scores) + return list(analyzer.analyze_sentence( + sentence, + threshold=1, + common_terms={b"a", b"the", b"with", b"of"}, + scorer=None)) + + def analyze_words(self, scores, sentence): + result = ( + w if isinstance(w, (tuple, list)) else [w] + for w, score in self.analyze(scores, sentence)) + return [b"_".join(w).decode("utf-8") for w in result] + + def test_simple_analysis(self): + s = ["simple", "sentence", "should", "pass"] + result = self.analyze_words({}, s) + self.assertEqual(result, s) + s = ["a", "simple", "sentence", "with", "no", "bigram", 
"but", "common", "terms"] + result = self.analyze_words({}, s) + self.assertEqual(result, s) + + def test_analysis_bigrams(self): + scores = { + b"simple_sentence": 2, b"sentence_many": 2, + b"many_possible": 2, b"possible_bigrams": 2} + s = ["simple", "sentence", "many", "possible", "bigrams"] + result = self.analyze_words(scores, s) + self.assertEqual(result, ["simple_sentence", "many_possible", "bigrams"]) + + s = ["some", "simple", "sentence", "many", "bigrams"] + result = self.analyze_words(scores, s) + self.assertEqual(result, ["some", "simple_sentence", "many", "bigrams"]) + + s = ["some", "unrelated", "simple", "words"] + result = self.analyze_words(scores, s) + self.assertEqual(result, s) + + def test_analysis_common_terms(self): + scores = { + b"simple_sentence": 2, b"sentence_many": 2, + b"many_possible": 2, b"possible_bigrams": 2} + s = ["a", "simple", "sentence", "many", "the", "possible", "bigrams"] + result = self.analyze_words(scores, s) + self.assertEqual(result, ["a", "simple_sentence", "many", "the", "possible_bigrams"]) + + s = ["simple", "the", "sentence", "and", "many", "possible", "bigrams", "with", "a"] + result = self.analyze_words(scores, s) + self.assertEqual(result, [ + "simple", "the", "sentence", "and", "many_possible", "bigrams", "with", "a"]) + + def test_analysis_common_terms_in_between(self): + scores = { + b"simple_sentence": 2, b"sentence_with_many": 2, + b"many_possible": 2, b"many_of_the_possible": 2, b"possible_bigrams": 2} + s = ["sentence", "with", "many", "possible", "bigrams"] + result = self.analyze_words(scores, s) + self.assertEqual(result, ["sentence_with_many", "possible_bigrams"]) + + s = ["a", "simple", "sentence", "with", "many", "of", "the", "possible", "bigrams", "with"] + result = self.analyze_words(scores, s) + self.assertEqual( + result, ["a", "simple_sentence", "with", "many_of_the_possible", "bigrams", "with"]) + + +class PhrasesData: + sentences = [ + ['human', 'interface', 'computer'], + ['survey', 'user', 
'computer', 'system', 'response', 'time'], + ['eps', 'user', 'interface', 'system'], + ['system', 'human', 'system', 'eps'], + ['user', 'response', 'time'], + ['trees'], + ['graph', 'trees'], + ['graph', 'minors', 'trees'], + ['graph', 'minors', 'survey'], + ['graph', 'minors', 'survey', 'human', 'interface'] # test bigrams within same sentence + ] + unicode_sentences = [[utils.to_unicode(w) for w in sentence] for sentence in sentences] + common_terms = frozenset() + + bigram1 = u'response_time' + bigram2 = u'graph_minors' + bigram3 = u'human_interface' + + def gen_sentences(self): + return ((w for w in sentence) for sentence in self.sentences) + + +class PhrasesCommon: """ Tests that need to be run for both Prases and Phraser classes.""" def setUp(self): - self.bigram = Phrases(sentences, min_count=1, threshold=1) - self.bigram_default = Phrases(sentences) - self.bigram_utf8 = Phrases(sentences, min_count=1, threshold=1) - self.bigram_unicode = Phrases(unicode_sentences, min_count=1, threshold=1) + self.bigram = Phrases( + self.sentences, min_count=1, threshold=1, common_terms=self.common_terms) + self.bigram_default = Phrases( + self.sentences, common_terms=self.common_terms) + self.bigram_utf8 = Phrases( + self.sentences, min_count=1, threshold=1, common_terms=self.common_terms) + self.bigram_unicode = Phrases( + self.unicode_sentences, min_count=1, threshold=1, common_terms=self.common_terms) def testEmptyInputsOnBigramConstruction(self): """Test that empty inputs don't throw errors and return the expected result.""" @@ -67,23 +189,22 @@ def testEmptyInputsOnBigramConstruction(self): def testSentenceGeneration(self): """Test basic bigram using a dummy corpus.""" # test that we generate the same amount of sentences as the input - self.assertEqual(len(sentences), len(list(self.bigram_default[sentences]))) + self.assertEqual(len(self.sentences), len(list(self.bigram_default[self.sentences]))) def testSentenceGenerationWithGenerator(self): """Test basic bigram 
production when corpus is a generator.""" - self.assertEqual(len(list(gen_sentences())), - len(list(self.bigram_default[gen_sentences()]))) + self.assertEqual(len(list(self.gen_sentences())), + len(list(self.bigram_default[self.gen_sentences()]))) def testBigramConstruction(self): """Test Phrases bigram construction building.""" # with this setting we should get response_time and graph_minors bigram1_seen = False bigram2_seen = False - - for s in self.bigram[sentences]: - if not bigram1_seen and u'response_time' in s: + for s in self.bigram[self.sentences]: + if not bigram1_seen and self.bigram1 in s: bigram1_seen = True - if not bigram2_seen and u'graph_minors' in s: + if not bigram2_seen and self.bigram2 in s: bigram2_seen = True if bigram1_seen and bigram2_seen: break @@ -92,21 +213,21 @@ def testBigramConstruction(self): # check the same thing, this time using single doc transformation # last sentence should contain both graph_minors and human_interface - self.assertTrue(u'response_time' in self.bigram[sentences[1]]) - self.assertTrue(u'response_time' in self.bigram[sentences[4]]) - self.assertTrue(u'graph_minors' in self.bigram[sentences[-2]]) - self.assertTrue(u'graph_minors' in self.bigram[sentences[-1]]) - self.assertTrue(u'human_interface' in self.bigram[sentences[-1]]) + self.assertTrue(self.bigram1 in self.bigram[self.sentences[1]]) + self.assertTrue(self.bigram1 in self.bigram[self.sentences[4]]) + self.assertTrue(self.bigram2 in self.bigram[self.sentences[-2]]) + self.assertTrue(self.bigram2 in self.bigram[self.sentences[-1]]) + self.assertTrue(self.bigram3 in self.bigram[self.sentences[-1]]) def testBigramConstructionFromGenerator(self): """Test Phrases bigram construction building when corpus is a generator""" bigram1_seen = False bigram2_seen = False - for s in self.bigram[gen_sentences()]: - if not bigram1_seen and 'response_time' in s: + for s in self.bigram[self.gen_sentences()]: + if not bigram1_seen and self.bigram1 in s: bigram1_seen = True - 
if not bigram2_seen and 'graph_minors' in s: + if not bigram2_seen and self.bigram2 in s: bigram2_seen = True if bigram1_seen and bigram2_seen: break @@ -116,10 +237,10 @@ def testEncoding(self): """Test that both utf8 and unicode input work; output must be unicode.""" expected = [u'survey', u'user', u'computer', u'system', u'response_time'] - self.assertEqual(self.bigram_utf8[sentences[1]], expected) - self.assertEqual(self.bigram_unicode[sentences[1]], expected) + self.assertEqual(self.bigram_utf8[self.sentences[1]], expected) + self.assertEqual(self.bigram_unicode[self.sentences[1]], expected) - transformed = ' '.join(self.bigram_utf8[sentences[1]]) + transformed = ' '.join(self.bigram_utf8[self.sentences[1]]) self.assertTrue(isinstance(transformed, unicode)) @@ -131,22 +252,26 @@ def dumb_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, co return 1 -class TestPhrasesModel(unittest.TestCase): +class TestPhrasesModel(PhrasesData, PhrasesCommon, unittest.TestCase): + def testExportPhrases(self): """Test Phrases bigram export_phrases functionality.""" - bigram = Phrases(sentences, min_count=1, threshold=1) + bigram = Phrases(self.sentences, min_count=1, threshold=1) seen_bigrams = set() - for phrase, score in bigram.export_phrases(sentences): + for phrase, score in bigram.export_phrases(self.sentences): seen_bigrams.add(phrase) - assert seen_bigrams == {b'response time', b'graph minors', b'human interface'} + assert seen_bigrams == { + b'response time', + b'graph minors', + b'human interface', + } def testMultipleBigramsSingleEntry(self): """ a single entry should produce multiple bigrams. 
""" - bigram = Phrases(sentences, min_count=1, threshold=1) - + bigram = Phrases(self.sentences, min_count=1, threshold=1) seen_bigrams = set() test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']] @@ -157,7 +282,7 @@ def testMultipleBigramsSingleEntry(self): def testScoringDefault(self): """ test the default scoring, from the mikolov word2vec paper """ - bigram = Phrases(sentences, min_count=1, threshold=1) + bigram = Phrases(self.sentences, min_count=1, threshold=1) seen_scores = set() @@ -172,7 +297,7 @@ def testScoringDefault(self): def test__getitem__(self): """ test Phrases[sentences] with a single sentence""" - bigram = Phrases(sentences, min_count=1, threshold=1) + bigram = Phrases(self.sentences, min_count=1, threshold=1) # pdb.set_trace() test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']] phrased_sentence = next(bigram[test_sentences].__iter__()) @@ -181,7 +306,7 @@ def test__getitem__(self): def testScoringNpmi(self): """ test normalized pointwise mutual information scoring """ - bigram = Phrases(sentences, min_count=1, threshold=.5, scoring='npmi') + bigram = Phrases(self.sentences, min_count=1, threshold=.5, scoring='npmi') seen_scores = set() test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']] @@ -196,7 +321,7 @@ def testScoringNpmi(self): def testCustomScorer(self): """ test using a custom scoring function """ - bigram = Phrases(sentences, min_count=1, threshold=.001, scoring=dumb_scorer) + bigram = Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer) seen_scores = [] test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']] @@ -209,21 +334,25 @@ def testCustomScorer(self): def testBadParameters(self): """Test the phrases module with bad parameters.""" # should fail with something less or equal than 0 - self.assertRaises(ValueError, Phrases, sentences, min_count=0) + self.assertRaises(ValueError, Phrases, self.sentences, min_count=0) # threshold should 
be positive - self.assertRaises(ValueError, Phrases, sentences, threshold=-1) + self.assertRaises(ValueError, Phrases, self.sentences, threshold=-1) def testPruning(self): """Test that max_vocab_size parameter is respected.""" - bigram = Phrases(sentences, max_vocab_size=5) + bigram = Phrases(self.sentences, max_vocab_size=5) self.assertTrue(len(bigram.vocab) <= 5) +# endclass TestPhrasesModel + + +class TestPhrasesScoringPersistence(PhrasesData, unittest.TestCase): def testSaveLoadCustomScorer(self): """ saving and loading a Phrases object with a custom scorer """ try: - bigram = Phrases(sentences, min_count=1, threshold=.001, scoring=dumb_scorer) + bigram = Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer) bigram.save("test_phrases_testSaveLoadCustomScorer_temp_save.pkl") bigram_loaded = Phrases.load("test_phrases_testSaveLoadCustomScorer_temp_save.pkl") seen_scores = [] @@ -242,7 +371,7 @@ def testSaveLoad(self): """ Saving and loading a Phrases object.""" try: - bigram = Phrases(sentences, min_count=1, threshold=1) + bigram = Phrases(self.sentences, min_count=1, threshold=1) bigram.save("test_phrases_testSaveLoad_temp_save.pkl") bigram_loaded = Phrases.load("test_phrases_testSaveLoad_temp_save.pkl") seen_scores = set() @@ -264,7 +393,7 @@ def testSaveLoadStringScoring(self): This should ensure backwards compatibility with the previous version of Phrases""" try: - bigram = Phrases(sentences, min_count=1, threshold=1) + bigram = Phrases(self.sentences, min_count=1, threshold=1) bigram.scoring = "default" bigram.save("test_phrases_testSaveLoadStringScoring_temp_save.pkl") bigram_loaded = Phrases.load("test_phrases_testSaveLoadStringScoring_temp_save.pkl") @@ -287,7 +416,7 @@ def testSaveLoadNoScoring(self): This should ensure backwards compatibility with old versions of Phrases""" try: - bigram = Phrases(sentences, min_count=1, threshold=1) + bigram = Phrases(self.sentences, min_count=1, threshold=1) del(bigram.scoring) 
bigram.save("test_phrases_testSaveLoadNoScoring_temp_save.pkl") bigram_loaded = Phrases.load("test_phrases_testSaveLoadNoScoring_temp_save.pkl") @@ -304,27 +433,180 @@ def testSaveLoadNoScoring(self): finally: if os.path.exists("test_phrases_testSaveLoadNoScoring_temp_save.pkl"): os.remove("test_phrases_testSaveLoadNoScoring_temp_save.pkl") -# endclass TestPhrasesModel -class TestPhraserModel(TestPhrasesCommon): +class TestPhraserModel(PhrasesData, PhrasesCommon, unittest.TestCase): """ Test Phraser models.""" def setUp(self): """Set up Phraser models for the tests.""" - bigram_phrases = Phrases(sentences, min_count=1, threshold=1) + bigram_phrases = Phrases( + self.sentences, min_count=1, threshold=1, common_terms=self.common_terms) self.bigram = Phraser(bigram_phrases) - bigram_default_phrases = Phrases(sentences) + bigram_default_phrases = Phrases(self.sentences, common_terms=self.common_terms) self.bigram_default = Phraser(bigram_default_phrases) - bigram_utf8_phrases = Phrases(sentences, min_count=1, threshold=1) + bigram_utf8_phrases = Phrases( + self.sentences, min_count=1, threshold=1, common_terms=self.common_terms) self.bigram_utf8 = Phraser(bigram_utf8_phrases) - bigram_unicode_phrases = Phrases(unicode_sentences, min_count=1, threshold=1) + bigram_unicode_phrases = Phrases( + self.unicode_sentences, min_count=1, threshold=1, common_terms=self.common_terms) self.bigram_unicode = Phraser(bigram_unicode_phrases) +class CommonTermsPhrasesData: + """This mixin permits to reuse the test, using, this time the common_terms option + """ + + sentences = [ + ['human', 'interface', 'with', 'computer'], + ['survey', 'of', 'user', 'computer', 'system', 'lack', 'of', 'interest'], + ['eps', 'user', 'interface', 'system'], + ['system', 'and', 'human', 'system', 'eps'], + ['user', 'lack', 'of', 'interest'], + ['trees'], + ['graph', 'of', 'trees'], + ['data', 'and', 'graph', 'of', 'trees'], + ['data', 'and', 'graph', 'survey'], + ['data', 'and', 'graph', 'survey', 'for', 
'human', 'interface'] # test bigrams within same sentence + ] + unicode_sentences = [[utils.to_unicode(w) for w in sentence] for sentence in sentences] + common_terms = ['of', 'and', 'for'] + + bigram1 = u'lack_of_interest' + bigram2 = u'data_and_graph' + bigram3 = u'human_interface' + expression1 = u'lack of interest' + expression2 = u'data and graph' + expression3 = u'human interface' + + def gen_sentences(self): + return ((w for w in sentence) for sentence in self.sentences) + + +class TestPhrasesModelCommonTerms(CommonTermsPhrasesData, TestPhrasesModel): + """Test Phrases models with common terms""" + + def testEncoding(self): + """Test that both utf8 and unicode input work; output must be unicode.""" + expected = [u'survey', u'of', u'user', u'computer', u'system', u'lack_of_interest'] + + self.assertEqual(self.bigram_utf8[self.sentences[1]], expected) + self.assertEqual(self.bigram_unicode[self.sentences[1]], expected) + + transformed = ' '.join(self.bigram_utf8[self.sentences[1]]) + self.assertTrue(isinstance(transformed, unicode)) + + def testMultipleBigramsSingleEntry(self): + """ a single entry should produce multiple bigrams. 
""" + bigram = Phrases(self.sentences, min_count=1, threshold=1, common_terms=self.common_terms) + + seen_bigrams = set() + test_sentences = [['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']] + for phrase, score in bigram.export_phrases(test_sentences): + seen_bigrams.add(phrase) + assert seen_bigrams == set([ + b'data and graph', + b'human interface', + ]) + + def testExportPhrases(self): + """Test Phrases bigram export_phrases functionality.""" + bigram = Phrases(self.sentences, min_count=1, threshold=1, common_terms=self.common_terms) + + seen_bigrams = set() + + for phrase, score in bigram.export_phrases(self.sentences): + seen_bigrams.add(phrase) + + assert seen_bigrams == set([ + b'human interface', + b'graph of trees', + b'data and graph', + b'lack of interest', + ]) + + def testScoringDefault(self): + """ test the default scoring, from the mikolov word2vec paper """ + bigram = Phrases(self.sentences, min_count=1, threshold=1, common_terms=self.common_terms) + + seen_scores = set() + + test_sentences = [['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']] + for phrase, score in bigram.export_phrases(test_sentences): + seen_scores.add(round(score, 3)) + + min_count = float(bigram.min_count) + len_vocab = float(len(bigram.vocab)) + graph = float(bigram.vocab[b"graph"]) + data = float(bigram.vocab[b"data"]) + data_and_graph = float(bigram.vocab[b"data_and_graph"]) + human = float(bigram.vocab[b"human"]) + interface = float(bigram.vocab[b"interface"]) + human_interface = float(bigram.vocab[b"human_interface"]) + + assert seen_scores == set([ + # score for data and graph + round((data_and_graph - min_count) / data / graph * len_vocab, 3), + # score for human interface + round((human_interface - min_count) / human / interface * len_vocab, 3), + ]) + + def testScoringNpmi(self): + """ test normalized pointwise mutual information scoring """ + bigram = Phrases(self.sentences, min_count=1, threshold=.5, + scoring='npmi', 
common_terms=self.common_terms) + + seen_scores = set() + + test_sentences = [['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']] + for phrase, score in bigram.export_phrases(test_sentences): + seen_scores.add(round(score, 3)) + + assert seen_scores == set([ + .74, # score for data and graph + .894 # score for human interface + ]) + + def testCustomScorer(self): + """ test using a custom scoring function """ + + bigram = Phrases(self.sentences, min_count=1, threshold=.001, + scoring=dumb_scorer, common_terms=self.common_terms) + + seen_scores = [] + test_sentences = [['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']] + for phrase, score in bigram.export_phrases(test_sentences): + seen_scores.append(score) + + assert all(seen_scores) # all scores 1 + assert len(seen_scores) == 2 # 'data and graph' 'survey for human' + + def test__getitem__(self): + """ test Phrases[sentences] with a single sentence""" + bigram = Phrases(self.sentences, min_count=1, threshold=1, common_terms=self.common_terms) + # pdb.set_trace() + test_sentences = [['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']] + phrased_sentence = next(bigram[test_sentences].__iter__()) + + assert phrased_sentence == ['data_and_graph', 'survey', 'for', 'human_interface'] + + +class TestPhraserModelCommonTerms(CommonTermsPhrasesData, TestPhraserModel): + + def testEncoding(self): + """Test that both utf8 and unicode input work; output must be unicode.""" + expected = [u'survey', u'of', u'user', u'computer', u'system', u'lack_of_interest'] + + self.assertEqual(self.bigram_utf8[self.sentences[1]], expected) + self.assertEqual(self.bigram_unicode[self.sentences[1]], expected) + + transformed = ' '.join(self.bigram_utf8[self.sentences[1]]) + self.assertTrue(isinstance(transformed, unicode)) + + if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) unittest.main() diff --git a/gensim/test/test_sklearn_api.py 
b/gensim/test/test_sklearn_api.py index 2dd54073b7..565263f312 100644 --- a/gensim/test/test_sklearn_api.py +++ b/gensim/test/test_sklearn_api.py @@ -218,7 +218,7 @@ def testPipeline(self): text_lda = Pipeline([('features', model,), ('classifier', clf)]) text_lda.fit(corpus, data.target) score = text_lda.score(corpus, data.target) - self.assertGreater(score, 0.40) + self.assertGreaterEqual(score, 0.40) def testSetGetParams(self): # updating only one param