Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor documentation for gensim.models.phrases #1950

Merged
merged 21 commits into from
Apr 3, 2018
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
141 changes: 115 additions & 26 deletions gensim/models/phrases.py
Original file line number Diff line number Diff line change
Expand Up @@ -344,7 +344,6 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0,
>>> #Then we got these linked tags:
[u'graph_minors']


"""
if min_count <= 0:
raise ValueError("min_count should be at least 1")
Expand Down Expand Up @@ -489,7 +488,7 @@ def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000,
'eps': 6, 'survey': 6, 'time': 6, 'survey_system': 3, 'minors_survey': 3})
>>> #Number of tokens and phrases #TODO: why so many?
>>> print learned[2]
>>> 87
87

"""
sentence_no = -1
Expand Down Expand Up @@ -592,7 +591,6 @@ def export_phrases(self, sentences, out_delimiter=b' ', as_tuples=False):
as_tuples : bool, optional
If true, yield (tuple(words), score), otherwise - (out_delimiter.join(words), score).


Example
-------
>>> from gensim.test.utils import datapath
Expand Down Expand Up @@ -737,9 +735,11 @@ def original_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count

Notes
-----
From paper :math:`(count(worda, wordb) - min_count) * N /
(count(worda) * count(wordb)) > threshold`, where `N` is the total vocabulary size.
Formula from paper:
:math:`\\frac{(count(word_a, word_b) - mincount) * N }{ (count(word_a) * count(word_b))} > threshold`,
where `N` is the total vocabulary size.

#TODO: something really bad with LaTeX
"""
return (bigram_count - min_count) / worda_count / wordb_count * len_vocab

Expand All @@ -765,11 +765,11 @@ def npmi_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, co
corpus_word_count : int
Number of words in corpus.


Notes
-----
From paper :math:`ln(prop(worda, wordb) / (prop(worda)*prop(wordb))) / - ln(prop(worda, wordb)`
where prop(n) is the count of n / the count of everything in the entire corpus.
Formula from paper:
:math:`\\frac{ln(prop(word_a, word_b) / (prop(word_a)*prop(word_b)))}{ -ln(prop(word_a, word_b))}`,
where :math:`prop(n)` is the count of n / the count of everything in the entire corpus.

"""
pa = worda_count / corpus_word_count
Expand All @@ -784,17 +784,32 @@ def pseudocorpus(source_vocab, sep, common_terms=frozenset()):
Parameters
----------
source_vocab : iterable of list of str
Vocabulary.
Tokens vocabulary.
sep : str
Separator.
Separator element.
common_terms : set, optional
Stopwords.
Immutable set of stopwords.

Yields
------
generator
Generator with phrases.

Examples
--------
>>> from gensim.test.utils import datapath
>>> from gensim.models.word2vec import Text8Corpus
>>> from gensim.models.phrases import Phrases,pseudocorpus
>>> #Create corpus
>>> sentences = Text8Corpus(datapath('testcorpus.txt'))
>>> #Train the detector with:
>>> phrases = Phrases(sentences, min_count=1, threshold=1)
>>> pseudo = pseudocorpus(sentences, " ")
>>> sent = [u'trees', u'graph', u'minors']
>>> for token in list(pseudo):
>>> print token
>>> #TODO: doesn't work

"""
for k in source_vocab:
if sep not in k:
Expand All @@ -812,18 +827,7 @@ def pseudocorpus(source_vocab, sep, common_terms=frozenset()):


class Phraser(SentenceAnalyzer, PhrasesTransformation):
"""Minimal state & functionality to apply results of a Phrases model to tokens.

Notes
-----
After the one-time initialization, a Phraser will be much smaller and
somewhat faster than using the full Phrases model.

Reflects the results of the source model's `min_count`, `threshold`, and
`scoring` settings. (You can tamper with those & create a new Phraser to try
other values.)

"""
"""Minimal state & functionality to apply results of a Phrases model to tokens."""

def __init__(self, phrases_model):
"""
Expand All @@ -832,6 +836,15 @@ def __init__(self, phrases_model):
phrases_model : :class:`~gensim.models.phrases.Phrases`
Phrases class object.

Notes
-----
After the one-time initialization, a Phraser will be much smaller and
somewhat faster than using the full Phrases model.

Reflects the results of the source model's `min_count`, `threshold`, and
`scoring` settings. (You can tamper with those & create a new Phraser to try
other values.)

Example
----------
>>> from gensim.test.utils import datapath
Expand Down Expand Up @@ -894,16 +907,25 @@ def pseudocorpus(self, phrases_model):
generator
Generator with phrases.


Example
-------
>>> from gensim.test.utils import datapath
>>> from gensim.models.word2vec import Text8Corpus
>>> from gensim.models.phrases import Phrases, Phraser
>>> sentences = Text8Corpus(datapath('testcorpus.txt'))
>>> #train the detector with:
>>> phrases_model = Phrases(sentences, min_count=5, threshold=100)
>>> #Create a Phraser object to transform any sentence and turn 2 suitable tokens into 1 phrase:
>>> phraser_model = Phraser(phrases_model)
>>> #Initialize pseudocorpus
>>> pseudo = phraser_model.pseudocorpus(phrases_model)
>>> #Get all phrases from it
>>> for phrase in pseudo:
>>> print phrase
['human', 'system']
['trees', 'trees']
['system', 'system']
...

"""
return pseudocorpus(phrases_model.vocab, phrases_model.delimiter,
Expand All @@ -919,6 +941,7 @@ def score_item(self, worda, wordb, components, scorer):
wordb : str
Second word for comparison. Should be unicode string.
components : generator
Contain phrases.
scorer : {'default', 'npmi'}
Scorer function, as given to :class:`~gensim.models.phrases.Phrases`.

Expand All @@ -933,12 +956,40 @@ def score_item(self, worda, wordb, components, scorer):
>>> from gensim.models.word2vec import Text8Corpus
>>> from gensim.models.phrases import Phrases, Phraser
>>> sentences = Text8Corpus(datapath('testcorpus.txt'))
>>> #train the detector with
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

PEP8: # followed by one space (here and elsewhere).

>>> phrases_model = Phrases(sentences, min_count=5, threshold=100)
>>> #Create a Phraser object to transform any sentence and turn 2 suitable tokens into 1 phrase:
>>> phraser_model = Phraser(phrases_model)
>>> #Initialize pseudocorpus
>>> pseudo = phraser_model.pseudocorpus(phrases_model)
//>>> phraser_model.score_item("tree","human",pseudo,'default')
>>> phraser_model.score_item(u"tree",u"human",pseudo,'default')
>>> #Compare 2 words
>>> phraser_model.score_item(u'tree',u'human',pseudo,'default')
>>> # -1 means, that there is no suitable phrase among pseudocorpus elements. #TODO: look below for some
# interesting feature
-1
>>> from gensim.test.utils import datapath
>>> from gensim.models.word2vec import Text8Corpus
>>> from gensim.models.phrases import Phrases, Phraser
>>> sentences = Text8Corpus(datapath('testcorpus.txt'))
>>> # train the detector with:
>>> phrases_model = Phrases(sentences, min_count=1, threshold=1)
>>> # Create a Phraser object to transform any sentence and turn 2 suitable tokens into 1 phrase:
>>> phraser_model = Phraser(phrases_model)
>>> # Initialize pseudocorpus:
>>> pseudo = phraser_model.pseudocorpus(phrases_model)
>>> # Compare 2 words:
>>> phraser_model.score_item(u'tree',u'human',pseudo,'default') #TODO: there is a strange problem: first time
# it will raise an error:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "gensim/models/phrases.py", line 985, in score_item
phraser_model = Phraser(phrases_model)
TypeError: unhashable type: 'list'
#But if I launch it a second time, it will return -1. I have no idea how this works.


>>> # -1 means that there is no suitable phrase among pseudocorpus elements.


"""
try:
Expand Down Expand Up @@ -978,6 +1029,44 @@ def __getitem__(self, sentence):
>>> phraser_model["tree", "human"]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

incorrect, please use phraser_model[["tree", "human"]]

[u'tree', u'human']

Examples
----------
>>> from gensim.test.utils import datapath
>>> from gensim.models.word2vec import Text8Corpus
>>> from gensim.models.phrases import Phrases, Phraser
>>>
>>> #Create corpus
>>> sentences = Text8Corpus(datapath('testcorpus.txt'))
>>>
>>> #Train the detector with:
>>> phrases = Phrases(sentences, min_count=1, threshold=1)
>>> # Create a Phraser object to transform any sentence and turn 2 suitable tokens into 1 phrase:
>>> phraser_model = Phraser(phrases)
>>> #Input is a list of unicode strings:
>>> sent = [u'trees', u'graph', u'minors']
>>> #Both of these tokens appear in corpus at least twice, and the phrase score is higher than threshold = 1:
>>> print(phraser_model[sent])
[u'trees_graph', u'minors']

>>> from gensim.test.utils import datapath
>>> from gensim.models.word2vec import Text8Corpus
>>> from gensim.models.phrases import Phrases, Phraser
>>>
>>> #Create corpus
>>> sentences = Text8Corpus(datapath('testcorpus.txt'))
>>>
>>> #Train the detector with:
>>> phrases = Phrases(sentences, min_count=1, threshold=1)
>>> # Create a Phraser object to transform any sentence and turn 2 suitable tokens into 1 phrase:
>>> phraser_model = Phraser(phrases)
>>> #Input is a corpus:
>>> sent = [[u'trees', u'graph', u'minors'],[u'graph', u'minors']]
>>> #So we get 2 phrases
>>> res = phraser_model[sent]
>>> for phrase in res:
>>> print phrase
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Best use brackets, for py3k compatibility.

[u'trees_graph', u'minors']
[u'graph_minors']

"""
is_single, sentence = _is_single(sentence)
Expand Down