-
-
Notifications
You must be signed in to change notification settings - Fork 4.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Refactor documentation for gensim.models.phrases
#1950
Merged
Merged
Changes from 1 commit
Commits
Show all changes
21 commits
Select commit
Hold shift + click to select a range
f0722da
Create PR
CLearERR 58aef2e
Small changes
CLearERR 7937e5e
More small changes
CLearERR 866d650
Merge remote-tracking branch 'upstream/develop' into modelphrases
CLearERR acce08a
Additions
CLearERR 78dd276
Additions II
CLearERR 87e571f
Additions III
CLearERR 8f28017
Updates & Example I
CLearERR 38422e0
Updates N
CLearERR 2e2d0a1
Added examples
CLearERR 0f3a972
Partial fix
CLearERR 76462be
Merge remote-tracking branch 'upstream/develop' into modelphrases
CLearERR c644a80
Fixed phrases example
CLearERR ac39eb4
Improved examples(beta)
CLearERR 4aa2e29
Fixed links
CLearERR f9a34f0
More examples II
CLearERR 12a3bb5
Final checks
CLearERR aa9c535
fix phrases[1]
menshikh-iv c955e7d
fix phrases[2]
menshikh-iv 3844e4f
fix phrases[3]
menshikh-iv 0e3bb9e
fix phrases[4]
menshikh-iv File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -344,7 +344,6 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0, | |
>>> #Then we got these linked tags: | ||
[u'graph_minors'] | ||
|
||
|
||
""" | ||
if min_count <= 0: | ||
raise ValueError("min_count should be at least 1") | ||
|
@@ -489,7 +488,7 @@ def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000, | |
'eps': 6, 'survey': 6, 'time': 6, 'survey_system': 3, 'minors_survey': 3}) | ||
>>> #Number of tokens and phrases #TODO: why so many? | ||
>>> print learned[2] | ||
>>> 87 | ||
87 | ||
|
||
""" | ||
sentence_no = -1 | ||
|
@@ -592,7 +591,6 @@ def export_phrases(self, sentences, out_delimiter=b' ', as_tuples=False): | |
as_tuples : bool, optional | ||
If true, yield (tuple(words), score), otherwise - (out_delimiter.join(words), score). | ||
|
||
|
||
Example | ||
------- | ||
>>> from gensim.test.utils import datapath | ||
|
@@ -737,9 +735,11 @@ def original_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count | |
|
||
Notes | ||
----- | ||
From paper :math:`(count(worda, wordb) - min_count) * N / | ||
(count(worda) * count(wordb)) > threshold`, where `N` is the total vocabulary size. | ||
Formula from paper: | ||
:math:`\\frac{(count(word_a, word_b) - mincount) * N }{ (count(word_a) * count(word_b))} > threshold`, | ||
where `N` is the total vocabulary size. | ||
|
||
#TODO: something really bad with LaTeX | ||
""" | ||
return (bigram_count - min_count) / worda_count / wordb_count * len_vocab | ||
|
||
|
@@ -765,11 +765,11 @@ def npmi_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, co | |
corpus_word_count : int | ||
Number of words in corpus. | ||
|
||
|
||
Notes | ||
----- | ||
From paper :math:`ln(prop(worda, wordb) / (prop(worda)*prop(wordb))) / - ln(prop(worda, wordb)` | ||
where prop(n) is the count of n / the count of everything in the entire corpus. | ||
Formula from paper: | ||
:math:`\\frac{ln(prop(word_a, word_b) / (prop(word_a)*prop(word_b)))}{ -ln(prop(word_a, word_b))}`, | ||
where :math:`prop(n)` is the count of n / the count of everything in the entire corpus. | ||
|
||
""" | ||
pa = worda_count / corpus_word_count | ||
|
@@ -784,17 +784,32 @@ def pseudocorpus(source_vocab, sep, common_terms=frozenset()): | |
Parameters | ||
---------- | ||
source_vocab : iterable of list of str | ||
Vocabulary. | ||
Tokens vocabulary. | ||
sep : str | ||
Separator. | ||
Separator element. | ||
common_terms : set, optional | ||
Stopwords. | ||
Immutable set of stopwords. | ||
|
||
Yields | ||
------ | ||
generator | ||
Generator with phrases. | ||
|
||
Examples | ||
-------- | ||
>>> from gensim.test.utils import datapath | ||
>>> from gensim.models.word2vec import Text8Corpus | ||
>>> from gensim.models.phrases import Phrases,pseudocorpus | ||
>>> #Create corpus | ||
>>> sentences = Text8Corpus(datapath('testcorpus.txt')) | ||
>>> #Train the detector with: | ||
>>> phrases = Phrases(sentences, min_count=1, threshold=1) | ||
>>> pseudo = pseudocorpus(sentences, " ") | ||
>>> sent = [u'trees', u'graph', u'minors'] | ||
>>> for token in list(pseudo): | ||
>>> print token | ||
>>> #TODO: doesn't work | ||
|
||
""" | ||
for k in source_vocab: | ||
if sep not in k: | ||
|
@@ -812,18 +827,7 @@ def pseudocorpus(source_vocab, sep, common_terms=frozenset()): | |
|
||
|
||
class Phraser(SentenceAnalyzer, PhrasesTransformation): | ||
"""Minimal state & functionality to apply results of a Phrases model to tokens. | ||
|
||
Notes | ||
----- | ||
After the one-time initialization, a Phraser will be much smaller and | ||
somewhat faster than using the full Phrases model. | ||
|
||
Reflects the results of the source model's `min_count`, `threshold`, and | ||
`scoring` settings. (You can tamper with those & create a new Phraser to try | ||
other values.) | ||
|
||
""" | ||
"""Minimal state & functionality to apply results of a Phrases model to tokens.""" | ||
|
||
def __init__(self, phrases_model): | ||
""" | ||
|
@@ -832,6 +836,15 @@ def __init__(self, phrases_model): | |
phrases_model : :class:`~gensim.models.phrases.Phrases` | ||
Phrases class object. | ||
|
||
Notes | ||
----- | ||
After the one-time initialization, a Phraser will be much smaller and | ||
somewhat faster than using the full Phrases model. | ||
|
||
Reflects the results of the source model's `min_count`, `threshold`, and | ||
`scoring` settings. (You can tamper with those & create a new Phraser to try | ||
other values.) | ||
|
||
Example | ||
---------- | ||
>>> from gensim.test.utils import datapath | ||
|
@@ -894,16 +907,25 @@ def pseudocorpus(self, phrases_model): | |
generator | ||
Generator with phrases. | ||
|
||
|
||
Example | ||
------- | ||
>>> from gensim.test.utils import datapath | ||
>>> from gensim.models.word2vec import Text8Corpus | ||
>>> from gensim.models.phrases import Phrases, Phraser | ||
>>> sentences = Text8Corpus(datapath('testcorpus.txt')) | ||
>>> #train the detector with: | ||
>>> phrases_model = Phrases(sentences, min_count=5, threshold=100) | ||
>>> #Create a Phraser object to transform any sentence and turn 2 suitable tokens into 1 phrase: | ||
>>> phraser_model = Phraser(phrases_model) | ||
>>> #Initialize pseudocorpus | ||
>>> pseudo = phraser_model.pseudocorpus(phrases_model) | ||
>>> #Get all phrases from it | ||
>>> for phrase in pseudo: | ||
>>> print phrase | ||
['human', 'system'] | ||
['trees', 'trees'] | ||
['system', 'system'] | ||
... | ||
|
||
""" | ||
return pseudocorpus(phrases_model.vocab, phrases_model.delimiter, | ||
|
@@ -919,6 +941,7 @@ def score_item(self, worda, wordb, components, scorer): | |
wordb : str | ||
Second word for comparison. Should be unicode string. | ||
components : generator | ||
Contain phrases. | ||
scorer : {'default', 'npmi'} | ||
Scorer function, as given to :class:`~gensim.models.phrases.Phrases`. | ||
|
||
|
@@ -933,12 +956,40 @@ def score_item(self, worda, wordb, components, scorer): | |
>>> from gensim.models.word2vec import Text8Corpus | ||
>>> from gensim.models.phrases import Phrases, Phraser | ||
>>> sentences = Text8Corpus(datapath('testcorpus.txt')) | ||
>>> #train the detector with | ||
>>> phrases_model = Phrases(sentences, min_count=5, threshold=100) | ||
>>> #Create a Phraser object to transform any sentence and turn 2 suitable tokens into 1 phrase: | ||
>>> phraser_model = Phraser(phrases_model) | ||
>>> #Initialize pseudocorpus | ||
>>> pseudo = phraser_model.pseudocorpus(phrases_model) | ||
//>>> phraser_model.score_item("tree","human",pseudo,'default') | ||
>>> phraser_model.score_item(u"tree",u"human",pseudo,'default') | ||
>>> #Compare 2 words | ||
>>> phraser_model.score_item(u'tree',u'human',pseudo,'default') | ||
>>> # -1 means, that there is no suitable phrase among pseudocorpus elements. #TODO: look below for some | ||
# interesting feature | ||
-1 | ||
>>> from gensim.test.utils import datapath | ||
>>> from gensim.models.word2vec import Text8Corpus | ||
>>> from gensim.models.phrases import Phrases, Phraser | ||
>>> sentences = Text8Corpus(datapath('testcorpus.txt')) | ||
>>> # train the detector with: | ||
>>> phrases_model = Phrases(sentences, min_count=1, threshold=1) | ||
>>> # Create a Phraser object to transform any sentence and turn 2 suitable tokens into 1 phrase: | ||
>>> phraser_model = Phraser(phrases_model) | ||
>>> # Initialize pseudocorpus: | ||
>>> pseudo = phraser_model.pseudocorpus(phrases_model) | ||
>>> # Compare 2 words: | ||
>>> phraser_model.score_item(u'tree',u'human',pseudo,'default') #TODO: there is a strange problem: first time | ||
# it will raise an error: | ||
Traceback (most recent call last): | ||
File "<stdin>", line 1, in <module> | ||
File "gensim/models/phrases.py", line 985, in score_item | ||
phraser_model = Phraser(phrases_model) | ||
TypeError: unhashable type: 'list' | ||
#But if I launch it a second time, it will return -1. I have no idea how this works. | ||
|
||
|
||
>>> # -1 means, that there is no suitable phrase among pseudocorpus elements. | ||
|
||
|
||
""" | ||
try: | ||
|
@@ -978,6 +1029,44 @@ def __getitem__(self, sentence): | |
>>> phraser_model["tree", "human"] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. incorrect, please use |
||
[u'tree', u'human'] | ||
|
||
Examples | ||
---------- | ||
>>> from gensim.test.utils import datapath | ||
>>> from gensim.models.word2vec import Text8Corpus | ||
>>> from gensim.models.phrases import Phrases, Phraser | ||
>>> | ||
>>> #Create corpus | ||
>>> sentences = Text8Corpus(datapath('testcorpus.txt')) | ||
>>> | ||
>>> #Train the detector with: | ||
>>> phrases = Phrases(sentences, min_count=1, threshold=1) | ||
>>> # Create a Phraser object to transform any sentence and turn 2 suitable tokens into 1 phrase: | ||
>>> phraser_model = Phraser(phrases) | ||
>>> #Input is a list of unicode strings: | ||
>>> sent = [u'trees', u'graph', u'minors'] | ||
>>> #Both of these tokens appear in corpus at least twice, and phrase score is higher than threshold = 1: | ||
>>> print(phraser_model[sent]) | ||
[u'trees_graph', u'minors'] | ||
|
||
>>> from gensim.test.utils import datapath | ||
>>> from gensim.models.word2vec import Text8Corpus | ||
>>> from gensim.models.phrases import Phrases, Phraser | ||
>>> | ||
>>> #Create corpus | ||
>>> sentences = Text8Corpus(datapath('testcorpus.txt')) | ||
>>> | ||
>>> #Train the detector with: | ||
>>> phrases = Phrases(sentences, min_count=1, threshold=1) | ||
>>> # Create a Phraser object to transform any sentence and turn 2 suitable tokens into 1 phrase: | ||
>>> phraser_model = Phraser(phrases) | ||
>>> #Input is a corpus: | ||
>>> sent = [[u'trees', u'graph', u'minors'],[u'graph', u'minors']] | ||
>>> #So we get 2 phrases | ||
>>> res = phraser_model[sent] | ||
>>> for phrase in res: | ||
>>> print phrase | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Best use brackets, for py3k compatibility. |
||
[u'trees_graph', u'minors'] | ||
[u'graph_minors'] | ||
|
||
""" | ||
is_single, sentence = _is_single(sentence) | ||
|
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
PEP8: `#` followed by one space (here and elsewhere).