-
-
Notifications
You must be signed in to change notification settings - Fork 4.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix documentation for gensim.corpora. Partial fix #1671
#1729
Changes from 39 commits
b260d4b
36d98d1
981ebbb
3428113
69fc7e0
b65a69a
6fa92f3
78e207d
7519382
ae69867
c2765ed
40add21
e044c3a
123327d
2382d01
42409bf
7cb5bbf
56f19e6
9162a7e
5eaaac4
3b6b076
d7f3fc8
c46bff4
7823546
9878133
dba4429
6a95c94
6dcfb07
2f61fc3
ac01abb
833ec64
e656609
3e597fe
da1d5c2
89f6098
9eeea21
2b6aeaf
9b17057
de3ea0f
dafc373
ff980bc
0189d8d
943406c
57cb5a3
08ca492
381fb97
5b5701a
0e5c0cf
627c0e5
b771bb5
d76af8d
7fe753f
a9eb1a3
e3a8ebf
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,9 +5,7 @@ | |
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html | ||
|
||
|
||
""" | ||
Blei's LDA-C format. | ||
""" | ||
"""Blei's LDA-C format.""" | ||
|
||
from __future__ import with_statement | ||
|
||
|
@@ -23,8 +21,7 @@ | |
|
||
|
||
class BleiCorpus(IndexedCorpus): | ||
""" | ||
Corpus in Blei's LDA-C format. | ||
"""Corpus in Blei's LDA-C format. | ||
|
||
The corpus is represented as two files: one describing the documents, and another | ||
describing the mapping between words and their ids. | ||
|
@@ -35,14 +32,25 @@ class BleiCorpus(IndexedCorpus): | |
|
||
The vocabulary is a file with words, one word per line; word at line K has an | ||
implicit ``id=K``. | ||
|
||
""" | ||
|
||
def __init__(self, fname, fname_vocab=None): | ||
""" | ||
Initialize the corpus from a file. | ||
|
||
`fname_vocab` is the file with vocabulary; if not specified, it defaults to | ||
`fname.vocab`. | ||
Parameters | ||
---------- | ||
fname : str | ||
Serialized corpus's filename | ||
fname_vocab : str or None, optional | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Need to understand how to:
|
||
Vocabulary file; takes precedence over | ||
|
||
Raises | ||
------ | ||
IOError | ||
If vocabulary file doesn't exist | ||
|
||
""" | ||
IndexedCorpus.__init__(self, fname) | ||
logger.info("loading corpus from %s", fname) | ||
|
@@ -67,16 +75,27 @@ def __init__(self, fname, fname_vocab=None): | |
self.id2word = dict(enumerate(words)) | ||
|
||
def __iter__(self): | ||
""" | ||
Iterate over the corpus, returning one sparse vector at a time. | ||
""" | ||
"""Iterate over the corpus, returning one sparse vector at a time.""" | ||
lineno = -1 | ||
with utils.smart_open(self.fname) as fin: | ||
for lineno, line in enumerate(fin): | ||
yield self.line2doc(line) | ||
self.length = lineno + 1 | ||
|
||
def line2doc(self, line): | ||
"""Convert line to document. | ||
|
||
Parameters | ||
---------- | ||
line : str | ||
Document's string representation | ||
|
||
Returns | ||
------- | ||
list of (int, float) | ||
document's list representation | ||
|
||
""" | ||
parts = utils.to_unicode(line).split() | ||
if int(parts[0]) != len(parts) - 1: | ||
raise ValueError("invalid format in %s: %s" % (self.fname, repr(line))) | ||
|
@@ -86,14 +105,26 @@ def line2doc(self, line): | |
|
||
@staticmethod | ||
def save_corpus(fname, corpus, id2word=None, metadata=False): | ||
""" | ||
Save a corpus in the LDA-C format. | ||
"""Save a corpus in the LDA-C format. | ||
|
||
There are actually two files saved: `fname` and `fname.vocab`, where | ||
`fname.vocab` is the vocabulary file. | ||
|
||
This function is automatically called by `BleiCorpus.serialize`; don't | ||
call it directly, call `serialize` instead. | ||
Parameters | ||
---------- | ||
fname : str | ||
Filename | ||
corpus : iterable | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. iterable of ... ? (here and everywhere) |
||
Iterable of documents | ||
id2word : dict of (str, str), optional | ||
Transforms id to word (Default value = None) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. no default values in docstrings (everywhere) |
||
metadata : bool | ||
Any additional info (Default value = False) | ||
|
||
Returns | ||
------- | ||
list of int | ||
|
||
""" | ||
if id2word is None: | ||
logger.info("no word id mapping provided; initializing from corpus") | ||
|
@@ -121,8 +152,17 @@ def save_corpus(fname, corpus, id2word=None, metadata=False): | |
return offsets | ||
|
||
def docbyoffset(self, offset): | ||
""" | ||
Return the document stored at file position `offset`. | ||
"""Return document corresponding to `offset`. | ||
|
||
Parameters | ||
---------- | ||
offset : int | ||
Position of the document in the file | ||
|
||
Returns | ||
------- | ||
list of (int, float) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Missing parameter description (here and everywhere) |
||
|
||
""" | ||
with utils.smart_open(self.fname) as f: | ||
f.seek(offset) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,10 +4,7 @@ | |
# Copyright (C) 2013 Zygmunt Zając <[email protected]> | ||
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html | ||
|
||
""" | ||
Corpus in CSV format. | ||
|
||
""" | ||
"""Corpus in CSV format.""" | ||
|
||
|
||
from __future__ import with_statement | ||
|
@@ -22,18 +19,24 @@ | |
|
||
|
||
class CsvCorpus(interfaces.CorpusABC): | ||
""" | ||
Corpus in CSV format. The CSV delimiter, headers etc. are guessed automatically | ||
based on the file content. | ||
"""Corpus in CSV format. | ||
|
||
The CSV delimiter, headers etc. are guessed automatically based on the | ||
file content. | ||
|
||
All row values are expected to be ints/floats. | ||
|
||
""" | ||
|
||
def __init__(self, fname, labels): | ||
""" | ||
Initialize the corpus from a file. | ||
`labels` = are class labels present in the input file? => skip the first column | ||
"""Initialize the corpus from a file. | ||
|
||
Parameters | ||
---------- | ||
fname : str | ||
Filename | ||
labels : bool | ||
Whether to skip the first column | ||
|
||
""" | ||
logger.info("loading corpus from %s", fname) | ||
|
@@ -48,8 +51,11 @@ def __init__(self, fname, labels): | |
logger.info("sniffed CSV delimiter=%r, headers=%s", self.dialect.delimiter, self.headers) | ||
|
||
def __iter__(self): | ||
""" | ||
Iterate over the corpus, returning one sparse vector at a time. | ||
"""Iterate over the corpus, returning one sparse vector at a time. | ||
|
||
Yields | ||
------ | ||
list of (int, float) | ||
|
||
""" | ||
reader = csv.reader(utils.smart_open(self.fname), self.dialect) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,17 +5,7 @@ | |
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html | ||
|
||
|
||
""" | ||
Indexed corpus is a mechanism for random-accessing corpora. | ||
|
||
While the standard corpus interface in gensim allows iterating over corpus with | ||
`for doc in corpus: pass`, indexed corpus allows accessing the documents with | ||
`corpus[docno]` (in O(1) look-up time). | ||
|
||
This functionality is achieved by storing an extra file (by default named the same | ||
as the corpus file plus '.index' suffix) that stores the byte offset of the beginning | ||
of each document. | ||
""" | ||
"""Base Indexed Corpus class.""" | ||
|
||
import logging | ||
import six | ||
|
@@ -28,20 +18,39 @@ | |
|
||
|
||
class IndexedCorpus(interfaces.CorpusABC): | ||
"""Indexed corpus is a mechanism for random-accessing corpora. | ||
|
||
While the standard corpus interface in gensim allows iterating over | ||
corpus with `for doc in corpus: pass`, indexed corpus allows accessing | ||
the documents with `corpus[docno]` (in O(1) look-up time). | ||
|
||
Notes | ||
----- | ||
This functionality is achieved by storing an extra file (by default | ||
named the same as the corpus file plus an '.index' suffix) that stores the byte | ||
offset of the beginning of each document. | ||
|
||
""" | ||
|
||
def __init__(self, fname, index_fname=None): | ||
""" | ||
Initialize this abstract base class, by loading a previously saved index | ||
from `index_fname` (or `fname.index` if `index_fname` is not set). | ||
This index will allow subclasses to support the `corpus[docno]` syntax | ||
(random access to document #`docno` in O(1)). | ||
"""Initialize the corpus. | ||
|
||
Parameters | ||
---------- | ||
fname : string | ||
Corpus filename | ||
index_fname : string or None | ||
Index filename, or None for loading `fname`.index | ||
|
||
Examples | ||
-------- | ||
>>> # save corpus in SvmLightCorpus format with an index | ||
>>> corpus = [[(1, 0.5)], [(0, 1.0), (1, 2.0)]] | ||
>>> gensim.corpora.SvmLightCorpus.serialize('testfile.svmlight', corpus) | ||
>>> # load back as a document stream (*not* plain Python list) | ||
>>> corpus_with_random_access = gensim.corpora.SvmLightCorpus('testfile.svmlight') | ||
>>> print(corpus_with_random_access[1]) | ||
[(0, 1.0), (1, 2.0)] | ||
>>> corpus = [[(1, 0.5)], [(0, 1.0), (1, 2.0)]] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Examples should be executable and split into 3 sections: imports, data preparation, direct functionality
|
||
>>> gensim.corpora.SvmLightCorpus.serialize('testfile.svmlight', corpus) | ||
>>> # load back as a document stream (*not* plain Python list) | ||
>>> corpus_with_random_access = gensim.corpora.SvmLightCorpus('testfile.svmlight') | ||
>>> print(corpus_with_random_access[1]) | ||
[(0, 1.0), (1, 2.0)] | ||
|
||
""" | ||
try: | ||
|
@@ -58,22 +67,31 @@ def __init__(self, fname, index_fname=None): | |
@classmethod | ||
def serialize(serializer, fname, corpus, id2word=None, index_fname=None, | ||
progress_cnt=None, labels=None, metadata=False): | ||
""" | ||
Iterate through the document stream `corpus`, saving the documents to `fname` | ||
and recording byte offset of each document. Save the resulting index | ||
structure to file `index_fname` (or `fname`.index if `index_fname` is not set). | ||
|
||
This relies on the underlying corpus class `serializer` providing (in | ||
addition to standard iteration): | ||
|
||
* `save_corpus` method that returns a sequence of byte offsets, one for | ||
each saved document, | ||
* the `docbyoffset(offset)` method, which returns a document | ||
positioned at `offset` bytes within the persistent storage (file). | ||
* metadata if set to true will ensure that serialize will write out article titles to a pickle file. | ||
|
||
Example: | ||
|
||
"""Iterate through the document stream `corpus`. | ||
|
||
Saving the documents to | ||
`fname` and recording byte offset of each document. | ||
|
||
Parameters | ||
---------- | ||
fname : str | ||
Filename | ||
corpus : iterable | ||
Iterable of documents | ||
id2word : dict of (str, str), optional | ||
Transforms id to word (Default value = None) | ||
index_fname : str | ||
Where to save resulting index. Saved to `fname`.index if None. | ||
progress_cnt : int | ||
Number of documents after which progress info is printed | ||
labels : bool | ||
Whether to skip the first column (class labels) | ||
metadata : bool | ||
If True will ensure that serialize will write out | ||
article titles to a pickle file. (Default value = False) | ||
|
||
Examples | ||
-------- | ||
>>> MmCorpus.serialize('test.mm', corpus) | ||
>>> mm = MmCorpus('test.mm') # `mm` document stream now has random access | ||
>>> print(mm[42]) # retrieve document no. 42, etc. | ||
|
@@ -108,8 +126,15 @@ def serialize(serializer, fname, corpus, id2word=None, index_fname=None, | |
|
||
def __len__(self): | ||
""" | ||
Return the index length if the corpus is indexed. Otherwise, make a pass | ||
over self to calculate the corpus length and cache this number. | ||
Return the index length. | ||
|
||
If the corpus is not indexed, also count corpus length and cache this | ||
value. | ||
|
||
Returns | ||
------- | ||
int | ||
|
||
""" | ||
if self.index is not None: | ||
return len(self.index) | ||
|
@@ -119,11 +144,24 @@ def __len__(self): | |
return self.length | ||
|
||
def __getitem__(self, docno): | ||
"""Return certain document. | ||
|
||
Parameters | ||
---------- | ||
docno : int | ||
Document number | ||
|
||
Returns | ||
------- | ||
`utils.SlicedCorpus` | ||
|
||
""" | ||
if self.index is None: | ||
raise RuntimeError("Cannot call corpus[docid] without an index") | ||
if isinstance(docno, (slice, list, numpy.ndarray)): | ||
return utils.SlicedCorpus(self, docno) | ||
elif isinstance(docno, six.integer_types + (numpy.integer,)): | ||
return self.docbyoffset(self.index[docno]) | ||
# TODO: no `docbyoffset` method, should be defined in this class | ||
else: | ||
raise ValueError('Unrecognised value for docno, use either a single integer, a slice or a numpy.ndarray') |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Dot on the end of sentence (everywhere)