piskvorky · menshikh-iv · Jan 22, 2018 · Sep 30, 2017 · Oct 2, 2017 · Oct 2, 2017
diff --git a/docs/src/conf.py b/docs/src/conf.py
@@ -25,7 +25,12 @@
 
 # Add any Sphinx extension module names here, as strings. They can be extensions
 # coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
-extensions = ['sphinx.ext.autodoc', 'sphinxcontrib.napoleon']
+extensions = ['sphinx.ext.autodoc', 'sphinx.ext.napoleon']
+
+# napoleon_google_docstring = False
+# napoleon_use_param = False
+# napoleon_use_ivar = True
+
 autoclass_content = "both"
 
 # Add any paths that contain templates here, relative to this directory.

diff --git a/gensim/corpora/__init__.py b/gensim/corpora/__init__.py
@@ -1,5 +1,5 @@
 """
-This package contains implementations of various streaming corpus I/O format.
+This package contains implementations of various streaming corpus I/O formats.
 """
 
 # bring corpus classes directly into package namespace, to save some typing

diff --git a/gensim/corpora/bleicorpus.py b/gensim/corpora/bleicorpus.py
@@ -5,9 +5,7 @@
 # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
 
 
-"""
-Blei's LDA-C format.
-"""
+"""Blei's LDA-C format."""
 
 from __future__ import with_statement
 
@@ -23,8 +21,7 @@
 
 
 class BleiCorpus(IndexedCorpus):
-    """
-    Corpus in Blei's LDA-C format.
+    """Corpus in Blei's LDA-C format.
 
     The corpus is represented as two files: one describing the documents, and another
     describing the mapping between words and their ids.
@@ -35,14 +32,25 @@ class BleiCorpus(IndexedCorpus):
 
     The vocabulary is a file with words, one word per line; word at line K has an
     implicit ``id=K``.
+
     """
 
     def __init__(self, fname, fname_vocab=None):
         """
         Initialize the corpus from a file.
 
-        `fname_vocab` is the file with vocabulary; if not specified, it defaults to
-        `fname.vocab`.
+        Parameters
+        ----------
+        fname : str
+            Serialized corpus's filename
+        fname_vocab : str or None, optional
+            Vocabulary file; if not specified, it defaults to `fname.vocab`
+
+        Raises
+        ------
+        IOError
+            If vocabulary file doesn't exist
+
         """
         IndexedCorpus.__init__(self, fname)
         logger.info("loading corpus from %s", fname)
@@ -67,16 +75,27 @@ def __init__(self, fname, fname_vocab=None):
         self.id2word = dict(enumerate(words))
 
     def __iter__(self):
-        """
-        Iterate over the corpus, returning one sparse vector at a time.
-        """
+        """Iterate over the corpus, returning one sparse vector at a time."""
         lineno = -1
         with utils.smart_open(self.fname) as fin:
             for lineno, line in enumerate(fin):
                 yield self.line2doc(line)
         self.length = lineno + 1
 
     def line2doc(self, line):
+        """Convert line to document.
+
+        Parameters
+        ----------
+        line : str
+            Document's string representation
+
+        Returns
+        -------
+        list of (int, float)
+            document's list representation
+
+        """
         parts = utils.to_unicode(line).split()
         if int(parts[0]) != len(parts) - 1:
             raise ValueError("invalid format in %s: %s" % (self.fname, repr(line)))
@@ -86,14 +105,26 @@ def line2doc(self, line):
 
     @staticmethod
     def save_corpus(fname, corpus, id2word=None, metadata=False):
-        """
-        Save a corpus in the LDA-C format.
+        """Save a corpus in the LDA-C format.
 
         There are actually two files saved: `fname` and `fname.vocab`, where
         `fname.vocab` is the vocabulary file.
 
-        This function is automatically called by `BleiCorpus.serialize`; don't
-        call it directly, call `serialize` instead.
+        Parameters
+        ----------
+        fname : str
+            Filename
+        corpus : iterable
+            Iterable of documents
+        id2word : dict of (int, str), optional
+            Transforms id to word (Default value = None)
+        metadata : bool
+            Any additional info (Default value = False)
+
+        Returns
+        -------
+        list of int
+
         """
         if id2word is None:
             logger.info("no word id mapping provided; initializing from corpus")
@@ -121,8 +152,17 @@ def save_corpus(fname, corpus, id2word=None, metadata=False):
         return offsets
 
     def docbyoffset(self, offset):
-        """
-        Return the document stored at file position `offset`.
+        """Return document corresponding to `offset`.
+
+        Parameters
+        ----------
+        offset : int
+            Position of the document in the file
+
+        Returns
+        -------
+        list of (int, float)
+
         """
         with utils.smart_open(self.fname) as f:
             f.seek(offset)

diff --git a/gensim/corpora/csvcorpus.py b/gensim/corpora/csvcorpus.py
@@ -4,10 +4,7 @@
 # Copyright (C) 2013 Zygmunt Zając <[email protected]>
 # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
 
-"""
-Corpus in CSV format.
-
-"""
+"""Corpus in CSV format."""
 
 
 from __future__ import with_statement
@@ -22,18 +19,24 @@
 
 
 class CsvCorpus(interfaces.CorpusABC):
-    """
-    Corpus in CSV format. The CSV delimiter, headers etc. are guessed automatically
-    based on the file content.
+    """Corpus in CSV format.
+
+    The CSV delimiter, headers etc. are guessed automatically based on the
+    file content.
 
     All row values are expected to be ints/floats.
 
     """
 
     def __init__(self, fname, labels):
-        """
-        Initialize the corpus from a file.
-        `labels` = are class labels present in the input file? => skip the first column
+        """Initialize the corpus from a file.
+
+        Parameters
+        ----------
+        fname : str
+            Filename
+        labels : bool
+            If True, the first column holds class labels and is skipped
 
         """
         logger.info("loading corpus from %s", fname)
@@ -48,8 +51,11 @@ def __init__(self, fname, labels):
         logger.info("sniffed CSV delimiter=%r, headers=%s", self.dialect.delimiter, self.headers)
 
     def __iter__(self):
-        """
-        Iterate over the corpus, returning one sparse vector at a time.
+        """Iterate over the corpus, returning one sparse vector at a time.
+
+        Yields
+        ------
+        list of (int, float)
 
         """
         reader = csv.reader(utils.smart_open(self.fname), self.dialect)

diff --git a/gensim/corpora/indexedcorpus.py b/gensim/corpora/indexedcorpus.py
@@ -5,17 +5,7 @@
 # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
 
 
-"""
-Indexed corpus is a mechanism for random-accessing corpora.
-
-While the standard corpus interface in gensim allows iterating over corpus with
-`for doc in corpus: pass`, indexed corpus allows accessing the documents with
-`corpus[docno]` (in O(1) look-up time).
-
-This functionality is achieved by storing an extra file (by default named the same
-as the corpus file plus '.index' suffix) that stores the byte offset of the beginning
-of each document.
-"""
+"""Base Indexed Corpus class."""
 
 import logging
 import six
@@ -28,20 +18,39 @@
 
 
 class IndexedCorpus(interfaces.CorpusABC):
+    """Indexed corpus is a mechanism for random-accessing corpora.
+
+    While the standard corpus interface in gensim allows iterating over
+    corpus with `for doc in corpus: pass`, indexed corpus allows accessing
+    the documents with `corpus[docno]` (in O(1) look-up time).
+
+    Notes
+    -----
+    This functionality is achieved by storing an extra file (by default
+    named the same as the corpus file plus '.index' suffix) that stores
+    the byte offset of the beginning of each document.
+
+    """
+
     def __init__(self, fname, index_fname=None):
-        """
-        Initialize this abstract base class, by loading a previously saved index
-        from `index_fname` (or `fname.index` if `index_fname` is not set).
-        This index will allow subclasses to support the `corpus[docno]` syntax
-        (random access to document #`docno` in O(1)).
+        """Initialize the corpus.
+
+        Parameters
+        ----------
+        fname : string
+            Corpus filename
+        index_fname : string or None
+            Index filename, or None for loading `fname`.index
 
+        Examples
+        --------
         >>> # save corpus in SvmLightCorpus format with an index
-        >>> corpus = [[(1, 0.5)], [(0, 1.0), (1, 2.0)]]
-        >>> gensim.corpora.SvmLightCorpus.serialize('testfile.svmlight', corpus)
-        >>> # load back as a document stream (*not* plain Python list)
-        >>> corpus_with_random_access = gensim.corpora.SvmLightCorpus('tstfile.svmlight')
-        >>> print(corpus_with_random_access[1])
-        [(0, 1.0), (1, 2.0)]
+        >>> corpus = [[(1, 0.5)], [(0, 1.0), (1, 2.0)]]
+        >>> gensim.corpora.SvmLightCorpus.serialize('testfile.svmlight', corpus)
+        >>> # load back as a document stream (*not* plain Python list)
+        >>> corpus_with_random_access = gensim.corpora.SvmLightCorpus('testfile.svmlight')
+        >>> print(corpus_with_random_access[1])
+        [(0, 1.0), (1, 2.0)]
 
         """
         try:
@@ -58,22 +67,31 @@ def __init__(self, fname, index_fname=None):
     @classmethod
     def serialize(serializer, fname, corpus, id2word=None, index_fname=None,
                   progress_cnt=None, labels=None, metadata=False):
-        """
-        Iterate through the document stream `corpus`, saving the documents to `fname`
-        and recording byte offset of each document. Save the resulting index
-        structure to file `index_fname` (or `fname`.index is not set).
-
-        This relies on the underlying corpus class `serializer` providing (in
-        addition to standard iteration):
-
-        * `save_corpus` method that returns a sequence of byte offsets, one for
-           each saved document,
-        * the `docbyoffset(offset)` method, which returns a document
-          positioned at `offset` bytes within the persistent storage (file).
-        * metadata if set to true will ensure that serialize will write out article titles to a pickle file.
-
-        Example:
-
+        """Serialize `corpus` to `fname`, building an index along the way.
+
+        Iterate through the document stream `corpus`, saving the documents to
+        `fname` and recording the byte offset of each document.
+
+        Parameters
+        ----------
+        fname : str
+            Filename
+        corpus : iterable
+            Iterable of documents
+        id2word : dict of (int, str), optional
+            Transforms id to word (Default value = None)
+        index_fname : str or None, optional
+            Where to save resulting index. Saved to `fname`.index if None.
+        progress_cnt : int, optional
+            Number of documents after which progress info is printed
+        labels : bool, optional
+            Whether to skip the first column (class labels)
+        metadata : bool
+            If True will ensure that serialize will write out
+            article titles to a pickle file. (Default value = False)
+
+        Examples
+        --------
         >>> MmCorpus.serialize('test.mm', corpus)
         >>> mm = MmCorpus('test.mm') # `mm` document stream now has random access
         >>> print(mm[42]) # retrieve document no. 42, etc.
@@ -108,8 +126,15 @@ def serialize(serializer, fname, corpus, id2word=None, index_fname=None,
 
     def __len__(self):
         """
-        Return the index length if the corpus is indexed. Otherwise, make a pass
-        over self to calculate the corpus length and cache this number.
+        Return the index length.
+
+        If the corpus is not indexed, make a pass over the corpus to calculate
+        its length and cache this value.
+
+        Returns
+        -------
+        int
+
         """
         if self.index is not None:
             return len(self.index)
@@ -119,11 +144,24 @@ def __len__(self):
         return self.length
 
     def __getitem__(self, docno):
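+        """Return the document(s) at position(s) `docno`.
+
+        Parameters
+        ----------
+        docno : int, slice, list or numpy.ndarray
+            Document number, or a selection of document numbers
+
+        Returns
+        -------
+        document (for an int `docno`) or `utils.SlicedCorpus` (otherwise)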
+
+        """
         if self.index is None:
             raise RuntimeError("Cannot call corpus[docid] without an index")
         if isinstance(docno, (slice, list, numpy.ndarray)):
             return utils.SlicedCorpus(self, docno)
         elif isinstance(docno, six.integer_types + (numpy.integer,)):
             return self.docbyoffset(self.index[docno])
+            # TODO: `docbyoffset` is not defined in this class; it should be declared here
         else:
             raise ValueError('Unrecognised value for docno, use either a single integer, a slice or a numpy.ndarray')