Move corpus classes/utilities to gensim.corpora #2970

Open · wants to merge 9 commits into develop
Changes from 7 commits
2 changes: 2 additions & 0 deletions .gitignore
@@ -76,3 +76,5 @@ data
*.inv
*.js
docs/_images/
*.c
*.cpp
1 change: 1 addition & 0 deletions docs/src/apiref.rst
@@ -27,6 +27,7 @@ Modules:
corpora/textcorpus
corpora/ucicorpus
corpora/wikicorpus
corpora/utils
models/ldamodel
models/ldamulticore
models/nmf
9 changes: 9 additions & 0 deletions docs/src/corpora/utils.rst
@@ -0,0 +1,9 @@
:mod:`corpora.utils` -- Various corpus utilities
=================================================

.. automodule:: gensim.corpora.utils
:synopsis: Various corpus utilities
:members:
:inherited-members:
:undoc-members:
:show-inheritance:
282 changes: 282 additions & 0 deletions gensim/corpora/utils.py
@@ -0,0 +1,282 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Author: Gensim Contributors
# Copyright (C) 2020 RaRe Technologies s.r.o.
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

import logging
import os
import itertools
from collections import namedtuple

from gensim import utils

logger = logging.getLogger(__name__)

# Shared capped size, in count of words/tokens, for texts that should not be arbitrarily long
Collaborator:
Note that you may specifically need to use `#:` to start this comment for it to be picked up by Sphinx/autodoc tools as a bit of documentation for the variable assignment. (See: https://stackoverflow.com/a/20227174/130288)

Author:
I have fixed that, thank you.
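A minimal sketch of what the reviewer suggests, assuming the fix in one of the two later commits (this diff shows changes from 7 of the 9) simply changes the comment prefix on the assignment below:

#: Shared capped size, in count of words/tokens, for texts that should not be arbitrarily long
MAX_WORDS = 10000

With the `#:` prefix, Sphinx/autodoc picks the comment up as documentation for `MAX_WORDS`.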

MAX_WORDS = 10000


class LineSentence(object):
def __init__(self, source, max_sentence_length=MAX_WORDS, limit=None):
"""Iterate over a file that contains sentences: one line = one sentence.
Words must be already preprocessed and separated by whitespace.

Parameters
----------
source : str or file-like object
Path to the file on disk, or an already-open file object (must support `seek(0)`).
max_sentence_length : int, optional
Maximum number of tokens in a single yielded sentence; longer lines are split into chunks of at most this many tokens.
limit : int or None
Clip the file to the first `limit` lines. Do no clipping if `limit is None` (the default).

Examples
--------
.. sourcecode:: pycon

>>> from gensim.test.utils import datapath
>>> from gensim.corpora.utils import LineSentence
>>> sentences = LineSentence(datapath('lee_background.cor'))
>>> for sentence in sentences:
... pass

"""
self.source = source
self.max_sentence_length = max_sentence_length
self.limit = limit

def __iter__(self):
"""Iterate through the lines in the source."""
try:
# Assume it is a file-like object and try treating it as such
# Things that don't have seek will trigger an exception
self.source.seek(0)
for line in itertools.islice(self.source, self.limit):
line = utils.to_unicode(line).split()
i = 0
while i < len(line):
yield line[i: i + self.max_sentence_length]
i += self.max_sentence_length
except AttributeError:
# If it didn't work like a file, use it as a string filename
with utils.open(self.source, 'rb') as fin:
for line in itertools.islice(fin, self.limit):
line = utils.to_unicode(line).split()
i = 0
while i < len(line):
yield line[i: i + self.max_sentence_length]
i += self.max_sentence_length
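
# A minimal usage sketch, assuming a whitespace-tokenized file at a
# hypothetical path 'corpus.txt'; lines longer than `max_sentence_length`
# are yielded in consecutive chunks of at most that many tokens:
#
#   >>> sentences = LineSentence('corpus.txt', max_sentence_length=5)
#   >>> all(len(sentence) <= 5 for sentence in sentences)
#   True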


class PathLineSentences(object):
def __init__(self, source, max_sentence_length=MAX_WORDS, limit=None):
"""Like :class:`~gensim.corpora.utils.LineSentence`, but process all files in a directory
in alphabetical order by filename.

The directory must only contain files that can be read by :class:`gensim.corpora.utils.LineSentence`:
.bz2, .gz, and text files. Any file not ending with .bz2 or .gz is assumed to be a text file.

The format of files (either text, or compressed text files) in the path is one sentence = one line,
with words already preprocessed and separated by whitespace.

Warnings
--------
Does **not recurse** into subdirectories.

Parameters
----------
source : str
Path to the directory.
max_sentence_length : int, optional
Maximum number of tokens in a single yielded sentence; longer lines are split into chunks of at most this many tokens.
limit : int or None
Read only the first `limit` lines from each file. Read all if `limit` is None (the default).

"""
self.source = source
self.max_sentence_length = max_sentence_length
self.limit = limit

if os.path.isfile(self.source):
logger.debug('single file given as source, rather than a directory of files')
logger.debug('consider using gensim.corpora.utils.LineSentence for a single file')
self.input_files = [self.source] # force code compatibility with list of files
elif os.path.isdir(self.source):
self.source = os.path.join(self.source, '') # ensures os-specific slash at end of path
logger.info('reading directory %s', self.source)
self.input_files = os.listdir(self.source)
self.input_files = [self.source + filename for filename in self.input_files] # make full paths
self.input_files.sort() # makes sure it happens in filename order
else: # not a file or a directory, then we can't do anything with it
raise ValueError('input is neither a file nor a directory')
logger.info('files read into PathLineSentences: %s', '\n'.join(self.input_files))

def __iter__(self):
"""iterate through the files"""
for file_name in self.input_files:
logger.info('reading file %s', file_name)
with utils.open(file_name, 'rb') as fin:
for line in itertools.islice(fin, self.limit):
line = utils.to_unicode(line).split()
i = 0
while i < len(line):
yield line[i:i + self.max_sentence_length]
i += self.max_sentence_length
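
# A minimal usage sketch, assuming a hypothetical flat directory
# 'corpus_dir/' of .txt, .gz or .bz2 files with one sentence per line;
# files are consumed in alphabetical filename order:
#
#   >>> sentences = PathLineSentences('corpus_dir')
#   >>> for sentence in sentences:
#   ...     pass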


class Text8Corpus(object):
def __init__(self, fname, max_sentence_length=MAX_WORDS):
"""Iterate over sentences from the "text8" corpus, unzipped from http://mattmahoney.net/dc/text8.zip."""
self.fname = fname
self.max_sentence_length = max_sentence_length

def __iter__(self):
# the entire corpus is one gigantic line -- there are no sentence marks at all,
# so just split the sequence of tokens arbitrarily: 1 sentence = `max_sentence_length` tokens
sentence, rest = [], b''
with utils.open(self.fname, 'rb') as fin:
while True:
text = rest + fin.read(8192) # avoid loading the entire file (=1 line) into RAM
if text == rest: # EOF
words = utils.to_unicode(text).split()
sentence.extend(words) # return the last chunk of words, too (may be shorter/longer)
if sentence:
yield sentence
break
last_token = text.rfind(b' ') # last token may have been split in two... keep for next iteration
words, rest = (utils.to_unicode(text[:last_token]).split(),
text[last_token:].strip()) if last_token >= 0 else ([], text)
sentence.extend(words)
while len(sentence) >= self.max_sentence_length:
yield sentence[:self.max_sentence_length]
sentence = sentence[self.max_sentence_length:]
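
# A minimal usage sketch, assuming 'text8' has been downloaded and
# unzipped from http://mattmahoney.net/dc/text8.zip; the single giant
# line is streamed in 8 KB blocks and re-chunked into pseudo-sentences:
#
#   >>> sentences = Text8Corpus('text8', max_sentence_length=1000)
#   >>> len(next(iter(sentences))) <= 1000
#   True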


class TaggedDocument(namedtuple('TaggedDocument', 'words tags')):
"""Represents a document along with a tag, input document format for :class:`~gensim.models.doc2vec.Doc2Vec`.

A single document, made up of `words` (a list of unicode string tokens) and `tags` (a list of tokens).
Tags may be one or more unicode string tokens, but typical practice (which will also be the most memory-efficient)
is for the tags list to include a unique integer id as the only tag.

Replaces "sentence as a list of words" from :class:`gensim.models.word2vec.Word2Vec`.

"""
def __str__(self):
"""Human readable representation of the object's state, used for debugging.

Returns
-------
str
Human readable representation of the object's state (words and tags).

"""
return '%s(%s, %s)' % (self.__class__.__name__, self.words, self.tags)
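
# A minimal usage sketch: a document of tokenized words with the single
# unique integer tag the docstring recommends:
#
#   >>> doc = TaggedDocument(words=['machine', 'learning'], tags=[0])
#   >>> print(doc)
#   TaggedDocument(['machine', 'learning'], [0])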


class TaggedLineDocument(object):
def __init__(self, source):
"""Iterate over a file that contains documents: one line = :class:`~gensim.corpora.utils.TaggedDocument` object.

Words are expected to be already preprocessed and separated by whitespace. Document tags are constructed
automatically from the document line number (each document gets a unique integer tag).

Parameters
----------
source : string or a file-like object
Path to the file on disk, or an already-open file object (must support `seek(0)`).

Examples
--------
.. sourcecode:: pycon

>>> from gensim.test.utils import datapath
>>> from gensim.corpora.utils import TaggedLineDocument
>>>
>>> for document in TaggedLineDocument(datapath("head500.noblanks.cor")):
... pass

"""
self.source = source

def __iter__(self):
"""Iterate through the lines in the source.

Yields
------
:class:`~gensim.corpora.utils.TaggedDocument`
Document from `source` specified in the constructor.

"""
try:
# Assume it is a file-like object and try treating it as such
# Things that don't have seek will trigger an exception
self.source.seek(0)
for item_no, line in enumerate(self.source):
yield TaggedDocument(utils.to_unicode(line).split(), [item_no])
except AttributeError:
# If it didn't work like a file, use it as a string filename
with utils.open(self.source, 'rb') as fin:
for item_no, line in enumerate(fin):
yield TaggedDocument(utils.to_unicode(line).split(), [item_no])
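
# A minimal usage sketch, assuming a hypothetical pre-tokenized file
# 'docs.txt'; each document's tag is its 0-based line number:
#
#   >>> docs = list(TaggedLineDocument('docs.txt'))
#   >>> docs[0].tags
#   [0]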


class BrownCorpus(object):
def __init__(self, dirname):
"""Iterate over sentences from the `Brown corpus <https://en.wikipedia.org/wiki/Brown_Corpus>`_
(part of `NLTK data <https://www.nltk.org/data.html>`_).

"""
self.dirname = dirname

def __iter__(self):
for fname in os.listdir(self.dirname):
fname = os.path.join(self.dirname, fname)
if not os.path.isfile(fname):
continue
with utils.open(fname, 'rb') as fin:
for line in fin:
line = utils.to_unicode(line)
# each file line is a single sentence in the Brown corpus
# each token is WORD/POS_TAG
token_tags = [t.split('/') for t in line.split() if len(t.split('/')) == 2]
# ignore words with non-alphabetic tags like ",", "!" etc (punctuation, weird stuff)
words = ["%s/%s" % (token.lower(), tag[:2]) for token, tag in token_tags if tag[:2].isalpha()]
if not words: # don't bother sending out empty sentences
continue
yield words
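
# A minimal usage sketch, assuming NLTK's Brown corpus files sit in a
# hypothetical 'brown/' directory; each yielded token is a lower-cased
# word joined with the first two characters of its POS tag:
#
#   >>> for sentence in BrownCorpus('brown'):
#   ...     pass  # e.g. ['the/at', 'fulton/np', 'county/nn', ...]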


class TaggedBrownCorpus(object):
def __init__(self, dirname):
"""Reader for the `Brown corpus (part of NLTK data) <http://www.nltk.org/book/ch02.html#tab-brown-sources>`_.

Parameters
----------
dirname : str
Path to folder with Brown corpus.

"""
self.dirname = dirname

def __iter__(self):
"""Iterate through the corpus.

Yields
------
:class:`~gensim.corpora.utils.TaggedDocument`
Document from `source`.

"""
for fname in os.listdir(self.dirname):
fname = os.path.join(self.dirname, fname)
if not os.path.isfile(fname):
continue
with utils.open(fname, 'rb') as fin:
for item_no, line in enumerate(fin):
line = utils.to_unicode(line)
# each file line is a single document in the Brown corpus
# each token is WORD/POS_TAG
token_tags = [t.split('/') for t in line.split() if len(t.split('/')) == 2]
# ignore words with non-alphabetic tags like ",", "!" etc (punctuation, weird stuff)
words = ["%s/%s" % (token.lower(), tag[:2]) for token, tag in token_tags if tag[:2].isalpha()]
if not words: # don't bother sending out empty documents
continue
yield TaggedDocument(words, ['%s_SENT_%s' % (fname, item_no)])
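
# A minimal usage sketch, assuming the same hypothetical 'brown/'
# directory; each line becomes a TaggedDocument tagged with its source
# file name and line number:
#
#   >>> for doc in TaggedBrownCorpus('brown'):
#   ...     pass  # doc.tags is e.g. ['brown/ca01_SENT_0']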