Move corpus classes/utilities to gensim.corpora #2970

Open · wants to merge 9 commits into develop
Changes from 7 commits
2 changes: 2 additions & 0 deletions .gitignore
@@ -76,3 +76,5 @@ data
*.inv
*.js
docs/_images/
*.c
*.cpp
1 change: 1 addition & 0 deletions docs/src/apiref.rst
@@ -27,6 +27,7 @@ Modules:
corpora/textcorpus
corpora/ucicorpus
corpora/wikicorpus
corpora/utils
models/ldamodel
models/ldamulticore
models/nmf
9 changes: 9 additions & 0 deletions docs/src/corpora/utils.rst
@@ -0,0 +1,9 @@
:mod:`corpora.utils` -- Various corpus utilities
=================================================

.. automodule:: gensim.corpora.utils
:synopsis: Various corpus utilities
:members:
:inherited-members:
:undoc-members:
:show-inheritance:
282 changes: 282 additions & 0 deletions gensim/corpora/utils.py
@@ -0,0 +1,282 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Author: Gensim Contributors
# Copyright (C) 2020 RaRe Technologies s.r.o.
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

import logging
import os
import itertools
from collections import namedtuple

from gensim import utils

logger = logging.getLogger(__name__)

# Shared capped size, in count of words/tokens, for texts that should not be arbitrarily long
Collaborator:
Note that you may specifically need to use `#:` to start this comment for it to be picked up by Sphinx/autodoc tools as a bit of documentation for the variable assignment. (See: https://stackoverflow.com/a/20227174/130288)

Author:
I have fixed that, thank you.
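A minimal sketch of what the reviewer suggests, assuming the fix in one of the two later commits (this diff shows changes from 7 of the 9) simply changes the comment prefix on the assignment below:

#: Shared capped size, in count of words/tokens, for texts that should not be arbitrarily long
MAX_WORDS = 10000

With the `#:` prefix, Sphinx/autodoc picks the comment up as documentation for `MAX_WORDS`.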

MAX_WORDS = 10000


class LineSentence(object):
def __init__(self, source, max_sentence_length=MAX_WORDS, limit=None):
"""Iterate over a file that contains sentences: one line = one sentence.
Words must be already preprocessed and separated by whitespace.

Parameters
----------
source : str or file-like object
Path to the file on disk, or an already-open file object (must support `seek(0)`).
max_sentence_length : int, optional
Maximum number of tokens in a single yielded sentence; longer lines are split into chunks of at most this many tokens.
limit : int or None
Clip the file to the first `limit` lines. Do no clipping if `limit is None` (the default).

Examples
--------
.. sourcecode:: pycon

>>> from gensim.test.utils import datapath
>>> from gensim.corpora.utils import LineSentence
>>> sentences = LineSentence(datapath('lee_background.cor'))
>>> for sentence in sentences:
... pass

"""
self.source = source
self.max_sentence_length = max_sentence_length
self.limit = limit

def __iter__(self):
"""Iterate through the lines in the source."""
try:
# Assume it is a file-like object and try treating it as such
# Things that don't have seek will trigger an exception
self.source.seek(0)
for line in itertools.islice(self.source, self.limit):
line = utils.to_unicode(line).split()
i = 0
while i < len(line):
yield line[i: i + self.max_sentence_length]
i += self.max_sentence_length
except AttributeError:
# If it didn't work like a file, use it as a string filename
with utils.open(self.source, 'rb') as fin:
for line in itertools.islice(fin, self.limit):
line = utils.to_unicode(line).split()
i = 0
while i < len(line):
yield line[i: i + self.max_sentence_length]
i += self.max_sentence_length
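
# A minimal usage sketch, assuming a whitespace-tokenized file at a
# hypothetical path 'corpus.txt'; lines longer than `max_sentence_length`
# are yielded in consecutive chunks of at most that many tokens:
#
#   >>> sentences = LineSentence('corpus.txt', max_sentence_length=5)
#   >>> all(len(sentence) <= 5 for sentence in sentences)
#   True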


class PathLineSentences(object):
def __init__(self, source, max_sentence_length=MAX_WORDS, limit=None):
"""Like :class:`~gensim.corpora.utils.LineSentence`, but process all files in a directory
in alphabetical order by filename.

The directory must only contain files that can be read by :class:`gensim.corpora.utils.LineSentence`:
.bz2, .gz, and text files. Any file not ending with .bz2 or .gz is assumed to be a text file.

The format of files (either text, or compressed text files) in the path is one sentence = one line,
with words already preprocessed and separated by whitespace.

Warnings
--------
Does **not recurse** into subdirectories.

Parameters
----------
source : str
Path to the directory.
max_sentence_length : int, optional
Maximum number of tokens in a single yielded sentence; longer lines are split into chunks of at most this many tokens.
limit : int or None
Read only the first `limit` lines from each file. Read all if `limit` is None (the default).

"""
self.source = source
self.max_sentence_length = max_sentence_length
self.limit = limit

if os.path.isfile(self.source):
logger.debug('single file given as source, rather than a directory of files')
logger.debug('consider using gensim.corpora.utils.LineSentence for a single file')
self.input_files = [self.source] # force code compatibility with list of files
elif os.path.isdir(self.source):
self.source = os.path.join(self.source, '') # ensures os-specific slash at end of path
logger.info('reading directory %s', self.source)
self.input_files = os.listdir(self.source)
self.input_files = [self.source + filename for filename in self.input_files] # make full paths
self.input_files.sort() # makes sure it happens in filename order
else: # not a file or a directory, then we can't do anything with it
raise ValueError('input is neither a file nor a directory')
logger.info('files read into PathLineSentences: %s', '\n'.join(self.input_files))

def __iter__(self):
"""iterate through the files"""
for file_name in self.input_files:
logger.info('reading file %s', file_name)
with utils.open(file_name, 'rb') as fin:
for line in itertools.islice(fin, self.limit):
line = utils.to_unicode(line).split()
i = 0
while i < len(line):
yield line[i:i + self.max_sentence_length]
i += self.max_sentence_length
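
# A minimal usage sketch, assuming a hypothetical flat directory
# 'corpus_dir/' of .txt, .gz or .bz2 files with one sentence per line;
# files are consumed in alphabetical filename order:
#
#   >>> sentences = PathLineSentences('corpus_dir')
#   >>> for sentence in sentences:
#   ...     pass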


class Text8Corpus(object):
def __init__(self, fname, max_sentence_length=MAX_WORDS):
"""Iterate over sentences from the "text8" corpus, unzipped from http://mattmahoney.net/dc/text8.zip."""
self.fname = fname
self.max_sentence_length = max_sentence_length

def __iter__(self):
# the entire corpus is one gigantic line -- there are no sentence marks at all,
# so just split the sequence of tokens arbitrarily: 1 sentence = `max_sentence_length` tokens
sentence, rest = [], b''
with utils.open(self.fname, 'rb') as fin:
while True:
text = rest + fin.read(8192) # avoid loading the entire file (=1 line) into RAM
if text == rest: # EOF
words = utils.to_unicode(text).split()
sentence.extend(words) # return the last chunk of words, too (may be shorter/longer)
if sentence:
yield sentence
break
last_token = text.rfind(b' ') # last token may have been split in two... keep for next iteration
words, rest = (utils.to_unicode(text[:last_token]).split(),
text[last_token:].strip()) if last_token >= 0 else ([], text)
sentence.extend(words)
while len(sentence) >= self.max_sentence_length:
yield sentence[:self.max_sentence_length]
sentence = sentence[self.max_sentence_length:]
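
# A minimal usage sketch, assuming 'text8' has been downloaded and
# unzipped from http://mattmahoney.net/dc/text8.zip; the single giant
# line is streamed in 8 KB blocks and re-chunked into pseudo-sentences:
#
#   >>> sentences = Text8Corpus('text8', max_sentence_length=1000)
#   >>> len(next(iter(sentences))) <= 1000
#   True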


class TaggedDocument(namedtuple('TaggedDocument', 'words tags')):
"""Represents a document along with a tag, input document format for :class:`~gensim.models.doc2vec.Doc2Vec`.

A single document, made up of `words` (a list of unicode string tokens) and `tags` (a list of tokens).
Tags may be one or more unicode string tokens, but typical practice (which will also be the most memory-efficient)
is for the tags list to include a unique integer id as the only tag.

Replaces "sentence as a list of words" from :class:`gensim.models.word2vec.Word2Vec`.

"""
def __str__(self):
"""Human readable representation of the object's state, used for debugging.

Returns
-------
str
Human readable representation of the object's state (words and tags).

"""
return '%s(%s, %s)' % (self.__class__.__name__, self.words, self.tags)
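
# A minimal usage sketch: a document of tokenized words with the single
# unique integer tag the docstring recommends:
#
#   >>> doc = TaggedDocument(words=['machine', 'learning'], tags=[0])
#   >>> print(doc)
#   TaggedDocument(['machine', 'learning'], [0])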


class TaggedLineDocument(object):
def __init__(self, source):
"""Iterate over a file that contains documents: one line = :class:`~gensim.corpora.utils.TaggedDocument` object.

Words are expected to be already preprocessed and separated by whitespace. Document tags are constructed
automatically from the document line number (each document gets a unique integer tag).

Parameters
----------
source : string or a file-like object
Path to the file on disk, or an already-open file object (must support `seek(0)`).

Examples
--------
.. sourcecode:: pycon

>>> from gensim.test.utils import datapath
>>> from gensim.corpora.utils import TaggedLineDocument
>>>
>>> for document in TaggedLineDocument(datapath("head500.noblanks.cor")):
... pass

"""
self.source = source

def __iter__(self):
"""Iterate through the lines in the source.

Yields
------
:class:`~gensim.corpora.utils.TaggedDocument`
Document from `source` specified in the constructor.

"""
try:
# Assume it is a file-like object and try treating it as such
# Things that don't have seek will trigger an exception
self.source.seek(0)
for item_no, line in enumerate(self.source):
yield TaggedDocument(utils.to_unicode(line).split(), [item_no])
except AttributeError:
# If it didn't work like a file, use it as a string filename
with utils.open(self.source, 'rb') as fin:
for item_no, line in enumerate(fin):
yield TaggedDocument(utils.to_unicode(line).split(), [item_no])
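
# A minimal usage sketch, assuming a hypothetical pre-tokenized file
# 'docs.txt'; each document's tag is its 0-based line number:
#
#   >>> docs = list(TaggedLineDocument('docs.txt'))
#   >>> docs[0].tags
#   [0]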


class BrownCorpus(object):
def __init__(self, dirname):
"""Iterate over sentences from the `Brown corpus <https://en.wikipedia.org/wiki/Brown_Corpus>`_
(part of `NLTK data <https://www.nltk.org/data.html>`_).

"""
self.dirname = dirname

def __iter__(self):
for fname in os.listdir(self.dirname):
fname = os.path.join(self.dirname, fname)
if not os.path.isfile(fname):
continue
with utils.open(fname, 'rb') as fin:
for line in fin:
line = utils.to_unicode(line)
# each file line is a single sentence in the Brown corpus
# each token is WORD/POS_TAG
token_tags = [t.split('/') for t in line.split() if len(t.split('/')) == 2]
# ignore words with non-alphabetic tags like ",", "!" etc (punctuation, weird stuff)
words = ["%s/%s" % (token.lower(), tag[:2]) for token, tag in token_tags if tag[:2].isalpha()]
if not words: # don't bother sending out empty sentences
continue
yield words
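
# A minimal usage sketch, assuming NLTK's Brown corpus files sit in a
# hypothetical 'brown/' directory; each yielded token is a lower-cased
# word joined with the first two characters of its POS tag:
#
#   >>> for sentence in BrownCorpus('brown'):
#   ...     pass  # e.g. ['the/at', 'fulton/np', 'county/nn', ...]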


class TaggedBrownCorpus(object):
def __init__(self, dirname):
"""Reader for the `Brown corpus (part of NLTK data) <http://www.nltk.org/book/ch02.html#tab-brown-sources>`_.

Parameters
----------
dirname : str
Path to folder with Brown corpus.

"""
self.dirname = dirname

def __iter__(self):
"""Iterate through the corpus.

Yields
------
:class:`~gensim.corpora.utils.TaggedDocument`
Document from `source`.

"""
for fname in os.listdir(self.dirname):
fname = os.path.join(self.dirname, fname)
if not os.path.isfile(fname):
continue
with utils.open(fname, 'rb') as fin:
for item_no, line in enumerate(fin):
line = utils.to_unicode(line)
# each file line is a single document in the Brown corpus
# each token is WORD/POS_TAG
token_tags = [t.split('/') for t in line.split() if len(t.split('/')) == 2]
# ignore words with non-alphabetic tags like ",", "!" etc (punctuation, weird stuff)
words = ["%s/%s" % (token.lower(), tag[:2]) for token, tag in token_tags if tag[:2].isalpha()]
if not words: # don't bother sending out empty documents
continue
yield TaggedDocument(words, ['%s_SENT_%s' % (fname, item_no)])
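
# A minimal usage sketch, assuming the same hypothetical 'brown/'
# directory; each line becomes a TaggedDocument tagged with its source
# file name and line number:
#
#   >>> for doc in TaggedBrownCorpus('brown'):
#   ...     pass  # doc.tags is e.g. ['brown/ca01_SENT_0']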