Skip to content

Commit

Permalink
Fixes piskvorky#1869: MmCorpus file-like object behaviour fixed
Browse files Browse the repository at this point in the history
  • Loading branch information
sj29-innovate committed Feb 16, 2018
1 parent 6a4e986 commit 6083ca6
Showing 1 changed file with 33 additions and 34 deletions.
67 changes: 33 additions & 34 deletions gensim/matutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@

from six import iteritems, itervalues, string_types
from six.moves import xrange, zip as izip

from contextlib import contextmanager

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -1328,10 +1328,23 @@ class MmReader(object):
This allows us to process corpora which are larger than the available RAM.
"""
@contextmanager
def open_file(self, input):
    """Provide 'with'-like access to `input` without closing a caller-supplied file object.

    The resource is obtained through `utils.file_or_filename(input)`. It is released
    here only when `input` is a filename (i.e. this method opened it); an already
    opened file-like object passed in by the caller is left open.

    Parameters
    ----------
    input : {str, file-like object}
        Path to a file, or an already opened file-like object.

    Yields
    ------
    object
        The object returned by `utils.file_or_filename(input)`.

    """
    mgr = utils.file_or_filename(input)
    # Only a resource opened here (from a filename) may be closed here.
    opened_here = isinstance(input, string_types)
    failed = False
    try:
        yield mgr
    except Exception:
        # `Exception` rather than the Python 2-only `StandardError`, so that the
        # handler also works on Python 3.
        failed = True
        # Look `__exit__` up on the type, as the 'with' statement does; a bare
        # `exit` would resolve to the interpreter's built-in `exit`.
        # Re-raise unless the manager explicitly suppressed the error.
        if not opened_here or not type(mgr).__exit__(mgr, *sys.exc_info()):
            raise
    finally:
        # Normal completion: release the resource if it was opened here.
        if not failed and opened_here:
            type(mgr).__exit__(mgr, None, None, None)

def __init__(self, input, transposed=True):
"""
Parameters
----------
input : {str, file-like object}
Expand All @@ -1344,39 +1357,25 @@ def __init__(self, input, transposed=True):
logger.info("initializing corpus reader from %s", input)
self.input, self.transposed = input, transposed

# 'with' statement behaviour without closing the file object
mgr = (utils.file_or_filename(self.input))
exit = type(mgr).__exit__
value = type(mgr).__enter__(mgr)
exc = True
try:
with self.open_file(self.input) as lines:
try:
lines = value
try:
header = utils.to_unicode(next(lines)).strip()
if not header.lower().startswith('%%matrixmarket matrix coordinate real general'):
raise ValueError(
"File %s not in Matrix Market format with coordinate real general; instead found: \n%s" %
(self.input, header)
)
except StopIteration:
pass

self.num_docs = self.num_terms = self.num_nnz = 0
for lineno, line in enumerate(lines):
line = utils.to_unicode(line)
if not line.startswith('%'):
self.num_docs, self.num_terms, self.num_nnz = (int(x) for x in line.split())
if not self.transposed:
self.num_docs, self.num_terms = self.num_terms, self.num_docs
break
except RuntimeError:
exc = False
if not exit(mgr, *sys.exc_info()):
raise
finally:
if exc and isinstance(self.input, string_types):
exit(mgr, None, None, None)
header = utils.to_unicode(next(lines)).strip()
if not header.lower().startswith('%%matrixmarket matrix coordinate real general'):
raise ValueError(
"File %s not in Matrix Market format with coordinate real general; instead found: \n%s" %
(self.input, header)
)
except StopIteration:
pass

self.num_docs = self.num_terms = self.num_nnz = 0
for lineno, line in enumerate(lines):
line = utils.to_unicode(line)
if not line.startswith('%'):
self.num_docs, self.num_terms, self.num_nnz = (int(x) for x in line.split())
if not self.transposed:
self.num_docs, self.num_terms = self.num_terms, self.num_docs
break

logger.info(
"accepted corpus with %i documents, %i features, %i non-zero entries",
Expand Down

0 comments on commit 6083ca6

Please sign in to comment.