Fix smart_open deprecation warning globally #2530

Merged · 3 commits · Jul 7, 2019
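The change itself is mechanical: every call to the deprecated `utils.smart_open(fname)` becomes `utils.open(fname, mode)` with an explicit mode (`'rb'` wherever the old default was relied on). For context, here is a minimal sketch of the warning being silenced, assuming smart_open >= 1.8.1 (where the module-level `smart_open()` function was deprecated in favour of `smart_open.open()`); the exact warning text may vary by version:

```python
import warnings

import smart_open

# throwaway local file so both entry points have something to open
with open('example.txt', 'w') as f:
    f.write('hello\n')

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    smart_open.smart_open('example.txt', 'rb').close()  # deprecated entry point

print([str(w.message) for w in caught])  # expect a hint to call smart_open.open instead

smart_open.open('example.txt', 'rb').close()  # replacement entry point: no warning
```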
196 changes: 115 additions & 81 deletions gensim/corpora/_mmreader.c

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion gensim/corpora/_mmreader.pyx
@@ -188,7 +188,7 @@ cdef class MmReader(object):
         if offset == -1:
             return []
         if isinstance(self.input, string_types):
-            fin, close_fin = utils.smart_open(self.input), True
+            fin, close_fin = utils.open(self.input, 'rb'), True
         else:
             fin, close_fin = self.input, False

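The `fin`/`close_fin` pair in this hunk is the usual own-your-handle idiom: the reader closes the stream only when it opened the stream itself. A condensed sketch of the idiom (illustrative only, not the actual method body):

```python
def docbyoffset_sketch(input_, offset):
    """Return the raw line at `offset`, accepting either a path or an open file."""
    if isinstance(input_, str):
        fin, close_fin = open(input_, 'rb'), True   # we opened it, so we close it
    else:
        fin, close_fin = input_, False              # borrowed handle, caller closes
    try:
        fin.seek(offset)
        return fin.readline()
    finally:
        if close_fin:
            fin.close()
```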
10 changes: 5 additions & 5 deletions gensim/corpora/bleicorpus.py
@@ -74,7 +74,7 @@ def __init__(self, fname, fname_vocab=None):
             raise IOError('BleiCorpus: could not find vocabulary file')

         self.fname = fname
-        with utils.smart_open(fname_vocab) as fin:
+        with utils.open(fname_vocab, 'rb') as fin:
             words = [utils.to_unicode(word).rstrip() for word in fin]
         self.id2word = dict(enumerate(words))

@@ -88,7 +88,7 @@ def __iter__(self):

         """
         lineno = -1
-        with utils.smart_open(self.fname) as fin:
+        with utils.open(self.fname, 'rb') as fin:
             for lineno, line in enumerate(fin):
                 yield self.line2doc(line)
         self.length = lineno + 1
@@ -149,7 +149,7 @@ def save_corpus(fname, corpus, id2word=None, metadata=False):
             num_terms = 0

         logger.info("storing corpus in Blei's LDA-C format into %s", fname)
-        with utils.smart_open(fname, 'wb') as fout:
+        with utils.open(fname, 'wb') as fout:
             offsets = []
             for doc in corpus:
                 doc = list(doc)
@@ -160,7 +160,7 @@ def save_corpus(fname, corpus, id2word=None, metadata=False):
         # write out vocabulary, in a format compatible with Blei's topics.py script
         fname_vocab = utils.smart_extension(fname, '.vocab')
         logger.info("saving vocabulary of %i words to %s", num_terms, fname_vocab)
-        with utils.smart_open(fname_vocab, 'wb') as fout:
+        with utils.open(fname_vocab, 'wb') as fout:
             for featureid in range(num_terms):
                 fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---')))

@@ -181,6 +181,6 @@ def docbyoffset(self, offset):
             Document in BoW format.

         """
-        with utils.smart_open(self.fname) as f:
+        with utils.open(self.fname, 'rb') as f:
             f.seek(offset)
             return self.line2doc(f.readline())
27 changes: 15 additions & 12 deletions gensim/corpora/csvcorpus.py
@@ -45,7 +45,9 @@ def __init__(self, fname, labels):
         self.labels = labels

         # load the first few lines, to guess the CSV dialect
-        head = ''.join(itertools.islice(utils.smart_open(self.fname), 5))
+        with utils.open(self.fname, 'rb') as f:
+            head = ''.join(itertools.islice(f, 5))
+
         self.headers = csv.Sniffer().has_header(head)
         self.dialect = csv.Sniffer().sniff(head)
         logger.info("sniffed CSV delimiter=%r, headers=%s", self.dialect.delimiter, self.headers)
@@ -59,14 +61,15 @@ def __iter__(self):
             Document in BoW format.

         """
-        reader = csv.reader(utils.smart_open(self.fname), self.dialect)
-        if self.headers:
-            next(reader)  # skip the headers
-
-        line_no = -1
-        for line_no, line in enumerate(reader):
-            if self.labels:
-                line.pop(0)  # ignore the first column = class label
-            yield list(enumerate(float(x) for x in line))
-
-        self.length = line_no + 1  # store the total number of CSV rows = documents
+        with utils.open(self.fname, 'rb') as f:
+            reader = csv.reader(f, self.dialect)
+            if self.headers:
+                next(reader)  # skip the headers
+
+            line_no = -1
+            for line_no, line in enumerate(reader):
+                if self.labels:
+                    line.pop(0)  # ignore the first column = class label
+                yield list(enumerate(float(x) for x in line))
+
+            self.length = line_no + 1  # store the total number of CSV rows = documents
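Note that this hunk does more than rename the helper: the handle previously passed straight into `csv.reader` was never closed explicitly, while the new `with` block ties its lifetime to the generator. A minimal sketch of that behaviour, using a hypothetical file name (nothing here is CsvCorpus API):

```python
def iter_rows(fname):
    with open(fname, 'rb') as f:  # closed when the generator is exhausted or collected
        for line in f:
            yield line

rows = iter_rows('docs.csv')  # hypothetical path; the file opens on first next(), not here
```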
4 changes: 2 additions & 2 deletions gensim/corpora/dictionary.py
@@ -516,7 +516,7 @@ def save_as_text(self, fname, sort_by_word=True):

         """
         logger.info("saving dictionary mapping to %s", fname)
-        with utils.smart_open(fname, 'wb') as fout:
+        with utils.open(fname, 'wb') as fout:
             numdocs_line = "%d\n" % self.num_docs
             fout.write(utils.to_utf8(numdocs_line))
             if sort_by_word:
@@ -669,7 +669,7 @@ def load_from_text(fname):

         """
         result = Dictionary()
-        with utils.smart_open(fname) as f:
+        with utils.open(fname, 'rb') as f:
             for lineno, line in enumerate(f):
                 line = utils.to_unicode(line)
                 if lineno == 0:
2 changes: 1 addition & 1 deletion gensim/corpora/hashdictionary.py
@@ -341,7 +341,7 @@ def save_as_text(self, fname):

         """
         logger.info("saving %s mapping to %s" % (self, fname))
-        with utils.smart_open(fname, 'wb') as fout:
+        with utils.open(fname, 'wb') as fout:
             for tokenid in self.keys():
                 words = sorted(self[tokenid])
                 if words:
8 changes: 4 additions & 4 deletions gensim/corpora/lowcorpus.py
@@ -131,7 +131,7 @@ def _calculate_num_docs(self):

         """
         # the first line in input data is the number of documents (integer). throws exception on bad input.
-        with utils.smart_open(self.fname) as fin:
+        with utils.open(self.fname, 'rb') as fin:
             try:
                 result = int(next(fin))
             except StopIteration:
@@ -191,7 +191,7 @@ def __iter__(self):
             Document in BoW format.

         """
-        with utils.smart_open(self.fname) as fin:
+        with utils.open(self.fname, 'rb') as fin:
             for lineno, line in enumerate(fin):
                 if lineno > 0:  # ignore the first line = number of documents
                     yield self.line2doc(line)
@@ -231,7 +231,7 @@ def save_corpus(fname, corpus, id2word=None, metadata=False):
         logger.info("storing corpus in List-Of-Words format into %s" % fname)
         truncated = 0
         offsets = []
-        with utils.smart_open(fname, 'wb') as fout:
+        with utils.open(fname, 'wb') as fout:
             fout.write(utils.to_utf8('%i\n' % len(corpus)))
             for doc in corpus:
                 words = []
@@ -277,7 +277,7 @@ def docbyoffset(self, offset):
         [(0, 1), (3, 1), (4, 1)]

         """
-        with utils.smart_open(self.fname) as f:
+        with utils.open(self.fname, 'rb') as f:
             f.seek(offset)
             return self.line2doc(f.readline())

8 changes: 4 additions & 4 deletions gensim/corpora/malletcorpus.py
@@ -83,7 +83,7 @@ def _calculate_num_docs(self):
             Number of documents in file.

         """
-        with utils.smart_open(self.fname) as fin:
+        with utils.open(self.fname, 'rb') as fin:
             result = sum(1 for _ in fin)
         return result

@@ -96,7 +96,7 @@ def __iter__(self):
             Document in BoW format (+"document_id" and "lang" if metadata=True).

         """
-        with utils.smart_open(self.fname) as f:
+        with utils.open(self.fname, 'rb') as f:
             for line in f:
                 yield self.line2doc(line)

@@ -180,7 +180,7 @@ def save_corpus(fname, corpus, id2word=None, metadata=False):

         truncated = 0
         offsets = []
-        with utils.smart_open(fname, 'wb') as fout:
+        with utils.open(fname, 'wb') as fout:
             for doc_id, doc in enumerate(corpus):
                 if metadata:
                     doc_id, doc_lang = doc[1]
@@ -231,6 +231,6 @@ def docbyoffset(self, offset):
         [(4, 1)]

         """
-        with utils.smart_open(self.fname) as f:
+        with utils.open(self.fname, 'rb') as f:
             f.seek(offset)
             return self.line2doc(f.readline())
6 changes: 3 additions & 3 deletions gensim/corpora/svmlightcorpus.py
@@ -74,7 +74,7 @@ def __iter__(self):
         """
         lineno = -1
         self.labels = []
-        with utils.smart_open(self.fname) as fin:
+        with utils.open(self.fname, 'rb') as fin:
             for lineno, line in enumerate(fin):
                 doc = self.line2doc(line)
                 if doc is not None:
@@ -115,7 +115,7 @@ def save_corpus(fname, corpus, id2word=None, labels=False, metadata=False):
             # Cast any sequence (incl. a numpy array) to a list, to simplify the processing below.
             labels = list(labels)
         offsets = []
-        with utils.smart_open(fname, 'wb') as fout:
+        with utils.open(fname, 'wb') as fout:
             for docno, doc in enumerate(corpus):
                 label = labels[docno] if labels else 0  # target class is 0 by default
                 offsets.append(fout.tell())
@@ -135,7 +135,7 @@ def docbyoffset(self, offset):
         tuple of (int, float)

         """
-        with utils.smart_open(self.fname) as f:
+        with utils.open(self.fname, 'rb') as f:
             f.seek(offset)
             return self.line2doc(f.readline())[0]
         # TODO: it breaks if it gets None from line2doc
6 changes: 3 additions & 3 deletions gensim/corpora/ucicorpus.py
@@ -39,7 +39,7 @@ def __init__(self, input):

         self.input = input

-        with utils.smart_open(self.input) as fin:
+        with utils.open(self.input, 'rb') as fin:
             self.num_docs = self.num_terms = self.num_nnz = 0
             try:
                 self.num_docs = int(next(fin).strip())
@@ -188,7 +188,7 @@ def __init__(self, fname, fname_vocab=None):
             fname_vocab = utils.smart_extension(fname, '.vocab')

         self.fname = fname
-        with utils.smart_open(fname_vocab) as fin:
+        with utils.open(fname_vocab, 'rb') as fin:
             words = [word.strip() for word in fin]
         self.id2word = dict(enumerate(words))

@@ -286,7 +286,7 @@ def save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False)
         # write out vocabulary
         fname_vocab = utils.smart_extension(fname, '.vocab')
         logger.info("saving vocabulary of %i words to %s", num_terms, fname_vocab)
-        with utils.smart_open(fname_vocab, 'wb') as fout:
+        with utils.open(fname_vocab, 'wb') as fout:
             for featureid in range(num_terms):
                 fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---')))

4 changes: 2 additions & 2 deletions gensim/matutils.py
@@ -1236,7 +1236,7 @@ def __init__(self, fname):
         self.fname = fname
         if fname.endswith(".gz") or fname.endswith('.bz2'):
             raise NotImplementedError("compressed output not supported with MmWriter")
-        self.fout = utils.smart_open(self.fname, 'wb+')  # open for both reading and writing
+        self.fout = utils.open(self.fname, 'wb+')  # open for both reading and writing
         self.headers_written = False

     def write_headers(self, num_docs, num_terms, num_nnz):
@@ -1574,7 +1574,7 @@ def docbyoffset(self, offset):
         if offset == -1:
             return []
         if isinstance(self.input, string_types):
-            fin, close_fin = utils.smart_open(self.input), True
+            fin, close_fin = utils.open(self.input, 'rb'), True
         else:
             fin, close_fin = self.input, False

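`MmWriter.__init__` is the one call site that keeps a `'wb+'` mode: the writer reserves header space up front, streams the matrix body, then seeks back to patch in the real statistics, which is also why compressed output is rejected above (a gzip/bz2 stream being written cannot seek). A rough sketch of that write-then-patch pattern, not the actual MmWriter code:

```python
header = b'%%MatrixMarket matrix coordinate real general\n'

with open('example.mm', 'wb+') as fout:  # hypothetical output path
    fout.write(header)
    fout.write(b' ' * 50 + b'\n')   # placeholder: num_docs num_terms num_nnz
    fout.write(b'1 1 0.5\n')        # ...stream the non-zero entries...
    fout.seek(len(header))
    fout.write(b'1 1 1')            # overwrite the placeholder with real counts
```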
25 changes: 13 additions & 12 deletions gensim/models/deprecated/doc2vec.py
@@ -965,7 +965,7 @@ def save_word2vec_format(self, fname, doctag_vec=False, word_vec=True, prefix='*
             KeyedVectors.save_word2vec_format(self.wv, fname, fvocab, binary, total_vec)
         # save document vectors
         if doctag_vec:
-            with utils.smart_open(fname, 'ab') as fout:
+            with utils.open(fname, 'ab') as fout:
                 if not word_vec:
                     total_vec = len(self.docvecs)
                 logger.info("storing %sx%s projection weights into %s", total_vec, self.vector_size, fname)
@@ -992,16 +992,17 @@ def __iter__(self):
             fname = os.path.join(self.dirname, fname)
             if not os.path.isfile(fname):
                 continue
-            for item_no, line in enumerate(utils.smart_open(fname)):
-                line = utils.to_unicode(line)
-                # each file line is a single document in the Brown corpus
-                # each token is WORD/POS_TAG
-                token_tags = [t.split('/') for t in line.split() if len(t.split('/')) == 2]
-                # ignore words with non-alphabetic tags like ",", "!" etc (punctuation, weird stuff)
-                words = ["%s/%s" % (token.lower(), tag[:2]) for token, tag in token_tags if tag[:2].isalpha()]
-                if not words:  # don't bother sending out empty documents
-                    continue
-                yield TaggedDocument(words, ['%s_SENT_%s' % (fname, item_no)])
+            with utils.open(fname, 'rb') as f:
+                for item_no, line in enumerate(f):
+                    line = utils.to_unicode(line)
+                    # each file line is a single document in the Brown corpus
+                    # each token is WORD/POS_TAG
+                    token_tags = [t.split('/') for t in line.split() if len(t.split('/')) == 2]
+                    # ignore words with non-alphabetic tags like ",", "!" etc (punctuation, weird stuff)
+                    words = ["%s/%s" % (token.lower(), tag[:2]) for token, tag in token_tags if tag[:2].isalpha()]
+                    if not words:  # don't bother sending out empty documents
+                        continue
+                    yield TaggedDocument(words, ['%s_SENT_%s' % (fname, item_no)])


 class TaggedLineDocument(object):
@@ -1036,6 +1037,6 @@ def __iter__(self):
                     yield TaggedDocument(utils.to_unicode(line).split(), [item_no])
         except AttributeError:
             # If it didn't work like a file, use it as a string filename
-            with utils.smart_open(self.source) as fin:
+            with utils.open(self.source, 'rb') as fin:
                 for item_no, line in enumerate(fin):
                     yield TaggedDocument(utils.to_unicode(line).split(), [item_no])
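`TaggedLineDocument.__iter__` keeps its duck-typing: `self.source` is first treated as an already-open stream, and only the `AttributeError` fallback path shown above opens it as a filename. A stripped-down sketch of that dispatch (assumed names, for illustration):

```python
def iter_lines(source):
    try:
        source.seek(0)                   # only file-like objects have .seek()
        for line in source:
            yield line
    except AttributeError:
        with open(source, 'rb') as fin:  # not a stream: treat it as a path
            for line in fin:
                yield line
```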