Fix smart_open deprecation warning globally #2530

Merged · 3 commits · Jul 7, 2019
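The change itself is mechanical: every call to the deprecated `utils.smart_open(fname)` becomes `utils.open(fname, mode)` with an explicit mode (`'rb'` wherever the old default was relied on). For context, here is a minimal sketch of the warning being silenced, assuming smart_open >= 1.8.1 (where the module-level `smart_open()` function was deprecated in favour of `smart_open.open()`); the exact warning text may vary by version:

```python
import warnings

import smart_open

# throwaway local file so both entry points have something to open
with open('example.txt', 'w') as f:
    f.write('hello\n')

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    smart_open.smart_open('example.txt', 'rb').close()  # deprecated entry point

print([str(w.message) for w in caught])  # expect a hint to call smart_open.open instead

smart_open.open('example.txt', 'rb').close()  # replacement entry point: no warning
```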
196 changes: 115 additions & 81 deletions gensim/corpora/_mmreader.c

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion gensim/corpora/_mmreader.pyx
@@ -188,7 +188,7 @@ cdef class MmReader(object):
         if offset == -1:
             return []
         if isinstance(self.input, string_types):
-            fin, close_fin = utils.smart_open(self.input), True
+            fin, close_fin = utils.open(self.input, 'rb'), True
         else:
             fin, close_fin = self.input, False

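The `fin`/`close_fin` pair in this hunk is the usual own-your-handle idiom: the reader closes the stream only when it opened the stream itself. A condensed sketch of the idiom (illustrative only, not the actual method body):

```python
def docbyoffset_sketch(input_, offset):
    """Return the raw line at `offset`, accepting either a path or an open file."""
    if isinstance(input_, str):
        fin, close_fin = open(input_, 'rb'), True   # we opened it, so we close it
    else:
        fin, close_fin = input_, False              # borrowed handle, caller closes
    try:
        fin.seek(offset)
        return fin.readline()
    finally:
        if close_fin:
            fin.close()
```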
10 changes: 5 additions & 5 deletions gensim/corpora/bleicorpus.py
@@ -74,7 +74,7 @@ def __init__(self, fname, fname_vocab=None):
             raise IOError('BleiCorpus: could not find vocabulary file')

         self.fname = fname
-        with utils.smart_open(fname_vocab) as fin:
+        with utils.open(fname_vocab, 'rb') as fin:
             words = [utils.to_unicode(word).rstrip() for word in fin]
         self.id2word = dict(enumerate(words))

@@ -88,7 +88,7 @@ def __iter__(self):

         """
         lineno = -1
-        with utils.smart_open(self.fname) as fin:
+        with utils.open(self.fname, 'rb') as fin:
             for lineno, line in enumerate(fin):
                 yield self.line2doc(line)
         self.length = lineno + 1
@@ -149,7 +149,7 @@ def save_corpus(fname, corpus, id2word=None, metadata=False):
             num_terms = 0

         logger.info("storing corpus in Blei's LDA-C format into %s", fname)
-        with utils.smart_open(fname, 'wb') as fout:
+        with utils.open(fname, 'wb') as fout:
             offsets = []
             for doc in corpus:
                 doc = list(doc)
@@ -160,7 +160,7 @@ def save_corpus(fname, corpus, id2word=None, metadata=False):
         # write out vocabulary, in a format compatible with Blei's topics.py script
         fname_vocab = utils.smart_extension(fname, '.vocab')
         logger.info("saving vocabulary of %i words to %s", num_terms, fname_vocab)
-        with utils.smart_open(fname_vocab, 'wb') as fout:
+        with utils.open(fname_vocab, 'wb') as fout:
             for featureid in range(num_terms):
                 fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---')))

@@ -181,6 +181,6 @@ def docbyoffset(self, offset):
             Document in BoW format.

         """
-        with utils.smart_open(self.fname) as f:
+        with utils.open(self.fname, 'rb') as f:
             f.seek(offset)
             return self.line2doc(f.readline())
27 changes: 15 additions & 12 deletions gensim/corpora/csvcorpus.py
@@ -45,7 +45,9 @@ def __init__(self, fname, labels):
         self.labels = labels

         # load the first few lines, to guess the CSV dialect
-        head = ''.join(itertools.islice(utils.smart_open(self.fname), 5))
+        with utils.open(self.fname, 'rb') as f:
+            head = ''.join(itertools.islice(f, 5))
+
         self.headers = csv.Sniffer().has_header(head)
         self.dialect = csv.Sniffer().sniff(head)
         logger.info("sniffed CSV delimiter=%r, headers=%s", self.dialect.delimiter, self.headers)
@@ -59,14 +61,15 @@ def __iter__(self):
             Document in BoW format.

         """
-        reader = csv.reader(utils.smart_open(self.fname), self.dialect)
-        if self.headers:
-            next(reader)  # skip the headers
-
-        line_no = -1
-        for line_no, line in enumerate(reader):
-            if self.labels:
-                line.pop(0)  # ignore the first column = class label
-            yield list(enumerate(float(x) for x in line))
-
-        self.length = line_no + 1  # store the total number of CSV rows = documents
+        with utils.open(self.fname, 'rb') as f:
+            reader = csv.reader(f, self.dialect)
+            if self.headers:
+                next(reader)  # skip the headers
+
+            line_no = -1
+            for line_no, line in enumerate(reader):
+                if self.labels:
+                    line.pop(0)  # ignore the first column = class label
+                yield list(enumerate(float(x) for x in line))
+
+            self.length = line_no + 1  # store the total number of CSV rows = documents
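Note that this hunk does more than rename the helper: the handle previously passed straight into `csv.reader` was never closed explicitly, while the new `with` block ties its lifetime to the generator. A minimal sketch of that behaviour, using a hypothetical file name (nothing here is CsvCorpus API):

```python
def iter_rows(fname):
    with open(fname, 'rb') as f:  # closed when the generator is exhausted or collected
        for line in f:
            yield line

rows = iter_rows('docs.csv')  # hypothetical path; the file opens on first next(), not here
```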
4 changes: 2 additions & 2 deletions gensim/corpora/dictionary.py
@@ -516,7 +516,7 @@ def save_as_text(self, fname, sort_by_word=True):

         """
         logger.info("saving dictionary mapping to %s", fname)
-        with utils.smart_open(fname, 'wb') as fout:
+        with utils.open(fname, 'wb') as fout:
             numdocs_line = "%d\n" % self.num_docs
             fout.write(utils.to_utf8(numdocs_line))
             if sort_by_word:
@@ -669,7 +669,7 @@ def load_from_text(fname):

         """
         result = Dictionary()
-        with utils.smart_open(fname) as f:
+        with utils.open(fname, 'rb') as f:
             for lineno, line in enumerate(f):
                 line = utils.to_unicode(line)
                 if lineno == 0:
2 changes: 1 addition & 1 deletion gensim/corpora/hashdictionary.py
@@ -341,7 +341,7 @@ def save_as_text(self, fname):

         """
         logger.info("saving %s mapping to %s" % (self, fname))
-        with utils.smart_open(fname, 'wb') as fout:
+        with utils.open(fname, 'wb') as fout:
             for tokenid in self.keys():
                 words = sorted(self[tokenid])
                 if words:
8 changes: 4 additions & 4 deletions gensim/corpora/lowcorpus.py
@@ -131,7 +131,7 @@ def _calculate_num_docs(self):

         """
         # the first line in input data is the number of documents (integer). throws exception on bad input.
-        with utils.smart_open(self.fname) as fin:
+        with utils.open(self.fname, 'rb') as fin:
             try:
                 result = int(next(fin))
             except StopIteration:
@@ -191,7 +191,7 @@ def __iter__(self):
             Document in BoW format.

         """
-        with utils.smart_open(self.fname) as fin:
+        with utils.open(self.fname, 'rb') as fin:
             for lineno, line in enumerate(fin):
                 if lineno > 0:  # ignore the first line = number of documents
                     yield self.line2doc(line)
@@ -231,7 +231,7 @@ def save_corpus(fname, corpus, id2word=None, metadata=False):
         logger.info("storing corpus in List-Of-Words format into %s" % fname)
         truncated = 0
         offsets = []
-        with utils.smart_open(fname, 'wb') as fout:
+        with utils.open(fname, 'wb') as fout:
             fout.write(utils.to_utf8('%i\n' % len(corpus)))
             for doc in corpus:
                 words = []
@@ -277,7 +277,7 @@ def docbyoffset(self, offset):
         [(0, 1), (3, 1), (4, 1)]

         """
-        with utils.smart_open(self.fname) as f:
+        with utils.open(self.fname, 'rb') as f:
             f.seek(offset)
             return self.line2doc(f.readline())

8 changes: 4 additions & 4 deletions gensim/corpora/malletcorpus.py
@@ -83,7 +83,7 @@ def _calculate_num_docs(self):
             Number of documents in file.

         """
-        with utils.smart_open(self.fname) as fin:
+        with utils.open(self.fname, 'rb') as fin:
             result = sum(1 for _ in fin)
         return result

@@ -96,7 +96,7 @@ def __iter__(self):
             Document in BoW format (+"document_id" and "lang" if metadata=True).

         """
-        with utils.smart_open(self.fname) as f:
+        with utils.open(self.fname, 'rb') as f:
             for line in f:
                 yield self.line2doc(line)

@@ -180,7 +180,7 @@ def save_corpus(fname, corpus, id2word=None, metadata=False):

         truncated = 0
         offsets = []
-        with utils.smart_open(fname, 'wb') as fout:
+        with utils.open(fname, 'wb') as fout:
             for doc_id, doc in enumerate(corpus):
                 if metadata:
                     doc_id, doc_lang = doc[1]
@@ -231,6 +231,6 @@ def docbyoffset(self, offset):
         [(4, 1)]

         """
-        with utils.smart_open(self.fname) as f:
+        with utils.open(self.fname, 'rb') as f:
             f.seek(offset)
             return self.line2doc(f.readline())
6 changes: 3 additions & 3 deletions gensim/corpora/svmlightcorpus.py
@@ -74,7 +74,7 @@ def __iter__(self):
         """
         lineno = -1
         self.labels = []
-        with utils.smart_open(self.fname) as fin:
+        with utils.open(self.fname, 'rb') as fin:
             for lineno, line in enumerate(fin):
                 doc = self.line2doc(line)
                 if doc is not None:
@@ -115,7 +115,7 @@ def save_corpus(fname, corpus, id2word=None, labels=False, metadata=False):
             # Cast any sequence (incl. a numpy array) to a list, to simplify the processing below.
             labels = list(labels)
         offsets = []
-        with utils.smart_open(fname, 'wb') as fout:
+        with utils.open(fname, 'wb') as fout:
             for docno, doc in enumerate(corpus):
                 label = labels[docno] if labels else 0  # target class is 0 by default
                 offsets.append(fout.tell())
@@ -135,7 +135,7 @@ def docbyoffset(self, offset):
         tuple of (int, float)

         """
-        with utils.smart_open(self.fname) as f:
+        with utils.open(self.fname, 'rb') as f:
             f.seek(offset)
             return self.line2doc(f.readline())[0]
         # TODO: it breaks if it gets None from line2doc
6 changes: 3 additions & 3 deletions gensim/corpora/ucicorpus.py
@@ -39,7 +39,7 @@ def __init__(self, input):

         self.input = input

-        with utils.smart_open(self.input) as fin:
+        with utils.open(self.input, 'rb') as fin:
             self.num_docs = self.num_terms = self.num_nnz = 0
             try:
                 self.num_docs = int(next(fin).strip())
@@ -188,7 +188,7 @@ def __init__(self, fname, fname_vocab=None):
             fname_vocab = utils.smart_extension(fname, '.vocab')

         self.fname = fname
-        with utils.smart_open(fname_vocab) as fin:
+        with utils.open(fname_vocab, 'rb') as fin:
             words = [word.strip() for word in fin]
         self.id2word = dict(enumerate(words))

@@ -286,7 +286,7 @@ def save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False)
         # write out vocabulary
         fname_vocab = utils.smart_extension(fname, '.vocab')
         logger.info("saving vocabulary of %i words to %s", num_terms, fname_vocab)
-        with utils.smart_open(fname_vocab, 'wb') as fout:
+        with utils.open(fname_vocab, 'wb') as fout:
             for featureid in range(num_terms):
                 fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---')))

4 changes: 2 additions & 2 deletions gensim/matutils.py
@@ -1236,7 +1236,7 @@ def __init__(self, fname):
         self.fname = fname
         if fname.endswith(".gz") or fname.endswith('.bz2'):
             raise NotImplementedError("compressed output not supported with MmWriter")
-        self.fout = utils.smart_open(self.fname, 'wb+')  # open for both reading and writing
+        self.fout = utils.open(self.fname, 'wb+')  # open for both reading and writing
         self.headers_written = False

     def write_headers(self, num_docs, num_terms, num_nnz):
@@ -1574,7 +1574,7 @@ def docbyoffset(self, offset):
         if offset == -1:
             return []
         if isinstance(self.input, string_types):
-            fin, close_fin = utils.smart_open(self.input), True
+            fin, close_fin = utils.open(self.input, 'rb'), True
         else:
             fin, close_fin = self.input, False

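`MmWriter.__init__` is the one call site that keeps a `'wb+'` mode: the writer reserves header space up front, streams the matrix body, then seeks back to patch in the real statistics, which is also why compressed output is rejected above (a gzip/bz2 stream being written cannot seek). A rough sketch of that write-then-patch pattern, not the actual MmWriter code:

```python
header = b'%%MatrixMarket matrix coordinate real general\n'

with open('example.mm', 'wb+') as fout:  # hypothetical output path
    fout.write(header)
    fout.write(b' ' * 50 + b'\n')   # placeholder: num_docs num_terms num_nnz
    fout.write(b'1 1 0.5\n')        # ...stream the non-zero entries...
    fout.seek(len(header))
    fout.write(b'1 1 1')            # overwrite the placeholder with real counts
```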
25 changes: 13 additions & 12 deletions gensim/models/deprecated/doc2vec.py
@@ -965,7 +965,7 @@ def save_word2vec_format(self, fname, doctag_vec=False, word_vec=True, prefix='*
             KeyedVectors.save_word2vec_format(self.wv, fname, fvocab, binary, total_vec)
         # save document vectors
         if doctag_vec:
-            with utils.smart_open(fname, 'ab') as fout:
+            with utils.open(fname, 'ab') as fout:
                 if not word_vec:
                     total_vec = len(self.docvecs)
                 logger.info("storing %sx%s projection weights into %s", total_vec, self.vector_size, fname)
@@ -992,16 +992,17 @@ def __iter__(self):
             fname = os.path.join(self.dirname, fname)
             if not os.path.isfile(fname):
                 continue
-            for item_no, line in enumerate(utils.smart_open(fname)):
-                line = utils.to_unicode(line)
-                # each file line is a single document in the Brown corpus
-                # each token is WORD/POS_TAG
-                token_tags = [t.split('/') for t in line.split() if len(t.split('/')) == 2]
-                # ignore words with non-alphabetic tags like ",", "!" etc (punctuation, weird stuff)
-                words = ["%s/%s" % (token.lower(), tag[:2]) for token, tag in token_tags if tag[:2].isalpha()]
-                if not words:  # don't bother sending out empty documents
-                    continue
-                yield TaggedDocument(words, ['%s_SENT_%s' % (fname, item_no)])
+            with utils.open(fname, 'rb') as f:
+                for item_no, line in enumerate(f):
+                    line = utils.to_unicode(line)
+                    # each file line is a single document in the Brown corpus
+                    # each token is WORD/POS_TAG
+                    token_tags = [t.split('/') for t in line.split() if len(t.split('/')) == 2]
+                    # ignore words with non-alphabetic tags like ",", "!" etc (punctuation, weird stuff)
+                    words = ["%s/%s" % (token.lower(), tag[:2]) for token, tag in token_tags if tag[:2].isalpha()]
+                    if not words:  # don't bother sending out empty documents
+                        continue
+                    yield TaggedDocument(words, ['%s_SENT_%s' % (fname, item_no)])


 class TaggedLineDocument(object):
@@ -1036,6 +1037,6 @@ def __iter__(self):
                     yield TaggedDocument(utils.to_unicode(line).split(), [item_no])
         except AttributeError:
             # If it didn't work like a file, use it as a string filename
-            with utils.smart_open(self.source) as fin:
+            with utils.open(self.source, 'rb') as fin:
                 for item_no, line in enumerate(fin):
                     yield TaggedDocument(utils.to_unicode(line).split(), [item_no])
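`TaggedLineDocument.__iter__` keeps its duck-typing: `self.source` is first treated as an already-open stream, and only the `AttributeError` fallback path shown above opens it as a filename. A stripped-down sketch of that dispatch (assumed names, for illustration):

```python
def iter_lines(source):
    try:
        source.seek(0)                   # only file-like objects have .seek()
        for line in source:
            yield line
    except AttributeError:
        with open(source, 'rb') as fin:  # not a stream: treat it as a path
            for line in fin:
                yield line
```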