From f4ccf0ea79b8f1f78e63e5028688d8d562f6bcab Mon Sep 17 00:00:00 2001 From: Itay Bittan Date: Fri, 14 Jun 2019 16:34:05 +0300 Subject: [PATCH 1/3] Fix smart_open deprecation warning globally --- gensim/corpora/_mmreader.c | 196 +++++++++++-------- gensim/corpora/_mmreader.pyx | 2 +- gensim/corpora/bleicorpus.py | 10 +- gensim/corpora/csvcorpus.py | 32 ++-- gensim/corpora/dictionary.py | 4 +- gensim/corpora/hashdictionary.py | 2 +- gensim/corpora/lowcorpus.py | 8 +- gensim/corpora/malletcorpus.py | 8 +- gensim/corpora/svmlightcorpus.py | 6 +- gensim/corpora/ucicorpus.py | 6 +- gensim/matutils.py | 4 +- gensim/models/deprecated/doc2vec.py | 25 +-- gensim/models/deprecated/keyedvectors.py | 138 +++++++------- gensim/models/deprecated/old_saveload.py | 7 +- gensim/models/deprecated/word2vec.py | 29 +-- gensim/models/doc2vec.py | 28 +-- gensim/models/fasttext.py | 8 +- gensim/models/hdpmodel.py | 2 +- gensim/models/keyedvectors.py | 221 +++++++++++----------- gensim/models/poincare.py | 9 +- gensim/models/utils_any2vec.py | 8 +- gensim/models/word2vec.py | 29 +-- gensim/models/wrappers/dtmmodel.py | 2 +- gensim/models/wrappers/ldamallet.py | 7 +- gensim/models/wrappers/ldavowpalwabbit.py | 14 +- gensim/models/wrappers/wordrank.py | 15 +- gensim/parsing/preprocessing.py | 2 +- gensim/scripts/glove2word2vec.py | 10 +- gensim/scripts/segment_wiki.py | 28 +-- gensim/scripts/word2vec2tensor.py | 4 +- gensim/similarities/index.py | 6 +- gensim/test/test_doc2vec.py | 6 +- gensim/test/test_fasttext.py | 6 +- gensim/test/test_keywords.py | 14 +- gensim/test/test_lee.py | 8 +- gensim/test/test_scripts.py | 11 +- gensim/test/test_similarities.py | 4 +- gensim/test/test_summarization.py | 4 +- gensim/test/test_translation_matrix.py | 2 +- gensim/test/test_utils.py | 8 +- gensim/test/test_word2vec.py | 28 +-- gensim/utils.py | 10 +- 42 files changed, 505 insertions(+), 466 deletions(-) diff --git a/gensim/corpora/_mmreader.c b/gensim/corpora/_mmreader.c index 03734e0f32..d50cae7242 100644 --- a/gensim/corpora/_mmreader.c +++ b/gensim/corpora/_mmreader.c @@ -1444,6 +1444,7 @@ static PyObject *__pyx_builtin_ValueError; static PyObject *__pyx_builtin_StopIteration; static PyObject *__pyx_builtin_enumerate; static const char __pyx_k_[] = "%"; +static const char __pyx_k_rb[] = "rb"; static const char __pyx_k_new[] = "__new__"; static const char __pyx_k_six[] = "six"; static const char __pyx_k_args[] = "args"; @@ -1453,6 +1454,7 @@ static const char __pyx_k_info[] = "info"; static const char __pyx_k_iter[] = "__iter__"; static const char __pyx_k_main[] = "__main__"; static const char __pyx_k_name[] = "__name__"; +static const char __pyx_k_open[] = "open"; static const char __pyx_k_seek[] = "seek"; static const char __pyx_k_send[] = "send"; static const char __pyx_k_test[] = "__test__"; @@ -1486,7 +1488,6 @@ static const char __pyx_k_reduce_ex[] = "__reduce_ex__"; static const char __pyx_k_six_moves[] = "six.moves"; static const char __pyx_k_ValueError[] = "ValueError"; static const char __pyx_k_pyx_result[] = "__pyx_result"; -static const char __pyx_k_smart_open[] = "smart_open"; static const char __pyx_k_startswith[] = "startswith"; static const char __pyx_k_to_unicode[] = "to_unicode"; static const char __pyx_k_transposed[] = "transposed"; @@ -1553,6 +1554,7 @@ static PyObject *__pyx_kp_s_matrix_columns_must_come_in_asce; static PyObject *__pyx_kp_s_matrixmarket_matrix_coordinate; static PyObject *__pyx_n_s_name; static PyObject *__pyx_n_s_new; +static PyObject *__pyx_n_s_open; static PyObject 
*__pyx_n_s_open_file; static PyObject *__pyx_n_s_pickle; static PyObject *__pyx_n_s_pyx_PickleError; @@ -1562,6 +1564,7 @@ static PyObject *__pyx_n_s_pyx_state; static PyObject *__pyx_n_s_pyx_type; static PyObject *__pyx_n_s_pyx_unpickle_MmReader; static PyObject *__pyx_n_s_range; +static PyObject *__pyx_n_s_rb; static PyObject *__pyx_n_s_reduce; static PyObject *__pyx_n_s_reduce_cython; static PyObject *__pyx_n_s_reduce_ex; @@ -1572,7 +1575,6 @@ static PyObject *__pyx_n_s_setstate_cython; static PyObject *__pyx_n_s_six; static PyObject *__pyx_n_s_six_moves; static PyObject *__pyx_n_s_skip_headers; -static PyObject *__pyx_n_s_smart_open; static PyObject *__pyx_n_s_split; static PyObject *__pyx_n_s_startswith; static PyObject *__pyx_n_s_string_types; @@ -4128,13 +4130,14 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st PyObject *__pyx_t_3 = NULL; int __pyx_t_4; PyObject *__pyx_t_5 = NULL; - PY_LONG_LONG __pyx_t_6; - Py_ssize_t __pyx_t_7; - PyObject *(*__pyx_t_8)(PyObject *); - char const *__pyx_t_9; - PyObject *__pyx_t_10 = NULL; - PY_LONG_LONG __pyx_t_11; - int __pyx_t_12; + int __pyx_t_6; + PyObject *__pyx_t_7 = NULL; + PY_LONG_LONG __pyx_t_8; + Py_ssize_t __pyx_t_9; + PyObject *(*__pyx_t_10)(PyObject *); + char const *__pyx_t_11; + PY_LONG_LONG __pyx_t_12; + int __pyx_t_13; __Pyx_RefNannySetupContext("docbyoffset", 0); /* "gensim/corpora/_mmreader.pyx":188 @@ -4155,7 +4158,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st * if offset == -1: * return [] # <<<<<<<<<<<<<< * if isinstance(self.input, string_types): - * fin, close_fin = utils.smart_open(self.input), True + * fin, close_fin = utils.open(self.input, 'rb'), True */ __Pyx_XDECREF(__pyx_r); __pyx_t_1 = PyList_New(0); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 189, __pyx_L1_error) @@ -4177,7 +4180,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st * if offset == -1: * return [] * if isinstance(self.input, string_types): # <<<<<<<<<<<<<< - * fin, close_fin = utils.smart_open(self.input), True + * fin, close_fin = utils.open(self.input, 'rb'), True * else: */ __pyx_t_1 = __pyx_v_self->input; @@ -4193,16 +4196,17 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st /* "gensim/corpora/_mmreader.pyx":191 * return [] * if isinstance(self.input, string_types): - * fin, close_fin = utils.smart_open(self.input), True # <<<<<<<<<<<<<< + * fin, close_fin = utils.open(self.input, 'rb'), True # <<<<<<<<<<<<<< * else: * fin, close_fin = self.input, False */ __Pyx_GetModuleGlobalName(__pyx_t_1, __pyx_n_s_utils); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 191, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_1); - __pyx_t_5 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_smart_open); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 191, __pyx_L1_error) + __pyx_t_5 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_open); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 191, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_5); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __pyx_t_1 = NULL; + __pyx_t_6 = 0; if (CYTHON_UNPACK_METHODS && unlikely(PyMethod_Check(__pyx_t_5))) { __pyx_t_1 = PyMethod_GET_SELF(__pyx_t_5); if (likely(__pyx_t_1)) { @@ -4210,12 +4214,41 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st __Pyx_INCREF(__pyx_t_1); __Pyx_INCREF(function); __Pyx_DECREF_SET(__pyx_t_5, function); + __pyx_t_6 = 1; } } - __pyx_t_3 = (__pyx_t_1) ? 
__Pyx_PyObject_Call2Args(__pyx_t_5, __pyx_t_1, __pyx_v_self->input) : __Pyx_PyObject_CallOneArg(__pyx_t_5, __pyx_v_self->input); - __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0; - if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 191, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_3); + #if CYTHON_FAST_PYCALL + if (PyFunction_Check(__pyx_t_5)) { + PyObject *__pyx_temp[3] = {__pyx_t_1, __pyx_v_self->input, __pyx_n_s_rb}; + __pyx_t_3 = __Pyx_PyFunction_FastCall(__pyx_t_5, __pyx_temp+1-__pyx_t_6, 2+__pyx_t_6); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 191, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0; + __Pyx_GOTREF(__pyx_t_3); + } else + #endif + #if CYTHON_FAST_PYCCALL + if (__Pyx_PyFastCFunction_Check(__pyx_t_5)) { + PyObject *__pyx_temp[3] = {__pyx_t_1, __pyx_v_self->input, __pyx_n_s_rb}; + __pyx_t_3 = __Pyx_PyCFunction_FastCall(__pyx_t_5, __pyx_temp+1-__pyx_t_6, 2+__pyx_t_6); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 191, __pyx_L1_error) + __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0; + __Pyx_GOTREF(__pyx_t_3); + } else + #endif + { + __pyx_t_7 = PyTuple_New(2+__pyx_t_6); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 191, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + if (__pyx_t_1) { + __Pyx_GIVEREF(__pyx_t_1); PyTuple_SET_ITEM(__pyx_t_7, 0, __pyx_t_1); __pyx_t_1 = NULL; + } + __Pyx_INCREF(__pyx_v_self->input); + __Pyx_GIVEREF(__pyx_v_self->input); + PyTuple_SET_ITEM(__pyx_t_7, 0+__pyx_t_6, __pyx_v_self->input); + __Pyx_INCREF(__pyx_n_s_rb); + __Pyx_GIVEREF(__pyx_n_s_rb); + PyTuple_SET_ITEM(__pyx_t_7, 1+__pyx_t_6, __pyx_n_s_rb); + __pyx_t_3 = __Pyx_PyObject_Call(__pyx_t_5, __pyx_t_7, NULL); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 191, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_3); + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + } __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; __pyx_t_4 = 1; __pyx_v_fin = __pyx_t_3; @@ -4226,14 +4259,14 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st * if offset == -1: * return [] * if isinstance(self.input, string_types): # <<<<<<<<<<<<<< - * fin, close_fin = utils.smart_open(self.input), True + * fin, close_fin = utils.open(self.input, 'rb'), True * else: */ goto __pyx_L4; } /* "gensim/corpora/_mmreader.pyx":193 - * fin, close_fin = utils.smart_open(self.input), True + * fin, close_fin = utils.open(self.input, 'rb'), True * else: * fin, close_fin = self.input, False # <<<<<<<<<<<<<< * @@ -4258,18 +4291,18 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st */ __pyx_t_5 = __Pyx_PyObject_GetAttrStr(__pyx_v_fin, __pyx_n_s_seek); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 195, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_5); - __pyx_t_1 = NULL; + __pyx_t_7 = NULL; if (CYTHON_UNPACK_METHODS && likely(PyMethod_Check(__pyx_t_5))) { - __pyx_t_1 = PyMethod_GET_SELF(__pyx_t_5); - if (likely(__pyx_t_1)) { + __pyx_t_7 = PyMethod_GET_SELF(__pyx_t_5); + if (likely(__pyx_t_7)) { PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_5); - __Pyx_INCREF(__pyx_t_1); + __Pyx_INCREF(__pyx_t_7); __Pyx_INCREF(function); __Pyx_DECREF_SET(__pyx_t_5, function); } } - __pyx_t_3 = (__pyx_t_1) ? __Pyx_PyObject_Call2Args(__pyx_t_5, __pyx_t_1, __pyx_v_offset) : __Pyx_PyObject_CallOneArg(__pyx_t_5, __pyx_v_offset); - __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_t_3 = (__pyx_t_7) ? 
__Pyx_PyObject_Call2Args(__pyx_t_5, __pyx_t_7, __pyx_v_offset) : __Pyx_PyObject_CallOneArg(__pyx_t_5, __pyx_v_offset); + __Pyx_XDECREF(__pyx_t_7); __pyx_t_7 = 0; if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 195, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; @@ -4282,10 +4315,10 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st * for line in fin: * if (sscanf(line, "%lld %lld %lg", &docid, &termid, &val) != 3): */ - __pyx_t_6 = -1LL; + __pyx_t_8 = -1LL; __pyx_t_3 = PyList_New(0); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 196, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); - __pyx_v_previd = __pyx_t_6; + __pyx_v_previd = __pyx_t_8; __pyx_v_document = ((PyObject*)__pyx_t_3); __pyx_t_3 = 0; @@ -4297,34 +4330,34 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st * raise ValueError("unable to parse line: {}".format(line)) */ if (likely(PyList_CheckExact(__pyx_v_fin)) || PyTuple_CheckExact(__pyx_v_fin)) { - __pyx_t_3 = __pyx_v_fin; __Pyx_INCREF(__pyx_t_3); __pyx_t_7 = 0; - __pyx_t_8 = NULL; + __pyx_t_3 = __pyx_v_fin; __Pyx_INCREF(__pyx_t_3); __pyx_t_9 = 0; + __pyx_t_10 = NULL; } else { - __pyx_t_7 = -1; __pyx_t_3 = PyObject_GetIter(__pyx_v_fin); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 197, __pyx_L1_error) + __pyx_t_9 = -1; __pyx_t_3 = PyObject_GetIter(__pyx_v_fin); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 197, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); - __pyx_t_8 = Py_TYPE(__pyx_t_3)->tp_iternext; if (unlikely(!__pyx_t_8)) __PYX_ERR(0, 197, __pyx_L1_error) + __pyx_t_10 = Py_TYPE(__pyx_t_3)->tp_iternext; if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 197, __pyx_L1_error) } for (;;) { - if (likely(!__pyx_t_8)) { + if (likely(!__pyx_t_10)) { if (likely(PyList_CheckExact(__pyx_t_3))) { - if (__pyx_t_7 >= PyList_GET_SIZE(__pyx_t_3)) break; + if (__pyx_t_9 >= PyList_GET_SIZE(__pyx_t_3)) break; #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS - __pyx_t_5 = PyList_GET_ITEM(__pyx_t_3, __pyx_t_7); __Pyx_INCREF(__pyx_t_5); __pyx_t_7++; if (unlikely(0 < 0)) __PYX_ERR(0, 197, __pyx_L1_error) + __pyx_t_5 = PyList_GET_ITEM(__pyx_t_3, __pyx_t_9); __Pyx_INCREF(__pyx_t_5); __pyx_t_9++; if (unlikely(0 < 0)) __PYX_ERR(0, 197, __pyx_L1_error) #else - __pyx_t_5 = PySequence_ITEM(__pyx_t_3, __pyx_t_7); __pyx_t_7++; if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 197, __pyx_L1_error) + __pyx_t_5 = PySequence_ITEM(__pyx_t_3, __pyx_t_9); __pyx_t_9++; if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 197, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_5); #endif } else { - if (__pyx_t_7 >= PyTuple_GET_SIZE(__pyx_t_3)) break; + if (__pyx_t_9 >= PyTuple_GET_SIZE(__pyx_t_3)) break; #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS - __pyx_t_5 = PyTuple_GET_ITEM(__pyx_t_3, __pyx_t_7); __Pyx_INCREF(__pyx_t_5); __pyx_t_7++; if (unlikely(0 < 0)) __PYX_ERR(0, 197, __pyx_L1_error) + __pyx_t_5 = PyTuple_GET_ITEM(__pyx_t_3, __pyx_t_9); __Pyx_INCREF(__pyx_t_5); __pyx_t_9++; if (unlikely(0 < 0)) __PYX_ERR(0, 197, __pyx_L1_error) #else - __pyx_t_5 = PySequence_ITEM(__pyx_t_3, __pyx_t_7); __pyx_t_7++; if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 197, __pyx_L1_error) + __pyx_t_5 = PySequence_ITEM(__pyx_t_3, __pyx_t_9); __pyx_t_9++; if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 197, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_5); #endif } } else { - __pyx_t_5 = __pyx_t_8(__pyx_t_3); + __pyx_t_5 = __pyx_t_10(__pyx_t_3); if (unlikely(!__pyx_t_5)) { PyObject* exc_type = PyErr_Occurred(); if (exc_type) { @@ -4345,8 +4378,8 @@ static PyObject 
*__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st * raise ValueError("unable to parse line: {}".format(line)) * */ - __pyx_t_9 = __Pyx_PyObject_AsString(__pyx_v_line); if (unlikely((!__pyx_t_9) && PyErr_Occurred())) __PYX_ERR(0, 198, __pyx_L1_error) - __pyx_t_4 = ((sscanf(__pyx_t_9, ((char const *)"%lld %lld %lg"), (&__pyx_v_docid), (&__pyx_v_termid), (&__pyx_v_val)) != 3) != 0); + __pyx_t_11 = __Pyx_PyObject_AsString(__pyx_v_line); if (unlikely((!__pyx_t_11) && PyErr_Occurred())) __PYX_ERR(0, 198, __pyx_L1_error) + __pyx_t_4 = ((sscanf(__pyx_t_11, ((char const *)"%lld %lld %lg"), (&__pyx_v_docid), (&__pyx_v_termid), (&__pyx_v_val)) != 3) != 0); if (unlikely(__pyx_t_4)) { /* "gensim/corpora/_mmreader.pyx":199 @@ -4356,28 +4389,28 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st * * if not self.transposed: */ - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_kp_s_unable_to_parse_line, __pyx_n_s_format); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 199, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_1); - __pyx_t_10 = NULL; - if (CYTHON_UNPACK_METHODS && likely(PyMethod_Check(__pyx_t_1))) { - __pyx_t_10 = PyMethod_GET_SELF(__pyx_t_1); - if (likely(__pyx_t_10)) { - PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_1); - __Pyx_INCREF(__pyx_t_10); + __pyx_t_7 = __Pyx_PyObject_GetAttrStr(__pyx_kp_s_unable_to_parse_line, __pyx_n_s_format); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 199, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); + __pyx_t_1 = NULL; + if (CYTHON_UNPACK_METHODS && likely(PyMethod_Check(__pyx_t_7))) { + __pyx_t_1 = PyMethod_GET_SELF(__pyx_t_7); + if (likely(__pyx_t_1)) { + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_7); + __Pyx_INCREF(__pyx_t_1); __Pyx_INCREF(function); - __Pyx_DECREF_SET(__pyx_t_1, function); + __Pyx_DECREF_SET(__pyx_t_7, function); } } - __pyx_t_5 = (__pyx_t_10) ? __Pyx_PyObject_Call2Args(__pyx_t_1, __pyx_t_10, __pyx_v_line) : __Pyx_PyObject_CallOneArg(__pyx_t_1, __pyx_v_line); - __Pyx_XDECREF(__pyx_t_10); __pyx_t_10 = 0; + __pyx_t_5 = (__pyx_t_1) ? 
__Pyx_PyObject_Call2Args(__pyx_t_7, __pyx_t_1, __pyx_v_line) : __Pyx_PyObject_CallOneArg(__pyx_t_7, __pyx_v_line); + __Pyx_XDECREF(__pyx_t_1); __pyx_t_1 = 0; if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 199, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_5); - __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - __pyx_t_1 = __Pyx_PyObject_CallOneArg(__pyx_builtin_ValueError, __pyx_t_5); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 199, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + __pyx_t_7 = __Pyx_PyObject_CallOneArg(__pyx_builtin_ValueError, __pyx_t_5); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 199, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0; - __Pyx_Raise(__pyx_t_1, 0, 0, 0); - __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __Pyx_Raise(__pyx_t_7, 0, 0, 0); + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; __PYX_ERR(0, 199, __pyx_L1_error) /* "gensim/corpora/_mmreader.pyx":198 @@ -4406,10 +4439,10 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st * * # -1 because matrix market indexes are 1-based => convert to 0-based */ - __pyx_t_6 = __pyx_v_docid; - __pyx_t_11 = __pyx_v_termid; - __pyx_v_termid = __pyx_t_6; - __pyx_v_docid = __pyx_t_11; + __pyx_t_8 = __pyx_v_docid; + __pyx_t_12 = __pyx_v_termid; + __pyx_v_termid = __pyx_t_8; + __pyx_v_docid = __pyx_t_12; /* "gensim/corpora/_mmreader.pyx":201 * raise ValueError("unable to parse line: {}".format(line)) @@ -4517,20 +4550,20 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st * * if close_fin: */ - __pyx_t_1 = __Pyx_PyInt_From_PY_LONG_LONG(__pyx_v_termid); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 214, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_1); + __pyx_t_7 = __Pyx_PyInt_From_PY_LONG_LONG(__pyx_v_termid); if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 214, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_7); __pyx_t_5 = PyFloat_FromDouble(__pyx_v_val); if (unlikely(!__pyx_t_5)) __PYX_ERR(0, 214, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_5); - __pyx_t_10 = PyTuple_New(2); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 214, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_10); - __Pyx_GIVEREF(__pyx_t_1); - PyTuple_SET_ITEM(__pyx_t_10, 0, __pyx_t_1); + __pyx_t_1 = PyTuple_New(2); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 214, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); + __Pyx_GIVEREF(__pyx_t_7); + PyTuple_SET_ITEM(__pyx_t_1, 0, __pyx_t_7); __Pyx_GIVEREF(__pyx_t_5); - PyTuple_SET_ITEM(__pyx_t_10, 1, __pyx_t_5); - __pyx_t_1 = 0; + PyTuple_SET_ITEM(__pyx_t_1, 1, __pyx_t_5); + __pyx_t_7 = 0; __pyx_t_5 = 0; - __pyx_t_12 = __Pyx_PyList_Append(__pyx_v_document, __pyx_t_10); if (unlikely(__pyx_t_12 == ((int)-1))) __PYX_ERR(0, 214, __pyx_L1_error) - __Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0; + __pyx_t_13 = __Pyx_PyList_Append(__pyx_v_document, __pyx_t_1); if (unlikely(__pyx_t_13 == ((int)-1))) __PYX_ERR(0, 214, __pyx_L1_error) + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; /* "gensim/corpora/_mmreader.pyx":197 * fin.seek(offset) # works for gzip/bz2 input, too @@ -4559,23 +4592,23 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st * fin.close() # <<<<<<<<<<<<<< * return document */ - __pyx_t_10 = __Pyx_PyObject_GetAttrStr(__pyx_v_fin, __pyx_n_s_close); if (unlikely(!__pyx_t_10)) __PYX_ERR(0, 217, __pyx_L1_error) - __Pyx_GOTREF(__pyx_t_10); + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_fin, __pyx_n_s_close); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 217, __pyx_L1_error) + __Pyx_GOTREF(__pyx_t_1); __pyx_t_5 = NULL; - if (CYTHON_UNPACK_METHODS && 
likely(PyMethod_Check(__pyx_t_10))) { - __pyx_t_5 = PyMethod_GET_SELF(__pyx_t_10); + if (CYTHON_UNPACK_METHODS && likely(PyMethod_Check(__pyx_t_1))) { + __pyx_t_5 = PyMethod_GET_SELF(__pyx_t_1); if (likely(__pyx_t_5)) { - PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_10); + PyObject* function = PyMethod_GET_FUNCTION(__pyx_t_1); __Pyx_INCREF(__pyx_t_5); __Pyx_INCREF(function); - __Pyx_DECREF_SET(__pyx_t_10, function); + __Pyx_DECREF_SET(__pyx_t_1, function); } } - __pyx_t_3 = (__pyx_t_5) ? __Pyx_PyObject_CallOneArg(__pyx_t_10, __pyx_t_5) : __Pyx_PyObject_CallNoArg(__pyx_t_10); + __pyx_t_3 = (__pyx_t_5) ? __Pyx_PyObject_CallOneArg(__pyx_t_1, __pyx_t_5) : __Pyx_PyObject_CallNoArg(__pyx_t_1); __Pyx_XDECREF(__pyx_t_5); __pyx_t_5 = 0; if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 217, __pyx_L1_error) __Pyx_GOTREF(__pyx_t_3); - __Pyx_DECREF(__pyx_t_10); __pyx_t_10 = 0; + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; /* "gensim/corpora/_mmreader.pyx":216 @@ -4610,7 +4643,7 @@ static PyObject *__pyx_pf_6gensim_7corpora_9_mmreader_8MmReader_11docbyoffset(st __Pyx_XDECREF(__pyx_t_1); __Pyx_XDECREF(__pyx_t_3); __Pyx_XDECREF(__pyx_t_5); - __Pyx_XDECREF(__pyx_t_10); + __Pyx_XDECREF(__pyx_t_7); __Pyx_AddTraceback("gensim.corpora._mmreader.MmReader.docbyoffset", __pyx_clineno, __pyx_lineno, __pyx_filename); __pyx_r = NULL; __pyx_L0:; @@ -6374,6 +6407,7 @@ static __Pyx_StringTabEntry __pyx_string_tab[] = { {&__pyx_kp_s_matrixmarket_matrix_coordinate, __pyx_k_matrixmarket_matrix_coordinate, sizeof(__pyx_k_matrixmarket_matrix_coordinate), 0, 0, 1, 0}, {&__pyx_n_s_name, __pyx_k_name, sizeof(__pyx_k_name), 0, 0, 1, 1}, {&__pyx_n_s_new, __pyx_k_new, sizeof(__pyx_k_new), 0, 0, 1, 1}, + {&__pyx_n_s_open, __pyx_k_open, sizeof(__pyx_k_open), 0, 0, 1, 1}, {&__pyx_n_s_open_file, __pyx_k_open_file, sizeof(__pyx_k_open_file), 0, 0, 1, 1}, {&__pyx_n_s_pickle, __pyx_k_pickle, sizeof(__pyx_k_pickle), 0, 0, 1, 1}, {&__pyx_n_s_pyx_PickleError, __pyx_k_pyx_PickleError, sizeof(__pyx_k_pyx_PickleError), 0, 0, 1, 1}, @@ -6383,6 +6417,7 @@ static __Pyx_StringTabEntry __pyx_string_tab[] = { {&__pyx_n_s_pyx_type, __pyx_k_pyx_type, sizeof(__pyx_k_pyx_type), 0, 0, 1, 1}, {&__pyx_n_s_pyx_unpickle_MmReader, __pyx_k_pyx_unpickle_MmReader, sizeof(__pyx_k_pyx_unpickle_MmReader), 0, 0, 1, 1}, {&__pyx_n_s_range, __pyx_k_range, sizeof(__pyx_k_range), 0, 0, 1, 1}, + {&__pyx_n_s_rb, __pyx_k_rb, sizeof(__pyx_k_rb), 0, 0, 1, 1}, {&__pyx_n_s_reduce, __pyx_k_reduce, sizeof(__pyx_k_reduce), 0, 0, 1, 1}, {&__pyx_n_s_reduce_cython, __pyx_k_reduce_cython, sizeof(__pyx_k_reduce_cython), 0, 0, 1, 1}, {&__pyx_n_s_reduce_ex, __pyx_k_reduce_ex, sizeof(__pyx_k_reduce_ex), 0, 0, 1, 1}, @@ -6393,7 +6428,6 @@ static __Pyx_StringTabEntry __pyx_string_tab[] = { {&__pyx_n_s_six, __pyx_k_six, sizeof(__pyx_k_six), 0, 0, 1, 1}, {&__pyx_n_s_six_moves, __pyx_k_six_moves, sizeof(__pyx_k_six_moves), 0, 0, 1, 1}, {&__pyx_n_s_skip_headers, __pyx_k_skip_headers, sizeof(__pyx_k_skip_headers), 0, 0, 1, 1}, - {&__pyx_n_s_smart_open, __pyx_k_smart_open, sizeof(__pyx_k_smart_open), 0, 0, 1, 1}, {&__pyx_n_s_split, __pyx_k_split, sizeof(__pyx_k_split), 0, 0, 1, 1}, {&__pyx_n_s_startswith, __pyx_k_startswith, sizeof(__pyx_k_startswith), 0, 0, 1, 1}, {&__pyx_n_s_string_types, __pyx_k_string_types, sizeof(__pyx_k_string_types), 0, 0, 1, 1}, diff --git a/gensim/corpora/_mmreader.pyx b/gensim/corpora/_mmreader.pyx index 36cf11a1b9..1fff966760 100644 --- a/gensim/corpora/_mmreader.pyx +++ b/gensim/corpora/_mmreader.pyx @@ -188,7 +188,7 @@ cdef class 
MmReader(object): if offset == -1: return [] if isinstance(self.input, string_types): - fin, close_fin = utils.smart_open(self.input), True + fin, close_fin = utils.open(self.input, 'rb'), True else: fin, close_fin = self.input, False diff --git a/gensim/corpora/bleicorpus.py b/gensim/corpora/bleicorpus.py index 701831b1b1..1afde870d2 100644 --- a/gensim/corpora/bleicorpus.py +++ b/gensim/corpora/bleicorpus.py @@ -74,7 +74,7 @@ def __init__(self, fname, fname_vocab=None): raise IOError('BleiCorpus: could not find vocabulary file') self.fname = fname - with utils.smart_open(fname_vocab) as fin: + with utils.open(fname_vocab, 'rb') as fin: words = [utils.to_unicode(word).rstrip() for word in fin] self.id2word = dict(enumerate(words)) @@ -88,7 +88,7 @@ def __iter__(self): """ lineno = -1 - with utils.smart_open(self.fname) as fin: + with utils.open(self.fname, 'rb') as fin: for lineno, line in enumerate(fin): yield self.line2doc(line) self.length = lineno + 1 @@ -149,7 +149,7 @@ def save_corpus(fname, corpus, id2word=None, metadata=False): num_terms = 0 logger.info("storing corpus in Blei's LDA-C format into %s", fname) - with utils.smart_open(fname, 'wb') as fout: + with utils.open(fname, 'wb') as fout: offsets = [] for doc in corpus: doc = list(doc) @@ -160,7 +160,7 @@ def save_corpus(fname, corpus, id2word=None, metadata=False): # write out vocabulary, in a format compatible with Blei's topics.py script fname_vocab = utils.smart_extension(fname, '.vocab') logger.info("saving vocabulary of %i words to %s", num_terms, fname_vocab) - with utils.smart_open(fname_vocab, 'wb') as fout: + with utils.open(fname_vocab, 'wb') as fout: for featureid in range(num_terms): fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---'))) @@ -181,6 +181,6 @@ def docbyoffset(self, offset): Document in BoW format. """ - with utils.smart_open(self.fname) as f: + with utils.open(self.fname, 'rb') as f: f.seek(offset) return self.line2doc(f.readline()) diff --git a/gensim/corpora/csvcorpus.py b/gensim/corpora/csvcorpus.py index 16a88a93e9..59fbbe16f2 100644 --- a/gensim/corpora/csvcorpus.py +++ b/gensim/corpora/csvcorpus.py @@ -45,10 +45,11 @@ def __init__(self, fname, labels): self.labels = labels # load the first few lines, to guess the CSV dialect - head = ''.join(itertools.islice(utils.smart_open(self.fname), 5)) - self.headers = csv.Sniffer().has_header(head) - self.dialect = csv.Sniffer().sniff(head) - logger.info("sniffed CSV delimiter=%r, headers=%s", self.dialect.delimiter, self.headers) + with utils.open(self.fname, 'rb') as f: + head = ''.join(itertools.islice(f, 5)) + self.headers = csv.Sniffer().has_header(head) + self.dialect = csv.Sniffer().sniff(head) + logger.info("sniffed CSV delimiter=%r, headers=%s", self.dialect.delimiter, self.headers) def __iter__(self): """Iterate over the corpus, returning one BoW vector at a time. @@ -59,14 +60,15 @@ def __iter__(self): Document in BoW format. 
""" - reader = csv.reader(utils.smart_open(self.fname), self.dialect) - if self.headers: - next(reader) # skip the headers - - line_no = -1 - for line_no, line in enumerate(reader): - if self.labels: - line.pop(0) # ignore the first column = class label - yield list(enumerate(float(x) for x in line)) - - self.length = line_no + 1 # store the total number of CSV rows = documents + with utils.open(self.fname, 'rb') as f: + reader = csv.reader(f, self.dialect) + if self.headers: + next(reader) # skip the headers + + line_no = -1 + for line_no, line in enumerate(reader): + if self.labels: + line.pop(0) # ignore the first column = class label + yield list(enumerate(float(x) for x in line)) + + self.length = line_no + 1 # store the total number of CSV rows = documents diff --git a/gensim/corpora/dictionary.py b/gensim/corpora/dictionary.py index 21df726f3d..561d61babb 100644 --- a/gensim/corpora/dictionary.py +++ b/gensim/corpora/dictionary.py @@ -516,7 +516,7 @@ def save_as_text(self, fname, sort_by_word=True): """ logger.info("saving dictionary mapping to %s", fname) - with utils.smart_open(fname, 'wb') as fout: + with utils.open(fname, 'wb') as fout: numdocs_line = "%d\n" % self.num_docs fout.write(utils.to_utf8(numdocs_line)) if sort_by_word: @@ -669,7 +669,7 @@ def load_from_text(fname): """ result = Dictionary() - with utils.smart_open(fname) as f: + with utils.open(fname, 'rb') as f: for lineno, line in enumerate(f): line = utils.to_unicode(line) if lineno == 0: diff --git a/gensim/corpora/hashdictionary.py b/gensim/corpora/hashdictionary.py index 87c76c590c..cb3f4053ea 100644 --- a/gensim/corpora/hashdictionary.py +++ b/gensim/corpora/hashdictionary.py @@ -341,7 +341,7 @@ def save_as_text(self, fname): """ logger.info("saving %s mapping to %s" % (self, fname)) - with utils.smart_open(fname, 'wb') as fout: + with utils.open(fname, 'wb') as fout: for tokenid in self.keys(): words = sorted(self[tokenid]) if words: diff --git a/gensim/corpora/lowcorpus.py b/gensim/corpora/lowcorpus.py index c67c34b700..d52f190187 100644 --- a/gensim/corpora/lowcorpus.py +++ b/gensim/corpora/lowcorpus.py @@ -131,7 +131,7 @@ def _calculate_num_docs(self): """ # the first line in input data is the number of documents (integer). throws exception on bad input. - with utils.smart_open(self.fname) as fin: + with utils.open(self.fname, 'rb') as fin: try: result = int(next(fin)) except StopIteration: @@ -191,7 +191,7 @@ def __iter__(self): Document in BoW format. """ - with utils.smart_open(self.fname) as fin: + with utils.open(self.fname, 'rb') as fin: for lineno, line in enumerate(fin): if lineno > 0: # ignore the first line = number of documents yield self.line2doc(line) @@ -231,7 +231,7 @@ def save_corpus(fname, corpus, id2word=None, metadata=False): logger.info("storing corpus in List-Of-Words format into %s" % fname) truncated = 0 offsets = [] - with utils.smart_open(fname, 'wb') as fout: + with utils.open(fname, 'wb') as fout: fout.write(utils.to_utf8('%i\n' % len(corpus))) for doc in corpus: words = [] @@ -277,7 +277,7 @@ def docbyoffset(self, offset): [(0, 1), (3, 1), (4, 1)] """ - with utils.smart_open(self.fname) as f: + with utils.open(self.fname, 'rb') as f: f.seek(offset) return self.line2doc(f.readline()) diff --git a/gensim/corpora/malletcorpus.py b/gensim/corpora/malletcorpus.py index c9a82fffbb..2b83a90bb1 100644 --- a/gensim/corpora/malletcorpus.py +++ b/gensim/corpora/malletcorpus.py @@ -83,7 +83,7 @@ def _calculate_num_docs(self): Number of documents in file. 
""" - with utils.smart_open(self.fname) as fin: + with utils.open(self.fname, 'rb') as fin: result = sum(1 for _ in fin) return result @@ -96,7 +96,7 @@ def __iter__(self): Document in BoW format (+"document_id" and "lang" if metadata=True). """ - with utils.smart_open(self.fname) as f: + with utils.open(self.fname, 'rb') as f: for line in f: yield self.line2doc(line) @@ -180,7 +180,7 @@ def save_corpus(fname, corpus, id2word=None, metadata=False): truncated = 0 offsets = [] - with utils.smart_open(fname, 'wb') as fout: + with utils.open(fname, 'wb') as fout: for doc_id, doc in enumerate(corpus): if metadata: doc_id, doc_lang = doc[1] @@ -231,6 +231,6 @@ def docbyoffset(self, offset): [(4, 1)] """ - with utils.smart_open(self.fname) as f: + with utils.open(self.fname, 'rb') as f: f.seek(offset) return self.line2doc(f.readline()) diff --git a/gensim/corpora/svmlightcorpus.py b/gensim/corpora/svmlightcorpus.py index 153bd973e0..5f0b049b07 100644 --- a/gensim/corpora/svmlightcorpus.py +++ b/gensim/corpora/svmlightcorpus.py @@ -74,7 +74,7 @@ def __iter__(self): """ lineno = -1 self.labels = [] - with utils.smart_open(self.fname) as fin: + with utils.open(self.fname, 'rb') as fin: for lineno, line in enumerate(fin): doc = self.line2doc(line) if doc is not None: @@ -115,7 +115,7 @@ def save_corpus(fname, corpus, id2word=None, labels=False, metadata=False): # Cast any sequence (incl. a numpy array) to a list, to simplify the processing below. labels = list(labels) offsets = [] - with utils.smart_open(fname, 'wb') as fout: + with utils.open(fname, 'wb') as fout: for docno, doc in enumerate(corpus): label = labels[docno] if labels else 0 # target class is 0 by default offsets.append(fout.tell()) @@ -135,7 +135,7 @@ def docbyoffset(self, offset): tuple of (int, float) """ - with utils.smart_open(self.fname) as f: + with utils.open(self.fname, 'rb') as f: f.seek(offset) return self.line2doc(f.readline())[0] # TODO: it brakes if gets None from line2doc diff --git a/gensim/corpora/ucicorpus.py b/gensim/corpora/ucicorpus.py index 9831c7bba3..6f5f2f85f3 100644 --- a/gensim/corpora/ucicorpus.py +++ b/gensim/corpora/ucicorpus.py @@ -39,7 +39,7 @@ def __init__(self, input): self.input = input - with utils.smart_open(self.input) as fin: + with utils.open(self.input, 'rb') as fin: self.num_docs = self.num_terms = self.num_nnz = 0 try: self.num_docs = int(next(fin).strip()) @@ -188,7 +188,7 @@ def __init__(self, fname, fname_vocab=None): fname_vocab = utils.smart_extension(fname, '.vocab') self.fname = fname - with utils.smart_open(fname_vocab) as fin: + with utils.open(fname_vocab, 'rb') as fin: words = [word.strip() for word in fin] self.id2word = dict(enumerate(words)) @@ -286,7 +286,7 @@ def save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False) # write out vocabulary fname_vocab = utils.smart_extension(fname, '.vocab') logger.info("saving vocabulary of %i words to %s", num_terms, fname_vocab) - with utils.smart_open(fname_vocab, 'wb') as fout: + with utils.open(fname_vocab, 'wb') as fout: for featureid in range(num_terms): fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---'))) diff --git a/gensim/matutils.py b/gensim/matutils.py index f4ce4a13d8..9f5368db58 100644 --- a/gensim/matutils.py +++ b/gensim/matutils.py @@ -1236,7 +1236,7 @@ def __init__(self, fname): self.fname = fname if fname.endswith(".gz") or fname.endswith('.bz2'): raise NotImplementedError("compressed output not supported with MmWriter") - self.fout = utils.smart_open(self.fname, 'wb+') # open for both 
reading and writing + self.fout = utils.open(self.fname, 'wb+') # open for both reading and writing self.headers_written = False def write_headers(self, num_docs, num_terms, num_nnz): @@ -1574,7 +1574,7 @@ def docbyoffset(self, offset): if offset == -1: return [] if isinstance(self.input, string_types): - fin, close_fin = utils.smart_open(self.input), True + fin, close_fin = utils.open(self.input, 'rb'), True else: fin, close_fin = self.input, False diff --git a/gensim/models/deprecated/doc2vec.py b/gensim/models/deprecated/doc2vec.py index 76e4a7e2d4..9378b77d88 100644 --- a/gensim/models/deprecated/doc2vec.py +++ b/gensim/models/deprecated/doc2vec.py @@ -965,7 +965,7 @@ def save_word2vec_format(self, fname, doctag_vec=False, word_vec=True, prefix='* KeyedVectors.save_word2vec_format(self.wv, fname, fvocab, binary, total_vec) # save document vectors if doctag_vec: - with utils.smart_open(fname, 'ab') as fout: + with utils.open(fname, 'ab') as fout: if not word_vec: total_vec = len(self.docvecs) logger.info("storing %sx%s projection weights into %s", total_vec, self.vector_size, fname) @@ -992,16 +992,17 @@ def __iter__(self): fname = os.path.join(self.dirname, fname) if not os.path.isfile(fname): continue - for item_no, line in enumerate(utils.smart_open(fname)): - line = utils.to_unicode(line) - # each file line is a single document in the Brown corpus - # each token is WORD/POS_TAG - token_tags = [t.split('/') for t in line.split() if len(t.split('/')) == 2] - # ignore words with non-alphabetic tags like ",", "!" etc (punctuation, weird stuff) - words = ["%s/%s" % (token.lower(), tag[:2]) for token, tag in token_tags if tag[:2].isalpha()] - if not words: # don't bother sending out empty documents - continue - yield TaggedDocument(words, ['%s_SENT_%s' % (fname, item_no)]) + with utils.open(fname, 'rb') as f: + for item_no, line in enumerate(f): + line = utils.to_unicode(line) + # each file line is a single document in the Brown corpus + # each token is WORD/POS_TAG + token_tags = [t.split('/') for t in line.split() if len(t.split('/')) == 2] + # ignore words with non-alphabetic tags like ",", "!" 
etc (punctuation, weird stuff) + words = ["%s/%s" % (token.lower(), tag[:2]) for token, tag in token_tags if tag[:2].isalpha()] + if not words: # don't bother sending out empty documents + continue + yield TaggedDocument(words, ['%s_SENT_%s' % (fname, item_no)]) class TaggedLineDocument(object): @@ -1036,6 +1037,6 @@ def __iter__(self): yield TaggedDocument(utils.to_unicode(line).split(), [item_no]) except AttributeError: # If it didn't work like a file, use it as a string filename - with utils.smart_open(self.source) as fin: + with utils.open(self.source, 'rb') as fin: for item_no, line in enumerate(fin): yield TaggedDocument(utils.to_unicode(line).split(), [item_no]) diff --git a/gensim/models/deprecated/keyedvectors.py b/gensim/models/deprecated/keyedvectors.py index 5ead121e48..a8983909d0 100644 --- a/gensim/models/deprecated/keyedvectors.py +++ b/gensim/models/deprecated/keyedvectors.py @@ -154,12 +154,12 @@ def save_word2vec_format(self, fname, fvocab=None, binary=False, total_vec=None) vector_size = self.syn0.shape[1] if fvocab is not None: logger.info("storing vocabulary in %s", fvocab) - with utils.smart_open(fvocab, 'wb') as vout: + with utils.open(fvocab, 'wb') as vout: for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count): vout.write(utils.to_utf8("%s %s\n" % (word, vocab.count))) logger.info("storing %sx%s projection weights into %s", total_vec, vector_size, fname) assert (len(self.vocab), vector_size) == self.syn0.shape - with utils.smart_open(fname, 'wb') as fout: + with utils.open(fname, 'wb') as fout: fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size))) # store in sorted order: most frequent words at the top for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count): @@ -204,13 +204,13 @@ def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8', if fvocab is not None: logger.info("loading word counts from %s", fvocab) counts = {} - with utils.smart_open(fvocab) as fin: + with utils.open(fvocab, 'rb') as fin: for line in fin: word, count = utils.to_unicode(line).strip().split() counts[word] = int(count) logger.info("loading projection weights from %s", fname) - with utils.smart_open(fname) as fin: + with utils.open(fname, 'rb') as fin: header = utils.to_unicode(fin.readline(), encoding=encoding) vocab_size, vector_size = (int(x) for x in header.split()) # throws for invalid file format if limit: @@ -934,47 +934,48 @@ def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, c ok_vocab = {w.upper(): v for w, v in reversed(ok_vocab)} if case_insensitive else dict(ok_vocab) sections, section = [], None - for line_no, line in enumerate(utils.smart_open(questions)): - # TODO: use level3 BLAS (=evaluate multiple questions at once), for speed - line = utils.to_unicode(line) - if line.startswith(': '): - # a new section starts => store the old section - if section: - sections.append(section) - self.log_accuracy(section) - section = {'section': line.lstrip(': ').strip(), 'correct': [], 'incorrect': []} - else: - if not section: - raise ValueError("missing section header before line #%i in %s" % (line_no, questions)) - try: - if case_insensitive: - a, b, c, expected = [word.upper() for word in line.split()] - else: - a, b, c, expected = [word for word in line.split()] - except ValueError: - logger.info("skipping invalid line #%i in %s", line_no, questions) - continue - if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab: - 
logger.debug("skipping line #%i with OOV words: %s", line_no, line.strip()) - continue - - original_vocab = self.vocab - self.vocab = ok_vocab - ignore = {a, b, c} # input words to be ignored - predicted = None - # find the most likely prediction, ignoring OOV words and input words - sims = most_similar(self, positive=[b, c], negative=[a], topn=False, restrict_vocab=restrict_vocab) - self.vocab = original_vocab - for index in matutils.argsort(sims, reverse=True): - predicted = self.index2word[index].upper() if case_insensitive else self.index2word[index] - if predicted in ok_vocab and predicted not in ignore: - if predicted != expected: - logger.debug("%s: expected %s, predicted %s", line.strip(), expected, predicted) - break - if predicted == expected: - section['correct'].append((a, b, c, expected)) + with utils.open(questions, 'rb') as f: + for line_no, line in enumerate(f): + # TODO: use level3 BLAS (=evaluate multiple questions at once), for speed + line = utils.to_unicode(line) + if line.startswith(': '): + # a new section starts => store the old section + if section: + sections.append(section) + self.log_accuracy(section) + section = {'section': line.lstrip(': ').strip(), 'correct': [], 'incorrect': []} else: - section['incorrect'].append((a, b, c, expected)) + if not section: + raise ValueError("missing section header before line #%i in %s" % (line_no, questions)) + try: + if case_insensitive: + a, b, c, expected = [word.upper() for word in line.split()] + else: + a, b, c, expected = [word for word in line.split()] + except ValueError: + logger.info("skipping invalid line #%i in %s", line_no, questions) + continue + if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab: + logger.debug("skipping line #%i with OOV words: %s", line_no, line.strip()) + continue + + original_vocab = self.vocab + self.vocab = ok_vocab + ignore = {a, b, c} # input words to be ignored + predicted = None + # find the most likely prediction, ignoring OOV words and input words + sims = most_similar(self, positive=[b, c], negative=[a], topn=False, restrict_vocab=restrict_vocab) + self.vocab = original_vocab + for index in matutils.argsort(sims, reverse=True): + predicted = self.index2word[index].upper() if case_insensitive else self.index2word[index] + if predicted in ok_vocab and predicted not in ignore: + if predicted != expected: + logger.debug("%s: expected %s, predicted %s", line.strip(), expected, predicted) + break + if predicted == expected: + section['correct'].append((a, b, c, expected)) + else: + section['incorrect'].append((a, b, c, expected)) if section: # store the last section, too sections.append(section) @@ -1030,32 +1031,33 @@ def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, original_vocab = self.vocab self.vocab = ok_vocab - for line_no, line in enumerate(utils.smart_open(pairs)): - line = utils.to_unicode(line) - if line.startswith('#'): - # May be a comment - continue - else: - try: - if case_insensitive: - a, b, sim = [word.upper() for word in line.split(delimiter)] - else: - a, b, sim = [word for word in line.split(delimiter)] - sim = float(sim) - except (ValueError, TypeError): - logger.info('skipping invalid line #%d in %s', line_no, pairs) + with utils.open(pairs, 'rb') as f: + for line_no, line in enumerate(f): + line = utils.to_unicode(line) + if line.startswith('#'): + # May be a comment continue - if a not in ok_vocab or b not in ok_vocab: - oov += 1 - if dummy4unknown: - similarity_model.append(0.0) - 
similarity_gold.append(sim) - continue - else: - logger.debug('skipping line #%d with OOV words: %s', line_no, line.strip()) + else: + try: + if case_insensitive: + a, b, sim = [word.upper() for word in line.split(delimiter)] + else: + a, b, sim = [word for word in line.split(delimiter)] + sim = float(sim) + except (ValueError, TypeError): + logger.info('skipping invalid line #%d in %s', line_no, pairs) continue - similarity_gold.append(sim) # Similarity from the dataset - similarity_model.append(self.similarity(a, b)) # Similarity from the model + if a not in ok_vocab or b not in ok_vocab: + oov += 1 + if dummy4unknown: + similarity_model.append(0.0) + similarity_gold.append(sim) + continue + else: + logger.debug('skipping line #%d with OOV words: %s', line_no, line.strip()) + continue + similarity_gold.append(sim) # Similarity from the dataset + similarity_model.append(self.similarity(a, b)) # Similarity from the model self.vocab = original_vocab spearman = stats.spearmanr(similarity_gold, similarity_model) pearson = stats.pearsonr(similarity_gold, similarity_model) diff --git a/gensim/models/deprecated/old_saveload.py b/gensim/models/deprecated/old_saveload.py index c609dd5532..750d83ed44 100644 --- a/gensim/models/deprecated/old_saveload.py +++ b/gensim/models/deprecated/old_saveload.py @@ -31,7 +31,7 @@ from six import iteritems -from smart_open import smart_open +from gensim import utils if sys.version_info[0] >= 3: unicode = str @@ -367,8 +367,7 @@ def unpickle(fname): Python object loaded from `fname`. """ - with smart_open(fname, 'rb') as f: - # Because of loading from S3 load can't be used (missing readline in smart_open) + with utils.open(fname, 'rb') as f: file_bytes = f.read() file_bytes = file_bytes.replace(b'gensim.models.word2vec', b'gensim.models.deprecated.word2vec') file_bytes = file_bytes.replace(b'gensim.models.keyedvectors', b'gensim.models.deprecated.keyedvectors') @@ -395,5 +394,5 @@ def pickle(obj, fname, protocol=2): Pickle protocol number, default is 2 to support compatible across python 2.x and 3.x. """ - with smart_open(fname, 'wb') as fout: # 'b' for binary, needed on Windows + with utils.open(fname, 'wb') as fout: # 'b' for binary, needed on Windows _pickle.dump(obj, fout, protocol=protocol) diff --git a/gensim/models/deprecated/word2vec.py b/gensim/models/deprecated/word2vec.py index b8b04d4c10..f7c90b6981 100644 --- a/gensim/models/deprecated/word2vec.py +++ b/gensim/models/deprecated/word2vec.py @@ -1413,7 +1413,7 @@ def intersect_word2vec_format(self, fname, lockf=0.0, binary=False, encoding='ut """ overlap_count = 0 logger.info("loading projection weights from %s", fname) - with utils.smart_open(fname) as fin: + with utils.open(fname, 'rb') as fin: header = utils.to_unicode(fin.readline(), encoding=encoding) vocab_size, vector_size = (int(x) for x in header.split()) # throws for invalid file format if not vector_size == self.vector_size: @@ -1699,16 +1699,17 @@ def __iter__(self): fname = os.path.join(self.dirname, fname) if not os.path.isfile(fname): continue - for line in utils.smart_open(fname): - line = utils.to_unicode(line) - # each file line is a single sentence in the Brown corpus - # each token is WORD/POS_TAG - token_tags = [t.split('/') for t in line.split() if len(t.split('/')) == 2] - # ignore words with non-alphabetic tags like ",", "!" 
etc (punctuation, weird stuff) - words = ["%s/%s" % (token.lower(), tag[:2]) for token, tag in token_tags if tag[:2].isalpha()] - if not words: # don't bother sending out empty sentences - continue - yield words + with utils.open(fname, 'rb') as fin: + for line in fin: + line = utils.to_unicode(line) + # each file line is a single sentence in the Brown corpus + # each token is WORD/POS_TAG + token_tags = [t.split('/') for t in line.split() if len(t.split('/')) == 2] + # ignore words with non-alphabetic tags like ",", "!" etc (punctuation, weird stuff) + words = ["%s/%s" % (token.lower(), tag[:2]) for token, tag in token_tags if tag[:2].isalpha()] + if not words: # don't bother sending out empty sentences + continue + yield words class Text8Corpus(object): @@ -1722,7 +1723,7 @@ def __iter__(self): # the entire corpus is one gigantic line -- there are no sentence marks at all # so just split the sequence of tokens arbitrarily: 1 sentence = 1000 tokens sentence, rest = [], b'' - with utils.smart_open(self.fname) as fin: + with utils.open(self.fname, 'rb') as fin: while True: text = rest + fin.read(8192) # avoid loading the entire file (=1 line) into RAM if text == rest: # EOF @@ -1778,7 +1779,7 @@ def __iter__(self): i += self.max_sentence_length except AttributeError: # If it didn't work like a file, use it as a string filename - with utils.smart_open(self.source) as fin: + with utils.open(self.source, 'rb') as fin: for line in itertools.islice(fin, self.limit): line = utils.to_unicode(line).split() i = 0 @@ -1833,7 +1834,7 @@ def __iter__(self): """iterate through the files""" for file_name in self.input_files: logger.info('reading file %s', file_name) - with utils.smart_open(file_name) as fin: + with utils.open(file_name, 'rb') as fin: for line in itertools.islice(fin, self.limit): line = utils.to_unicode(line).split() i = 0 diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index c5743f320a..44ad35ad9a 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -77,7 +77,7 @@ memmap as np_memmap, vstack, integer, dtype, sum as np_sum, add as np_add, repeat as np_repeat, concatenate -from gensim.utils import call_on_class_only +from gensim.utils import call_on_class_only, deprecated from gensim import utils, matutils # utility fnc for pickling, common scipy operations etc from gensim.models.word2vec import Word2VecKeyedVectors, Word2VecVocab, Word2VecTrainables, train_cbow_pair,\ train_sg_pair, train_batch_sg @@ -86,7 +86,6 @@ from gensim.models.base_any2vec import BaseWordEmbeddingsModel from gensim.models.keyedvectors import Doc2VecKeyedVectors from types import GeneratorType -from gensim.utils import deprecated, smart_open logger = logging.getLogger(__name__) @@ -838,7 +837,7 @@ def _get_offsets_and_start_doctags_for_corpusfile(cls, corpus_file, workers): offsets = [] start_doctags = [] - with smart_open(corpus_file, mode='rb') as fin: + with utils.open(corpus_file, mode='rb') as fin: curr_offset_idx = 0 prev_filepos = 0 @@ -1505,16 +1504,17 @@ def __iter__(self): fname = os.path.join(self.dirname, fname) if not os.path.isfile(fname): continue - for item_no, line in enumerate(utils.smart_open(fname)): - line = utils.to_unicode(line) - # each file line is a single document in the Brown corpus - # each token is WORD/POS_TAG - token_tags = [t.split('/') for t in line.split() if len(t.split('/')) == 2] - # ignore words with non-alphabetic tags like ",", "!" 
etc (punctuation, weird stuff) - words = ["%s/%s" % (token.lower(), tag[:2]) for token, tag in token_tags if tag[:2].isalpha()] - if not words: # don't bother sending out empty documents - continue - yield TaggedDocument(words, ['%s_SENT_%s' % (fname, item_no)]) + with utils.open(fname, 'rb') as fin: + for item_no, line in enumerate(fin): + line = utils.to_unicode(line) + # each file line is a single document in the Brown corpus + # each token is WORD/POS_TAG + token_tags = [t.split('/') for t in line.split() if len(t.split('/')) == 2] + # ignore words with non-alphabetic tags like ",", "!" etc (punctuation, weird stuff) + words = ["%s/%s" % (token.lower(), tag[:2]) for token, tag in token_tags if tag[:2].isalpha()] + if not words: # don't bother sending out empty documents + continue + yield TaggedDocument(words, ['%s_SENT_%s' % (fname, item_no)]) class TaggedLineDocument(object): @@ -1562,6 +1562,6 @@ def __iter__(self): yield TaggedDocument(utils.to_unicode(line).split(), [item_no]) except AttributeError: # If it didn't work like a file, use it as a string filename - with utils.smart_open(self.source) as fin: + with utils.open(self.source, 'rb') as fin: for item_no, line in enumerate(fin): yield TaggedDocument(utils.to_unicode(line).split(), [item_no]) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index d7690d65e0..b24dbac2a7 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -105,13 +105,13 @@ .. sourcecode:: pycon >>> from gensim.utils import tokenize - >>> import smart_open + >>> from gensim import utils >>> >>> >>> class MyIter(object): ... def __iter__(self): ... path = datapath('crime-and-punishment.txt') - ... with smart_open.smart_open(path, 'r', encoding='utf-8') as fin: + ... with utils.open(path, 'r', encoding='utf-8') as fin: ... for line in fin: ... yield list(tokenize(line)) >>> @@ -293,8 +293,8 @@ from gensim.models.keyedvectors import FastTextKeyedVectors from gensim.models.base_any2vec import BaseWordEmbeddingsModel from gensim.models.utils_any2vec import ft_ngram_hashes -from smart_open import smart_open +from gensim import utils from gensim.utils import deprecated, call_on_class_only logger = logging.getLogger(__name__) @@ -1326,7 +1326,7 @@ def _load_fasttext_format(model_file, encoding='utf-8', full_model=True): The loaded model. 
""" - with smart_open(model_file, 'rb') as fin: + with utils.open(model_file, 'rb') as fin: m = gensim.models._fasttext_bin.load(fin, encoding=encoding, full_model=full_model) model = FastText( diff --git a/gensim/models/hdpmodel.py b/gensim/models/hdpmodel.py index dde0126209..e0ce5de3a5 100755 --- a/gensim/models/hdpmodel.py +++ b/gensim/models/hdpmodel.py @@ -863,7 +863,7 @@ def save_options(self): logger.error("cannot store options without having specified an output directory") return fname = '%s/options.dat' % self.outputdir - with utils.smart_open(fname, 'wb') as fout: + with utils.open(fname, 'wb') as fout: fout.write('tau: %s\n' % str(self.m_tau - 1)) fout.write('chunksize: %s\n' % str(self.chunksize)) fout.write('var_converge: %s\n' % str(self.m_var_converge)) diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index 27cc21a8d1..b534f6a784 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -1062,52 +1062,53 @@ def evaluate_word_analogies(self, analogies, restrict_vocab=300000, case_insensi logger.info("Evaluating word analogies for top %i words in the model on %s", restrict_vocab, analogies) sections, section = [], None quadruplets_no = 0 - for line_no, line in enumerate(utils.smart_open(analogies)): - line = utils.to_unicode(line) - if line.startswith(': '): - # a new section starts => store the old section - if section: - sections.append(section) - self._log_evaluate_word_analogies(section) - section = {'section': line.lstrip(': ').strip(), 'correct': [], 'incorrect': []} - else: - if not section: - raise ValueError("Missing section header before line #%i in %s" % (line_no, analogies)) - try: - if case_insensitive: - a, b, c, expected = [word.upper() for word in line.split()] + with utils.open(analogies, 'rb') as fin: + for line_no, line in enumerate(fin): + line = utils.to_unicode(line) + if line.startswith(': '): + # a new section starts => store the old section + if section: + sections.append(section) + self._log_evaluate_word_analogies(section) + section = {'section': line.lstrip(': ').strip(), 'correct': [], 'incorrect': []} + else: + if not section: + raise ValueError("Missing section header before line #%i in %s" % (line_no, analogies)) + try: + if case_insensitive: + a, b, c, expected = [word.upper() for word in line.split()] + else: + a, b, c, expected = [word for word in line.split()] + except ValueError: + logger.info("Skipping invalid line #%i in %s", line_no, analogies) + continue + quadruplets_no += 1 + if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab: + oov += 1 + if dummy4unknown: + logger.debug('Zero accuracy for line #%d with OOV words: %s', line_no, line.strip()) + section['incorrect'].append((a, b, c, expected)) + else: + logger.debug("Skipping line #%i with OOV words: %s", line_no, line.strip()) + continue + original_vocab = self.vocab + self.vocab = ok_vocab + ignore = {a, b, c} # input words to be ignored + predicted = None + # find the most likely prediction using 3CosAdd (vector offset) method + # TODO: implement 3CosMul and set-based methods for solving analogies + sims = self.most_similar(positive=[b, c], negative=[a], topn=5, restrict_vocab=restrict_vocab) + self.vocab = original_vocab + for element in sims: + predicted = element[0].upper() if case_insensitive else element[0] + if predicted in ok_vocab and predicted not in ignore: + if predicted != expected: + logger.debug("%s: expected %s, predicted %s", line.strip(), expected, predicted) + break + if 
predicted == expected: + section['correct'].append((a, b, c, expected)) else: - a, b, c, expected = [word for word in line.split()] - except ValueError: - logger.info("Skipping invalid line #%i in %s", line_no, analogies) - continue - quadruplets_no += 1 - if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab: - oov += 1 - if dummy4unknown: - logger.debug('Zero accuracy for line #%d with OOV words: %s', line_no, line.strip()) section['incorrect'].append((a, b, c, expected)) - else: - logger.debug("Skipping line #%i with OOV words: %s", line_no, line.strip()) - continue - original_vocab = self.vocab - self.vocab = ok_vocab - ignore = {a, b, c} # input words to be ignored - predicted = None - # find the most likely prediction using 3CosAdd (vector offset) method - # TODO: implement 3CosMul and set-based methods for solving analogies - sims = self.most_similar(positive=[b, c], negative=[a], topn=5, restrict_vocab=restrict_vocab) - self.vocab = original_vocab - for element in sims: - predicted = element[0].upper() if case_insensitive else element[0] - if predicted in ok_vocab and predicted not in ignore: - if predicted != expected: - logger.debug("%s: expected %s, predicted %s", line.strip(), expected, predicted) - break - if predicted == expected: - section['correct'].append((a, b, c, expected)) - else: - section['incorrect'].append((a, b, c, expected)) if section: # store the last section, too sections.append(section) @@ -1174,46 +1175,47 @@ def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, c ok_vocab = {w.upper(): v for w, v in reversed(ok_vocab)} if case_insensitive else dict(ok_vocab) sections, section = [], None - for line_no, line in enumerate(utils.smart_open(questions)): - # TODO: use level3 BLAS (=evaluate multiple questions at once), for speed - line = utils.to_unicode(line) - if line.startswith(': '): - # a new section starts => store the old section - if section: - sections.append(section) - self.log_accuracy(section) - section = {'section': line.lstrip(': ').strip(), 'correct': [], 'incorrect': []} - else: - if not section: - raise ValueError("Missing section header before line #%i in %s" % (line_no, questions)) - try: - if case_insensitive: - a, b, c, expected = [word.upper() for word in line.split()] - else: - a, b, c, expected = [word for word in line.split()] - except ValueError: - logger.info("Skipping invalid line #%i in %s", line_no, questions) - continue - if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab: - logger.debug("Skipping line #%i with OOV words: %s", line_no, line.strip()) - continue - original_vocab = self.vocab - self.vocab = ok_vocab - ignore = {a, b, c} # input words to be ignored - predicted = None - # find the most likely prediction, ignoring OOV words and input words - sims = most_similar(self, positive=[b, c], negative=[a], topn=None, restrict_vocab=restrict_vocab) - self.vocab = original_vocab - for index in matutils.argsort(sims, reverse=True): - predicted = self.index2word[index].upper() if case_insensitive else self.index2word[index] - if predicted in ok_vocab and predicted not in ignore: - if predicted != expected: - logger.debug("%s: expected %s, predicted %s", line.strip(), expected, predicted) - break - if predicted == expected: - section['correct'].append((a, b, c, expected)) + with utils.open(questions, 'rb') as fin: + for line_no, line in enumerate(fin): + # TODO: use level3 BLAS (=evaluate multiple questions at once), for speed + line = 
utils.to_unicode(line) + if line.startswith(': '): + # a new section starts => store the old section + if section: + sections.append(section) + self.log_accuracy(section) + section = {'section': line.lstrip(': ').strip(), 'correct': [], 'incorrect': []} else: - section['incorrect'].append((a, b, c, expected)) + if not section: + raise ValueError("Missing section header before line #%i in %s" % (line_no, questions)) + try: + if case_insensitive: + a, b, c, expected = [word.upper() for word in line.split()] + else: + a, b, c, expected = [word for word in line.split()] + except ValueError: + logger.info("Skipping invalid line #%i in %s", line_no, questions) + continue + if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab: + logger.debug("Skipping line #%i with OOV words: %s", line_no, line.strip()) + continue + original_vocab = self.vocab + self.vocab = ok_vocab + ignore = {a, b, c} # input words to be ignored + predicted = None + # find the most likely prediction, ignoring OOV words and input words + sims = most_similar(self, positive=[b, c], negative=[a], topn=None, restrict_vocab=restrict_vocab) + self.vocab = original_vocab + for index in matutils.argsort(sims, reverse=True): + predicted = self.index2word[index].upper() if case_insensitive else self.index2word[index] + if predicted in ok_vocab and predicted not in ignore: + if predicted != expected: + logger.debug("%s: expected %s, predicted %s", line.strip(), expected, predicted) + break + if predicted == expected: + section['correct'].append((a, b, c, expected)) + else: + section['incorrect'].append((a, b, c, expected)) if section: # store the last section, too sections.append(section) @@ -1285,33 +1287,34 @@ def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, original_vocab = self.vocab self.vocab = ok_vocab - for line_no, line in enumerate(utils.smart_open(pairs)): - line = utils.to_unicode(line) - if line.startswith('#'): - # May be a comment - continue - else: - try: - if case_insensitive: - a, b, sim = [word.upper() for word in line.split(delimiter)] - else: - a, b, sim = [word for word in line.split(delimiter)] - sim = float(sim) - except (ValueError, TypeError): - logger.info('Skipping invalid line #%d in %s', line_no, pairs) + with utils.open(pairs, 'rb') as fin: + for line_no, line in enumerate(fin): + line = utils.to_unicode(line) + if line.startswith('#'): + # May be a comment continue - if a not in ok_vocab or b not in ok_vocab: - oov += 1 - if dummy4unknown: - logger.debug('Zero similarity for line #%d with OOV words: %s', line_no, line.strip()) - similarity_model.append(0.0) - similarity_gold.append(sim) - continue - else: - logger.debug('Skipping line #%d with OOV words: %s', line_no, line.strip()) + else: + try: + if case_insensitive: + a, b, sim = [word.upper() for word in line.split(delimiter)] + else: + a, b, sim = [word for word in line.split(delimiter)] + sim = float(sim) + except (ValueError, TypeError): + logger.info('Skipping invalid line #%d in %s', line_no, pairs) continue - similarity_gold.append(sim) # Similarity from the dataset - similarity_model.append(self.similarity(a, b)) # Similarity from the model + if a not in ok_vocab or b not in ok_vocab: + oov += 1 + if dummy4unknown: + logger.debug('Zero similarity for line #%d with OOV words: %s', line_no, line.strip()) + similarity_model.append(0.0) + similarity_gold.append(sim) + continue + else: + logger.debug('Skipping line #%d with OOV words: %s', line_no, line.strip()) + continue + 
similarity_gold.append(sim) # Similarity from the dataset + similarity_model.append(self.similarity(a, b)) # Similarity from the model self.vocab = original_vocab spearman = stats.spearmanr(similarity_gold, similarity_model) pearson = stats.pearsonr(similarity_gold, similarity_model) @@ -1888,7 +1891,7 @@ def save_word2vec_format(self, fname, prefix='*dt_', fvocab=None, """ total_vec = total_vec or len(self) - with utils.smart_open(fname, 'ab') as fout: + with utils.open(fname, 'ab') as fout: if write_first_line: logger.info("storing %sx%s projection weights into %s", total_vec, self.vectors_docs.shape[1], fname) fout.write(utils.to_utf8("%s %s\n" % (total_vec, self.vectors_docs.shape[1]))) diff --git a/gensim/models/poincare.py b/gensim/models/poincare.py index 0c49c761f2..0040352bde 100644 --- a/gensim/models/poincare.py +++ b/gensim/models/poincare.py @@ -53,7 +53,6 @@ from scipy.stats import spearmanr from six import string_types from six.moves import zip, range -from smart_open import smart_open from gensim import utils, matutils from gensim.models.keyedvectors import Vocab, BaseKeyedVectors @@ -1409,7 +1408,7 @@ def __iter__(self): Relation from input file. """ - with smart_open(self.file_path) as file_obj: + with utils.open(self.file_path, 'rb') as file_obj: if sys.version_info[0] < 3: lines = file_obj else: @@ -1490,7 +1489,7 @@ def __init__(self, file_path, embedding): items = set() embedding_vocab = embedding.vocab relations = defaultdict(set) - with smart_open(file_path, 'r') as f: + with utils.open(file_path, 'r') as f: reader = csv.reader(f, delimiter='\t') for row in reader: assert len(row) == 2, 'Hypernym pair has more than two items' @@ -1598,7 +1597,7 @@ def __init__(self, train_path, test_path, embedding): relations = {'known': defaultdict(set), 'unknown': defaultdict(set)} data_files = {'known': train_path, 'unknown': test_path} for relation_type, data_file in data_files.items(): - with smart_open(data_file, 'r') as f: + with utils.open(data_file, 'r') as f: reader = csv.reader(f, delimiter='\t') for row in reader: assert len(row) == 2, 'Hypernym pair has more than two items' @@ -1702,7 +1701,7 @@ def __init__(self, filepath): """ expected_scores = {} - with smart_open(filepath, 'r') as f: + with utils.open(filepath, 'r') as f: reader = csv.DictReader(f, delimiter=' ') for row in reader: word_1, word_2 = row['WORD1'], row['WORD2'] diff --git a/gensim/models/utils_any2vec.py b/gensim/models/utils_any2vec.py index 1d9e03647c..90d2f60fbc 100644 --- a/gensim/models/utils_any2vec.py +++ b/gensim/models/utils_any2vec.py @@ -274,12 +274,12 @@ def _save_word2vec_format(fname, vocab, vectors, fvocab=None, binary=False, tota vector_size = vectors.shape[1] if fvocab is not None: logger.info("storing vocabulary in %s", fvocab) - with utils.smart_open(fvocab, 'wb') as vout: + with utils.open(fvocab, 'wb') as vout: for word, vocab_ in sorted(iteritems(vocab), key=lambda item: -item[1].count): vout.write(utils.to_utf8("%s %s\n" % (word, vocab_.count))) logger.info("storing %sx%s projection weights into %s", total_vec, vector_size, fname) assert (len(vocab), vector_size) == vectors.shape - with utils.smart_open(fname, 'wb') as fout: + with utils.open(fname, 'wb') as fout: fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size))) # store in sorted order: most frequent words at the top for word, vocab_ in sorted(iteritems(vocab), key=lambda item: -item[1].count): @@ -333,13 +333,13 @@ def _load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8' if fvocab is not 
None: logger.info("loading word counts from %s", fvocab) counts = {} - with utils.smart_open(fvocab) as fin: + with utils.open(fvocab, 'rb') as fin: for line in fin: word, count = utils.to_unicode(line).strip().split() counts[word] = int(count) logger.info("loading projection weights from %s", fname) - with utils.smart_open(fname) as fin: + with utils.open(fname, 'rb') as fin: header = utils.to_unicode(fin.readline(), encoding=encoding) vocab_size, vector_size = (int(x) for x in header.split()) # throws for invalid file format if limit: diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index a2da6a64ff..5fd91c75ec 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -1077,7 +1077,7 @@ def intersect_word2vec_format(self, fname, lockf=0.0, binary=False, encoding='ut """ overlap_count = 0 logger.info("loading projection weights from %s", fname) - with utils.smart_open(fname) as fin: + with utils.open(fname, 'rb') as fin: header = utils.to_unicode(fin.readline(), encoding=encoding) vocab_size, vector_size = (int(x) for x in header.split()) # throws for invalid file format if not vector_size == self.wv.vector_size: @@ -1354,16 +1354,17 @@ def __iter__(self): fname = os.path.join(self.dirname, fname) if not os.path.isfile(fname): continue - for line in utils.smart_open(fname): - line = utils.to_unicode(line) - # each file line is a single sentence in the Brown corpus - # each token is WORD/POS_TAG - token_tags = [t.split('/') for t in line.split() if len(t.split('/')) == 2] - # ignore words with non-alphabetic tags like ",", "!" etc (punctuation, weird stuff) - words = ["%s/%s" % (token.lower(), tag[:2]) for token, tag in token_tags if tag[:2].isalpha()] - if not words: # don't bother sending out empty sentences - continue - yield words + with utils.open(fname, 'rb') as fin: + for line in fin: + line = utils.to_unicode(line) + # each file line is a single sentence in the Brown corpus + # each token is WORD/POS_TAG + token_tags = [t.split('/') for t in line.split() if len(t.split('/')) == 2] + # ignore words with non-alphabetic tags like ",", "!" 
etc (punctuation, weird stuff) + words = ["%s/%s" % (token.lower(), tag[:2]) for token, tag in token_tags if tag[:2].isalpha()] + if not words: # don't bother sending out empty sentences + continue + yield words class Text8Corpus(object): @@ -1376,7 +1377,7 @@ def __iter__(self): # the entire corpus is one gigantic line -- there are no sentence marks at all # so just split the sequence of tokens arbitrarily: 1 sentence = 1000 tokens sentence, rest = [], b'' - with utils.smart_open(self.fname) as fin: + with utils.open(self.fname, 'rb') as fin: while True: text = rest + fin.read(8192) # avoid loading the entire file (=1 line) into RAM if text == rest: # EOF @@ -1437,7 +1438,7 @@ def __iter__(self): i += self.max_sentence_length except AttributeError: # If it didn't work like a file, use it as a string filename - with utils.smart_open(self.source) as fin: + with utils.open(self.source, 'rb') as fin: for line in itertools.islice(fin, self.limit): line = utils.to_unicode(line).split() i = 0 @@ -1493,7 +1494,7 @@ def __iter__(self): """iterate through the files""" for file_name in self.input_files: logger.info('reading file %s', file_name) - with utils.smart_open(file_name) as fin: + with utils.open(file_name, 'rb') as fin: for line in itertools.islice(fin, self.limit): line = utils.to_unicode(line).split() i = 0 diff --git a/gensim/models/wrappers/dtmmodel.py b/gensim/models/wrappers/dtmmodel.py index 99601cd5c8..a0a2c7c6c5 100644 --- a/gensim/models/wrappers/dtmmodel.py +++ b/gensim/models/wrappers/dtmmodel.py @@ -322,7 +322,7 @@ def convert_input(self, corpus, time_slices): # write out the corpus in a file format that DTM understands: corpora.BleiCorpus.save_corpus(self.fcorpustxt(), corpus) - with utils.smart_open(self.ftimeslices(), 'wb') as fout: + with utils.open(self.ftimeslices(), 'wb') as fout: fout.write(utils.to_utf8(str(len(self.time_slices)) + "\n")) for sl in time_slices: fout.write(utils.to_utf8(str(sl) + "\n")) diff --git a/gensim/models/wrappers/ldamallet.py b/gensim/models/wrappers/ldamallet.py index c82dbf03f6..a7660b2eff 100644 --- a/gensim/models/wrappers/ldamallet.py +++ b/gensim/models/wrappers/ldamallet.py @@ -54,7 +54,6 @@ from itertools import chain import numpy -from smart_open import smart_open from gensim import utils, matutils from gensim.models import basemodel @@ -245,7 +244,7 @@ def convert_input(self, corpus, infer=False, serialize_corpus=True): """ if serialize_corpus: logger.info("serializing temporary corpus to %s", self.fcorpustxt()) - with smart_open(self.fcorpustxt(), 'wb') as fout: + with utils.open(self.fcorpustxt(), 'wb') as fout: self.corpus2mallet(corpus, fout) # convert the text file above into MALLET's internal format @@ -341,7 +340,7 @@ def load_word_topics(self): else: word2id = revdict(self.id2word) - with utils.smart_open(self.fstate()) as fin: + with utils.open(self.fstate(), 'rb') as fin: _ = next(fin) # header self.alpha = numpy.fromiter(next(fin).split()[2:], dtype=float) assert len(self.alpha) == self.num_topics, "mismatch between MALLET vs. 
requested topics" @@ -505,7 +504,7 @@ def read_doctopics(self, fname, eps=1e-6, renorm=True): """ mallet_version = self.get_version(self.mallet_path) - with utils.smart_open(fname) as fin: + with utils.open(fname, 'rb') as fin: for lineno, line in enumerate(fin): if lineno == 0 and line.startswith(b"#doc "): continue # skip the header line if it exists diff --git a/gensim/models/wrappers/ldavowpalwabbit.py b/gensim/models/wrappers/ldavowpalwabbit.py index 45ba27ba47..f7c286a349 100644 --- a/gensim/models/wrappers/ldavowpalwabbit.py +++ b/gensim/models/wrappers/ldavowpalwabbit.py @@ -414,12 +414,12 @@ def save(self, fname, *args, **kwargs): # variable before serialising this object - keeps all data # self contained within a single serialised file logger.debug("Reading model bytes from '%s'", self._model_filename) - with utils.smart_open(self._model_filename, 'rb') as fhandle: + with utils.open(self._model_filename, 'rb') as fhandle: self._model_data = fhandle.read() if os.path.exists(self._topics_filename): logger.debug("Reading topic bytes from '%s'", self._topics_filename) - with utils.smart_open(self._topics_filename, 'rb') as fhandle: + with utils.open(self._topics_filename, 'rb') as fhandle: self._topics_data = fhandle.read() if 'ignore' not in kwargs: @@ -444,13 +444,13 @@ def load(cls, fname, *args, **kwargs): # Vowpal Wabbit operates on its own binary model file - deserialise # to file at load time, making it immediately ready for use logger.debug("Writing model bytes to '%s'", lda_vw._model_filename) - with utils.smart_open(lda_vw._model_filename, 'wb') as fhandle: + with utils.open(lda_vw._model_filename, 'wb') as fhandle: fhandle.write(lda_vw._model_data) lda_vw._model_data = None # no need to keep in memory after this if lda_vw._topics_data: logger.debug("Writing topic bytes to '%s'", lda_vw._topics_filename) - with utils.smart_open(lda_vw._topics_filename, 'wb') as fhandle: + with utils.open(lda_vw._topics_filename, 'wb') as fhandle: fhandle.write(lda_vw._topics_data) lda_vw._topics_data = None @@ -566,7 +566,7 @@ def _load_vw_topics(self): """Read topics file generated by Vowpal Wabbit, convert to numpy array.""" topics = numpy.zeros((self.num_topics, self.num_terms), dtype=numpy.float32) - with utils.smart_open(self._topics_filename) as topics_file: + with utils.open(self._topics_filename, 'rb') as topics_file: found_data = False for line in topics_file: @@ -620,7 +620,7 @@ def _predict(self, chunk): predictions = numpy.zeros((corpus_size, self.num_topics), dtype=numpy.float32) - with utils.smart_open(self._predict_filename) as fhandle: + with utils.open(self._predict_filename, 'rb') as fhandle: for i, line in enumerate(fhandle): predictions[i, :] = line.split() @@ -796,7 +796,7 @@ def write_corpus_as_vw(corpus, filename): logger.debug("Writing corpus to: %s", filename) corpus_size = 0 - with utils.smart_open(filename, 'wb') as corpus_file: + with utils.open(filename, 'wb') as corpus_file: for line in corpus_to_vw(corpus): corpus_file.write(line.encode('utf-8') + b'\n') corpus_size += 1 diff --git a/gensim/models/wrappers/wordrank.py b/gensim/models/wrappers/wordrank.py index 018fe1f9d6..946787506f 100644 --- a/gensim/models/wrappers/wordrank.py +++ b/gensim/models/wrappers/wordrank.py @@ -59,7 +59,6 @@ from gensim.models.keyedvectors import KeyedVectors from gensim.scripts.glove2word2vec import glove2word2vec -from smart_open import smart_open from shutil import copyfile, rmtree @@ -174,19 +173,19 @@ def train(cls, wr_path, corpus_file, out_name, size=100, window=15, 
symmetric=1, logger.info("Prepare training data (%s) using glove code", ", ".join(input_fnames)) for command, input_fname, output_fname in zip(commands, input_fnames, output_fnames): - with smart_open(input_fname, 'rb') as r: - with smart_open(output_fname, 'wb') as w: + with utils.open(input_fname, 'rb') as r: + with utils.open(output_fname, 'wb') as w: utils.check_output(w, args=command, stdin=r) logger.info("Deleting frequencies from vocab file") - with smart_open(vocab_file, 'wb') as w: + with utils.open(vocab_file, 'wb') as w: utils.check_output(w, args=cmd_del_vocab_freq) - with smart_open(vocab_file, 'rb') as f: + with utils.open(vocab_file, 'rb') as f: numwords = sum(1 for _ in f) - with smart_open(cooccurrence_shuf_file, 'rb') as f: + with utils.open(cooccurrence_shuf_file, 'rb') as f: numlines = sum(1 for _ in f) - with smart_open(meta_file, 'wb') as f: + with utils.open(meta_file, 'wb') as f: meta_info = "{0} {1}\n{2} {3}\n{4} {5}".format( numwords, numwords, numlines, cooccurrence_shuf_file.split('/')[-1], numwords, vocab_file.split('/')[-1] @@ -284,7 +283,7 @@ def sort_embeddings(self, vocab_file): self.index2word = [] # sort embeddings using frequency sorted vocab file in wordrank - with utils.smart_open(vocab_file) as fin: + with utils.open(vocab_file, 'rb') as fin: for index, line in enumerate(fin): word, count = utils.to_unicode(line).strip(), vocab_size - index # store word with it's count in a dict diff --git a/gensim/parsing/preprocessing.py b/gensim/parsing/preprocessing.py index c8010a980a..97acef1f22 100644 --- a/gensim/parsing/preprocessing.py +++ b/gensim/parsing/preprocessing.py @@ -401,7 +401,7 @@ def preprocess_documents(docs): def read_file(path): - with utils.smart_open(path) as fin: + with utils.open(path, 'rb') as fin: return fin.read() diff --git a/gensim/scripts/glove2word2vec.py b/gensim/scripts/glove2word2vec.py index 39fb683f58..836b0e6b8f 100644 --- a/gensim/scripts/glove2word2vec.py +++ b/gensim/scripts/glove2word2vec.py @@ -59,7 +59,7 @@ import logging import argparse -from smart_open import smart_open +from gensim import utils logger = logging.getLogger(__name__) @@ -78,9 +78,9 @@ def get_glove_info(glove_file_name): Number of vectors (lines) of input file and its dimension. """ - with smart_open(glove_file_name) as f: + with utils.open(glove_file_name, 'rb') as f: num_lines = sum(1 for _ in f) - with smart_open(glove_file_name) as f: + with utils.open(glove_file_name, 'rb') as f: num_dims = len(f.readline().split()) - 1 return num_lines, num_dims @@ -103,9 +103,9 @@ def glove2word2vec(glove_input_file, word2vec_output_file): """ num_lines, num_dims = get_glove_info(glove_input_file) logger.info("converting %i vectors from %s to %s", num_lines, glove_input_file, word2vec_output_file) - with smart_open(word2vec_output_file, 'wb') as fout: + with utils.open(word2vec_output_file, 'wb') as fout: fout.write("{0} {1}\n".format(num_lines, num_dims).encode('utf-8')) - with smart_open(glove_input_file, 'rb') as fin: + with utils.open(glove_input_file, 'rb') as fin: for line in fin: fout.write(line) return num_lines, num_dims diff --git a/gensim/scripts/segment_wiki.py b/gensim/scripts/segment_wiki.py index e4b6bd9f8d..db15619fd6 100644 --- a/gensim/scripts/segment_wiki.py +++ b/gensim/scripts/segment_wiki.py @@ -23,20 +23,22 @@ .. 
sourcecode:: pycon - >>> from smart_open import smart_open + >>> from gensim import utils >>> import json >>> >>> # iterate over the plain text data we just created - >>> for line in smart_open('enwiki-latest.json.gz'): - >>> # decode each JSON line into a Python dictionary object - >>> article = json.loads(line) + >>> with utils.open('enwiki-latest.json.gz', 'rb') as f: + >>> for line in f: + >>> # decode each JSON line into a Python dictionary object + >>> article = json.loads(line) >>> - >>> # each article has a "title", a mapping of interlinks and a list of "section_titles" and "section_texts". - >>> print("Article title: %s" % article['title']) - >>> print("Interlinks: %s" + article['interlinks']) - >>> for section_title, section_text in zip(article['section_titles'], article['section_texts']): - >>> print("Section title: %s" % section_title) - >>> print("Section text: %s" % section_text) + >>> # each article has a "title", a mapping of interlinks and a list of "section_titles" and + >>> # "section_texts". + >>> print("Article title: %s" % article['title']) + >>> print("Interlinks: %s" + article['interlinks']) + >>> for section_title, section_text in zip(article['section_titles'], article['section_texts']): + >>> print("Section title: %s" % section_title) + >>> print("Section text: %s" % section_text) Notes @@ -63,7 +65,7 @@ from functools import partial from gensim.corpora.wikicorpus import IGNORED_NAMESPACES, WikiCorpus, filter_wiki, find_interlinks, get_namespace, utils -from smart_open import smart_open +import gensim.utils logger = logging.getLogger(__name__) @@ -92,7 +94,7 @@ def segment_all_articles(file_path, min_article_character=200, workers=None, inc Structure contains (title, [(section_heading, section_content), ...], (Optionally) {interlinks}). 
""" - with smart_open(file_path, 'rb') as xml_fileobj: + with gensim.utils.open(file_path, 'rb') as xml_fileobj: wiki_sections_corpus = _WikiSectionsCorpus( xml_fileobj, min_article_character=min_article_character, processes=workers, include_interlinks=include_interlinks) @@ -135,7 +137,7 @@ def segment_and_write_all_articles(file_path, output_file, min_article_character if output_file is None: outfile = getattr(sys.stdout, 'buffer', sys.stdout) # we want write bytes, so for py3 we used 'buffer' else: - outfile = smart_open(output_file, 'wb') + outfile = gensim.utils.open(output_file, 'wb') try: article_stream = segment_all_articles(file_path, min_article_character, workers=workers, diff --git a/gensim/scripts/word2vec2tensor.py b/gensim/scripts/word2vec2tensor.py index 5bf8d2e23b..5d151aa854 100644 --- a/gensim/scripts/word2vec2tensor.py +++ b/gensim/scripts/word2vec2tensor.py @@ -44,8 +44,8 @@ import logging import argparse -from smart_open import smart_open import gensim +from gensim import utils logger = logging.getLogger(__name__) @@ -69,7 +69,7 @@ def word2vec2tensor(word2vec_model_path, tensor_filename, binary=False): outfiletsv = tensor_filename + '_tensor.tsv' outfiletsvmeta = tensor_filename + '_metadata.tsv' - with smart_open(outfiletsv, 'wb') as file_vector, smart_open(outfiletsvmeta, 'wb') as file_metadata: + with utils.open(outfiletsv, 'wb') as file_vector, utils.open(outfiletsvmeta, 'wb') as file_metadata: for word in model.index2word: file_metadata.write(gensim.utils.to_utf8(word) + gensim.utils.to_utf8('\n')) vector_row = '\t'.join(str(x) for x in model[word]) diff --git a/gensim/similarities/index.py b/gensim/similarities/index.py index 681fe58ef4..1f27e6c82c 100644 --- a/gensim/similarities/index.py +++ b/gensim/similarities/index.py @@ -33,12 +33,12 @@ """ import os -from smart_open import smart_open try: import cPickle as _pickle except ImportError: import pickle as _pickle +from gensim import utils from gensim.models.doc2vec import Doc2Vec from gensim.models.word2vec import Word2Vec from gensim.models.fasttext import FastText @@ -116,7 +116,7 @@ def save(self, fname, protocol=2): fname_dict = fname + '.d' self.index.save(fname) d = {'f': self.model.vector_size, 'num_trees': self.num_trees, 'labels': self.labels} - with smart_open(fname_dict, 'wb') as fout: + with utils.open(fname_dict, 'wb') as fout: _pickle.dump(d, fout, protocol=protocol) def load(self, fname): @@ -153,7 +153,7 @@ def load(self, fname): "Can't find index files '%s' and '%s' - Unable to restore AnnoyIndexer state." 
% (fname, fname_dict) ) else: - with smart_open(fname_dict) as f: + with utils.open(fname_dict, 'rb') as f: d = _pickle.loads(f.read()) self.num_trees = d['num_trees'] self.index = AnnoyIndex(d['f']) diff --git a/gensim/test/test_doc2vec.py b/gensim/test/test_doc2vec.py index 6ac510ea8e..712d9778bd 100644 --- a/gensim/test/test_doc2vec.py +++ b/gensim/test/test_doc2vec.py @@ -190,7 +190,7 @@ def test_get_offsets_and_start_doctags(self): lines = ['line1\n', 'line2\n', 'line3\n', 'line4\n', 'line5\n'] tmpf = get_tmpfile('gensim_doc2vec.tst') - with utils.smart_open(tmpf, 'wb', encoding='utf8') as fout: + with utils.open(tmpf, 'wb', encoding='utf8') as fout: for line in lines: fout.write(utils.any2unicode(line)) @@ -224,7 +224,7 @@ def test_get_offsets_and_start_doctags_win(self): lines = ['line1\n', 'line2\n', 'line3\n', 'line4\n', 'line5\n'] tmpf = get_tmpfile('gensim_doc2vec.tst') - with utils.smart_open(tmpf, 'wb', encoding='utf8') as fout: + with utils.open(tmpf, 'wb', encoding='utf8') as fout: for line in lines: fout.write(utils.any2unicode(line)) @@ -257,7 +257,7 @@ def test_cython_linesentence_readline_after_getting_offsets(self): lines = ['line1\n', 'line2\n', 'line3\n', 'line4\n', 'line5\n'] tmpf = get_tmpfile('gensim_doc2vec.tst') - with utils.smart_open(tmpf, 'wb', encoding='utf8') as fout: + with utils.open(tmpf, 'wb', encoding='utf8') as fout: for line in lines: fout.write(utils.any2unicode(line)) diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py index 99dd9c503f..e0f7597925 100644 --- a/gensim/test/test_fasttext.py +++ b/gensim/test/test_fasttext.py @@ -12,8 +12,6 @@ import numpy as np -import smart_open - from gensim import utils from gensim.models.word2vec import LineSentence from gensim.models.fasttext import FastText as FT_gensim @@ -995,7 +993,7 @@ def setUp(self): def test_in_vocab(self): """Test for correct representation of in-vocab words.""" native = load_native() - with smart_open.smart_open(datapath('toy-model.vec'), 'r', encoding='utf-8') as fin: + with utils.open(datapath('toy-model.vec'), 'r', encoding='utf-8') as fin: expected = dict(load_vec(fin)) for word, expected_vector in expected.items(): @@ -1187,7 +1185,7 @@ def setUp(self): # ./fasttext skipgram -minCount 0 -bucket 100 -input crime-and-punishment.txt -output crime-and-punishment -dim 5 # noqa: E501 # self.model = gensim.models.fasttext.load_facebook_model(datapath('crime-and-punishment.bin')) - with smart_open.smart_open(datapath('crime-and-punishment.vec'), 'r', encoding='utf-8') as fin: + with utils.open(datapath('crime-and-punishment.vec'), 'r', encoding='utf-8') as fin: self.expected = dict(load_vec(fin)) def test_ascii(self): diff --git a/gensim/test/test_keywords.py b/gensim/test/test_keywords.py index 79df82fba6..6011c83df4 100644 --- a/gensim/test/test_keywords.py +++ b/gensim/test/test_keywords.py @@ -25,14 +25,14 @@ class TestKeywordsTest(unittest.TestCase): def test_text_keywords(self): pre_path = os.path.join(os.path.dirname(__file__), 'test_data') - with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f: + with utils.open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f: text = f.read() # calculate keywords generated_keywords = keywords(text, split=True) # To be compared to the reference. 
- with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.kw.txt"), mode="r") as f: + with utils.open(os.path.join(pre_path, "mihalcea_tarau.kw.txt"), mode="r") as f: kw = f.read().strip().split("\n") self.assertEqual({str(x) for x in generated_keywords}, {str(x) for x in kw}) @@ -40,7 +40,7 @@ def test_text_keywords(self): def test_text_keywords_words(self): pre_path = os.path.join(os.path.dirname(__file__), 'test_data') - with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f: + with utils.open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f: text = f.read() # calculate exactly 13 keywords @@ -51,14 +51,14 @@ def test_text_keywords_words(self): def test_text_keywords_pos(self): pre_path = os.path.join(os.path.dirname(__file__), 'test_data') - with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f: + with utils.open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f: text = f.read() # calculate keywords using only certain parts of speech generated_keywords_nnvbjj = keywords(text, pos_filter=['NN', 'VB', 'JJ'], ratio=0.3, split=True) # To be compared to the reference. - with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.kwpos.txt"), mode="r") as f: + with utils.open(os.path.join(pre_path, "mihalcea_tarau.kwpos.txt"), mode="r") as f: kw = f.read().strip().split("\n") self.assertEqual({str(x) for x in generated_keywords_nnvbjj}, {str(x) for x in kw}) @@ -66,7 +66,7 @@ def test_text_keywords_pos(self): def test_text_summarization_raises_exception_on_short_input_text(self): pre_path = os.path.join(os.path.dirname(__file__), 'test_data') - with utils.smart_open(os.path.join(pre_path, "testsummarization_unrelated.txt"), mode="r") as f: + with utils.open(os.path.join(pre_path, "testsummarization_unrelated.txt"), mode="r") as f: text = f.read() # Keeps the first 8 sentences to make the text shorter. @@ -77,7 +77,7 @@ def test_text_summarization_raises_exception_on_short_input_text(self): def test_keywords_ratio(self): pre_path = os.path.join(os.path.dirname(__file__), 'test_data') - with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f: + with utils.open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f: text = f.read() # Check ratio parameter is well behaved. 
Because length is taken on tokenized clean text diff --git a/gensim/test/test_lee.py b/gensim/test/test_lee.py index ff0de9dc3f..1eadd398a9 100644 --- a/gensim/test/test_lee.py +++ b/gensim/test/test_lee.py @@ -52,13 +52,13 @@ def setUp(self): # read in the corpora latin1 = partial(utils.to_unicode, encoding='latin1') - with utils.smart_open(os.path.join(pre_path, bg_corpus_file)) as f: + with utils.open(os.path.join(pre_path, bg_corpus_file), 'rb') as f: bg_corpus = preprocess_documents(latin1(line) for line in f) - with utils.smart_open(os.path.join(pre_path, corpus_file)) as f: + with utils.open(os.path.join(pre_path, corpus_file), 'rb') as f: corpus = preprocess_documents(latin1(line) for line in f) - with utils.smart_open(os.path.join(pre_path, bg_corpus_file)) as f: + with utils.open(os.path.join(pre_path, bg_corpus_file), 'rb') as f: bg_corpus2 = [preprocess_string(latin1(s), filters=DEFAULT_FILTERS[:-1]) for s in f] - with utils.smart_open(os.path.join(pre_path, corpus_file)) as f: + with utils.open(os.path.join(pre_path, corpus_file), 'rb') as f: corpus2 = [preprocess_string(latin1(s), filters=DEFAULT_FILTERS[:-1]) for s in f] # read the human similarity data diff --git a/gensim/test/test_scripts.py b/gensim/test/test_scripts.py index 2fa625e942..1e0144e2af 100644 --- a/gensim/test/test_scripts.py +++ b/gensim/test/test_scripts.py @@ -16,9 +16,9 @@ import os.path import unittest -from smart_open import smart_open import numpy as np +from gensim import utils from gensim.scripts.segment_wiki import segment_all_articles, segment_and_write_all_articles from gensim.test.utils import datapath, get_tmpfile @@ -85,7 +85,8 @@ def test_json_len(self): segment_and_write_all_articles(self.fname, tmpf, workers=1) expected_num_articles = 106 - num_articles = sum(1 for line in smart_open(tmpf)) + with utils.open(tmpf, 'rb') as f: + num_articles = sum(1 for line in f) self.assertEqual(num_articles, expected_num_articles) def test_segment_and_write_all_articles(self): @@ -120,14 +121,14 @@ def setUp(self): def testConversion(self): word2vec2tensor(word2vec_model_path=self.datapath, tensor_filename=self.output_folder) - with smart_open(self.metadata_file, 'rb') as f: + with utils.open(self.metadata_file, 'rb') as f: metadata = f.readlines() - with smart_open(self.tensor_file, 'rb') as f: + with utils.open(self.tensor_file, 'rb') as f: vectors = f.readlines() # check if number of words and vector size in tensor file line up with word2vec - with smart_open(self.datapath, 'rb') as f: + with utils.open(self.datapath, 'rb') as f: first_line = f.readline().strip() number_words, vector_size = map(int, first_line.split(b' ')) diff --git a/gensim/test/test_similarities.py b/gensim/test/test_similarities.py index 428431ea15..5a739db7e7 100644 --- a/gensim/test/test_similarities.py +++ b/gensim/test/test_similarities.py @@ -17,7 +17,7 @@ import numpy import scipy -from smart_open import smart_open +from gensim import utils from gensim.corpora import Dictionary from gensim.models import word2vec from gensim.models import doc2vec @@ -560,7 +560,7 @@ def __init__(self, fn): self.fn = fn def __iter__(self): - with smart_open(self.fn, 'r', encoding="latin_1") as infile: + with utils.open(self.fn, 'r', encoding="latin_1") as infile: for line in infile: yield line.lower().strip().split() diff --git a/gensim/test/test_summarization.py b/gensim/test/test_summarization.py index 81a562a9d8..c7ef335323 100644 --- a/gensim/test/test_summarization.py +++ b/gensim/test/test_summarization.py @@ -142,7 +142,7 @@ class 
TestSummarizationTest(unittest.TestCase): def _get_text_from_test_data(self, file): pre_path = os.path.join(os.path.dirname(__file__), 'test_data') - with utils.smart_open(os.path.join(pre_path, file), mode="r") as f: + with utils.open(os.path.join(pre_path, file), mode="r") as f: return f.read() def test_text_summarization(self): @@ -268,7 +268,7 @@ def test_keywords_runs(self): def test_mz_keywords(self): pre_path = os.path.join(os.path.dirname(__file__), 'test_data') - with utils.smart_open(os.path.join(pre_path, "head500.noblanks.cor")) as f: + with utils.open(os.path.join(pre_path, "head500.noblanks.cor"), 'rb') as f: text = utils.to_unicode(f.read()) text = u' '.join(text.split()[:10240]) kwds = mz_keywords(text) diff --git a/gensim/test/test_translation_matrix.py b/gensim/test/test_translation_matrix.py index 4f7959336e..7be7ce4b63 100644 --- a/gensim/test/test_translation_matrix.py +++ b/gensim/test/test_translation_matrix.py @@ -77,7 +77,7 @@ def test_translate_gc(self): def read_sentiment_docs(filename): sentiment_document = namedtuple('SentimentDocument', 'words tags') alldocs = [] # will hold all docs in original order - with utils.smart_open(filename, encoding='utf-8') as alldata: + with utils.open(filename, mode='rb', encoding='utf-8') as alldata: for line_no, line in enumerate(alldata): tokens = utils.to_unicode(line).split() words = tokens diff --git a/gensim/test/test_utils.py b/gensim/test/test_utils.py index c23087580f..f6d954777c 100644 --- a/gensim/test/test_utils.py +++ b/gensim/test/test_utils.py @@ -20,8 +20,6 @@ import gensim.models.utils_any2vec -import smart_open - DISABLE_CYTHON_TESTS = getattr(gensim.models.utils_any2vec, 'FAST_VERSION', None) == -1 @@ -252,7 +250,7 @@ def test_save_as_line_sentence_en(self): utils.save_as_line_sentence(ref_sentences, corpus_file) - with utils.smart_open(corpus_file, encoding='utf8') as fin: + with utils.open(corpus_file, 'rb', encoding='utf8') as fin: sentences = [line.strip().split() for line in fin.read().strip().split('\n')] self.assertEqual(sentences, ref_sentences) @@ -261,7 +259,7 @@ def test_save_as_line_sentence_ru(self): ref_sentences = [l.split() for l in utils.any2unicode('привет мир\nкак ты поживаешь').split('\n')] utils.save_as_line_sentence(ref_sentences, corpus_file) - with utils.smart_open(corpus_file, encoding='utf8') as fin: + with utils.open(corpus_file, 'rb', encoding='utf8') as fin: sentences = [line.strip().split() for line in fin.read().strip().split('\n')] self.assertEqual(sentences, ref_sentences) @@ -520,7 +518,7 @@ def test_bytes_cy(self): def test_fb(self): """Test against results from Facebook's implementation.""" - with smart_open.smart_open(datapath('fb-ngrams.txt'), 'r', encoding='utf-8') as fin: + with utils.open(datapath('fb-ngrams.txt'), 'r', encoding='utf-8') as fin: fb = dict(_read_fb(fin)) for word, expected in fb.items(): diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py index 11257bebb1..b7cc5a8fb5 100644 --- a/gensim/test/test_word2vec.py +++ b/gensim/test/test_word2vec.py @@ -1060,7 +1060,7 @@ def testIdenticalSentences(self): class TestWord2VecSentenceIterators(unittest.TestCase): def testLineSentenceWorksWithFilename(self): """Does LineSentence work with a filename argument?""" - with utils.smart_open(datapath('lee_background.cor')) as orig: + with utils.open(datapath('lee_background.cor'), 'rb') as orig: sentences = word2vec.LineSentence(datapath('lee_background.cor')) for words in sentences: self.assertEqual(words, utils.to_unicode(orig.readline()).split()) 
@@ -1069,41 +1069,41 @@ def testLineSentenceWorksWithFilename(self): def testCythonLineSentenceWorksWithFilename(self): """Does CythonLineSentence work with a filename argument?""" from gensim.models import word2vec_corpusfile - with utils.smart_open(datapath('lee_background.cor')) as orig: + with utils.open(datapath('lee_background.cor'), 'rb') as orig: sentences = word2vec_corpusfile.CythonLineSentence(datapath('lee_background.cor')) for words in sentences: self.assertEqual(words, orig.readline().split()) def testLineSentenceWorksWithCompressedFile(self): """Does LineSentence work with a compressed file object argument?""" - with utils.smart_open(datapath('head500.noblanks.cor')) as orig: + with utils.open(datapath('head500.noblanks.cor'), 'rb') as orig: sentences = word2vec.LineSentence(bz2.BZ2File(datapath('head500.noblanks.cor.bz2'))) for words in sentences: self.assertEqual(words, utils.to_unicode(orig.readline()).split()) def testLineSentenceWorksWithNormalFile(self): """Does LineSentence work with a file object argument, rather than filename?""" - with utils.smart_open(datapath('head500.noblanks.cor')) as orig: - with utils.smart_open(datapath('head500.noblanks.cor')) as fin: + with utils.open(datapath('head500.noblanks.cor'), 'rb') as orig: + with utils.open(datapath('head500.noblanks.cor'), 'rb') as fin: sentences = word2vec.LineSentence(fin) for words in sentences: self.assertEqual(words, utils.to_unicode(orig.readline()).split()) def testPathLineSentences(self): """Does PathLineSentences work with a path argument?""" - with utils.smart_open(os.path.join(datapath('PathLineSentences'), '1.txt')) as orig1,\ - utils.smart_open(os.path.join(datapath('PathLineSentences'), '2.txt.bz2')) as orig2: - sentences = word2vec.PathLineSentences(datapath('PathLineSentences')) - orig = orig1.readlines() + orig2.readlines() - orig_counter = 0 # to go through orig while matching PathLineSentences - for words in sentences: - self.assertEqual(words, utils.to_unicode(orig[orig_counter]).split()) - orig_counter += 1 + with utils.open(os.path.join(datapath('PathLineSentences'), '1.txt'), 'rb') as orig1: + with utils.open(os.path.join(datapath('PathLineSentences'), '2.txt.bz2'), 'rb') as orig2: + sentences = word2vec.PathLineSentences(datapath('PathLineSentences')) + orig = orig1.readlines() + orig2.readlines() + orig_counter = 0 # to go through orig while matching PathLineSentences + for words in sentences: + self.assertEqual(words, utils.to_unicode(orig[orig_counter]).split()) + orig_counter += 1 def testPathLineSentencesOneFile(self): """Does PathLineSentences work with a single file argument?""" test_file = os.path.join(datapath('PathLineSentences'), '1.txt') - with utils.smart_open(test_file) as orig: + with utils.open(test_file, 'rb') as orig: sentences = word2vec.PathLineSentences(test_file) for words in sentences: self.assertEqual(words, utils.to_unicode(orig.readline()).split()) diff --git a/gensim/utils.py b/gensim/utils.py index 4b6853a3b8..cf7eca6c90 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -42,7 +42,7 @@ from six import iterkeys, iteritems, itervalues, u, string_types, unichr from six.moves import range -from smart_open import smart_open +from smart_open import open from multiprocessing import cpu_count @@ -128,7 +128,7 @@ def file_or_filename(input): """ if isinstance(input, string_types): # input was a filename: open as file - return smart_open(input) + return open(input, 'rb') else: # input already a file-like object; just reset to the beginning input.seek(0) @@ -1360,7 
+1360,7 @@ def pickle(obj, fname, protocol=2): Pickle protocol number. Default is 2 in order to support compatibility across python 2.x and 3.x. """ - with smart_open(fname, 'wb') as fout: # 'b' for binary, needed on Windows + with open(fname, 'wb') as fout: # 'b' for binary, needed on Windows _pickle.dump(obj, fout, protocol=protocol) @@ -1378,7 +1378,7 @@ def unpickle(fname): Python object loaded from `fname`. """ - with smart_open(fname, 'rb') as f: + with open(fname, 'rb') as f: # Because of loading from S3 load can't be used (missing readline in smart_open) if sys.version_info > (3, 0): return _pickle.load(f, encoding='latin1') @@ -2079,7 +2079,7 @@ def save_as_line_sentence(corpus, filename): corpus : iterable of iterables of strings """ - with smart_open(filename, mode='wb', encoding='utf8') as fout: + with open(filename, mode='wb', encoding='utf8') as fout: for sentence in corpus: line = any2unicode(' '.join(sentence) + '\n') fout.write(line) From fd025137aafa045fb244ff36b4c39c28e86eb686 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Fri, 28 Jun 2019 13:35:42 +0900 Subject: [PATCH 2/3] reduce scope of context manager in csvcorpus.py --- gensim/corpora/csvcorpus.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/gensim/corpora/csvcorpus.py b/gensim/corpora/csvcorpus.py index 59fbbe16f2..a3e94ae85e 100644 --- a/gensim/corpora/csvcorpus.py +++ b/gensim/corpora/csvcorpus.py @@ -47,9 +47,10 @@ def __init__(self, fname, labels): # load the first few lines, to guess the CSV dialect with utils.open(self.fname, 'rb') as f: head = ''.join(itertools.islice(f, 5)) - self.headers = csv.Sniffer().has_header(head) - self.dialect = csv.Sniffer().sniff(head) - logger.info("sniffed CSV delimiter=%r, headers=%s", self.dialect.delimiter, self.headers) + + self.headers = csv.Sniffer().has_header(head) + self.dialect = csv.Sniffer().sniff(head) + logger.info("sniffed CSV delimiter=%r, headers=%s", self.dialect.delimiter, self.headers) def __iter__(self): """Iterate over the corpus, returning one BoW vector at a time. From bb1a4f34aebb69f8167791be4609744ceb305465 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Fri, 28 Jun 2019 14:38:47 +0900 Subject: [PATCH 3/3] Update csvcorpus.py --- gensim/corpora/csvcorpus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/corpora/csvcorpus.py b/gensim/corpora/csvcorpus.py index a3e94ae85e..505d7c9be1 100644 --- a/gensim/corpora/csvcorpus.py +++ b/gensim/corpora/csvcorpus.py @@ -47,7 +47,7 @@ def __init__(self, fname, labels): # load the first few lines, to guess the CSV dialect with utils.open(self.fname, 'rb') as f: head = ''.join(itertools.islice(f, 5)) - + self.headers = csv.Sniffer().has_header(head) self.dialect = csv.Sniffer().sniff(head) logger.info("sniffed CSV delimiter=%r, headers=%s", self.dialect.delimiter, self.headers)
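
For reference, the calling convention this series standardizes on is gensim.utils.open (re-exported from smart_open.open in gensim/utils.py, see the final hunk of patch 1) with an explicit mode and a context manager, in place of the deprecated smart_open()/utils.smart_open() calls. A minimal sketch of the new pattern, assuming a gensim checkout with this series applied and an existing input file; count_lines is a hypothetical helper for illustration only, not part of the patch:

    from gensim import utils

    def count_lines(path):
        # old (deprecated, triggers the smart_open warning): for line in utils.smart_open(path): ...
        # new: explicit binary mode, handle closed automatically by the context manager
        with utils.open(path, 'rb') as fin:
            return sum(1 for _ in fin)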