diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py index fb402da517..f5f30281eb 100755 --- a/gensim/corpora/wikicorpus.py +++ b/gensim/corpora/wikicorpus.py @@ -173,7 +173,7 @@ def tokenize(content): """ # TODO maybe ignore tokens with non-latin characters? (no chinese, arabic, russian etc.) return [ - token.encode('utf8') for token in utils.tokenize(content, lower=True, errors='ignore') + utils.to_unicode(token) for token in utils.tokenize(content, lower=True, errors='ignore') if 2 <= len(token) <= 15 and not token.startswith('_') ] diff --git a/gensim/test/test_data/bgwiki-latest-pages-articles-shortened.xml.bz2 b/gensim/test/test_data/bgwiki-latest-pages-articles-shortened.xml.bz2 new file mode 100644 index 0000000000..11f3d795c3 Binary files /dev/null and b/gensim/test/test_data/bgwiki-latest-pages-articles-shortened.xml.bz2 differ diff --git a/gensim/test/test_wikicorpus.py b/gensim/test/test_wikicorpus.py index 9bbb441c17..36594b205e 100644 --- a/gensim/test/test_wikicorpus.py +++ b/gensim/test/test_wikicorpus.py @@ -21,6 +21,7 @@ module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder datapath = lambda fname: os.path.join(module_path, 'test_data', fname) FILENAME = 'enwiki-latest-pages-articles1.xml-p000000010p000030302-shortened.bz2' +FILENAME_U = 'bgwiki-latest-pages-articles-shortened.xml.bz2' logger = logging.getLogger(__name__) @@ -45,14 +46,21 @@ def test_first_element(self): 1) anarchism 2) autism """ - if sys.version_info < (2, 7, 0): - return - wc = WikiCorpus(datapath(FILENAME)) + wc = WikiCorpus(datapath(FILENAME), processes=1) l = wc.get_texts() - self.assertTrue(b"anarchism" in next(l)) - self.assertTrue(b"autism" in next(l)) + self.assertTrue(u'anarchism' in next(l)) + self.assertTrue(u'autism' in next(l)) + def test_unicode_element(self): + """ + First unicode article in this sample is + 1) папа + """ + wc = WikiCorpus(datapath(FILENAME_U), processes=1) + + l = wc.get_texts() + self.assertTrue(u'папа' in next(l)) if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)