Correction of tests for the WikiCorpus class to be able to receive a list of tokenizing functions.
fabriciorsf committed Jul 23, 2024
1 parent f2b2ce1 commit 9ce1e14
Showing 1 changed file with 3 additions and 2 deletions.
gensim/test/test_corpora.py: 5 changes (3 additions & 2 deletions)
@@ -620,7 +620,7 @@ def test_indexing(self):
 
 # Needed for test_simple_tokenizer and test_list_tokenizers in the TestWikiCorpus class.
 # Cannot be nested due to serializing.
-def simple_tokenize(content, token_min_len=2, token_max_len=15, lower=True):
+def simple_tokenizer(content, token_min_len=2, token_max_len=15, lower=True):
     return [
         token for token in (content.lower() if lower else content).split()
         if token_min_len <= len(token) <= token_max_len]
@@ -732,7 +732,8 @@ def test_list_tokenizers(self):
         """
         define a list containing two tokenizer functions (simple and custom) and use it
         """
-        wc = self.corpus_class(self.enwiki, processes=1, tokenizer_func=[simple_tokenizer, custom_tokenizer],
+        wc = self.corpus_class(self.enwiki, processes=1,
+                               tokenizer_func=[simple_tokenizer, custom_tokenizer],
                                token_max_len=16, token_min_len=1, lower=False)
         row = wc.get_texts()
         list_tokens = next(row)
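For context, the updated test_list_tokenizers passes a list of tokenizer functions to tokenizer_func instead of a single callable. The sketch below mirrors that usage outside the test suite. It is a minimal sketch only: it assumes the list-of-tokenizers support added on this branch (released gensim expects a single callable for tokenizer_func), the dump filename is a placeholder, and custom_tokenizer is a hypothetical stand-in for the second tokenizer used in the test. How WikiCorpus combines the tokenizers internally is not shown in this diff.

# Minimal sketch: requires the list-of-tokenizers support from this branch;
# released gensim expects a single callable for tokenizer_func.
from gensim.corpora.wikicorpus import WikiCorpus


def simple_tokenizer(content, token_min_len=2, token_max_len=15, lower=True):
    # Module-level, like the test helper: WikiCorpus pickles the tokenizer
    # for multiprocessing, so it cannot be a nested function.
    return [
        token for token in (content.lower() if lower else content).split()
        if token_min_len <= len(token) <= token_max_len]


def custom_tokenizer(content, token_min_len=2, token_max_len=15, lower=True):
    # Hypothetical second tokenizer: same signature, keeps alphabetic tokens only.
    return [
        token for token in (content.lower() if lower else content).split()
        if token.isalpha() and token_min_len <= len(token) <= token_max_len]


if __name__ == "__main__":
    # 'enwiki-articles.xml.bz2' is a placeholder path to a Wikipedia dump;
    # the test uses a small bundled fixture (self.enwiki) instead.
    wc = WikiCorpus(
        'enwiki-articles.xml.bz2', processes=1,
        tokenizer_func=[simple_tokenizer, custom_tokenizer],
        token_max_len=16, token_min_len=1, lower=False)
    first_article_tokens = next(wc.get_texts())  # tokens of the first article
    print(first_article_tokens[:10])

Keeping both tokenizers at module level matches the comment in the diff: nested functions cannot be serialized when the corpus is processed with worker processes.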
