Improvement: WikiCorpus class can now receive multiple tokenizing functions, passed as a single function, a list, or a tuple.

Also adds tests checking that the WikiCorpus class can receive multiple tokenizing functions.
fabriciorsf committed Jul 23, 2024
1 parent c964f92 commit 64a3727
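
In short, `tokenizer_func` may now be a single callable or a list/tuple of callables applied in sequence. Below is a minimal sketch of the new behavior, assuming a gensim build with this patch applied; the two tokenizers are hypothetical examples, not part of the commit:

    from gensim.corpora.wikicorpus import process_article

    # Hypothetical tokenizers; each must follow the documented interface:
    # tokenizer_func(text, token_min_len, token_max_len, lower) -> list of str
    def lowercase_tokens(text, token_min_len, token_max_len, lower):
        return text.lower().split()

    def drop_short_tokens(text, token_min_len, token_max_len, lower):
        return [t for t in text.split() if len(t) >= token_min_len]

    # A list (or tuple) of tokenizers is applied in order; passing a single
    # function keeps the old behavior.
    tokens, title, pageid = process_article(
        ("Some '''wiki''' markup text", "Example", 1),
        tokenizer_func=[lowercase_tokens, drop_short_tokens],
    )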
Showing 2 changed files with 1,003 additions and 965 deletions.
25 changes: 14 additions & 11 deletions gensim/corpora/wikicorpus.py
@@ -25,13 +25,11 @@
# LXML isn't faster, so let's go with the built-in solution
from xml.etree.ElementTree import iterparse


from gensim import utils
# cannot import whole gensim.corpora, because that imports wikicorpus...
from gensim.corpora.dictionary import Dictionary
from gensim.corpora.textcorpus import TextCorpus


logger = logging.getLogger(__name__)

ARTICLE_MIN_WORDS = 50
@@ -468,10 +466,10 @@ def process_article(
----------
args : (str, str, int)
Article text, article title, page identifier.
tokenizer_func : function
Function for tokenization (defaults is :func:`~gensim.corpora.wikicorpus.tokenize`).
Needs to have interface:
tokenizer_func(text: str, token_min_len: int, token_max_len: int, lower: bool) -> list of str.
tokenizer_func : function or list/tuple of functions, optional
Function(s) used for tokenization (default is :func:`~gensim.corpora.wikicorpus.tokenize`).
Each function needs to have the interface:
`tokenizer_func(text: str, token_min_len: int, token_max_len: int, lower: bool) -> list of str`.
token_min_len : int
Minimal token length.
token_max_len : int
@@ -487,7 +485,11 @@ def process_article(
"""
text, title, pageid = args
text = filter_wiki(text)
result = tokenizer_func(text, token_min_len, token_max_len, lower)
tokenizers = [] if (tokenizer_func is None) \
else (list(tokenizer_func) if isinstance(tokenizer_func, (list, tuple)) else [tokenizer_func])
for tokenizer in tokenizers:
text = " ".join(tokenizer(text, token_min_len, token_max_len, lower))
result = text.split()
return result, title, pageid
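
For clarity, the condensed expression above normalizes `tokenizer_func` to a list (None becomes an empty list, a single callable becomes a one-element list, a list/tuple is copied) and then pipes the text through each tokenizer, space-joining between stages. An equivalent standalone sketch follows; the name `apply_tokenizers` is ours, not part of the patch:

    def apply_tokenizers(text, tokenizer_func, token_min_len, token_max_len, lower):
        # Normalize to a list of tokenizers, mirroring the patched code:
        # None -> [], list/tuple -> list copy, single callable -> [callable].
        if tokenizer_func is None:
            tokenizers = []
        elif isinstance(tokenizer_func, (list, tuple)):
            tokenizers = list(tokenizer_func)
        else:
            tokenizers = [tokenizer_func]
        # Each tokenizer receives the space-joined output of the previous one.
        for tokenizer in tokenizers:
            text = " ".join(tokenizer(text, token_min_len, token_max_len, lower))
        # With an empty tokenizer list, this degenerates to whitespace splitting.
        return text.split()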


@@ -569,6 +571,7 @@ class WikiCorpus(TextCorpus):
>>> MmCorpus.serialize(corpus_path, wiki) # another 8h, creates a file in MatrixMarket format and mapping
"""

def __init__(
self, fname, processes=None, lemmatize=None, dictionary=None, metadata=False,
filter_namespaces=('0',), tokenizer_func=tokenize, article_min_tokens=ARTICLE_MIN_WORDS,
@@ -590,10 +593,10 @@ def __init__(
**IMPORTANT: this needs a really long time**.
filter_namespaces : tuple of str, optional
Namespaces to consider.
tokenizer_func : function, optional
Function that will be used for tokenization. By default, use :func:`~gensim.corpora.wikicorpus.tokenize`.
If you inject your own tokenizer, it must conform to this interface:
`tokenizer_func(text: str, token_min_len: int, token_max_len: int, lower: bool) -> list of str`
tokenizer_func : function or list/tuple of functions, optional
Function(s) used for tokenization (default is :func:`~gensim.corpora.wikicorpus.tokenize`).
Each function needs to have the interface:
`tokenizer_func(text: str, token_min_len: int, token_max_len: int, lower: bool) -> list of str`.
article_min_tokens : int, optional
Minimum tokens in article. Article will be ignored if number of tokens is less.
token_min_len : int, optional
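
A usage sketch of the new parameter on `WikiCorpus`, under the same assumptions as above: the dump path is a placeholder, and `drop_numeric` is a hypothetical second tokenizer.

    from gensim.corpora import WikiCorpus
    from gensim.corpora.wikicorpus import tokenize

    # Hypothetical extra tokenizer conforming to the documented interface.
    def drop_numeric(text, token_min_len, token_max_len, lower):
        return [t for t in text.split() if not t.isdigit()]

    # The built-in tokenize runs first; drop_numeric then filters its output.
    # Note: constructing the corpus scans the whole dump, which is expensive.
    wiki = WikiCorpus(
        'enwiki-latest-pages-articles.xml.bz2',  # placeholder dump path
        tokenizer_func=[tokenize, drop_numeric],
    )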
