Improvement: WikiCorpus class can now receive multiple tokenizing functions, passed as a single function, a list, or a tuple.

Also adds tests checking that the WikiCorpus class can receive multiple tokenizing functions.
fabriciorsf committed Jul 23, 2024
1 parent c964f92 commit 64a3727
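
In short, `tokenizer_func` may now be a single callable or a list/tuple of callables applied in sequence. Below is a minimal sketch of the new behavior, assuming a gensim build with this patch applied; the two tokenizers are hypothetical examples, not part of the commit:

    from gensim.corpora.wikicorpus import process_article

    # Hypothetical tokenizers; each must follow the documented interface:
    # tokenizer_func(text, token_min_len, token_max_len, lower) -> list of str
    def lowercase_tokens(text, token_min_len, token_max_len, lower):
        return text.lower().split()

    def drop_short_tokens(text, token_min_len, token_max_len, lower):
        return [t for t in text.split() if len(t) >= token_min_len]

    # A list (or tuple) of tokenizers is applied in order; passing a single
    # function keeps the old behavior.
    tokens, title, pageid = process_article(
        ("Some '''wiki''' markup text", "Example", 1),
        tokenizer_func=[lowercase_tokens, drop_short_tokens],
    )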
Showing 2 changed files with 1,003 additions and 965 deletions.
25 changes: 14 additions & 11 deletions gensim/corpora/wikicorpus.py
@@ -25,13 +25,11 @@
# LXML isn't faster, so let's go with the built-in solution
from xml.etree.ElementTree import iterparse


from gensim import utils
# cannot import whole gensim.corpora, because that imports wikicorpus...
from gensim.corpora.dictionary import Dictionary
from gensim.corpora.textcorpus import TextCorpus


logger = logging.getLogger(__name__)

ARTICLE_MIN_WORDS = 50
@@ -468,10 +466,10 @@ def process_article(
----------
args : (str, str, int)
Article text, article title, page identifier.
tokenizer_func : function
Function for tokenization (defaults is :func:`~gensim.corpora.wikicorpus.tokenize`).
Needs to have interface:
tokenizer_func(text: str, token_min_len: int, token_max_len: int, lower: bool) -> list of str.
tokenizer_func : function or list/tuple of functions, optional
Function(s) used for tokenization (default is :func:`~gensim.corpora.wikicorpus.tokenize`).
Each function needs to have the interface:
`tokenizer_func(text: str, token_min_len: int, token_max_len: int, lower: bool) -> list of str`.
token_min_len : int
Minimal token length.
token_max_len : int
@@ -487,7 +485,11 @@ def process_article(
"""
text, title, pageid = args
text = filter_wiki(text)
result = tokenizer_func(text, token_min_len, token_max_len, lower)
tokenizers = [] if (tokenizer_func is None) \
else (list(tokenizer_func) if isinstance(tokenizer_func, (list, tuple)) else [tokenizer_func])
for tokenizer in tokenizers:
text = " ".join(tokenizer(text, token_min_len, token_max_len, lower))
result = text.split()
return result, title, pageid
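
For clarity, the condensed expression above normalizes `tokenizer_func` to a list (None becomes an empty list, a single callable becomes a one-element list, a list/tuple is copied) and then pipes the text through each tokenizer, space-joining between stages. An equivalent standalone sketch follows; the name `apply_tokenizers` is ours, not part of the patch:

    def apply_tokenizers(text, tokenizer_func, token_min_len, token_max_len, lower):
        # Normalize to a list of tokenizers, mirroring the patched code:
        # None -> [], list/tuple -> list copy, single callable -> [callable].
        if tokenizer_func is None:
            tokenizers = []
        elif isinstance(tokenizer_func, (list, tuple)):
            tokenizers = list(tokenizer_func)
        else:
            tokenizers = [tokenizer_func]
        # Each tokenizer receives the space-joined output of the previous one.
        for tokenizer in tokenizers:
            text = " ".join(tokenizer(text, token_min_len, token_max_len, lower))
        # With an empty tokenizer list, this degenerates to whitespace splitting.
        return text.split()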


@@ -569,6 +571,7 @@ class WikiCorpus(TextCorpus):
>>> MmCorpus.serialize(corpus_path, wiki) # another 8h, creates a file in MatrixMarket format and mapping
"""

def __init__(
self, fname, processes=None, lemmatize=None, dictionary=None, metadata=False,
filter_namespaces=('0',), tokenizer_func=tokenize, article_min_tokens=ARTICLE_MIN_WORDS,
@@ -590,10 +593,10 @@ def __init__(
**IMPORTANT: this needs a really long time**.
filter_namespaces : tuple of str, optional
Namespaces to consider.
tokenizer_func : function, optional
Function that will be used for tokenization. By default, use :func:`~gensim.corpora.wikicorpus.tokenize`.
If you inject your own tokenizer, it must conform to this interface:
`tokenizer_func(text: str, token_min_len: int, token_max_len: int, lower: bool) -> list of str`
tokenizer_func : function or list/tuple of functions, optional
Function(s) used for tokenization (default is :func:`~gensim.corpora.wikicorpus.tokenize`).
Each function needs to have the interface:
`tokenizer_func(text: str, token_min_len: int, token_max_len: int, lower: bool) -> list of str`.
article_min_tokens : int, optional
Minimum tokens in article. Article will be ignored if number of tokens is less.
token_min_len : int, optional
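
A usage sketch of the new parameter on `WikiCorpus`, under the same assumptions as above: the dump path is a placeholder, and `drop_numeric` is a hypothetical second tokenizer.

    from gensim.corpora import WikiCorpus
    from gensim.corpora.wikicorpus import tokenize

    # Hypothetical extra tokenizer conforming to the documented interface.
    def drop_numeric(text, token_min_len, token_max_len, lower):
        return [t for t in text.split() if not t.isdigit()]

    # The built-in tokenize runs first; drop_numeric then filters its output.
    # Note: constructing the corpus scans the whole dump, which is expensive.
    wiki = WikiCorpus(
        'enwiki-latest-pages-articles.xml.bz2',  # placeholder dump path
        tokenizer_func=[tokenize, drop_numeric],
    )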
