Merge pull request #296 from artefactory/fix/tokenizer-when-no-tokens
fix: tokenizer when no tokens
julesbertrand authored Oct 26, 2023
2 parents d428d73 + 7bc7bb0 commit 9a66205
Showing 4 changed files with 2,429 additions and 2,237 deletions.
Makefile (2 changes: 1 addition & 1 deletion)

@@ -83,7 +83,7 @@ endif
 
 .PHONY: download-poetry
 download-poetry:
-    curl -sSL https://raw.githubusercontent.com/python-poetry/poetry/master/install-poetry.py | python
+    curl -sSL https://install.python-poetry.org | python3 -
 
 .PHONY: install
 install:
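A note on the Makefile change bundled into this fix: the legacy install-poetry.py script fetched from the python-poetry repository's raw GitHub URL was deprecated upstream, and https://install.python-poetry.org is the installer endpoint that Poetry's documentation recommends, invoked with python3 as in the documented one-liner.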
nlpretext/token/tokenizer.py (15 changes: 9 additions & 6 deletions)

@@ -27,6 +27,7 @@
 from sacremoses import MosesDetokenizer, MosesTokenizer
 
 MODEL_REGEX = re.compile(r"^[a-z]{2}_(?:core|dep|ent|sent)_(?:web|news|wiki|ud)_(?:sm|md|lg|trf)$")
+SUPPORTED_LANG_MODULES = {"en_spacy", "en_nltk", "fr_spacy", "fr_moses", "ko_spacy", "ja_spacy"}
 
 
 class LanguageNotHandled(Exception):
@@ -120,6 +121,12 @@ def tokenize(text: str, lang_module: str = "en_spacy") -> List[str]:
     ValueError
         If lang_module is not a valid module name
     """
+    if lang_module not in SUPPORTED_LANG_MODULES:
+        raise ValueError(
+            f"Invalid lang_module: {lang_module}. "
+            f"lang_module must be one of {SUPPORTED_LANG_MODULES}."
+        )
+
     tokenized_words: List[str] = []
     if "spacy" in lang_module:
         lang = lang_module.split("_")[0]
@@ -131,12 +138,8 @@ def tokenize(text: str, lang_module: str = "en_spacy") -> List[str]:
         tokenized_words = nltk.word_tokenize(text)
     if lang_module == "fr_moses":
         tokenized_words = MosesTokenizer(lang="fr").tokenize(text, escape=False)
-    if tokenized_words:
-        return tokenized_words
-    raise ValueError(
-        "Please pass a lang_module in list of values "
-        "{'en_spacy', 'en_nltk', 'fr_spacy', 'fr_moses', 'ko_spacy', 'ja_spacy'}"
-    )
+
+    return tokenized_words
 
 
 def untokenize(tokens: List[str], lang: str = "fr") -> str:
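To illustrate the fix, a minimal sketch of the new behavior (assuming nlpretext is installed and the spaCy model behind "en_spacy" is available; the sample strings are invented):

    from nlpretext.token.tokenizer import tokenize

    # Valid lang_module, but the text yields no tokens: the old code treated
    # an empty result as proof of a bad lang_module and raised ValueError;
    # it now simply returns the empty list.
    print(tokenize("", lang_module="en_spacy"))  # -> []

    # Unknown lang_module: rejected up front by the new guard clause,
    # before any tokenizer is loaded.
    try:
        tokenize("Bonjour le monde", lang_module="de_spacy")
    except ValueError as err:
        print(err)  # Invalid lang_module: de_spacy. lang_module must be one of {...}

The guard clause also means an empty input and a misspelled lang_module are no longer reported through the same error path.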
