artefactory · hugovasselin · Jul 28, 2021 · Jul 28, 2021
diff --git a/nlpretext/token/tokenizer.py b/nlpretext/token/tokenizer.py
@@ -15,22 +15,27 @@
 # You should have received a copy of the GNU Lesser General Public License
 # along with this program; if not, write to the Free Software Foundation,
 # Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
-from typing import List, Union
+from typing import Any, List, Union
 import nltk
 from sacremoses import MosesTokenizer, MosesDetokenizer
 import spacy
 
 class LanguageNotHandled(Exception):
     pass
 
+
+class LanguageNotInstalledError(Exception):
+    """Raised when the spaCy model needed for a language is not installed."""
+
+
 class SpacyModel:
     class SingletonSpacyModel:
-        def __init__(self, lang):
+        def __init__(self, lang: str) -> None:
             self.lang = lang
             if lang == 'en':
-                self.model = spacy.load('en_core_web_sm')
+                self.model = _load_spacy_model('en_core_web_sm')
             elif lang == 'fr':
-                self.model = spacy.load('fr_core_news_sm')
+                self.model = _load_spacy_model('fr_core_news_sm')
             elif lang == 'ko':
                 self.model = spacy.blank('ko')
             elif lang == 'ja':
@@ -48,6 +53,18 @@ def get_lang_model(self):
         return self.model.lang
 
 
+def _load_spacy_model(model: str) -> Any:
+    """Return spacy.load(model); raise LanguageNotInstalledError if it raises OSError."""
+    try:
+        return spacy.load(model)
+    except OSError as err:
+        # Chain explicitly so spaCy's original error stays attached as __cause__.
+        raise LanguageNotInstalledError(
+            f'Model {model} is not installed. '
+            f'To install, run: python -m spacy download {model}'
+        ) from err
+
+
 def _get_spacy_tokenizer(lang: str):
     """
     Function that gets the right tokenizer given the language

diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
@@ -0,0 +1,22 @@
+import pytest
+import spacy
+from nlpretext.token.tokenizer import SpacyModel, LanguageNotInstalledError
+
+@pytest.mark.parametrize(
+    "fake_input, expected_model_in_message",
+    [
+        ("en", "en_core_web_sm"),
+        ("fr", "fr_core_news_sm")
+    ]
+)
+def test_get_spacy_tokenizer_when_model_not_downloaded(monkeypatch, fake_input, expected_model_in_message):
+    """With spacy.load failing with OSError, LanguageNotInstalledError must name the model."""
+    def mock_spacy_load(name):
+        raise OSError(
+            f"[E050] Can't find model '{name}'. It doesn't seem to be ..."
+        )
+
+    monkeypatch.setattr(spacy, "load", mock_spacy_load)
+    with pytest.raises(LanguageNotInstalledError) as e:
+        SpacyModel.SingletonSpacyModel(fake_input)
+    assert expected_model_in_message in str(e.value)