diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml
index aa9bafc..0033e13 100755
--- a/.github/workflows/cd.yml
+++ b/.github/workflows/cd.yml
@@ -37,16 +37,16 @@ jobs:
           file: ./docker/Dockerfile
           push: true
           tags: |
-            ghcr.io/artefactory/NLPretext:${{ steps.tag.outputs.tag_name }}
-            ghcr.io/artefactory/NLPretext:latest
-          cache-from: type=registry,ref=ghcr.io/artefactory/NLPretext:latest
+            ghcr.io/artefactory/nlpretext:${{ steps.tag.outputs.tag_name }}
+            ghcr.io/artefactory/nlpretext:latest
+          cache-from: type=registry,ref=ghcr.io/artefactory/nlpretext:latest
           cache-to: type=inline
 
       - name: Scan image
         uses: anchore/scan-action@v2
         id: scan
         with:
-          image: "ghcr.io/artefactory/NLPretext:${{ steps.tag.outputs.tag_name }}"
+          image: "ghcr.io/artefactory/nlpretext:${{ steps.tag.outputs.tag_name }}"
           acs-report-enable: true
       - name: upload Anchore scan SARIF report
         uses: github/codeql-action/upload-sarif@v1
@@ -87,7 +87,7 @@ jobs:
 
       - name: Install dependencies
         run: |
-          poetry install
+          poetry install -E torch
 
       - name: Publish to PyPI
         env:
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index e629287..5f807fb 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -67,7 +67,7 @@ jobs:
       - name: Install dependencies
         run: |
           poetry run pip install --upgrade pip
-          poetry install -E spacy-tokenizer -E torch
+          poetry install -E torch
 
       - name: Run safety checks
         run: |
diff --git a/README.md b/README.md
index d49b8ca..ba0e1dd 100644
--- a/README.md
+++ b/README.md
@@ -71,15 +71,6 @@ or with `Poetry`
 poetry add nlpretext
 ```
 
-This library uses Spacy as tokenizer. Current models supported are `en_core_web_sm` and `fr_core_news_sm`. If not installed, run the following commands:
-```bash
-pip install nlpretext[spacy-tokenizer]
-```
-
-```bash
-poetry add nlpretext -E spacy-tokenizer
-```
-
 
 # Usage
 
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 4689bb2..49680da 100755
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -17,9 +17,6 @@ To install this library you should first clone the repository:
 
    pip install nlpretext
 
-This library uses Spacy as tokenizer. Current models supported are `en_core_web_sm` and `fr_core_news_sm`. If not installed, run the following commands:
-
-pip install nlpretext[spacy-tokenizer]
 
 .. toctree::
    :maxdepth: 4
diff --git a/nlpretext/token/tokenizer.py b/nlpretext/token/tokenizer.py
index 664479a..3b2d360 100644
--- a/nlpretext/token/tokenizer.py
+++ b/nlpretext/token/tokenizer.py
@@ -17,10 +17,15 @@
 # Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 from typing import Any, List, Optional, Union
 
+import os
+import re
+
 import nltk
 import spacy
 from sacremoses import MosesDetokenizer, MosesTokenizer
 
+MODEL_REGEX = re.compile(r"^[a-z]{2}_(?:core|dep|ent|sent)_(?:web|news|wiki|ud)_(?:sm|md|lg|trf)$")
+
 
 class LanguageNotHandled(Exception):
     pass
@@ -62,9 +67,14 @@ def _load_spacy_model(model: str) -> Any:
     try:
         return spacy.load(model)
     except OSError:
-        raise LanguageNotInstalledError(
-            f"Model {model} is not installed. " f"To install, run: python -m spacy download {model}"
-        )
+        if MODEL_REGEX.match(model):
+            os.system(f"python -m spacy download {model}")  # nosec
+            return spacy.load(model)
+        else:
+            raise LanguageNotInstalledError(
+                f"Model {model} is not installed. "
+                f"To install, run: python -m spacy download {model}"
+            )
 
 
 def _get_spacy_tokenizer(lang: str) -> Optional[spacy.tokenizer.Tokenizer]:
diff --git a/poetry.lock b/poetry.lock
index 0f2b2ae..578a1ce 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -455,20 +455,6 @@ python-versions = "*"
 [package.extras]
 dev = ["pytest", "coverage", "coveralls"]
 
-[[package]]
-name = "en-core-web-sm"
-version = "3.1.0"
-description = "English pipeline optimized for CPU. Components: tok2vec, tagger, parser, senter, ner, attribute_ruler, lemmatizer."
-category = "main"
-optional = true
-python-versions = "*"
-
-[package.dependencies]
-spacy = ">=3.1.0,<3.2.0"
-
-[package.source]
-type = "url"
-url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.1.0/en_core_web_sm-3.1.0.tar.gz"
-
 [[package]]
 name = "entrypoints"
 version = "0.3"
@@ -511,20 +497,6 @@ category = "main"
 optional = false
 python-versions = "*"
 
-[[package]]
-name = "fr-core-news-sm"
-version = "3.1.0"
-description = "French pipeline optimized for CPU. Components: tok2vec, morphologizer, parser, senter, ner, attribute_ruler, lemmatizer."
-category = "main"
-optional = true
-python-versions = "*"
-
-[package.dependencies]
-spacy = ">=3.1.0,<3.2.0"
-
-[package.source]
-type = "url"
-url = "https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.1.0/fr_core_news_sm-3.1.0.tar.gz"
-
 [[package]]
 name = "fsspec"
 version = "2021.8.1"
@@ -2306,13 +2278,12 @@ docs = ["sphinx", "jaraco.packaging (>=8.2)", "rst.linker (>=1.9)"]
 testing = ["pytest (>=4.6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-cov", "pytest-enabler (>=1.0.1)", "jaraco.itertools", "func-timeout", "pytest-black (>=0.3.7)", "pytest-mypy"]
 
 [extras]
-spacy-tokenizer = ["fr-core-news-sm", "en-core-web-sm"]
 torch = ["torch"]
 
 [metadata]
 lock-version = "1.1"
 python-versions = ">=3.7,<4.0"
-content-hash = "e6d3b593fa9088783ee71b3c7be7886652c62b7159fbaf0002cf653e6d9fab70"
+content-hash = "6612c0667541f6a9688a7de0251fb3bb9c8c05e170e535b1d663146c3f108aea"
 
 [metadata.files]
 alabaster = [
@@ -2652,7 +2623,6 @@ dparse = [
 emoji = [
     {file = "emoji-1.4.2.tar.gz", hash = "sha256:21257f311e24468031e85685867c00b87249dc7612b82dc763a771ba5fb00c01"},
 ]
-en-core-web-sm = []
 entrypoints = [
     {file = "entrypoints-0.3-py2.py3-none-any.whl", hash = "sha256:589f874b313739ad35be6e0cd7efde2a4e9b6fea91edcc34e58ecbb8dbe56d19"},
     {file = "entrypoints-0.3.tar.gz", hash = "sha256:c70dd71abe5a8c85e55e12c19bd91ccfeec11a6e99044204511f9ed547d48451"},
@@ -2691,7 +2661,6 @@ filelock = [
 flashtext = [
     {file = "flashtext-2.7.tar.gz", hash = "sha256:a1be2b93e09d4f0deee4aad72b91a7127b61fb8b8034ca9a9c78ea745d8b05cf"},
 ]
-fr-core-news-sm = []
 fsspec = [
     {file = "fsspec-2021.8.1-py3-none-any.whl", hash = "sha256:30f27c059a414d1f434b14b2d0e75c8d9c3dd473ad8daeccb444d9d4069b9f03"},
     {file = "fsspec-2021.8.1.tar.gz", hash = "sha256:af125917788b77782899bbd4484d29e5407f53d2bb04cdfa025fe4931201a555"},
diff --git a/pyproject.toml b/pyproject.toml
index 458f6c3..257870c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -58,8 +58,6 @@ fastparquet = ">=0.4.1"
 dask = {version = ">=2021.5.0", extras = ["complete"]}
 distributed = ">=2021.5.0"
 tornado = ">=6.0.3"
-fr-core-news-sm = {url = "https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.1.0/fr_core_news_sm-3.1.0.tar.gz", optional = true}
-en-core-web-sm = {url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.1.0/en_core_web_sm-3.1.0.tar.gz", optional = true}
 torch = {version = "^1.9.0", optional = true}
 
 [tool.poetry.dev-dependencies]
@@ -99,7 +97,6 @@ types-chardet = ">=0.1.3"
 types-click = ">=7.1.2"
 
 [tool.poetry.extras]
-spacy-tokenizer = ["fr-core-news-sm", "en-core-web-sm"]
 torch = ["torch"]
 
 [tool.black]
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
deleted file mode 100644
index c1b9135..0000000
--- a/tests/test_tokenizer.py
+++ /dev/null
@@ -1,18 +0,0 @@
-import pytest
-import spacy
-from nlpretext.token.tokenizer import LanguageNotInstalledError, SpacyModel
-
-
-@pytest.mark.parametrize(
-    "fake_input, expected_model_in_message", [("en", "en_core_web_sm"), ("fr", "fr_core_news_sm")]
-)
-def test_get_spacy_tokenizer_when_model_not_downloaded(
-    monkeypatch, fake_input, expected_model_in_message
-):
-    def mock_spacy_load(lang):
-        raise OSError("[E050] Can't find model 'en_core_web_sm'. It doesn't seem to be ...")
-
-    monkeypatch.setattr(spacy, "load", mock_spacy_load)
-    with pytest.raises(LanguageNotInstalledError) as e:
-        SpacyModel.SingletonSpacyModel(fake_input)
-    assert expected_model_in_message in str(e.value)
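
The tokenizer.py hunk above replaces the hard failure on a missing spaCy model with an on-demand download, gated by MODEL_REGEX so that only well-formed spaCy model names are ever passed to the shell. The sketch below reproduces that fallback outside the library for illustration only: the function name load_spacy_model and the RuntimeError are placeholders (NLPretext itself raises LanguageNotInstalledError), and it assumes spacy is installed.

import os
import re

import spacy

# Same pattern as the MODEL_REGEX added in nlpretext/token/tokenizer.py.
MODEL_REGEX = re.compile(r"^[a-z]{2}_(?:core|dep|ent|sent)_(?:web|news|wiki|ud)_(?:sm|md|lg|trf)$")


def load_spacy_model(model: str):
    """Load a spaCy model, downloading it on first use if the name looks valid."""
    try:
        return spacy.load(model)
    except OSError:
        if MODEL_REGEX.match(model):
            # Mirrors the patch: shell out to the spaCy CLI, then retry the load.
            os.system(f"python -m spacy download {model}")  # nosec
            return spacy.load(model)
        raise RuntimeError(
            f"Model {model} is not installed. To install, run: python -m spacy download {model}"
        )


if __name__ == "__main__":
    nlp = load_spacy_model("en_core_web_sm")
    print([token.text for token in nlp("NLPretext fetches missing spaCy models on first use.")])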