From f193ded35a094b6277c87de70b6afcae6d5fb395 Mon Sep 17 00:00:00 2001 From: TF Tang Date: Thu, 5 May 2022 00:32:52 +0200 Subject: [PATCH] Minor tweaks, added entry points, added helper download staticmethod --- scrubadub_stanford/detectors/stanford.py | 2 +- scrubadub_stanford/detectors/stanza.py | 14 +++++++++++--- scrubadub_stanford/detectors/utils/utils.py | 2 +- setup.cfg | 2 ++ setup.py | 5 +---- tests/run.py | 2 +- tox.ini | 2 +- 7 files changed, 18 insertions(+), 11 deletions(-) diff --git a/scrubadub_stanford/detectors/stanford.py b/scrubadub_stanford/detectors/stanford.py index 7f3884e..2cf595c 100644 --- a/scrubadub_stanford/detectors/stanford.py +++ b/scrubadub_stanford/detectors/stanford.py @@ -15,7 +15,7 @@ except ImportError: raise ImportError( 'To use scrubadub.detectors.stanford extra dependencies need to be installed.\n' - 'Please run: pip install scrubadub[stanford]' + 'Please run: pip install scrubadub-stanford' ) from typing import Dict, Type, Optional diff --git a/scrubadub_stanford/detectors/stanza.py b/scrubadub_stanford/detectors/stanza.py index c328b9f..c3969cf 100644 --- a/scrubadub_stanford/detectors/stanza.py +++ b/scrubadub_stanford/detectors/stanza.py @@ -31,6 +31,9 @@ os.path.join(HOME_DIR, 'stanza_resources') ) +# Processors to be used in annotation Pipeline: +PROCESSORS = ['tokenize', 'ner'] + class StanzaEntityDetector(Detector): """Search for people's names, organization's names and locations within text using the stanford 3 class model. @@ -99,6 +102,12 @@ def _check_downloaded(directory: str = DEFAULT_STANZA_DIR) -> bool: return True return False + @staticmethod + def _download() -> Pipeline: + """Helper method to trigger downloading of Stanza's resources upon first init of Pipeline""" + pipeline = Pipeline(processors=PROCESSORS) + return pipeline + def iter_filth(self, text: str, document_name: Optional[str] = None): """Yields discovered filth in the provided ``text``. 
@@ -109,10 +118,9 @@ def iter_filth(self, text: str, document_name: Optional[str] = None): :return: An iterator to the discovered :class:`Filth` :rtype: Iterator[:class:`Filth`] """ - processors = ['tokenize', 'ner'] if not self._check_downloaded(): - pipeline = Pipeline(processors=processors) - pipeline = Pipeline(processors=processors, download_method=None) + pipeline = self._download() + pipeline = Pipeline(processors=PROCESSORS, download_method=None) doc = pipeline(text) # List of tuples of text/type for each entity in document tags = [(ent.text, ent.type) for ent in doc.ents] diff --git a/scrubadub_stanford/detectors/utils/utils.py b/scrubadub_stanford/detectors/utils/utils.py index 8436cff..7cb71d7 100644 --- a/scrubadub_stanford/detectors/utils/utils.py +++ b/scrubadub_stanford/detectors/utils/utils.py @@ -1,5 +1,5 @@ """ -Helper function for iterating through annotated list of entities done by Stanford NER models" +Helper function for iterating through list of entities annotated by Stanford NER models """ from typing import Dict, Type, List, Tuple, Optional import re diff --git a/setup.cfg b/setup.cfg index 0f7c31e..601f881 100644 --- a/setup.cfg +++ b/setup.cfg @@ -19,3 +19,5 @@ cover-package=scrubadub_stanford [options.entry_points] scrubadub_detectors = stanford = scrubadub_stanford.detectors:StanfordEntityDetector + corenlp = scrubadub_stanford.detectors:CoreNlpEntityDetector + stanza = scrubadub_stanford.detectors:StanzaEntityDetector \ No newline at end of file diff --git a/setup.py b/setup.py index e87ec7c..a41f938 100644 --- a/setup.py +++ b/setup.py @@ -47,12 +47,9 @@ def get_package_list(location): 'Development Status :: 5 - Production/Stable', 'License :: OSI Approved :: Apache Software License', 'Natural Language :: English', - 'Programming Language :: Python', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.5', - 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', 'Programming 
Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9', 'Topic :: Software Development :: Libraries', 'Topic :: Scientific/Engineering :: Information Analysis', 'Topic :: Text Processing', diff --git a/tests/run.py b/tests/run.py index 682e18e..dea0b9a 100755 --- a/tests/run.py +++ b/tests/run.py @@ -10,7 +10,7 @@ "mypy --config-file setup.cfg scrubadub_stanford/", "flake8 --config setup.cfg scrubadub_stanford/", # If py3.5 then examples with spacy don't work so disable doctests - 'if python3 --version | grep -Evq "Python (3\\.5\\.)" ; then nosetests --with-doctest --doctest-extension=rst ./tests/ ./scrubadub_stanford/ ; else nosetests ; fi', + 'if python --version | grep -Evq "Python (3\\.5\\.)" ; then nosetests --with-doctest --doctest-extension=rst ./tests/ ./scrubadub_stanford/ ; else nosetests ; fi', ] diff --git a/tox.ini b/tox.ini index 9690e6e..4dd784d 100644 --- a/tox.ini +++ b/tox.ini @@ -14,5 +14,5 @@ setenv = commands = pip install --upgrade pip wheel setuptools pip install -r requirements/python-dev - python3 -c "import nltk; nltk.download('punkt')" + python -c "import nltk; nltk.download('punkt')" python tests/run.py