From f193ded35a094b6277c87de70b6afcae6d5fb395 Mon Sep 17 00:00:00 2001
From: TF Tang <tftang@Ms-MacBook-Pro.local>
Date: Thu, 5 May 2022 00:32:52 +0200
Subject: [PATCH] Minor tweaks, added entry points, added download helper
 staticmethod

---
 scrubadub_stanford/detectors/stanford.py    |  2 +-
 scrubadub_stanford/detectors/stanza.py      | 14 +++++++++++---
 scrubadub_stanford/detectors/utils/utils.py |  2 +-
 setup.cfg                                   |  2 ++
 setup.py                                    |  5 +----
 tests/run.py                                |  2 +-
 tox.ini                                     |  2 +-
 7 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/scrubadub_stanford/detectors/stanford.py b/scrubadub_stanford/detectors/stanford.py
index 7f3884e..2cf595c 100644
--- a/scrubadub_stanford/detectors/stanford.py
+++ b/scrubadub_stanford/detectors/stanford.py
@@ -15,7 +15,7 @@
 except ImportError:
     raise ImportError(
         'To use scrubadub.detectors.stanford extra dependencies need to be installed.\n'
-        'Please run: pip install scrubadub[stanford]'
+        'Please run: pip install scrubadub-stanford'
     )
 
 from typing import Dict, Type, Optional
diff --git a/scrubadub_stanford/detectors/stanza.py b/scrubadub_stanford/detectors/stanza.py
index c328b9f..c3969cf 100644
--- a/scrubadub_stanford/detectors/stanza.py
+++ b/scrubadub_stanford/detectors/stanza.py
@@ -31,6 +31,9 @@
     os.path.join(HOME_DIR, 'stanza_resources')
 )
 
+# Processors to be used in annotation Pipeline:
+PROCESSORS = ['tokenize', 'ner']
+
 
 class StanzaEntityDetector(Detector):
     """Search for people's names, organization's names and locations within text using the stanford 3 class model.
@@ -99,6 +102,12 @@ def _check_downloaded(directory: str = DEFAULT_STANZA_DIR) -> bool:
             return True
         return False
 
+    @staticmethod
+    def _download() -> Pipeline:
+        """Return a Pipeline whose first init triggers the download of Stanza's resources."""
+        pipeline = Pipeline(processors=PROCESSORS)
+        return pipeline
+
     def iter_filth(self, text: str, document_name: Optional[str] = None):
         """Yields discovered filth in the provided ``text``.
 
@@ -109,10 +118,9 @@ def iter_filth(self, text: str, document_name: Optional[str] = None):
         :return: An iterator to the discovered :class:`Filth`
         :rtype: Iterator[:class:`Filth`]
         """
-        processors = ['tokenize', 'ner']
         if not self._check_downloaded():
-            pipeline = Pipeline(processors=processors)
-        pipeline = Pipeline(processors=processors, download_method=None)
+            pipeline = self._download()
+        pipeline = Pipeline(processors=PROCESSORS, download_method=None)
         doc = pipeline(text)
         # List of tuples of text/type for each entity in document
         tags = [(ent.text, ent.type) for ent in doc.ents]
diff --git a/scrubadub_stanford/detectors/utils/utils.py b/scrubadub_stanford/detectors/utils/utils.py
index 8436cff..7cb71d7 100644
--- a/scrubadub_stanford/detectors/utils/utils.py
+++ b/scrubadub_stanford/detectors/utils/utils.py
@@ -1,5 +1,5 @@
 """
-Helper function for iterating through annotated list of entities done by Stanford NER models"
+Helper function for iterating through a list of entities annotated by Stanford NER models
 """
 from typing import Dict, Type, List, Tuple, Optional
 import re
diff --git a/setup.cfg b/setup.cfg
index 0f7c31e..601f881 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -19,3 +19,5 @@ cover-package=scrubadub_stanford
 [options.entry_points]
 scrubadub_detectors =
     stanford = scrubadub_stanford.detectors:StanfordEntityDetector
+    corenlp = scrubadub_stanford.detectors:CoreNlpEntityDetector
+    stanza = scrubadub_stanford.detectors:StanzaEntityDetector
\ No newline at end of file
diff --git a/setup.py b/setup.py
index e87ec7c..a41f938 100644
--- a/setup.py
+++ b/setup.py
@@ -47,12 +47,9 @@ def get_package_list(location):
         'Development Status :: 5 - Production/Stable',
         'License :: OSI Approved :: Apache Software License',
         'Natural Language :: English',
-        'Programming Language :: Python',
-        'Programming Language :: Python :: 3',
-        'Programming Language :: Python :: 3.5',
-        'Programming Language :: Python :: 3.6',
         'Programming Language :: Python :: 3.7',
         'Programming Language :: Python :: 3.8',
+        'Programming Language :: Python :: 3.9',
         'Topic :: Software Development :: Libraries',
         'Topic :: Scientific/Engineering :: Information Analysis',
         'Topic :: Text Processing',
diff --git a/tests/run.py b/tests/run.py
index 682e18e..dea0b9a 100755
--- a/tests/run.py
+++ b/tests/run.py
@@ -10,7 +10,7 @@
     "mypy --config-file setup.cfg scrubadub_stanford/",
     "flake8  --config setup.cfg scrubadub_stanford/",
     # If py3.5 then examples with spacy don't work so disable doctests
-    'if python3 --version | grep -Evq "Python (3\\.5\\.)" ; then nosetests --with-doctest --doctest-extension=rst ./tests/ ./scrubadub_stanford/ ; else nosetests ; fi',
+    'if python --version | grep -Evq "Python (3\\.5\\.)" ; then nosetests --with-doctest --doctest-extension=rst ./tests/ ./scrubadub_stanford/ ; else nosetests ; fi',
 ]
 
 
diff --git a/tox.ini b/tox.ini
index 9690e6e..4dd784d 100644
--- a/tox.ini
+++ b/tox.ini
@@ -14,5 +14,5 @@ setenv =
 commands =
     pip install --upgrade pip wheel setuptools
     pip install -r requirements/python-dev
-    python3 -c "import nltk; nltk.download('punkt')"
+    python -c "import nltk; nltk.download('punkt')"
     python tests/run.py