Skip to content

Commit

Permalink
Minor tweaks, added endpoints, added helper download staticmethod
Browse files Browse the repository at this point in the history
  • Loading branch information
TF Tang authored and TF Tang committed May 4, 2022
1 parent d9a0f8a commit f193ded
Show file tree
Hide file tree
Showing 7 changed files with 18 additions and 11 deletions.
2 changes: 1 addition & 1 deletion scrubadub_stanford/detectors/stanford.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
except ImportError:
raise ImportError(
'To use scrubadub.detectors.stanford extra dependencies need to be installed.\n'
'Please run: pip install scrubadub[stanford]'
'Please run: pip install scrubadub-stanford'
)

from typing import Dict, Type, Optional
Expand Down
14 changes: 11 additions & 3 deletions scrubadub_stanford/detectors/stanza.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@
os.path.join(HOME_DIR, 'stanza_resources')
)

# Stanza annotation Pipeline stages: tokenization first, then named-entity recognition.
PROCESSORS = ["tokenize", "ner"]


class StanzaEntityDetector(Detector):
"""Search for people's names, organization's names and locations within text using the stanford 3 class model.
Expand Down Expand Up @@ -99,6 +102,12 @@ def _check_downloaded(directory: str = DEFAULT_STANZA_DIR) -> bool:
return True
return False

@staticmethod
def _download() -> Pipeline:
    """Trigger the download of Stanza's resources.

    Constructing a ``Pipeline`` for the first time causes Stanza to fetch
    the model files it needs, so simply building and returning one is
    enough to ensure the resources are present.

    :return: A newly constructed Stanza ``Pipeline`` using :data:`PROCESSORS`.
    :rtype: Pipeline
    """
    return Pipeline(processors=PROCESSORS)

def iter_filth(self, text: str, document_name: Optional[str] = None):
"""Yields discovered filth in the provided ``text``.
Expand All @@ -109,10 +118,9 @@ def iter_filth(self, text: str, document_name: Optional[str] = None):
:return: An iterator to the discovered :class:`Filth`
:rtype: Iterator[:class:`Filth`]
"""
processors = ['tokenize', 'ner']
if not self._check_downloaded():
pipeline = Pipeline(processors=processors)
pipeline = Pipeline(processors=processors, download_method=None)
pipeline = self._download()
pipeline = Pipeline(processors=PROCESSORS, download_method=None)
doc = pipeline(text)
# List of tuples of text/type for each entity in document
tags = [(ent.text, ent.type) for ent in doc.ents]
Expand Down
2 changes: 1 addition & 1 deletion scrubadub_stanford/detectors/utils/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
Helper function for iterating through annotated list of entities done by Stanford NER models"
Helper function for iterating through list of entities annotated by Stanford NER models
"""
from typing import Dict, Type, List, Tuple, Optional
import re
Expand Down
2 changes: 2 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,5 @@ cover-package=scrubadub_stanford
[options.entry_points]
scrubadub_detectors =
stanford = scrubadub_stanford.detectors:StanfordEntityDetector
corenlp = scrubadub_stanford.detectors:CoreNlpEntityDetector
stanza = scrubadub_stanford.detectors:StanzaEntityDetector
5 changes: 1 addition & 4 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,12 +47,9 @@ def get_package_list(location):
'Development Status :: 5 - Production/Stable',
'License :: OSI Approved :: Apache Software License',
'Natural Language :: English',
'Programming Language :: Python',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
'Programming Language :: Python :: 3.9',
'Topic :: Software Development :: Libraries',
'Topic :: Scientific/Engineering :: Information Analysis',
'Topic :: Text Processing',
Expand Down
2 changes: 1 addition & 1 deletion tests/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
"mypy --config-file setup.cfg scrubadub_stanford/",
"flake8 --config setup.cfg scrubadub_stanford/",
# If py3.5 then examples with spacy don't work so disable doctests
'if python3 --version | grep -Evq "Python (3\\.5\\.)" ; then nosetests --with-doctest --doctest-extension=rst ./tests/ ./scrubadub_stanford/ ; else nosetests ; fi',
'if python --version | grep -Evq "Python (3\\.5\\.)" ; then nosetests --with-doctest --doctest-extension=rst ./tests/ ./scrubadub_stanford/ ; else nosetests ; fi',
]


Expand Down
2 changes: 1 addition & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,5 @@ setenv =
commands =
pip install --upgrade pip wheel setuptools
pip install -r requirements/python-dev
python3 -c "import nltk; nltk.download('punkt')"
python -c "import nltk; nltk.download('punkt')"
python tests/run.py

0 comments on commit f193ded

Please sign in to comment.