-
Notifications
You must be signed in to change notification settings - Fork 95
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Move to RegionLocalisedRegexDetector to reduce the code needed in detectors
- Loading branch information
Thomas Bird
committed
Sep 27, 2021
1 parent
46bd7e2
commit 82a7493
Showing 3 changed files with 26 additions and 85 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,34 +1,22 @@ | ||
import re | ||
|
||
from scrubadub.detectors.catalogue import register_detector | ||
from scrubadub.detectors.base import RegexDetector | ||
from scrubadub.detectors.base import RegionLocalisedRegexDetector | ||
from scrubadub.filth import NationalInsuranceNumberFilth | ||
|
||
|
||
@register_detector | ||
class NationalInsuranceNumberDetector(RegexDetector): | ||
"""Use regular expressions to remove the UK National Insurance number (NINO), | ||
class NationalInsuranceNumberDetector(RegionLocalisedRegexDetector): | ||
"""Use regular expressions to remove the GB National Insurance number (NINO), | ||
Simple pattern matching, no checksum solution. | ||
""" | ||
|
||
name = 'national_insurance_number' | ||
autoload = True | ||
filth_cls = NationalInsuranceNumberFilth | ||
# this regex is looking for NINO that does not begin with certain letters | ||
regex = re.compile( | ||
r'(?!BG)(?!GB)(?!NK)(?!KN)(?!TN)(?!NT)(?!ZZ)(?:[A-CEGHJ-PR-TW-Z][A-CEGHJ-NPR-TW-Z])(?:\s*\d\s*){6}[A-D]', | ||
re.IGNORECASE | re.VERBOSE | ||
) | ||
|
||
@classmethod | ||
def supported_locale(cls, locale: str) -> bool: | ||
"""Returns true if this ``Detector`` supports the given locale. | ||
:param locale: The locale of the documents in the format: 2 letter lower-case language code followed by an | ||
underscore and the two letter upper-case country code, eg "en_GB" or "de_CH". | ||
:type locale: str | ||
:return: ``True`` if the locale is supported, otherwise ``False`` | ||
:rtype: bool | ||
""" | ||
language, region = cls.locale_split(locale) | ||
return region in ['GB'] | ||
region_regex = { | ||
'GB': re.compile( | ||
r'(?!BG)(?!GB)(?!NK)(?!KN)(?!TN)(?!NT)(?!ZZ)(?:[A-CEGHJ-PR-TW-Z][A-CEGHJ-NPR-TW-Z])(?:\s*\d\s*){6}[A-D]', | ||
re.IGNORECASE | re.VERBOSE | ||
), | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,61 +1,25 @@ | ||
import re | ||
import stdnum.exceptions | ||
import stdnum.us.ssn | ||
|
||
from typing import Optional, Generator | ||
|
||
from scrubadub.detectors.catalogue import register_detector | ||
from scrubadub.detectors.base import RegexDetector | ||
from scrubadub.filth import Filth, SocialSecurityNumberFilth | ||
from scrubadub.detectors.base import RegionLocalisedRegexDetector | ||
from scrubadub.filth import SocialSecurityNumberFilth | ||
|
||
|
||
@register_detector | ||
class SocialSecurityNumberDetector(RegexDetector): | ||
class SocialSecurityNumberDetector(RegionLocalisedRegexDetector): | ||
"""Use regular expressions to detect a social security number (SSN) in | ||
dirty dirty ``text``. | ||
""" | ||
|
||
filth_cls = SocialSecurityNumberFilth | ||
name = 'social_security_number' | ||
autoload = True | ||
regex = re.compile(( | ||
r"[0-9][0-9][0-9]" # first three digits | ||
r"[\-. ]" # separator | ||
r"[0-9][0-9]" # next two digits | ||
r"[\-. ]" # separator | ||
r"[0-9][0-9][0-9][0-9]" # last four digits | ||
), re.VERBOSE) | ||
|
||
def __init__(self, *args, validate: bool = True, **kwargs): | ||
"""Initialise the detector. | ||
:param validate: Validate the SSN using the stdnum package | ||
:type validate: bool, default True | ||
:param name: Overrides the default name of the :class:``Detector`` | ||
:type name: str, optional | ||
:param locale: The locale of the documents in the format: 2 letter lower-case language code followed by an | ||
underscore and the two letter upper-case country code, eg "en_GB" or "de_CH". | ||
:type locale: str, optional | ||
""" | ||
self.validate = validate | ||
super(SocialSecurityNumberDetector, self).__init__(*args, **kwargs) | ||
|
||
def iter_filth(self, text: str, document_name: Optional[str] = None) -> Generator[Filth, None, None]: | ||
for filth in super(SocialSecurityNumberDetector, self).iter_filth(text=text, document_name=document_name): | ||
if not self.validate: | ||
yield filth | ||
elif stdnum.us.ssn.is_valid(''.join(char for char in filth.text if char not in '. -')): | ||
yield filth | ||
|
||
@classmethod | ||
def supported_locale(cls, locale: str) -> bool: | ||
"""Returns true if this ``Detector`` supports the given locale. | ||
:param locale: The locale of the documents in the format: 2 letter lower-case language code followed by an | ||
underscore and the two letter upper-case country code, eg "en_GB" or "de_CH". | ||
:type locale: str | ||
:return: ``True`` if the locale is supported, otherwise ``False`` | ||
:rtype: bool | ||
""" | ||
language, region = cls.locale_split(locale) | ||
return region in ['US'] | ||
region_regex = { | ||
'US': re.compile(( | ||
r"[0-9][0-9][0-9]" # first three digits | ||
r"[\-. ]" # separator | ||
r"[0-9][0-9]" # next two digits | ||
r"[\-. ]" # separator | ||
r"[0-9][0-9][0-9][0-9]" # last four digits | ||
), re.VERBOSE), | ||
} |