Skip to content

Commit

Permalink
move to RegionLocalisedRegexDetector to reduce the code needed in det…
Browse files Browse the repository at this point in the history
…ectors
  • Loading branch information
Thomas Bird committed Sep 27, 2021
1 parent 46bd7e2 commit 82a7493
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 85 deletions.
30 changes: 9 additions & 21 deletions scrubadub/detectors/en_GB/national_insurance_number.py
Original file line number Diff line number Diff line change
@@ -1,34 +1,22 @@
import re

from scrubadub.detectors.catalogue import register_detector
from scrubadub.detectors.base import RegexDetector
from scrubadub.detectors.base import RegionLocalisedRegexDetector
from scrubadub.filth import NationalInsuranceNumberFilth


@register_detector
class NationalInsuranceNumberDetector(RegexDetector):
"""Use regular expressions to remove the UK National Insurance number (NINO),
class NationalInsuranceNumberDetector(RegionLocalisedRegexDetector):
"""Use regular expressions to remove the GB National Insurance number (NINO),
Simple pattern matching, no checksum solution.
"""

name = 'national_insurance_number'
autoload = True
filth_cls = NationalInsuranceNumberFilth
# this regex is looking for NINO that does not begin with certain letters
regex = re.compile(
r'(?!BG)(?!GB)(?!NK)(?!KN)(?!TN)(?!NT)(?!ZZ)(?:[A-CEGHJ-PR-TW-Z][A-CEGHJ-NPR-TW-Z])(?:\s*\d\s*){6}[A-D]',
re.IGNORECASE | re.VERBOSE
)

@classmethod
def supported_locale(cls, locale: str) -> bool:
"""Returns true if this ``Detector`` supports the given locale.
:param locale: The locale of the documents in the format: 2 letter lower-case language code followed by an
underscore and the two letter upper-case country code, eg "en_GB" or "de_CH".
:type locale: str
:return: ``True`` if the locale is supported, otherwise ``False``
:rtype: bool
"""
language, region = cls.locale_split(locale)
return region in ['GB']
region_regex = {
'GB': re.compile(
r'(?!BG)(?!GB)(?!NK)(?!KN)(?!TN)(?!NT)(?!ZZ)(?:[A-CEGHJ-PR-TW-Z][A-CEGHJ-NPR-TW-Z])(?:\s*\d\s*){6}[A-D]',
re.IGNORECASE | re.VERBOSE
),
}
21 changes: 5 additions & 16 deletions scrubadub/detectors/en_GB/tax_reference_number.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
import re

from scrubadub.detectors.catalogue import register_detector
from scrubadub.detectors.base import RegexDetector
from scrubadub.detectors.base import RegionLocalisedRegexDetector
from scrubadub.filth import TaxReferenceNumberFilth


@register_detector
class TaxReferenceNumberDetector(RegexDetector):
class TaxReferenceNumberDetector(RegionLocalisedRegexDetector):
"""Use regular expressions to detect the UK PAYE temporary reference number (TRN),
Simple pattern matching, no checksum solution.
"""
Expand All @@ -15,17 +15,6 @@ class TaxReferenceNumberDetector(RegexDetector):
autoload = True
filth_cls = TaxReferenceNumberFilth
# this regex is looking for a TRN: two digits, one letter, then five digits (optional whitespace tolerated)
regex = re.compile(r'''\d{2}\s?[a-zA-Z]{1}(?:\s*\d\s*){5}''', re.IGNORECASE)

@classmethod
def supported_locale(cls, locale: str) -> bool:
"""Returns true if this ``Detector`` supports the given locale.
:param locale: The locale of the documents in the format: 2 letter lower-case language code followed by an
underscore and the two letter upper-case country code, eg "en_GB" or "de_CH".
:type locale: str
:return: ``True`` if the locale is supported, otherwise ``False``
:rtype: bool
"""
language, region = cls.locale_split(locale)
return region in ['GB']
region_regex = {
'GB': re.compile(r'''\d{2}\s?[a-zA-Z]{1}(?:\s*\d\s*){5}''', re.IGNORECASE),
}
60 changes: 12 additions & 48 deletions scrubadub/detectors/en_US/social_security_number.py
Original file line number Diff line number Diff line change
@@ -1,61 +1,25 @@
import re
import stdnum.exceptions
import stdnum.us.ssn

from typing import Optional, Generator

from scrubadub.detectors.catalogue import register_detector
from scrubadub.detectors.base import RegexDetector
from scrubadub.filth import Filth, SocialSecurityNumberFilth
from scrubadub.detectors.base import RegionLocalisedRegexDetector
from scrubadub.filth import SocialSecurityNumberFilth


@register_detector
class SocialSecurityNumberDetector(RegexDetector):
class SocialSecurityNumberDetector(RegionLocalisedRegexDetector):
"""Use regular expressions to detect a social security number (SSN) in
dirty dirty ``text``.
"""

filth_cls = SocialSecurityNumberFilth
name = 'social_security_number'
autoload = True
regex = re.compile((
r"[0-9][0-9][0-9]" # first three digits
r"[\-. ]" # separator
r"[0-9][0-9]" # next two digits
r"[\-. ]" # separator
r"[0-9][0-9][0-9][0-9]" # last four digits
), re.VERBOSE)

def __init__(self, *args, validate: bool = True, **kwargs):
"""Initialise the detector.
:param validate: Validate the SSN using the stdnum package
:type validate: bool, default True
:param name: Overrides the default name of the :class:``Detector``
:type name: str, optional
:param locale: The locale of the documents in the format: 2 letter lower-case language code followed by an
underscore and the two letter upper-case country code, eg "en_GB" or "de_CH".
:type locale: str, optional
"""
self.validate = validate
super(SocialSecurityNumberDetector, self).__init__(*args, **kwargs)

def iter_filth(self, text: str, document_name: Optional[str] = None) -> Generator[Filth, None, None]:
for filth in super(SocialSecurityNumberDetector, self).iter_filth(text=text, document_name=document_name):
if not self.validate:
yield filth
elif stdnum.us.ssn.is_valid(''.join(char for char in filth.text if char not in '. -')):
yield filth

@classmethod
def supported_locale(cls, locale: str) -> bool:
"""Returns true if this ``Detector`` supports the given locale.
:param locale: The locale of the documents in the format: 2 letter lower-case language code followed by an
underscore and the two letter upper-case country code, eg "en_GB" or "de_CH".
:type locale: str
:return: ``True`` if the locale is supported, otherwise ``False``
:rtype: bool
"""
language, region = cls.locale_split(locale)
return region in ['US']
region_regex = {
'US': re.compile((
r"[0-9][0-9][0-9]" # first three digits
r"[\-. ]" # separator
r"[0-9][0-9]" # next two digits
r"[\-. ]" # separator
r"[0-9][0-9][0-9][0-9]" # last four digits
), re.VERBOSE),
}

0 comments on commit 82a7493

Please sign in to comment.