Skip to content

Commit

Permalink
move around detectors
Browse files Browse the repository at this point in the history
  • Loading branch information
Thomas Bird committed Dec 3, 2020
1 parent 10c1558 commit d254d1f
Show file tree
Hide file tree
Showing 10 changed files with 135 additions and 72 deletions.
18 changes: 18 additions & 0 deletions scrubadub/detectors/drivers_licence.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import re

from .base import RegionLocalisedRegexDetector
from ..filth import DriversLicenceFilth


class DriversLicenceDetector(RegionLocalisedRegexDetector):
    """Detect driving licence numbers using region-keyed regular expressions.

    Only a UK (``'GB'``) pattern is registered here. Matching is purely
    structural; no checksum is verified.
    """

    filth_cls = DriversLicenceFilth
    name = 'drivers_licence'

    region_regex = {
        # GB: five characters (letters or the digit 9), six digits that may be
        # separated by whitespace, two more letters-or-9, three word
        # characters and, after an optional whitespace character, two digits.
        'GB': re.compile(
            r'''([a-zA-Z9]{5}\s?)((?:\s*\d\s*){6}[a-zA-Z9]{2}\w{3})\s?(\d{2})''',
            re.IGNORECASE,
        ),
    }
Empty file.
31 changes: 31 additions & 0 deletions scrubadub/detectors/en_GB/nino.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import re

from scrubadub.detectors.base import RegexDetector
from scrubadub.filth import NinoFilth


class NINODetector(RegexDetector):
    """Detect UK National Insurance numbers (NINO) with a regular expression.

    This is structural pattern matching only; no checksum is verified.
    """

    filth_cls = NinoFilth
    name = 'nino'

    # Two prefix letters (the pairs BG, GB, NK, KN, TN, NT and ZZ are rejected
    # by the look-aheads), six digits that may have whitespace around them,
    # then a suffix letter A-D. Matching is case-insensitive.
    # NOTE: re.VERBOSE is set, so any unescaped whitespace added to this
    # pattern outside a character class would be ignored by the regex engine.
    regex = re.compile(
        r'(?!BG)(?!GB)(?!NK)(?!KN)(?!TN)(?!NT)(?!ZZ)(?:[A-CEGHJ-PR-TW-Z][A-CEGHJ-NPR-TW-Z])(?:\s*\d\s*){6}[A-D]',
        re.IGNORECASE | re.VERBOSE,
    )

    @classmethod
    def supported_locale(cls, locale: str) -> bool:
        """Report whether this ``Detector`` can be used with ``locale``.

        :param locale: The locale of the documents in the format: 2 letter lower-case language code followed by an
            underscore and the two letter upper-case country code, eg "en_GB" or "de_CH".
        :type locale: str
        :return: ``True`` when the region part of the locale is ``GB``, otherwise ``False``
        :rtype: bool
        """
        # Only the region decides support; the language part is not consulted.
        _language, region = cls.locale_split(locale)
        return region in ('GB',)
28 changes: 28 additions & 0 deletions scrubadub/detectors/en_GB/trn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import re

from scrubadub.detectors.base import RegexDetector
from scrubadub.filth import TrnFilth


class TrnDetector(RegexDetector):
    """Detect UK PAYE temporary reference numbers (TRN) with a regular expression.

    This is structural pattern matching only; no checksum is verified.
    """

    filth_cls = TrnFilth
    name = 'trn'

    # Two digits, an optional whitespace character, a single letter, then five
    # digits, each of which may have whitespace around it.
    regex = re.compile(
        r'''\d{2}\s?[a-zA-Z]{1}(?:\s*\d\s*){5}''',
        re.IGNORECASE,
    )

    @classmethod
    def supported_locale(cls, locale: str) -> bool:
        """Report whether this ``Detector`` can be used with ``locale``.

        :param locale: The locale of the documents in the format: 2 letter lower-case language code followed by an
            underscore and the two letter upper-case country code, eg "en_GB" or "de_CH".
        :type locale: str
        :return: ``True`` when the region part of the locale is ``GB``, otherwise ``False``
        :rtype: bool
        """
        # Only the region decides support; the language part is not consulted.
        _language, region = cls.locale_split(locale)
        return region in ('GB',)
Empty file.
58 changes: 58 additions & 0 deletions scrubadub/detectors/en_US/ssn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import re
import stdnum.exceptions
import stdnum.us.ssn

from typing import Optional, Generator

from scrubadub.detectors.base import RegexDetector
from scrubadub.filth import Filth, SSNFilth


class SSNDetector(RegexDetector):
    """Detect social security numbers (SSN) in ``text`` with a regular
    expression, optionally validating each match with the ``stdnum`` package.
    """

    name = 'ssn'
    filth_cls = SSNFilth

    # re.VERBOSE ignores whitespace in a pattern except inside a character
    # class, so the space in each separator class below is still matched.
    regex = re.compile((
        r"[0-9][0-9][0-9]"  # first three digits
        r"[\-. ]"  # separator: hyphen, dot or space
        r"[0-9][0-9]"  # next two digits
        r"[\-. ]"  # separator: hyphen, dot or space
        r"[0-9][0-9][0-9][0-9]"  # last four digits
    ), re.VERBOSE)

    def __init__(self, *args, validate: bool = True, **kwargs):
        """Initialise the detector.

        All other positional and keyword arguments are forwarded unchanged to
        the parent initialiser.

        :param validate: Validate the SSN using the stdnum package
        :type validate: bool, default True
        :param name: Overrides the default name of the :class:``Detector``
        :type name: str, optional
        :param locale: The locale of the documents in the format: 2 letter lower-case language code followed by an
            underscore and the two letter upper-case country code, eg "en_GB" or "de_CH".
        :type locale: str, optional
        """
        # The flag is stored before the parent initialiser runs, as in the
        # original ordering.
        self.validate = validate
        super().__init__(*args, **kwargs)

    def iter_filth(self, text: str, document_name: Optional[str] = None) -> Generator[Filth, None, None]:
        """Yield the SSN matches found in ``text``.

        When ``self.validate`` is truthy, a match is only yielded if its
        digits (separators removed) pass ``stdnum.us.ssn.is_valid``.

        :param text: The text to search
        :type text: str
        :param document_name: Passed through to the parent ``iter_filth``
        :type document_name: str, optional
        :return: A generator of the matches that were kept
        :rtype: Generator[Filth, None, None]
        """
        candidates = super().iter_filth(text=text, document_name=document_name)
        for candidate in candidates:
            if self.validate:
                # Drop the '.', ' ' and '-' separators before validating.
                digits = ''.join(ch for ch in candidate.text if ch not in '. -')
                if not stdnum.us.ssn.is_valid(digits):
                    continue
            yield candidate

    @classmethod
    def supported_locale(cls, locale: str) -> bool:
        """Report whether this ``Detector`` can be used with ``locale``.

        :param locale: The locale of the documents in the format: 2 letter lower-case language code followed by an
            underscore and the two letter upper-case country code, eg "en_GB" or "de_CH".
        :type locale: str
        :return: ``True`` when the region part of the locale is ``US``, otherwise ``False``
        :rtype: bool
        """
        # Only the region decides support; the language part is not consulted.
        _language, region = cls.locale_split(locale)
        return region in ('US',)
15 changes: 0 additions & 15 deletions scrubadub/detectors/gb_drivers.py

This file was deleted.

17 changes: 0 additions & 17 deletions scrubadub/detectors/gb_nino.py

This file was deleted.

15 changes: 0 additions & 15 deletions scrubadub/detectors/gb_trn.py

This file was deleted.

25 changes: 0 additions & 25 deletions scrubadub/detectors/ssn.py

This file was deleted.

0 comments on commit d254d1f

Please sign in to comment.