move around detectors

LeapBeyond · Dec 3, 2020 · d254d1f · d254d1f
1 parent 10c1558
commit d254d1f
Show file tree

Hide file tree

Showing 10 changed files with 135 additions and 72 deletions.
diff --git a/scrubadub/detectors/drivers_licence.py b/scrubadub/detectors/drivers_licence.py
@@ -0,0 +1,18 @@
+import re
+
+from .base import RegionLocalisedRegexDetector
+from ..filth import DriversLicenceFilth
+
+
+class DriversLicenceDetector(RegionLocalisedRegexDetector):
+    """Use regular expressions to detect UK driving licence numbers,
+    Simple pattern matching, no checksum solution.
+    """
+
+    name = 'drivers_licence'
+    filth_cls = DriversLicenceFilth
+
+    region_regex = {
+        # this regex is looking for UK driving licence numbers that follow a pattern, no checksum
+        'GB': re.compile(r'''([a-zA-Z9]{5}\s?)((?:\s*\d\s*){6}[a-zA-Z9]{2}\w{3})\s?(\d{2})''', re.IGNORECASE)
+    }
diff --git a/scrubadub/detectors/en_GB/__init__.py b/scrubadub/detectors/en_GB/__init__.py
diff --git a/scrubadub/detectors/en_GB/nino.py b/scrubadub/detectors/en_GB/nino.py
@@ -0,0 +1,31 @@
+import re
+
+from scrubadub.detectors.base import RegexDetector
+from scrubadub.filth import NinoFilth
+
+
+class NINODetector(RegexDetector):
+    """Use regular expressions to remove the UK National Insurance number (NINO),
+    Simple pattern matching, no checksum solution.
+    """
+
+    name = 'nino'
+    filth_cls = NinoFilth
+    # this regex is looking for NINO that does not begin with certain letters
+    regex = re.compile(
+        r'(?!BG)(?!GB)(?!NK)(?!KN)(?!TN)(?!NT)(?!ZZ)(?:[A-CEGHJ-PR-TW-Z][A-CEGHJ-NPR-TW-Z])(?:\s*\d\s*){6}[A-D]',
+        re.IGNORECASE | re.VERBOSE
+    )
+
+    @classmethod
+    def supported_locale(cls, locale: str) -> bool:
+        """Returns true if this ``Detector`` supports the given locale.
+
+        :param locale: The locale of the documents in the format: 2 letter lower-case language code followed by an
+                       underscore and the two letter upper-case country code, eg "en_GB" or "de_CH".
+        :type locale: str
+        :return: ``True`` if the locale is supported, otherwise ``False``
+        :rtype: bool
+        """
+        language, region = cls.locale_split(locale)
+        return region in ['GB']
diff --git a/scrubadub/detectors/en_GB/trn.py b/scrubadub/detectors/en_GB/trn.py
@@ -0,0 +1,28 @@
+import re
+
+from scrubadub.detectors.base import RegexDetector
+from scrubadub.filth import TrnFilth
+
+
+class TrnDetector(RegexDetector):
+    """Use regular expressions to detect the UK PAYE temporary reference number (TRN),
+    Simple pattern matching, no checksum solution.
+    """
+
+    name = 'trn'
+    filth_cls = TrnFilth
+    # this regex is looking for a TRN: two digits, one letter, then five digits (optional whitespace allowed)
+    regex = re.compile(r'''\d{2}\s?[a-zA-Z]{1}(?:\s*\d\s*){5}''', re.IGNORECASE)
+
+    @classmethod
+    def supported_locale(cls, locale: str) -> bool:
+        """Returns true if this ``Detector`` supports the given locale.
+
+        :param locale: The locale of the documents in the format: 2 letter lower-case language code followed by an
+                       underscore and the two letter upper-case country code, eg "en_GB" or "de_CH".
+        :type locale: str
+        :return: ``True`` if the locale is supported, otherwise ``False``
+        :rtype: bool
+        """
+        language, region = cls.locale_split(locale)
+        return region in ['GB']
diff --git a/scrubadub/detectors/en_US/__init__.py b/scrubadub/detectors/en_US/__init__.py
diff --git a/scrubadub/detectors/en_US/ssn.py b/scrubadub/detectors/en_US/ssn.py
@@ -0,0 +1,58 @@
+import re
+import stdnum.exceptions
+import stdnum.us.ssn
+
+from typing import Optional, Generator
+
+from scrubadub.detectors.base import RegexDetector
+from scrubadub.filth import Filth, SSNFilth
+
+
+class SSNDetector(RegexDetector):
+    """Use regular expressions to detect a social security number (SSN) in
+    dirty dirty ``text``.
+    """
+
+    filth_cls = SSNFilth
+    name = 'ssn'
+    regex = re.compile((
+        r"[0-9][0-9][0-9]"       # first three digits
+        r"[\-. ]"                # separator
+        r"[0-9][0-9]"            # next two digits
+        r"[\-. ]"                # separator
+        r"[0-9][0-9][0-9][0-9]"  # last four digits
+    ), re.VERBOSE)
+
+    def __init__(self, *args, validate: bool = True, **kwargs):
+        """Initialise the detector.
+
+        :param validate: Validate the SSN using the stdnum package
+        :type validate: bool, default True
+        :param name: Overrides the default name of the :class:`Detector`
+        :type name: str, optional
+        :param locale: The locale of the documents in the format: 2 letter lower-case language code followed by an
+                       underscore and the two letter upper-case country code, eg "en_GB" or "de_CH".
+        :type locale: str, optional
+        """
+        self.validate = validate
+        super(SSNDetector, self).__init__(*args, **kwargs)
+
+    def iter_filth(self, text: str, document_name: Optional[str] = None) -> Generator[Filth, None, None]:
+        for filth in super(SSNDetector, self).iter_filth(text=text, document_name=document_name):
+            if not self.validate:
+                yield filth
+            elif stdnum.us.ssn.is_valid(''.join(char for char in filth.text if char not in '. -')):
+                yield filth
+
+    @classmethod
+    def supported_locale(cls, locale: str) -> bool:
+        """Returns true if this ``Detector`` supports the given locale.
+
+        :param locale: The locale of the documents in the format: 2 letter lower-case language code followed by an
+                       underscore and the two letter upper-case country code, eg "en_GB" or "de_CH".
+        :type locale: str
+        :return: ``True`` if the locale is supported, otherwise ``False``
+        :rtype: bool
+        """
+        language, region = cls.locale_split(locale)
+        return region in ['US']
diff --git a/scrubadub/detectors/gb_drivers.py b/scrubadub/detectors/gb_drivers.py
diff --git a/scrubadub/detectors/gb_nino.py b/scrubadub/detectors/gb_nino.py
diff --git a/scrubadub/detectors/gb_trn.py b/scrubadub/detectors/gb_trn.py
diff --git a/scrubadub/detectors/ssn.py b/scrubadub/detectors/ssn.py