move to RegionLocalisedRegexDetector to reduce the code needed in detectors
LeapBeyond · Sep 27, 2021 · 82a7493 · 82a7493
1 parent 46bd7e2
commit 82a7493
Show file tree

Hide file tree

Showing 3 changed files with 26 additions and 85 deletions.
diff --git a/scrubadub/detectors/en_GB/national_insurance_number.py b/scrubadub/detectors/en_GB/national_insurance_number.py
@@ -1,34 +1,22 @@
 import re
 
 from scrubadub.detectors.catalogue import register_detector
-from scrubadub.detectors.base import RegexDetector
+from scrubadub.detectors.base import RegionLocalisedRegexDetector
 from scrubadub.filth import NationalInsuranceNumberFilth
 
 
 @register_detector
-class NationalInsuranceNumberDetector(RegexDetector):
-    """Use regular expressions to remove the UK National Insurance number (NINO),
+class NationalInsuranceNumberDetector(RegionLocalisedRegexDetector):
+    """Use regular expressions to remove the GB National Insurance number (NINO),
     Simple pattern matching, no checksum solution.
     """
-
     name = 'national_insurance_number'
     autoload = True
     filth_cls = NationalInsuranceNumberFilth
     # this regex is looking for NINO that does not begin with certain letters
-    regex = re.compile(
-        r'(?!BG)(?!GB)(?!NK)(?!KN)(?!TN)(?!NT)(?!ZZ)(?:[A-CEGHJ-PR-TW-Z][A-CEGHJ-NPR-TW-Z])(?:\s*\d\s*){6}[A-D]',
-        re.IGNORECASE | re.VERBOSE
-    )
-
-    @classmethod
-    def supported_locale(cls, locale: str) -> bool:
-        """Returns true if this ``Detector`` supports the given locale.
-
-        :param locale: The locale of the documents in the format: 2 letter lower-case language code followed by an
-                       underscore and the two letter upper-case country code, eg "en_GB" or "de_CH".
-        :type locale: str
-        :return: ``True`` if the locale is supported, otherwise ``False``
-        :rtype: bool
-        """
-        language, region = cls.locale_split(locale)
-        return region in ['GB']
+    region_regex = {
+        'GB': re.compile(
+            r'(?!BG)(?!GB)(?!NK)(?!KN)(?!TN)(?!NT)(?!ZZ)(?:[A-CEGHJ-PR-TW-Z][A-CEGHJ-NPR-TW-Z])(?:\s*\d\s*){6}[A-D]',
+            re.IGNORECASE | re.VERBOSE
+        ),
+    }
diff --git a/scrubadub/detectors/en_GB/tax_reference_number.py b/scrubadub/detectors/en_GB/tax_reference_number.py
@@ -1,12 +1,12 @@
 import re
 
 from scrubadub.detectors.catalogue import register_detector
-from scrubadub.detectors.base import RegexDetector
+from scrubadub.detectors.base import RegionLocalisedRegexDetector
 from scrubadub.filth import TaxReferenceNumberFilth
 
 
 @register_detector
-class TaxReferenceNumberDetector(RegexDetector):
+class TaxReferenceNumberDetector(RegionLocalisedRegexDetector):
     """Use regular expressions to detect the UK PAYE temporary reference number (TRN),
     Simple pattern matching, no checksum solution.
     """
@@ -15,17 +15,6 @@ class TaxReferenceNumberDetector(RegexDetector):
     autoload = True
     filth_cls = TaxReferenceNumberFilth
     # this regex matches a TRN: two digits, an optional space, one letter, then five digits (optionally whitespace-separated)
-    regex = re.compile(r'''\d{2}\s?[a-zA-Z]{1}(?:\s*\d\s*){5}''', re.IGNORECASE)
-
-    @classmethod
-    def supported_locale(cls, locale: str) -> bool:
-        """Returns true if this ``Detector`` supports the given locale.
-
-        :param locale: The locale of the documents in the format: 2 letter lower-case language code followed by an
-                       underscore and the two letter upper-case country code, eg "en_GB" or "de_CH".
-        :type locale: str
-        :return: ``True`` if the locale is supported, otherwise ``False``
-        :rtype: bool
-        """
-        language, region = cls.locale_split(locale)
-        return region in ['GB']
+    region_regex = {
+        'GB': re.compile(r'''\d{2}\s?[a-zA-Z]{1}(?:\s*\d\s*){5}''', re.IGNORECASE),
+    }
diff --git a/scrubadub/detectors/en_US/social_security_number.py b/scrubadub/detectors/en_US/social_security_number.py
@@ -1,61 +1,25 @@
 import re
-import stdnum.exceptions
-import stdnum.us.ssn
-
-from typing import Optional, Generator
 
 from scrubadub.detectors.catalogue import register_detector
-from scrubadub.detectors.base import RegexDetector
-from scrubadub.filth import Filth, SocialSecurityNumberFilth
+from scrubadub.detectors.base import RegionLocalisedRegexDetector
+from scrubadub.filth import SocialSecurityNumberFilth
 
 
 @register_detector
-class SocialSecurityNumberDetector(RegexDetector):
+class SocialSecurityNumberDetector(RegionLocalisedRegexDetector):
     """Use regular expressions to detect a social security number (SSN) in
     dirty dirty ``text``.
     """
 
     filth_cls = SocialSecurityNumberFilth
     name = 'social_security_number'
     autoload = True
-    regex = re.compile((
-        r"[0-9][0-9][0-9]"       # first three digits
-        r"[\-. ]"                # separator
-        r"[0-9][0-9]"            # next two digits
-        r"[\-. ]"                # separator
-        r"[0-9][0-9][0-9][0-9]"  # last four digits
-    ), re.VERBOSE)
-
-    def __init__(self, *args, validate: bool = True, **kwargs):
-        """Initialise the detector.
-
-        :param validate: Validate the SSN using the stdnum package
-        :type validate: bool, default True
-        :param name: Overrides the default name of the :class:``Detector``
-        :type name: str, optional
-        :param locale: The locale of the documents in the format: 2 letter lower-case language code followed by an
-                       underscore and the two letter upper-case country code, eg "en_GB" or "de_CH".
-        :type locale: str, optional
-        """
-        self.validate = validate
-        super(SocialSecurityNumberDetector, self).__init__(*args, **kwargs)
-
-    def iter_filth(self, text: str, document_name: Optional[str] = None) -> Generator[Filth, None, None]:
-        for filth in super(SocialSecurityNumberDetector, self).iter_filth(text=text, document_name=document_name):
-            if not self.validate:
-                yield filth
-            elif stdnum.us.ssn.is_valid(''.join(char for char in filth.text if char not in '. -')):
-                yield filth
-
-    @classmethod
-    def supported_locale(cls, locale: str) -> bool:
-        """Returns true if this ``Detector`` supports the given locale.
-
-        :param locale: The locale of the documents in the format: 2 letter lower-case language code followed by an
-                       underscore and the two letter upper-case country code, eg "en_GB" or "de_CH".
-        :type locale: str
-        :return: ``True`` if the locale is supported, otherwise ``False``
-        :rtype: bool
-        """
-        language, region = cls.locale_split(locale)
-        return region in ['US']
+    region_regex = {
+        'US': re.compile((
+            r"[0-9][0-9][0-9]"  # first three digits
+            r"[\-. ]"  # separator
+            r"[0-9][0-9]"  # next two digits
+            r"[\-. ]"  # separator
+            r"[0-9][0-9][0-9][0-9]"  # last four digits
+        ), re.VERBOSE),
+    }