-
Notifications
You must be signed in to change notification settings - Fork 95
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Thomas Bird
committed
Dec 3, 2020
1 parent
10c1558
commit d254d1f
Showing
10 changed files
with
135 additions
and
72 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
import re | ||
|
||
from .base import RegionLocalisedRegexDetector | ||
from ..filth import DriversLicenceFilth | ||
|
||
|
||
class DriversLicenceDetector(RegionLocalisedRegexDetector):
    """Detect UK driving licence numbers with a region-keyed regular expression.

    Matching is purely pattern based: the candidate text only has to have the
    right shape, no checksum is verified.
    """

    filth_cls = DriversLicenceFilth
    name = 'drivers_licence'

    # One compiled pattern per supported region code. The 'GB' pattern is made
    # of three capture groups:
    #   1. five characters, each a letter or the digit 9, then an optional
    #      whitespace character;
    #   2. six digits (whitespace tolerated around each one), two characters
    #      that are each a letter or 9, then three word characters;
    #   3. two digits, optionally preceded by a single whitespace character.
    # NOTE(review): the 9s presumably act as padding for short names - confirm
    # against the DVLA format description.
    region_regex = {
        'GB': re.compile(
            r'''([a-zA-Z9]{5}\s?)((?:\s*\d\s*){6}[a-zA-Z9]{2}\w{3})\s?(\d{2})''',
            re.IGNORECASE,
        ),
    }
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
import re | ||
|
||
from scrubadub.detectors.base import RegexDetector | ||
from scrubadub.filth import NinoFilth | ||
|
||
|
||
class NINODetector(RegexDetector):
    """Detect UK National Insurance numbers (NINO) with a regular expression.

    Matching is purely pattern based: the candidate text only has to have the
    right shape, no checksum is verified.
    """

    filth_cls = NinoFilth
    name = 'nino'

    # Shape of a match, compared case-insensitively:
    #   * a two letter prefix that is none of BG, GB, NK, KN, TN, NT or ZZ,
    #     whose first letter is not D, F, I, Q, U or V and whose second letter
    #     is not D, F, I, O, Q, U or V;
    #   * six digits, with optional whitespace around each of them;
    #   * a single suffix letter in the range A-D.
    regex = re.compile(
        r'(?!BG)(?!GB)(?!NK)(?!KN)(?!TN)(?!NT)(?!ZZ)(?:[A-CEGHJ-PR-TW-Z][A-CEGHJ-NPR-TW-Z])(?:\s*\d\s*){6}[A-D]',
        re.IGNORECASE | re.VERBOSE,
    )

    @classmethod
    def supported_locale(cls, locale: str) -> bool:
        """Tell whether this ``Detector`` can be used with the given locale.

        :param locale: The locale of the documents in the format: 2 letter lower-case language code followed by an
                       underscore and the two letter upper-case country code, eg "en_GB" or "de_CH".
        :type locale: str
        :return: ``True`` when the region part of the locale is ``GB``, otherwise ``False``
        :rtype: bool
        """
        # Only the region matters here; the language part is ignored.
        _language, region = cls.locale_split(locale)
        return region in ('GB',)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
import re | ||
|
||
from scrubadub.detectors.base import RegexDetector | ||
from scrubadub.filth import TrnFilth | ||
|
||
|
||
class TrnDetector(RegexDetector):
    """Detect UK PAYE temporary reference numbers (TRN) with a regular expression.

    Matching is purely pattern based: the candidate text only has to have the
    right shape, no checksum is verified.
    """

    name = 'trn'
    filth_cls = TrnFilth

    # Shape of a match: two digits, an optional whitespace character, one
    # letter, then five digits that may each be preceded by whitespace
    # (e.g. "11A12345" or "11 A 12 34 5").
    # The repeated group deliberately has no trailing ``\s*``: with it, the
    # match would also swallow the whitespace that follows the last digit, and
    # replacing the filth would glue the placeholder to the next word.
    regex = re.compile(r'''\d{2}\s?[a-zA-Z](?:\s*\d){5}''', re.IGNORECASE)

    @classmethod
    def supported_locale(cls, locale: str) -> bool:
        """Returns true if this ``Detector`` supports the given locale.

        :param locale: The locale of the documents in the format: 2 letter lower-case language code followed by an
                       underscore and the two letter upper-case country code, eg "en_GB" or "de_CH".
        :type locale: str
        :return: ``True`` if the locale is supported, otherwise ``False``
        :rtype: bool
        """
        # Only the region part of the locale is relevant; TRNs are UK specific.
        language, region = cls.locale_split(locale)
        return region in ['GB']
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
import re | ||
import stdnum.exceptions | ||
import stdnum.us.ssn | ||
|
||
from typing import Optional, Generator | ||
|
||
from scrubadub.detectors.base import RegexDetector | ||
from scrubadub.filth import Filth, SSNFilth | ||
|
||
|
||
class SSNDetector(RegexDetector):
    """Detect US social security numbers (SSN) with a regular expression.

    Candidates found by the pattern can additionally be checked with the
    ``stdnum`` package, see the ``validate`` argument of the constructor.
    """

    name = 'ssn'
    filth_cls = SSNFilth

    # Three digits, two digits and four digits, separated by a dash, a dot or
    # a space. The space inside the character classes survives ``re.VERBOSE``
    # because whitespace is only ignored outside of character classes.
    regex = re.compile(
        (
            r"[0-9][0-9][0-9]"       # area: first three digits
            r"[\-. ]"                # separator
            r"[0-9][0-9]"            # group: next two digits
            r"[\-. ]"                # separator
            r"[0-9][0-9][0-9][0-9]"  # serial: last four digits
        ),
        re.VERBOSE,
    )

    def __init__(self, *args, validate: bool = True, **kwargs):
        """Set up the detector.

        :param validate: Validate the SSN using the stdnum package
        :type validate: bool, default True
        :param name: Overrides the default name of the :class:``Detector``
        :type name: str, optional
        :param locale: The locale of the documents in the format: 2 letter lower-case language code followed by an
            underscore and the two letter upper-case country code, eg "en_GB" or "de_CH".
        :type locale: str, optional
        """
        # Stored before delegating, all remaining arguments go to the parent.
        self.validate = validate
        super().__init__(*args, **kwargs)

    def iter_filth(self, text: str, document_name: Optional[str] = None) -> Generator[Filth, None, None]:
        """Yield the SSN candidates found in ``text``.

        When ``self.validate`` is set, candidates rejected by
        ``stdnum.us.ssn.is_valid`` are dropped.

        :param text: The text to search
        :type text: str
        :param document_name: Passed through unchanged to the parent ``iter_filth``
        :type document_name: str, optional
        :return: A generator over the accepted ``Filth`` objects
        """
        separators = '. -'
        for candidate in super().iter_filth(text=text, document_name=document_name):
            if self.validate:
                # Strip the separators so only the digits are handed to stdnum.
                digits = ''.join(symbol for symbol in candidate.text if symbol not in separators)
                if not stdnum.us.ssn.is_valid(digits):
                    continue
            yield candidate

    @classmethod
    def supported_locale(cls, locale: str) -> bool:
        """Tell whether this ``Detector`` can be used with the given locale.

        :param locale: The locale of the documents in the format: 2 letter lower-case language code followed by an
            underscore and the two letter upper-case country code, eg "en_GB" or "de_CH".
        :type locale: str
        :return: ``True`` when the region part of the locale is ``US``, otherwise ``False``
        :rtype: bool
        """
        # Only the region matters here; the language part is ignored.
        _language, region = cls.locale_split(locale)
        return region in ('US',)
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.