From fcb39cea78624d720ac3178d8261ed9eb6de77ef Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Tue, 6 Feb 2024 07:20:34 -0500 Subject: [PATCH] Parse `display name <email>` syntax Per request in #116, parse display name syntax also, but don't allow it unless a new allow_display_name option is set. Parsing according to the MIME specification probably isn't what's generally wanted since the use case is probably parsing inputs in email composition-like user interfaces. So it's in the spirit of a MIME message but not the letter. If display name syntax is used, return the unquoted/unescaped display name in the returned object. --- CHANGELOG.md | 1 + README.md | 16 +-- email_validator/__init__.py | 1 + email_validator/exceptions_types.py | 7 +- email_validator/rfc_constants.py | 5 +- email_validator/syntax.py | 164 ++++++++++++++++++++++++---- email_validator/validate_email.py | 22 +++- tests/test_syntax.py | 50 ++++++++- 8 files changed, 223 insertions(+), 43 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8be1a97..99f7482 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ In Development -------------- * The library now includes an asynchronous version of the main method named validate_email_async, which can be called with await, that runs DNS-based deliverability checks asychronously. +* A new option to parse `My Name <me@example.org>` strings, i.e. a display name plus an email address in angle brackets, is now included. It is off by default. 2.1.1 (February 26, 2024) ------------------------- diff --git a/README.md b/README.md index fe666b1..61dd7ac 100644 --- a/README.md +++ b/README.md @@ -7,8 +7,7 @@ Python 3.8+ by [Joshua Tauberer](https://joshdata.me). This library validates that a string is of the form `name@example.com` and optionally checks that the domain name is set up to receive email. 
This is the sort of validation you would want when you are identifying -users by their email address like on a registration/login form (but not -necessarily for composing an email message, see below). +users by their email address like on a registration form. Key features: @@ -19,7 +18,8 @@ Key features: * Checks deliverability (optional): Does the domain name resolve? (You can override the default DNS resolver to add query caching.) * Can be called asynchronously with `await`. -* Supports internationalized domain names and internationalized local parts. +* Supports internationalized domain names and internationalized local parts, + and optionally supports display names (e.g. `"My Name" <me@example.org>`). * Rejects addresses with unsafe Unicode characters, obsolete email address syntax that you'd find unexpected, special use domain names like `@localhost`, and domains without a dot by default. This is an @@ -29,9 +29,8 @@ Key features: * Python type annotations are used. This is an opinionated library. You should definitely also consider using -the less-opinionated [pyIsEmail](https://github.com/michaelherold/pyIsEmail) and -[flanker](https://github.com/mailgun/flanker) if they are better for your -use case. +the less-opinionated [pyIsEmail](https://github.com/michaelherold/pyIsEmail) +if it works better for you. [![Build Status](https://github.com/JoshData/python-email-validator/actions/workflows/test_and_build.yaml/badge.svg)](https://github.com/JoshData/python-email-validator/actions/workflows/test_and_build.yaml) @@ -148,6 +147,8 @@ The `validate_email` function also accepts the following keyword arguments `allow_domain_literal=False`: Set to `True` to allow bracketed IPv4 and "IPv6:"-prefixd IPv6 addresses in the domain part of the email address. No deliverability checks are performed for these addresses. In the object returned by `validate_email`, the normalized domain will use the condensed IPv6 format, if applicable. 
The object's `domain_address` attribute will hold the parsed `ipaddress.IPv4Address` or `ipaddress.IPv6Address` object if applicable. You can also set `email_validator.ALLOW_DOMAIN_LITERAL` to `True` to turn this on for all calls by default. +`allow_display_name=False`: Set to `True` to allow a display name and bracketed address in the input string, like `My Name <me@example.org>`. It's implemented in the spirit but not the letter of RFC 5322 3.4, so it may be stricter or more relaxed than what you want. The display name, if present, is provided in the returned object's `display_name` field after being unquoted and unescaped. You can also set `email_validator.ALLOW_DISPLAY_NAME` to `True` to turn this on for all calls by default. + `allow_empty_local=False`: Set to `True` to allow an empty local part (i.e. `@example.com`), e.g. for validating Postfix aliases. @@ -423,6 +424,7 @@ are: | `domain` | The canonical internationalized Unicode form of the domain part of the email address. If the returned string contains non-ASCII characters, either the [SMTPUTF8](https://tools.ietf.org/html/rfc6531) feature of your mail relay will be required to transmit the message or else the email address's domain part must be converted to IDNA ASCII first: Use `ascii_domain` field instead. | | `ascii_domain` | The [IDNA](https://tools.ietf.org/html/rfc5891) [Punycode](https://www.rfc-editor.org/rfc/rfc3492.txt)-encoded form of the domain part of the given email address, as it would be transmitted on the wire. | | `domain_address` | If domain literals are allowed and if the email address contains one, an `ipaddress.IPv4Address` or `ipaddress.IPv6Address` object. | +| `display_name` | If no display name was present and angle brackets do not surround the address, this will be `None`; otherwise, it will be set to the display name, or the empty string if there were angle brackets but no display name. If the display name was quoted, it will be unquoted and unescaped. 
| | `smtputf8` | A boolean indicating that the [SMTPUTF8](https://tools.ietf.org/html/rfc6531) feature of your mail relay will be required to transmit messages to this address because the local part of the address has non-ASCII characters (the local part cannot be IDNA-encoded). If `allow_smtputf8=False` is passed as an argument, this flag will always be false because an exception is raised if it would have been true. | | `mx` | A list of (priority, domain) tuples of MX records specified in the DNS for the domain (see [RFC 5321 section 5](https://tools.ietf.org/html/rfc5321#section-5)). May be `None` if the deliverability check could not be completed because of a temporary issue like a timeout. | | `mx_fallback_type` | `None` if an `MX` record is found. If no MX records are actually specified in DNS and instead are inferred, through an obsolete mechanism, from A or AAAA records, the value is the type of DNS record used instead (`A` or `AAAA`). May be `None` if the deliverability check could not be completed because of a temporary issue like a timeout. | @@ -486,4 +488,4 @@ git push --tags License ------- -This project is free of any copyright restrictions per the [Unlicense](https://unlicense.org/). (Prior to Feb. 4, 2024, the project was made available under the terms of the [CC0 1.0 Universal public domain dedication](http://creativecommons.org/publicdomain/zero/1.0/).) See [LICENSE](LICENSE) and [CONTRIBUTING.md](CONTRIBUTING.md). \ No newline at end of file +This project is free of any copyright restrictions per the [Unlicense](https://unlicense.org/). (Prior to Feb. 4, 2024, the project was made available under the terms of the [CC0 1.0 Universal public domain dedication](http://creativecommons.org/publicdomain/zero/1.0/).) See [LICENSE](LICENSE) and [CONTRIBUTING.md](CONTRIBUTING.md). 
diff --git a/email_validator/__init__.py b/email_validator/__init__.py index cb942ef..5eb2e06 100644 --- a/email_validator/__init__.py +++ b/email_validator/__init__.py @@ -33,6 +33,7 @@ def caching_async_resolver(*args, **kwargs): ALLOW_SMTPUTF8 = True ALLOW_QUOTED_LOCAL = False ALLOW_DOMAIN_LITERAL = False +ALLOW_DISPLAY_NAME = False GLOBALLY_DELIVERABLE = True CHECK_DELIVERABILITY = True TEST_ENVIRONMENT = False diff --git a/email_validator/exceptions_types.py b/email_validator/exceptions_types.py index 4522b4f..261e3d0 100644 --- a/email_validator/exceptions_types.py +++ b/email_validator/exceptions_types.py @@ -62,6 +62,9 @@ class ValidatedEmail: mechanism, from A or AAAA records, the value is the type of DNS record used instead (`A` or `AAAA`).""" mx_fallback_type: str + """The display name in the original input text.""" + display_name: str + """Tests use this constructor.""" def __init__(self, **kwargs): for k, v in kwargs.items(): @@ -120,6 +123,7 @@ def __eq__(self, other): and repr(sorted(self.mx) if getattr(self, 'mx', None) else None) == repr(sorted(other.mx) if getattr(other, 'mx', None) else None) and getattr(self, 'mx_fallback_type', None) == getattr(other, 'mx_fallback_type', None) + and getattr(self, 'display_name', None) == getattr(other, 'display_name', None) ) """This helps producing the README.""" @@ -128,7 +132,8 @@ def as_constructor(self): + ",".join(f"\n {key}={repr(getattr(self, key))}" for key in ('normalized', 'local_part', 'domain', 'ascii_email', 'ascii_local_part', 'ascii_domain', - 'smtputf8', 'mx', 'mx_fallback_type') + 'smtputf8', 'mx', 'mx_fallback_type', + 'display_name') if hasattr(self, key) ) \ + ")" diff --git a/email_validator/rfc_constants.py b/email_validator/rfc_constants.py index a6b9c59..a802c97 100644 --- a/email_validator/rfc_constants.py +++ b/email_validator/rfc_constants.py @@ -13,7 +13,7 @@ # RFC 3629 section 4, which appear to be the Unicode code points from # U+0080 to U+10FFFF. 
ATEXT_INTL = ATEXT + "\u0080-\U0010FFFF" -ATEXT_INTL_RE = re.compile('[.' + ATEXT_INTL + ']') # ATEXT_INTL plus dots +ATEXT_INTL_DOT_RE = re.compile('[.' + ATEXT_INTL + ']') # ATEXT_INTL plus dots DOT_ATOM_TEXT_INTL = re.compile('[' + ATEXT_INTL + ']+(?:\\.[' + ATEXT_INTL + r']+)*\Z') # The domain part of the email address, after IDNA (ASCII) encoding, @@ -30,10 +30,9 @@ # Quoted-string local part (RFC 5321 4.1.2, internationalized by RFC 6531 3.3) # The permitted characters in a quoted string are the characters in the range # 32-126, except that quotes and (literal) backslashes can only appear when escaped -# by a backslash. When internationalized, UTF8 strings are also permitted except +# by a backslash. When internationalized, UTF-8 strings are also permitted except # the ASCII characters that are not previously permitted (see above). # QUOTED_LOCAL_PART_ADDR = re.compile(r"^\"((?:[\u0020-\u0021\u0023-\u005B\u005D-\u007E]|\\[\u0020-\u007E])*)\"@(.*)") -QUOTED_LOCAL_PART_ADDR = re.compile(r"^\"((?:[^\"\\]|\\.)*)\"@(.*)") QTEXT_INTL = re.compile(r"[\u0020-\u007E\u0080-\U0010FFFF]") # Length constants diff --git a/email_validator/syntax.py b/email_validator/syntax.py index 6634ace..b8df0e6 100644 --- a/email_validator/syntax.py +++ b/email_validator/syntax.py @@ -1,8 +1,7 @@ from .exceptions_types import EmailSyntaxError from .rfc_constants import EMAIL_MAX_LENGTH, LOCAL_PART_MAX_LENGTH, DOMAIN_MAX_LENGTH, \ - DOT_ATOM_TEXT, DOT_ATOM_TEXT_INTL, ATEXT_RE, ATEXT_INTL_RE, ATEXT_HOSTNAME_INTL, QTEXT_INTL, \ - DNS_LABEL_LENGTH_LIMIT, DOT_ATOM_TEXT_HOSTNAME, DOMAIN_NAME_REGEX, DOMAIN_LITERAL_CHARS, \ - QUOTED_LOCAL_PART_ADDR + DOT_ATOM_TEXT, DOT_ATOM_TEXT_INTL, ATEXT_RE, ATEXT_INTL_DOT_RE, ATEXT_HOSTNAME_INTL, QTEXT_INTL, \ + DNS_LABEL_LENGTH_LIMIT, DOT_ATOM_TEXT_HOSTNAME, DOMAIN_NAME_REGEX, DOMAIN_LITERAL_CHARS import re import unicodedata @@ -12,31 +11,148 @@ def split_email(email): - # Return the local part and domain part of the address and - # whether the local part 
was quoted as a three-tuple. + # Return the display name, unescaped local part, and domain part + # of the address, and whether the local part was quoted. If no + # display name was present and angle brackets do not surround + # the address, display name will be None; otherwise, it will be + # set to the display name or the empty string if there were + # angle brackets but no display name. + + # Typical email addresses have a single @-sign and no quote + # characters, but the awkward "quoted string" local part form + # (RFC 5321 4.1.2) allows @-signs and escaped quotes to appear + # in the local part if the local part is quoted. + + # A `display name <addr>` format is also present in MIME messages + # (RFC 5322 3.4) and this format is also often recognized in + # mail UIs. It's not allowed in SMTP commands or in typical web + # login forms, but parsing it has been requested, so it's done + # here as a convenience. It's implemented in the spirit but not + # the letter of RFC 5322 3.4 because MIME messages allow newlines + # and comments as a part of the CFWS rule, but this is typically + # not allowed in mail UIs (although comment syntax was requested + # once too). + # + # Display names are either basic characters (the same basic characters + # permitted in email addresses, but periods are not allowed and spaces + # are allowed; see RFC 5322 Appendix A.1.2), or a quoted string with + # the same rules as a quoted local part. (Multiple quoted strings might + # be allowed? Unclear.) Optional space (RFC 5322 3.4 CFWS) and then the + # email address follows in angle brackets. + # + # An initial quote is ambiguous between starting a display name or + # a quoted local part --- fun. + # + # We assume the input string is already stripped of leading and + # trailing CFWS. + + def split_string_at_unquoted_special(text, specials): + # Split the string at the first character in specials (an @-sign + # or left angle bracket) that does not occur within quotes. 
+ inside_quote = False + escaped = False + left_part = "" + for c in text: + if inside_quote: + left_part += c + if c == '\\' and not escaped: + escaped = True + elif c == '"' and not escaped: + # The only way to exit the quote is an unescaped quote. + inside_quote = False + escaped = False + else: + escaped = False + elif c == '"': + left_part += c + inside_quote = True + elif c in specials: + # When unquoted, stop before a special character. + break + else: + left_part += c + + # The right part is whatever is left. + right_part = text[len(left_part):] + + return left_part, right_part + + def unquote_quoted_string(text): + # Remove surrounding quotes and unescape escaped backslashes + # and quotes. Escapes are parsed liberally. I think only + # backslashes and quotes can be escaped but we'll allow anything + # to be. + quoted = False + escaped = False + value = "" + for i, c in enumerate(text): + if quoted: + if escaped: + value += c + escaped = False + elif c == '\\': + escaped = True + elif c == '"': + if i != len(text) - 1: + raise EmailSyntaxError("Extra character(s) found after close quote: " + + ", ".join(safe_character_display(c) for c in text[i + 1:])) + break + else: + value += c + elif i == 0 and c == '"': + quoted = True + else: + value += c + + return value, quoted + + # Split the string at the first unquoted @-sign or left angle bracket. + left_part, right_part = split_string_at_unquoted_special(email, ("@", "<")) + + # If the right part starts with an angle bracket, + # then the left part is a display name and the rest + # of the right part up to the final right angle bracket + # is the email address, . + if right_part.startswith("<"): + # Remove space between the display name and angle bracket. + left_part = left_part.rstrip() + + # Unquote and unescape the display name. + display_name, display_name_quoted = unquote_quoted_string(left_part) + + # Check that only basic characters are present in a + # non-quoted display name. 
+ if not display_name_quoted: + bad_chars = { + safe_character_display(c) + for c in display_name + if (not ATEXT_RE.match(c) and c != ' ') or c == '.' + } + if bad_chars: + raise EmailSyntaxError("The display name contains invalid characters when not quoted: " + ", ".join(sorted(bad_chars)) + ".") - # Typical email addresses have a single @-sign, but the - # awkward "quoted string" local part form (RFC 5321 4.1.2) - # allows @-signs (and escaped quotes) to appear in the local - # part if the local part is quoted. If the address is quoted, - # split it at a non-escaped @-sign and unescape the escaping. - if m := QUOTED_LOCAL_PART_ADDR.match(email): - local_part, domain_part = m.groups() + # Check for other unsafe characters. + check_unsafe_chars(display_name, allow_space=True) - # Since backslash-escaping is no longer needed because - # the quotes are removed, remove backslash-escaping - # to return in the normalized form. - local_part = re.sub(r"\\(.)", "\\1", local_part) + # Remove the initial and trailing angle brackets. + addr_spec = right_part[1:].rstrip(">") - return local_part, domain_part, True + # Split the email address at the first unquoted @-sign. + local_part, domain_part = split_string_at_unquoted_special(addr_spec, ("@",)) + # Otherwise there is no display name. The left part is the local + # part and the right part is the domain. else: - # Split at the one and only at-sign. - parts = email.split('@') - if len(parts) != 2: - raise EmailSyntaxError("The email address is not valid. It must have exactly one @-sign.") - local_part, domain_part = parts - return local_part, domain_part, False + display_name = None + local_part, domain_part = left_part, right_part + + if domain_part.startswith("@"): + domain_part = domain_part[1:] + + # Unquote the local part if it is quoted. 
+ local_part, is_quoted_local_part = unquote_quoted_string(local_part) + + return display_name, local_part, domain_part, is_quoted_local_part def get_length_reason(addr, utf8=False, limit=EMAIL_MAX_LENGTH): @@ -215,7 +331,7 @@ def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_emp bad_chars = { safe_character_display(c) for c in local - if not ATEXT_INTL_RE.match(c) + if not ATEXT_INTL_DOT_RE.match(c) } if bad_chars: raise EmailSyntaxError("The email address contains invalid characters before the @-sign: " + ", ".join(sorted(bad_chars)) + ".") diff --git a/email_validator/validate_email.py b/email_validator/validate_email.py index dafc33c..ea6c9f3 100644 --- a/email_validator/validate_email.py +++ b/email_validator/validate_email.py @@ -21,6 +21,7 @@ def validate_email_sync_or_async( allow_empty_local: bool = False, allow_quoted_local: Optional[bool] = None, allow_domain_literal: Optional[bool] = None, + allow_display_name: Optional[bool] = None, check_deliverability: Optional[bool] = None, test_environment: Optional[bool] = None, globally_deliverable: Optional[bool] = None, @@ -35,7 +36,7 @@ def validate_email_sync_or_async( """ # Fill in default values of arguments. - from . import ALLOW_SMTPUTF8, ALLOW_QUOTED_LOCAL, ALLOW_DOMAIN_LITERAL, \ + from . 
import ALLOW_SMTPUTF8, ALLOW_QUOTED_LOCAL, ALLOW_DOMAIN_LITERAL, ALLOW_DISPLAY_NAME, \ GLOBALLY_DELIVERABLE, CHECK_DELIVERABILITY, TEST_ENVIRONMENT, DEFAULT_TIMEOUT if allow_smtputf8 is None: allow_smtputf8 = ALLOW_SMTPUTF8 @@ -43,6 +44,8 @@ def validate_email_sync_or_async( allow_quoted_local = ALLOW_QUOTED_LOCAL if allow_domain_literal is None: allow_domain_literal = ALLOW_DOMAIN_LITERAL + if allow_display_name is None: + allow_display_name = ALLOW_DISPLAY_NAME if check_deliverability is None: check_deliverability = CHECK_DELIVERABILITY if test_environment is None: @@ -61,17 +64,20 @@ def validate_email_sync_or_async( except ValueError as e: raise EmailSyntaxError("The email address is not valid ASCII.") from e - # Split the address into the local part (before the @-sign) - # and the domain part (after the @-sign). Normally, there - # is only one @-sign. But the awkward "quoted string" local - # part form (RFC 5321 4.1.2) allows @-signs in the local + # Split the address into the display name (or None), the local part + # (before the @-sign), and the domain part (after the @-sign). + # Normally, there is only one @-sign. But the awkward "quoted string" + # local part form (RFC 5321 4.1.2) allows @-signs in the local # part if the local part is quoted. - local_part, domain_part, is_quoted_local_part \ + display_name, local_part, domain_part, is_quoted_local_part \ = split_email(email) + if display_name is not None and not allow_display_name: + raise EmailSyntaxError("A display name and angle brackets around the email address are not permitted here.") # Collect return values in this instance. ret = ValidatedEmail() ret.original = email + ret.display_name = display_name # Validate the email address's local part syntax and get a normalized form. 
# If the original address was quoted and the decoded local part is a valid @@ -229,6 +235,7 @@ def validate_email_sync( allow_empty_local: bool = False, allow_quoted_local: Optional[bool] = None, allow_domain_literal: Optional[bool] = None, + allow_display_name: Optional[bool] = None, check_deliverability: Optional[bool] = None, test_environment: Optional[bool] = None, globally_deliverable: Optional[bool] = None, @@ -241,6 +248,7 @@ def validate_email_sync( allow_empty_local=allow_empty_local, allow_quoted_local=allow_quoted_local, allow_domain_literal=allow_domain_literal, + allow_display_name=allow_display_name, check_deliverability=check_deliverability, test_environment=test_environment, globally_deliverable=globally_deliverable, @@ -261,6 +269,7 @@ async def validate_email_async( allow_empty_local: bool = False, allow_quoted_local: Optional[bool] = None, allow_domain_literal: Optional[bool] = None, + allow_display_name: Optional[bool] = None, check_deliverability: Optional[bool] = None, test_environment: Optional[bool] = None, globally_deliverable: Optional[bool] = None, @@ -274,6 +283,7 @@ async def validate_email_async( allow_empty_local=allow_empty_local, allow_quoted_local=allow_quoted_local, allow_domain_literal=allow_domain_literal, + allow_display_name=allow_display_name, check_deliverability=check_deliverability, test_environment=test_environment, globally_deliverable=globally_deliverable, diff --git a/tests/test_syntax.py b/tests/test_syntax.py index 693d7da..65e3ec0 100644 --- a/tests/test_syntax.py +++ b/tests/test_syntax.py @@ -92,6 +92,45 @@ ascii_email='de-quoted.local.part@example.org' ), ), + ( + 'MyName ', + ValidatedEmail( + local_part='me', + ascii_local_part='me', + smtputf8=False, + ascii_domain='example.org', + domain='example.org', + normalized='me@example.org', + ascii_email='me@example.org', + display_name="MyName" + ), + ), + ( + 'My Name ', + ValidatedEmail( + local_part='me', + ascii_local_part='me', + smtputf8=False, + 
ascii_domain='example.org', + domain='example.org', + normalized='me@example.org', + ascii_email='me@example.org', + display_name="My Name" + ), + ), + ( + r'"My.\"Na\\me\".Is" <"me \" \\ me"@example.org>', + ValidatedEmail( + local_part=r'"me \" \\ me"', + ascii_local_part=r'"me \" \\ me"', + smtputf8=False, + ascii_domain='example.org', + domain='example.org', + normalized=r'"me \" \\ me"@example.org', + ascii_email=r'"me \" \\ me"@example.org', + display_name='My."Na\\me".Is' + ), + ), ], ) def test_email_valid(email_input, output): @@ -99,10 +138,11 @@ def test_email_valid(email_input, output): # for addresses that are valid but require SMTPUTF8. Check that it passes with # allow_smtput8 both on and off. emailinfo = validate_email(email_input, check_deliverability=False, allow_smtputf8=False, - allow_quoted_local=True) + allow_quoted_local=True, allow_display_name=True) + assert emailinfo == output assert validate_email(email_input, check_deliverability=False, allow_smtputf8=True, - allow_quoted_local=True) == output + allow_quoted_local=True, allow_display_name=True) == output # Check that the old `email` attribute to access the normalized form still works # if the DeprecationWarning is suppressed. 
@@ -363,6 +403,12 @@ def test_domain_literal(): ('me@[tag:text]', 'The part after the @-sign contains an invalid address literal tag in brackets.'), ('me@[untaggedtext]', 'The part after the @-sign in brackets is not an IPv4 address and has no address literal tag.'), ('me@[tag:invalid space]', 'The part after the @-sign contains invalid characters in brackets: SPACE.'), + ('', 'A display name and angle brackets around the email address are not permitted here.'), + ('DisplayName ', 'A display name and angle brackets around the email address are not permitted here.'), + ('Display Name ', 'A display name and angle brackets around the email address are not permitted here.'), + ('\"Display Name\" ', 'A display name and angle brackets around the email address are not permitted here.'), + ('Display.Name ', 'The display name contains invalid characters when not quoted: \'.\'.'), + ('\"Display.Name\" ', 'A display name and angle brackets around the email address are not permitted here.'), ], ) def test_email_invalid_syntax(email_input, error_msg):