From d6a5d4b6870ff34a8ef3799174d6fe69776445f1 Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Wed, 1 Mar 2023 09:19:49 -0500 Subject: [PATCH] Add more citations throughout the library --- email_validator/deliverability.py | 16 ++++--- email_validator/rfc_constants.py | 9 ++-- email_validator/syntax.py | 78 +++++++++++++++++++++++-------- 3 files changed, 72 insertions(+), 31 deletions(-) diff --git a/email_validator/deliverability.py b/email_validator/deliverability.py index 9616afc..19eee65 100644 --- a/email_validator/deliverability.py +++ b/email_validator/deliverability.py @@ -34,15 +34,16 @@ def validate_email_deliverability(domain, domain_i18n, timeout=None, dns_resolve try: try: - # Try resolving for MX records. + # Try resolving for MX records (RFC 5321 Section 5). response = dns_resolver.resolve(domain, "MX") # For reporting, put them in priority order and remove the trailing dot in the qnames. mtas = sorted([(r.preference, str(r.exchange).rstrip('.')) for r in response]) - # Remove "null MX" records from the list (their value is (0, ".") but we've stripped - # trailing dots, so the 'exchange' is just ""). If there was only a null MX record, - # email is not deliverable. + # RFC 7505: Null MX (0, ".") records signify the domain does not accept email. + # Remove null MX records from the mtas list (but we've stripped trailing dots, + # so the 'exchange' is just "") so we can check if there are no non-null MX + # records remaining. mtas = [(preference, exchange) for preference, exchange in mtas if exchange != ""] if len(mtas) == 0: # null MX only, if there were no MX records originally a NoAnswer exception would have occurred @@ -52,7 +53,7 @@ def validate_email_deliverability(domain, domain_i18n, timeout=None, dns_resolve deliverability_info["mx_fallback_type"] = None except dns.resolver.NoAnswer: - # If there was no MX record, fall back to an A record, as SMTP servers do. + # If there was no MX record, fall back to an A record. 
(RFC 5321 Section 5) try: response = dns_resolver.resolve(domain, "A") deliverability_info["mx"] = [(0, str(r)) for r in response] @@ -61,6 +62,7 @@ def validate_email_deliverability(domain, domain_i18n, timeout=None, dns_resolve except dns.resolver.NoAnswer: # If there was no A record, fall back to an AAAA record. + # (It's unclear if SMTP servers actually do this.) try: response = dns_resolver.resolve(domain, "AAAA") deliverability_info["mx"] = [(0, str(r)) for r in response] @@ -73,8 +75,8 @@ def validate_email_deliverability(domain, domain_i18n, timeout=None, dns_resolve # have been raised). raise EmailUndeliverableError("The domain name %s does not accept email." % domain_i18n) - # Check for a SPF reject-all record ("v=spf1 -all") which indicates - # no emails are sent from this domain (similar to a NULL MX record + # Check for a SPF (RFC 7208) reject-all record ("v=spf1 -all") which indicates + # no emails are sent from this domain (similar to a Null MX record # but for sending rather than receiving). In combination with the # absence of an MX record, this is probably a good sign that the # domain is not used for email. diff --git a/email_validator/rfc_constants.py b/email_validator/rfc_constants.py index 82bc726..bf21a9c 100644 --- a/email_validator/rfc_constants.py +++ b/email_validator/rfc_constants.py @@ -18,9 +18,8 @@ DOT_ATOM_TEXT_INTL = re.compile('[' + ATEXT_INTL + ']+(?:\\.[' + ATEXT_INTL + r']+)*\Z') # The domain part of the email address, after IDNA (ASCII) encoding, -# must also satisfy the requirements of RFC 952/RFC 1123 which restrict -# the allowed characters of hostnames further. The hyphen cannot be at -# the beginning or end of a *dot-atom component* of a hostname either. +# must also satisfy the requirements of RFC 952/RFC 1123 Section 2.1 which +# restrict the allowed characters of hostnames further. ATEXT_HOSTNAME_INTL = re.compile(r"[a-zA-Z0-9\-\." 
+ "\u0080-\U0010FFFF" + "]") HOSTNAME_LABEL = r'(?:(?:[a-zA-Z0-9][a-zA-Z0-9\-]*)?[a-zA-Z0-9])' DOT_ATOM_TEXT_HOSTNAME = re.compile(HOSTNAME_LABEL + r'(?:\.' + HOSTNAME_LABEL + r')*\Z') @@ -31,5 +30,5 @@ # explains the maximum length of an email address is 254 octets. EMAIL_MAX_LENGTH = 254 LOCAL_PART_MAX_LENGTH = 64 -DNS_LABEL_LENGTH_LIMIT = 63 # RFC 1035 2.3.1 -DOMAIN_MAX_LENGTH = 255 # RFC 1035 2.3.4 +DNS_LABEL_LENGTH_LIMIT = 63 # in "octets", RFC 1035 2.3.1 +DOMAIN_MAX_LENGTH = 255 # in "octets", RFC 1035 2.3.4 and RFC 5321 4.5.3.1.2 diff --git a/email_validator/syntax.py b/email_validator/syntax.py index 9d57ddb..cf7c304 100644 --- a/email_validator/syntax.py +++ b/email_validator/syntax.py @@ -46,7 +46,8 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals "smtputf8": False, } - # RFC 5321 4.5.3.1.1 + # Check the length of the local part by counting characters. + # (RFC 5321 4.5.3.1.1) # We're checking the number of characters here. If the local part # is ASCII-only, then that's the same as bytes (octets). If it's # internationalized, then the UTF-8 encoding may be longer, but @@ -57,6 +58,8 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals raise EmailSyntaxError("The email address is too long before the @-sign {}.".format(reason)) # Check for invalid characters. + # (RFC 2822 Section 3.2.4 / RFC 5322 Section 3.2.3, plus RFC 6531 section 3.3 + # if internationalized local parts are allowed) atext_re = re.compile('[.' + (ATEXT if not allow_smtputf8 else ATEXT_INTL) + ']') bad_chars = set( safe_character_display(c) @@ -67,9 +70,11 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals raise EmailSyntaxError("The email address contains invalid characters before the @-sign: " + ", ".join(sorted(bad_chars)) + ".") # Check for dot errors imposted by the dot-atom rule. 
+ # (RFC 2822 3.2.4) check_dot_atom(local, 'An email address cannot start with a {}.', 'An email address cannot have a {} immediately before the @-sign.', is_hostname=False) - # Check the local part against the regular expression for the older ASCII requirements. + # Check the local part against the non-internationalized regular expression. + # (RFC 2822 3.2.4) m = DOT_ATOM_TEXT.match(local) if m: # Return the local part unchanged and flag that SMTPUTF8 is not needed. @@ -82,6 +87,7 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals else: # The local part failed the ASCII check. Now try the extended internationalized requirements. # This should already be handled by the bad_chars and check_dot_atom tests above. + # It's the same pattern but with additional characters permitted. m = DOT_ATOM_TEXT_INTL.match(local) if not m: raise EmailSyntaxError("The email address contains invalid characters before the @-sign.") @@ -97,7 +103,8 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals # Check for unsafe characters. # Some of this may be redundant with the range U+0080 to U+10FFFF that is checked - # by DOT_ATOM_TEXT_INTL. + # by DOT_ATOM_TEXT_INTL. Other characters may be permitted by the email specs, but + # they may not be valid, safe, or sensible Unicode strings. check_unsafe_chars(local) # Try encoding to UTF-8. Failure is possible with some characters like @@ -117,25 +124,39 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals def check_unsafe_chars(s): + # Check for unsafe characters or characters that would make the string + # invalid or non-sensible Unicode. bad_chars = set() for i, c in enumerate(s): category = unicodedata.category(c) if category[0] in ("L", "N", "P", "S"): - # letters, numbers, punctuation, and symbols are permitted + # Letters, numbers, punctuation, and symbols are permitted. 
pass elif category[0] == "M": - # combining character in first position would combine with something - # outside of the email address if concatenated to the right, but are - # otherwise permitted + # Combining character in first position would combine with something + # outside of the email address if concatenated, so they are not safe. + # We also check if this occurs after the @-sign, which would not be + # sensible. if i == 0: bad_chars.add(c) - elif category[0] in ("Z", "C"): - # spaces and line/paragraph characters (Z) and - # control, format, surrogate, private use, and unassigned code points (C) + elif category[0] == "Z": + # Spaces and line/paragraph characters (Z) outside of the ASCII range + # are not specifically disallowed as far as I can tell, but they + # violate the spirit of the non-internationalized specification that + # email addresses do not contain spaces or line breaks when not quoted. + bad_chars.add(c) + elif category[0] == "C": + # Control, format, surrogate, private use, and unassigned code points (C) + # are all unsafe in various ways. Control and format characters can affect + # text rendering if the email address is concatenated with other text. + # Bidirectional format characters are unsafe, even if used properly, because + # they cause an email address to render as a different email address. + # Private use characters do not make sense for publicly deliverable + # email addresses. bad_chars.add(c) else: # All categories should be handled above, but in case there is something new - # in the future. + # to the Unicode specification in the future, reject all other categories. 
bad_chars.add(c) if bad_chars: raise EmailSyntaxError("The email address contains unsafe characters: " @@ -143,13 +164,16 @@ def check_unsafe_chars(s): def check_dot_atom(label, start_descr, end_descr, is_hostname): + # RFC 2822 3.2.4 if label.endswith("."): raise EmailSyntaxError(end_descr.format("period")) if label.startswith("."): raise EmailSyntaxError(start_descr.format("period")) if ".." in label: raise EmailSyntaxError("An email address cannot have two periods in a row.") + if is_hostname: + # RFC 952 if label.endswith("-"): raise EmailSyntaxError(end_descr.format("hyphen")) if label.startswith("-"): @@ -166,6 +190,7 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera raise EmailSyntaxError("There must be something after the @-sign.") # Check for invalid characters before normalization. + # (RFC 952 plus RFC 6531 section 3.3 for internationalized addresses) bad_chars = set( safe_character_display(c) for c in domain @@ -173,6 +198,11 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera ) if bad_chars: raise EmailSyntaxError("The part after the @-sign contains invalid characters: " + ", ".join(sorted(bad_chars)) + ".") + + # Check for unsafe characters. + # Some of this may be redundant with the range U+0080 to U+10FFFF that is checked + # by DOT_ATOM_TEXT_INTL. Other characters may be permitted by the email specs, but + # they may not be valid, safe, or sensible Unicode strings. check_unsafe_chars(domain) # Perform UTS-46 normalization, which includes casefolding, NFC normalization, @@ -191,9 +221,13 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera # Check that before we do IDNA encoding because the IDNA library gives # unfriendly errors for these cases, but after UTS-46 normalization because # it can insert periods and hyphens (from fullwidth characters). 
+ # (RFC 952, RFC 2822 3.2.4) check_dot_atom(domain, 'An email address cannot have a {} immediately after the @-sign.', 'An email address cannot end with a {}.', is_hostname=True) + + # Check for RFC 5890's invalid R-LDH labels, which are labels that start + # with two characters other than "xn" and two dashes. for label in domain.split("."): - if re.match(r"(?!xn)..--", label, re.I): # RFC 5890 invalid R-LDH labels + if re.match(r"(?!xn)..--", label, re.I): raise EmailSyntaxError("An email address cannot have two letters followed by two dashes immediately after the @-sign or after a period, except Punycode.") if DOT_ATOM_TEXT_HOSTNAME.match(domain): @@ -230,8 +264,9 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera if not m: raise EmailSyntaxError("The email address contains invalid characters after the @-sign after IDNA encoding.") - # RFC 5321 4.5.3.1.2 - # We're checking the number of bytes (octets) here, which can be much + # Check the length of the domain name in bytes. + # (RFC 1035 2.3.4 and RFC 5321 4.5.3.1.2) + # We're checking the number of bytes ("octets") here, which can be much # higher than the number of characters in internationalized domains, # on the assumption that the domain may be transmitted without SMTPUTF8 # as IDNA ASCII. (This is also checked by idna.encode, so this exception @@ -239,14 +274,19 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera if len(ascii_domain) > DOMAIN_MAX_LENGTH: reason = get_length_reason(ascii_domain, limit=DOMAIN_MAX_LENGTH) raise EmailSyntaxError("The email address is too long after the @-sign {}.".format(reason)) + + # Also check the label length limit. 
+ # (RFC 1035 2.3.1) for label in ascii_domain.split("."): if len(label) > DNS_LABEL_LENGTH_LIMIT: reason = get_length_reason(label, limit=DNS_LABEL_LENGTH_LIMIT) - raise EmailSyntaxError("On either side of the @-sign, periods cannot be separated by so many characters {}.".format(reason)) + raise EmailSyntaxError("After the @-sign, periods cannot be separated by so many characters {}.".format(reason)) if globally_deliverable: # All publicly deliverable addresses have domain named with at least - # one period, and we'll consider the lack of a period a syntax error + # one period, at least for gTLDs created since 2013 (per the ICANN Board + # New gTLD Program Committee, https://www.icann.org/en/announcements/details/new-gtld-dotless-domain-names-prohibited-30-8-2013-en). + # We'll consider the lack of a period a syntax error # since that will match people's sense of what an email address looks # like. We'll skip this in test environments to allow '@test' email # addresses. @@ -260,6 +300,7 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera # Check special-use and reserved domain names. # Some might fail DNS-based deliverability checks, but that # can be turned off, so we should fail them all sooner. + # See the references in __init__.py. from . import SPECIAL_USE_DOMAIN_NAMES for d in SPECIAL_USE_DOMAIN_NAMES: # See the note near the definition of SPECIAL_USE_DOMAIN_NAMES. @@ -274,15 +315,14 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera # but not be actual IDNA. For ASCII-only domains, the conversion out # of IDNA just gives the same thing back. # - # This gives us the canonical internationalized form of the domain, - # which we should use in all error messages. + # This gives us the canonical internationalized form of the domain. 
try: domain_i18n = idna.decode(ascii_domain.encode('ascii')) except idna.IDNAError as e: raise EmailSyntaxError("The part after the @-sign is not valid IDNA ({}).".format(str(e))) # Check for invalid characters after normalization. These - # should never arise. + # should never arise. See the similar checks above. bad_chars = set( safe_character_display(c) for c in domain