Skip to content

Commit

Permalink
Add more citations throughout the library
Browse files Browse the repository at this point in the history
  • Loading branch information
JoshData committed Mar 1, 2023
1 parent 210c661 commit d6a5d4b
Show file tree
Hide file tree
Showing 3 changed files with 72 additions and 31 deletions.
16 changes: 9 additions & 7 deletions email_validator/deliverability.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,15 +34,16 @@ def validate_email_deliverability(domain, domain_i18n, timeout=None, dns_resolve

try:
try:
# Try resolving for MX records.
# Try resolving for MX records (RFC 5321 Section 5).
response = dns_resolver.resolve(domain, "MX")

# For reporting, put them in priority order and remove the trailing dot in the qnames.
mtas = sorted([(r.preference, str(r.exchange).rstrip('.')) for r in response])

# Remove "null MX" records from the list (their value is (0, ".") but we've stripped
# trailing dots, so the 'exchange' is just ""). If there was only a null MX record,
# email is not deliverable.
# RFC 7505: Null MX (0, ".") records signify the domain does not accept email.
# Remove null MX records from the mtas list (we've already stripped trailing dots,
# so a null MX 'exchange' is just "") so we can check whether any non-null MX
# records remain.
mtas = [(preference, exchange) for preference, exchange in mtas
if exchange != ""]
if len(mtas) == 0: # null MX only, if there were no MX records originally a NoAnswer exception would have occurred
Expand All @@ -52,7 +53,7 @@ def validate_email_deliverability(domain, domain_i18n, timeout=None, dns_resolve
deliverability_info["mx_fallback_type"] = None

except dns.resolver.NoAnswer:
# If there was no MX record, fall back to an A record, as SMTP servers do.
# If there was no MX record, fall back to an A record. (RFC 5321 Section 5)
try:
response = dns_resolver.resolve(domain, "A")
deliverability_info["mx"] = [(0, str(r)) for r in response]
Expand All @@ -61,6 +62,7 @@ def validate_email_deliverability(domain, domain_i18n, timeout=None, dns_resolve
except dns.resolver.NoAnswer:

# If there was no A record, fall back to an AAAA record.
# (It's unclear if SMTP servers actually do this.)
try:
response = dns_resolver.resolve(domain, "AAAA")
deliverability_info["mx"] = [(0, str(r)) for r in response]
Expand All @@ -73,8 +75,8 @@ def validate_email_deliverability(domain, domain_i18n, timeout=None, dns_resolve
# have been raised).
raise EmailUndeliverableError("The domain name %s does not accept email." % domain_i18n)

# Check for a SPF reject-all record ("v=spf1 -all") which indicates
# no emails are sent from this domain (similar to a NULL MX record
# Check for a SPF (RFC 7208) reject-all record ("v=spf1 -all") which indicates
# no emails are sent from this domain (similar to a Null MX record
# but for sending rather than receiving). In combination with the
# absence of an MX record, this is probably a good sign that the
# domain is not used for email.
Expand Down
9 changes: 4 additions & 5 deletions email_validator/rfc_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,8 @@
DOT_ATOM_TEXT_INTL = re.compile('[' + ATEXT_INTL + ']+(?:\\.[' + ATEXT_INTL + r']+)*\Z')

# The domain part of the email address, after IDNA (ASCII) encoding,
# must also satisfy the requirements of RFC 952/RFC 1123 which restrict
# the allowed characters of hostnames further. The hyphen cannot be at
# the beginning or end of a *dot-atom component* of a hostname either.
# must also satisfy the requirements of RFC 952/RFC 1123 Section 2.1 which
# restrict the allowed characters of hostnames further.
ATEXT_HOSTNAME_INTL = re.compile(r"[a-zA-Z0-9\-\." + "\u0080-\U0010FFFF" + "]")
HOSTNAME_LABEL = r'(?:(?:[a-zA-Z0-9][a-zA-Z0-9\-]*)?[a-zA-Z0-9])'
DOT_ATOM_TEXT_HOSTNAME = re.compile(HOSTNAME_LABEL + r'(?:\.' + HOSTNAME_LABEL + r')*\Z')
Expand All @@ -31,5 +30,5 @@
# explains the maximum length of an email address is 254 octets.
EMAIL_MAX_LENGTH = 254
LOCAL_PART_MAX_LENGTH = 64
DNS_LABEL_LENGTH_LIMIT = 63 # RFC 1035 2.3.1
DOMAIN_MAX_LENGTH = 255 # RFC 1035 2.3.4
DNS_LABEL_LENGTH_LIMIT = 63 # in "octets", RFC 1035 2.3.1
DOMAIN_MAX_LENGTH = 255 # in "octets", RFC 1035 2.3.4 and RFC 5321 4.5.3.1.2
78 changes: 59 additions & 19 deletions email_validator/syntax.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,8 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals
"smtputf8": False,
}

# RFC 5321 4.5.3.1.1
# Check the length of the local part by counting characters.
# (RFC 5321 4.5.3.1.1)
# We're checking the number of characters here. If the local part
# is ASCII-only, then that's the same as bytes (octets). If it's
# internationalized, then the UTF-8 encoding may be longer, but
Expand All @@ -57,6 +58,8 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals
raise EmailSyntaxError("The email address is too long before the @-sign {}.".format(reason))

# Check for invalid characters.
# (RFC 2822 Section 3.2.4 / RFC 5322 Section 3.2.3, plus RFC 6531 section 3.3
# if internationalized local parts are allowed)
atext_re = re.compile('[.' + (ATEXT if not allow_smtputf8 else ATEXT_INTL) + ']')
bad_chars = set(
safe_character_display(c)
Expand All @@ -67,9 +70,11 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals
raise EmailSyntaxError("The email address contains invalid characters before the @-sign: " + ", ".join(sorted(bad_chars)) + ".")

# Check for dot errors imposed by the dot-atom rule.
# (RFC 2822 3.2.4)
check_dot_atom(local, 'An email address cannot start with a {}.', 'An email address cannot have a {} immediately before the @-sign.', is_hostname=False)

# Check the local part against the regular expression for the older ASCII requirements.
# Check the local part against the non-internationalized regular expression.
# (RFC 2822 3.2.4)
m = DOT_ATOM_TEXT.match(local)
if m:
# Return the local part unchanged and flag that SMTPUTF8 is not needed.
Expand All @@ -82,6 +87,7 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals
else:
# The local part failed the ASCII check. Now try the extended internationalized requirements.
# This should already be handled by the bad_chars and check_dot_atom tests above.
# It's the same pattern but with additional characters permitted.
m = DOT_ATOM_TEXT_INTL.match(local)
if not m:
raise EmailSyntaxError("The email address contains invalid characters before the @-sign.")
Expand All @@ -97,7 +103,8 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals

# Check for unsafe characters.
# Some of this may be redundant with the range U+0080 to U+10FFFF that is checked
# by DOT_ATOM_TEXT_INTL.
# by DOT_ATOM_TEXT_INTL. Other characters may be permitted by the email specs, but
# they may not be valid, safe, or sensible Unicode strings.
check_unsafe_chars(local)

# Try encoding to UTF-8. Failure is possible with some characters like
Expand All @@ -117,39 +124,56 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals


def check_unsafe_chars(s):
# Check for unsafe characters or characters that would make the string
# invalid or non-sensible Unicode.
bad_chars = set()
for i, c in enumerate(s):
category = unicodedata.category(c)
if category[0] in ("L", "N", "P", "S"):
# letters, numbers, punctuation, and symbols are permitted
# Letters, numbers, punctuation, and symbols are permitted.
pass
elif category[0] == "M":
# combining character in first position would combine with something
# outside of the email address if concatenated to the right, but are
# otherwise permitted
# Combining character in first position would combine with something
# outside of the email address if concatenated, so they are not safe.
# We also check if this occurs after the @-sign, which would not be
# sensible.
if i == 0:
bad_chars.add(c)
elif category[0] in ("Z", "C"):
# spaces and line/paragraph characters (Z) and
# control, format, surrogate, private use, and unassigned code points (C)
elif category[0] == "Z":
# Spaces and line/paragraph characters (Z) outside of the ASCII range
# are not specifically disallowed as far as I can tell, but they
# violate the spirit of the non-internationalized specification that
# email addresses do not contain spaces or line breaks when not quoted.
bad_chars.add(c)
elif category[0] == "C":
# Control, format, surrogate, private use, and unassigned code points (C)
# are all unsafe in various ways. Control and format characters can affect
# text rendering if the email address is concatenated with other text.
# Bidirectional format characters are unsafe, even if used properly, because
# they cause an email address to render as a different email address.
# Private use characters do not make sense for publicly deliverable
# email addresses.
bad_chars.add(c)
else:
# All categories should be handled above, but in case there is something new
# in the future.
# to the Unicode specification in the future, reject all other categories.
bad_chars.add(c)
if bad_chars:
raise EmailSyntaxError("The email address contains unsafe characters: "
+ ", ".join(safe_character_display(c) for c in sorted(bad_chars)) + ".")


def check_dot_atom(label, start_descr, end_descr, is_hostname):
# RFC 2822 3.2.4
if label.endswith("."):
raise EmailSyntaxError(end_descr.format("period"))
if label.startswith("."):
raise EmailSyntaxError(start_descr.format("period"))
if ".." in label:
raise EmailSyntaxError("An email address cannot have two periods in a row.")

if is_hostname:
# RFC 952
if label.endswith("-"):
raise EmailSyntaxError(end_descr.format("hyphen"))
if label.startswith("-"):
Expand All @@ -166,13 +190,19 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera
raise EmailSyntaxError("There must be something after the @-sign.")

# Check for invalid characters before normalization.
# (RFC 952 plus RFC 6531 section 3.3 for internationalized addresses)
bad_chars = set(
safe_character_display(c)
for c in domain
if not ATEXT_HOSTNAME_INTL.match(c)
)
if bad_chars:
raise EmailSyntaxError("The part after the @-sign contains invalid characters: " + ", ".join(sorted(bad_chars)) + ".")

# Check for unsafe characters.
# Some of this may be redundant with the range U+0080 to U+10FFFF that is checked
# by ATEXT_HOSTNAME_INTL above. Other characters may be permitted by the email specs, but
# they may not be valid, safe, or sensible Unicode strings.
check_unsafe_chars(domain)

# Perform UTS-46 normalization, which includes casefolding, NFC normalization,
Expand All @@ -191,9 +221,13 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera
# Check that before we do IDNA encoding because the IDNA library gives
# unfriendly errors for these cases, but after UTS-46 normalization because
# it can insert periods and hyphens (from fullwidth characters).
# (RFC 952, RFC 2822 3.2.4)
check_dot_atom(domain, 'An email address cannot have a {} immediately after the @-sign.', 'An email address cannot end with a {}.', is_hostname=True)

# Check for RFC 5890's invalid R-LDH labels, which are labels that start
# with two characters other than "xn" and two dashes.
for label in domain.split("."):
if re.match(r"(?!xn)..--", label, re.I): # RFC 5890 invalid R-LDH labels
if re.match(r"(?!xn)..--", label, re.I):
raise EmailSyntaxError("An email address cannot have two letters followed by two dashes immediately after the @-sign or after a period, except Punycode.")

if DOT_ATOM_TEXT_HOSTNAME.match(domain):
Expand Down Expand Up @@ -230,23 +264,29 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera
if not m:
raise EmailSyntaxError("The email address contains invalid characters after the @-sign after IDNA encoding.")

# RFC 5321 4.5.3.1.2
# We're checking the number of bytes (octets) here, which can be much
# Check the length of the domain name in bytes.
# (RFC 1035 2.3.4 and RFC 5321 4.5.3.1.2)
# We're checking the number of bytes ("octets") here, which can be much
# higher than the number of characters in internationalized domains,
# on the assumption that the domain may be transmitted without SMTPUTF8
# as IDNA ASCII. (This is also checked by idna.encode, so this exception
# is never reached for internationalized domains.)
if len(ascii_domain) > DOMAIN_MAX_LENGTH:
reason = get_length_reason(ascii_domain, limit=DOMAIN_MAX_LENGTH)
raise EmailSyntaxError("The email address is too long after the @-sign {}.".format(reason))

# Also check the label length limit.
# (RFC 1035 2.3.1)
for label in ascii_domain.split("."):
if len(label) > DNS_LABEL_LENGTH_LIMIT:
reason = get_length_reason(label, limit=DNS_LABEL_LENGTH_LIMIT)
raise EmailSyntaxError("On either side of the @-sign, periods cannot be separated by so many characters {}.".format(reason))
raise EmailSyntaxError("After the @-sign, periods cannot be separated by so many characters {}.".format(reason))

if globally_deliverable:
# All publicly deliverable addresses have domain names with at least
# one period, and we'll consider the lack of a period a syntax error
# one period, at least for gTLDs created since 2013 (per the ICANN Board
# New gTLD Program Committee, https://www.icann.org/en/announcements/details/new-gtld-dotless-domain-names-prohibited-30-8-2013-en).
# We'll consider the lack of a period a syntax error
# since that will match people's sense of what an email address looks
# like. We'll skip this in test environments to allow '@test' email
# addresses.
Expand All @@ -260,6 +300,7 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera
# Check special-use and reserved domain names.
# Some might fail DNS-based deliverability checks, but that
# can be turned off, so we should fail them all sooner.
# See the references in __init__.py.
from . import SPECIAL_USE_DOMAIN_NAMES
for d in SPECIAL_USE_DOMAIN_NAMES:
# See the note near the definition of SPECIAL_USE_DOMAIN_NAMES.
Expand All @@ -274,15 +315,14 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera
# but not be actual IDNA. For ASCII-only domains, the conversion out
# of IDNA just gives the same thing back.
#
# This gives us the canonical internationalized form of the domain,
# which we should use in all error messages.
# This gives us the canonical internationalized form of the domain.
try:
domain_i18n = idna.decode(ascii_domain.encode('ascii'))
except idna.IDNAError as e:
raise EmailSyntaxError("The part after the @-sign is not valid IDNA ({}).".format(str(e)))

# Check for invalid characters after normalization. These
# should never arise.
# should never arise. See the similar checks above.
bad_chars = set(
safe_character_display(c)
for c in domain
Expand Down

0 comments on commit d6a5d4b

Please sign in to comment.