Skip to content

Commit

Permalink
Improve some code comments, refactor some code, mention length checks…
Browse files Browse the repository at this point in the history
… in the README
  • Loading branch information
JoshData committed Oct 19, 2023
1 parent 814b488 commit 786defc
Show file tree
Hide file tree
Showing 4 changed files with 111 additions and 77 deletions.
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,14 @@ they are unnecessary. For IPv6 domain literals, the IPv6 address is
normalized to condensed form. [RFC 2142](https://datatracker.ietf.org/doc/html/rfc2142)
also requires lowercase normalization for some specific mailbox names like `postmaster@`.

### Length checks

This library checks that the email address does not exceed the maximum
allowed length. The check is performed on the normalized form of the
address, which might differ from the string provided by a user. If you
send email to the original string rather than the normalized address, the
email might be rejected because the original string could be too long.

Examples
--------

Expand Down
83 changes: 76 additions & 7 deletions email_validator/syntax.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from .exceptions_types import EmailSyntaxError
from .rfc_constants import EMAIL_MAX_LENGTH, LOCAL_PART_MAX_LENGTH, DOMAIN_MAX_LENGTH, \
DOT_ATOM_TEXT, DOT_ATOM_TEXT_INTL, ATEXT_RE, ATEXT_INTL_RE, ATEXT_HOSTNAME_INTL, QTEXT_INTL, \
DNS_LABEL_LENGTH_LIMIT, DOT_ATOM_TEXT_HOSTNAME, DOMAIN_NAME_REGEX, DOMAIN_LITERAL_CHARS
DNS_LABEL_LENGTH_LIMIT, DOT_ATOM_TEXT_HOSTNAME, DOMAIN_NAME_REGEX, DOMAIN_LITERAL_CHARS, \
QUOTED_LOCAL_PART_ADDR

import re
import unicodedata
Expand All @@ -10,6 +11,35 @@
from typing import Optional


def split_email(email):
    """Split an email address into its local part and domain part.

    Typical email addresses have a single @-sign, but the awkward
    "quoted string" local part form (RFC 5321 4.1.2) allows @-signs
    (and escaped quotes) to appear in the local part if the local
    part is quoted.

    Returns a three-tuple ``(local_part, domain_part, is_quoted)``.
    When the local part was quoted, it is returned without the
    surrounding quotes and with backslash-escaping removed, and
    ``is_quoted`` is True.

    Raises EmailSyntaxError if the address is not in the quoted form
    and does not contain exactly one @-sign.
    """
    quoted_match = QUOTED_LOCAL_PART_ADDR.match(email)
    if quoted_match:
        # The address is quoted: the pattern splits it at the
        # non-escaped @-sign that follows the quoted string.
        local_part, domain_part = quoted_match.groups()

        # Since backslash-escaping is no longer needed because
        # the quotes are removed, remove backslash-escaping
        # to return in the normalized form. (The module-level
        # `re` import is used; no local import is needed.)
        local_part = re.sub(r"\\(.)", r"\1", local_part)

        return local_part, domain_part, True

    # Split at the one and only at-sign.
    parts = email.split('@')
    if len(parts) != 2:
        raise EmailSyntaxError("The email address is not valid. It must have exactly one @-sign.")
    local_part, domain_part = parts
    return local_part, domain_part, False


def get_length_reason(addr, utf8=False, limit=EMAIL_MAX_LENGTH):
"""Helper function to return an error message related to invalid length."""
diff = len(addr) - limit
Expand Down Expand Up @@ -367,7 +397,7 @@ def validate_email_domain_name(domain, test_environment=False, globally_delivera
raise EmailSyntaxError(f"After the @-sign, periods cannot be separated by so many characters {reason}.")

if globally_deliverable:
# All publicly deliverable addresses have domain named with at least
# All publicly deliverable addresses have domain names with at least
# one period, at least for gTLDs created since 2013 (per the ICANN Board
# New gTLD Program Committee, https://www.icann.org/en/announcements/details/new-gtld-dotless-domain-names-prohibited-30-8-2013-en).
# We'll consider the lack of a period a syntax error
Expand Down Expand Up @@ -428,7 +458,48 @@ def validate_email_domain_name(domain, test_environment=False, globally_delivera
}


def validate_email_domain_literal(domain_literal, allow_domain_literal=False):
def validate_email_length(addrinfo):
    """Raise EmailSyntaxError if the address in *addrinfo* is too long.

    Reads ``addrinfo.ascii_email`` and ``addrinfo.normalized`` and
    compares their lengths against EMAIL_MAX_LENGTH. Returns None when
    both checks pass.
    """
    # Check 1: the ASCII form, when there is one.
    #
    # If the email address has an ASCII representation, we assume it may
    # be transmitted in ASCII (SMTPUTF8 cannot be assumed on every hop to
    # the destination), so the limit applies to ASCII characters, which
    # is the same as octets. The internationalized form may have many
    # fewer characters (IDNA ASCII is verbose) and could be under the
    # limit in Unicode characters, and the number of octets over the
    # limit need not equal the number of characters over the limit. So
    # for an internationalized address no simple explanation of why the
    # address is too long can be given.
    ascii_form = addrinfo.ascii_email
    if ascii_form and len(ascii_form) > EMAIL_MAX_LENGTH:
        normalized = addrinfo.normalized
        if ascii_form == normalized:
            detail = get_length_reason(ascii_form)
        elif len(normalized) > EMAIL_MAX_LENGTH:
            # With more characters than the limit, the ASCII form
            # is definitely going to be too long.
            detail = get_length_reason(normalized, utf8=True)
        else:
            detail = "(when converted to IDNA ASCII)"
        raise EmailSyntaxError(f"The email address is too long {detail}.")

    # Check 2: the UTF-8 encoding of the normalized form (i.e. not IDNA
    # ASCII and not Unicode characters) must also fit within the limit.
    #
    # If the address is transmitted using SMTPUTF8, the length limit
    # probably applies to the UTF-8 encoded octets. When an ASCII form
    # exists and differs from the internationalized form, the
    # internationalized form is not expected to be longer, so the ASCII
    # check above would suffice. When there is no ASCII form, the UTF-8
    # encoding has to be checked; it could be up to about four times
    # longer than the number of characters.
    #
    # See also the length checks on the local part and the domain.
    normalized = addrinfo.normalized
    if len(normalized.encode("utf8")) <= EMAIL_MAX_LENGTH:
        return

    # With more characters than the limit, the UTF-8 encoding is
    # definitely too long, so a character-based reason can be given.
    detail = (
        get_length_reason(normalized, utf8=True)
        if len(normalized) > EMAIL_MAX_LENGTH
        else "(when encoded in bytes)"
    )
    raise EmailSyntaxError(f"The email address is too long {detail}.")


def validate_email_domain_literal(domain_literal):
# This is obscure domain-literal syntax. Parse it and return
# a compressed/normalized address.
# RFC 5321 4.1.3 and RFC 5322 3.4.1.
Expand All @@ -441,8 +512,6 @@ def validate_email_domain_literal(domain_literal, allow_domain_literal=False):
addr = ipaddress.IPv4Address(domain_literal)
except ValueError as e:
raise EmailSyntaxError(f"The address in brackets after the @-sign is not valid: It is not an IPv4 address ({e}) or is missing an address literal tag.")
if not allow_domain_literal:
raise EmailSyntaxError("A bracketed IPv4 address after the @-sign is not allowed here.")

# Return the IPv4Address object and the domain back unchanged.
return {
Expand All @@ -456,8 +525,6 @@ def validate_email_domain_literal(domain_literal, allow_domain_literal=False):
addr = ipaddress.IPv6Address(domain_literal[5:])
except ValueError as e:
raise EmailSyntaxError(f"The IPv6 address in brackets after the @-sign is not valid ({e}).")
if not allow_domain_literal:
raise EmailSyntaxError("A bracketed IPv6 address after the @-sign is not allowed here.")

# Return the IPv6Address object and construct a normalized
# domain literal.
Expand All @@ -466,6 +533,8 @@ def validate_email_domain_literal(domain_literal, allow_domain_literal=False):
"domain": f"[IPv6:{addr.compressed}]",
}

# Nothing else is valid.

if ":" not in domain_literal:
raise EmailSyntaxError("The part after the @-sign in brackets is not an IPv4 address and has no address literal tag.")

Expand Down
93 changes: 25 additions & 68 deletions email_validator/validate_email.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from typing import Optional, Union

from .exceptions_types import EmailSyntaxError, ValidatedEmail
from .syntax import validate_email_local_part, validate_email_domain_name, validate_email_domain_literal, get_length_reason
from .rfc_constants import EMAIL_MAX_LENGTH, QUOTED_LOCAL_PART_ADDR, CASE_INSENSITIVE_MAILBOX_NAMES
from .syntax import split_email, validate_email_local_part, validate_email_domain_name, validate_email_domain_literal, validate_email_length
from .rfc_constants import CASE_INSENSITIVE_MAILBOX_NAMES


def validate_email(
Expand All @@ -20,9 +20,9 @@ def validate_email(
dns_resolver: Optional[object] = None
) -> ValidatedEmail:
"""
Validates an email address, raising an EmailNotValidError if the address is not valid or returning a dict of
information when the address is valid. The email argument can be a str or a bytes instance,
but if bytes it must be ASCII-only. This is the main method of this library.
Given an email address, and some options, returns a ValidatedEmail instance
with information about the address if it is valid or, if the address is not
valid, raises an EmailNotValidError. This is the main function of the module.
"""

# Fill in default values of arguments.
Expand Down Expand Up @@ -52,26 +52,13 @@ def validate_email(
except ValueError:
raise EmailSyntaxError("The email address is not valid ASCII.")

# Typical email addresses have a single @-sign, but the
# awkward "quoted string" local part form (RFC 5321 4.1.2)
# allows @-signs (and escaped quotes) to appear in the local
# part if the local part is quoted. If the address is quoted,
# split it at a non-escaped @-sign and unescape the escaping.
quoted_local_part = False
if m := QUOTED_LOCAL_PART_ADDR.match(email):
quoted_local_part = True
local_part, domain_part = m.groups()

# Remove backslashes.
import re
local_part = re.sub(r"\\(.)", "\\1", local_part)

else:
# Split at the one and only at-sign.
parts = email.split('@')
if len(parts) != 2:
raise EmailSyntaxError("The email address is not valid. It must have exactly one @-sign.")
local_part, domain_part = parts
# Split the address into the local part (before the @-sign)
# and the domain part (after the @-sign). Normally, there
# is only one @-sign. But the awkward "quoted string" local
# part form (RFC 5321 4.1.2) allows @-signs in the local
# part if the local part is quoted.
local_part, domain_part, is_quoted_local_part \
= split_email(email)

# Collect return values in this instance.
ret = ValidatedEmail()
Expand All @@ -84,13 +71,17 @@ def validate_email(
local_part_info = validate_email_local_part(local_part,
allow_smtputf8=allow_smtputf8,
allow_empty_local=allow_empty_local,
quoted_local_part=quoted_local_part)
if quoted_local_part and not allow_quoted_local:
raise EmailSyntaxError("Quoting the part before the @-sign is not allowed here.")
quoted_local_part=is_quoted_local_part)
ret.local_part = local_part_info["local_part"]
ret.ascii_local_part = local_part_info["ascii_local_part"]
ret.smtputf8 = local_part_info["smtputf8"]

# If a quoted local part isn't allowed but is present, now raise an exception.
# This is done after any exceptions raised by validate_email_local_part so
# that mandatory checks have highest precedence.
if is_quoted_local_part and not allow_quoted_local:
raise EmailSyntaxError("Quoting the part before the @-sign is not allowed here.")

# Some local parts are required to be case-insensitive, so we should normalize
# to lowercase.
# RFC 2142
Expand All @@ -107,7 +98,9 @@ def validate_email(

elif domain_part.startswith("[") and domain_part.endswith("]"):
# Parse the address in the domain literal and get back a normalized domain.
domain_part_info = validate_email_domain_literal(domain_part[1:-1], allow_domain_literal=allow_domain_literal)
domain_part_info = validate_email_domain_literal(domain_part[1:-1])
if not allow_domain_literal:
raise EmailSyntaxError("A bracketed IP address after the @-sign is not allowed here.")
ret.domain = domain_part_info["domain"]
ret.ascii_domain = domain_part_info["domain"] # Domain literals are always ASCII.
ret.domain_address = domain_part_info["domain_address"]
Expand All @@ -131,48 +124,12 @@ def validate_email(
else:
ret.ascii_email = None

# If the email address has an ASCII representation, then we assume it may be
# transmitted in ASCII (we can't assume SMTPUTF8 will be used on all hops to
# the destination) and the length limit applies to ASCII characters (which is
# the same as octets). The number of characters in the internationalized form
# may be many fewer (because IDNA ASCII is verbose) and could be less than 254
# Unicode characters, and of course the number of octets over the limit may
# not be the number of characters over the limit, so if the email address is
# internationalized, we can't give any simple information about why the address
# is too long.
#
# In addition, check that the UTF-8 encoding (i.e. not IDNA ASCII and not
# Unicode characters) is at most 254 octets. If the addres is transmitted using
# SMTPUTF8, then the length limit probably applies to the UTF-8 encoded octets.
# If the email address has an ASCII form that differs from its internationalized
# form, I don't think the internationalized form can be longer, and so the ASCII
# form length check would be sufficient. If there is no ASCII form, then we have
# to check the UTF-8 encoding. The UTF-8 encoding could be up to about four times
# longer than the number of characters.
#
# See the length checks on the local part and the domain.
if ret.ascii_email and len(ret.ascii_email) > EMAIL_MAX_LENGTH:
if ret.ascii_email == ret.normalized:
reason = get_length_reason(ret.ascii_email)
elif len(ret.normalized) > EMAIL_MAX_LENGTH:
# If there are more than 254 characters, then the ASCII
# form is definitely going to be too long.
reason = get_length_reason(ret.normalized, utf8=True)
else:
reason = "(when converted to IDNA ASCII)"
raise EmailSyntaxError(f"The email address is too long {reason}.")
if len(ret.normalized.encode("utf8")) > EMAIL_MAX_LENGTH:
if len(ret.normalized) > EMAIL_MAX_LENGTH:
# If there are more than 254 characters, then the UTF-8
# encoding is definitely going to be too long.
reason = get_length_reason(ret.normalized, utf8=True)
else:
reason = "(when encoded in bytes)"
raise EmailSyntaxError(f"The email address is too long {reason}.")
# Check the length of the address.
validate_email_length(ret)

if check_deliverability and not test_environment:
# Validate the email address's deliverability using DNS
# and update the return dict with metadata.
# and update the returned ValidatedEmail object with metadata.

if is_domain_literal:
# There is nothing to check --- skip deliverability checks.
Expand Down
4 changes: 2 additions & 2 deletions tests/test_syntax.py
Original file line number Diff line number Diff line change
Expand Up @@ -330,9 +330,9 @@ def test_domain_literal():
('[email protected]', 'The part after the @-sign is not valid IDNA (Invalid A-label).'),
('[email protected]', 'An email address cannot have two letters followed by two dashes immediately after the @-sign or after a period, except Punycode.'),
('me@yy--0.tld', 'An email address cannot have two letters followed by two dashes immediately after the @-sign or after a period, except Punycode.'),
('me@[127.0.0.1]', 'A bracketed IPv4 address after the @-sign is not allowed here.'),
('me@[127.0.0.1]', 'A bracketed IP address after the @-sign is not allowed here.'),
('me@[127.0.0.999]', 'The address in brackets after the @-sign is not valid: It is not an IPv4 address (Octet 999 (> 255) not permitted in \'127.0.0.999\') or is missing an address literal tag.'),
('me@[IPv6:::1]', 'A bracketed IPv6 address after the @-sign is not allowed here.'),
('me@[IPv6:::1]', 'A bracketed IP address after the @-sign is not allowed here.'),
('me@[IPv6:::G]', 'The IPv6 address in brackets after the @-sign is not valid (Only hex digits permitted in \'G\' in \'::G\').'),
('me@[tag:text]', 'The part after the @-sign contains an invalid address literal tag in brackets.'),
('me@[untaggedtext]', 'The part after the @-sign in brackets is not an IPv4 address and has no address literal tag.'),
Expand Down

0 comments on commit 786defc

Please sign in to comment.