Improve some code comments, refactor some code, mention length checks…

… in the README
JoshData · Oct 19, 2023 · 786defc · 786defc
1 parent 814b488
commit 786defc
Show file tree

Hide file tree

Showing 4 changed files with 111 additions and 77 deletions.
diff --git a/README.md b/README.md
@@ -315,6 +315,14 @@ they are unnecessary. For IPv6 domain literals, the IPv6 address is
 normalized to condensed form. [RFC 2142](https://datatracker.ietf.org/doc/html/rfc2142)
 also requires lowercase normalization for some specific mailbox names like `postmaster@`.
 
+### Length checks
+
+This library checks that the email address does not exceed the maximum
+permitted length. The check is performed on the normalized form of the
+address, which might differ from the string provided by a user. If you
+send email to the original string rather than the normalized address, the
+email might be rejected because the original string could be too long.
+
 Examples
 --------
 

diff --git a/email_validator/syntax.py b/email_validator/syntax.py
@@ -1,7 +1,8 @@
 from .exceptions_types import EmailSyntaxError
 from .rfc_constants import EMAIL_MAX_LENGTH, LOCAL_PART_MAX_LENGTH, DOMAIN_MAX_LENGTH, \
     DOT_ATOM_TEXT, DOT_ATOM_TEXT_INTL, ATEXT_RE, ATEXT_INTL_RE, ATEXT_HOSTNAME_INTL, QTEXT_INTL, \
-    DNS_LABEL_LENGTH_LIMIT, DOT_ATOM_TEXT_HOSTNAME, DOMAIN_NAME_REGEX, DOMAIN_LITERAL_CHARS
+    DNS_LABEL_LENGTH_LIMIT, DOT_ATOM_TEXT_HOSTNAME, DOMAIN_NAME_REGEX, DOMAIN_LITERAL_CHARS, \
+    QUOTED_LOCAL_PART_ADDR
 
 import re
 import unicodedata
@@ -10,6 +11,35 @@
 from typing import Optional
 
 
+def split_email(email):
+    # Return a three-tuple of the local part, the domain part, and
+    # whether the local part was quoted.
+
+    # Typical email addresses have a single @-sign, but the
+    # awkward "quoted string" local part form (RFC 5321 4.1.2)
+    # allows @-signs (and escaped quotes) to appear in the local
+    # part if the local part is quoted. If the address is quoted,
+    # split it at a non-escaped @-sign and unescape the escaping.
+    if m := QUOTED_LOCAL_PART_ADDR.match(email):
+        local_part, domain_part = m.groups()
+
+        # The surrounding quotes have been removed, so backslash-
+        # escaping is no longer needed: strip it so the local
+        # part is returned in its normalized form.
+        import re
+        local_part = re.sub(r"\\(.)", "\\1", local_part)
+
+        return local_part, domain_part, True
+
+    else:
+        # Split at the one and only at-sign.
+        parts = email.split('@')
+        if len(parts) != 2:
+            raise EmailSyntaxError("The email address is not valid. It must have exactly one @-sign.")
+        local_part, domain_part = parts
+        return local_part, domain_part, False
+
+
 def get_length_reason(addr, utf8=False, limit=EMAIL_MAX_LENGTH):
     """Helper function to return an error message related to invalid length."""
     diff = len(addr) - limit
@@ -367,7 +397,7 @@ def validate_email_domain_name(domain, test_environment=False, globally_delivera
             raise EmailSyntaxError(f"After the @-sign, periods cannot be separated by so many characters {reason}.")
 
     if globally_deliverable:
-        # All publicly deliverable addresses have domain named with at least
+        # All publicly deliverable addresses have domain names with at least
         # one period, at least for gTLDs created since 2013 (per the ICANN Board
         # New gTLD Program Committee, https://www.icann.org/en/announcements/details/new-gtld-dotless-domain-names-prohibited-30-8-2013-en).
         # We'll consider the lack of a period a syntax error
@@ -428,7 +458,48 @@ def validate_email_domain_name(domain, test_environment=False, globally_delivera
     }
 
 
-def validate_email_domain_literal(domain_literal, allow_domain_literal=False):
+def validate_email_length(addrinfo):
+    # If the email address has an ASCII representation, then we assume it may be
+    # transmitted in ASCII (we can't assume SMTPUTF8 will be used on all hops to
+    # the destination) and the length limit applies to ASCII characters (which is
+    # the same as octets). The number of characters in the internationalized form
+    # may be many fewer (because IDNA ASCII is verbose) and could be less than 254
+    # Unicode characters, and of course the number of octets over the limit may
+    # not be the number of characters over the limit, so if the email address is
+    # internationalized, we can't give any simple information about why the address
+    # is too long.
+    if addrinfo.ascii_email and len(addrinfo.ascii_email) > EMAIL_MAX_LENGTH:
+        if addrinfo.ascii_email == addrinfo.normalized:
+            reason = get_length_reason(addrinfo.ascii_email)
+        elif len(addrinfo.normalized) > EMAIL_MAX_LENGTH:
+            # If there are more than 254 characters, then the ASCII
+            # form is definitely going to be too long.
+            reason = get_length_reason(addrinfo.normalized, utf8=True)
+        else:
+            reason = "(when converted to IDNA ASCII)"
+        raise EmailSyntaxError(f"The email address is too long {reason}.")
+
+    # In addition, check that the UTF-8 encoding (i.e. not IDNA ASCII and not
+    # Unicode characters) is at most 254 octets. If the address is transmitted using
+    # SMTPUTF8, then the length limit probably applies to the UTF-8 encoded octets.
+    # If the email address has an ASCII form that differs from its internationalized
+    # form, I don't think the internationalized form can be longer, and so the ASCII
+    # form length check would be sufficient. If there is no ASCII form, then we have
+    # to check the UTF-8 encoding. The UTF-8 encoding could be up to about four times
+    # longer than the number of characters.
+    #
+    # See the length checks on the local part and the domain.
+    if len(addrinfo.normalized.encode("utf8")) > EMAIL_MAX_LENGTH:
+        if len(addrinfo.normalized) > EMAIL_MAX_LENGTH:
+            # If there are more than 254 characters, then the UTF-8
+            # encoding is definitely going to be too long.
+            reason = get_length_reason(addrinfo.normalized, utf8=True)
+        else:
+            reason = "(when encoded in bytes)"
+        raise EmailSyntaxError(f"The email address is too long {reason}.")
+
+
+def validate_email_domain_literal(domain_literal):
     # This is obscure domain-literal syntax. Parse it and return
     # a compressed/normalized address.
     # RFC 5321 4.1.3 and RFC 5322 3.4.1.
@@ -441,8 +512,6 @@ def validate_email_domain_literal(domain_literal, allow_domain_literal=False):
             addr = ipaddress.IPv4Address(domain_literal)
         except ValueError as e:
             raise EmailSyntaxError(f"The address in brackets after the @-sign is not valid: It is not an IPv4 address ({e}) or is missing an address literal tag.")
-        if not allow_domain_literal:
-            raise EmailSyntaxError("A bracketed IPv4 address after the @-sign is not allowed here.")
 
         # Return the IPv4Address object and the domain back unchanged.
         return {
@@ -456,8 +525,6 @@ def validate_email_domain_literal(domain_literal, allow_domain_literal=False):
             addr = ipaddress.IPv6Address(domain_literal[5:])
         except ValueError as e:
             raise EmailSyntaxError(f"The IPv6 address in brackets after the @-sign is not valid ({e}).")
-        if not allow_domain_literal:
-            raise EmailSyntaxError("A bracketed IPv6 address after the @-sign is not allowed here.")
 
         # Return the IPv6Address object and construct a normalized
         # domain literal.
@@ -466,6 +533,8 @@ def validate_email_domain_literal(domain_literal, allow_domain_literal=False):
             "domain": f"[IPv6:{addr.compressed}]",
         }
 
+    # Nothing else is valid.
+
     if ":" not in domain_literal:
         raise EmailSyntaxError("The part after the @-sign in brackets is not an IPv4 address and has no address literal tag.")
 

diff --git a/email_validator/validate_email.py b/email_validator/validate_email.py
@@ -1,8 +1,8 @@
 from typing import Optional, Union
 
 from .exceptions_types import EmailSyntaxError, ValidatedEmail
-from .syntax import validate_email_local_part, validate_email_domain_name, validate_email_domain_literal, get_length_reason
-from .rfc_constants import EMAIL_MAX_LENGTH, QUOTED_LOCAL_PART_ADDR, CASE_INSENSITIVE_MAILBOX_NAMES
+from .syntax import split_email, validate_email_local_part, validate_email_domain_name, validate_email_domain_literal, validate_email_length
+from .rfc_constants import CASE_INSENSITIVE_MAILBOX_NAMES
 
 
 def validate_email(
@@ -20,9 +20,9 @@ def validate_email(
     dns_resolver: Optional[object] = None
 ) -> ValidatedEmail:
     """
-    Validates an email address, raising an EmailNotValidError if the address is not valid or returning a dict of
-    information when the address is valid. The email argument can be a str or a bytes instance,
-    but if bytes it must be ASCII-only. This is the main method of this library.
+    Given an email address and some options, returns a ValidatedEmail instance
+    with information about the address if it is valid or, if the address is not
+    valid, raises an EmailNotValidError. This is the main function of the module.
     """
 
     # Fill in default values of arguments.
@@ -52,26 +52,13 @@ def validate_email(
         except ValueError:
             raise EmailSyntaxError("The email address is not valid ASCII.")
 
-    # Typical email addresses have a single @-sign, but the
-    # awkward "quoted string" local part form (RFC 5321 4.1.2)
-    # allows @-signs (and escaped quotes) to appear in the local
-    # part if the local part is quoted. If the address is quoted,
-    # split it at a non-escaped @-sign and unescape the escaping.
-    quoted_local_part = False
-    if m := QUOTED_LOCAL_PART_ADDR.match(email):
-        quoted_local_part = True
-        local_part, domain_part = m.groups()
-
-        # Remove backslashes.
-        import re
-        local_part = re.sub(r"\\(.)", "\\1", local_part)
-
-    else:
-        # Split at the one and only at-sign.
-        parts = email.split('@')
-        if len(parts) != 2:
-            raise EmailSyntaxError("The email address is not valid. It must have exactly one @-sign.")
-        local_part, domain_part = parts
+    # Split the address into the local part (before the @-sign)
+    # and the domain part (after the @-sign). Normally, there
+    # is only one @-sign. But the awkward "quoted string" local
+    # part form (RFC 5321 4.1.2) allows @-signs in the local
+    # part if the local part is quoted.
+    local_part, domain_part, is_quoted_local_part \
+        = split_email(email)
 
     # Collect return values in this instance.
     ret = ValidatedEmail()
@@ -84,13 +71,17 @@ def validate_email(
     local_part_info = validate_email_local_part(local_part,
                                                 allow_smtputf8=allow_smtputf8,
                                                 allow_empty_local=allow_empty_local,
-                                                quoted_local_part=quoted_local_part)
-    if quoted_local_part and not allow_quoted_local:
-        raise EmailSyntaxError("Quoting the part before the @-sign is not allowed here.")
+                                                quoted_local_part=is_quoted_local_part)
     ret.local_part = local_part_info["local_part"]
     ret.ascii_local_part = local_part_info["ascii_local_part"]
     ret.smtputf8 = local_part_info["smtputf8"]
 
+    # If a quoted local part isn't allowed but is present, now raise an exception.
+    # This is done after any exceptions raised by validate_email_local_part so
+    # that mandatory checks have highest precedence.
+    if is_quoted_local_part and not allow_quoted_local:
+        raise EmailSyntaxError("Quoting the part before the @-sign is not allowed here.")
+
     # Some local parts are required to be case-insensitive, so we should normalize
     # to lowercase.
     # RFC 2142
@@ -107,7 +98,9 @@ def validate_email(
 
     elif domain_part.startswith("[") and domain_part.endswith("]"):
         # Parse the address in the domain literal and get back a normalized domain.
-        domain_part_info = validate_email_domain_literal(domain_part[1:-1], allow_domain_literal=allow_domain_literal)
+        domain_part_info = validate_email_domain_literal(domain_part[1:-1])
+        if not allow_domain_literal:
+            raise EmailSyntaxError("A bracketed IP address after the @-sign is not allowed here.")
         ret.domain = domain_part_info["domain"]
         ret.ascii_domain = domain_part_info["domain"]  # Domain literals are always ASCII.
         ret.domain_address = domain_part_info["domain_address"]
@@ -131,48 +124,12 @@ def validate_email(
     else:
         ret.ascii_email = None
 
-    # If the email address has an ASCII representation, then we assume it may be
-    # transmitted in ASCII (we can't assume SMTPUTF8 will be used on all hops to
-    # the destination) and the length limit applies to ASCII characters (which is
-    # the same as octets). The number of characters in the internationalized form
-    # may be many fewer (because IDNA ASCII is verbose) and could be less than 254
-    # Unicode characters, and of course the number of octets over the limit may
-    # not be the number of characters over the limit, so if the email address is
-    # internationalized, we can't give any simple information about why the address
-    # is too long.
-    #
-    # In addition, check that the UTF-8 encoding (i.e. not IDNA ASCII and not
-    # Unicode characters) is at most 254 octets. If the addres is transmitted using
-    # SMTPUTF8, then the length limit probably applies to the UTF-8 encoded octets.
-    # If the email address has an ASCII form that differs from its internationalized
-    # form, I don't think the internationalized form can be longer, and so the ASCII
-    # form length check would be sufficient. If there is no ASCII form, then we have
-    # to check the UTF-8 encoding. The UTF-8 encoding could be up to about four times
-    # longer than the number of characters.
-    #
-    # See the length checks on the local part and the domain.
-    if ret.ascii_email and len(ret.ascii_email) > EMAIL_MAX_LENGTH:
-        if ret.ascii_email == ret.normalized:
-            reason = get_length_reason(ret.ascii_email)
-        elif len(ret.normalized) > EMAIL_MAX_LENGTH:
-            # If there are more than 254 characters, then the ASCII
-            # form is definitely going to be too long.
-            reason = get_length_reason(ret.normalized, utf8=True)
-        else:
-            reason = "(when converted to IDNA ASCII)"
-        raise EmailSyntaxError(f"The email address is too long {reason}.")
-    if len(ret.normalized.encode("utf8")) > EMAIL_MAX_LENGTH:
-        if len(ret.normalized) > EMAIL_MAX_LENGTH:
-            # If there are more than 254 characters, then the UTF-8
-            # encoding is definitely going to be too long.
-            reason = get_length_reason(ret.normalized, utf8=True)
-        else:
-            reason = "(when encoded in bytes)"
-        raise EmailSyntaxError(f"The email address is too long {reason}.")
+    # Check the length of the address.
+    validate_email_length(ret)
 
     if check_deliverability and not test_environment:
         # Validate the email address's deliverability using DNS
-        # and update the return dict with metadata.
+        # and update the returned ValidatedEmail object with metadata.
 
         if is_domain_literal:
             # There is nothing to check --- skip deliverability checks.

diff --git a/tests/test_syntax.py b/tests/test_syntax.py
@@ -330,9 +330,9 @@ def test_domain_literal():
         ('[email protected]', 'The part after the @-sign is not valid IDNA (Invalid A-label).'),
         ('[email protected]', 'An email address cannot have two letters followed by two dashes immediately after the @-sign or after a period, except Punycode.'),
         ('me@yy－－0.tld', 'An email address cannot have two letters followed by two dashes immediately after the @-sign or after a period, except Punycode.'),
-        ('me@[127.0.0.1]', 'A bracketed IPv4 address after the @-sign is not allowed here.'),
+        ('me@[127.0.0.1]', 'A bracketed IP address after the @-sign is not allowed here.'),
         ('me@[127.0.0.999]', 'The address in brackets after the @-sign is not valid: It is not an IPv4 address (Octet 999 (> 255) not permitted in \'127.0.0.999\') or is missing an address literal tag.'),
-        ('me@[IPv6:::1]', 'A bracketed IPv6 address after the @-sign is not allowed here.'),
+        ('me@[IPv6:::1]', 'A bracketed IP address after the @-sign is not allowed here.'),
         ('me@[IPv6:::G]', 'The IPv6 address in brackets after the @-sign is not valid (Only hex digits permitted in \'G\' in \'::G\').'),
         ('me@[tag:text]', 'The part after the @-sign contains an invalid address literal tag in brackets.'),
         ('me@[untaggedtext]', 'The part after the @-sign in brackets is not an IPv4 address and has no address literal tag.'),