From 80513471731d9fadd65c6fe5694a229a56294beb Mon Sep 17 00:00:00 2001 From: Joshua Tauberer Date: Mon, 17 Jun 2024 09:55:19 -0400 Subject: [PATCH] Improve the error message for invalid characters in domain names after Unicode NFC normalization These cases were previously handled by the call to idna.encode or idna.alabel, but the error message wasn't consistent with similar checks we do for the local part. See #142. --- CHANGELOG.md | 2 +- email_validator/syntax.py | 10 ++++++++++ tests/test_syntax.py | 8 ++------ 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fcaa452..632b1ca 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ In Development * Email addresses with internationalized local parts could, with rare Unicode characters, be returned as valid but actually be invalid in their normalized form (returned in the `normalized` field). Local parts now re-validated after Unicode NFC normalization to ensure that invalid characters cannot be injected into the normalized address and that characters with length-increasing NFC normalizations cannot cause a local part to exceed the maximum length after normalization. * The length check for email addresses with internationalized local parts is now also applied to the original address string prior to Unicode NFC normalization, which may be longer and could exceed the maximum email address length, to protect callers who do not use the returned normalized address. -* Improved error message for IDNA domains that are too long. +* Improved error message for IDNA domains that are too long or have invalid characters after Unicode normalization. * A new option to parse `My Name <address@domain>` strings, i.e. a display name plus an email address in angle brackets, is now available. It is off by default.
2.1.2 (June 16, 2024) diff --git a/email_validator/syntax.py b/email_validator/syntax.py index 78586c6..c655451 100644 --- a/email_validator/syntax.py +++ b/email_validator/syntax.py @@ -476,6 +476,16 @@ def validate_email_domain_name(domain: str, test_environment: bool = False, glob except idna.IDNAError as e: raise EmailSyntaxError(f"The part after the @-sign contains invalid characters ({e}).") from e + # Check for invalid characters after Unicode normalization which are not caught + # by uts46_remap (see tests for examples). + bad_chars = { + safe_character_display(c) + for c in domain + if not ATEXT_HOSTNAME_INTL.match(c) + } + if bad_chars: + raise EmailSyntaxError("The part after the @-sign contains invalid characters after Unicode normalization: " + ", ".join(sorted(bad_chars)) + ".") + # The domain part is made up dot-separated "labels." Each label must # have at least one character and cannot start or end with dashes, which # means there are some surprising restrictions on periods and dashes. 
diff --git a/tests/test_syntax.py b/tests/test_syntax.py index 619932a..ffe4963 100644 --- a/tests/test_syntax.py +++ b/tests/test_syntax.py @@ -392,12 +392,8 @@ def test_domain_literal() -> None: ('me@⒈wouldbeinvalid.com', "The part after the @-sign contains invalid characters (Codepoint U+2488 not allowed " "at position 1 in '⒈wouldbeinvalid.com')."), - ('me@\u037e.com', - "The part after the @-sign is invalid (Codepoint U+003B at position 1 " - "of ';' not allowed)."), - ('me@\u1fef.com', - "The part after the @-sign is invalid (Codepoint U+0060 at position 1 " - "of '`' not allowed)."), + ('me@\u037e.com', "The part after the @-sign contains invalid characters after Unicode normalization: ';'."), + ('me@\u1fef.com', "The part after the @-sign contains invalid characters after Unicode normalization: '`'."), ('@example.com', 'There must be something before the @-sign.'), ('white space@test', 'The email address contains invalid characters before the @-sign: SPACE.'), ('test@white space', 'The part after the @-sign contains invalid characters: SPACE.'),