Improve parsing of author information

Instead of relying on regular expressions, this patch uses the standard library’s `email.utils.parseaddr()` function to parse an RFC-822-compliant email address string into its name and address parts. This should also resolve issues with special characters in the name part; see, for example, Poetry issues python-poetry/poetry#370 and python-poetry/poetry#798.
yggi49 · Nov 12, 2022 · 62d33da · 62d33da
1 parent b0b1823
commit 62d33da
Show file tree

Hide file tree

Showing 4 changed files with 93 additions and 18 deletions.
diff --git a/src/poetry/core/masonry/builders/builder.py b/src/poetry/core/masonry/builders/builder.py
@@ -1,7 +1,6 @@
 from __future__ import annotations
 
 import logging
-import re
 import sys
 import warnings
 
@@ -14,8 +13,6 @@
     from poetry.core.poetry import Poetry
 
 
-AUTHOR_REGEX = re.compile(r"(?u)^(?P<name>[- .,\w\d'’\"()]+) <(?P<email>.+?)>$")
-
 METADATA_BASE = """\
 Metadata-Version: 2.1
 Name: {name}
@@ -344,12 +341,11 @@ def convert_script_files(self) -> list[Path]:
 
     @classmethod
     def convert_author(cls, author: str) -> dict[str, str]:
-        m = AUTHOR_REGEX.match(author)
-        if m is None:
-            raise RuntimeError(f"{author} does not match regex")
+        from poetry.core.utils.helpers import parse_author
 
-        name = m.group("name")
-        email = m.group("email")
+        name, email = parse_author(author)
+        if not name or not email:
+            raise RuntimeError(f"{author} does not match regex")
 
         return {"name": name, "email": email}
 

diff --git a/src/poetry/core/packages/package.py b/src/poetry/core/packages/package.py
@@ -16,6 +16,7 @@
 from poetry.core.packages.dependency_group import MAIN_GROUP
 from poetry.core.packages.specification import PackageSpecification
 from poetry.core.packages.utils.utils import create_nested_marker
+from poetry.core.utils.helpers import parse_author
 from poetry.core.version.exceptions import InvalidVersion
 from poetry.core.version.markers import parse_marker
 
@@ -32,6 +33,8 @@
 
     T = TypeVar("T", bound="Package")
 
+# TODO: once poetry.console.commands.init.InitCommand._validate_author
+# uses poetry.core.utils.helpers.parse_author, this can be removed.
 AUTHOR_REGEX = re.compile(r"(?u)^(?P<name>[- .,\w\d'’\"():&]+)(?: <(?P<email>.+?)>)?$")
 
 
@@ -231,34 +234,28 @@ def _get_author(self) -> dict[str, str | None]:
         if not self._authors:
             return {"name": None, "email": None}
 
-        m = AUTHOR_REGEX.match(self._authors[0])
+        name, email = parse_author(self._authors[0])
 
-        if m is None:
+        if not name or not email:
             raise ValueError(
                 "Invalid author string. Must be in the format: "
                 "John Smith <john@example.com>"
             )
 
-        name = m.group("name")
-        email = m.group("email")
-
         return {"name": name, "email": email}
 
     def _get_maintainer(self) -> dict[str, str | None]:
         if not self._maintainers:
             return {"name": None, "email": None}
 
-        m = AUTHOR_REGEX.match(self._maintainers[0])
+        name, email = parse_author(self._maintainers[0])
 
-        if m is None:
+        if not name or not email:
             raise ValueError(
                 "Invalid maintainer string. Must be in the format: "
                 "John Smith <john@example.com>"
             )
 
-        name = m.group("name")
-        email = m.group("email")
-
         return {"name": name, "email": email}
 
     @property

diff --git a/src/poetry/core/utils/helpers.py b/src/poetry/core/utils/helpers.py
@@ -8,6 +8,7 @@
 import warnings
 
 from contextlib import contextmanager
+from email.utils import parseaddr
 from pathlib import Path
 from typing import Any
 from typing import Iterator
@@ -105,3 +106,26 @@ def readme_content_type(path: str | Path) -> str:
         return "text/markdown"
     else:
         return "text/plain"
+
+
+def parse_author(address: str) -> tuple[str | None, str | None]:
+    """Parse name and address parts from an email address string.
+
+    >>> parse_author("John Doe <john.doe@example.com>")
+    ('John Doe', 'john.doe@example.com')
+
+    .. note::
+
+       If the input string does not contain an ``@`` character, it is
+       assumed that it represents only a name without an email address.
+
+    :param address: the email address string to parse.
+    :return: a 2-tuple with the parsed name and email address.  If a
+             part is missing, ``None`` will be returned in its place.
+    """
+    if "@" not in address:
+        return address, None
+    name, email = parseaddr(address)
+    if not name and "@" not in email:
+        return email, None
+    return name or None, email or None
diff --git a/tests/utils/test_helpers.py b/tests/utils/test_helpers.py
@@ -8,6 +8,7 @@
 import pytest
 
 from poetry.core.utils.helpers import combine_unicode
+from poetry.core.utils.helpers import parse_author
 from poetry.core.utils.helpers import parse_requires
 from poetry.core.utils.helpers import readme_content_type
 from poetry.core.utils.helpers import temporary_directory
@@ -118,3 +119,60 @@ def test_utils_helpers_readme_content_type(
     readme: str | Path, content_type: str
 ) -> None:
     assert readme_content_type(readme) == content_type
+
+
+def test_utils_helpers_parse_author():
+    """Test the :func:`parse_author` function."""
+
+    # Verify the (probable) default use case
+    name, email = parse_author("John Doe <john.doe@example.com>")
+    assert name == "John Doe"
+    assert email == "john.doe@example.com"
+
+    # Name only
+    name, email = parse_author("John Doe")
+    assert name == "John Doe"
+    assert email is None
+
+    # Name with a “special” character + email address
+    name, email = parse_author("R&D <researchanddevelopment@example.com>")
+    assert name == "R&D"
+    assert email == "researchanddevelopment@example.com"
+
+    # Name with a “special” character only
+    name, email = parse_author("R&D")
+    assert name == "R&D"
+    assert email is None
+
+    # Name with fancy unicode character + email address
+    name, email = parse_author("my·fancy corp <my-fancy-corp@example.com>")
+    assert name == "my·fancy corp"
+    assert email == "my-fancy-corp@example.com"
+
+    # Name with fancy unicode character only
+    name, email = parse_author("my·fancy corp")
+    assert name == "my·fancy corp"
+    assert email is None
+
+    # Email address only, wrapped in angle brackets
+    name, email = parse_author("<john.doe@example.com>")
+    assert name is None
+    assert email == "john.doe@example.com"
+
+    # Email address only
+    name, email = parse_author("john.doe@example.com")
+    assert name is None
+    assert email == "john.doe@example.com"
+
+    # Non-RFC-conform cases with unquoted commas
+    name, email = parse_author("asf,dfu@t.b")
+    assert name == "asf"
+    assert email is None
+
+    name, email = parse_author("asf,<dfu@t.b>")
+    assert name == "asf"
+    assert email is None
+
+    name, email = parse_author("asf, dfu@t.b")
+    assert name == "asf"
+    assert email is None