Skip to content

Commit

Permalink
Improve parsing of author information
Browse files Browse the repository at this point in the history
Instead of relying on regular expressions, this patch uses Python’s
built-in `email.utils.parseaddr()` function to parse an RFC-822-compliant
email address string into its name and address parts.

This should also resolve issues with special characters in the name
part; see for example Poetry issues python-poetry#370 and python-poetry#798.

python-poetry/poetry#370
python-poetry/poetry#798
  • Loading branch information
yggi49 committed Nov 12, 2022
1 parent b0b1823 commit 62d33da
Show file tree
Hide file tree
Showing 4 changed files with 93 additions and 18 deletions.
12 changes: 4 additions & 8 deletions src/poetry/core/masonry/builders/builder.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from __future__ import annotations

import logging
import re
import sys
import warnings

Expand All @@ -14,8 +13,6 @@
from poetry.core.poetry import Poetry


AUTHOR_REGEX = re.compile(r"(?u)^(?P<name>[- .,\w\d'’\"()]+) <(?P<email>.+?)>$")

METADATA_BASE = """\
Metadata-Version: 2.1
Name: {name}
Expand Down Expand Up @@ -344,12 +341,11 @@ def convert_script_files(self) -> list[Path]:

@classmethod
def convert_author(cls, author: str) -> dict[str, str]:
    """Split an author string into its name and email parts.

    The string is handed to ``parse_author``; no regular-expression
    pre-check is applied, so names containing characters outside the old
    pattern's character class (for example ``&``) are no longer rejected
    before the parser has seen them.

    :param author: author string, e.g. ``"Name <address>"``.
    :return: a dict with the keys ``"name"`` and ``"email"``.
    :raises RuntimeError: if the name or the email part is missing.
    """
    # Function-scope import, kept where the original placed it.
    from poetry.core.utils.helpers import parse_author

    name, email = parse_author(author)
    if not name or not email:
        # Message text is left untouched so callers that match on it
        # keep working, even though no regex is involved any more.
        raise RuntimeError(f"{author} does not match regex")

    return {"name": name, "email": email}

Expand Down
17 changes: 7 additions & 10 deletions src/poetry/core/packages/package.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from poetry.core.packages.dependency_group import MAIN_GROUP
from poetry.core.packages.specification import PackageSpecification
from poetry.core.packages.utils.utils import create_nested_marker
from poetry.core.utils.helpers import parse_author
from poetry.core.version.exceptions import InvalidVersion
from poetry.core.version.markers import parse_marker

Expand All @@ -32,6 +33,8 @@

T = TypeVar("T", bound="Package")

# TODO: once poetry.console.commands.init.InitCommand._validate_author
# uses poetry.core.utils.helpers.parse_author, this can be removed.
AUTHOR_REGEX = re.compile(r"(?u)^(?P<name>[- .,\w\d'’\"():&]+)(?: <(?P<email>.+?)>)?$")


Expand Down Expand Up @@ -231,34 +234,28 @@ def _get_author(self) -> dict[str, str | None]:
if not self._authors:
return {"name": None, "email": None}

m = AUTHOR_REGEX.match(self._authors[0])
name, email = parse_author(self._authors[0])

if m is None:
if not name or not email:
raise ValueError(
"Invalid author string. Must be in the format: "
"John Smith <[email protected]>"
)

name = m.group("name")
email = m.group("email")

return {"name": name, "email": email}

def _get_maintainer(self) -> dict[str, str | None]:
    """Return the first maintainer as a ``{"name": ..., "email": ...}`` dict.

    When no maintainers are configured both values are ``None``.
    Otherwise the first entry of ``self._maintainers`` is parsed with
    ``parse_author``.

    :return: a dict with the keys ``"name"`` and ``"email"``; the email
        may be ``None`` for a name-only entry.
    :raises ValueError: if no name can be extracted from the entry.
    """
    if not self._maintainers:
        return {"name": None, "email": None}

    name, email = parse_author(self._maintainers[0])

    # Only the name is mandatory: the regex this replaces made the
    # "<email>" part optional, and the return type allows a None email.
    if not name:
        raise ValueError(
            "Invalid maintainer string. Must be in the format: "
            "John Smith <john@example.com>"
        )

    return {"name": name, "email": email}

@property
Expand Down
24 changes: 24 additions & 0 deletions src/poetry/core/utils/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import warnings

from contextlib import contextmanager
from email.utils import parseaddr
from pathlib import Path
from typing import Any
from typing import Iterator
Expand Down Expand Up @@ -105,3 +106,26 @@ def readme_content_type(path: str | Path) -> str:
return "text/markdown"
else:
return "text/plain"


def parse_author(address: str) -> tuple[str | None, str | None]:
    """Parse name and address parts from an email address string.

    >>> parse_author("John Doe <john.doe@example.com>")
    ('John Doe', 'john.doe@example.com')

    .. note::

       If the input string does not contain an ``@`` character, it is
       assumed that it represents only a name without an email address.

    :param address: the email address string to parse.
    :return: a 2-tuple with the parsed name and email address. If a
        part is missing, ``None`` will be returned in its place.
    """
    if "@" not in address:
        # Without an "@" there cannot be an email part, so the whole
        # string is the name. An empty string counts as a missing name.
        return address or None, None

    name, email = parseaddr(address)

    if not name and "@" not in email:
        # Either the parser put a bare word into the address slot (treat
        # it as the name), or parsing failed and yielded ('', ''), which
        # must surface as None rather than as an empty string.
        return email or None, None

    return name or None, email or None
58 changes: 58 additions & 0 deletions tests/utils/test_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import pytest

from poetry.core.utils.helpers import combine_unicode
from poetry.core.utils.helpers import parse_author
from poetry.core.utils.helpers import parse_requires
from poetry.core.utils.helpers import readme_content_type
from poetry.core.utils.helpers import temporary_directory
Expand Down Expand Up @@ -118,3 +119,60 @@ def test_utils_helpers_readme_content_type(
readme: str | Path, content_type: str
) -> None:
assert readme_content_type(readme) == content_type


def test_utils_helpers_parse_author() -> None:
    """Test the :func:`parse_author` function."""
    # Each case is (input, expected name, expected email). All addresses
    # use the reserved example.com domain.
    cases = [
        # The (probable) default use case
        ("John Doe <john.doe@example.com>", "John Doe", "john.doe@example.com"),
        # Name only
        ("John Doe", "John Doe", None),
        # Name with a "special" character + email address
        ("R&D <rd@example.com>", "R&D", "rd@example.com"),
        # Name with a "special" character only
        ("R&D", "R&D", None),
        # Name with fancy unicode character + email address
        (
            "my·fancy corp <my-fancy-corp@example.com>",
            "my·fancy corp",
            "my-fancy-corp@example.com",
        ),
        # Name with fancy unicode character only
        ("my·fancy corp", "my·fancy corp", None),
        # Email address only, wrapped in angular brackets
        ("<john.doe@example.com>", None, "john.doe@example.com"),
        # Email address only
        ("john.doe@example.com", None, "john.doe@example.com"),
        # Non-RFC-conform cases with unquoted commas.
        # NOTE(review): these rely on parseaddr returning the first of
        # several comma-separated entries; interpreters whose parseaddr
        # is strict by default may reject such input - confirm against
        # the supported Python versions.
        ("asf,dfu@example.com", "asf", None),
        ("asf,<dfu@example.com>", "asf", None),
        ("asf, dfu@example.com", "asf", None),
    ]

    for raw, expected_name, expected_email in cases:
        name, email = parse_author(raw)
        # The raw input is the assertion message so a failing case is
        # identifiable from the report.
        assert name == expected_name, raw
        assert email == expected_email, raw

0 comments on commit 62d33da

Please sign in to comment.