From 7722044f4b0f689eec8cb89afc8d3e4bb8e62908 Mon Sep 17 00:00:00 2001
From: Brett Cannon <brett@python.org>
Date: Wed, 1 Feb 2023 11:25:20 -0800
Subject: [PATCH] Parse raw metadata (#671)

Co-authored-by: Donald Stufft <donald@stufft.io>
Co-authored-by: Paul Moore <p.f.moore@gmail.com>
Co-authored-by: Shantanu <12621235+hauntsaninja@users.noreply.github.com>
---
 .github/workflows/test.yml         |   2 +-
 docs/index.rst                     |   1 +
 docs/metadata.rst                  |  42 +++
 noxfile.py                         |   4 +-
 src/packaging/_manylinux.py        |   2 +
 src/packaging/metadata.py          | 408 +++++++++++++++++++++++++++++
 tests/metadata/everything.metadata |  42 +++
 tests/test_manylinux.py            |   8 +-
 tests/test_metadata.py             | 249 ++++++++++++++++++
 9 files changed, 749 insertions(+), 9 deletions(-)
 create mode 100644 docs/metadata.rst
 create mode 100644 src/packaging/metadata.py
 create mode 100644 tests/metadata/everything.metadata
 create mode 100644 tests/test_metadata.py

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index b4b4066a..d3d38710 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -23,7 +23,7 @@ jobs:
       matrix:
         os: [Ubuntu, Windows, macOS]
         python_version:
-          ["3.7", "3.8", "3.9", "3.10", "3.11", "pypy3.7", "pypy3.8", "pypy3.9"]
+          ["3.7", "3.8", "3.9", "3.10", "3.11", "pypy3.8", "pypy3.9"]
 
     steps:
       - uses: actions/checkout@v3
diff --git a/docs/index.rst b/docs/index.rst
index aafdae83..6850e9e8 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -25,6 +25,7 @@ You can install packaging with ``pip``:
     specifiers
     markers
     requirements
+    metadata
     tags
     utils
 
diff --git a/docs/metadata.rst b/docs/metadata.rst
new file mode 100644
index 00000000..b87574cb
--- /dev/null
+++ b/docs/metadata.rst
@@ -0,0 +1,42 @@
+Metadata
+========
+
+.. currentmodule:: packaging.metadata
+
+
+Both `source distributions`_ and `binary distributions`_
+(*sdists* and *wheels*, respectively) contain files recording the
+`core metadata`_ for the distribution. This information is used for
+everything from recording the name of the distribution to the
+installation dependencies.
+
+
+Usage
+-----
+
+.. doctest::
+
+    >>> from packaging.metadata import parse_email
+    >>> metadata = "Metadata-Version: 2.3\nName: packaging\nVersion: 24.0"
+    >>> raw, unparsed = parse_email(metadata)
+    >>> raw["metadata_version"]
+    '2.3'
+    >>> raw["name"]
+    'packaging'
+    >>> raw["version"]
+    '24.0'
+
+
+Reference
+---------
+
+Low Level Interface
+'''''''''''''''''''
+
+.. automodule:: packaging.metadata
+    :members:
+
+
+.. _source distributions: https://packaging.python.org/en/latest/specifications/source-distribution-format/
+.. _binary distributions: https://packaging.python.org/en/latest/specifications/binary-distribution-format/
+.. _core metadata: https://packaging.python.org/en/latest/specifications/core-metadata/
diff --git a/noxfile.py b/noxfile.py
index 6c480595..da5abc73 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -21,9 +21,7 @@
 nox.options.reuse_existing_virtualenvs = True
 
 
-@nox.session(
-    python=["3.7", "3.8", "3.9", "3.10", "3.11", "pypy3.7", "pypy3.8", "pypy3.9"]
-)
+@nox.session(python=["3.7", "3.8", "3.9", "3.10", "3.11", "pypy3.8", "pypy3.9"])
 def tests(session):
     def coverage(*args):
         session.run("python", "-m", "coverage", *args)
diff --git a/src/packaging/_manylinux.py b/src/packaging/_manylinux.py
index 2f0cc743..449c655b 100644
--- a/src/packaging/_manylinux.py
+++ b/src/packaging/_manylinux.py
@@ -14,6 +14,8 @@
 EF_ARM_ABI_FLOAT_HARD = 0x00000400
 
 
+# `os.PathLike` not a generic type until Python 3.9, so sticking with `str`
+# as the type for `path` until then.
 @contextlib.contextmanager
 def _parse_elf(path: str) -> Generator[Optional[ELFFile], None, None]:
     try:
diff --git a/src/packaging/metadata.py b/src/packaging/metadata.py
new file mode 100644
index 00000000..e76a60c3
--- /dev/null
+++ b/src/packaging/metadata.py
@@ -0,0 +1,408 @@
+import email.feedparser
+import email.header
+import email.message
+import email.parser
+import email.policy
+import sys
+import typing
+from typing import Dict, List, Optional, Tuple, Union, cast
+
+if sys.version_info >= (3, 8):  # pragma: no cover
+    from typing import TypedDict
+else:  # pragma: no cover
+    if typing.TYPE_CHECKING:
+        from typing_extensions import TypedDict
+    else:
+        try:
+            from typing_extensions import TypedDict
+        except ImportError:
+
+            class TypedDict:
+                def __init_subclass__(*_args, **_kwargs):
+                    pass
+
+
+# The RawMetadata class attempts to make as few assumptions about the underlying
+# serialization formats as possible. The idea is that as long as a serialization
+# format offers some very basic primitives in *some* way then we can support
+# serializing to and from that format.
+class RawMetadata(TypedDict, total=False):
+    """A dictionary of raw core metadata.
+
+    Each field in core metadata maps to a key of this dictionary (when data is
+    provided). The key is lower-case and underscores are used instead of dashes
+    compared to the equivalent core metadata field. Any core metadata field that
+    can be specified multiple times or can hold multiple values in a single
+    field have a key with a plural name.
+
+    Core metadata fields that can be specified multiple times are stored as a
+    list or dict depending on which is appropriate for the field. Any fields
+    which hold multiple values in a single field are stored as a list.
+
+    """
+
+    # Metadata 1.0 - PEP 241
+    metadata_version: str
+    name: str
+    version: str
+    platforms: List[str]
+    summary: str
+    description: str
+    keywords: List[str]
+    home_page: str
+    author: str
+    author_email: str
+    license: str
+
+    # Metadata 1.1 - PEP 314
+    supported_platforms: List[str]
+    download_url: str
+    classifiers: List[str]
+    requires: List[str]
+    provides: List[str]
+    obsoletes: List[str]
+
+    # Metadata 1.2 - PEP 345
+    maintainer: str
+    maintainer_email: str
+    requires_dist: List[str]
+    provides_dist: List[str]
+    obsoletes_dist: List[str]
+    requires_python: str
+    requires_external: List[str]
+    project_urls: Dict[str, str]
+
+    # Metadata 2.0
+    # PEP 426 attempted to completely revamp the metadata format
+    # but got stuck without ever being able to build consensus on
+    # it and ultimately ended up withdrawn.
+    #
+    # However, a number of tools had started emitting METADATA with
+    # `2.0` Metadata-Version, so for historical reasons, this version
+    # was skipped.
+
+    # Metadata 2.1 - PEP 566
+    description_content_type: str
+    provides_extra: List[str]
+
+    # Metadata 2.2 - PEP 643
+    dynamic: List[str]
+
+    # Metadata 2.3 - PEP 685
+    # No new fields were added in PEP 685, just some edge cases were
+    # tightened up to provide better interoperability.
+
+
+_STRING_FIELDS = {
+    "author",
+    "author_email",
+    "description",
+    "description_content_type",
+    "download_url",
+    "home_page",
+    "license",
+    "maintainer",
+    "maintainer_email",
+    "metadata_version",
+    "name",
+    "requires_python",
+    "summary",
+    "version",
+}
+
+_LIST_STRING_FIELDS = {
+    "classifiers",
+    "dynamic",
+    "obsoletes",
+    "obsoletes_dist",
+    "platforms",
+    "provides",
+    "provides_dist",
+    "provides_extra",
+    "requires",
+    "requires_dist",
+    "requires_external",
+    "supported_platforms",
+}
+
+
+def _parse_keywords(data: str) -> List[str]:
+    """Split a string of comma-separated keywords into a list of keywords."""
+    return [k.strip() for k in data.split(",")]
+
+
+def _parse_project_urls(data: List[str]) -> Dict[str, str]:
+    """Parse a list of label/URL string pairings separated by a comma."""
+    urls = {}
+    for pair in data:
+        # Our logic is slightly tricky here as we want to try and do
+        # *something* reasonable with malformed data.
+        #
+        # The main thing that we have to worry about, is data that does
+        # not have a ',' at all to split the label from the Value. There
+        # isn't a singular right answer here, and we will fail validation
+        # later on (if the caller is validating) so it doesn't *really*
+        # matter, but since the missing value has to be an empty str
+        # and our return value is dict[str, str], if we let the key
+        # be the missing value, then they'd have multiple '' values that
+        # overwrite each other in an accumulating dict.
+        #
+        # The other potential issue is that it's possible to have the
+        # same label multiple times in the metadata, with no solid "right"
+        # answer with what to do in that case. As such, we'll do the only
+        # thing we can, which is treat the field as unparseable and add it
+        # to our list of unparsed fields.
+        parts = [p.strip() for p in pair.split(",", 1)]
+        parts.extend([""] * (max(0, 2 - len(parts))))  # Ensure 2 items
+
+        # TODO: The spec doesn't say anything about if the keys should be
+        #       considered case sensitive or not... logically they should
+        #       be case-preserving and case-insensitive, but doing that
+        #       would open up more cases where we might have duplicate
+        #       entries.
+        label, url = parts
+        if label in urls:
+            # The label already exists in our set of urls, so this field
+            # is unparseable, and we can just add the whole thing to our
+            # unparseable data and stop processing it.
+            raise KeyError("duplicate labels in project urls")
+        urls[label] = url
+
+    return urls
+
+
+def _get_payload(msg: email.message.Message, source: Union[bytes, str]) -> str:
+    """Get the body of the message."""
+    # If our source is a str, then our caller has managed encodings for us,
+    # and we don't need to deal with it.
+    if isinstance(source, str):
+        payload: str = msg.get_payload()
+        return payload
+    # If our source is a bytes, then we're managing the encoding and we need
+    # to deal with it.
+    else:
+        bpayload: bytes = msg.get_payload(decode=True)
+        try:
+            return bpayload.decode("utf8", "strict")
+        except UnicodeDecodeError:
+            raise ValueError("payload in an invalid encoding")
+
+
+# The various parse_FORMAT functions here are intended to be as lenient as
+# possible in their parsing, while still returning a correctly typed
+# RawMetadata.
+#
+# To aid in this, we also generally want to do as little touching of the
+# data as possible, except where there are possibly some historic holdovers
+# that make valid data awkward to work with.
+#
+# While this is a lower level, intermediate format than our ``Metadata``
+# class, some light touch ups can make a massive difference in usability.
+
+# Map METADATA fields to RawMetadata.
+_EMAIL_TO_RAW_MAPPING = {
+    "author": "author",
+    "author-email": "author_email",
+    "classifier": "classifiers",
+    "description": "description",
+    "description-content-type": "description_content_type",
+    "download-url": "download_url",
+    "dynamic": "dynamic",
+    "home-page": "home_page",
+    "keywords": "keywords",
+    "license": "license",
+    "maintainer": "maintainer",
+    "maintainer-email": "maintainer_email",
+    "metadata-version": "metadata_version",
+    "name": "name",
+    "obsoletes": "obsoletes",
+    "obsoletes-dist": "obsoletes_dist",
+    "platform": "platforms",
+    "project-url": "project_urls",
+    "provides": "provides",
+    "provides-dist": "provides_dist",
+    "provides-extra": "provides_extra",
+    "requires": "requires",
+    "requires-dist": "requires_dist",
+    "requires-external": "requires_external",
+    "requires-python": "requires_python",
+    "summary": "summary",
+    "supported-platform": "supported_platforms",
+    "version": "version",
+}
+
+
+def parse_email(data: Union[bytes, str]) -> Tuple[RawMetadata, Dict[str, List[str]]]:
+    """Parse a distribution's metadata.
+
+    This function returns a two-item tuple of dicts. The first dict is of
+    recognized fields from the core metadata specification. Fields that can be
+    parsed and translated into Python's built-in types are converted
+    appropriately. All other fields are left as-is. Fields that are allowed to
+    appear multiple times are stored as lists.
+
+    The second dict contains all other fields from the metadata. This includes
+    any unrecognized fields. It also includes any fields which are expected to
+    be parsed into a built-in type but were not formatted appropriately. Finally,
+    any fields that are expected to appear only once but are repeated are
+    included in this dict.
+
+    """
+    raw: Dict[str, Union[str, List[str], Dict[str, str]]] = {}
+    unparsed: Dict[str, List[str]] = {}
+
+    if isinstance(data, str):
+        parsed = email.parser.Parser(policy=email.policy.compat32).parsestr(data)
+    else:
+        parsed = email.parser.BytesParser(policy=email.policy.compat32).parsebytes(data)
+
+    # We have to wrap parsed.keys() in a set, because in the case of multiple
+    # values for a key (a list), the key will appear multiple times in the
+    # list of keys, but we're avoiding that by using get_all().
+    for name in frozenset(parsed.keys()):
+        # Header names in RFC are case insensitive, so we'll normalize to all
+        # lower case to make comparisons easier.
+        name = name.lower()
+
+        # We use get_all() here, even for fields that aren't multiple use,
+        # because otherwise someone could have e.g. two Name fields, and we
+        # would just silently ignore it rather than doing something about it.
+        headers = parsed.get_all(name)
+
+        # The way the email module works when parsing bytes is that it
+        # unconditionally decodes the bytes as ascii using the surrogateescape
+        # handler. When you pull that data back out (such as with get_all() ),
+        # it looks to see if the str has any surrogate escapes, and if it does
+        # it wraps it in a Header object instead of returning the string.
+        #
+        # As such, we'll look for those Header objects, and fix up the encoding.
+        value = []
+        # Flag if we have run into any issues processing the headers, thus
+        # signalling that the data belongs in 'unparsed'.
+        valid_encoding = True
+        for h in headers:
+            # It's unclear if this can return more types than just a Header or
+            # a str, so we'll just assert here to make sure.
+            assert isinstance(h, (email.header.Header, str))
+
+            # If it's a header object, we need to do our little dance to get
+            # the real data out of it. In cases where there is invalid data
+            # we're going to end up with mojibake, but there's no obvious, good
+            # way around that without reimplementing parts of the Header object
+            # ourselves.
+            #
+            # That should be fine since, if mojibake happens, this key is
+            # going into the unparsed dict anyways.
+            if isinstance(h, email.header.Header):
+                # The Header object stores its data as chunks, and each chunk
+                # can be independently encoded, so we'll need to check each
+                # of them.
+                chunks: List[Tuple[bytes, Optional[str]]] = []
+                for bin, encoding in email.header.decode_header(h):
+                    try:
+                        bin.decode("utf8", "strict")
+                    except UnicodeDecodeError:
+                        # Enable mojibake.
+                        encoding = "latin1"
+                        valid_encoding = False
+                    else:
+                        encoding = "utf8"
+                    chunks.append((bin, encoding))
+
+                # Turn our chunks back into a Header object, then let that
+                # Header object do the right thing to turn them into a
+                # string for us.
+                value.append(str(email.header.make_header(chunks)))
+            # This is already a string, so just add it.
+            else:
+                value.append(h)
+
+        # We've processed all of our values to get them into a list of str,
+        # but we may have mojibake data, in which case this is an unparsed
+        # field.
+        if not valid_encoding:
+            unparsed[name] = value
+            continue
+
+        raw_name = _EMAIL_TO_RAW_MAPPING.get(name)
+        if raw_name is None:
+            # This is a bit of a weird situation, we've encountered a key that
+            # we don't know what it means, so we don't know whether it's meant
+            # to be a list or not.
+            #
+            # Since we can't really tell one way or another, we'll just leave it
+            # as a list, even though it may be a single item list, because that's
+            # what makes the most sense for email headers.
+            unparsed[name] = value
+            continue
+
+        # If this is one of our string fields, then we'll check to see if our
+        # value is a list of a single item. If it is then we'll assume that
+        # it was emitted as a single string, and unwrap the str from inside
+        # the list.
+        #
+        # If it's any other kind of data, then we haven't the faintest clue
+        # what we should parse it as, and we have to just add it to our list
+        # of unparsed stuff.
+        if raw_name in _STRING_FIELDS and len(value) == 1:
+            raw[raw_name] = value[0]
+        # If this is one of our list of string fields, then we can just assign
+        # the value, since email *only* has strings, and our get_all() call
+        # above ensures that this is a list.
+        elif raw_name in _LIST_STRING_FIELDS:
+            raw[raw_name] = value
+        # Special Case: Keywords
+        # The keywords field is implemented in the metadata spec as a str,
+        # but it conceptually is a list of strings, and is serialized using
+        # ", ".join(keywords), so we'll do some light data massaging to turn
+        # this into what it logically is.
+        elif raw_name == "keywords" and len(value) == 1:
+            raw[raw_name] = _parse_keywords(value[0])
+        # Special Case: Project-URL
+        # The project urls is implemented in the metadata spec as a list of
+        # specially-formatted strings that represent a key and a value, which
+        # is fundamentally a mapping, however the email format doesn't support
+        # mappings in a sane way, so it was crammed into a list of strings
+        # instead.
+        #
+        # We will do a little light data massaging to turn this into a map as
+        # it logically should be.
+        elif raw_name == "project_urls":
+            try:
+                raw[raw_name] = _parse_project_urls(value)
+            except KeyError:
+                unparsed[name] = value
+        # Nothing that we've done has managed to parse this, so we'll just
+        # throw it in our unparseable data and move on.
+        else:
+            unparsed[name] = value
+
+    # We need to support getting the Description from the message payload in
+    # addition to getting it from the headers. This does mean, though, there
+    # is the possibility of it being set both ways, in which case we put both
+    # in 'unparsed' since we don't know which is right.
+    try:
+        payload = _get_payload(parsed, data)
+    except ValueError:
+        unparsed.setdefault("description", []).append(
+            parsed.get_payload(decode=isinstance(data, bytes))
+        )
+    else:
+        if payload:
+            # Check to see if we've already got a description, if so then both
+            # it, and this body move to unparseable.
+            if "description" in raw:
+                description_header = cast(str, raw.pop("description"))
+                unparsed.setdefault("description", []).extend(
+                    [description_header, payload]
+                )
+            elif "description" in unparsed:
+                unparsed["description"].append(payload)
+            else:
+                raw["description"] = payload
+
+    # We need to cast our `raw` to a metadata, because a TypedDict only supports
+    # literal key names, but we're computing our key names on purpose, but the
+    # way this function is implemented, our `TypedDict` can only have valid key
+    # names.
+    return cast(RawMetadata, raw), unparsed
diff --git a/tests/metadata/everything.metadata b/tests/metadata/everything.metadata
new file mode 100644
index 00000000..5412a083
--- /dev/null
+++ b/tests/metadata/everything.metadata
@@ -0,0 +1,42 @@
+Metadata-Version: 2.3
+Name: BeagleVote
+Version: 1.0a2
+Platform: ObscureUnix
+Platform: RareDOS
+Supported-Platform: RedHat 7.2
+Supported-Platform: i386-win32-2791
+Summary: A module for collecting votes from beagles.
+Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
+Keywords: dog,puppy,voting,election
+Home-page: http://www.example.com/~cschultz/bvote/
+Download-URL: …/BeagleVote-0.45.tgz
+Author: C. Schultz, Universal Features Syndicate,
+        Los Angeles, CA <cschultz@peanuts.example.com>
+Author-email: "C. Schultz" <cschultz@example.com>
+Maintainer: C. Schultz, Universal Features Syndicate,
+        Los Angeles, CA <cschultz@peanuts.example.com>
+Maintainer-email: "C. Schultz" <cschultz@example.com>
+License: This software may only be obtained by sending the
+        author a postcard, and then the user promises not
+        to redistribute it.
+Classifier: Development Status :: 4 - Beta
+Classifier: Environment :: Console (Text Based)
+Provides-Extra: pdf
+Requires-Dist: reportlab; extra == 'pdf'
+Requires-Dist: pkginfo
+Requires-Dist: PasteDeploy
+Requires-Dist: zope.interface (>3.5.0)
+Requires-Dist: pywin32 >1.0; sys_platform == 'win32'
+Requires-Python: >=3
+Requires-External: C
+Requires-External: libpng (>=1.5)
+Requires-External: make; sys_platform != "win32"
+Project-URL: Bug Tracker, http://bitbucket.org/tarek/distribute/issues/
+Project-URL: Documentation, https://example.com/BeagleVote
+Provides-Dist: OtherProject
+Provides-Dist: AnotherProject (3.4)
+Provides-Dist: virtual_package; python_version >= "3.4"
+Dynamic: Obsoletes-Dist
+ThisIsNotReal: Hello!
+
+This description intentionally left blank.
diff --git a/tests/test_manylinux.py b/tests/test_manylinux.py
index dafdfc3d..3561bb99 100644
--- a/tests/test_manylinux.py
+++ b/tests/test_manylinux.py
@@ -3,6 +3,7 @@
 except ImportError:
     ctypes = None
 import os
+import pathlib
 import platform
 import sys
 import types
@@ -169,11 +170,8 @@ def test_glibc_version_string_none(monkeypatch):
 )
 def test_parse_elf_bad_executable(monkeypatch, content):
     if content:
-        path = os.path.join(
-            os.path.dirname(__file__),
-            "manylinux",
-            f"hello-world-{content}",
-        )
+        path = pathlib.Path(__file__).parent / "manylinux" / f"hello-world-{content}"
+        path = os.fsdecode(path)
     else:
         path = None
     with _parse_elf(path) as ef:
diff --git a/tests/test_metadata.py b/tests/test_metadata.py
new file mode 100644
index 00000000..22fe76ba
--- /dev/null
+++ b/tests/test_metadata.py
@@ -0,0 +1,249 @@
+import pathlib
+
+import pytest
+
+from packaging import metadata
+
+_RAW_TO_EMAIL_MAPPING = {
+    raw: email for email, raw in metadata._EMAIL_TO_RAW_MAPPING.items()
+}
+
+
+class TestRawMetadata:
+    @pytest.mark.parametrize("raw_field", metadata._STRING_FIELDS)
+    def test_non_repeating_fields_only_once(self, raw_field):
+        data = "VaLuE"
+        header_field = _RAW_TO_EMAIL_MAPPING[raw_field]
+        single_header = f"{header_field}: {data}"
+        raw, unparsed = metadata.parse_email(single_header)
+        assert not unparsed
+        assert len(raw) == 1
+        assert raw_field in raw
+        assert raw[raw_field] == data
+
+    @pytest.mark.parametrize("raw_field", metadata._STRING_FIELDS)
+    def test_non_repeating_fields_repeated(self, raw_field):
+        header_field = _RAW_TO_EMAIL_MAPPING[raw_field]
+        data = "VaLuE"
+        single_header = f"{header_field}: {data}"
+        repeated_header = "\n".join([single_header] * 2)
+        raw, unparsed = metadata.parse_email(repeated_header)
+        assert not raw
+        assert len(unparsed) == 1
+        assert header_field in unparsed
+        assert unparsed[header_field] == [data] * 2
+
+    @pytest.mark.parametrize("raw_field", metadata._LIST_STRING_FIELDS)
+    def test_repeating_fields_only_once(self, raw_field):
+        data = "VaLuE"
+        header_field = _RAW_TO_EMAIL_MAPPING[raw_field]
+        single_header = f"{header_field}: {data}"
+        raw, unparsed = metadata.parse_email(single_header)
+        assert not unparsed
+        assert len(raw) == 1
+        assert raw_field in raw
+        assert raw[raw_field] == [data]
+
+    @pytest.mark.parametrize("raw_field", metadata._LIST_STRING_FIELDS)
+    def test_repeating_fields_repeated(self, raw_field):
+        header_field = _RAW_TO_EMAIL_MAPPING[raw_field]
+        data = "VaLuE"
+        single_header = f"{header_field}: {data}"
+        repeated_header = "\n".join([single_header] * 2)
+        raw, unparsed = metadata.parse_email(repeated_header)
+        assert not unparsed
+        assert len(raw) == 1
+        assert raw_field in raw
+        assert raw[raw_field] == [data] * 2
+
+    @pytest.mark.parametrize(
+        ["given", "expected"],
+        [
+            ("A", ["A"]),
+            ("A ", ["A"]),
+            (" A", ["A"]),
+            ("A, B", ["A", "B"]),
+            ("A,B", ["A", "B"]),
+            (" A, B", ["A", "B"]),
+            ("A,B ", ["A", "B"]),
+            ("A B", ["A B"]),
+        ],
+    )
+    def test_keywords(self, given, expected):
+        header = f"Keywords: {given}"
+        raw, unparsed = metadata.parse_email(header)
+        assert not unparsed
+        assert len(raw) == 1
+        assert "keywords" in raw
+        assert raw["keywords"] == expected
+
+    @pytest.mark.parametrize(
+        ["given", "expected"],
+        [
+            ("", {"": ""}),
+            ("A", {"A": ""}),
+            ("A,B", {"A": "B"}),
+            ("A, B", {"A": "B"}),
+            (" A,B", {"A": "B"}),
+            ("A,B ", {"A": "B"}),
+            ("A,B,C", {"A": "B,C"}),
+        ],
+    )
+    def test_project_urls_parsing(self, given, expected):
+        header = f"project-url: {given}"
+        raw, unparsed = metadata.parse_email(header)
+        assert not unparsed
+        assert len(raw) == 1
+        assert "project_urls" in raw
+        assert raw["project_urls"] == expected
+
+    def test_duplicate_project_urls(self):
+        header = "project-url: A, B\nproject-url: A, C"
+        raw, unparsed = metadata.parse_email(header)
+        assert not raw
+        assert len(unparsed) == 1
+        assert "project-url" in unparsed
+        assert unparsed["project-url"] == ["A, B", "A, C"]
+
+    def test_str_input(self):
+        name = "Tarek Ziadé"
+        header = f"author: {name}"
+        raw, unparsed = metadata.parse_email(header)
+        assert not unparsed
+        assert len(raw) == 1
+        assert "author" in raw
+        assert raw["author"] == name
+
+    def test_bytes_input(self):
+        name = "Tarek Ziadé"
+        header = f"author: {name}".encode()
+        raw, unparsed = metadata.parse_email(header)
+        assert not unparsed
+        assert len(raw) == 1
+        assert "author" in raw
+        assert raw["author"] == name
+
+    def test_header_mojibake(self):
+        value = "\xc0msterdam"
+        header_name = "value"
+        header_bytes = f"{header_name}: {value}".encode("latin1")
+        raw, unparsed = metadata.parse_email(header_bytes)
+        # Sanity check
+        with pytest.raises(UnicodeDecodeError):
+            header_bytes.decode("utf-8")
+        assert not raw
+        assert len(unparsed) == 1
+        assert header_name in unparsed
+        assert unparsed[header_name] == [value]
+
+    @pytest.mark.parametrize(
+        ["given"], [("hello",), ("description: hello",), (b"hello",)]
+    )
+    def test_description(self, given):
+        raw, unparsed = metadata.parse_email(given)
+        assert not unparsed
+        assert len(raw) == 1
+        assert "description" in raw
+        assert raw["description"] == "hello"
+
+    def test_description_non_utf8(self):
+        header = "\xc0msterdam"
+        header_bytes = header.encode("latin1")
+        raw, unparsed = metadata.parse_email(header_bytes)
+        assert not raw
+        assert len(unparsed) == 1
+        assert "description" in unparsed
+        assert unparsed["description"] == [header_bytes]
+
+    @pytest.mark.parametrize(
+        ["given", "expected"],
+        [
+            ("description: 1\ndescription: 2", ["1", "2"]),
+            ("description: 1\n\n2", ["1", "2"]),
+            ("description: 1\ndescription: 2\n\n3", ["1", "2", "3"]),
+        ],
+    )
+    def test_description_multiple(self, given, expected):
+        raw, unparsed = metadata.parse_email(given)
+        assert not raw
+        assert len(unparsed) == 1
+        assert "description" in unparsed
+        assert unparsed["description"] == expected
+
+    def test_lowercase_keys(self):
+        header = "AUTHOR: Tarek Ziadé\nWhatever: Else"
+        raw, unparsed = metadata.parse_email(header)
+        assert len(raw) == 1
+        assert "author" in raw
+        assert len(unparsed) == 1
+        assert "whatever" in unparsed
+
+    def test_complete(self):
+        """Test all fields (except `Obsoletes-Dist`).
+
+        `Obsoletes-Dist` was sacrificed to provide a value for `Dynamic`.
+        """
+        path = pathlib.Path(__file__).parent / "metadata" / "everything.metadata"
+        with path.open("r", encoding="utf-8") as file:
+            metadata_contents = file.read()
+        raw, unparsed = metadata.parse_email(metadata_contents)
+        assert len(unparsed) == 1
+        assert unparsed["thisisnotreal"] == ["Hello!"]
+        assert len(raw) == 24
+        assert raw["metadata_version"] == "2.3"
+        assert raw["name"] == "BeagleVote"
+        assert raw["version"] == "1.0a2"
+        assert raw["platforms"] == ["ObscureUnix", "RareDOS"]
+        assert raw["supported_platforms"] == ["RedHat 7.2", "i386-win32-2791"]
+        assert raw["summary"] == "A module for collecting votes from beagles."
+        assert (
+            raw["description_content_type"]
+            == "text/markdown; charset=UTF-8; variant=GFM"
+        )
+        assert raw["keywords"] == ["dog", "puppy", "voting", "election"]
+        assert raw["home_page"] == "http://www.example.com/~cschultz/bvote/"
+        assert raw["download_url"] == "…/BeagleVote-0.45.tgz"
+        assert raw["author"] == (
+            "C. Schultz, Universal Features Syndicate,\n"
+            "        Los Angeles, CA <cschultz@peanuts.example.com>"
+        )
+        assert raw["author_email"] == '"C. Schultz" <cschultz@example.com>'
+        assert raw["maintainer"] == (
+            "C. Schultz, Universal Features Syndicate,\n"
+            "        Los Angeles, CA <cschultz@peanuts.example.com>"
+        )
+        assert raw["maintainer_email"] == '"C. Schultz" <cschultz@example.com>'
+        assert raw["license"] == (
+            "This software may only be obtained by sending the\n"
+            "        author a postcard, and then the user promises not\n"
+            "        to redistribute it."
+        )
+        assert raw["classifiers"] == [
+            "Development Status :: 4 - Beta",
+            "Environment :: Console (Text Based)",
+        ]
+        assert raw["provides_extra"] == ["pdf"]
+        assert raw["requires_dist"] == [
+            "reportlab; extra == 'pdf'",
+            "pkginfo",
+            "PasteDeploy",
+            "zope.interface (>3.5.0)",
+            "pywin32 >1.0; sys_platform == 'win32'",
+        ]
+        assert raw["requires_python"] == ">=3"
+        assert raw["requires_external"] == [
+            "C",
+            "libpng (>=1.5)",
+            'make; sys_platform != "win32"',
+        ]
+        assert raw["project_urls"] == {
+            "Bug Tracker": "http://bitbucket.org/tarek/distribute/issues/",
+            "Documentation": "https://example.com/BeagleVote",
+        }
+        assert raw["provides_dist"] == [
+            "OtherProject",
+            "AnotherProject (3.4)",
+            'virtual_package; python_version >= "3.4"',
+        ]
+        assert raw["dynamic"] == ["Obsoletes-Dist"]
+        assert raw["description"] == "This description intentionally left blank.\n"