Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix requires_python metadata + add repair metadata command #779

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGES/773.bugfix
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Fixed `requires_python` field not being properly set on package upload.

Run the new `pulpcore-manager repair-python-metadata` command with repositories containing affected
packages to repair their metadata.
Empty file.
Empty file.
118 changes: 118 additions & 0 deletions pulp_python/app/management/commands/repair-python-metadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
import re
import os
from django.core.management import BaseCommand, CommandError
from gettext import gettext as _

from django.conf import settings

from pulpcore.plugin.util import extract_pk
from pulp_python.app.models import PythonPackageContent, PythonRepository
from pulp_python.app.utils import artifact_to_python_content_data


def repair_metadata(content):
"""
Repairs the metadata for the passed in content queryset.
:param content: The PythonPackageContent queryset.
Return: number of content units that were repaired
"""
# TODO: Add on_demand content repair?
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This data repair is not meant to unblock a migration / update, right?
In that case I'd say a best effort approach is just fine. We never really own the on-demand content anyway.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No it's not meant to unblock, but I have a feeling that a lot of packages in users' repositories contain bad metadata. Like when we do a sync (which are default on-demand) currently all the non-latest versions of a package potentially get incorrect metadata because the majority of the available metadata (the info section) is from the latest package. There is a way to get the correct metadata for each version, but it adds an API call for each version, so I've never implemented it.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How bad is it? Is it documented?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's not too hard, it's this api: https://docs.pypi.org/api/json/. You've seen it around, I've been using it in some of my scripts (oci-images check updates script for example)

os.chdir(settings.WORKING_DIRECTORY)
content = content.select_related("pulp_domain")
immediate_content = content.filter(contentartifact__artifact__isnull=False)
batch = []
set_of_update_fields = set()
total_repaired = 0
for package in immediate_content.prefetch_related('_artifacts').iterator(chunk_size=1000):
new_data = artifact_to_python_content_data(
package.filename, package._artifacts.get(), package.pulp_domain
)
changed = False
for field, value in new_data.items():
if getattr(package, field) != value:
setattr(package, field, value)
set_of_update_fields.add(field)
changed = True
if changed:
batch.append(package)
if len(batch) == 1000:
total_repaired += len(batch)
PythonPackageContent.objects.bulk_update(batch, set_of_update_fields)
batch = []
set_of_update_fields.clear()

if len(batch) > 0:
total_repaired += len(batch)
PythonPackageContent.objects.bulk_update(batch, set_of_update_fields)

return total_repaired


def href_prn_list_handler(value):
"""Common list parsing for a string of hrefs/prns."""
r = re.compile(
rf"""
(?:{settings.API_ROOT}(?:[-_a-zA-Z0-9]+/)?api/v3/repositories/python/python/[-a-f0-9]+/)
|(?:prn:python\.pythonrepository:[-a-f0-9]+)
""",
re.VERBOSE
)
values = []
for v in value.split(","):
if v:
if match := r.match(v.strip()):
values.append(match.group(0))
else:
raise CommandError(f"Invalid href/prn: {v}")
return values


class Command(BaseCommand):
"""
Management command to repair metadata of PythonPackageContent.
"""

help = _("Repair the metadata of PythonPackageContent stored in PythonRepositories")

def add_arguments(self, parser):
"""Set up arguments."""
parser.add_argument(
"--repositories",
type=href_prn_list_handler,
required=False,
help=_(
"List of PythonRepository hrefs/prns whose content's metadata will be repaired. "
"Leave blank to include all repositories in all domains. Mutually exclusive "
"with domain."
),
)
parser.add_argument(
"--domain",
default=None,
required=False,
help=_(
"The pulp domain to gather the repositories from if specified. Mutually"
" exclusive with repositories."
),
)

def handle(self, *args, **options):
"""Implement the command."""
domain = options.get("domain")
repository_hrefs = options.get("repositories")
if domain and repository_hrefs:
raise CommandError(_("--domain and --repositories are mutually exclusive"))

repositories = PythonRepository.objects.all()
if repository_hrefs:
repos_ids = [extract_pk(r) for r in repository_hrefs]
repositories = repositories.filter(pk__in=repos_ids)
elif domain:
repositories = repositories.filter(pulp_domain__name=domain)

content_set = set()
for repository in repositories:
content_set.update(repository.latest_version().content.values_list("pk", flat=True))
content = PythonPackageContent.objects.filter(pk__in=content_set)
num_repaired = repair_metadata(content)
print(f"{len(content_set)} packages processed, {num_repaired} package metadata repaired.")
12 changes: 2 additions & 10 deletions pulp_python/app/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,8 @@

from pathlib import PurePath
from .utils import (
artifact_to_python_content_data,
canonicalize_name,
get_project_metadata_from_artifact,
parse_project_metadata,
python_content_to_json,
PYPI_LAST_SERIAL,
PYPI_SERIAL_CONSTANT,
Expand Down Expand Up @@ -189,14 +188,7 @@ class PythonPackageContent(Content):
def init_from_artifact_and_relative_path(artifact, relative_path):
"""Used when downloading package from pull-through cache."""
path = PurePath(relative_path)
metadata = get_project_metadata_from_artifact(path.name, artifact)
data = parse_project_metadata(vars(metadata))
data["packagetype"] = metadata.packagetype
data["version"] = metadata.version
data["filename"] = path.name
data["sha256"] = artifact.sha256
data["pulp_domain_id"] = artifact.pulp_domain_id
data["_pulp_domain_id"] = artifact.pulp_domain_id
data = artifact_to_python_content_data(path.name, artifact, domain=get_domain())
return PythonPackageContent(**data)

def __str__(self):
Expand Down
12 changes: 2 additions & 10 deletions pulp_python/app/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

from pulp_python.app import models as python_models
from pulp_python.app import fields
from pulp_python.app.utils import get_project_metadata_from_artifact, parse_project_metadata
from pulp_python.app.utils import artifact_to_python_content_data


class PythonRepositorySerializer(core_serializers.RepositorySerializer):
Expand Down Expand Up @@ -217,7 +217,7 @@ def deferred_validate(self, data):

artifact = data["artifact"]
try:
metadata = get_project_metadata_from_artifact(filename, artifact)
_data = artifact_to_python_content_data(filename, artifact, domain=get_domain())
except ValueError:
raise serializers.ValidationError(_(
"Extension on {} is not a valid python extension "
Expand All @@ -231,14 +231,6 @@ def deferred_validate(self, data):
)}
)

_data = parse_project_metadata(vars(metadata))
_data['packagetype'] = metadata.packagetype
_data['version'] = metadata.version
_data['filename'] = filename
_data['sha256'] = artifact.sha256
data["pulp_domain_id"] = artifact.pulp_domain_id
data["_pulp_domain_id"] = artifact.pulp_domain_id

data.update(_data)

return data
Expand Down
12 changes: 2 additions & 10 deletions pulp_python/app/tasks/upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from pulpcore.plugin.util import get_domain

from pulp_python.app.models import PythonPackageContent, PythonRepository
from pulp_python.app.utils import get_project_metadata_from_artifact, parse_project_metadata
from pulp_python.app.utils import artifact_to_python_content_data


def upload(artifact_sha256, filename, repository_pk=None):
Expand Down Expand Up @@ -76,15 +76,7 @@ def create_content(artifact_sha256, filename, domain):
queryset of the new created content
"""
artifact = Artifact.objects.get(sha256=artifact_sha256, pulp_domain=domain)
metadata = get_project_metadata_from_artifact(filename, artifact)

data = parse_project_metadata(vars(metadata))
data['packagetype'] = metadata.packagetype
data['version'] = metadata.version
data['filename'] = filename
data['sha256'] = artifact.sha256
data['pulp_domain'] = domain
data['_pulp_domain'] = domain
data = artifact_to_python_content_data(filename, artifact, domain)

@transaction.atomic()
def create():
Expand Down
48 changes: 43 additions & 5 deletions pulp_python/app/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import pkginfo
import re
import shutil
import tempfile
import json
Expand Down Expand Up @@ -51,6 +52,20 @@
".zip": "sdist",
}

DIST_REGEXES = {
# regex from https://github.com/pypa/pip/blob/18.0/src/pip/_internal/wheel.py#L569
".whl": re.compile(
r"""^(?P<name>.+?)-(?P<version>.*?)
((-(?P<build>\d[^-]*?))?-(?P<pyver>.+?)-(?P<abi>.+?)-(?P<plat>.+?)
\.whl|\.dist-info)$""",
re.VERBOSE
),
# regex based on https://setuptools.pypa.io/en/latest/deprecated/python_eggs.html#filename-embedded-metadata # noqa: E501
".egg": re.compile(r"^(?P<name>.+?)-(?P<version>.*?)(-(?P<pyver>.+?(-(?P<plat>.+?))?))?\.egg|\.egg-info$"), # noqa: E501
# regex based on https://github.com/python/cpython/blob/v3.7.0/Lib/distutils/command/bdist_wininst.py#L292 # noqa: E501
".exe": re.compile(r"^(?P<name>.+?)-(?P<version>.*?)\.(?P<plat>.+?)(-(?P<pyver>.+?))?\.exe$"),
}

DIST_TYPES = {
"bdist_wheel": pkginfo.Wheel,
"bdist_wininst": pkginfo.Distribution,
Expand All @@ -72,6 +87,8 @@ def parse_project_metadata(project):
"""
package = {}
package['name'] = project.get('name') or ""
package['version'] = project.get('version') or ""
package['packagetype'] = project.get('packagetype') or ""
package['metadata_version'] = project.get('metadata_version') or ""
package['summary'] = project.get('summary') or ""
package['description'] = project.get('description') or ""
Expand All @@ -86,13 +103,15 @@ def parse_project_metadata(project):
package['project_url'] = project.get('project_url') or ""
package['platform'] = project.get('platform') or ""
package['supported_platform'] = project.get('supported_platform') or ""
package['requires_python'] = project.get('requires_python') or ""
package['requires_dist'] = json.dumps(project.get('requires_dist', []))
package['provides_dist'] = json.dumps(project.get('provides_dist', []))
package['obsoletes_dist'] = json.dumps(project.get('obsoletes_dist', []))
package['requires_external'] = json.dumps(project.get('requires_external', []))
package['classifiers'] = json.dumps(project.get('classifiers', []))
package['project_urls'] = json.dumps(project.get('project_urls', {}))
package['description_content_type'] = project.get('description_content_type') or ""
package['python_version'] = project.get('python_version') or ""

return package

Expand All @@ -113,17 +132,15 @@ def parse_metadata(project, version, distribution):
dictionary: of useful python metadata

"""
package = {}
package = parse_project_metadata(project)

package['filename'] = distribution.get('filename') or ""
package['packagetype'] = distribution.get('packagetype') or ""
package['version'] = version
package['url'] = distribution.get('url') or ""
package['sha256'] = distribution.get('digests', {}).get('sha256') or ""
package['python_version'] = distribution.get('python_version') or ""
package['requires_python'] = distribution.get('requires_python') or ""

package.update(parse_project_metadata(project))
package['python_version'] = distribution.get('python_version') or package.get('python_version')
package['requires_python'] = distribution.get('requires_python') or package.get('requires_python') # noqa: E501

return package

Expand All @@ -147,9 +164,30 @@ def get_project_metadata_from_artifact(filename, artifact):
temp_file.flush()
metadata = DIST_TYPES[packagetype](temp_file.name)
metadata.packagetype = packagetype
if packagetype == "sdist":
metadata.python_version = "source"
else:
pyver = ""
regex = DIST_REGEXES[extensions[pkg_type_index]]
if bdist_name := regex.match(filename):
pyver = bdist_name.group("pyver") or ""
metadata.python_version = pyver
return metadata


def artifact_to_python_content_data(filename, artifact, domain=None):
"""
Takes the artifact/filename and returns the metadata needed to create a PythonPackageContent.
"""
metadata = get_project_metadata_from_artifact(filename, artifact)
data = parse_project_metadata(vars(metadata))
data['sha256'] = artifact.sha256
data['filename'] = filename
data['pulp_domain'] = domain or artifact.pulp_domain
data['_pulp_domain'] = data['pulp_domain']
return data


def python_content_to_json(base_path, content_query, version=None, domain=None):
"""
Converts a QuerySet of PythonPackageContent into the PyPi JSON format
Expand Down
12 changes: 12 additions & 0 deletions pulp_python/tests/functional/api/test_crud_content_unit.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,3 +122,15 @@ def test_upload_metadata_23_spec(python_content_factory):
content = python_content_factory(filename, url=package.url)
assert content.metadata_version == "2.3"
break


@pytest.mark.parallel
def test_upload_requires_python(python_content_factory):
filename = "pip-24.3.1-py3-none-any.whl"
with PyPISimple() as client:
page = client.get_project_page("pip")
for package in page.packages:
if package.filename == filename:
content = python_content_factory(filename, url=package.url)
assert content.requires_python == ">=3.8"
break
Loading
Loading