From ac33f4b53089d3fccb90a5bfe956d28619f0df5f Mon Sep 17 00:00:00 2001 From: Gerrod Ubben Date: Thu, 12 Dec 2024 03:05:09 -0500 Subject: [PATCH] Fix requires_python metadata + add repair metadata command fixes: #773 --- CHANGES/773.bugfix | 4 + pulp_python/app/management/__init__.py | 0 .../app/management/commands/__init__.py | 0 .../commands/repair-python-metadata.py | 118 ++++++++++++++++++ pulp_python/app/models.py | 12 +- pulp_python/app/serializers.py | 12 +- pulp_python/app/tasks/upload.py | 12 +- pulp_python/app/utils.py | 48 ++++++- .../functional/api/test_crud_content_unit.py | 12 ++ .../tests/functional/api/test_repair.py | 78 ++++++++++++ 10 files changed, 261 insertions(+), 35 deletions(-) create mode 100644 CHANGES/773.bugfix create mode 100644 pulp_python/app/management/__init__.py create mode 100644 pulp_python/app/management/commands/__init__.py create mode 100644 pulp_python/app/management/commands/repair-python-metadata.py create mode 100644 pulp_python/tests/functional/api/test_repair.py diff --git a/CHANGES/773.bugfix b/CHANGES/773.bugfix new file mode 100644 index 00000000..cc947f69 --- /dev/null +++ b/CHANGES/773.bugfix @@ -0,0 +1,4 @@ +Fixed `requires_python` field not being properly set on package upload. + +Run the new `pulpcore-manager repair-python-metadata` command with repositories containing affected +packages to repair their metadata. diff --git a/pulp_python/app/management/__init__.py b/pulp_python/app/management/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pulp_python/app/management/commands/__init__.py b/pulp_python/app/management/commands/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pulp_python/app/management/commands/repair-python-metadata.py b/pulp_python/app/management/commands/repair-python-metadata.py new file mode 100644 index 00000000..4582f328 --- /dev/null +++ b/pulp_python/app/management/commands/repair-python-metadata.py @@ -0,0 +1,118 @@ +import re +import os +from django.core.management import BaseCommand, CommandError +from gettext import gettext as _ + +from django.conf import settings + +from pulpcore.plugin.util import extract_pk +from pulp_python.app.models import PythonPackageContent, PythonRepository +from pulp_python.app.utils import artifact_to_python_content_data + + +def repair_metadata(content): + """ + Repairs the metadata for the passed in content queryset. + :param content: The PythonPackageContent queryset. + Return: number of content units that were repaired + """ + # TODO: Add on_demand content repair? + os.chdir(settings.WORKING_DIRECTORY) + content = content.select_related("pulp_domain") + immediate_content = content.filter(contentartifact__artifact__isnull=False) + batch = [] + set_of_update_fields = set() + total_repaired = 0 + for package in immediate_content.prefetch_related('_artifacts').iterator(chunk_size=1000): + new_data = artifact_to_python_content_data( + package.filename, package._artifacts.get(), package.pulp_domain + ) + changed = False + for field, value in new_data.items(): + if getattr(package, field) != value: + setattr(package, field, value) + set_of_update_fields.add(field) + changed = True + if changed: + batch.append(package) + if len(batch) == 1000: + total_repaired += len(batch) + PythonPackageContent.objects.bulk_update(batch, set_of_update_fields) + batch = [] + set_of_update_fields.clear() + + if len(batch) > 0: + total_repaired += len(batch) + PythonPackageContent.objects.bulk_update(batch, set_of_update_fields) + + return total_repaired + + +def href_prn_list_handler(value): + """Common list parsing for a string of hrefs/prns.""" + r = re.compile( + rf""" + (?:{settings.API_ROOT}(?:[-_a-zA-Z0-9]+/)?api/v3/repositories/python/python/[-a-f0-9]+/) + |(?:prn:python\.pythonrepository:[-a-f0-9]+) + """, + re.VERBOSE + ) + values = [] + for v in value.split(","): + if v: + if match := r.match(v.strip()): + values.append(match.group(0)) + else: + raise CommandError(f"Invalid href/prn: {v}") + return values + + +class Command(BaseCommand): + """ + Management command to repair metadata of PythonPackageContent. + """ + + help = _("Repair the metadata of PythonPackageContent stored in PythonRepositories") + + def add_arguments(self, parser): + """Set up arguments.""" + parser.add_argument( + "--repositories", + type=href_prn_list_handler, + required=False, + help=_( + "List of PythonRepository hrefs/prns whose content's metadata will be repaired. " + "Leave blank to include all repositories in all domains. Mutually exclusive " + "with domain." + ), + ) + parser.add_argument( + "--domain", + default=None, + required=False, + help=_( + "The pulp domain to gather the repositories from if specified. Mutually" + " exclusive with repositories." + ), + ) + + def handle(self, *args, **options): + """Implement the command.""" + domain = options.get("domain") + repository_hrefs = options.get("repositories") + if domain and repository_hrefs: + raise CommandError(_("--domain and --repositories are mutually exclusive")) + + repositories = PythonRepository.objects.all() + if repository_hrefs: + repos_ids = [extract_pk(r) for r in repository_hrefs] + repositories = repositories.filter(pk__in=repos_ids) + elif domain: + repositories = repositories.filter(pulp_domain__name=domain) + + content_set = set() + for repository in repositories: + content_set.update(repository.latest_version().content.values_list("pk", flat=True)) + content = PythonPackageContent.objects.filter(pk__in=content_set) + num_repaired = repair_metadata(content) + print(f"{len(content_set)} packages processed, {num_repaired} package metadata repaired.") diff --git a/pulp_python/app/models.py b/pulp_python/app/models.py index b17f4cf4..c326a54a 100644 --- a/pulp_python/app/models.py +++ b/pulp_python/app/models.py @@ -17,9 +17,8 @@ from pathlib import PurePath from .utils import ( + artifact_to_python_content_data, canonicalize_name, - get_project_metadata_from_artifact, - parse_project_metadata, python_content_to_json, PYPI_LAST_SERIAL, PYPI_SERIAL_CONSTANT, @@ -189,14 +188,7 @@ class PythonPackageContent(Content): def init_from_artifact_and_relative_path(artifact, relative_path): """Used when downloading package from pull-through cache.""" path = PurePath(relative_path) - metadata = get_project_metadata_from_artifact(path.name, artifact) - data = parse_project_metadata(vars(metadata)) - data["packagetype"] = metadata.packagetype - data["version"] = metadata.version - data["filename"] = path.name - data["sha256"] = artifact.sha256 - data["pulp_domain_id"] = artifact.pulp_domain_id - data["_pulp_domain_id"] = artifact.pulp_domain_id + data = artifact_to_python_content_data(path.name, artifact, domain=get_domain()) return PythonPackageContent(**data) def __str__(self): diff --git a/pulp_python/app/serializers.py b/pulp_python/app/serializers.py index bb588b47..0c8c9070 100644 --- a/pulp_python/app/serializers.py +++ b/pulp_python/app/serializers.py @@ -9,7 +9,7 @@ from pulp_python.app import models as python_models from pulp_python.app import fields -from pulp_python.app.utils import get_project_metadata_from_artifact, parse_project_metadata +from pulp_python.app.utils import artifact_to_python_content_data class PythonRepositorySerializer(core_serializers.RepositorySerializer): @@ -217,7 +217,7 @@ def deferred_validate(self, data): artifact = data["artifact"] try: - metadata = get_project_metadata_from_artifact(filename, artifact) + _data = artifact_to_python_content_data(filename, artifact, domain=get_domain()) except ValueError: raise serializers.ValidationError(_( "Extension on {} is not a valid python extension " @@ -231,14 +231,6 @@ def deferred_validate(self, data): )} ) - _data = parse_project_metadata(vars(metadata)) - _data['packagetype'] = metadata.packagetype - _data['version'] = metadata.version - _data['filename'] = filename - _data['sha256'] = artifact.sha256 - data["pulp_domain_id"] = artifact.pulp_domain_id - data["_pulp_domain_id"] = artifact.pulp_domain_id - data.update(_data) return data diff --git a/pulp_python/app/tasks/upload.py b/pulp_python/app/tasks/upload.py index 9f72c364..d573c56a 100644 --- a/pulp_python/app/tasks/upload.py +++ b/pulp_python/app/tasks/upload.py @@ -7,7 +7,7 @@ from pulpcore.plugin.util import get_domain from pulp_python.app.models import PythonPackageContent, PythonRepository -from pulp_python.app.utils import get_project_metadata_from_artifact, parse_project_metadata +from pulp_python.app.utils import artifact_to_python_content_data def upload(artifact_sha256, filename, repository_pk=None): @@ -76,15 +76,7 @@ def create_content(artifact_sha256, filename, domain): queryset of the new created content """ artifact = Artifact.objects.get(sha256=artifact_sha256, pulp_domain=domain) - metadata = get_project_metadata_from_artifact(filename, artifact) - - data = parse_project_metadata(vars(metadata)) - data['packagetype'] = metadata.packagetype - data['version'] = metadata.version - data['filename'] = filename - data['sha256'] = artifact.sha256 - data['pulp_domain'] = domain - data['_pulp_domain'] = domain + data = artifact_to_python_content_data(filename, artifact, domain) @transaction.atomic() def create(): diff --git a/pulp_python/app/utils.py b/pulp_python/app/utils.py index 126037a8..19007ab6 100644 --- a/pulp_python/app/utils.py +++ b/pulp_python/app/utils.py @@ -1,4 +1,5 @@ import pkginfo +import re import shutil import tempfile import json @@ -51,6 +52,20 @@ ".zip": "sdist", } +DIST_REGEXES = { + # regex from https://github.com/pypa/pip/blob/18.0/src/pip/_internal/wheel.py#L569 + ".whl": re.compile( + r"""^(?P.+?)-(?P.*?) + ((-(?P\d[^-]*?))?-(?P.+?)-(?P.+?)-(?P.+?) + \.whl|\.dist-info)$""", + re.VERBOSE + ), + # regex based on https://setuptools.pypa.io/en/latest/deprecated/python_eggs.html#filename-embedded-metadata # noqa: E501 + ".egg": re.compile(r"^(?P.+?)-(?P.*?)(-(?P.+?(-(?P.+?))?))?\.egg|\.egg-info$"), # noqa: E501 + # regex based on https://github.com/python/cpython/blob/v3.7.0/Lib/distutils/command/bdist_wininst.py#L292 # noqa: E501 + ".exe": re.compile(r"^(?P.+?)-(?P.*?)\.(?P.+?)(-(?P.+?))?\.exe$"), +} + DIST_TYPES = { "bdist_wheel": pkginfo.Wheel, "bdist_wininst": pkginfo.Distribution, @@ -72,6 +87,8 @@ def parse_project_metadata(project): """ package = {} package['name'] = project.get('name') or "" + package['version'] = project.get('version') or "" + package['packagetype'] = project.get('packagetype') or "" package['metadata_version'] = project.get('metadata_version') or "" package['summary'] = project.get('summary') or "" package['description'] = project.get('description') or "" @@ -86,6 +103,7 @@ def parse_project_metadata(project): package['project_url'] = project.get('project_url') or "" package['platform'] = project.get('platform') or "" package['supported_platform'] = project.get('supported_platform') or "" + package['requires_python'] = project.get('requires_python') or "" package['requires_dist'] = json.dumps(project.get('requires_dist', [])) package['provides_dist'] = json.dumps(project.get('provides_dist', [])) package['obsoletes_dist'] = json.dumps(project.get('obsoletes_dist', [])) @@ -93,6 +111,7 @@ def parse_project_metadata(project): package['classifiers'] = json.dumps(project.get('classifiers', [])) package['project_urls'] = json.dumps(project.get('project_urls', {})) package['description_content_type'] = project.get('description_content_type') or "" + package['python_version'] = project.get('python_version') or "" return package @@ -113,17 +132,15 @@ def parse_metadata(project, version, distribution): dictionary: of useful python metadata """ - package = {} + package = parse_project_metadata(project) package['filename'] = distribution.get('filename') or "" package['packagetype'] = distribution.get('packagetype') or "" package['version'] = version package['url'] = distribution.get('url') or "" package['sha256'] = distribution.get('digests', {}).get('sha256') or "" - package['python_version'] = distribution.get('python_version') or "" - package['requires_python'] = distribution.get('requires_python') or "" - - package.update(parse_project_metadata(project)) + package['python_version'] = distribution.get('python_version') or package.get('python_version') + package['requires_python'] = distribution.get('requires_python') or package.get('requires_python') # noqa: E501 return package @@ -147,9 +164,30 @@ def get_project_metadata_from_artifact(filename, artifact): temp_file.flush() metadata = DIST_TYPES[packagetype](temp_file.name) metadata.packagetype = packagetype + if packagetype == "sdist": + metadata.python_version = "source" + else: + pyver = "" + regex = DIST_REGEXES[extensions[pkg_type_index]] + if bdist_name := regex.match(filename): + pyver = bdist_name.group("pyver") or "" + metadata.python_version = pyver return metadata +def artifact_to_python_content_data(filename, artifact, domain=None): + """ + Takes the artifact/filename and returns the metadata needed to create a PythonPackageContent. + """ + metadata = get_project_metadata_from_artifact(filename, artifact) + data = parse_project_metadata(vars(metadata)) + data['sha256'] = artifact.sha256 + data['filename'] = filename + data['pulp_domain'] = domain or artifact.pulp_domain + data['_pulp_domain'] = data['pulp_domain'] + return data + + def python_content_to_json(base_path, content_query, version=None, domain=None): """ Converts a QuerySet of PythonPackageContent into the PyPi JSON format diff --git a/pulp_python/tests/functional/api/test_crud_content_unit.py b/pulp_python/tests/functional/api/test_crud_content_unit.py index eda3cf4e..66439343 100644 --- a/pulp_python/tests/functional/api/test_crud_content_unit.py +++ b/pulp_python/tests/functional/api/test_crud_content_unit.py @@ -122,3 +122,15 @@ def test_upload_metadata_23_spec(python_content_factory): content = python_content_factory(filename, url=package.url) assert content.metadata_version == "2.3" break + + +@pytest.mark.parallel +def test_upload_requires_python(python_content_factory): + filename = "pip-24.3.1-py3-none-any.whl" + with PyPISimple() as client: + page = client.get_project_page("pip") + for package in page.packages: + if package.filename == filename: + content = python_content_factory(filename, url=package.url) + assert content.requires_python == ">=3.8" + break diff --git a/pulp_python/tests/functional/api/test_repair.py b/pulp_python/tests/functional/api/test_repair.py new file mode 100644 index 00000000..792d49c2 --- /dev/null +++ b/pulp_python/tests/functional/api/test_repair.py @@ -0,0 +1,78 @@ +import pytest +import subprocess + +from pulp_python.tests.functional.constants import PYTHON_EGG_FILENAME + + +@pytest.fixture +def create_content_direct(python_bindings): + def _create(artifact_filename, filename, content_data): + commands = ( + "from pulpcore.plugin.models import Artifact, ContentArtifact; " + "from pulpcore.plugin.util import get_url; " + "from pulp_python.app.models import PythonPackageContent; " + f"a = Artifact.init_and_validate('{artifact_filename}'); " + "a.save(); " + f"c = PythonPackageContent(sha256=a.sha256, filename={filename!r}, **{content_data!r}); " # noqa: E501 + "c.save(); " + f"ca = ContentArtifact(artifact=a, content=c, relative_path={filename!r}); " + "ca.save(); " + "print(get_url(c))" + ) + process = subprocess.run(["pulpcore-manager", "shell", "-c", commands], capture_output=True) + + assert process.returncode == 0 + content_href = process.stdout.decode().strip() + return python_bindings.ContentPackagesApi.read(content_href) + + return _create + + +@pytest.fixture +def move_to_repository(python_bindings, monitor_task): + def _move(repo_href, content_hrefs): + body = {"add_content_units": content_hrefs} + task = monitor_task(python_bindings.RepositoriesPythonApi.modify(repo_href, body).task) + assert len(task.created_resources) == 1 + return python_bindings.RepositoriesPythonApi.read(repo_href) + + return _move + + +def test_metadata_repair_command( + create_content_direct, + python_file, + python_repo, + move_to_repository, + python_bindings, + delete_orphans_pre, +): + """Test pulpcore-manager repair-python-metadata command.""" + data = { + "name": "shelf-reader", + # Wrong metadata + "version": "0.2", + "packagetype": "bdist", + "requires_python": ">=3.8", + "author": "ME", + } + content = create_content_direct(python_file, PYTHON_EGG_FILENAME, data) + for field, wrong_value in data.items(): + if field == "python_version": + continue + assert getattr(content, field) == wrong_value + + move_to_repository(python_repo.pulp_href, [content.pulp_href]) + process = subprocess.run( + ["pulpcore-manager", "repair-python-metadata", "--repositories", python_repo.pulp_href], + capture_output=True + ) + assert process.returncode == 0 + output = process.stdout.decode().strip() + assert output == "1 packages processed, 1 package metadata repaired." + + content = python_bindings.ContentPackagesApi.read(content.pulp_href) + assert content.version == "0.1" + assert content.packagetype == "sdist" + assert content.requires_python == "" # technically null + assert content.author == "Austin Macdonald"