From ab001644f390c029ceec1e91fd8330283a4f62ba Mon Sep 17 00:00:00 2001
From: Matthias Dellweg
Date: Wed, 6 Nov 2024 13:28:31 +0100
Subject: [PATCH 1/2] [ZDU] Finalize Collection Version sha256

This allows uploads of different files claiming to be the same collection
into different repositories.

relates to #1052
---
 CHANGES/1052.feature                          |  2 +
 CHANGES/1052.removal                          |  7 +++
 .../datarepair-ansible-collection-sha256.py   | 56 -------------------
 .../0058_collectionversion_unique_sha256.py   | 25 +++++++++
 pulp_ansible/app/models.py                    |  9 +--
 pulp_ansible/app/serializers.py               | 24 +-------
 pulp_ansible/app/tasks/collections.py         |  8 +--
 pulp_ansible/app/tasks/upload.py              |  4 +-
 .../test_crud_collection_versions.py          | 32 +++++++++++
 ..._0057_collection_version_sha256_migrate.py |  6 ++
 10 files changed, 83 insertions(+), 90 deletions(-)
 create mode 100644 CHANGES/1052.feature
 create mode 100644 CHANGES/1052.removal
 delete mode 100644 pulp_ansible/app/management/commands/datarepair-ansible-collection-sha256.py
 create mode 100644 pulp_ansible/app/migrations/0058_collectionversion_unique_sha256.py

diff --git a/CHANGES/1052.feature b/CHANGES/1052.feature
new file mode 100644
index 000000000..bcab5c4a3
--- /dev/null
+++ b/CHANGES/1052.feature
@@ -0,0 +1,2 @@
+The CollectionVersion global uniqueness constraint is now its sha256 digest. Repository-level
+uniqueness is still (namespace, name, version).
diff --git a/CHANGES/1052.removal b/CHANGES/1052.removal
new file mode 100644
index 000000000..2d10a4d3b
--- /dev/null
+++ b/CHANGES/1052.removal
@@ -0,0 +1,7 @@
+Added the final migration to make the sha256 of the collection version artifact the uniqueness
+constraint. This allows users to serve their own interpretation of the content in their private
+repositories.
+The migration will only succeed if all the content has been adjusted. To account for content that
+was not migrated by the migration shipped with 0.22.0, you can run the content repair command
+``datarepair-ansible-collection-sha256`` prior to upgrading.
+This version removes the content repair command.
diff --git a/pulp_ansible/app/management/commands/datarepair-ansible-collection-sha256.py b/pulp_ansible/app/management/commands/datarepair-ansible-collection-sha256.py
deleted file mode 100644
index cd3c615ae..000000000
--- a/pulp_ansible/app/management/commands/datarepair-ansible-collection-sha256.py
+++ /dev/null
@@ -1,56 +0,0 @@
-from gettext import gettext as _
-
-from django.core.management import BaseCommand
-from django.db import transaction
-
-from pulp_ansible.app.models import CollectionVersion
-
-
-class Command(BaseCommand):
-    """
-    Django management command to repair ansible collection versions without sha256.
-    """
-
-    help = (
-        "This script repairs ansible collection versions without sha256 if artifacts are available."
- ) - - def add_arguments(self, parser): - """Set up arguments.""" - parser.add_argument( - "--dry-run", - action="store_true", - help=_("Don't modify anything, just collect results."), - ) - - def handle(self, *args, **options): - dry_run = options["dry_run"] - failed_units = 0 - repaired_units = 0 - - unit_qs = CollectionVersion.objects.filter(sha256__isnull=True) - count = unit_qs.count() - print(f"CollectionVersions to repair: {count}") - if count == 0: - return - - for unit in unit_qs.prefetch_related("contenartifact_set").iterator(): - try: - content_artifact = unit.contentartifact_set.get() - artifact = content_artifact.artifact - unit.sha256 = artifact.sha256 - - if not dry_run: - with transaction.atomic(): - unit.save(update_fields=["sha256"]) - except Exception as e: - failed_units += 1 - print( - f"Failed to migrate collection version '{unit.namespace}.{unit.name}' " - f"'{unit.version}': {e}" - ) - else: - repaired_units += 1 - - print(f"Successfully repaired collection versions: {repaired_units}") - print(f"Collection versions failed to repair: {failed_units}") diff --git a/pulp_ansible/app/migrations/0058_collectionversion_unique_sha256.py b/pulp_ansible/app/migrations/0058_collectionversion_unique_sha256.py new file mode 100644 index 000000000..99c888d3f --- /dev/null +++ b/pulp_ansible/app/migrations/0058_collectionversion_unique_sha256.py @@ -0,0 +1,25 @@ +# Generated by Django 3.2.14 on 2022-07-21 23:05 + +from django.db import migrations, models +from pulpcore.plugin.migrations import RequireVersion + + +class Migration(migrations.Migration): + + dependencies = [ + ('ansible', '0057_collectionversion_sha256_migrate'), + ('core', '0110_apiappstatus'), + ] + + operations = [ + RequireVersion("ansible", "0.23.0"), + migrations.AlterField( + model_name='collectionversion', + name='sha256', + field=models.CharField(db_index=True, max_length=64, null=False), + ), + migrations.AlterUniqueTogether( + name='collectionversion', + unique_together={('sha256',)}, + ), + ] diff --git a/pulp_ansible/app/models.py b/pulp_ansible/app/models.py index d4accf68f..3274d00c4 100644 --- a/pulp_ansible/app/models.py +++ b/pulp_ansible/app/models.py @@ -180,7 +180,7 @@ class CollectionVersion(Content): namespace = models.CharField(max_length=64, editable=False) repository = models.CharField(default="", blank=True, max_length=2000, editable=False) requires_ansible = models.CharField(null=True, max_length=255) - sha256 = models.CharField(max_length=64, null=True, blank=False) + sha256 = models.CharField(max_length=64, db_index=True, null=False, blank=False) version = models.CharField(max_length=128, db_collation="pulp_ansible_semver") version_major = models.IntegerField() @@ -207,11 +207,6 @@ class CollectionVersion(Content): # search_vector is built. search_vector = psql_search.SearchVectorField(default="") - @classmethod - def natural_key_fields(cls): - # This method should be removed when the uniqueness is properly switched to sha256. 
- return cls._meta.unique_together[0] - @hook(BEFORE_SAVE) def calculate_version_parts(self): v = Version(self.version) @@ -236,7 +231,7 @@ def __str__(self): class Meta: default_related_name = "%(app_label)s_%(model_name)s" - unique_together = (("namespace", "name", "version"), ("sha256",)) + unique_together = ("sha256",) constraints = [ UniqueConstraint( fields=("collection", "is_highest"), diff --git a/pulp_ansible/app/serializers.py b/pulp_ansible/app/serializers.py index 0cd9c86f5..ff6582d3e 100644 --- a/pulp_ansible/app/serializers.py +++ b/pulp_ansible/app/serializers.py @@ -505,10 +505,9 @@ def deferred_validate(self, data): # Call super to ensure that data contains artifact data = super().deferred_validate(data) artifact = data.get("artifact") + if (sha256 := data.get("sha256")) and sha256 != artifact.sha256: raise ValidationError(_("Expected sha256 did not match uploaded artifact's sha256")) - else: - data["sha256"] = artifact.sha256 collection_info = process_collection_artifact( artifact=artifact, @@ -519,35 +518,18 @@ def deferred_validate(self, data): # repository field clashes collection_info["origin_repository"] = collection_info.pop("repository", None) data.update(collection_info) - # `retrieve` needs artifact, but it won't be in validated_data because of `get_artifacts` - self.context["artifact"] = artifact return data def retrieve(self, validated_data): """Reuse existing CollectionVersion if provided artifact matches.""" - namespace = validated_data["namespace"] - name = validated_data["name"] - version = validated_data["version"] - artifact = self.context["artifact"] - # TODO switch this check to use digest when ColVersion uniqueness constraint is changed - col = CollectionVersion.objects.filter( - namespace=namespace, name=name, version=version - ).first() - if col: - if col._artifacts.get() != artifact: - raise ValidationError( - _("Collection {}.{}-{} already exists with a different artifact").format( - namespace, name, version - ) - ) - - return col + return CollectionVersion.objects.filter(sha256=validated_data["sha256"]).first() def create(self, validated_data): """Final step in creating the CollectionVersion.""" tags = validated_data.pop("tags") origin_repository = validated_data.pop("origin_repository") + # Create CollectionVersion from its metadata and adds to repository if specified content = super().create(validated_data) diff --git a/pulp_ansible/app/tasks/collections.py b/pulp_ansible/app/tasks/collections.py index a6b2b08fc..01141d8dc 100644 --- a/pulp_ansible/app/tasks/collections.py +++ b/pulp_ansible/app/tasks/collections.py @@ -145,7 +145,7 @@ async def declarative_content_from_git_repo(remote, url, git_ref=None, metadata_ version = metadata["metadata"]["version"] else: raise e - collection_version = await sync_to_async(CollectionVersion.objects.get)( + collection_version = await CollectionVersion.objects.aget( namespace=namespace, name=name, version=version ) if artifact is None: @@ -275,7 +275,7 @@ def import_collection( temp_file = PulpTemporaryFile.objects.get(pk=temp_file_pk) filename = CollectionFilename(expected_namespace, expected_name, expected_version) log.info(f"Processing collection from {temp_file.file.name}") - user_facing_logger = logging.getLogger("pulp_ansible.app.tasks.collection.import_collection") + user_facing_logger = logging.getLogger("pulp_ansible.app.tasks.collections.import_collection") try: with temp_file.file.open() as artifact_file: @@ -673,9 +673,7 @@ async def _add_namespace(self, name, namespace_sha): """Adds A Namespace 
metadata content to the pipeline."""
         try:
-            ns = await sync_to_async(AnsibleNamespaceMetadata.objects.get)(
-                metadata_sha256=namespace_sha
-            )
+            ns = await AnsibleNamespaceMetadata.objects.aget(metadata_sha256=namespace_sha)
             await self.put(DeclarativeContent(ns))
             return True
         except AnsibleNamespaceMetadata.DoesNotExist:
diff --git a/pulp_ansible/app/tasks/upload.py b/pulp_ansible/app/tasks/upload.py
index 0ce543d51..dea90217c 100644
--- a/pulp_ansible/app/tasks/upload.py
+++ b/pulp_ansible/app/tasks/upload.py
@@ -25,7 +25,9 @@ def process_collection_artifact(artifact, namespace, name, version):
     # Set up logging for CollectionImport object
     CollectionImport.objects.get_or_create(task_id=Task.current().pulp_id)
-    user_facing_logger = logging.getLogger("pulp_ansible.app.tasks.collection.import_collection")
+    user_facing_logger = logging.getLogger(
+        "pulp_ansible.app.tasks.upload.process_collection_artifact"
+    )
 
     artifact_url = reverse("artifacts-detail", args=[artifact.pk])
     filename = CollectionFilename(namespace, name, version)
diff --git a/pulp_ansible/tests/functional/api/collection/test_crud_collection_versions.py b/pulp_ansible/tests/functional/api/collection/test_crud_collection_versions.py
index 3e23b4676..b966f041f 100644
--- a/pulp_ansible/tests/functional/api/collection/test_crud_collection_versions.py
+++ b/pulp_ansible/tests/functional/api/collection/test_crud_collection_versions.py
@@ -1,5 +1,8 @@
 """Tests related to sync ansible plugin collection content type."""
 
+import hashlib
+from pathlib import Path
+
 import pytest
 
 from pulp_ansible.tests.functional.utils import randstr
@@ -99,3 +102,32 @@ def test_content_unit_lifecycle(ansible_bindings, build_and_upload_collection, m
         ansible_bindings.ContentCollectionVersionsApi.create(file=collection_artifact.filename).task
     )
     assert content_unit_href in create_task.created_resources
+
+
+@pytest.mark.parallel
+def test_duplicate_collection_key(
+    ansible_bindings,
+    ansible_repo_factory,
+    build_and_upload_collection,
+    monitor_task,
+):
+    """Create two content units with the same name but different artifacts."""
+    repository1 = ansible_repo_factory()
+    repository2 = ansible_repo_factory()
+
+    attrs = {"namespace": randstr(), "name": "squeezer", "version": "0.0.9"}
+    collection_artifact1, content_unit_href1 = build_and_upload_collection(
+        repository1, config=attrs
+    )
+    collection_artifact2, content_unit_href2 = build_and_upload_collection(
+        repository2, config=attrs
+    )
+
+    sha256_1 = hashlib.sha256(Path(collection_artifact1.filename).read_bytes()).hexdigest()
+    sha256_2 = hashlib.sha256(Path(collection_artifact2.filename).read_bytes()).hexdigest()
+    assert sha256_1 != sha256_2
+
+    content_unit_1 = ansible_bindings.ContentCollectionVersionsApi.read(content_unit_href1)
+    assert content_unit_1.sha256 == sha256_1
+    content_unit_2 = ansible_bindings.ContentCollectionVersionsApi.read(content_unit_href2)
+    assert content_unit_2.sha256 == sha256_2
diff --git a/pulp_ansible/tests/unit/migrations/test_0057_collection_version_sha256_migrate.py b/pulp_ansible/tests/unit/migrations/test_0057_collection_version_sha256_migrate.py
index c81f89afe..f1f1419fa 100644
--- a/pulp_ansible/tests/unit/migrations/test_0057_collection_version_sha256_migrate.py
+++ b/pulp_ansible/tests/unit/migrations/test_0057_collection_version_sha256_migrate.py
@@ -37,3 +37,9 @@ def test_collection_version_sha256_migrate(migrate):
 
     cv = CollectionVersion.objects.get(pk=cv.pk)
     assert cv.sha256 == "SENTINEL"
+
+    apps = migrate([("ansible", "0058_collectionversion_unique_sha256")])
+    CollectionVersion = apps.get_model("ansible", "CollectionVersion")
+
+    cv = CollectionVersion.objects.get(pk=cv.pk)
+    assert cv.sha256 == "SENTINEL"

From ae7321ee2223a0fa856ae1c303e12ef3136e0cac Mon Sep 17 00:00:00 2001
From: Matthias Dellweg
Date: Thu, 21 Nov 2024 09:54:53 +0100
Subject: [PATCH 2/2] Remove deprecated is_highest from CollectionVersion

fixes #1550
---
 CHANGES/1550.removal                          |  1 +
 ...ctionversion_unique_is_highest_and_more.py | 23 +++++++++++++++++++
 pulp_ansible/app/models.py                    | 19 +--------------
 3 files changed, 25 insertions(+), 18 deletions(-)
 create mode 100644 CHANGES/1550.removal
 create mode 100644 pulp_ansible/app/migrations/0059_remove_collectionversion_unique_is_highest_and_more.py

diff --git a/CHANGES/1550.removal b/CHANGES/1550.removal
new file mode 100644
index 000000000..88abd48c6
--- /dev/null
+++ b/CHANGES/1550.removal
@@ -0,0 +1 @@
+Remove the `is_highest` field from the CollectionVersion table.
diff --git a/pulp_ansible/app/migrations/0059_remove_collectionversion_unique_is_highest_and_more.py b/pulp_ansible/app/migrations/0059_remove_collectionversion_unique_is_highest_and_more.py
new file mode 100644
index 000000000..a88d38343
--- /dev/null
+++ b/pulp_ansible/app/migrations/0059_remove_collectionversion_unique_is_highest_and_more.py
@@ -0,0 +1,23 @@
+# Generated by Django 4.2.16 on 2024-11-21 08:53
+
+from django.db import migrations
+from pulpcore.plugin.migrations import RequireVersion
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("ansible", "0058_collectionversion_unique_sha256"),
+    ]
+
+    operations = [
+        RequireVersion("ansible", "0.23.0"),
+        migrations.RemoveConstraint(
+            model_name="collectionversion",
+            name="unique_is_highest",
+        ),
+        migrations.RemoveField(
+            model_name="collectionversion",
+            name="is_highest",
+        ),
+    ]
diff --git a/pulp_ansible/app/models.py b/pulp_ansible/app/models.py
index 3274d00c4..d4b6cf2cb 100644
--- a/pulp_ansible/app/models.py
+++ b/pulp_ansible/app/models.py
@@ -6,7 +6,6 @@
 
 from django.conf import settings
 from django.db import models
-from django.db.models import UniqueConstraint, Q
 from django.db.utils import IntegrityError
 from django.contrib.postgres import fields as psql_fields
 from django.contrib.postgres import search as psql_search
@@ -150,10 +149,6 @@ class CollectionVersion(Content):
         repository (models.CharField): The URL of the originating SCM repository.
         version (models.CharField): The version of the collection.
         requires_ansible (models.CharField): The version of Ansible required to use the collection.
-        is_highest (models.BooleanField): Indicates that the version is the highest one
-            in the collection. Import and sync workflows update this field, which then
-            triggers the database to [re]build the search_vector.
-            This field is Deprecated and scheduled for removal as soon as 0.24.0.
 
     Relations:
 
@@ -188,9 +183,6 @@ class CollectionVersion(Content):
     version_patch = models.IntegerField()
     version_prerelease = models.CharField(max_length=128)
 
-    # This field is deprecated. We keep it for some releases for 0-Downtime upgrades.
-    is_highest = models.BooleanField(editable=False, default=False)
-
     # Foreign Key Fields
     collection = models.ForeignKey(
         Collection, on_delete=models.PROTECT, related_name="versions", editable=False
@@ -202,9 +194,7 @@ class CollectionVersion(Content):
    # a migration file. The trigger only runs when the table is
    # updated. 
CollectionVersions are INSERT'ed into the table, so # the search_vector does not get populated at initial creation - # time. In the import or sync workflows, is_highest gets toggled - # back and forth, which causes an UPDATE operation and then the - # search_vector is built. + # time. search_vector = psql_search.SearchVectorField(default="") @hook(BEFORE_SAVE) @@ -232,13 +222,6 @@ def __str__(self): class Meta: default_related_name = "%(app_label)s_%(model_name)s" unique_together = ("sha256",) - constraints = [ - UniqueConstraint( - fields=("collection", "is_highest"), - name="unique_is_highest", - condition=Q(is_highest=True), - ) - ] class CollectionVersionMark(Content):
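
A rough sketch of the pre-upgrade repair step referenced in CHANGES/1052.removal above. Only the
command name and its --dry-run flag come from the management command that the first patch removes;
the surrounding environment (a configured Pulp/Django setup on the release that still ships the
command, for example a ``pulpcore-manager shell`` session) is assumed:

    # Run on the previous release, before applying migration 0058, with Django settings loaded.
    from django.core.management import call_command

    # Preview how many CollectionVersions still lack a sha256; dry_run makes no changes.
    call_command("datarepair-ansible-collection-sha256", dry_run=True)

    # Backfill sha256 from the stored artifacts, then upgrade and run the migrations.
    call_command("datarepair-ansible-collection-sha256")

Migration 0058 only succeeds once every CollectionVersion row carries a sha256, and both new
migrations guard the zero-downtime upgrade path with RequireVersion("ansible", "0.23.0").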