From 59f1d05d0fbd1ebae492db5b08126bd0f2095bff Mon Sep 17 00:00:00 2001 From: Felix Fontein Date: Tue, 2 May 2023 21:25:56 +0200 Subject: [PATCH] Allow Galaxy downloader to trust collection cache (#78) * Normalize the collection cache filename so we don't depend on Galaxy servers always using the same method. * Allow to trust collection cache. * Improve docs, add to sample config. * Follow-up to previous PR. * Fix typo. * Improvements from review. * Use antsibull_core.utils.io.copy_file instead of shutil.copyfile. * Skip check part of copying. * Reformat. * Update changelog. --- antsibull.cfg | 8 +++- changelogs/fragments/78-collection-cache.yml | 4 ++ src/antsibull_core/galaxy.py | 45 ++++++++++++-------- src/antsibull_core/schemas/context.py | 11 ++++- src/antsibull_core/utils/io.py | 9 +++- 5 files changed, 55 insertions(+), 22 deletions(-) create mode 100644 changelogs/fragments/78-collection-cache.yml diff --git a/antsibull.cfg b/antsibull.cfg index 685a919..57f039f 100644 --- a/antsibull.cfg +++ b/antsibull.cfg @@ -2,7 +2,7 @@ # GNU General Public License v3.0+ (see LICENSES/GPL-3.0-or-later.txt or https://www.gnu.org/licenses/gpl-3.0.txt) # SPDX-License-Identifier: GPL-3.0-or-later -ansible_base_url = https://github.com/ansible/ansible/ +ansible_core_repo_url = https://github.com/ansible/ansible/ # Number of bytes to read or write at one time for network or file I/O: chunksize = 4096 galaxy_url = https://galaxy.ansible.com/ @@ -23,6 +23,12 @@ file_check_content = 262144 # to cache downloaded collection artefacts from Ansible Galaxy in that directory # and speed up the docs build time: # collection_cache = ~/.cache/antsibull/collections +# Uncomment the following to trust the collection cache. This means that antsibull-core +# will assume that if the collection cache contains an artifact, it is the current one +# available on the Galaxy server. 
This avoids making a request to the Galaxy server to +# figure out the artifact's checksum and comparing it before trusting the cached +# artifact. +# trust_collection_cache = true logging_cfg = { version = 1.0 outputs = { diff --git a/changelogs/fragments/78-collection-cache.yml b/changelogs/fragments/78-collection-cache.yml new file mode 100644 index 0000000..4fdcce2 --- /dev/null +++ b/changelogs/fragments/78-collection-cache.yml @@ -0,0 +1,4 @@ +minor_changes: + - "Avoid using the collection artifact filename returned by the Galaxy server. Instead compose it in a uniform way (https://github.com/ansible-community/antsibull-core/pull/78)." + - "Allow the Galaxy downloader to trust its collection cache to avoid having to query the Galaxy server if an artifact exists in the cache. This can be set with the new configuration file option ``trust_collection_cache`` (https://github.com/ansible-community/antsibull-core/pull/78)." + - "Allow to skip content check when doing async file copying using ``antsibull_core.utils.io.copy_file()`` (https://github.com/ansible-community/antsibull-core/pull/78)." diff --git a/src/antsibull_core/galaxy.py b/src/antsibull_core/galaxy.py index c750cfb..7fb852e 100644 --- a/src/antsibull_core/galaxy.py +++ b/src/antsibull_core/galaxy.py @@ -10,7 +10,6 @@ import asyncio import dataclasses import os.path -import shutil import typing as t from enum import Enum from urllib.parse import urljoin @@ -21,6 +20,7 @@ from . import app_context from .utils.hashing import verify_hash from .utils.http import retry_get +from .utils.io import copy_file # The type checker can handle finding aiohttp.client but flake8 cannot :-( if t.TYPE_CHECKING: @@ -384,6 +384,7 @@ def __init__( galaxy_server: t.Optional[str] = None, collection_cache: str | None = None, context: t.Optional[GalaxyContext] = None, + trust_collection_cache: t.Optional[bool] = None, ) -> None: """ Create an object to download collections from galaxy. 
@@ -398,12 +399,20 @@ def __init__( These tarballs will be used instead of downloading new tarballs provided that the versions match the criteria (latest compatible version known to galaxy). Defaults to ``lib_ctx.get().collection_cache``. + :kwarg trust_collection_cache: If set to ``True``, will assume that if the collection + cache contains an artifact, it is the current one available on the Galaxy server. + This avoids making a request to the Galaxy server to figure out the artifact's + checksum and comparing it before trusting the cached artifact. """ super().__init__(aio_session, galaxy_server=galaxy_server, context=context) self.download_dir = download_dir + lib_ctx = app_context.lib_ctx.get() if collection_cache is None: - collection_cache = app_context.lib_ctx.get().collection_cache + collection_cache = lib_ctx.collection_cache self.collection_cache: t.Final[str | None] = collection_cache + if trust_collection_cache is None: + trust_collection_cache = lib_ctx.trust_collection_cache + self.trust_collection_cache: t.Final[bool] = trust_collection_cache async def download( self, @@ -419,24 +428,26 @@ async def download( :arg version: Version of the collection to download. :returns: The full path to the downloaded collection. 
""" - collection = collection.replace(".", "/") - release_info = await self.get_release_info(collection, version) + namespace, name = collection.split(".", 1) + filename = f"{namespace}-{name}-{version}.tar.gz" + download_filename = os.path.join(self.download_dir, filename) + + if self.collection_cache and self.trust_collection_cache: + cached_copy = os.path.join(self.collection_cache, filename) + if os.path.isfile(cached_copy): + await copy_file(cached_copy, download_filename, check_content=False) + return download_filename + + release_info = await self.get_release_info(f"{namespace}/{name}", version) release_url = release_info["download_url"] - download_filename = os.path.join( - self.download_dir, release_info["artifact"]["filename"] - ) sha256sum = release_info["artifact"]["sha256"] if self.collection_cache: - if release_info["artifact"]["filename"] in os.listdir( - self.collection_cache - ): - cached_copy = os.path.join( - self.collection_cache, release_info["artifact"]["filename"] - ) + cached_copy = os.path.join(self.collection_cache, filename) + if os.path.isfile(cached_copy): if await verify_hash(cached_copy, sha256sum): - shutil.copyfile(cached_copy, download_filename) + await copy_file(cached_copy, download_filename, check_content=False) return download_filename async with retry_get( @@ -459,10 +470,8 @@ async def download( # Copy downloaded collection into cache if self.collection_cache: - cached_copy = os.path.join( - self.collection_cache, release_info["artifact"]["filename"] - ) - shutil.copyfile(download_filename, cached_copy) + cached_copy = os.path.join(self.collection_cache, filename) + await copy_file(download_filename, cached_copy, check_content=False) return download_filename diff --git a/src/antsibull_core/schemas/context.py b/src/antsibull_core/schemas/context.py index 6fbbecb..acfb660 100644 --- a/src/antsibull_core/schemas/context.py +++ b/src/antsibull_core/schemas/context.py @@ -11,7 +11,7 @@ from ..utils.collections import ContextDict 
from .config import DEFAULT_LOGGING_CONFIG, LoggingModel -from .validators import convert_none, convert_path +from .validators import convert_bool, convert_none, convert_path class BaseModel(p.BaseModel): @@ -111,6 +111,10 @@ class LibContext(BaseModel): :ivar pypi_url: URL of the pypi server to query for information :ivar collection_cache: If set, must be a path pointing to a directory where collection tarballs are cached so they do not need to be downloaded from Galaxy twice. + :ivar trust_collection_cache: If set to ``True``, will assume that if the collection + cache contains an artifact, it is the current one available on the Galaxy server. + This avoids making a request to the Galaxy server to figure out the artifact's + checksum and comparing it before trusting the cached artifact. """ chunksize: int = 4096 @@ -131,6 +135,7 @@ class LibContext(BaseModel): # pyre-ignore[8]: https://github.com/samuelcolvin/pydantic/issues/1684 pypi_url: p.HttpUrl = "https://pypi.org/" # type: ignore[assignment] collection_cache: t.Optional[str] = None + trust_collection_cache: bool = False # pylint: disable-next=unused-private-member __convert_nones = p.validator("process_max", pre=True, allow_reuse=True)( @@ -140,3 +145,7 @@ class LibContext(BaseModel): __convert_paths = p.validator("collection_cache", pre=True, allow_reuse=True)( convert_path ) + # pylint: disable-next=unused-private-member + __convert_bools = p.validator("trust_collection_cache", pre=True, allow_reuse=True)( + convert_bool + ) diff --git a/src/antsibull_core/utils/io.py b/src/antsibull_core/utils/io.py index 96f067a..ac7cf48 100644 --- a/src/antsibull_core/utils/io.py +++ b/src/antsibull_core/utils/io.py @@ -22,18 +22,23 @@ mlog = log.fields(mod=__name__) -async def copy_file(source_path: StrOrBytesPath, dest_path: StrOrBytesPath) -> None: +async def copy_file( + source_path: StrOrBytesPath, dest_path: StrOrBytesPath, check_content: bool = True +) -> None: """ Copy content from one file to another. 
:arg source_path: Source path. Must be a file. :arg dest_path: Destination path. + :kwarg check_content: If ``True`` (default) and ``lib_ctx.file_check_content > 0`` and the + destination file exists, first check whether source and destination are potentially equal + before actually copying. """ flog = mlog.fields(func="copy_file") flog.debug("Enter") lib_ctx = app_context.lib_ctx.get() - if lib_ctx.file_check_content > 0: + if check_content and lib_ctx.file_check_content > 0: # Check whether the destination file exists and has the same content as the source file, # in which case we won't overwrite the destination file try: