From fccb028f463a362e731cc20a280e829da0216383 Mon Sep 17 00:00:00 2001 From: Nick Farrell Date: Tue, 2 Jul 2024 12:50:30 +1000 Subject: [PATCH] Support zstd-compressed repositories A repository may now be compressed using zstd. Extend support to allow for this, in addition to continuing to support gzip-encoded repositories. --- pyproject.toml | 1 + rpm_s3_mirror.spec | 1 + rpm_s3_mirror/repository.py | 39 +++++++++++++++++++++++-------------- tests/test_repository.py | 24 +++++++++++++++++++++-- 4 files changed, 48 insertions(+), 17 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 57a29c6..cd2b373 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,6 +31,7 @@ dependencies = [ "python-dateutil>=2.8.1,<3", "botocore>=1.23.50,<2", "lxml>=4.6.5,<5", + "zstd>=1.5.5.1", ] [project.optional-dependencies] diff --git a/rpm_s3_mirror.spec b/rpm_s3_mirror.spec index f934745..3064a01 100644 --- a/rpm_s3_mirror.spec +++ b/rpm_s3_mirror.spec @@ -13,6 +13,7 @@ Requires: python3-requests Requires: python3-dateutil Requires: python3-botocore Requires: python3-lxml +Requires: python3-zstd Requires: systemd Requires: zchunk diff --git a/rpm_s3_mirror/repository.py b/rpm_s3_mirror/repository.py index d81cbb9..b88806b 100644 --- a/rpm_s3_mirror/repository.py +++ b/rpm_s3_mirror/repository.py @@ -1,6 +1,7 @@ # Copyright (c) 2020 Aiven, Helsinki, Finland. 
https://aiven.io/ import dataclasses import lzma +import zstd import re import subprocess from abc import abstractmethod @@ -16,6 +17,7 @@ from tempfile import TemporaryDirectory import os import shutil +from pathlib import Path from os.path import join, basename import gzip @@ -211,6 +213,15 @@ def _compress(self, root, open_size, open_checksum): ) +def decompress(filename: Path | str) -> bytes: + try: + with open(filename, "rb") as f: + return zstd.decompress(f.read()) + except zstd.Error: + with gzip.open(filename) as f: + return f.read() + + class ZCKUpdateInfoSection(UpdateInfoSection): def _read(self): return subprocess.check_output(["unzck", self.path, "--stdout"]) @@ -348,19 +359,18 @@ def create_snapshot(self, scratch_dir): def _rewrite_primary(self, temp_dir, primary: RepodataSection): with self._req(self.session.get, path=primary.location, stream=True) as request: local_path = download_repodata_section(primary, request, temp_dir) - with gzip.open(local_path) as f: - file_bytes = f.read() - primary_xml = safe_parse_xml(xml_bytes=file_bytes) - open_checksum = sha256(content=file_bytes) - open_size = len(file_bytes) - for package_element in primary_xml: - location = package_element.find("common:location", namespaces=namespaces) - # As our S3 structure is https:///snapshots//, and the "location" - # attribute of the packages in primary.xml references a path relative to the root like: - # "Packages/v/vim.rmp", we need to rewrite this location to point to back a few directories - # from our snapshot root. 
- relative_location = f"../../{location.get('href')}" - location.set("href", relative_location) + file_bytes = decompress(local_path) + primary_xml = safe_parse_xml(xml_bytes=file_bytes) + open_checksum = sha256(content=file_bytes) + open_size = len(file_bytes) + for package_element in primary_xml: + location = package_element.find("common:location", namespaces=namespaces) + # As our S3 structure is https:///snapshots//, and the "location" + # attribute of the packages in primary.xml references a path relative to the root like: + # "Packages/v/vim.rpm", we need to rewrite this location to point back a few directories + # from our snapshot root. + relative_location = f"../../{location.get('href')}" + location.set("href", relative_location) # Now we have rewritten our XML file the checksums no longer match, so calculate some new ones (along with # size etc from above). @@ -404,8 +414,7 @@ def _extract_package_list(self, primary: RepodataSection) -> PackageList: with self._req(self.session.get, path=primary.location, stream=True) as request: with TemporaryDirectory(prefix="/var/tmp/") as temp_dir: local_path = download_repodata_section(primary, request, temp_dir) - with gzip.open(local_path) as f: - return PackageList(base_url=self.base_url, packages_xml=f.read()) + return PackageList(base_url=self.base_url, packages_xml=decompress(local_path)) def parse_repomd(self, xml: Element) -> Dict[str, RepodataSection]: sections = {} diff --git a/tests/test_repository.py b/tests/test_repository.py index 9da988e..f9d4b3c 100644 --- a/tests/test_repository.py +++ b/tests/test_repository.py @@ -1,7 +1,7 @@ # Copyright (c) 2020 Aiven, Helsinki, Finland. 
https://aiven.io/ import pytest - -from rpm_s3_mirror.repository import Package, PackageList, RPMRepository, safe_parse_xml +import tempfile +from rpm_s3_mirror.repository import Package, PackageList, RPMRepository, safe_parse_xml, decompress TEST_BASE_URL = "https://some.repo/some/path" CHANGED_PACKAGE_NAME = "GMT" @@ -91,3 +91,23 @@ def test_parse_repomd_xml(repomd_xml): def test_reject_http_upstream_repository(): with pytest.raises(ValueError): RPMRepository(base_url="http://dangerdanger") + + +GZIP_CONTENT = b"\x1f\x8b\x08\x08\xe0\x84\x84f\x00\x03content\x00+\xc8/I,\xc9\xe7\x02\x00I:&V\x07\x00\x00\x00" +ZSTD_CONTENT = b"(\xb5/\xfd$\x079\x00\x00potato\nE.\xa8%" +UNCOMPRESSED_CONTENT = b"potato\n" + + +@pytest.mark.parametrize( + ["content", "expected"], + [ + pytest.param(GZIP_CONTENT, UNCOMPRESSED_CONTENT, id="gzip"), + pytest.param(ZSTD_CONTENT, UNCOMPRESSED_CONTENT, id="zstd"), + ], +) +def test_decompress(content: bytes, expected: bytes): + with tempfile.NamedTemporaryFile() as f: + f.write(content) + f.flush() + actual = decompress(f.name) + assert actual == expected