Skip to content

Commit

Permalink
Support zstd-compressed repositories
Browse files Browse the repository at this point in the history
A repository may now be compressed using zstd. Extend support
to allow for this, in addition to continuing to support
gzip-encoded repositories.
  • Loading branch information
nicois committed Jul 2, 2024
1 parent 1b38cfd commit fccb028
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 17 deletions.
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ dependencies = [
"python-dateutil>=2.8.1,<3",
"botocore>=1.23.50,<2",
"lxml>=4.6.5,<5",
"zstd>=1.5.5.1",
]

[project.optional-dependencies]
Expand Down
1 change: 1 addition & 0 deletions rpm_s3_mirror.spec
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ Requires: python3-requests
Requires: python3-dateutil
Requires: python3-botocore
Requires: python3-lxml
Requires: python3-zstd
Requires: systemd
Requires: zchunk

Expand Down
39 changes: 24 additions & 15 deletions rpm_s3_mirror/repository.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Copyright (c) 2020 Aiven, Helsinki, Finland. https://aiven.io/
import dataclasses
import lzma
import zstd
import re
import subprocess
from abc import abstractmethod
Expand All @@ -16,6 +17,7 @@
from tempfile import TemporaryDirectory
import os
import shutil
from pathlib import Path
from os.path import join, basename

import gzip
Expand Down Expand Up @@ -211,6 +213,15 @@ def _compress(self, root, open_size, open_checksum):
)


def decompress(filename: Path | str) -> bytes:
try:
with open(filename, "rb") as f:
return zstd.decompress(f.read())
except zstd.Error:
with gzip.open(filename) as f:
return f.read()


class ZCKUpdateInfoSection(UpdateInfoSection):
def _read(self):
return subprocess.check_output(["unzck", self.path, "--stdout"])
Expand Down Expand Up @@ -348,19 +359,18 @@ def create_snapshot(self, scratch_dir):
def _rewrite_primary(self, temp_dir, primary: RepodataSection):
with self._req(self.session.get, path=primary.location, stream=True) as request:
local_path = download_repodata_section(primary, request, temp_dir)
with gzip.open(local_path) as f:
file_bytes = f.read()
primary_xml = safe_parse_xml(xml_bytes=file_bytes)
open_checksum = sha256(content=file_bytes)
open_size = len(file_bytes)
for package_element in primary_xml:
location = package_element.find("common:location", namespaces=namespaces)
# As our S3 structure is https://<base-repo>/snapshots/<snapshot-uuid>/, and the "location"
# attribute of the packages in primary.xml references a path relative to the root like:
# "Packages/v/vim.rmp", we need to rewrite this location to point to back a few directories
# from our snapshot root.
relative_location = f"../../{location.get('href')}"
location.set("href", relative_location)
file_bytes = decompress(local_path)
primary_xml = safe_parse_xml(xml_bytes=file_bytes)
open_checksum = sha256(content=file_bytes)
open_size = len(file_bytes)
for package_element in primary_xml:
location = package_element.find("common:location", namespaces=namespaces)
# As our S3 structure is https://<base-repo>/snapshots/<snapshot-uuid>/, and the "location"
# attribute of the packages in primary.xml references a path relative to the root like:
# "Packages/v/vim.rmp", we need to rewrite this location to point to back a few directories
# from our snapshot root.
relative_location = f"../../{location.get('href')}"
location.set("href", relative_location)

# Now we have rewritten our XML file the checksums no longer match, so calculate some new ones (along with
# size etc from above).
Expand Down Expand Up @@ -404,8 +414,7 @@ def _extract_package_list(self, primary: RepodataSection) -> PackageList:
with self._req(self.session.get, path=primary.location, stream=True) as request:
with TemporaryDirectory(prefix="/var/tmp/") as temp_dir:
local_path = download_repodata_section(primary, request, temp_dir)
with gzip.open(local_path) as f:
return PackageList(base_url=self.base_url, packages_xml=f.read())
return PackageList(base_url=self.base_url, packages_xml=decompress(local_path))

def parse_repomd(self, xml: Element) -> Dict[str, RepodataSection]:
sections = {}
Expand Down
24 changes: 22 additions & 2 deletions tests/test_repository.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Copyright (c) 2020 Aiven, Helsinki, Finland. https://aiven.io/
import pytest

from rpm_s3_mirror.repository import Package, PackageList, RPMRepository, safe_parse_xml
import tempfile
from rpm_s3_mirror.repository import Package, PackageList, RPMRepository, safe_parse_xml, decompress

TEST_BASE_URL = "https://some.repo/some/path"
CHANGED_PACKAGE_NAME = "GMT"
Expand Down Expand Up @@ -91,3 +91,23 @@ def test_parse_repomd_xml(repomd_xml):
def test_reject_http_upstream_repository():
with pytest.raises(ValueError):
RPMRepository(base_url="http://dangerdanger")


GZIP_CONTENT = b"\x1f\x8b\x08\x08\xe0\x84\x84f\x00\x03content\x00+\xc8/I,\xc9\xe7\x02\x00I:&V\x07\x00\x00\x00"
ZSTD_CONTENT = b"(\xb5/\xfd$\x079\x00\x00potato\nE.\xa8%"
UNCOMPRESSED_CONTENT = b"potato\n"


@pytest.mark.parametrize(
["content", "expected"],
[
pytest.param(GZIP_CONTENT, UNCOMPRESSED_CONTENT, id="gzip"),
pytest.param(ZSTD_CONTENT, UNCOMPRESSED_CONTENT, id="zstd"),
],
)
def test_decompress(content: bytes, expected: bytes):
with tempfile.NamedTemporaryFile() as f:
f.write(content)
f.flush()
actual = decompress(f.name)
assert actual == expected

0 comments on commit fccb028

Please sign in to comment.