From fccb028f463a362e731cc20a280e829da0216383 Mon Sep 17 00:00:00 2001
From: Nick Farrell <nick.farrell@aiven.io>
Date: Tue, 2 Jul 2024 12:50:30 +1000
Subject: [PATCH] Support zstd-compressed repositories

Upstream repositories may now be compressed with zstd. Decompress
zstd-compressed repository data when it is encountered, while
continuing to support gzip-compressed repositories.
---
 pyproject.toml              |  1 +
 rpm_s3_mirror.spec          |  1 +
 rpm_s3_mirror/repository.py | 39 +++++++++++++++++++++++--------------
 tests/test_repository.py    | 24 +++++++++++++++++++++--
 4 files changed, 48 insertions(+), 17 deletions(-)
diff --git a/pyproject.toml b/pyproject.toml
index 57a29c6..cd2b373 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -31,6 +31,7 @@ dependencies = [
     "python-dateutil>=2.8.1,<3",
     "botocore>=1.23.50,<2",
     "lxml>=4.6.5,<5",
+    "zstd>=1.5.5.1",
 ]
 
 [project.optional-dependencies]
diff --git a/rpm_s3_mirror.spec b/rpm_s3_mirror.spec
index f934745..3064a01 100644
--- a/rpm_s3_mirror.spec
+++ b/rpm_s3_mirror.spec
@@ -13,6 +13,7 @@ Requires:       python3-requests
 Requires:       python3-dateutil
 Requires:       python3-botocore
 Requires:       python3-lxml
+Requires:       python3-zstd
 Requires:       systemd
 Requires:       zchunk
 
diff --git a/rpm_s3_mirror/repository.py b/rpm_s3_mirror/repository.py
index d81cbb9..b88806b 100644
--- a/rpm_s3_mirror/repository.py
+++ b/rpm_s3_mirror/repository.py
@@ -1,6 +1,7 @@
 # Copyright (c) 2020 Aiven, Helsinki, Finland. https://aiven.io/
 import dataclasses
 import lzma
+import zstd
 import re
 import subprocess
 from abc import abstractmethod
@@ -16,6 +17,7 @@
 from tempfile import TemporaryDirectory
 import os
 import shutil
+from pathlib import Path
 from os.path import join, basename
 
 import gzip
@@ -211,6 +213,15 @@ def _compress(self, root, open_size, open_checksum):
         )
 
 
+def decompress(filename: Path | str) -> bytes:  # Return the decompressed bytes of `filename` (zstd, else gzip).
+    try:  # zstd is attempted first; on zstd.Error the except branch falls back to gzip.
+        with open(filename, "rb") as f:
+            return zstd.decompress(f.read())
+    except zstd.Error:
+        with gzip.open(filename) as f:  # Fallback: re-open the same file by name through gzip.
+            return f.read()
+
+
 class ZCKUpdateInfoSection(UpdateInfoSection):
     def _read(self):
         return subprocess.check_output(["unzck", self.path, "--stdout"])
@@ -348,19 +359,18 @@ def create_snapshot(self, scratch_dir):
     def _rewrite_primary(self, temp_dir, primary: RepodataSection):
         with self._req(self.session.get, path=primary.location, stream=True) as request:
             local_path = download_repodata_section(primary, request, temp_dir)
-            with gzip.open(local_path) as f:
-                file_bytes = f.read()
-                primary_xml = safe_parse_xml(xml_bytes=file_bytes)
-                open_checksum = sha256(content=file_bytes)
-                open_size = len(file_bytes)
-                for package_element in primary_xml:
-                    location = package_element.find("common:location", namespaces=namespaces)
-                    # As our S3 structure is https://<base-repo>/snapshots/<snapshot-uuid>/, and the "location"
-                    # attribute of the packages in primary.xml references a path relative to the root like:
-                    # "Packages/v/vim.rmp", we need to rewrite this location to point to back a few directories
-                    # from our snapshot root.
-                    relative_location = f"../../{location.get('href')}"
-                    location.set("href", relative_location)
+            file_bytes = decompress(local_path)
+            primary_xml = safe_parse_xml(xml_bytes=file_bytes)
+            open_checksum = sha256(content=file_bytes)
+            open_size = len(file_bytes)
+            for package_element in primary_xml:
+                location = package_element.find("common:location", namespaces=namespaces)
+                # As our S3 structure is https://<base-repo>/snapshots/<snapshot-uuid>/, and the "location"
+                # attribute of the packages in primary.xml references a path relative to the root like:
+                # "Packages/v/vim.rpm", we need to rewrite this location to point back a few directories
+                # from our snapshot root.
+                relative_location = f"../../{location.get('href')}"
+                location.set("href", relative_location)
 
             # Now we have rewritten our XML file the checksums no longer match, so calculate some new ones (along with
             # size etc from above).
@@ -404,8 +414,7 @@ def _extract_package_list(self, primary: RepodataSection) -> PackageList:
         with self._req(self.session.get, path=primary.location, stream=True) as request:
             with TemporaryDirectory(prefix="/var/tmp/") as temp_dir:
                 local_path = download_repodata_section(primary, request, temp_dir)
-                with gzip.open(local_path) as f:
-                    return PackageList(base_url=self.base_url, packages_xml=f.read())
+                return PackageList(base_url=self.base_url, packages_xml=decompress(local_path))
 
     def parse_repomd(self, xml: Element) -> Dict[str, RepodataSection]:
         sections = {}
diff --git a/tests/test_repository.py b/tests/test_repository.py
index 9da988e..f9d4b3c 100644
--- a/tests/test_repository.py
+++ b/tests/test_repository.py
@@ -1,7 +1,7 @@
 # Copyright (c) 2020 Aiven, Helsinki, Finland. https://aiven.io/
 import pytest
-
-from rpm_s3_mirror.repository import Package, PackageList, RPMRepository, safe_parse_xml
+import tempfile
+from rpm_s3_mirror.repository import Package, PackageList, RPMRepository, safe_parse_xml, decompress
 
 TEST_BASE_URL = "https://some.repo/some/path"
 CHANGED_PACKAGE_NAME = "GMT"
@@ -91,3 +91,23 @@ def test_parse_repomd_xml(repomd_xml):
 def test_reject_http_upstream_repository():
     with pytest.raises(ValueError):
         RPMRepository(base_url="http://dangerdanger")
+
+
+GZIP_CONTENT = b"\x1f\x8b\x08\x08\xe0\x84\x84f\x00\x03content\x00+\xc8/I,\xc9\xe7\x02\x00I:&V\x07\x00\x00\x00"
+ZSTD_CONTENT = b"(\xb5/\xfd$\x079\x00\x00potato\nE.\xa8%"
+UNCOMPRESSED_CONTENT = b"potato\n"
+
+
+@pytest.mark.parametrize(
+    ["content", "expected"],
+    [
+        pytest.param(GZIP_CONTENT, UNCOMPRESSED_CONTENT, id="gzip"),
+        pytest.param(ZSTD_CONTENT, UNCOMPRESSED_CONTENT, id="zstd"),
+    ],
+)
+def test_decompress(content: bytes, expected: bytes):  # decompress() must return `expected` for each fixture.
+    with tempfile.NamedTemporaryFile() as f:
+        f.write(content)
+        f.flush()  # Flush buffered bytes first: decompress() is handed f.name, not this file object.
+        actual = decompress(f.name)
+        assert actual == expected