Skip to content

Commit

Permalink
refactor transmute() on top of new "create" module (#90)
Browse files Browse the repository at this point in the history
* refactor transmute() using new transmute_stream() function
* add 'create' module
* Update CHANGELOG.md
* enable napoleon (better docstring style) extension
* Use tarfile.open(encoding="utf-8") by default
* add intersphinx to link to Python 3 documentation

---------

Co-authored-by: jaimergp <[email protected]>
  • Loading branch information
dholth and jaimergp authored Aug 14, 2024
1 parent 6ff1cd3 commit 19fd0e4
Show file tree
Hide file tree
Showing 12 changed files with 388 additions and 101 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
[//]: # (current developments)

* Add `transmute_stream(...)` to create `.conda` from `(TarFile, TarInfo)`
  iterators, allowing more creative data sources than just `.tar.bz2` inputs. (#90)
* Add `create` module with `TarFile` interface for creating `.conda`
archives, also used by `transmute`. (#90)
* Pass `encoding="utf-8"` to `TarFile` instead of the system default, avoiding
rare potential issues with non-ASCII filenames. (#93)

## 0.10.0 (2024-06)

* Use zip64 extensions when converting .tar.bz2 to .conda, if uncompressed size
Expand Down
179 changes: 179 additions & 0 deletions conda_package_streaming/create.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
"""
Tools for creating ``.conda``-format archives.
Uses ``tempfile.SpooledTemporaryFile`` to buffer ``pkg-*.tar`` and
``info-*.tar``, then compress directly into an open `ZipFile` at the end.
`SpooledTemporaryFile` buffers the first 10MB of the package and its metadata in
memory, but writes out to disk for larger packages.
Uses more disk space than ``conda-package-handling`` (temporary uncompressed
tarballs of the package contents) but accepts streams instead of just
files-on-the-filesystem.
"""

from __future__ import annotations

import json
import shutil
import tarfile
import tempfile
import zipfile
from contextlib import contextmanager
from pathlib import Path
from typing import Callable, Iterator

import zstandard

# increase to reduce speed and increase compression (levels above 19 use much
# more memory)
ZSTD_COMPRESS_LEVEL = 19
# increase to reduce compression and increase speed
ZSTD_COMPRESS_THREADS = 1

# written into metadata.json inside the .conda archive (see conda_builder)
CONDA_PACKAGE_FORMAT_VERSION = 2

# Account for growth from "2 GB of /dev/urandom" to not exceed ZIP64_LIMIT after
# compression
CONDA_ZIP64_LIMIT = zipfile.ZIP64_LIMIT - (1 << 18) - 1


def anonymize(tarinfo: tarfile.TarInfo):
    """
    Strip ownership information from *tarinfo*, for use as
    ``tarfile.add(..., filter=anonymize)``.

    Resets the numeric uid/gid to 0 and the user/group names to the empty
    string. Every other field (mtime, mode, ...) is left untouched.
    """
    tarinfo.uid = 0
    tarinfo.gid = 0
    tarinfo.uname = ""
    tarinfo.gname = ""
    return tarinfo


class CondaTarFile(tarfile.TarFile):
    """
    :external+python:py:class:`tarfile.TarFile` subclass that routes members
    matching ``is_info(name)`` into a separate ``info`` tar archive.

    Obtain instances from ``conda_builder(...)``, which sets up the component
    archives and wraps them into a ``.conda`` on exit.

    Only useful for creating, not extracting ``.conda``.
    """

    # archive receiving members for which is_info(name) is True
    info_tar: tarfile.TarFile
    # predicate deciding whether a member name belongs in info_tar
    is_info: Callable

    def __init__(
        self,
        *args,
        info_tar: tarfile.TarFile,
        is_info=lambda name: name.startswith("info/"),
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        self.info_tar = info_tar
        self.is_info = is_info

    def addfile(self, tarinfo, fileobj=None):
        """
        Add the TarInfo object ``tarinfo`` to the archive (reading
        ``tarinfo.size`` bytes from ``fileobj`` when it is given).

        Members for which ``self.is_info(tarinfo.name)`` is true go to
        ``self.info_tar`` instead of this archive.
        """
        if not self.is_info(tarinfo.name):
            return super().addfile(tarinfo, fileobj=fileobj)
        return self.info_tar.addfile(tarinfo, fileobj=fileobj)


@contextmanager
def conda_builder(
    stem,
    path,
    *,
    compressor: Callable[
        [], zstandard.ZstdCompressor
    ] = lambda: zstandard.ZstdCompressor(
        level=ZSTD_COMPRESS_LEVEL, threads=ZSTD_COMPRESS_THREADS
    ),
    is_info: Callable[[str], bool] = lambda filename: filename.startswith("info/"),
    encoding="utf-8",
) -> Iterator[CondaTarFile]:
    """
    Produce a ``TarFile`` subclass used to build a ``.conda`` package. The
    subclass delegates ``addfile()`` to the ``info-`` component when ``is_info``
    returns True.

    When the context manager exits, ``{path}/{stem}.conda`` is written with
    the component tar archives.

    Args:
        stem: output filename without extension
        path: destination path for transmuted .conda package
        compressor: A function that creates instances of
            ``zstandard.ZstdCompressor()``.
        is_info: predicate on member names selecting the ``info-`` component
        encoding: passed to TarFile constructor. Keep default "utf-8" for valid
            .conda.

    Yields:
        ``CondaTarFile``
    """
    output_path = Path(path, f"{stem}.conda")
    # SpooledTemporaryFile keeps small component tars in memory and spills
    # to disk only when they grow past the spool threshold.
    with tempfile.SpooledTemporaryFile() as info_file, tempfile.SpooledTemporaryFile() as pkg_file:
        with tarfile.TarFile(
            fileobj=info_file, mode="w", encoding=encoding
        ) as info_tar, CondaTarFile(
            fileobj=pkg_file,
            mode="w",
            info_tar=info_tar,
            is_info=is_info,
            encoding=encoding,
        ) as pkg_tar:
            # If we wanted to compress these at a low setting to save temporary
            # space, we could insert a file object that counts bytes written in
            # front of a zstd (level between 1..3) compressor.
            yield pkg_tar

        # already closed by the `with` blocks above; harmless explicit closes
        info_tar.close()
        pkg_tar.close()

        # sizes of the uncompressed component tars, used below for the
        # stream_writer size hints and the zip64 decision
        info_size = info_file.tell()
        pkg_size = pkg_file.tell()

        # rewind so the components can be copied into the zip from the start
        info_file.seek(0)
        pkg_file.seek(0)

        with zipfile.ZipFile(
            output_path,
            "x",  # x to not append to existing
            # NOTE(review): ZIP_STORED looks intended for the `compression`
            # parameter rather than `compresslevel`; the default compression is
            # already ZIP_STORED so behavior is unaffected — confirm intent.
            compresslevel=zipfile.ZIP_STORED,
        ) as conda_file:
            # Use a maximum of one Zstd compressor, stream_writer at a time to save memory.
            data_compress = compressor()

            pkg_metadata = {"conda_pkg_format_version": CONDA_PACKAGE_FORMAT_VERSION}
            conda_file.writestr("metadata.json", json.dumps(pkg_metadata))

            # Compress the pkg component straight into the open zip member.
            # closefd=False: the stream_writer must not close the zip member;
            # ZipFile finalizes it when the `with` exits.
            with conda_file.open(
                f"pkg-{stem}.tar.zst",
                "w",
                force_zip64=(pkg_size > CONDA_ZIP64_LIMIT),
            ) as pkg_file_zip, data_compress.stream_writer(
                pkg_file_zip, size=pkg_size, closefd=False
            ) as pkg_stream:
                # ._file is the SpooledTemporaryFile's underlying file object
                shutil.copyfileobj(pkg_file._file, pkg_stream)

            with conda_file.open(
                f"info-{stem}.tar.zst",
                "w",
                force_zip64=(info_size > CONDA_ZIP64_LIMIT),
            ) as info_file_zip, data_compress.stream_writer(
                info_file_zip,
                size=info_size,
                closefd=False,
            ) as info_stream:
                shutil.copyfileobj(info_file._file, info_stream)
21 changes: 15 additions & 6 deletions conda_package_streaming/package_streaming.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,15 +67,20 @@ def chmod(self, tarinfo, targetpath):


def tar_generator(
fileobj, tarfile_open=TarfileNoSameOwner.open, closefd=False
fileobj, tarfile_open=TarfileNoSameOwner.open, closefd=False, *, encoding="utf-8"
) -> Generator[tuple[tarfile.TarFile, tarfile.TarInfo]]:
"""
Yield (tar, member) from fileobj.
Args:
fileobj: file-like object
encoding: "utf-8" passed to TarFile.open(); can be changed for testing.
"""
# tarfile will not close fileobj because _extfileobj is True
# caller should take care to close files all the way back to the http request...
try:
with tarfile_open(fileobj=fileobj, mode="r|") as tar:
with tarfile_open(fileobj=fileobj, mode="r|", encoding=encoding) as tar:
for member in tar:
yield tar, member
finally:
Expand Down Expand Up @@ -104,7 +109,11 @@ def stream_conda_info(


def stream_conda_component(
filename, fileobj=None, component: CondaComponent | str = CondaComponent.pkg
filename,
fileobj=None,
component: CondaComponent | str = CondaComponent.pkg,
*,
encoding="utf-8",
) -> Generator[tuple[tarfile.TarFile, tarfile.TarInfo]]:
"""
Yield members from .conda's embedded {component}- tarball. "info" or "pkg".
Expand All @@ -124,8 +133,8 @@ def stream_conda_component(
raise RuntimeError("Cannot unpack `.conda` without zstandard")

zf = zipfile.ZipFile(fileobj or filename)
file_id, _, _ = os.path.basename(filename).rpartition(".")
component_name = f"{component}-{file_id}"
stem, _, _ = os.path.basename(filename).rpartition(".")
component_name = f"{component}-{stem}"
component_filename = [
info for info in zf.infolist() if info.filename.startswith(component_name)
]
Expand All @@ -139,4 +148,4 @@ def stream_conda_component(
reader = bz2.open(fileobj or filename, mode="rb")
else:
raise ValueError("unsupported file extension")
return tar_generator(reader, closefd=fileobj is None)
return tar_generator(reader, closefd=fileobj is None, encoding=encoding)
Loading

0 comments on commit 19fd0e4

Please sign in to comment.