Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Set a default gzip compression level globally #138

Merged
merged 12 commits into from
Jan 16, 2024
2 changes: 2 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,8 @@ Changelog

in-development
~~~~~~~~~~~~~~~~~~~
* #138: The gzip default compression level is now 1 when no value is provided
by the calling function. The default used to be determined by the backend.
* #135: xopen now uses zlib-ng when available and applicable.

v1.8.0 (2023-11-03)
Expand Down
151 changes: 57 additions & 94 deletions src/xopen/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@
import subprocess
import tempfile
import time
import zlib
from abc import ABC, abstractmethod
from subprocess import Popen, PIPE, DEVNULL
from typing import (
Expand All @@ -55,6 +54,9 @@
# 128K buffer size also used by cat, pigz etc. It is faster than the 8K default.
BUFFER_SIZE = max(io.DEFAULT_BUFFER_SIZE, 128 * 1024)

# 1 is by far the fastest and most efficient level.
marcelm marked this conversation as resolved.
Show resolved Hide resolved
XOPEN_DEFAULT_GZIP_COMPRESSION = 1

igzip: Optional[ModuleType]
isal_zlib: Optional[ModuleType]
igzip_threaded: Optional[ModuleType]
Expand Down Expand Up @@ -1032,88 +1034,65 @@ def _open_zst( # noqa: C901
return f


def _open_external_gzip_reader(
filename, mode, compresslevel, threads, **text_mode_kwargs
):
assert mode in ("rt", "rb")
try:
return PipedPigzReader(filename, mode, threads=threads, **text_mode_kwargs)
except OSError:
return PipedGzipReader(filename, mode, **text_mode_kwargs)


def _open_external_gzip_writer(
filename, mode, compresslevel, threads, **text_mode_kwargs
):
assert mode in ("wt", "wb", "at", "ab")
try:
return PipedPigzWriter(
filename, mode, compresslevel, threads=threads, **text_mode_kwargs
)
except OSError:
return PipedGzipWriter(filename, mode, compresslevel, **text_mode_kwargs)


def _open_gz( # noqa: C901
filename, mode: str, compresslevel, threads, **text_mode_kwargs
):
assert mode in ("rt", "rb", "wt", "wb", "at", "ab")
# With threads == 0 igzip_threaded defers to igzip.open, but that is not
# desirable as a reproducible header is required.
if igzip_threaded and threads != 0:
try:
return igzip_threaded.open( # type: ignore
filename,
mode,
isal_zlib.ISAL_DEFAULT_COMPRESSION # type: ignore
if compresslevel is None
else compresslevel,
**text_mode_kwargs,
threads=1 if threads is None else threads,
)
except ValueError: # Wrong compression level
pass
if gzip_ng_threaded and zlib_ng and threads != 0:
try:
if compresslevel is None:
level = zlib_ng.Z_DEFAULT_COMPRESSION
elif compresslevel == 1:
# zlib-ng level 1 is 50% bigger than zlib level 1.
# This will be wildly outside user ballpark expectations, so
# increase the level
level = 2
else:
level = compresslevel

return gzip_ng_threaded.open(
filename,
mode,
level,
**text_mode_kwargs,
threads=1 if threads is None else threads,
)
except zlib_ng.error: # Bad compression level
pass
if compresslevel is None:
# Force the same compression level on every tool regardless of
# library defaults
compresslevel = XOPEN_DEFAULT_GZIP_COMPRESSION

if threads != 0:
if igzip_threaded:
try:
return igzip_threaded.open( # type: ignore
filename,
mode,
compresslevel,
**text_mode_kwargs,
threads=1,
)
except ValueError: # Wrong compression level
pass
if gzip_ng_threaded and zlib_ng:
try:
return gzip_ng_threaded.open(
filename,
mode,
# Output from zlib-ng level 1 is about 50% larger than output from
# zlib level 1. That is far outside what users would expect, so
# use at least level 2.
max(compresslevel, 2),
**text_mode_kwargs,
threads=threads or max(_available_cpu_count(), 4),
)
except zlib_ng.error: # Bad compression level
pass
try:
if "r" in mode:
return _open_external_gzip_reader(
filename, mode, compresslevel, threads, **text_mode_kwargs
)
try:
return PipedPigzReader(
filename, mode, threads=threads, **text_mode_kwargs
)
except OSError:
return PipedGzipReader(filename, mode, **text_mode_kwargs)
else:
return _open_external_gzip_writer(
filename, mode, compresslevel, threads, **text_mode_kwargs
)
try:
return PipedPigzWriter(
filename,
mode,
compresslevel,
threads=threads,
**text_mode_kwargs,
)
except OSError:
return PipedGzipWriter(
filename, mode, compresslevel, **text_mode_kwargs
)
except OSError:
pass # We try without threads.

if "r" in mode:
if igzip is not None:
return igzip.open(filename, mode, **text_mode_kwargs)
elif gzip_ng is not None:
return gzip_ng.open(filename, mode, **text_mode_kwargs)
return gzip.open(filename, mode, **text_mode_kwargs)

g = _open_reproducible_gzip(
filename,
mode=mode[0] + "b",
Expand All @@ -1124,13 +1103,14 @@ def _open_gz( # noqa: C901
return g


def _open_reproducible_gzip(filename, mode, compresslevel):
def _open_reproducible_gzip(filename, mode: str, compresslevel: int):
"""
Open a gzip file for writing (without external processes)
that has neither mtime nor the file name in the header
(equivalent to gzip --no-name)
"""
assert mode in ("rb", "wb", "ab")
assert compresslevel is not None
# Neither gzip.open nor igzip.open have an mtime option, and they will
# always write the file name, so we need to open the file separately
# and pass it to gzip.GzipFile/igzip.IGzipFile.
Expand All @@ -1144,33 +1124,16 @@ def _open_reproducible_gzip(filename, mode, compresslevel):
gzip_file = None
if igzip is not None:
try:
gzip_file = igzip.IGzipFile(
**kwargs,
compresslevel=isal_zlib.ISAL_DEFAULT_COMPRESSION
if compresslevel is None
else compresslevel,
)
gzip_file = igzip.IGzipFile(**kwargs, compresslevel=compresslevel)
except ValueError:
# Compression level not supported, move to built-in gzip.
pass
elif gzip_ng is not None:
if compresslevel == 1:
level = 2
elif compresslevel is None:
level = zlib_ng.Z_DEFAULT_COMPRESSION
else:
level = compresslevel
gzip_file = gzip_ng.GzipNGFile(**kwargs, compresslevel=level)
# Compression level should be at least 2 for zlib-ng to prevent very big files.
gzip_file = gzip_ng.GzipNGFile(**kwargs, compresslevel=max(compresslevel, 2))

if gzip_file is None:
gzip_file = gzip.GzipFile(
**kwargs,
# Override gzip.open's default of 9 for consistency
# with command-line gzip.
compresslevel=zlib.Z_DEFAULT_COMPRESSION
if compresslevel is None
else compresslevel,
)
gzip_file = gzip.GzipFile(**kwargs, compresslevel=compresslevel) # type: ignore
# When (I)GzipFile is created with a fileobj instead of a filename,
# the passed file object is not closed when (I)GzipFile.close()
# is called. This forces it to be closed.
Expand Down