Skip to content

Commit

Permalink
Merge pull request #138 from pycompression/simplifygzopen
Browse files Browse the repository at this point in the history
Set a default gzip compression level globally
  • Loading branch information
marcelm authored Jan 16, 2024
2 parents 2a413d8 + 7ea3ae4 commit ca7a41b
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 94 deletions.
2 changes: 2 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,8 @@ Changelog

in-development
~~~~~~~~~~~~~~~~~~~
* #138: The gzip default compression level is now 1 when no value is provided
by the calling function. The default used to be determined by the backend.
* #135: xopen now uses zlib-ng when available and applicable.

v1.8.0 (2023-11-03)
Expand Down
150 changes: 56 additions & 94 deletions src/xopen/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@
import subprocess
import tempfile
import time
import zlib
from abc import ABC, abstractmethod
from subprocess import Popen, PIPE, DEVNULL
from typing import (
Expand All @@ -55,6 +54,8 @@
# 128K buffer size also used by cat, pigz etc. It is faster than the 8K default.
BUFFER_SIZE = max(io.DEFAULT_BUFFER_SIZE, 128 * 1024)

XOPEN_DEFAULT_GZIP_COMPRESSION = 1

igzip: Optional[ModuleType]
isal_zlib: Optional[ModuleType]
igzip_threaded: Optional[ModuleType]
Expand Down Expand Up @@ -1032,88 +1033,65 @@ def _open_zst( # noqa: C901
return f


def _open_external_gzip_reader(
    filename, mode, compresslevel, threads, **text_mode_kwargs
):
    """
    Open a gzip file for reading via one of the piped reader classes.

    PipedPigzReader is attempted first. If constructing it raises OSError,
    PipedGzipReader is used instead (it does not receive ``threads``).
    ``compresslevel`` is accepted for signature parity with the writer
    counterpart but is not used here.
    """
    assert mode in ("rt", "rb")
    try:
        reader = PipedPigzReader(
            filename, mode, threads=threads, **text_mode_kwargs
        )
    except OSError:
        # Preferred reader could not be created; fall back to the other one.
        reader = PipedGzipReader(filename, mode, **text_mode_kwargs)
    return reader


def _open_external_gzip_writer(
    filename, mode, compresslevel, threads, **text_mode_kwargs
):
    """
    Open a gzip file for writing or appending via one of the piped writer
    classes.

    PipedPigzWriter is attempted first. If constructing it raises OSError,
    PipedGzipWriter is used instead (it does not receive ``threads``).
    ``compresslevel`` is forwarded unchanged to whichever writer is created.
    """
    assert mode in ("wt", "wb", "at", "ab")
    try:
        writer = PipedPigzWriter(
            filename, mode, compresslevel, threads=threads, **text_mode_kwargs
        )
    except OSError:
        # Preferred writer could not be created; fall back to the other one.
        writer = PipedGzipWriter(
            filename, mode, compresslevel, **text_mode_kwargs
        )
    return writer


def _open_gz( # noqa: C901
filename, mode: str, compresslevel, threads, **text_mode_kwargs
):
assert mode in ("rt", "rb", "wt", "wb", "at", "ab")
# With threads == 0 igzip_threaded defers to igzip.open, but that is not
# desirable as a reproducible header is required.
if igzip_threaded and threads != 0:
try:
return igzip_threaded.open( # type: ignore
filename,
mode,
isal_zlib.ISAL_DEFAULT_COMPRESSION # type: ignore
if compresslevel is None
else compresslevel,
**text_mode_kwargs,
threads=1 if threads is None else threads,
)
except ValueError: # Wrong compression level
pass
if gzip_ng_threaded and zlib_ng and threads != 0:
try:
if compresslevel is None:
level = zlib_ng.Z_DEFAULT_COMPRESSION
elif compresslevel == 1:
# zlib-ng level 1 is 50% bigger than zlib level 1.
# This will be wildly outside user ballpark expectations, so
# increase the level
level = 2
else:
level = compresslevel

return gzip_ng_threaded.open(
filename,
mode,
level,
**text_mode_kwargs,
threads=1 if threads is None else threads,
)
except zlib_ng.error: # Bad compression level
pass
if compresslevel is None:
# Force the same compression level on every tool regardless of
# library defaults
compresslevel = XOPEN_DEFAULT_GZIP_COMPRESSION

if threads != 0:
if igzip_threaded:
try:
return igzip_threaded.open( # type: ignore
filename,
mode,
compresslevel,
**text_mode_kwargs,
threads=1,
)
except ValueError: # Wrong compression level
pass
if gzip_ng_threaded and zlib_ng:
try:
return gzip_ng_threaded.open(
filename,
mode,
# zlib-ng level 1 is 50% bigger than zlib level 1.
# This will be wildly outside user ballpark expectations, so
# increase the level
max(compresslevel, 2),
**text_mode_kwargs,
threads=threads or max(_available_cpu_count(), 4),
)
except zlib_ng.error: # Bad compression level
pass
try:
if "r" in mode:
return _open_external_gzip_reader(
filename, mode, compresslevel, threads, **text_mode_kwargs
)
try:
return PipedPigzReader(
filename, mode, threads=threads, **text_mode_kwargs
)
except OSError:
return PipedGzipReader(filename, mode, **text_mode_kwargs)
else:
return _open_external_gzip_writer(
filename, mode, compresslevel, threads, **text_mode_kwargs
)
try:
return PipedPigzWriter(
filename,
mode,
compresslevel,
threads=threads,
**text_mode_kwargs,
)
except OSError:
return PipedGzipWriter(
filename, mode, compresslevel, **text_mode_kwargs
)
except OSError:
pass # We try without threads.

if "r" in mode:
if igzip is not None:
return igzip.open(filename, mode, **text_mode_kwargs)
elif gzip_ng is not None:
return gzip_ng.open(filename, mode, **text_mode_kwargs)
return gzip.open(filename, mode, **text_mode_kwargs)

g = _open_reproducible_gzip(
filename,
mode=mode[0] + "b",
Expand All @@ -1124,13 +1102,14 @@ def _open_gz( # noqa: C901
return g


def _open_reproducible_gzip(filename, mode, compresslevel):
def _open_reproducible_gzip(filename, mode: str, compresslevel: int):
"""
Open a gzip file for writing (without external processes)
that has neither mtime nor the file name in the header
(equivalent to gzip --no-name)
"""
assert mode in ("rb", "wb", "ab")
assert compresslevel is not None
# Neither gzip.open nor igzip.open have an mtime option, and they will
# always write the file name, so we need to open the file separately
# and pass it to gzip.GzipFile/igzip.IGzipFile.
Expand All @@ -1144,33 +1123,16 @@ def _open_reproducible_gzip(filename, mode, compresslevel):
gzip_file = None
if igzip is not None:
try:
gzip_file = igzip.IGzipFile(
**kwargs,
compresslevel=isal_zlib.ISAL_DEFAULT_COMPRESSION
if compresslevel is None
else compresslevel,
)
gzip_file = igzip.IGzipFile(**kwargs, compresslevel=compresslevel)
except ValueError:
# Compression level not supported, move to built-in gzip.
pass
elif gzip_ng is not None:
if compresslevel == 1:
level = 2
elif compresslevel is None:
level = zlib_ng.Z_DEFAULT_COMPRESSION
else:
level = compresslevel
gzip_file = gzip_ng.GzipNGFile(**kwargs, compresslevel=level)
# Compression level should be at least 2 for zlib-ng to prevent very big files.
gzip_file = gzip_ng.GzipNGFile(**kwargs, compresslevel=max(compresslevel, 2))

if gzip_file is None:
gzip_file = gzip.GzipFile(
**kwargs,
# Override gzip.open's default of 9 for consistency
# with command-line gzip.
compresslevel=zlib.Z_DEFAULT_COMPRESSION
if compresslevel is None
else compresslevel,
)
gzip_file = gzip.GzipFile(**kwargs, compresslevel=compresslevel) # type: ignore
# When (I)GzipFile is created with a fileobj instead of a filename,
# the passed file object is not closed when (I)GzipFile.close()
# is called. This forces it to be closed.
Expand Down

0 comments on commit ca7a41b

Please sign in to comment.