diff --git a/README.rst b/README.rst index 8611aab..c07837e 100644 --- a/README.rst +++ b/README.rst @@ -113,6 +113,8 @@ Changelog in-development ~~~~~~~~~~~~~~~~~~~ +* #138: The gzip default compression level is now 1 when no value is provided + by the calling function. The default used to be determined by the backend. * #135: xopen now uses zlib-ng when available and applicable. v1.8.0 (2023-11-03) diff --git a/src/xopen/__init__.py b/src/xopen/__init__.py index 287176e..3d00694 100644 --- a/src/xopen/__init__.py +++ b/src/xopen/__init__.py @@ -33,7 +33,6 @@ import subprocess import tempfile import time -import zlib from abc import ABC, abstractmethod from subprocess import Popen, PIPE, DEVNULL from typing import ( @@ -55,6 +54,8 @@ # 128K buffer size also used by cat, pigz etc. It is faster than the 8K default. BUFFER_SIZE = max(io.DEFAULT_BUFFER_SIZE, 128 * 1024) +XOPEN_DEFAULT_GZIP_COMPRESSION = 1 + igzip: Optional[ModuleType] isal_zlib: Optional[ModuleType] igzip_threaded: Optional[ModuleType] @@ -1032,88 +1033,65 @@ def _open_zst( # noqa: C901 return f -def _open_external_gzip_reader( - filename, mode, compresslevel, threads, **text_mode_kwargs -): - assert mode in ("rt", "rb") - try: - return PipedPigzReader(filename, mode, threads=threads, **text_mode_kwargs) - except OSError: - return PipedGzipReader(filename, mode, **text_mode_kwargs) - - -def _open_external_gzip_writer( - filename, mode, compresslevel, threads, **text_mode_kwargs -): - assert mode in ("wt", "wb", "at", "ab") - try: - return PipedPigzWriter( - filename, mode, compresslevel, threads=threads, **text_mode_kwargs - ) - except OSError: - return PipedGzipWriter(filename, mode, compresslevel, **text_mode_kwargs) - - def _open_gz( # noqa: C901 filename, mode: str, compresslevel, threads, **text_mode_kwargs ): assert mode in ("rt", "rb", "wt", "wb", "at", "ab") - # With threads == 0 igzip_threaded defers to igzip.open, but that is not - # desirable as a reproducible header is required. - if igzip_threaded and threads != 0: - try: - return igzip_threaded.open( # type: ignore - filename, - mode, - isal_zlib.ISAL_DEFAULT_COMPRESSION # type: ignore - if compresslevel is None - else compresslevel, - **text_mode_kwargs, - threads=1 if threads is None else threads, - ) - except ValueError: # Wrong compression level - pass - if gzip_ng_threaded and zlib_ng and threads != 0: - try: - if compresslevel is None: - level = zlib_ng.Z_DEFAULT_COMPRESSION - elif compresslevel == 1: - # zlib-ng level 1 is 50% bigger than zlib level 1. - # This will be wildly outside user ballpark expectations, so - # increase the level - level = 2 - else: - level = compresslevel - - return gzip_ng_threaded.open( - filename, - mode, - level, - **text_mode_kwargs, - threads=1 if threads is None else threads, - ) - except zlib_ng.error: # Bad compression level - pass + if compresslevel is None: + # Force the same compression level on every tool regardless of + # library defaults + compresslevel = XOPEN_DEFAULT_GZIP_COMPRESSION + if threads != 0: + if igzip_threaded: + try: + return igzip_threaded.open( # type: ignore + filename, + mode, + compresslevel, + **text_mode_kwargs, + threads=1, + ) + except ValueError: # Wrong compression level + pass + if gzip_ng_threaded and zlib_ng: + try: + return gzip_ng_threaded.open( + filename, + mode, + # zlib-ng level 1 is 50% bigger than zlib level 1. + # This will be wildly outside user ballpark expectations, so + # increase the level + max(compresslevel, 2), + **text_mode_kwargs, + threads=threads or max(_available_cpu_count(), 4), + ) + except zlib_ng.error: # Bad compression level + pass try: if "r" in mode: - return _open_external_gzip_reader( - filename, mode, compresslevel, threads, **text_mode_kwargs - ) + try: + return PipedPigzReader( + filename, mode, threads=threads, **text_mode_kwargs + ) + except OSError: + return PipedGzipReader(filename, mode, **text_mode_kwargs) else: - return _open_external_gzip_writer( - filename, mode, compresslevel, threads, **text_mode_kwargs - ) + try: + return PipedPigzWriter( + filename, + mode, + compresslevel, + threads=threads, + **text_mode_kwargs, + ) + except OSError: + return PipedGzipWriter( + filename, mode, compresslevel, **text_mode_kwargs + ) except OSError: pass # We try without threads. - if "r" in mode: - if igzip is not None: - return igzip.open(filename, mode, **text_mode_kwargs) - elif gzip_ng is not None: - return gzip_ng.open(filename, mode, **text_mode_kwargs) - return gzip.open(filename, mode, **text_mode_kwargs) - g = _open_reproducible_gzip( filename, mode=mode[0] + "b", @@ -1124,13 +1102,14 @@ def _open_gz( # noqa: C901 return g -def _open_reproducible_gzip(filename, mode, compresslevel): +def _open_reproducible_gzip(filename, mode: str, compresslevel: int): """ Open a gzip file for writing (without external processes) that has neither mtime nor the file name in the header (equivalent to gzip --no-name) """ assert mode in ("rb", "wb", "ab") + assert compresslevel is not None # Neither gzip.open nor igzip.open have an mtime option, and they will # always write the file name, so we need to open the file separately # and pass it to gzip.GzipFile/igzip.IGzipFile. @@ -1144,33 +1123,16 @@ def _open_reproducible_gzip(filename, mode, compresslevel): gzip_file = None if igzip is not None: try: - gzip_file = igzip.IGzipFile( - **kwargs, - compresslevel=isal_zlib.ISAL_DEFAULT_COMPRESSION - if compresslevel is None - else compresslevel, - ) + gzip_file = igzip.IGzipFile(**kwargs, compresslevel=compresslevel) except ValueError: # Compression level not supported, move to built-in gzip. pass elif gzip_ng is not None: - if compresslevel == 1: - level = 2 - elif compresslevel is None: - level = zlib_ng.Z_DEFAULT_COMPRESSION - else: - level = compresslevel - gzip_file = gzip_ng.GzipNGFile(**kwargs, compresslevel=level) + # Compression level should be at least 2 for zlib-ng to prevent very big files. + gzip_file = gzip_ng.GzipNGFile(**kwargs, compresslevel=max(compresslevel, 2)) if gzip_file is None: - gzip_file = gzip.GzipFile( - **kwargs, - # Override gzip.open's default of 9 for consistency - # with command-line gzip. - compresslevel=zlib.Z_DEFAULT_COMPRESSION - if compresslevel is None - else compresslevel, - ) + gzip_file = gzip.GzipFile(**kwargs, compresslevel=compresslevel) # type: ignore # When (I)GzipFile is created with a fileobj instead of a filename, # the passed file object is not closed when (I)GzipFile.close() # is called. This forces it to be closed.