From 49e4c3b2bcc7bd4143d170705a6fc2025f4330fb Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Wed, 1 Feb 2023 13:56:13 +0100 Subject: [PATCH 01/17] Add zlib-ng dependency --- setup.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.cfg b/setup.cfg index 2480439..a5d36dd 100644 --- a/setup.cfg +++ b/setup.cfg @@ -19,6 +19,7 @@ package_dir = packages = find: install_requires = isal>=1.0.0; platform.python_implementation == 'CPython' and (platform.machine == "x86_64" or platform.machine == "AMD64") + zlib-ng>=0.1.0; platform.machine == "x86_64" or platform.machine == "AMD64" or platform.machine == "aarch64" typing_extensions; python_version<'3.8' [options.packages.find] From 99dbe8f26dae090d84548b4cdba2d9d2ea17e809 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Wed, 1 Feb 2023 14:12:06 +0100 Subject: [PATCH 02/17] Use zlib-ng for compression when available --- src/xopen/__init__.py | 73 +++++++++++++++++++++++++++++++++++++++++-- tests/test_piped.py | 6 ++++ 2 files changed, 76 insertions(+), 3 deletions(-) diff --git a/src/xopen/__init__.py b/src/xopen/__init__.py index a49f894..ebc0d83 100644 --- a/src/xopen/__init__.py +++ b/src/xopen/__init__.py @@ -33,6 +33,7 @@ import subprocess import tempfile import time +import zlib from abc import ABC, abstractmethod from subprocess import Popen, PIPE, DEVNULL from typing import Optional, Union, TextIO, AnyStr, IO, List, Set, overload, BinaryIO @@ -51,6 +52,7 @@ igzip: Optional[ModuleType] isal_zlib: Optional[ModuleType] +gzip_ng: Optional[ModuleType] try: from isal import igzip, isal_zlib @@ -58,6 +60,11 @@ igzip = None isal_zlib = None +try: + from zlib_ng import gzip_ng +except: + gzip_ng = None + try: import zstandard # type: ignore except ImportError: @@ -79,7 +86,9 @@ _MAX_PIPE_SIZE = int( _MAX_PIPE_SIZE_PATH.read_text(encoding="ascii") ) # type: Optional[int] -except OSError: # Catches file not found and permission errors. Possible other errors too. +except ( + OSError +): # Catches file not found and permission errors. Possible other errors too. _MAX_PIPE_SIZE = None @@ -921,6 +930,44 @@ def __init__( ) +class PipedPythonZlibNGReader(PipedCompressionReader): + def __init__( + self, path, mode: str = "r", *, encoding="utf-8", errors=None, newline=None + ): + super().__init__( + path, + [sys.executable, "-m", "zlib_ng.gzip_ng"], + mode, + encoding=encoding, + errors=errors, + newline=newline, + ) + + +class PipedPythonZlibNGWriter(PipedCompressionWriter): + def __init__( + self, + path, + mode: str = "wt", + compresslevel: Optional[int] = None, + *, + encoding="utf-8", + errors=None, + newline=None, + ): + if compresslevel is not None and compresslevel not in range(1, 10): + raise ValueError("compresslevel must be between 1 and 10") + super().__init__( + path, + [sys.executable, "-m", "zlib_ng.gzip_ng", "--no-name"], + mode, + compresslevel, + encoding=encoding, + errors=errors, + newline=newline, + ) + + def _open_stdin_or_out(mode: str, **text_mode_kwargs) -> IO: # Do not return sys.stdin or sys.stdout directly as we want the returned object # to be closable without closing sys.stdout. @@ -1024,6 +1071,8 @@ def _open_external_gzip_reader( pass if igzip: return PipedPythonIsalReader(filename, mode, **text_mode_kwargs) + if gzip_ng: + return PipedPythonZlibNGReader(filename, mode, **text_mode_kwargs) try: return PipedPigzReader(filename, mode, threads=threads, **text_mode_kwargs) except OSError: @@ -1046,6 +1095,13 @@ def _open_external_gzip_writer( ) except ValueError: # Wrong compression level pass + if gzip_ng: + try: + return PipedPythonZlibNGWriter( + filename, mode, compresslevel, **text_mode_kwargs + ) + except ValueError: # Wrong compression level + pass try: return PipedPigzWriter( filename, mode, compresslevel, threads=threads, **text_mode_kwargs @@ -1072,6 +1128,8 @@ def _open_gz(filename, mode: str, compresslevel, threads, **text_mode_kwargs): if "r" in mode: if igzip is not None: return igzip.open(filename, mode, **text_mode_kwargs) + if gzip_ng is not None: + return gzip_ng.open(filename, mode, **text_mode_kwargs) return gzip.open(filename, mode, **text_mode_kwargs) g = _open_reproducible_gzip( @@ -1111,14 +1169,23 @@ def _open_reproducible_gzip(filename, mode, compresslevel): else compresslevel, ) except ValueError: - # Compression level not supported, move to built-in gzip. + # Compression level not supported, move to gzip_ng or builtin gzip. pass + if gzip_ng is not None: + gzip_file = gzip_ng.GzipNGFile( + **kwargs, + compresslevel=zlib.Z_DEFAULT_COMPRESSION + if compresslevel is None + else compresslevel, + ) if gzip_file is None: gzip_file = gzip.GzipFile( **kwargs, # Override gzip.open's default of 9 for consistency # with command-line gzip. - compresslevel=6 if compresslevel is None else compresslevel, + compresslevel=zlib.Z_DEFAULT_COMPRESSION + if compresslevel is None + else compresslevel, ) # When (I)GzipFile is created with a fileobj instead of a filename, # the passed file object is not closed when (I)GzipFile.close() diff --git a/tests/test_piped.py b/tests/test_piped.py index 87ed750..262f43f 100644 --- a/tests/test_piped.py +++ b/tests/test_piped.py @@ -25,6 +25,8 @@ PipedIGzipWriter, PipedPythonIsalReader, PipedPythonIsalWriter, + PipedPythonZlibNGReader, + PipedPythonZlibNGWriter, PipedXzReader, PipedXzWriter, PipedZstdReader, @@ -32,6 +34,7 @@ _MAX_PIPE_SIZE, _can_read_concatenated_gz, igzip, + gzip_ng, ) extensions = ["", ".gz", ".bz2", ".xz", ".zst"] @@ -76,6 +79,9 @@ def available_gzip_readers_and_writers(): if igzip is not None: readers.append(PipedPythonIsalReader) writers.append(PipedPythonIsalWriter) + if gzip_ng is not None: + readers.append(PipedPythonZlibNGReader) + writers.append(PipedPythonZlibNGWriter) return readers, writers From ce33273bf1c4149c43529f723d0c07db4266a251 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Wed, 1 Feb 2023 14:41:52 +0100 Subject: [PATCH 03/17] Do not use compression level 1 for zlib-ng --- src/xopen/__init__.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/xopen/__init__.py b/src/xopen/__init__.py index ebc0d83..5adb56e 100644 --- a/src/xopen/__init__.py +++ b/src/xopen/__init__.py @@ -957,6 +957,11 @@ def __init__( ): if compresslevel is not None and compresslevel not in range(1, 10): raise ValueError("compresslevel must be between 1 and 10") + if compresslevel == 1: + # Compresslevel 1 results in files that are typically 50% larger + # than zlib. So in that case use level 2, which is more similar + # to zlib and also still faster. + compresslevel = 2 super().__init__( path, [sys.executable, "-m", "zlib_ng.gzip_ng", "--no-name"], @@ -1174,9 +1179,11 @@ def _open_reproducible_gzip(filename, mode, compresslevel): if gzip_ng is not None: gzip_file = gzip_ng.GzipNGFile( **kwargs, - compresslevel=zlib.Z_DEFAULT_COMPRESSION - if compresslevel is None - else compresslevel, + compresslevel=zlib.Z_DEFAULT_COMPRESSION if compresslevel is None + # Compresslevel 1 results in files that are typically 50% larger + # than zlib. So in that case use level 2, which is more similar + # to zlib and also still faster. + else max(compresslevel, 2), ) if gzip_file is None: gzip_file = gzip.GzipFile( From efe9ed25282d6b68581cf076fdb731fa8436c591 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Wed, 1 Feb 2023 14:43:40 +0100 Subject: [PATCH 04/17] Fix flake8 errors --- src/xopen/__init__.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/xopen/__init__.py b/src/xopen/__init__.py index 5adb56e..3dc1d25 100644 --- a/src/xopen/__init__.py +++ b/src/xopen/__init__.py @@ -62,7 +62,7 @@ try: from zlib_ng import gzip_ng -except: +except ImportError: gzip_ng = None try: @@ -1100,13 +1100,10 @@ def _open_external_gzip_writer( ) except ValueError: # Wrong compression level pass - if gzip_ng: - try: - return PipedPythonZlibNGWriter( - filename, mode, compresslevel, **text_mode_kwargs - ) - except ValueError: # Wrong compression level - pass + elif gzip_ng: + return PipedPythonZlibNGWriter( + filename, mode, compresslevel, **text_mode_kwargs + ) try: return PipedPigzWriter( filename, mode, compresslevel, threads=threads, **text_mode_kwargs From ea36b333b9a380de1423ad7661c58d2ad220bb00 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Wed, 1 Feb 2023 15:16:18 +0100 Subject: [PATCH 05/17] Make sure python-isal is preferred --- src/xopen/__init__.py | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/src/xopen/__init__.py b/src/xopen/__init__.py index 3dc1d25..21c95d9 100644 --- a/src/xopen/__init__.py +++ b/src/xopen/__init__.py @@ -1173,24 +1173,27 @@ def _open_reproducible_gzip(filename, mode, compresslevel): except ValueError: # Compression level not supported, move to gzip_ng or builtin gzip. pass - if gzip_ng is not None: - gzip_file = gzip_ng.GzipNGFile( - **kwargs, - compresslevel=zlib.Z_DEFAULT_COMPRESSION if compresslevel is None - # Compresslevel 1 results in files that are typically 50% larger - # than zlib. So in that case use level 2, which is more similar - # to zlib and also still faster. - else max(compresslevel, 2), - ) + if gzip_file is None: - gzip_file = gzip.GzipFile( - **kwargs, - # Override gzip.open's default of 9 for consistency - # with command-line gzip. - compresslevel=zlib.Z_DEFAULT_COMPRESSION - if compresslevel is None - else compresslevel, - ) + if gzip_ng is not None: + gzip_file = gzip_ng.GzipNGFile( + **kwargs, + compresslevel=zlib.Z_DEFAULT_COMPRESSION if compresslevel is None + # Compresslevel 1 results in files that are typically 50% + # larger + # than zlib. So in that case use level 2, which is more similar + # to zlib and also still faster. + else max(compresslevel, 2), + ) + else: + gzip_file = gzip.GzipFile( + **kwargs, + # Override gzip.open's default of 9 for consistency + # with command-line gzip. + compresslevel=zlib.Z_DEFAULT_COMPRESSION + if compresslevel is None + else compresslevel, + ) # When (I)GzipFile is created with a fileobj instead of a filename, # the passed file object is not closed when (I)GzipFile.close() # is called. This forces it to be closed. From 0ecfdab29fe1cb08290417f3ef51b5f8f422a536 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Wed, 1 Feb 2023 15:20:30 +0100 Subject: [PATCH 06/17] Make sure python-zlib-ng writer is preferred if available --- src/xopen/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/xopen/__init__.py b/src/xopen/__init__.py index 21c95d9..fa13e02 100644 --- a/src/xopen/__init__.py +++ b/src/xopen/__init__.py @@ -1100,7 +1100,7 @@ def _open_external_gzip_writer( ) except ValueError: # Wrong compression level pass - elif gzip_ng: + if gzip_ng: return PipedPythonZlibNGWriter( filename, mode, compresslevel, **text_mode_kwargs ) From ff207f40359cfdc871a8c0cc4d539ef4506fcd54 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Wed, 1 Feb 2023 15:24:48 +0100 Subject: [PATCH 07/17] Make sure PipedPythonZlibNGWriter is tested --- tests/test_piped.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_piped.py b/tests/test_piped.py index 262f43f..db8859f 100644 --- a/tests/test_piped.py +++ b/tests/test_piped.py @@ -340,6 +340,8 @@ def writers_and_levels(): elif writer == PipedIGzipWriter or writer == PipedPythonIsalWriter: # Levels 0-3 are supported yield from ((writer, i) for i in range(4)) + elif writer == PipedPythonZlibNGWriter: + yield from ((writer, i) for i in range(1, 10)) else: raise NotImplementedError( f"Test should be implemented for " f"{writer}" From a8ca58dc6443fdff1c0841a21ea3db11476a0dee Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Fri, 3 Feb 2023 08:55:09 +0100 Subject: [PATCH 08/17] Refactor external application choice for gzip --- src/xopen/__init__.py | 93 +++++++++++++++++++++++++++---------------- 1 file changed, 59 insertions(+), 34 deletions(-) diff --git a/src/xopen/__init__.py b/src/xopen/__init__.py index fa13e02..1a01860 100644 --- a/src/xopen/__init__.py +++ b/src/xopen/__init__.py @@ -21,6 +21,7 @@ "__version__", ] +import functools import gzip import sys import io @@ -1068,48 +1069,72 @@ def _open_external_gzip_reader( filename, mode, compresslevel, threads, **text_mode_kwargs ): assert mode in ("rt", "rb") - try: - return PipedIGzipReader(filename, mode, **text_mode_kwargs) - except (OSError, ValueError): - # No igzip installed or version does not support reading - # concatenated files. - pass - if igzip: - return PipedPythonIsalReader(filename, mode, **text_mode_kwargs) - if gzip_ng: - return PipedPythonZlibNGReader(filename, mode, **text_mode_kwargs) - try: - return PipedPigzReader(filename, mode, threads=threads, **text_mode_kwargs) - except OSError: - return PipedGzipReader(filename, mode, **text_mode_kwargs) + if sys.platform == "AMD64" or sys.platform == "x86_64": + # For x86-64 there are optimized libraries available for decompression + preferred_applications = [ + PipedIGzipReader, + PipedPythonIsalReader, + PipedPythonZlibNGReader, + functools.partial(PipedPigzReader, threads=threads), + PipedGzipReader, + ] + else: + # For other platforms, libraries zlib, zlib-ng and isal perform + # similarly at decompressions. C implementations perform better than + # Python implementations. + preferred_applications = [ + functools.partial(PipedPigzReader, threads=threads), + PipedIGzipReader, + PipedPythonIsalReader, + PipedPythonZlibNGReader, + PipedGzipReader, # Gzip decompresses very slowly. + ] + for app_class in preferred_applications: + try: + return app_class(filename, mode, **text_mode_kwargs) + except (OSError, ValueError): + continue + raise OSError("No external applications available") def _open_external_gzip_writer( filename, mode, compresslevel, threads, **text_mode_kwargs ): assert mode in ("wt", "wb", "at", "ab") - try: - return PipedIGzipWriter(filename, mode, compresslevel, **text_mode_kwargs) - except (OSError, ValueError): - # No igzip installed or compression level higher than 3 - pass - if igzip: # We can use the CLI from isal.igzip + if compresslevel is None: + preferred_applications = [ + PipedIGzipWriter, + PipedPythonIsalWriter, + PipedPythonZlibNGWriter, + functools.partial(PipedPigzWriter, threads=threads), + PipedGzipWriter, + ] + elif compresslevel < 3: + preferred_applications = [ + PipedIGzipWriter, + PipedPythonIsalWriter, + PipedPythonZlibNGWriter, + functools.partial(PipedPigzWriter, threads=threads), + PipedGzipWriter, + ] + else: + # ISA-L level 3 is very similar in compression to levels 1 and 2. + # prefer zlib-ng instead for better compression. + preferred_applications = [ + PipedPythonZlibNGWriter, + PipedIGzipWriter, + PipedPythonIsalWriter, + functools.partial(PipedPigzWriter, threads=threads), + PipedGzipWriter, + ] + for app_class in preferred_applications: try: - return PipedPythonIsalWriter( - filename, mode, compresslevel, **text_mode_kwargs + return app_class( + filename, mode, compresslevel=compresslevel, **text_mode_kwargs ) - except ValueError: # Wrong compression level - pass - if gzip_ng: - return PipedPythonZlibNGWriter( - filename, mode, compresslevel, **text_mode_kwargs - ) - try: - return PipedPigzWriter( - filename, mode, compresslevel, threads=threads, **text_mode_kwargs - ) - except OSError: - return PipedGzipWriter(filename, mode, compresslevel, **text_mode_kwargs) + except (OSError, ValueError): + continue + raise OSError("No external applications available") def _open_gz(filename, mode: str, compresslevel, threads, **text_mode_kwargs): From c04ff39240c2337ec90dc6d6b40e02f137b0d410 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Fri, 3 Feb 2023 09:30:39 +0100 Subject: [PATCH 09/17] Refactor open_reproducible --- src/xopen/__init__.py | 56 +++++++++++++++++++------------------------ 1 file changed, 25 insertions(+), 31 deletions(-) diff --git a/src/xopen/__init__.py b/src/xopen/__init__.py index 1a01860..2dfa0a5 100644 --- a/src/xopen/__init__.py +++ b/src/xopen/__init__.py @@ -1186,39 +1186,33 @@ def _open_reproducible_gzip(filename, mode, compresslevel): mode=mode, mtime=0, ) - gzip_file = None + preferred_class = [gzip.GzipFile] + if compresslevel is None: + # By default, the highest available compression level is chosen by + # the python implementations. Choose medium copmpression level for + # consistency with command line applications. + if igzip is not None: + compresslevel = isal_zlib.ISAL_DEFAULT_COMPRESSION + else: + compresslevel = zlib.Z_DEFAULT_COMPRESSION + if gzip_ng is not None: + preferred_class = [gzip_ng.GzipNGFile] + preferred_class + # Compresslevel 1 results in files that are typically 50% larger than + # zlib. So in that case use level 2, which is more similar to zlib and + # also still faster. + compresslevel = max(compresslevel, 2) if igzip is not None: + preferred_class = [igzip.IGzipFile] + preferred_class + last_error = None + for klass in preferred_class: try: - gzip_file = igzip.IGzipFile( - **kwargs, - compresslevel=isal_zlib.ISAL_DEFAULT_COMPRESSION - if compresslevel is None - else compresslevel, - ) - except ValueError: - # Compression level not supported, move to gzip_ng or builtin gzip. - pass - - if gzip_file is None: - if gzip_ng is not None: - gzip_file = gzip_ng.GzipNGFile( - **kwargs, - compresslevel=zlib.Z_DEFAULT_COMPRESSION if compresslevel is None - # Compresslevel 1 results in files that are typically 50% - # larger - # than zlib. So in that case use level 2, which is more similar - # to zlib and also still faster. - else max(compresslevel, 2), - ) - else: - gzip_file = gzip.GzipFile( - **kwargs, - # Override gzip.open's default of 9 for consistency - # with command-line gzip. - compresslevel=zlib.Z_DEFAULT_COMPRESSION - if compresslevel is None - else compresslevel, - ) + gzip_file = klass(**kwargs, compresslevel=compresslevel) + break + except ValueError as e: # igzip does not support all compression levels + last_error = e + continue + else: # no break + raise last_error # When (I)GzipFile is created with a fileobj instead of a filename, # the passed file object is not closed when (I)GzipFile.close() # is called. This forces it to be closed. From 62fc2f0d945ca9fcb7741689859c34424bdc4c6b Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Fri, 3 Feb 2023 09:46:20 +0100 Subject: [PATCH 10/17] Fix wrong compresslevel test --- tests/test_xopen.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/test_xopen.py b/tests/test_xopen.py index 216f50b..ddc0633 100644 --- a/tests/test_xopen.py +++ b/tests/test_xopen.py @@ -259,7 +259,11 @@ def test_invalid_compression_level(tmp_path): with pytest.raises(ValueError) as e: with xopen(tmp_path / "out.gz", mode="w", compresslevel=17) as f: f.write("hello") # pragma: no cover - assert "compresslevel must be" in e.value.args[0] + # The value error should warn about compresslevel or about an invalid + # initialization option. + assert ( + "compresslevel must be" in e.value.args[0] or "nitialization" in e.value.args[0] + ) @pytest.mark.parametrize("ext", extensions) From 27d8a9424ee635dc1c7d551f32bee86553f9d1a0 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Fri, 3 Feb 2023 09:53:20 +0100 Subject: [PATCH 11/17] Do not set compression level globally when gzip_ng is present --- src/xopen/__init__.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/xopen/__init__.py b/src/xopen/__init__.py index 2dfa0a5..5d5aa07 100644 --- a/src/xopen/__init__.py +++ b/src/xopen/__init__.py @@ -1196,11 +1196,15 @@ def _open_reproducible_gzip(filename, mode, compresslevel): else: compresslevel = zlib.Z_DEFAULT_COMPRESSION if gzip_ng is not None: - preferred_class = [gzip_ng.GzipNGFile] + preferred_class # Compresslevel 1 results in files that are typically 50% larger than # zlib. So in that case use level 2, which is more similar to zlib and # also still faster. - compresslevel = max(compresslevel, 2) + def zlibng_class_opener(*args, compresslevel, **kwargs): + return gzip_ng.GzipNGFile( + *args, compresslevel=max(compresslevel, 2), **kwargs + ) + + preferred_class = [zlibng_class_opener] + preferred_class if igzip is not None: preferred_class = [igzip.IGzipFile] + preferred_class last_error = None From 7a76d73661a6441ad71f3a7aa8024baafe022a77 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Fri, 3 Feb 2023 09:54:59 +0100 Subject: [PATCH 12/17] Use platform.machine correctly --- src/xopen/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/xopen/__init__.py b/src/xopen/__init__.py index 5d5aa07..31f09c3 100644 --- a/src/xopen/__init__.py +++ b/src/xopen/__init__.py @@ -23,6 +23,7 @@ import functools import gzip +import platform import sys import io import os @@ -1069,7 +1070,7 @@ def _open_external_gzip_reader( filename, mode, compresslevel, threads, **text_mode_kwargs ): assert mode in ("rt", "rb") - if sys.platform == "AMD64" or sys.platform == "x86_64": + if platform.machine() == "AMD64" or platform.machine() == "x86_64": # For x86-64 there are optimized libraries available for decompression preferred_applications = [ PipedIGzipReader, From cc3b43e9524c3bbccae2dc68f147e0ea37a24703 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Fri, 3 Feb 2023 10:19:51 +0100 Subject: [PATCH 13/17] Make 0 threads consistent with 1 or more threads --- src/xopen/__init__.py | 49 +++++++++++++++++++++++++------------------ 1 file changed, 29 insertions(+), 20 deletions(-) diff --git a/src/xopen/__init__.py b/src/xopen/__init__.py index 31f09c3..febfce6 100644 --- a/src/xopen/__init__.py +++ b/src/xopen/__init__.py @@ -1170,6 +1170,28 @@ def _open_gz(filename, mode: str, compresslevel, threads, **text_mode_kwargs): return g +def gzip_class(*args, compresslevel, **kwargs): + if compresslevel is None: + compresslevel = zlib.Z_DEFAULT_COMPRESSION + return gzip.GzipFile(*args, compresslevel=compresslevel, **kwargs) + + +def igzip_class(*args, compresslevel, **kwargs): + if igzip is None: + raise ValueError("No igzip available") + if compresslevel is None: + compresslevel = isal_zlib.ISAL_DEFAULT_COMPRESSION + return igzip.IGzipFile(*args, compresslevel=compresslevel, **kwargs) + + +def gzip_ng_class(*args, compresslevel, **kwargs): + if gzip_ng is None: + raise ValueError("No gzip_ng available") + if compresslevel is None: + compresslevel = zlib.Z_DEFAULT_COMPRESSION + return gzip_ng.GzipNGFile(*args, compresslevel=max(compresslevel, 2), **kwargs) + + def _open_reproducible_gzip(filename, mode, compresslevel): """ Open a gzip file for writing (without external processes) @@ -1187,29 +1209,16 @@ def _open_reproducible_gzip(filename, mode, compresslevel): mode=mode, mtime=0, ) - preferred_class = [gzip.GzipFile] + if compresslevel is None: - # By default, the highest available compression level is chosen by - # the python implementations. Choose medium copmpression level for - # consistency with command line applications. - if igzip is not None: - compresslevel = isal_zlib.ISAL_DEFAULT_COMPRESSION - else: - compresslevel = zlib.Z_DEFAULT_COMPRESSION - if gzip_ng is not None: - # Compresslevel 1 results in files that are typically 50% larger than - # zlib. So in that case use level 2, which is more similar to zlib and - # also still faster. - def zlibng_class_opener(*args, compresslevel, **kwargs): - return gzip_ng.GzipNGFile( - *args, compresslevel=max(compresslevel, 2), **kwargs - ) + preferred_classes = [igzip_class, gzip_ng_class, gzip_class] + elif compresslevel < 3: + preferred_classes = [igzip_class, gzip_ng_class, gzip_class] + else: + preferred_classes = [gzip_ng_class, igzip_class, gzip_class] - preferred_class = [zlibng_class_opener] + preferred_class - if igzip is not None: - preferred_class = [igzip.IGzipFile] + preferred_class last_error = None - for klass in preferred_class: + for klass in preferred_classes: try: gzip_file = klass(**kwargs, compresslevel=compresslevel) break From d5f03bc331b75171eb249434956ad187d2476782 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Fri, 3 Feb 2023 10:25:03 +0100 Subject: [PATCH 14/17] Simplify logic --- src/xopen/__init__.py | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/src/xopen/__init__.py b/src/xopen/__init__.py index febfce6..cd21697 100644 --- a/src/xopen/__init__.py +++ b/src/xopen/__init__.py @@ -1102,15 +1102,7 @@ def _open_external_gzip_writer( filename, mode, compresslevel, threads, **text_mode_kwargs ): assert mode in ("wt", "wb", "at", "ab") - if compresslevel is None: - preferred_applications = [ - PipedIGzipWriter, - PipedPythonIsalWriter, - PipedPythonZlibNGWriter, - functools.partial(PipedPigzWriter, threads=threads), - PipedGzipWriter, - ] - elif compresslevel < 3: + if compresslevel is None or compresslevel < 3: preferred_applications = [ PipedIGzipWriter, PipedPythonIsalWriter, @@ -1120,7 +1112,7 @@ def _open_external_gzip_writer( ] else: # ISA-L level 3 is very similar in compression to levels 1 and 2. - # prefer zlib-ng instead for better compression. + # prefer zlib-ng instead for better compression at levels higher than 2. preferred_applications = [ PipedPythonZlibNGWriter, PipedIGzipWriter, @@ -1210,11 +1202,11 @@ def _open_reproducible_gzip(filename, mode, compresslevel): mtime=0, ) - if compresslevel is None: - preferred_classes = [igzip_class, gzip_ng_class, gzip_class] - elif compresslevel < 3: + if compresslevel is None or compresslevel < 3: preferred_classes = [igzip_class, gzip_ng_class, gzip_class] else: + # ISA-L level 3 is very similar in compression to levels 1 and 2. + # prefer zlib-ng instead for better compression at levels higher than 2. preferred_classes = [gzip_ng_class, igzip_class, gzip_class] last_error = None From 7e7fca996d84644a1901a4f8c1c0b118ed1c60dc Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Fri, 3 Feb 2023 10:41:08 +0100 Subject: [PATCH 15/17] Make class creators private --- src/xopen/__init__.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/xopen/__init__.py b/src/xopen/__init__.py index cd21697..e47337d 100644 --- a/src/xopen/__init__.py +++ b/src/xopen/__init__.py @@ -1162,13 +1162,13 @@ def _open_gz(filename, mode: str, compresslevel, threads, **text_mode_kwargs): return g -def gzip_class(*args, compresslevel, **kwargs): +def _gzip_class(*args, compresslevel, **kwargs): if compresslevel is None: compresslevel = zlib.Z_DEFAULT_COMPRESSION return gzip.GzipFile(*args, compresslevel=compresslevel, **kwargs) -def igzip_class(*args, compresslevel, **kwargs): +def _igzip_class(*args, compresslevel, **kwargs): if igzip is None: raise ValueError("No igzip available") if compresslevel is None: @@ -1176,7 +1176,7 @@ def igzip_class(*args, compresslevel, **kwargs): return igzip.IGzipFile(*args, compresslevel=compresslevel, **kwargs) -def gzip_ng_class(*args, compresslevel, **kwargs): +def _gzip_ng_class(*args, compresslevel, **kwargs): if gzip_ng is None: raise ValueError("No gzip_ng available") if compresslevel is None: @@ -1203,11 +1203,11 @@ def _open_reproducible_gzip(filename, mode, compresslevel): ) if compresslevel is None or compresslevel < 3: - preferred_classes = [igzip_class, gzip_ng_class, gzip_class] + preferred_classes = [_igzip_class, _gzip_ng_class, _gzip_class] else: # ISA-L level 3 is very similar in compression to levels 1 and 2. # prefer zlib-ng instead for better compression at levels higher than 2. - preferred_classes = [gzip_ng_class, igzip_class, gzip_class] + preferred_classes = [_gzip_ng_class, _igzip_class, _gzip_class] last_error = None for klass in preferred_classes: From f9eabe309106021b75efbe41aa881000ac775167 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Fri, 3 Feb 2023 10:57:19 +0100 Subject: [PATCH 16/17] Parametrize tests with different available libraries --- tests/test_xopen.py | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/tests/test_xopen.py b/tests/test_xopen.py index ddc0633..1385679 100644 --- a/tests/test_xopen.py +++ b/tests/test_xopen.py @@ -80,12 +80,13 @@ def lacking_xz_permissions(tmp_path): yield -@pytest.fixture -def xopen_without_igzip(monkeypatch): +@pytest.fixture(params=[("igzip",), ("gzip_ng",), ("igzip", "gzip_ng")]) +def xopen_without_libs(monkeypatch, request): import xopen # xopen local overrides xopen global variable - monkeypatch.setattr(xopen, "igzip", None) - return xopen.xopen + for lib in request.param: + monkeypatch.setattr(xopen, lib, None) + yield xopen.xopen def test_text(fname): @@ -115,17 +116,17 @@ def test_roundtrip(ext, tmp_path, threads, mode): assert f.read() == data -def test_binary_no_isal_no_threads(fname, xopen_without_igzip): +def test_binary_no_libs_no_threads(fname, xopen_without_libs): if fname.endswith(".zst") and zstandard is None: return - with xopen_without_igzip(fname, "rb", threads=0) as f: + with xopen_without_libs(fname, "rb", threads=0) as f: lines = list(f) assert len(lines) == 2 assert lines[1] == b"The second line.\n", fname -def test_binary_no_isal(fname, xopen_without_igzip): - with xopen_without_igzip(fname, "rb", threads=1) as f: +def test_binary_no_libs(fname, xopen_without_libs): + with xopen_without_libs(fname, "rb", threads=1) as f: lines = list(f) assert len(lines) == 2 assert lines[1] == b"The second line.\n", fname @@ -366,11 +367,11 @@ def test_write_threads(tmp_path, ext): assert f.read() == "hello" -def test_write_pigz_threads_no_isal(tmp_path, xopen_without_igzip): +def test_write_pigz_threads_no_libs(tmp_path, xopen_without_libs): path = tmp_path / "out.gz" - with xopen_without_igzip(path, mode="w", threads=3) as f: + with xopen_without_libs(path, mode="w", threads=3) as f: f.write("hello") - with xopen_without_igzip(path) as f: + with xopen_without_libs(path) as f: assert f.read() == "hello" @@ -392,10 +393,10 @@ def test_write_no_threads(tmp_path, ext): assert isinstance(f.raw, klass), f -def test_write_gzip_no_threads_no_isal(tmp_path, xopen_without_igzip): +def test_write_gzip_no_threads_no_libs(tmp_path, xopen_without_libs): import gzip - with xopen_without_igzip(tmp_path / "out.gz", "wb", threads=0) as f: + with xopen_without_libs(tmp_path / "out.gz", "wb", threads=0) as f: assert isinstance(f.raw, gzip.GzipFile), f @@ -448,16 +449,16 @@ def test_falls_back_to_gzip_open(lacking_pigz_permissions): assert f.readline() == CONTENT_LINES[0].encode("utf-8") -def test_falls_back_to_gzip_open_no_isal(lacking_pigz_permissions, xopen_without_igzip): - with xopen_without_igzip(TEST_DIR / "file.txt.gz", "rb") as f: +def test_falls_back_to_gzip_open_no_libs(lacking_pigz_permissions, xopen_without_libs): + with xopen_without_libs(TEST_DIR / "file.txt.gz", "rb") as f: assert f.readline() == CONTENT_LINES[0].encode("utf-8") -def test_fals_back_to_gzip_open_write_no_isal( - lacking_pigz_permissions, xopen_without_igzip, tmp_path +def test_fals_back_to_gzip_open_write_no_libs( + lacking_pigz_permissions, xopen_without_libs, tmp_path ): tmp = tmp_path / "test.gz" - with xopen_without_igzip(tmp, "wb") as f: + with xopen_without_libs(tmp, "wb") as f: f.write(b"hello") assert gzip.decompress(tmp.read_bytes()) == b"hello" From 80965b9d7e309df36e8b1e20d8d804c80ed05488 Mon Sep 17 00:00:00 2001 From: Ruben Vorderman Date: Fri, 3 Feb 2023 14:17:41 +0100 Subject: [PATCH 17/17] Make PipedPython readers robust against not having modules installed --- src/xopen/__init__.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/xopen/__init__.py b/src/xopen/__init__.py index e47337d..b98f187 100644 --- a/src/xopen/__init__.py +++ b/src/xopen/__init__.py @@ -898,6 +898,9 @@ class PipedPythonIsalReader(PipedCompressionReader): def __init__( self, path, mode: str = "r", *, encoding="utf-8", errors=None, newline=None ): + if not igzip: + # Raise error here, otherwise it will occur during write. + raise OSError("isal module not installed.") super().__init__( path, [sys.executable, "-m", "isal.igzip"], @@ -919,6 +922,9 @@ def __init__( errors=None, newline=None, ): + if not igzip: + # Raise error here, otherwise it will occur during write. + raise OSError("isal module not installed.") if compresslevel is not None and compresslevel not in range(0, 4): raise ValueError("compresslevel must be between 0 and 3") super().__init__( @@ -936,6 +942,9 @@ class PipedPythonZlibNGReader(PipedCompressionReader): def __init__( self, path, mode: str = "r", *, encoding="utf-8", errors=None, newline=None ): + if not gzip_ng: + # Raise error here, otherwise it will occur during write. + raise OSError("zlib-ng module not installed.") super().__init__( path, [sys.executable, "-m", "zlib_ng.gzip_ng"], @@ -957,6 +966,9 @@ def __init__( errors=None, newline=None, ): + if not gzip_ng: + # Raise error here, otherwise it will occur during write. + raise OSError("zlib-ng module not installed.") if compresslevel is not None and compresslevel not in range(1, 10): raise ValueError("compresslevel must be between 1 and 10") if compresslevel == 1: