Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add zlib-ng and refactor gzip open functions. #124

Closed
wants to merge 18 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package_dir =
packages = find:
install_requires =
isal>=1.0.0; platform.machine == "x86_64" or platform.machine == "AMD64" or platform.machine == "aarch64"
zlib-ng>=0.1.0; platform.machine == "x86_64" or platform.machine == "AMD64" or platform.machine == "aarch64"
typing_extensions; python_version<'3.8'

[options.packages.find]
Expand Down
205 changes: 158 additions & 47 deletions src/xopen/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,9 @@
"__version__",
]

import functools
import gzip
import platform
import sys
import io
import os
Expand All @@ -33,6 +35,7 @@
import subprocess
import tempfile
import time
import zlib
from abc import ABC, abstractmethod
from subprocess import Popen, PIPE, DEVNULL
from typing import Optional, Union, TextIO, AnyStr, IO, List, Set, overload, BinaryIO
Expand All @@ -51,13 +54,19 @@

igzip: Optional[ModuleType]
isal_zlib: Optional[ModuleType]
gzip_ng: Optional[ModuleType]

try:
from isal import igzip, isal_zlib
except ImportError:
igzip = None
isal_zlib = None

try:
from zlib_ng import gzip_ng
except ImportError:
gzip_ng = None

try:
import zstandard # type: ignore
except ImportError:
Expand All @@ -79,7 +88,9 @@
_MAX_PIPE_SIZE = int(
_MAX_PIPE_SIZE_PATH.read_text(encoding="ascii")
) # type: Optional[int]
except OSError: # Catches file not found and permission errors. Possible other errors too.
except (
OSError
): # Catches file not found and permission errors. Possible other errors too.
_MAX_PIPE_SIZE = None


Expand Down Expand Up @@ -887,6 +898,9 @@ class PipedPythonIsalReader(PipedCompressionReader):
def __init__(
self, path, mode: str = "r", *, encoding="utf-8", errors=None, newline=None
):
if not igzip:
# Raise error here, otherwise it will occur during write.
raise OSError("isal module not installed.")
super().__init__(
path,
[sys.executable, "-m", "isal.igzip"],
Expand All @@ -908,6 +922,9 @@ def __init__(
errors=None,
newline=None,
):
if not igzip:
# Raise error here, otherwise it will occur during write.
raise OSError("isal module not installed.")
if compresslevel is not None and compresslevel not in range(0, 4):
raise ValueError("compresslevel must be between 0 and 3")
super().__init__(
Expand All @@ -921,6 +938,55 @@ def __init__(
)


class PipedPythonZlibNGReader(PipedCompressionReader):
    """
    Reader that decompresses through a ``python -m zlib_ng.gzip_ng`` subprocess.
    """

    def __init__(
        self, path, mode: str = "r", *, encoding="utf-8", errors=None, newline=None
    ):
        """
        Set up the reader for *path*.

        Raises OSError when the zlib_ng package could not be imported.
        The text-related options are forwarded unchanged to the parent class.
        """
        if not gzip_ng:
            # Fail at construction time: without the module, the spawned
            # interpreter would not be able to run the command below.
            raise OSError("zlib-ng module not installed.")
        command = [sys.executable, "-m", "zlib_ng.gzip_ng"]
        text_options = {"encoding": encoding, "errors": errors, "newline": newline}
        super().__init__(path, command, mode, **text_options)


class PipedPythonZlibNGWriter(PipedCompressionWriter):
    """
    Writer that compresses through a ``python -m zlib_ng.gzip_ng --no-name``
    subprocess.
    """

    def __init__(
        self,
        path,
        mode: str = "wt",
        compresslevel: Optional[int] = None,
        *,
        encoding="utf-8",
        errors=None,
        newline=None,
    ):
        """
        Set up the writer for *path*.

        compresslevel -- None (passed on as-is to the parent class) or an
            integer from 1 to 9. Level 1 is silently replaced by level 2.

        Raises OSError when the zlib_ng package could not be imported and
        ValueError when compresslevel is outside 1-9.
        """
        if not gzip_ng:
            # Raise error here, otherwise it will occur during write.
            raise OSError("zlib-ng module not installed.")
        if compresslevel is not None and compresslevel not in range(1, 10):
            # range(1, 10) covers 1..9 inclusive; the message must match.
            raise ValueError("compresslevel must be between 1 and 9")
        if compresslevel == 1:
            # Compresslevel 1 results in files that are typically 50% larger
            # than zlib. So in that case use level 2, which is more similar
            # to zlib and also still faster.
            compresslevel = 2
        super().__init__(
            path,
            [sys.executable, "-m", "zlib_ng.gzip_ng", "--no-name"],
            mode,
            compresslevel,
            encoding=encoding,
            errors=errors,
            newline=newline,
        )


def _open_stdin_or_out(mode: str, **text_mode_kwargs) -> IO:
# Do not return sys.stdin or sys.stdout directly as we want the returned object
# to be closable without closing sys.stdout.
Expand Down Expand Up @@ -1016,42 +1082,64 @@ def _open_external_gzip_reader(
filename, mode, compresslevel, threads, **text_mode_kwargs
):
assert mode in ("rt", "rb")
try:
return PipedIGzipReader(filename, mode, **text_mode_kwargs)
except (OSError, ValueError):
# No igzip installed or version does not support reading
# concatenated files.
pass
if igzip:
return PipedPythonIsalReader(filename, mode, **text_mode_kwargs)
try:
return PipedPigzReader(filename, mode, threads=threads, **text_mode_kwargs)
except OSError:
return PipedGzipReader(filename, mode, **text_mode_kwargs)
if platform.machine() == "AMD64" or platform.machine() == "x86_64":
# For x86-64 there are optimized libraries available for decompression
preferred_applications = [
PipedIGzipReader,
PipedPythonIsalReader,
PipedPythonZlibNGReader,
functools.partial(PipedPigzReader, threads=threads),
PipedGzipReader,
]
else:
# On other platforms, the zlib, zlib-ng and isal libraries perform
# similarly at decompression. C implementations perform better than
# Python implementations.
preferred_applications = [
functools.partial(PipedPigzReader, threads=threads),
PipedIGzipReader,
PipedPythonIsalReader,
PipedPythonZlibNGReader,
PipedGzipReader, # Gzip decompresses very slowly.
]
for app_class in preferred_applications:
try:
return app_class(filename, mode, **text_mode_kwargs)
except (OSError, ValueError):
continue
raise OSError("No external applications available")


def _open_external_gzip_writer(
filename, mode, compresslevel, threads, **text_mode_kwargs
):
assert mode in ("wt", "wb", "at", "ab")
try:
return PipedIGzipWriter(filename, mode, compresslevel, **text_mode_kwargs)
except (OSError, ValueError):
# No igzip installed or compression level higher than 3
pass
if igzip: # We can use the CLI from isal.igzip
if compresslevel is None or compresslevel < 3:
preferred_applications = [
PipedIGzipWriter,
PipedPythonIsalWriter,
PipedPythonZlibNGWriter,
functools.partial(PipedPigzWriter, threads=threads),
PipedGzipWriter,
]
else:
# ISA-L level 3 is very similar in compression to levels 1 and 2.
# Prefer zlib-ng instead for better compression at levels higher than 2.
preferred_applications = [
PipedPythonZlibNGWriter,
PipedIGzipWriter,
PipedPythonIsalWriter,
functools.partial(PipedPigzWriter, threads=threads),
PipedGzipWriter,
]
for app_class in preferred_applications:
try:
return PipedPythonIsalWriter(
filename, mode, compresslevel, **text_mode_kwargs
return app_class(
filename, mode, compresslevel=compresslevel, **text_mode_kwargs
)
except ValueError: # Wrong compression level
pass
try:
return PipedPigzWriter(
filename, mode, compresslevel, threads=threads, **text_mode_kwargs
)
except OSError:
return PipedGzipWriter(filename, mode, compresslevel, **text_mode_kwargs)
except (OSError, ValueError):
continue
raise OSError("No external applications available")


def _open_gz(filename, mode: str, compresslevel, threads, **text_mode_kwargs):
Expand All @@ -1072,6 +1160,8 @@ def _open_gz(filename, mode: str, compresslevel, threads, **text_mode_kwargs):
if "r" in mode:
if igzip is not None:
return igzip.open(filename, mode, **text_mode_kwargs)
if gzip_ng is not None:
return gzip_ng.open(filename, mode, **text_mode_kwargs)
return gzip.open(filename, mode, **text_mode_kwargs)

g = _open_reproducible_gzip(
Expand All @@ -1084,6 +1174,28 @@ def _open_gz(filename, mode: str, compresslevel, threads, **text_mode_kwargs):
return g


def _gzip_class(*args, compresslevel, **kwargs):
if compresslevel is None:
compresslevel = zlib.Z_DEFAULT_COMPRESSION
return gzip.GzipFile(*args, compresslevel=compresslevel, **kwargs)


def _igzip_class(*args, compresslevel, **kwargs):
    """
    Build an ``igzip.IGzipFile`` from the given arguments.

    Raises ValueError when the igzip module is unavailable. A *compresslevel*
    of None is translated to ``isal_zlib.ISAL_DEFAULT_COMPRESSION``.
    """
    if igzip is None:
        raise ValueError("No igzip available")
    level = (
        isal_zlib.ISAL_DEFAULT_COMPRESSION if compresslevel is None else compresslevel
    )
    return igzip.IGzipFile(*args, compresslevel=level, **kwargs)


def _gzip_ng_class(*args, compresslevel, **kwargs):
    """
    Build a ``gzip_ng.GzipNGFile`` from the given arguments.

    compresslevel -- None selects ``zlib.Z_DEFAULT_COMPRESSION`` (-1), which
        is passed through untouched so the library default is used. Explicit
        levels 0 and 1 are raised to 2; higher levels are passed unchanged.

    Raises ValueError when the gzip_ng module is unavailable.
    """
    if gzip_ng is None:
        raise ValueError("No gzip_ng available")
    if compresslevel is None:
        compresslevel = zlib.Z_DEFAULT_COMPRESSION
    elif 0 <= compresslevel < 2:
        # Only explicit low levels are raised to 2. A plain
        # max(compresslevel, 2) would also turn the default marker (-1)
        # into level 2, so the library default would never be used.
        compresslevel = 2
    return gzip_ng.GzipNGFile(*args, compresslevel=compresslevel, **kwargs)


def _open_reproducible_gzip(filename, mode, compresslevel):
"""
Open a gzip file for writing (without external processes)
Expand All @@ -1101,25 +1213,24 @@ def _open_reproducible_gzip(filename, mode, compresslevel):
mode=mode,
mtime=0,
)
gzip_file = None
if igzip is not None:

if compresslevel is None or compresslevel < 3:
preferred_classes = [_igzip_class, _gzip_ng_class, _gzip_class]
else:
# ISA-L level 3 is very similar in compression to levels 1 and 2.
# Prefer zlib-ng instead for better compression at levels higher than 2.
preferred_classes = [_gzip_ng_class, _igzip_class, _gzip_class]

last_error = None
for klass in preferred_classes:
try:
gzip_file = igzip.IGzipFile(
**kwargs,
compresslevel=isal_zlib.ISAL_DEFAULT_COMPRESSION
if compresslevel is None
else compresslevel,
)
except ValueError:
# Compression level not supported, move to built-in gzip.
pass
if gzip_file is None:
gzip_file = gzip.GzipFile(
**kwargs,
# Override gzip.open's default of 9 for consistency
# with command-line gzip.
compresslevel=6 if compresslevel is None else compresslevel,
)
gzip_file = klass(**kwargs, compresslevel=compresslevel)
break
except ValueError as e: # igzip does not support all compression levels
last_error = e
continue
else: # no break
raise last_error
# When (I)GzipFile is created with a fileobj instead of a filename,
# the passed file object is not closed when (I)GzipFile.close()
# is called. This forces it to be closed.
Expand Down
8 changes: 8 additions & 0 deletions tests/test_piped.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,16 @@
PipedIGzipWriter,
PipedPythonIsalReader,
PipedPythonIsalWriter,
PipedPythonZlibNGReader,
PipedPythonZlibNGWriter,
PipedXzReader,
PipedXzWriter,
PipedZstdReader,
PipedZstdWriter,
_MAX_PIPE_SIZE,
_can_read_concatenated_gz,
igzip,
gzip_ng,
)

extensions = ["", ".gz", ".bz2", ".xz", ".zst"]
Expand Down Expand Up @@ -76,6 +79,9 @@ def available_gzip_readers_and_writers():
if igzip is not None:
readers.append(PipedPythonIsalReader)
writers.append(PipedPythonIsalWriter)
if gzip_ng is not None:
readers.append(PipedPythonZlibNGReader)
writers.append(PipedPythonZlibNGWriter)
return readers, writers


Expand Down Expand Up @@ -334,6 +340,8 @@ def writers_and_levels():
elif writer == PipedIGzipWriter or writer == PipedPythonIsalWriter:
# Levels 0-3 are supported
yield from ((writer, i) for i in range(4))
elif writer == PipedPythonZlibNGWriter:
yield from ((writer, i) for i in range(1, 10))
else:
raise NotImplementedError(
f"Test should be implemented for " f"{writer}"
Expand Down
Loading