Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds checksum flag to zstd codec #519

Merged
merged 18 commits into from
Jun 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions docs/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,10 @@ Maintenance

Enhancements
~~~~~~~~~~~~

* Add checksum flag to zstd and set the default level to 0.
By :user:`Norman Rzepka <normanrz>`, :issue:`519`.
* Add PCodec
By :user:`Ryan Abernathey <rabernat>`.
By :user:`Ryan Abernathey <rabernat>`, :issue:`501`.
* Use PyData theme for docs
By :user:`John Kirkham <jakirkham>`, :issue:`485`.

Expand Down
2 changes: 1 addition & 1 deletion fixture/zstd/codec.00/config.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{
"id": "zstd",
"level": 1
"level": 0
}
5 changes: 5 additions & 0 deletions fixture/zstd/codec.07/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"checksum": true,
"id": "zstd",
"level": 0
}
Binary file added fixture/zstd/codec.07/encoded.00.dat
Binary file not shown.
Binary file added fixture/zstd/codec.07/encoded.01.dat
Binary file not shown.
Binary file added fixture/zstd/codec.07/encoded.02.dat
Binary file not shown.
Binary file added fixture/zstd/codec.07/encoded.03.dat
Binary file not shown.
Binary file added fixture/zstd/codec.07/encoded.04.dat
Binary file not shown.
Binary file added fixture/zstd/codec.07/encoded.05.dat
Binary file not shown.
Binary file added fixture/zstd/codec.07/encoded.06.dat
Binary file not shown.
Binary file added fixture/zstd/codec.07/encoded.07.dat
Binary file not shown.
Binary file added fixture/zstd/codec.07/encoded.08.dat
Binary file not shown.
Binary file added fixture/zstd/codec.07/encoded.09.dat
Binary file not shown.
Binary file added fixture/zstd/codec.07/encoded.10.dat
Binary file not shown.
Binary file added fixture/zstd/codec.07/encoded.11.dat
Binary file not shown.
Binary file added fixture/zstd/codec.07/encoded.12.dat
Binary file not shown.
5 changes: 5 additions & 0 deletions fixture/zstd/codec.08/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"checksum": true,
"id": "zstd",
"level": 0
}
Binary file added fixture/zstd/codec.08/encoded.00.dat
Binary file not shown.
Binary file added fixture/zstd/codec.08/encoded.01.dat
Binary file not shown.
Binary file added fixture/zstd/codec.08/encoded.02.dat
Binary file not shown.
Binary file added fixture/zstd/codec.08/encoded.03.dat
Binary file not shown.
Binary file added fixture/zstd/codec.08/encoded.04.dat
Binary file not shown.
Binary file added fixture/zstd/codec.08/encoded.05.dat
Binary file not shown.
Binary file added fixture/zstd/codec.08/encoded.06.dat
Binary file not shown.
Binary file added fixture/zstd/codec.08/encoded.07.dat
Binary file not shown.
Binary file added fixture/zstd/codec.08/encoded.08.dat
Binary file not shown.
Binary file added fixture/zstd/codec.08/encoded.09.dat
Binary file not shown.
Binary file added fixture/zstd/codec.08/encoded.10.dat
Binary file not shown.
Binary file added fixture/zstd/codec.08/encoded.11.dat
Binary file not shown.
Binary file added fixture/zstd/codec.08/encoded.12.dat
Binary file not shown.
5 changes: 5 additions & 0 deletions fixture/zstd/codec.09/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"checksum": true,
"id": "zstd",
"level": 22
}
Binary file added fixture/zstd/codec.09/encoded.00.dat
Binary file not shown.
Binary file added fixture/zstd/codec.09/encoded.01.dat
Binary file not shown.
Binary file added fixture/zstd/codec.09/encoded.02.dat
Binary file not shown.
Binary file added fixture/zstd/codec.09/encoded.03.dat
Binary file not shown.
Binary file added fixture/zstd/codec.09/encoded.04.dat
Binary file not shown.
Binary file added fixture/zstd/codec.09/encoded.05.dat
Binary file not shown.
Binary file added fixture/zstd/codec.09/encoded.06.dat
Binary file not shown.
Binary file added fixture/zstd/codec.09/encoded.07.dat
Binary file not shown.
Binary file added fixture/zstd/codec.09/encoded.08.dat
Binary file not shown.
Binary file added fixture/zstd/codec.09/encoded.09.dat
Binary file not shown.
Binary file added fixture/zstd/codec.09/encoded.10.dat
Binary file not shown.
Binary file added fixture/zstd/codec.09/encoded.11.dat
Binary file not shown.
Binary file added fixture/zstd/codec.09/encoded.12.dat
Binary file not shown.
21 changes: 19 additions & 2 deletions numcodecs/tests/test_zstd.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,15 +29,18 @@
Zstd(level=10),
Zstd(level=22),
Zstd(level=100),
Zstd(checksum=True),
Zstd(level=0, checksum=True),
Zstd(level=22, checksum=True),
]


# mix of dtypes: integer, float, bool, string
# mix of shapes: 1D, 2D, 3D
# mix of orders: C, F
arrays = [
np.arange(1000, dtype='i4'),
np.linspace(1000, 1001, 1000, dtype='f8'),
np.arange(1000, dtype="i4"),
np.linspace(1000, 1001, 1000, dtype="f8"),
np.random.normal(loc=1000, scale=1, size=(100, 10)),
np.random.randint(0, 2, size=1000, dtype=bool).reshape(100, 10, order='F'),
np.random.choice([b'a', b'bb', b'ccc'], size=1000).reshape(10, 10, 10),
Expand Down Expand Up @@ -76,3 +79,17 @@ def test_err_decode_object_buffer():

def test_err_encode_object_buffer():
check_err_encode_object_buffer(Zstd())


def test_checksum():
    """Enabling ``checksum`` must lengthen the encoded output by exactly 4 bytes."""
    data = np.arange(0, 64, dtype="uint8")
    # Encode the same payload twice, differing only in the checksum flag.
    plain_size = len(Zstd(level=0, checksum=False).encode(data))
    checked_size = len(Zstd(level=0, checksum=True).encode(data))
    assert plain_size + 4 == checked_size


def test_native_functions():
    """Check the compression-level attributes exposed on the ``Zstd`` class."""
    # Note, these assertions might need to be changed for new versions of zstd
    expected_levels = (
        ("default_level", 3),
        ("min_level", -131072),
        ("max_level", 22),
    )
    for attribute, expected in expected_levels:
        assert getattr(Zstd, attribute) == expected
88 changes: 70 additions & 18 deletions numcodecs/zstd.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -19,21 +19,38 @@ cdef extern from "zstd.h":

unsigned ZSTD_versionNumber() nogil

size_t ZSTD_compress(void* dst,
size_t dstCapacity,
const void* src,
size_t srcSize,
int compressionLevel) nogil
struct ZSTD_CCtx_s:
pass
ctypedef ZSTD_CCtx_s ZSTD_CCtx
cdef enum ZSTD_cParameter:
ZSTD_c_compressionLevel=100
ZSTD_c_checksumFlag=201

ZSTD_CCtx* ZSTD_createCCtx() nogil
size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx) nogil
size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx,
ZSTD_cParameter param,
int value) nogil

size_t ZSTD_compress2(ZSTD_CCtx* cctx,
void* dst,
size_t dstCapacity,
const void* src,
size_t srcSize) nogil

size_t ZSTD_decompress(void* dst,
size_t dstCapacity,
const void* src,
size_t compressedSize) nogil

unsigned long long ZSTD_getDecompressedSize(const void* src,
cdef long ZSTD_CONTENTSIZE_UNKNOWN
cdef long ZSTD_CONTENTSIZE_ERROR
unsigned long long ZSTD_getFrameContentSize(const void* src,
size_t srcSize) nogil

int ZSTD_minCLevel() nogil
int ZSTD_maxCLevel() nogil
int ZSTD_defaultCLevel() nogil

size_t ZSTD_compressBound(size_t srcSize) nogil

Expand All @@ -51,11 +68,11 @@ MICRO_VERSION_NUMBER = (
(MINOR_VERSION_NUMBER * 100)
)
__version__ = '%s.%s.%s' % (MAJOR_VERSION_NUMBER, MINOR_VERSION_NUMBER, MICRO_VERSION_NUMBER)
DEFAULT_CLEVEL = 1
DEFAULT_CLEVEL = 0
MAX_CLEVEL = ZSTD_maxCLevel()


def compress(source, int level=DEFAULT_CLEVEL):
def compress(source, int level=DEFAULT_CLEVEL, bint checksum=False):
"""Compress data.

Parameters
Expand All @@ -64,7 +81,9 @@ def compress(source, int level=DEFAULT_CLEVEL):
Data to be compressed. Can be any object supporting the buffer
protocol.
level : int
Compression level (1-22).
Compression level (-131072 to 22).
checksum : bool
Flag to enable checksums. The default is False.

Returns
-------
Expand All @@ -80,8 +99,6 @@ def compress(source, int level=DEFAULT_CLEVEL):
bytes dest

# check level
if level <= 0:
level = DEFAULT_CLEVEL
if level > MAX_CLEVEL:
level = MAX_CLEVEL

Expand All @@ -90,6 +107,19 @@ def compress(source, int level=DEFAULT_CLEVEL):
source_ptr = source_buffer.ptr
source_size = source_buffer.nbytes

cctx = ZSTD_createCCtx()
param_set_result = ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, level)

if ZSTD_isError(param_set_result):
error = ZSTD_getErrorName(param_set_result)
raise RuntimeError('Could not set zstd compression level: %s' % error)

param_set_result = ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 1 if checksum else 0)

if ZSTD_isError(param_set_result):
error = ZSTD_getErrorName(param_set_result)
raise RuntimeError('Could not set zstd checksum flag: %s' % error)

try:

# setup destination
Expand All @@ -99,10 +129,11 @@ def compress(source, int level=DEFAULT_CLEVEL):

# perform compression
with nogil:
compressed_size = ZSTD_compress(dest_ptr, dest_size, source_ptr, source_size, level)
compressed_size = ZSTD_compress2(cctx, dest_ptr, dest_size, source_ptr, source_size)

finally:

if cctx:
ZSTD_freeCCtx(cctx)
# release buffers
source_buffer.release()

Expand Down Expand Up @@ -148,8 +179,8 @@ def decompress(source, dest=None):
try:

# determine uncompressed size
dest_size = ZSTD_getDecompressedSize(source_ptr, source_size)
if dest_size == 0:
dest_size = ZSTD_getFrameContentSize(source_ptr, source_size)
if dest_size == 0 or dest_size == ZSTD_CONTENTSIZE_UNKNOWN or dest_size == ZSTD_CONTENTSIZE_ERROR:
raise RuntimeError('Zstd decompression error: invalid input data')

# setup destination buffer
Expand Down Expand Up @@ -193,7 +224,9 @@ class Zstd(Codec):
Parameters
----------
level : int
Compression level (1-22).
Compression level (-131072 to 22).
checksum : bool
Flag to enable checksums. The default is False.

See Also
--------
Expand All @@ -207,12 +240,13 @@ class Zstd(Codec):
# practical limit on the size of buffers that Zstd can process and so we don't
# enforce a max_buffer_size option here.

def __init__(self, level=DEFAULT_CLEVEL):
def __init__(self, level=DEFAULT_CLEVEL, checksum=False):
self.level = level
self.checksum = checksum

def encode(self, buf):
buf = ensure_contiguous_ndarray(buf)
return compress(buf, self.level)
return compress(buf, self.level, self.checksum)

def decode(self, buf, out=None):
buf = ensure_contiguous_ndarray(buf)
Expand All @@ -223,3 +257,21 @@ class Zstd(Codec):
(type(self).__name__,
self.level)
return r

class _ClassProperty:
    """Read-only descriptor that evaluates ``getter(owner_class)`` on access.

    Used instead of stacking ``@classmethod`` on ``@property``: that
    combination was deprecated in Python 3.11 and no longer works from
    Python 3.13 on, where the attribute stops resolving to the getter's
    return value.
    """

    def __init__(self, getter):
        self._getter = getter
        # Keep the getter's docstring visible on the descriptor itself.
        self.__doc__ = getter.__doc__

    def __get__(self, instance, owner=None):
        # Serve both class-level (``Cls.attr``) and instance-level
        # (``Cls().attr``) lookups; the getter always receives the class.
        return self._getter(type(instance) if owner is None else owner)

@_ClassProperty
def default_level(cls):
    """Returns the default compression level of the underlying zstd library."""
    return ZSTD_defaultCLevel()

@_ClassProperty
def min_level(cls):
    """Returns the minimum compression level of the underlying zstd library."""
    return ZSTD_minCLevel()

@_ClassProperty
def max_level(cls):
    """Returns the maximum compression level of the underlying zstd library."""
    return ZSTD_maxCLevel()
Loading