Skip to content

Commit

Permalink
Adds checksum flag to zstd codec (#519)
Browse files Browse the repository at this point in the history
* expose checksum toggle for zstd

* fixes zstd checksumming

* less fixtures

* write_checksum -> checksum

* adds release notes

* set default clevel to 0

* release

* update fixtures

* fix checksum flag

* add test for checksum

* adds wrapper codecs for the v2 codec pipeline

* docstring
  • Loading branch information
normanrz authored Jun 24, 2024
1 parent bef2e16 commit 5b12b15
Show file tree
Hide file tree
Showing 46 changed files with 108 additions and 23 deletions.
5 changes: 3 additions & 2 deletions docs/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,10 @@ Maintenance

Enhancements
~~~~~~~~~~~~

* Add a checksum flag to zstd and set the default level to 0.
By :user:`Norman Rzepka <normanrz>`, :issue:`519`.
* Add PCodec
By :user:`Ryan Abernathey <rabernat>`.
By :user:`Ryan Abernathey <rabernat>`, :issue:`501`.
* Use PyData theme for docs
By :user:`John Kirkham <jakirkham>`, :issue:`485`.

Expand Down
2 changes: 1 addition & 1 deletion fixture/zstd/codec.00/config.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{
"id": "zstd",
"level": 1
"level": 0
}
5 changes: 5 additions & 0 deletions fixture/zstd/codec.07/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"checksum": true,
"id": "zstd",
"level": 0
}
Binary file added fixture/zstd/codec.07/encoded.00.dat
Binary file not shown.
Binary file added fixture/zstd/codec.07/encoded.01.dat
Binary file not shown.
Binary file added fixture/zstd/codec.07/encoded.02.dat
Binary file not shown.
Binary file added fixture/zstd/codec.07/encoded.03.dat
Binary file not shown.
Binary file added fixture/zstd/codec.07/encoded.04.dat
Binary file not shown.
Binary file added fixture/zstd/codec.07/encoded.05.dat
Binary file not shown.
Binary file added fixture/zstd/codec.07/encoded.06.dat
Binary file not shown.
Binary file added fixture/zstd/codec.07/encoded.07.dat
Binary file not shown.
Binary file added fixture/zstd/codec.07/encoded.08.dat
Binary file not shown.
Binary file added fixture/zstd/codec.07/encoded.09.dat
Binary file not shown.
Binary file added fixture/zstd/codec.07/encoded.10.dat
Binary file not shown.
Binary file added fixture/zstd/codec.07/encoded.11.dat
Binary file not shown.
Binary file added fixture/zstd/codec.07/encoded.12.dat
Binary file not shown.
5 changes: 5 additions & 0 deletions fixture/zstd/codec.08/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"checksum": true,
"id": "zstd",
"level": 0
}
Binary file added fixture/zstd/codec.08/encoded.00.dat
Binary file not shown.
Binary file added fixture/zstd/codec.08/encoded.01.dat
Binary file not shown.
Binary file added fixture/zstd/codec.08/encoded.02.dat
Binary file not shown.
Binary file added fixture/zstd/codec.08/encoded.03.dat
Binary file not shown.
Binary file added fixture/zstd/codec.08/encoded.04.dat
Binary file not shown.
Binary file added fixture/zstd/codec.08/encoded.05.dat
Binary file not shown.
Binary file added fixture/zstd/codec.08/encoded.06.dat
Binary file not shown.
Binary file added fixture/zstd/codec.08/encoded.07.dat
Binary file not shown.
Binary file added fixture/zstd/codec.08/encoded.08.dat
Binary file not shown.
Binary file added fixture/zstd/codec.08/encoded.09.dat
Binary file not shown.
Binary file added fixture/zstd/codec.08/encoded.10.dat
Binary file not shown.
Binary file added fixture/zstd/codec.08/encoded.11.dat
Binary file not shown.
Binary file added fixture/zstd/codec.08/encoded.12.dat
Binary file not shown.
5 changes: 5 additions & 0 deletions fixture/zstd/codec.09/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"checksum": true,
"id": "zstd",
"level": 22
}
Binary file added fixture/zstd/codec.09/encoded.00.dat
Binary file not shown.
Binary file added fixture/zstd/codec.09/encoded.01.dat
Binary file not shown.
Binary file added fixture/zstd/codec.09/encoded.02.dat
Binary file not shown.
Binary file added fixture/zstd/codec.09/encoded.03.dat
Binary file not shown.
Binary file added fixture/zstd/codec.09/encoded.04.dat
Binary file not shown.
Binary file added fixture/zstd/codec.09/encoded.05.dat
Binary file not shown.
Binary file added fixture/zstd/codec.09/encoded.06.dat
Binary file not shown.
Binary file added fixture/zstd/codec.09/encoded.07.dat
Binary file not shown.
Binary file added fixture/zstd/codec.09/encoded.08.dat
Binary file not shown.
Binary file added fixture/zstd/codec.09/encoded.09.dat
Binary file not shown.
Binary file added fixture/zstd/codec.09/encoded.10.dat
Binary file not shown.
Binary file added fixture/zstd/codec.09/encoded.11.dat
Binary file not shown.
Binary file added fixture/zstd/codec.09/encoded.12.dat
Binary file not shown.
21 changes: 19 additions & 2 deletions numcodecs/tests/test_zstd.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,15 +29,18 @@
Zstd(level=10),
Zstd(level=22),
Zstd(level=100),
Zstd(checksum=True),
Zstd(level=0, checksum=True),
Zstd(level=22, checksum=True),
]


# mix of dtypes: integer, float, bool, string
# mix of shapes: 1D, 2D, 3D
# mix of orders: C, F
arrays = [
np.arange(1000, dtype='i4'),
np.linspace(1000, 1001, 1000, dtype='f8'),
np.arange(1000, dtype="i4"),
np.linspace(1000, 1001, 1000, dtype="f8"),
np.random.normal(loc=1000, scale=1, size=(100, 10)),
np.random.randint(0, 2, size=1000, dtype=bool).reshape(100, 10, order='F'),
np.random.choice([b'a', b'bb', b'ccc'], size=1000).reshape(10, 10, 10),
Expand Down Expand Up @@ -76,3 +79,17 @@ def test_err_decode_object_buffer():

def test_err_encode_object_buffer():
    """Run the shared object-buffer encode error check on a default Zstd codec."""
    codec = Zstd()
    check_err_encode_object_buffer(codec)


def test_checksum():
    """Enabling ``checksum`` must grow the encoded output by exactly 4 bytes."""
    data = np.arange(0, 64, dtype="uint8")

    # Encode the same payload at the same level, toggling only the checksum flag.
    without_checksum = Zstd(level=0, checksum=False).encode(data)
    with_checksum = Zstd(level=0, checksum=True).encode(data)

    assert len(without_checksum) + 4 == len(with_checksum)


def test_native_functions():
    """Check the compression-level bounds exposed on the ``Zstd`` class."""
    # Note, these assertions might need to be changed for new versions of zstd
    expected_levels = (
        ("default_level", 3),
        ("min_level", -131072),
        ("max_level", 22),
    )
    for attr_name, expected in expected_levels:
        assert getattr(Zstd, attr_name) == expected
88 changes: 70 additions & 18 deletions numcodecs/zstd.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -19,21 +19,38 @@ cdef extern from "zstd.h":

unsigned ZSTD_versionNumber() nogil

size_t ZSTD_compress(void* dst,
size_t dstCapacity,
const void* src,
size_t srcSize,
int compressionLevel) nogil
struct ZSTD_CCtx_s:
pass
ctypedef ZSTD_CCtx_s ZSTD_CCtx
cdef enum ZSTD_cParameter:
ZSTD_c_compressionLevel=100
ZSTD_c_checksumFlag=201

ZSTD_CCtx* ZSTD_createCCtx() nogil
size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx) nogil
size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx,
ZSTD_cParameter param,
int value) nogil

size_t ZSTD_compress2(ZSTD_CCtx* cctx,
void* dst,
size_t dstCapacity,
const void* src,
size_t srcSize) nogil

size_t ZSTD_decompress(void* dst,
size_t dstCapacity,
const void* src,
size_t compressedSize) nogil

unsigned long long ZSTD_getDecompressedSize(const void* src,
cdef long ZSTD_CONTENTSIZE_UNKNOWN
cdef long ZSTD_CONTENTSIZE_ERROR
unsigned long long ZSTD_getFrameContentSize(const void* src,
size_t srcSize) nogil

int ZSTD_minCLevel() nogil
int ZSTD_maxCLevel() nogil
int ZSTD_defaultCLevel() nogil

size_t ZSTD_compressBound(size_t srcSize) nogil

Expand All @@ -51,11 +68,11 @@ MICRO_VERSION_NUMBER = (
(MINOR_VERSION_NUMBER * 100)
)
__version__ = '%s.%s.%s' % (MAJOR_VERSION_NUMBER, MINOR_VERSION_NUMBER, MICRO_VERSION_NUMBER)
DEFAULT_CLEVEL = 1
DEFAULT_CLEVEL = 0
MAX_CLEVEL = ZSTD_maxCLevel()


def compress(source, int level=DEFAULT_CLEVEL):
def compress(source, int level=DEFAULT_CLEVEL, bint checksum=False):
"""Compress data.
Parameters
Expand All @@ -64,7 +81,9 @@ def compress(source, int level=DEFAULT_CLEVEL):
Data to be compressed. Can be any object supporting the buffer
protocol.
level : int
Compression level (1-22).
Compression level (-131072 to 22).
checksum : bool
Flag to enable checksums. The default is False.
Returns
-------
Expand All @@ -80,8 +99,6 @@ def compress(source, int level=DEFAULT_CLEVEL):
bytes dest

# check level
if level <= 0:
level = DEFAULT_CLEVEL
if level > MAX_CLEVEL:
level = MAX_CLEVEL

Expand All @@ -90,6 +107,19 @@ def compress(source, int level=DEFAULT_CLEVEL):
source_ptr = source_buffer.ptr
source_size = source_buffer.nbytes

cctx = ZSTD_createCCtx()
param_set_result = ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, level)

if ZSTD_isError(param_set_result):
error = ZSTD_getErrorName(param_set_result)
raise RuntimeError('Could not set zstd compression level: %s' % error)

param_set_result = ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 1 if checksum else 0)

if ZSTD_isError(param_set_result):
error = ZSTD_getErrorName(param_set_result)
raise RuntimeError('Could not set zstd checksum flag: %s' % error)

try:

# setup destination
Expand All @@ -99,10 +129,11 @@ def compress(source, int level=DEFAULT_CLEVEL):

# perform compression
with nogil:
compressed_size = ZSTD_compress(dest_ptr, dest_size, source_ptr, source_size, level)
compressed_size = ZSTD_compress2(cctx, dest_ptr, dest_size, source_ptr, source_size)

finally:

if cctx:
ZSTD_freeCCtx(cctx)
# release buffers
source_buffer.release()

Expand Down Expand Up @@ -148,8 +179,8 @@ def decompress(source, dest=None):
try:

# determine uncompressed size
dest_size = ZSTD_getDecompressedSize(source_ptr, source_size)
if dest_size == 0:
dest_size = ZSTD_getFrameContentSize(source_ptr, source_size)
if dest_size == 0 or dest_size == ZSTD_CONTENTSIZE_UNKNOWN or dest_size == ZSTD_CONTENTSIZE_ERROR:
raise RuntimeError('Zstd decompression error: invalid input data')

# setup destination buffer
Expand Down Expand Up @@ -193,7 +224,9 @@ class Zstd(Codec):
Parameters
----------
level : int
Compression level (1-22).
Compression level (-131072 to 22).
checksum : bool
Flag to enable checksums. The default is False.
See Also
--------
Expand All @@ -207,12 +240,13 @@ class Zstd(Codec):
# practical limit on the size of buffers that Zstd can process and so we don't
# enforce a max_buffer_size option here.

def __init__(self, level=DEFAULT_CLEVEL):
def __init__(self, level=DEFAULT_CLEVEL, checksum=False):
self.level = level
self.checksum = checksum

def encode(self, buf):
buf = ensure_contiguous_ndarray(buf)
return compress(buf, self.level)
return compress(buf, self.level, self.checksum)

def decode(self, buf, out=None):
buf = ensure_contiguous_ndarray(buf)
Expand All @@ -223,3 +257,21 @@ class Zstd(Codec):
(type(self).__name__,
self.level)
return r

# Class-level read-only accessor: forwards to ZSTD_defaultCLevel() so callers can
# write `Zstd.default_level` without creating a codec instance.
# NOTE(review): stacking @classmethod on @property was deprecated in Python 3.11
# and removed in 3.13; on newer interpreters `Zstd.default_level` may no longer
# evaluate to an int -- TODO confirm for this Cython-compiled class.
@classmethod
@property
def default_level(cls):
"""Returns the default compression level of the underlying zstd library."""
return ZSTD_defaultCLevel()

# Class-level read-only accessor: forwards to ZSTD_minCLevel() so callers can
# write `Zstd.min_level` without creating a codec instance.
# NOTE(review): stacking @classmethod on @property was deprecated in Python 3.11
# and removed in 3.13; on newer interpreters `Zstd.min_level` may no longer
# evaluate to an int -- TODO confirm for this Cython-compiled class.
@classmethod
@property
def min_level(cls):
"""Returns the minimum compression level of the underlying zstd library."""
return ZSTD_minCLevel()

# Class-level read-only accessor: forwards to ZSTD_maxCLevel() so callers can
# write `Zstd.max_level` without creating a codec instance.
# NOTE(review): stacking @classmethod on @property was deprecated in Python 3.11
# and removed in 3.13; on newer interpreters `Zstd.max_level` may no longer
# evaluate to an int -- TODO confirm for this Cython-compiled class.
@classmethod
@property
def max_level(cls):
"""Returns the maximum compression level of the underlying zstd library."""
return ZSTD_maxCLevel()

0 comments on commit 5b12b15

Please sign in to comment.