From 1dc39fb105d8a3e153d87cccbc17917c46d3b1fb Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Tue, 20 Dec 2022 09:50:37 -0500 Subject: [PATCH 1/9] implement fletcher32 --- numcodecs/__init__.py | 3 ++ numcodecs/fletcher32.pyx | 54 ++++++++++++++++++++++++++++++ numcodecs/tests/test_fletcher32.py | 24 +++++++++++++ setup.py | 28 +++++++++++++++- 4 files changed, 108 insertions(+), 1 deletion(-) create mode 100644 numcodecs/fletcher32.pyx create mode 100644 numcodecs/tests/test_fletcher32.py diff --git a/numcodecs/__init__.py b/numcodecs/__init__.py index 53f3e795..1e3c8536 100644 --- a/numcodecs/__init__.py +++ b/numcodecs/__init__.py @@ -111,3 +111,6 @@ register_codec(VLenUTF8) register_codec(VLenBytes) register_codec(VLenArray) + +from numcodecs.fletcher32 import Fletcher32 +register_codec(Fletcher32) diff --git a/numcodecs/fletcher32.pyx b/numcodecs/fletcher32.pyx new file mode 100644 index 00000000..6c300f34 --- /dev/null +++ b/numcodecs/fletcher32.pyx @@ -0,0 +1,54 @@ +# cython: boundscheck=False +# cython: wraparound=False +# cython: overflowcheck=False +# cython: cdivision=True + +import struct +import numpy as np + +from numcodecs.abc import Codec +from numcodecs.compat import ensure_contiguous_ndarray + +from libc.stdint cimport uint8_t, uint16_t, uint32_t + +cpdef uint32_t fletcher32(const uint16_t[::1] data): + cdef: + uint32_t sum1 = 0 + uint32_t sum2 = 0 + int index + int size = data.shape[0] + + for index in range(0, size): + sum1 = (sum1 + data[index]) % 0xffff + sum2 = (sum2 + sum1) % 0xffff + + return (sum2 << 16) | sum1 + + +class Fletcher32(Codec): + codec_id = "fletcher32" + + def encode(self, buf): + buf = ensure_contiguous_ndarray(buf).ravel() + if len(buf) % 2: + # rare, odd size of bytes data only + arr = np.frombuffer(buf.tobytes() + b"\x00", dtype="uint16") + val = fletcher32(arr) + else: + val = fletcher32(buf.view('uint16')) + return buf.tobytes() + struct.pack(" Date: Tue, 20 Dec 2022 13:00:26 -0500 Subject: [PATCH 2/9] Update 
numcodecs/fletcher32.pyx Co-authored-by: Ryan Abernathey --- numcodecs/fletcher32.pyx | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/numcodecs/fletcher32.pyx b/numcodecs/fletcher32.pyx index 6c300f34..6a0ae7ae 100644 --- a/numcodecs/fletcher32.pyx +++ b/numcodecs/fletcher32.pyx @@ -47,7 +47,11 @@ class Fletcher32(Codec): else: val = fletcher32(b[:-4].view('uint16')) found = b[-4:].view('uint32')[0] - assert val == found + if val != found: + raise ValueError( + f"The flecher32 checksum of the data ({found}) did not match the expected checksum ({val}). " + "This could be a sign that the data has been corrupted." + ) if out: out.view("uint8")[:] = b[:-4] return out From db2275e2236c31da9a5ea2693f8c8568eaf3b820 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Tue, 20 Dec 2022 13:06:53 -0500 Subject: [PATCH 3/9] Add docstring and erorr test --- numcodecs/fletcher32.pyx | 11 ++++++++++- numcodecs/tests/test_fletcher32.py | 12 ++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/numcodecs/fletcher32.pyx b/numcodecs/fletcher32.pyx index 6a0ae7ae..a60547af 100644 --- a/numcodecs/fletcher32.pyx +++ b/numcodecs/fletcher32.pyx @@ -26,6 +26,14 @@ cpdef uint32_t fletcher32(const uint16_t[::1] data): class Fletcher32(Codec): + """The fletcher checksum with 16-bit words and 32-bit output + + With this codec, the checksum is concatenated on the end of the data + bytes when encoded. At decode time, the checksum is performed on + the data portion and compared with the four-byte checksum, raising + ValueError if inconsistent. + """ + codec_id = "fletcher32" def encode(self, buf): @@ -49,7 +57,8 @@ class Fletcher32(Codec): found = b[-4:].view('uint32')[0] if val != found: raise ValueError( - f"The flecher32 checksum of the data ({found}) did not match the expected checksum ({val}). 
" + f"The fletcher32 checksum of the data ({found}) did not" + f" match the expected checksum ({val}).\n" "This could be a sign that the data has been corrupted." ) if out: diff --git a/numcodecs/tests/test_fletcher32.py b/numcodecs/tests/test_fletcher32.py index 5db75e13..d9435576 100644 --- a/numcodecs/tests/test_fletcher32.py +++ b/numcodecs/tests/test_fletcher32.py @@ -22,3 +22,15 @@ def test_with_data(dtype): f = Fletcher32() arr = np.frombuffer(f.decode(f.encode(data)), dtype=dtype) assert (arr == data).all() + + +def test_error(): + data = np.arange(100) + f = Fletcher32() + enc = f.encode(data) + enc2 = bytearray(enc) + enc2[0] += 1 + with pytest.raises(ValueError) as e: + f.decode(enc2) + assert "fletcher32 checksum" in str(e.value) + From 4366b5b45ea6b6d10d6238595da43c8a8b449a37 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Tue, 20 Dec 2022 15:18:07 -0500 Subject: [PATCH 4/9] Use HDF C impl --- numcodecs/_fletcher.c | 43 ++++++++++++++++++++++++++++++ numcodecs/fletcher32.pyx | 39 +++++++-------------------- numcodecs/tests/test_fletcher32.py | 28 +++++++++++-------- 3 files changed, 70 insertions(+), 40 deletions(-) create mode 100644 numcodecs/_fletcher.c diff --git a/numcodecs/_fletcher.c b/numcodecs/_fletcher.c new file mode 100644 index 00000000..15310d3a --- /dev/null +++ b/numcodecs/_fletcher.c @@ -0,0 +1,43 @@ +#include +#include + +// https://github.com/Unidata/netcdf-c/blob/8eb71290eb9360dcfd4955ba94759ba8d02c40a9/plugins/H5checksum.c + + +uint32_t H5_checksum_fletcher32(const void *_data, size_t _len) +{ + const uint8_t *data = (const uint8_t *)_data; /* Pointer to the data to be summed */ + size_t len = _len / 2; /* Length in 16-bit words */ + uint32_t sum1 = 0, sum2 = 0; + + + /* Compute checksum for pairs of bytes */ + /* (the magic "360" value is is the largest number of sums that can be + * performed without numeric overflow) + */ + while (len) { + size_t tlen = len > 360 ? 
360 : len; + len -= tlen; + do { + sum1 += (uint32_t)(((uint16_t)data[0]) << 8) | ((uint16_t)data[1]); + data += 2; + sum2 += sum1; + } while (--tlen); + sum1 = (sum1 & 0xffff) + (sum1 >> 16); + sum2 = (sum2 & 0xffff) + (sum2 >> 16); + } + + /* Check for odd # of bytes */ + if(_len % 2) { + sum1 += (uint32_t)(((uint16_t)*data) << 8); + sum2 += sum1; + sum1 = (sum1 & 0xffff) + (sum1 >> 16); + sum2 = (sum2 & 0xffff) + (sum2 >> 16); + } /* end if */ + + /* Second reduction step to reduce sums to 16 bits */ + sum1 = (sum1 & 0xffff) + (sum1 >> 16); + sum2 = (sum2 & 0xffff) + (sum2 >> 16); + + return (sum2 << 16) | sum1; +} /* end H5_checksum_fletcher32() */ diff --git a/numcodecs/fletcher32.pyx b/numcodecs/fletcher32.pyx index a60547af..3bf01704 100644 --- a/numcodecs/fletcher32.pyx +++ b/numcodecs/fletcher32.pyx @@ -11,19 +11,8 @@ from numcodecs.compat import ensure_contiguous_ndarray from libc.stdint cimport uint8_t, uint16_t, uint32_t -cpdef uint32_t fletcher32(const uint16_t[::1] data): - cdef: - uint32_t sum1 = 0 - uint32_t sum2 = 0 - int index - int size = data.shape[0] - - for index in range(0, size): - sum1 = (sum1 + data[index]) % 0xffff - sum2 = (sum2 + sum1) % 0xffff - - return (sum2 << 16) | sum1 - +cdef extern from "_fletcher.c": + uint32_t H5_checksum_fletcher32(const void *_data, size_t _len) class Fletcher32(Codec): """The fletcher checksum with 16-bit words and 32-bit output @@ -37,28 +26,20 @@ class Fletcher32(Codec): codec_id = "fletcher32" def encode(self, buf): - buf = ensure_contiguous_ndarray(buf).ravel() - if len(buf) % 2: - # rare, odd size of bytes data only - arr = np.frombuffer(buf.tobytes() + b"\x00", dtype="uint16") - val = fletcher32(arr) - else: - val = fletcher32(buf.view('uint16')) + buf = ensure_contiguous_ndarray(buf).ravel().view('uint8') + cdef const uint8_t[::1] b_ptr = buf + val = H5_checksum_fletcher32(&b_ptr[0], buf.nbytes) return buf.tobytes() + struct.pack(" Date: Tue, 20 Dec 2022 15:21:01 -0500 Subject: [PATCH 5/9] Remove 
unused, add docstrings --- numcodecs/fletcher32.pyx | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/numcodecs/fletcher32.pyx b/numcodecs/fletcher32.pyx index 3bf01704..52d30acb 100644 --- a/numcodecs/fletcher32.pyx +++ b/numcodecs/fletcher32.pyx @@ -1,10 +1,4 @@ -# cython: boundscheck=False -# cython: wraparound=False -# cython: overflowcheck=False -# cython: cdivision=True - import struct -import numpy as np from numcodecs.abc import Codec from numcodecs.compat import ensure_contiguous_ndarray @@ -14,6 +8,7 @@ from libc.stdint cimport uint8_t, uint16_t, uint32_t cdef extern from "_fletcher.c": uint32_t H5_checksum_fletcher32(const void *_data, size_t _len) + class Fletcher32(Codec): """The fletcher checksum with 16-bit words and 32-bit output @@ -26,12 +21,14 @@ class Fletcher32(Codec): codec_id = "fletcher32" def encode(self, buf): + """Return buffer plus 4-byte fletcher checksum""" buf = ensure_contiguous_ndarray(buf).ravel().view('uint8') cdef const uint8_t[::1] b_ptr = buf val = H5_checksum_fletcher32(&b_ptr[0], buf.nbytes) return buf.tobytes() + struct.pack(" Date: Tue, 20 Dec 2022 15:37:30 -0500 Subject: [PATCH 6/9] to runtime and int test --- numcodecs/fletcher32.pyx | 2 +- numcodecs/tests/test_fletcher32.py | 20 ++++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/numcodecs/fletcher32.pyx b/numcodecs/fletcher32.pyx index 52d30acb..befb367e 100644 --- a/numcodecs/fletcher32.pyx +++ b/numcodecs/fletcher32.pyx @@ -34,7 +34,7 @@ class Fletcher32(Codec): val = H5_checksum_fletcher32(&b_ptr[0], b.nbytes - 4) found = b[-4:].view(" Date: Tue, 20 Dec 2022 20:40:00 -0500 Subject: [PATCH 7/9] to cython --- numcodecs/_fletcher.c | 43 --------------------------------------- numcodecs/fletcher32.pyx | 44 +++++++++++++++++++++++++++++++++++----- 2 files changed, 39 insertions(+), 48 deletions(-) delete mode 100644 numcodecs/_fletcher.c diff --git a/numcodecs/_fletcher.c b/numcodecs/_fletcher.c deleted file mode 
100644 index 15310d3a..00000000 --- a/numcodecs/_fletcher.c +++ /dev/null @@ -1,43 +0,0 @@ -#include -#include - -// https://github.com/Unidata/netcdf-c/blob/8eb71290eb9360dcfd4955ba94759ba8d02c40a9/plugins/H5checksum.c - - -uint32_t H5_checksum_fletcher32(const void *_data, size_t _len) -{ - const uint8_t *data = (const uint8_t *)_data; /* Pointer to the data to be summed */ - size_t len = _len / 2; /* Length in 16-bit words */ - uint32_t sum1 = 0, sum2 = 0; - - - /* Compute checksum for pairs of bytes */ - /* (the magic "360" value is is the largest number of sums that can be - * performed without numeric overflow) - */ - while (len) { - size_t tlen = len > 360 ? 360 : len; - len -= tlen; - do { - sum1 += (uint32_t)(((uint16_t)data[0]) << 8) | ((uint16_t)data[1]); - data += 2; - sum2 += sum1; - } while (--tlen); - sum1 = (sum1 & 0xffff) + (sum1 >> 16); - sum2 = (sum2 & 0xffff) + (sum2 >> 16); - } - - /* Check for odd # of bytes */ - if(_len % 2) { - sum1 += (uint32_t)(((uint16_t)*data) << 8); - sum2 += sum1; - sum1 = (sum1 & 0xffff) + (sum1 >> 16); - sum2 = (sum2 & 0xffff) + (sum2 >> 16); - } /* end if */ - - /* Second reduction step to reduce sums to 16 bits */ - sum1 = (sum1 & 0xffff) + (sum1 >> 16); - sum2 = (sum2 & 0xffff) + (sum2 >> 16); - - return (sum2 << 16) | sum1; -} /* end H5_checksum_fletcher32() */ diff --git a/numcodecs/fletcher32.pyx b/numcodecs/fletcher32.pyx index befb367e..9728735d 100644 --- a/numcodecs/fletcher32.pyx +++ b/numcodecs/fletcher32.pyx @@ -1,3 +1,6 @@ +# cython: language_level=3 +# cython: overflowcheck=False +# cython: cdivision=True import struct from numcodecs.abc import Codec @@ -5,8 +8,39 @@ from numcodecs.compat import ensure_contiguous_ndarray from libc.stdint cimport uint8_t, uint16_t, uint32_t -cdef extern from "_fletcher.c": - uint32_t H5_checksum_fletcher32(const void *_data, size_t _len) + +cdef uint32_t _fletcher32(const uint8_t[::1] _data): + cdef: + const uint8_t *data = &_data[0] + size_t _len = _data.shape[0] + 
size_t len = _len / 2 + size_t tlen + uint32_t sum1 = 0, sum2 = 0; + + + while len: + tlen = 360 if len > 360 else len + len -= tlen + while True: + sum1 += ((data[0]) << 8) | (data[1]) + data += 2 + sum2 += sum1 + tlen -= 1 + if tlen < 1: + break + sum1 = (sum1 & 0xffff) + (sum1 >> 16) + sum2 = (sum2 & 0xffff) + (sum2 >> 16) + + if _len % 2: + sum1 += (((data[0])) << 8) + sum2 += sum1 + sum1 = (sum1 & 0xffff) + (sum1 >> 16) + sum2 = (sum2 & 0xffff) + (sum2 >> 16) + + sum1 = (sum1 & 0xffff) + (sum1 >> 16) + sum2 = (sum2 & 0xffff) + (sum2 >> 16) + + return (sum2 << 16) | sum1 class Fletcher32(Codec): @@ -24,14 +58,14 @@ class Fletcher32(Codec): """Return buffer plus 4-byte fletcher checksum""" buf = ensure_contiguous_ndarray(buf).ravel().view('uint8') cdef const uint8_t[::1] b_ptr = buf - val = H5_checksum_fletcher32(&b_ptr[0], buf.nbytes) + val = _fletcher32(b_ptr) return buf.tobytes() + struct.pack(" Date: Wed, 21 Dec 2022 08:52:13 -0500 Subject: [PATCH 8/9] Update numcodecs/fletcher32.pyx Co-authored-by: Ryan Abernathey --- numcodecs/fletcher32.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numcodecs/fletcher32.pyx b/numcodecs/fletcher32.pyx index 9728735d..8ad54c12 100644 --- a/numcodecs/fletcher32.pyx +++ b/numcodecs/fletcher32.pyx @@ -49,7 +49,7 @@ class Fletcher32(Codec): With this codec, the checksum is concatenated on the end of the data bytes when encoded. At decode time, the checksum is performed on the data portion and compared with the four-byte checksum, raising - ValueError if inconsistent. + RuntimeError if inconsistent.
""" codec_id = "fletcher32" From 4825a1d3b229f5b7a246b08f5e18db159b9c31c4 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Wed, 21 Dec 2022 09:16:17 -0500 Subject: [PATCH 9/9] Add docs --- docs/checksum32.rst | 11 +++++++++++ docs/release.rst | 3 ++- numcodecs/fletcher32.pyx | 6 ++++++ 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/docs/checksum32.rst b/docs/checksum32.rst index 1d5522e2..5e682afc 100644 --- a/docs/checksum32.rst +++ b/docs/checksum32.rst @@ -22,3 +22,14 @@ Adler32 .. automethod:: decode .. automethod:: get_config .. automethod:: from_config + + +Fletcher32 +---------- + +.. autoclass:: numcodecs.fletcher32.Fletcher32 + + .. autoattribute:: codec_id + .. automethod:: encode + .. automethod:: decode + diff --git a/docs/release.rst b/docs/release.rst index 6f176b8c..90d62750 100644 --- a/docs/release.rst +++ b/docs/release.rst @@ -15,7 +15,8 @@ Unreleased Enhancements ~~~~~~~~~~~~ -* +* Add ``fletcher32`` checksum codec + By :user:`Martin Durant `, :issue:`410`. Fix ~~~ diff --git a/numcodecs/fletcher32.pyx b/numcodecs/fletcher32.pyx index 8ad54c12..02f9319c 100644 --- a/numcodecs/fletcher32.pyx +++ b/numcodecs/fletcher32.pyx @@ -10,6 +10,8 @@ from libc.stdint cimport uint8_t, uint16_t, uint32_t cdef uint32_t _fletcher32(const uint8_t[::1] _data): + # converted from + # https://github.com/Unidata/netcdf-c/blob/main/plugins/H5checksum.c#L109 cdef: const uint8_t *data = &_data[0] size_t _len = _data.shape[0] @@ -46,6 +48,10 @@ cdef uint32_t _fletcher32(const uint8_t[::1] _data): class Fletcher32(Codec): """The fletcher checksum with 16-bit words and 32-bit output + This is the netCDF4/HDF5 implementation, which is not equivalent + to the one in wikipedia + https://github.com/Unidata/netcdf-c/blob/main/plugins/H5checksum.c#L95 + With this codec, the checksum is concatenated on the end of the data bytes when encoded. At decode time, the checksum is performed on the data portion and compared with the four-byte checksum, raising