From 46c45de4cb55c05cf27b983d9b7c2e189c07678d Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sat, 26 Oct 2024 04:05:17 +0200 Subject: [PATCH 1/4] more dict-like: .__init__ and .items --- borghash.pyx | 36 ++++++++++++++++++++++++++++-------- tests/hashtable_test.py | 18 ++++++++++++++---- tests/hashtablent_test.py | 15 ++++++++++++--- 3 files changed, 54 insertions(+), 15 deletions(-) diff --git a/borghash.pyx b/borghash.pyx index c733854..cc4d0b8 100644 --- a/borghash.pyx +++ b/borghash.pyx @@ -33,7 +33,8 @@ cdef uint32_t RESERVED = 0xFFFFFF00 # all >= this is reserved _NoDefault = object() cdef class HashTable: - def __init__(self, key_size: int, value_size: int, capacity: int = MIN_CAPACITY, + def __init__(self, items=None, *, + key_size: int = 0, value_size: int = 0, capacity: int = MIN_CAPACITY, max_load_factor: float = 0.5, min_load_factor: float = 0.10, shrink_factor: float = 0.4, grow_factor: float = 2.0, kv_grow_factor: float = 1.3): @@ -42,6 +43,10 @@ cdef class HashTable: # .keys and .values array. # the keys/values arrays have bigger elements and are not hash tables, thus collisions and load # factor are no concern there. the kv_grow_factor can be relatively small. + if not key_size: + raise ValueError("key_size must be specified and must be > 0.") + if not value_size: + raise ValueError("value_size must be specified and must be > 0.") self.ksize = key_size self.vsize = value_size # vvv hash table vvv @@ -66,11 +71,15 @@ cdef class HashTable: self.stats_get = 0 self.stats_set = 0 self.stats_del = 0 - self.stats_iter = 0 # iteritems calls + self.stats_iter = 0 # .items() calls self.stats_lookup = 0 # _lookup_index calls self.stats_linear = 0 # how many steps the linear search inside _lookup_index needed self.stats_resize_table = 0 self.stats_resize_kv = 0 + # initialize? + if items is not None: + for key, value in items: + self[key] = value def __del__(self): free(self.table) @@ -208,7 +217,7 @@ cdef class HashTable: raise return default - def iteritems(self): + def items(self): cdef int i cdef uint32_t kv_index self.stats_iter += 1 @@ -313,12 +322,23 @@ cdef class HashTable: cdef class HashTableNT: - def __init__(self, int key_size, str value_format, object namedtuple_type, int capacity = MIN_CAPACITY): + def __init__(self, items=None, *, + key_size: int = 0, value_format: str = "", namedtuple_type: object = None, + capacity: int = MIN_CAPACITY): + if not key_size: + raise ValueError("key_size must be specified and must be > 0.") + if not value_format: + raise ValueError("value_format must be specified and must be non-empty.") + if namedtuple_type is None: + raise ValueError("namedtuple_type must be specified.") self.key_size = key_size self.value_format = value_format self.value_size = struct.calcsize(self.value_format) self.namedtuple_type = namedtuple_type - self.inner = HashTable(self.key_size, self.value_size, capacity=capacity) + self.inner = HashTable(key_size=self.key_size, value_size=self.value_size, capacity=capacity) + if items is not None: + for key, value in items: + self[key] = value def clear(self): self.inner.clear() @@ -361,8 +381,8 @@ cdef class HashTableNT: self._check_key(key) return key in self.inner - def iteritems(self): - for key, binary_value in self.inner.iteritems(): + def items(self): + for key, binary_value in self.inner.items(): yield (key, self._to_namedtuple_value(binary_value)) def __len__(self): @@ -435,7 +455,7 @@ cdef class HashTableNT: fd.write(header_bytes) fd.write(meta_bytes) count = 0 - for key, value in self.inner.iteritems(): + for key, value in self.inner.items(): fd.write(key) fd.write(value) count += 1 diff --git a/tests/hashtable_test.py b/tests/hashtable_test.py index dd16f14..ae30380 100644 --- a/tests/hashtable_test.py +++ b/tests/hashtable_test.py @@ -33,6 +33,16 @@ def ht12(ht): return ht +def test_init(): + ht = HashTable(key_size=32, value_size=4) + assert len(ht) == 0 + items = [(key1, value1), (key2, value2)] + ht = HashTable(items, key_size=32, value_size=4) + assert ht[key1] == value1 + assert ht[key2] == value2 + + + def test_insert_lookup(ht12): assert ht12[key1] == value1 assert ht12[key2] == value2 @@ -49,7 +59,7 @@ def test_remove_lookup(ht12): def test_items(ht12): - items = set(ht12.iteritems()) + items = set(ht12.items()) assert (key1, value1) in items assert (key2, value2) in items @@ -88,7 +98,7 @@ def test_pop(ht12): def test_clear(ht12): ht12.clear() assert len(ht12) == 0 - assert len(list(ht12.iteritems())) == 0 + assert len(list(ht12.items())) == 0 with pytest.raises(KeyError): ht12[key1] with pytest.raises(KeyError): @@ -104,7 +114,7 @@ def test_ht_stress(ht): ht[key] = value keys.add(key) found_keys = set() - for key, value in ht.iteritems(): + for key, value in ht.items(): found_keys.add(key) assert value == key[:4] assert keys == found_keys @@ -134,7 +144,7 @@ def test_stats(ht): del ht[key1] assert ht.stats["del"] == 1 assert ht.stats["lookup"] == 3 - list(ht.iteritems()) + list(ht.items()) assert ht.stats["iter"] == 1 diff --git a/tests/hashtablent_test.py b/tests/hashtablent_test.py index a9b53bf..eb516eb 100644 --- a/tests/hashtablent_test.py +++ b/tests/hashtablent_test.py @@ -18,7 +18,7 @@ @pytest.fixture def ntht(): - return HashTableNT(key_size, value_format, value_type) + return HashTableNT(key_size=key_size, value_format=value_format, namedtuple_type=value_type) @pytest.fixture @@ -28,6 +28,15 @@ def ntht12(ntht): return ntht +def test_init(): + ht = HashTableNT(key_size=32, value_format=value_format, namedtuple_type=value_type) + assert len(ht) == 0 + items = [(key1, value1), (key2, value2)] + ht = HashTableNT(items, key_size=32, value_format=value_format, namedtuple_type=value_type) + assert ht[key1] == value1 + assert ht[key2] == value2 + + def test_insert_lookup(ntht12): assert ntht12[key1] == value1 assert ntht12[key2] == value2 @@ -46,7 +55,7 @@ def test_remove_lookup(ntht12): def test_items(ntht12): - items = set(ntht12.iteritems()) + items = set(ntht12.items()) assert (key1, value1) in items assert (key2, value2) in items @@ -92,7 +101,7 @@ def test_ntht_stress(ntht): ntht[key] = value keys.add(key) found_keys = set() - for key, value in ntht.iteritems(): + for key, value in ntht.items(): found_keys.add(key) v = key[0] assert value == value_type(v, v*2, v*3) From 29d91f0666abf60e47d29ab14903f0242c96dbd6 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sat, 26 Oct 2024 14:10:47 +0200 Subject: [PATCH 2/4] argument name: namedtuple_type -> value_type --- borghash.pxd | 4 ++-- borghash.pyx | 22 +++++++++++----------- tests/hashtablent_test.py | 6 +++--- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/borghash.pxd b/borghash.pxd index 9d9012a..a71b212 100644 --- a/borghash.pxd +++ b/borghash.pxd @@ -21,6 +21,6 @@ cdef class HashTable: cdef class HashTableNT: cdef int key_size cdef str value_format - cdef object namedtuple_type - cdef HashTable inner + cdef object value_type cdef int value_size + cdef HashTable inner diff --git a/borghash.pyx b/borghash.pyx index cc4d0b8..02fe15b 100644 --- a/borghash.pyx +++ b/borghash.pyx @@ -323,18 +323,18 @@ cdef class HashTable: cdef class HashTableNT: def __init__(self, items=None, *, - key_size: int = 0, value_format: str = "", namedtuple_type: object = None, + key_size: int = 0, value_format: str = "", value_type: object = None, capacity: int = MIN_CAPACITY): if not key_size: raise ValueError("key_size must be specified and must be > 0.") if not value_format: raise ValueError("value_format must be specified and must be non-empty.") - if namedtuple_type is None: - raise ValueError("namedtuple_type must be specified.") + if value_type is None: + raise ValueError("value_type must be specified (a namedtuple type corresponding to value_format).") self.key_size = key_size self.value_format = value_format self.value_size = struct.calcsize(self.value_format) - self.namedtuple_type = namedtuple_type + self.value_type = value_type self.inner = HashTable(key_size=self.key_size, value_size=self.value_size, capacity=capacity) if items is not None: for key, value in items: @@ -350,13 +350,13 @@ cdef class HashTableNT: raise ValueError(f"Key must be {self.key_size} bytes long") def _to_binary_value(self, value): - #if not isinstance(value, self.namedtuple_type): - # raise TypeError(f"Expected an instance of {self.namedtuple_type}, got {type(value)}") + #if not isinstance(value, self.value_type): + # raise TypeError(f"Expected an instance of {self.value_type}, got {type(value)}") return struct.pack(self.value_format, *value) def _to_namedtuple_value(self, binary_value): unpacked_data = struct.unpack(self.value_format, binary_value) - return self.namedtuple_type(*unpacked_data) + return self.value_type(*unpacked_data) def _set_raw(self, key: bytes, value: bytes): self.inner[key] = value @@ -444,8 +444,8 @@ cdef class HashTableNT: 'key_size': self.key_size, 'value_size': self.value_size, 'value_format': self.value_format, - 'namedtuple_type_name': self.namedtuple_type.__name__, - 'namedtuple_type_fields': self.namedtuple_type._fields, + 'value_type_name': self.value_type.__name__, + 'value_type_fields': self.value_type._fields, 'capacity': self.inner.capacity, 'used': self.inner.used, # count of keys / values } @@ -484,8 +484,8 @@ cdef class HashTableNT: if len(meta_bytes) < meta_size: raise ValueError(f"Invalid file, file is too short.") meta = json.loads(meta_bytes.decode("utf-8")) - namedtuple_type = namedtuple(meta['namedtuple_type_name'], meta['namedtuple_type_fields']) - ht = cls(key_size=meta['key_size'], value_format=meta['value_format'], namedtuple_type=namedtuple_type, capacity=meta['capacity']) + value_type = namedtuple(meta['value_type_name'], meta['value_type_fields']) + ht = cls(key_size=meta['key_size'], value_format=meta['value_format'], value_type=value_type, capacity=meta['capacity']) count = 0 ksize, vsize = meta['key_size'], meta['value_size'] for i in range(meta['used']): diff --git a/tests/hashtablent_test.py b/tests/hashtablent_test.py index eb516eb..4325807 100644 --- a/tests/hashtablent_test.py +++ b/tests/hashtablent_test.py @@ -18,7 +18,7 @@ @pytest.fixture def ntht(): - return HashTableNT(key_size=key_size, value_format=value_format, namedtuple_type=value_type) + return HashTableNT(key_size=key_size, value_format=value_format, value_type=value_type) @pytest.fixture @@ -29,10 +29,10 @@ def ntht12(ntht): def test_init(): - ht = HashTableNT(key_size=32, value_format=value_format, namedtuple_type=value_type) + ht = HashTableNT(key_size=32, value_format=value_format, value_type=value_type) assert len(ht) == 0 items = [(key1, value1), (key2, value2)] - ht = HashTableNT(items, key_size=32, value_format=value_format, namedtuple_type=value_type) + ht = HashTableNT(items, key_size=32, value_format=value_format, value_type=value_type) assert ht[key1] == value1 assert ht[key2] == value2 From 92739a02d4ef39e8fa3523876916f4a7259363b9 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sat, 26 Oct 2024 14:24:15 +0200 Subject: [PATCH 3/4] init with given items: be more flexible --- borghash.pyx | 27 ++++++++++++++++++--------- tests/hashtable_test.py | 4 ++++ 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/borghash.pyx b/borghash.pyx index 02fe15b..db5550b 100644 --- a/borghash.pyx +++ b/borghash.pyx @@ -8,13 +8,13 @@ HashTable: low-level ht mapping fully random bytes keys to bytes values. HashTableNT: wrapper around HashTable, providing namedtuple values and serialization. """ -from typing import Tuple - from libc.stdlib cimport malloc, free, realloc from libc.string cimport memcpy, memset, memcmp from libc.stdint cimport uint8_t, uint32_t +from typing import Tuple from collections import namedtuple +from collections.abc import Mapping import json import struct @@ -32,6 +32,20 @@ cdef uint32_t RESERVED = 0xFFFFFF00 # all >= this is reserved _NoDefault = object() +def _fill(this, other): + if other is None: + return + if isinstance(other, Mapping): + for key in other: + this[key] = other[key] + elif hasattr(other, "keys"): + for key in other.keys(): + this[key] = other[key] + else: + for key, value in other: + this[key] = value + + cdef class HashTable: def __init__(self, items=None, *, key_size: int = 0, value_size: int = 0, capacity: int = MIN_CAPACITY, @@ -76,10 +90,7 @@ cdef class HashTable: self.stats_linear = 0 # how many steps the linear search inside _lookup_index needed self.stats_resize_table = 0 self.stats_resize_kv = 0 - # initialize? - if items is not None: - for key, value in items: - self[key] = value + _fill(self, items) def __del__(self): free(self.table) @@ -336,9 +347,7 @@ cdef class HashTableNT: self.value_size = struct.calcsize(self.value_format) self.value_type = value_type self.inner = HashTable(key_size=self.key_size, value_size=self.value_size, capacity=capacity) - if items is not None: - for key, value in items: - self[key] = value + _fill(self, items) def clear(self): self.inner.clear() diff --git a/tests/hashtable_test.py b/tests/hashtable_test.py index ae30380..572a8bf 100644 --- a/tests/hashtable_test.py +++ b/tests/hashtable_test.py @@ -40,6 +40,10 @@ def test_init(): ht = HashTable(items, key_size=32, value_size=4) assert ht[key1] == value1 assert ht[key2] == value2 + items = dict(items) + ht = HashTable(items, key_size=32, value_size=4) + assert ht[key1] == value1 + assert ht[key2] == value2 From 8add294204a00a217a2be11bdf2a72540b845edc Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sat, 26 Oct 2024 15:04:07 +0200 Subject: [PATCH 4/4] more type hints --- borghash.pyx | 84 +++++++++++++++++++++++++++------------------------- 1 file changed, 43 insertions(+), 41 deletions(-) diff --git a/borghash.pyx b/borghash.pyx index db5550b..4684e31 100644 --- a/borghash.pyx +++ b/borghash.pyx @@ -8,11 +8,13 @@ HashTable: low-level ht mapping fully random bytes keys to bytes values. HashTableNT: wrapper around HashTable, providing namedtuple values and serialization. """ +from __future__ import annotations +from typing import BinaryIO, Iterator, Any + from libc.stdlib cimport malloc, free, realloc from libc.string cimport memcpy, memset, memcmp from libc.stdint cimport uint8_t, uint32_t -from typing import Tuple from collections import namedtuple from collections.abc import Mapping import json @@ -32,7 +34,7 @@ cdef uint32_t RESERVED = 0xFFFFFF00 # all >= this is reserved _NoDefault = object() -def _fill(this, other): +def _fill(this: Any, other: Any) -> None: if other is None: return if isinstance(other, Mapping): @@ -51,7 +53,7 @@ cdef class HashTable: key_size: int = 0, value_size: int = 0, capacity: int = MIN_CAPACITY, max_load_factor: float = 0.5, min_load_factor: float = 0.10, shrink_factor: float = 0.4, grow_factor: float = 2.0, - kv_grow_factor: float = 1.3): + kv_grow_factor: float = 1.3) -> None: # the load of the ht (.table) shall be between 0.25 and 0.5, so it is fast and has few collisions. # it is cheap to have a low hash table load, because .table only stores uint32_t indexes into the # .keys and .values array. @@ -92,12 +94,12 @@ cdef class HashTable: self.stats_resize_kv = 0 _fill(self, items) - def __del__(self): + def __del__(self) -> None: free(self.table) free(self.keys) free(self.values) - def clear(self): + def clear(self) -> None: """empty HashTable, start from scratch""" self.capacity = 0 self.used = 0 @@ -105,7 +107,7 @@ cdef class HashTable: self.kv_used = 0 self._resize_kv(int(self.initial_capacity * self.max_load_factor)) - def __len__(self): + def __len__(self) -> int: return self.used cdef int _get_index(self, uint8_t* key): @@ -133,7 +135,7 @@ cdef class HashTable: index_ptr[0] = index return 0 # not found - def __setitem__(self, key: bytes, value: bytes): + def __setitem__(self, key: bytes, value: bytes) -> None: if len(key) != self.ksize or len(value) != self.vsize: raise ValueError("Key or value size does not match the defined sizes") @@ -167,12 +169,12 @@ cdef class HashTable: if self.used + self.tombstones > self.capacity * self.max_load_factor: self._resize_table(int(self.capacity * self.grow_factor)) - def __contains__(self, key: bytes): + def __contains__(self, key: bytes) -> bool: if len(key) != self.ksize: raise ValueError("Key size does not match the defined size") return bool(self._lookup_index( key, NULL)) - def __getitem__(self, key: bytes): + def __getitem__(self, key: bytes) -> bytes: if len(key) != self.ksize: raise ValueError("Key size does not match the defined size") cdef uint32_t kv_index @@ -184,7 +186,7 @@ cdef class HashTable: else: raise KeyError("Key not found") - def __delitem__(self, key: bytes): + def __delitem__(self, key: bytes) -> None: if len(key) != self.ksize: raise ValueError("Key size does not match the defined size") cdef uint8_t* key_ptr = key @@ -207,18 +209,18 @@ cdef class HashTable: else: raise KeyError("Key not found") - def setdefault(self, key: bytes, value: bytes): + def setdefault(self, key: bytes, value: bytes) -> bytes: if not key in self: self[key] = value return self[key] - def get(self, key: bytes, default=None): + def get(self, key: bytes, default: Any = None) -> bytes|Any: try: return self[key] except KeyError: return default - def pop(self, key: bytes, default=_NoDefault): + def pop(self, key: bytes, default: Any = _NoDefault) -> bytes|Any: try: value = self[key] del self[key] @@ -228,7 +230,7 @@ cdef class HashTable: raise return default - def items(self): + def items(self) -> Iterator[tuple[bytes, bytes]]: cdef int i cdef uint32_t kv_index self.stats_iter += 1 @@ -308,7 +310,7 @@ cdef class HashTable: return kv_index raise KeyError("Key/Value not found") - def idx_to_kv(self, idx: int) -> Tuple[bytes, bytes]: + def idx_to_kv(self, idx: int) -> tuple[bytes, bytes]: """ for a given index, return the key/value stored at that index in the keys/values array. this is the reverse of kv_to_idx (e.g. 32bit index -> 256bit key + 32bit value). @@ -319,7 +321,7 @@ cdef class HashTable: return key, value @property - def stats(self): + def stats(self) -> dict[str, int]: return { "get": self.stats_get, "set": self.stats_set, @@ -334,8 +336,8 @@ cdef class HashTable: cdef class HashTableNT: def __init__(self, items=None, *, - key_size: int = 0, value_format: str = "", value_type: object = None, - capacity: int = MIN_CAPACITY): + key_size: int = 0, value_format: str = "", value_type: Any = None, + capacity: int = MIN_CAPACITY) -> None: if not key_size: raise ValueError("key_size must be specified and must be > 0.") if not value_format: @@ -349,55 +351,55 @@ cdef class HashTableNT: self.inner = HashTable(key_size=self.key_size, value_size=self.value_size, capacity=capacity) _fill(self, items) - def clear(self): + def clear(self) -> None: self.inner.clear() - def _check_key(self, key): + def _check_key(self, key: bytes) -> None: if not isinstance(key, bytes): raise TypeError(f"Expected an instance of bytes, got {type(key)}") if len(key) != self.key_size: raise ValueError(f"Key must be {self.key_size} bytes long") - def _to_binary_value(self, value): + def _to_binary_value(self, value: Any) -> bytes: #if not isinstance(value, self.value_type): # raise TypeError(f"Expected an instance of {self.value_type}, got {type(value)}") return struct.pack(self.value_format, *value) - def _to_namedtuple_value(self, binary_value): + def _to_namedtuple_value(self, binary_value: bytes) -> Any: unpacked_data = struct.unpack(self.value_format, binary_value) return self.value_type(*unpacked_data) - def _set_raw(self, key: bytes, value: bytes): + def _set_raw(self, key: bytes, value: bytes) -> None: self.inner[key] = value - def _get_raw(self, key: bytes): + def _get_raw(self, key: bytes) -> bytes: return self.inner[key] - def __setitem__(self, key: bytes, value): + def __setitem__(self, key: bytes, value: Any) -> None: self._check_key(key) self.inner[key] = self._to_binary_value(value) - def __getitem__(self, key: bytes): + def __getitem__(self, key: bytes) -> Any: self._check_key(key) binary_value = self.inner[key] return self._to_namedtuple_value(binary_value) - def __delitem__(self, key: bytes): + def __delitem__(self, key: bytes) -> None: self._check_key(key) del self.inner[key] - def __contains__(self, key: bytes): + def __contains__(self, key: bytes) -> bool: self._check_key(key) return key in self.inner - def items(self): + def items(self) -> Iterator[tuple[bytes, Any]]: for key, binary_value in self.inner.items(): yield (key, self._to_namedtuple_value(binary_value)) - def __len__(self): + def __len__(self) -> int: return len(self.inner) - def get(self, key: bytes, default=None): + def get(self, key: bytes, default: Any = None) -> Any: self._check_key(key) try: binary_value = self.inner[key] @@ -406,13 +408,13 @@ cdef class HashTableNT: else: return self._to_namedtuple_value(binary_value) - def setdefault(self, key: bytes, default): + def setdefault(self, key: bytes, default: Any) -> Any: self._check_key(key) binary_default = self._to_binary_value(default) binary_value = self.inner.setdefault(key, binary_default) return self._to_namedtuple_value(binary_value) - def pop(self, key: bytes, default=_NoDefault): + def pop(self, key: bytes, default: Any = _NoDefault) -> Any: self._check_key(key) try: binary_value = self.inner.pop(key) @@ -429,26 +431,26 @@ cdef class HashTableNT: def idx_to_k(self, idx: int) -> bytes: return self.inner.idx_to_k(idx) - def kv_to_idx(self, key: bytes, value) -> int: + def kv_to_idx(self, key: bytes, value: Any) -> int: binary_value = self._to_binary_value(value) return self.inner.kv_to_idx(key, binary_value) - def idx_to_kv(self, idx: int) -> Tuple[bytes, Tuple]: + def idx_to_kv(self, idx: int) -> tuple[bytes, Any]: key, binary_value = self.inner.idx_to_kv(idx) return key, self._to_namedtuple_value(binary_value) @property - def stats(self): + def stats(self) -> dict[str, int]: return self.inner.stats - def write(self, file): + def write(self, file: BinaryIO|str|bytes): if isinstance(file, (str, bytes)): with open(file, 'wb') as fd: self._write_fd(fd) else: self._write_fd(file) - def _write_fd(self, fd): + def _write_fd(self, fd: BinaryIO): meta = { 'key_size': self.key_size, 'value_size': self.value_size, @@ -471,7 +473,7 @@ cdef class HashTableNT: assert count == self.inner.used @classmethod - def read(cls, file): + def read(cls, file: BinaryIO|str|bytes): if isinstance(file, (str, bytes)): with open(file, 'rb') as fd: return cls._read_fd(fd) @@ -479,7 +481,7 @@ cdef class HashTableNT: return cls._read_fd(file) @classmethod - def _read_fd(cls, fd): + def _read_fd(cls, fd: BinaryIO): header_size = struct.calcsize(HEADER_FMT) header_bytes = fd.read(header_size) if len(header_bytes) < header_size: @@ -503,7 +505,7 @@ cdef class HashTableNT: ht._set_raw(key, value) return ht - def size(self): + def size(self) -> int: """ do a rough worst-case estimate of the on-disk size when using .write().