Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

more dict like #14

Merged
merged 4 commits into from
Oct 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions borghash.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,6 @@ cdef class HashTable:
cdef class HashTableNT:
cdef int key_size
cdef str value_format
cdef object namedtuple_type
cdef HashTable inner
cdef object value_type
cdef int value_size
cdef HashTable inner
135 changes: 83 additions & 52 deletions borghash.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,15 @@ HashTable: low-level ht mapping fully random bytes keys to bytes values.

HashTableNT: wrapper around HashTable, providing namedtuple values and serialization.
"""
from typing import Tuple
from __future__ import annotations
from typing import BinaryIO, Iterator, Any

from libc.stdlib cimport malloc, free, realloc
from libc.string cimport memcpy, memset, memcmp
from libc.stdint cimport uint8_t, uint32_t

from collections import namedtuple
from collections.abc import Mapping
import json
import struct

Expand All @@ -32,16 +34,35 @@ cdef uint32_t RESERVED = 0xFFFFFF00 # all >= this is reserved

_NoDefault = object()

def _fill(this: Any, other: Any) -> None:
    """Copy every key/value pair of *other* into *this* using item assignment.

    The accepted shapes of *other* follow the argument handling of
    ``dict.update``:

    - ``None``: nothing is copied and *this* is left untouched.
    - a ``collections.abc.Mapping``: the pairs from ``other.items()`` are copied.
    - any other object that has a ``keys`` attribute: each key produced by
      ``other.keys()`` is looked up with ``other[key]`` and copied.
    - anything else: iterated as a sequence of ``(key, value)`` pairs.

    Each pair is stored with ``this[key] = value``, so whatever validation
    ``this.__setitem__`` performs applies to every copied item.
    """
    if other is None:
        return
    if isinstance(other, Mapping):
        # items() yields key and value together, so no second lookup per key.
        pairs = other.items()
    elif hasattr(other, "keys"):
        # duck-typed mapping: only keys() and __getitem__ are required.
        # The generator stays lazy, so lookup and assignment interleave per key.
        pairs = ((key, other[key]) for key in other.keys())
    else:
        pairs = other
    for key, value in pairs:
        this[key] = value


cdef class HashTable:
def __init__(self, key_size: int, value_size: int, capacity: int = MIN_CAPACITY,
def __init__(self, items=None, *,
key_size: int = 0, value_size: int = 0, capacity: int = MIN_CAPACITY,
max_load_factor: float = 0.5, min_load_factor: float = 0.10,
shrink_factor: float = 0.4, grow_factor: float = 2.0,
kv_grow_factor: float = 1.3):
kv_grow_factor: float = 1.3) -> None:
# the load of the ht (.table) shall stay between min_load_factor and max_load_factor, so it is fast and has few collisions.
# it is cheap to have a low hash table load, because .table only stores uint32_t indexes into the
# .keys and .values array.
# the keys/values arrays have bigger elements and are not hash tables, thus collisions and load
# factor are no concern there. the kv_grow_factor can be relatively small.
if not key_size:
raise ValueError("key_size must be specified and must be > 0.")
if not value_size:
raise ValueError("value_size must be specified and must be > 0.")
self.ksize = key_size
self.vsize = value_size
# vvv hash table vvv
Expand All @@ -66,26 +87,27 @@ cdef class HashTable:
self.stats_get = 0
self.stats_set = 0
self.stats_del = 0
self.stats_iter = 0 # iteritems calls
self.stats_iter = 0 # .items() calls
self.stats_lookup = 0 # _lookup_index calls
self.stats_linear = 0 # how many steps the linear search inside _lookup_index needed
self.stats_resize_table = 0
self.stats_resize_kv = 0
_fill(self, items)

def __del__(self):
def __del__(self) -> None:
free(self.table)
free(self.keys)
free(self.values)

def clear(self):
def clear(self) -> None:
"""empty HashTable, start from scratch"""
self.capacity = 0
self.used = 0
self._resize_table(self.initial_capacity)
self.kv_used = 0
self._resize_kv(int(self.initial_capacity * self.max_load_factor))

def __len__(self):
def __len__(self) -> int:
return self.used

cdef int _get_index(self, uint8_t* key):
Expand Down Expand Up @@ -113,7 +135,7 @@ cdef class HashTable:
index_ptr[0] = index
return 0 # not found

def __setitem__(self, key: bytes, value: bytes):
def __setitem__(self, key: bytes, value: bytes) -> None:
if len(key) != self.ksize or len(value) != self.vsize:
raise ValueError("Key or value size does not match the defined sizes")

Expand Down Expand Up @@ -147,12 +169,12 @@ cdef class HashTable:
if self.used + self.tombstones > self.capacity * self.max_load_factor:
self._resize_table(int(self.capacity * self.grow_factor))

def __contains__(self, key: bytes):
def __contains__(self, key: bytes) -> bool:
if len(key) != self.ksize:
raise ValueError("Key size does not match the defined size")
return bool(self._lookup_index(<uint8_t*> key, NULL))

def __getitem__(self, key: bytes):
def __getitem__(self, key: bytes) -> bytes:
if len(key) != self.ksize:
raise ValueError("Key size does not match the defined size")
cdef uint32_t kv_index
Expand All @@ -164,7 +186,7 @@ cdef class HashTable:
else:
raise KeyError("Key not found")

def __delitem__(self, key: bytes):
def __delitem__(self, key: bytes) -> None:
if len(key) != self.ksize:
raise ValueError("Key size does not match the defined size")
cdef uint8_t* key_ptr = <uint8_t*> key
Expand All @@ -187,18 +209,18 @@ cdef class HashTable:
else:
raise KeyError("Key not found")

def setdefault(self, key: bytes, value: bytes):
def setdefault(self, key: bytes, value: bytes) -> bytes:
if not key in self:
self[key] = value
return self[key]

def get(self, key: bytes, default=None):
def get(self, key: bytes, default: Any = None) -> bytes|Any:
try:
return self[key]
except KeyError:
return default

def pop(self, key: bytes, default=_NoDefault):
def pop(self, key: bytes, default: Any = _NoDefault) -> bytes|Any:
try:
value = self[key]
del self[key]
Expand All @@ -208,7 +230,7 @@ cdef class HashTable:
raise
return default

def iteritems(self):
def items(self) -> Iterator[tuple[bytes, bytes]]:
cdef int i
cdef uint32_t kv_index
self.stats_iter += 1
Expand Down Expand Up @@ -288,7 +310,7 @@ cdef class HashTable:
return kv_index
raise KeyError("Key/Value not found")

def idx_to_kv(self, idx: int) -> Tuple[bytes, bytes]:
def idx_to_kv(self, idx: int) -> tuple[bytes, bytes]:
"""
for a given index, return the key/value stored at that index in the keys/values array.
this is the reverse of kv_to_idx (e.g. 32bit index -> 256bit key + 32bit value).
Expand All @@ -299,7 +321,7 @@ cdef class HashTable:
return key, value

@property
def stats(self):
def stats(self) -> dict[str, int]:
return {
"get": self.stats_get,
"set": self.stats_set,
Expand All @@ -313,62 +335,71 @@ cdef class HashTable:


cdef class HashTableNT:
def __init__(self, int key_size, str value_format, object namedtuple_type, int capacity = MIN_CAPACITY):
def __init__(self, items=None, *,
key_size: int = 0, value_format: str = "", value_type: Any = None,
capacity: int = MIN_CAPACITY) -> None:
if not key_size:
raise ValueError("key_size must be specified and must be > 0.")
if not value_format:
raise ValueError("value_format must be specified and must be non-empty.")
if value_type is None:
raise ValueError("value_type must be specified (a namedtuple type corresponding to value_format).")
self.key_size = key_size
self.value_format = value_format
self.value_size = struct.calcsize(self.value_format)
self.namedtuple_type = namedtuple_type
self.inner = HashTable(self.key_size, self.value_size, capacity=capacity)
self.value_type = value_type
self.inner = HashTable(key_size=self.key_size, value_size=self.value_size, capacity=capacity)
_fill(self, items)

def clear(self):
def clear(self) -> None:
self.inner.clear()

def _check_key(self, key):
def _check_key(self, key: bytes) -> None:
if not isinstance(key, bytes):
raise TypeError(f"Expected an instance of bytes, got {type(key)}")
if len(key) != self.key_size:
raise ValueError(f"Key must be {self.key_size} bytes long")

def _to_binary_value(self, value):
#if not isinstance(value, self.namedtuple_type):
# raise TypeError(f"Expected an instance of {self.namedtuple_type}, got {type(value)}")
def _to_binary_value(self, value: Any) -> bytes:
#if not isinstance(value, self.value_type):
# raise TypeError(f"Expected an instance of {self.value_type}, got {type(value)}")
return struct.pack(self.value_format, *value)

def _to_namedtuple_value(self, binary_value):
def _to_namedtuple_value(self, binary_value: bytes) -> Any:
unpacked_data = struct.unpack(self.value_format, binary_value)
return self.namedtuple_type(*unpacked_data)
return self.value_type(*unpacked_data)

def _set_raw(self, key: bytes, value: bytes):
def _set_raw(self, key: bytes, value: bytes) -> None:
self.inner[key] = value

def _get_raw(self, key: bytes):
def _get_raw(self, key: bytes) -> bytes:
return self.inner[key]

def __setitem__(self, key: bytes, value):
def __setitem__(self, key: bytes, value: Any) -> None:
self._check_key(key)
self.inner[key] = self._to_binary_value(value)

def __getitem__(self, key: bytes):
def __getitem__(self, key: bytes) -> Any:
self._check_key(key)
binary_value = self.inner[key]
return self._to_namedtuple_value(binary_value)

def __delitem__(self, key: bytes):
def __delitem__(self, key: bytes) -> None:
self._check_key(key)
del self.inner[key]

def __contains__(self, key: bytes):
def __contains__(self, key: bytes) -> bool:
self._check_key(key)
return key in self.inner

def iteritems(self):
for key, binary_value in self.inner.iteritems():
def items(self) -> Iterator[tuple[bytes, Any]]:
for key, binary_value in self.inner.items():
yield (key, self._to_namedtuple_value(binary_value))

def __len__(self):
def __len__(self) -> int:
return len(self.inner)

def get(self, key: bytes, default=None):
def get(self, key: bytes, default: Any = None) -> Any:
self._check_key(key)
try:
binary_value = self.inner[key]
Expand All @@ -377,13 +408,13 @@ cdef class HashTableNT:
else:
return self._to_namedtuple_value(binary_value)

def setdefault(self, key: bytes, default):
def setdefault(self, key: bytes, default: Any) -> Any:
self._check_key(key)
binary_default = self._to_binary_value(default)
binary_value = self.inner.setdefault(key, binary_default)
return self._to_namedtuple_value(binary_value)

def pop(self, key: bytes, default=_NoDefault):
def pop(self, key: bytes, default: Any = _NoDefault) -> Any:
self._check_key(key)
try:
binary_value = self.inner.pop(key)
Expand All @@ -400,32 +431,32 @@ cdef class HashTableNT:
def idx_to_k(self, idx: int) -> bytes:
return self.inner.idx_to_k(idx)

def kv_to_idx(self, key: bytes, value) -> int:
def kv_to_idx(self, key: bytes, value: Any) -> int:
binary_value = self._to_binary_value(value)
return self.inner.kv_to_idx(key, binary_value)

def idx_to_kv(self, idx: int) -> Tuple[bytes, Tuple]:
def idx_to_kv(self, idx: int) -> tuple[bytes, Any]:
key, binary_value = self.inner.idx_to_kv(idx)
return key, self._to_namedtuple_value(binary_value)

@property
def stats(self):
def stats(self) -> dict[str, int]:
return self.inner.stats

def write(self, file):
def write(self, file: BinaryIO|str|bytes):
if isinstance(file, (str, bytes)):
with open(file, 'wb') as fd:
self._write_fd(fd)
else:
self._write_fd(file)

def _write_fd(self, fd):
def _write_fd(self, fd: BinaryIO):
meta = {
'key_size': self.key_size,
'value_size': self.value_size,
'value_format': self.value_format,
'namedtuple_type_name': self.namedtuple_type.__name__,
'namedtuple_type_fields': self.namedtuple_type._fields,
'value_type_name': self.value_type.__name__,
'value_type_fields': self.value_type._fields,
'capacity': self.inner.capacity,
'used': self.inner.used, # count of keys / values
}
Expand All @@ -435,22 +466,22 @@ cdef class HashTableNT:
fd.write(header_bytes)
fd.write(meta_bytes)
count = 0
for key, value in self.inner.iteritems():
for key, value in self.inner.items():
fd.write(key)
fd.write(value)
count += 1
assert count == self.inner.used

@classmethod
def read(cls, file):
def read(cls, file: BinaryIO|str|bytes):
if isinstance(file, (str, bytes)):
with open(file, 'rb') as fd:
return cls._read_fd(fd)
else:
return cls._read_fd(file)

@classmethod
def _read_fd(cls, fd):
def _read_fd(cls, fd: BinaryIO):
header_size = struct.calcsize(HEADER_FMT)
header_bytes = fd.read(header_size)
if len(header_bytes) < header_size:
Expand All @@ -464,8 +495,8 @@ cdef class HashTableNT:
if len(meta_bytes) < meta_size:
raise ValueError(f"Invalid file, file is too short.")
meta = json.loads(meta_bytes.decode("utf-8"))
namedtuple_type = namedtuple(meta['namedtuple_type_name'], meta['namedtuple_type_fields'])
ht = cls(key_size=meta['key_size'], value_format=meta['value_format'], namedtuple_type=namedtuple_type, capacity=meta['capacity'])
value_type = namedtuple(meta['value_type_name'], meta['value_type_fields'])
ht = cls(key_size=meta['key_size'], value_format=meta['value_format'], value_type=value_type, capacity=meta['capacity'])
count = 0
ksize, vsize = meta['key_size'], meta['value_size']
for i in range(meta['used']):
Expand All @@ -474,7 +505,7 @@ cdef class HashTableNT:
ht._set_raw(key, value)
return ht

def size(self):
def size(self) -> int:
"""
do a rough worst-case estimate of the on-disk size when using .write().

Expand Down
Loading