Skip to content

Commit

Permalink
more type hints
Browse files (browse the repository at this point in the history)
  • Loading branch information
ThomasWaldmann committed Oct 26, 2024
1 parent 92739a0 commit dc92268
Showing 1 changed file with 42 additions and 40 deletions.
82 changes: 42 additions & 40 deletions borghash.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,13 @@ HashTable: low-level ht mapping fully random bytes keys to bytes values.
HashTableNT: wrapper around HashTable, providing namedtuple values and serialization.
"""
from __future__ import annotations
from typing import BinaryIO, Iterator

from libc.stdlib cimport malloc, free, realloc
from libc.string cimport memcpy, memset, memcmp
from libc.stdint cimport uint8_t, uint32_t

from typing import Tuple
from collections import namedtuple
from collections.abc import Mapping
import json
Expand All @@ -32,7 +34,7 @@ cdef uint32_t RESERVED = 0xFFFFFF00 # all >= this is reserved

_NoDefault = object()

def _fill(this, other):
def _fill(this: object, other: object) -> None:
if other is None:
return
if isinstance(other, Mapping):
Expand All @@ -51,7 +53,7 @@ cdef class HashTable:
key_size: int = 0, value_size: int = 0, capacity: int = MIN_CAPACITY,
max_load_factor: float = 0.5, min_load_factor: float = 0.10,
shrink_factor: float = 0.4, grow_factor: float = 2.0,
kv_grow_factor: float = 1.3):
kv_grow_factor: float = 1.3) -> None:
# the load of the ht (.table) shall be between 0.25 and 0.5, so it is fast and has few collisions.
# it is cheap to have a low hash table load, because .table only stores uint32_t indexes into the
# .keys and .values arrays.
Expand Down Expand Up @@ -92,20 +94,20 @@ cdef class HashTable:
self.stats_resize_kv = 0
_fill(self, items)

def __del__(self):
def __del__(self) -> None:
free(self.table)
free(self.keys)
free(self.values)

def clear(self):
def clear(self) -> None:
"""empty HashTable, start from scratch"""
self.capacity = 0
self.used = 0
self._resize_table(self.initial_capacity)
self.kv_used = 0
self._resize_kv(int(self.initial_capacity * self.max_load_factor))

def __len__(self):
def __len__(self) -> int:
return self.used

cdef int _get_index(self, uint8_t* key):
Expand Down Expand Up @@ -133,7 +135,7 @@ cdef class HashTable:
index_ptr[0] = index
return 0 # not found

def __setitem__(self, key: bytes, value: bytes):
def __setitem__(self, key: bytes, value: bytes) -> None:
if len(key) != self.ksize or len(value) != self.vsize:
raise ValueError("Key or value size does not match the defined sizes")

Expand Down Expand Up @@ -167,12 +169,12 @@ cdef class HashTable:
if self.used + self.tombstones > self.capacity * self.max_load_factor:
self._resize_table(int(self.capacity * self.grow_factor))

def __contains__(self, key: bytes):
def __contains__(self, key: bytes) -> bool:
if len(key) != self.ksize:
raise ValueError("Key size does not match the defined size")
return bool(self._lookup_index(<uint8_t*> key, NULL))

def __getitem__(self, key: bytes):
def __getitem__(self, key: bytes) -> bytes:
if len(key) != self.ksize:
raise ValueError("Key size does not match the defined size")
cdef uint32_t kv_index
Expand All @@ -184,7 +186,7 @@ cdef class HashTable:
else:
raise KeyError("Key not found")

def __delitem__(self, key: bytes):
def __delitem__(self, key: bytes) -> None:
if len(key) != self.ksize:
raise ValueError("Key size does not match the defined size")
cdef uint8_t* key_ptr = <uint8_t*> key
Expand All @@ -207,18 +209,18 @@ cdef class HashTable:
else:
raise KeyError("Key not found")

def setdefault(self, key: bytes, value: bytes):
def setdefault(self, key: bytes, value: bytes) -> bytes:
if not key in self:
self[key] = value
return self[key]

def get(self, key: bytes, default=None):
def get(self, key: bytes, default: object = None) -> bytes|object:
try:
return self[key]
except KeyError:
return default

def pop(self, key: bytes, default=_NoDefault):
def pop(self, key: bytes, default: object = _NoDefault) -> bytes|object:
try:
value = self[key]
del self[key]
Expand All @@ -228,7 +230,7 @@ cdef class HashTable:
raise
return default

def items(self):
def items(self) -> Iterator[tuple[bytes, bytes]]:
cdef int i
cdef uint32_t kv_index
self.stats_iter += 1
Expand Down Expand Up @@ -308,7 +310,7 @@ cdef class HashTable:
return kv_index
raise KeyError("Key/Value not found")

def idx_to_kv(self, idx: int) -> Tuple[bytes, bytes]:
def idx_to_kv(self, idx: int) -> tuple[bytes, bytes]:
"""
for a given index, return the key/value stored at that index in the keys/values array.
this is the reverse of kv_to_idx (e.g. 32bit index -> 256bit key + 32bit value).
Expand All @@ -319,7 +321,7 @@ cdef class HashTable:
return key, value

@property
def stats(self):
def stats(self) -> dict[str, int]:
return {
"get": self.stats_get,
"set": self.stats_set,
Expand All @@ -335,7 +337,7 @@ cdef class HashTable:
cdef class HashTableNT:
def __init__(self, items=None, *,
key_size: int = 0, value_format: str = "", value_type: object = None,
capacity: int = MIN_CAPACITY):
capacity: int = MIN_CAPACITY) -> None:
if not key_size:
raise ValueError("key_size must be specified and must be > 0.")
if not value_format:
Expand All @@ -349,55 +351,55 @@ cdef class HashTableNT:
self.inner = HashTable(key_size=self.key_size, value_size=self.value_size, capacity=capacity)
_fill(self, items)

def clear(self):
def clear(self) -> None:
self.inner.clear()

def _check_key(self, key):
def _check_key(self, key: bytes) -> None:
if not isinstance(key, bytes):
raise TypeError(f"Expected an instance of bytes, got {type(key)}")
if len(key) != self.key_size:
raise ValueError(f"Key must be {self.key_size} bytes long")

def _to_binary_value(self, value):
def _to_binary_value(self, value: object) -> bytes:
#if not isinstance(value, self.value_type):
# raise TypeError(f"Expected an instance of {self.value_type}, got {type(value)}")
return struct.pack(self.value_format, *value)

def _to_namedtuple_value(self, binary_value):
def _to_namedtuple_value(self, binary_value: bytes) -> object:
unpacked_data = struct.unpack(self.value_format, binary_value)
return self.value_type(*unpacked_data)

def _set_raw(self, key: bytes, value: bytes):
def _set_raw(self, key: bytes, value: bytes) -> None:
self.inner[key] = value

def _get_raw(self, key: bytes):
def _get_raw(self, key: bytes) -> bytes:
return self.inner[key]

def __setitem__(self, key: bytes, value):
def __setitem__(self, key: bytes, value: object) -> None:
self._check_key(key)
self.inner[key] = self._to_binary_value(value)

def __getitem__(self, key: bytes):
def __getitem__(self, key: bytes) -> object:
self._check_key(key)
binary_value = self.inner[key]
return self._to_namedtuple_value(binary_value)

def __delitem__(self, key: bytes):
def __delitem__(self, key: bytes) -> None:
self._check_key(key)
del self.inner[key]

def __contains__(self, key: bytes):
def __contains__(self, key: bytes) -> bool:
self._check_key(key)
return key in self.inner

def items(self):
def items(self) -> Iterator[tuple[bytes, object]]:
for key, binary_value in self.inner.items():
yield (key, self._to_namedtuple_value(binary_value))

def __len__(self):
def __len__(self) -> int:
return len(self.inner)

def get(self, key: bytes, default=None):
def get(self, key: bytes, default: object = None) -> object:
self._check_key(key)
try:
binary_value = self.inner[key]
Expand All @@ -406,13 +408,13 @@ cdef class HashTableNT:
else:
return self._to_namedtuple_value(binary_value)

def setdefault(self, key: bytes, default):
def setdefault(self, key: bytes, default: object) -> object:
self._check_key(key)
binary_default = self._to_binary_value(default)
binary_value = self.inner.setdefault(key, binary_default)
return self._to_namedtuple_value(binary_value)

def pop(self, key: bytes, default=_NoDefault):
def pop(self, key: bytes, default: object = _NoDefault) -> object:
self._check_key(key)
try:
binary_value = self.inner.pop(key)
Expand All @@ -429,26 +431,26 @@ cdef class HashTableNT:
def idx_to_k(self, idx: int) -> bytes:
return self.inner.idx_to_k(idx)

def kv_to_idx(self, key: bytes, value) -> int:
def kv_to_idx(self, key: bytes, value: object) -> int:
binary_value = self._to_binary_value(value)
return self.inner.kv_to_idx(key, binary_value)

def idx_to_kv(self, idx: int) -> Tuple[bytes, Tuple]:
def idx_to_kv(self, idx: int) -> tuple[bytes, object]:
key, binary_value = self.inner.idx_to_kv(idx)
return key, self._to_namedtuple_value(binary_value)

@property
def stats(self):
def stats(self) -> dict[str, int]:
return self.inner.stats

def write(self, file):
def write(self, file: BinaryIO|str|bytes):
if isinstance(file, (str, bytes)):
with open(file, 'wb') as fd:
self._write_fd(fd)
else:
self._write_fd(file)

def _write_fd(self, fd):
def _write_fd(self, fd: BinaryIO):
meta = {
'key_size': self.key_size,
'value_size': self.value_size,
Expand All @@ -471,15 +473,15 @@ cdef class HashTableNT:
assert count == self.inner.used

@classmethod
def read(cls, file):
def read(cls, file: BinaryIO|str|bytes):
if isinstance(file, (str, bytes)):
with open(file, 'rb') as fd:
return cls._read_fd(fd)
else:
return cls._read_fd(file)

@classmethod
def _read_fd(cls, fd):
def _read_fd(cls, fd: BinaryIO):
header_size = struct.calcsize(HEADER_FMT)
header_bytes = fd.read(header_size)
if len(header_bytes) < header_size:
Expand All @@ -503,7 +505,7 @@ cdef class HashTableNT:
ht._set_raw(key, value)
return ht

def size(self):
def size(self) -> int:
"""
do a rough worst-case estimate of the on-disk size when using .write().

Expand Down

0 comments on commit dc92268

Please sign in to comment.