Skip to content

Commit

Permalink
evaluate jsonpickle to serialize/deserialize objects
Browse files Browse the repository at this point in the history
verdict: order of magnitude slower than 'custom' serialization

    these are baseline results on @karlicoss desktop pc (initialization + hitting the cache)
    11.13s call     src/cachew/tests/test_cachew.py::test_many[1000000-False]
    5.61s call     src/cachew/tests/test_cachew.py::test_many[500000-False]
    1.13s call     src/cachew/tests/test_cachew.py::test_many[100000-True]

    these are results with jsonpickle.. not great
    86.42s call     src/cachew/tests/test_cachew.py::test_many[1000000-False]
    44.08s call     src/cachew/tests/test_cachew.py::test_many[500000-False]
    8.78s call     src/cachew/tests/test_cachew.py::test_many[100000-True]
  • Loading branch information
karlicoss committed Sep 7, 2023
1 parent b867166 commit 048df33
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 108 deletions.
117 changes: 12 additions & 105 deletions src/cachew/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
import warnings


import jsonpickle

import appdirs

import sqlalchemy
Expand Down Expand Up @@ -403,103 +405,6 @@ def make(tp: Type[NT], name: Optional[str]=None) -> 'NTBinder[NT]':
fields=fields,
)

@property
def columns(self) -> List[Column]:
    """All table columns for this binder, materialized as a concrete list."""
    return [*self.iter_columns()]

# TODO not necessarily namedtuple? could be primitive type
def to_row(self, obj: NT) -> Tuple[Optional[Values], ...]:
    """Flatten obj into a single tuple of column values, ready for insertion."""
    return tuple(value for value in self._to_row(obj))

def from_row(self, row: Iterable[Any]) -> NT:
    """Rebuild an object from a database row, checking the row was fully consumed."""
    values = iter(row)
    result = self._from_row(values)
    # Anything left in the iterator means the row and the schema are out of sync.
    leftover = list(islice(values, 0, 1))
    if leftover:
        raise CachewException(f'unconsumed items in iterator {leftover}')
    assert result is not None  # nosec # help mypy; top level will not be None
    return result


def _to_row(self, obj) -> Iterator[Optional[Values]]:
    """Recursively flatten obj into a stream of primitive column values.

    Yields values in the same order iter_columns() declares columns; for a
    None optional, padding Nones are emitted so the total count stays fixed.
    """
    if self.primitive:
        # Leaf: the object itself is a single column value.
        yield obj
    elif self.union is not None:
        CachewUnion = self.union
        (uf,) = self.fields
        # Wrap obj in the synthetic union namedtuple: the variant whose type
        # matches gets obj, every other variant field gets None.
        # TODO assert only one of them matches??
        union = CachewUnion(**{
            f.name: obj if isinstance(obj, f.type_) else None
            for f in uf.fields
        })
        yield from uf._to_row(union)
    else:
        if self.optional:
            # First column for an optional is the is_null flag.
            is_none = obj is None
            yield is_none
        else:
            is_none = False; assert obj is not None # TODO hmm, that last assert is not very symmetric...

        if is_none:
            # Pad the remaining columns (span minus the flag already yielded)
            # with None so sibling fields stay aligned.
            for _ in range(self.span - 1):
                yield None
        else:
            # Recurse into each field, concatenating their flattened values.
            yield from chain.from_iterable(
                f._to_row(getattr(obj, f.name))
                for f in self.fields
            )

def _from_row(self, row_iter):
    """Inverse of _to_row: consume values from row_iter and rebuild the object.

    Reads values in the exact order _to_row produced them, including the
    is_null flag and padding Nones for optionals, keeping the iterator
    aligned for sibling fields.
    """
    if self.primitive:
        # Leaf: a single column value is the object itself.
        return next(row_iter)
    elif self.union is not None:
        CachewUnion = self.union
        (uf,) = self.fields
        # Rebuild the synthetic union namedtuple, then keep the non-None
        # variant(s); exactly one is expected to be set.
        # TODO assert only one of them is not None?
        union_params = [
            r
            for r in uf._from_row(row_iter) if r is not None
        ]
        assert len(union_params) == 1, union_params
        return union_params[0]
    else:
        if self.optional:
            # First column of an optional is the is_null flag.
            is_none = next(row_iter)
        else:
            is_none = False

        if is_none:
            # Drain the padding Nones written by _to_row so the iterator
            # stays aligned for the next sibling field.
            for _ in range(self.span - 1):
                x = next(row_iter)
                assert x is None, x # huh. assert is kinda opposite of producing value
            return None
        else:
            # Reconstruct each field recursively and feed them to the
            # original type's constructor positionally.
            return self.type_(*(
                f._from_row(row_iter)
                for f in self.fields
            ))

# TODO not sure if we want to allow optionals on top level?
def iter_columns(self) -> Iterator[Column]:
    """Yield the table columns for this binder, deduplicating column names."""
    seen: Set[str] = set()

    def fresh_column(name: str, tp) -> Column:
        # Prepend underscores until the name is unique within this table.
        while name in seen:
            name = '_' + name
        seen.add(name)
        return Column(name, tp)

    if self.primitive:
        if self.name is None:
            raise AssertionError
        yield fresh_column(self.name, PRIMITIVES[self.type_])
        return

    prefix = f'{self.name}_' if self.name is not None else ''
    if self.optional:
        # Extra boolean column marking whether the whole value is null.
        yield fresh_column(f'_{prefix}is_null', sqlalchemy.Boolean)
    for field in self.fields:
        for child in field.iter_columns():
            yield fresh_column(f'{prefix}{child.name}', child.type)

def __str__(self):
    """Render the binder tree, one node per line, indented by nesting depth."""
    rendered = []
    for depth, node in self.flatten():
        suffix = '?' if node.optional else ''
        rendered.append(' ' * depth + str(node.name) + suffix + f' <span {node.span}>')
    return '\n'.join(rendered)
Expand Down Expand Up @@ -562,9 +467,10 @@ def do_begin(conn):

self.binder = NTBinder.make(tp=cls)
# actual cache
self.table_cache = Table('cache' , self.meta, *self.binder.columns)
# FIXME change table definition
self.table_cache = Table('cache' , self.meta, Column('data', sqlalchemy.String))
# temporary table, we use it to insert and then (atomically?) rename to the above table at the very end
self.table_cache_tmp = Table('cache_tmp', self.meta, *self.binder.columns)
self.table_cache_tmp = Table('cache_tmp', self.meta, Column('data', sqlalchemy.String))

def __enter__(self) -> 'DbHelper':
    # Context-manager entry: no additional setup needed here.
    return self
Expand Down Expand Up @@ -882,7 +788,7 @@ def composite_hash(self, *args, **kwargs) -> Dict[str, Any]:
}
kwargs = {**defaults, **kwargs}
binder = NTBinder.make(tp=self.cls_)
schema = str(binder.columns) # todo not super nice, but works fine for now
schema = str('FIXME') # todo not super nice, but works fine for now
hash_parts = {
'cachew' : CACHEW_VERSION,
'schema' : schema,
Expand Down Expand Up @@ -996,8 +902,8 @@ def cachew_wrapper(

def cached_items():
rows = conn.execute(table_cache.select())
for row in rows:
yield binder.from_row(row)
for (js,) in rows:
yield jsonpickle.decode(js)

if new_hash == old_hash:
logger.debug('hash matched: loading from cache')
Expand Down Expand Up @@ -1107,7 +1013,7 @@ def flush() -> None:
dict(zip(column_names, row))
for row in chunk
]
conn.execute(insert_into_table_cache_tmp, chunk_dict)
conn.execute(insert_into_table_cache_tmp, [{'data': c} for c in chunk])
chunk = []

total_objects = 0
Expand All @@ -1118,8 +1024,9 @@ def flush() -> None:
except GeneratorExit:
early_exit = True
return

chunk.append(binder.to_row(d))

js = jsonpickle.encode(d)
chunk.append(js)
if len(chunk) >= chunk_by:
flush()
flush()
Expand Down
22 changes: 19 additions & 3 deletions src/cachew/tests/test_cachew.py
Original file line number Diff line number Diff line change
Expand Up @@ -295,8 +295,9 @@ class TE2(NamedTuple):
# e.g. -k 'test_many[500000-False]'
# fmt: off
@pytest.mark.parametrize('count,on_ci', [
(100000, True),
(500000, False),
(100_000, True),
(500_000, False),
(1_000_000, False),
])
# fmt: on
def test_many(count: int, on_ci: bool, tmp_path: Path) -> None:
Expand All @@ -317,7 +318,22 @@ def iter_data() -> Iterator[TE2]:

assert ilen(iter_data()) == count # initial
assert ilen(iter_data()) == count # hitting cache
assert last(iter_data()) == TE2(value=count - 1, uuu=UUU(xx=count - 1, yy=count - 1), value2=count - 1)

# these are baseline results on @karlicoss desktop pc (initialization + hitting the cache)
# 11.13s call src/cachew/tests/test_cachew.py::test_many[1000000-False]
# 5.61s call src/cachew/tests/test_cachew.py::test_many[500000-False]
# 1.13s call src/cachew/tests/test_cachew.py::test_many[100000-True]


# these are results with jsonpickle.. not great
# 86.42s call src/cachew/tests/test_cachew.py::test_many[1000000-False]
# 44.08s call src/cachew/tests/test_cachew.py::test_many[500000-False]
# 8.78s call src/cachew/tests/test_cachew.py::test_many[100000-True]



# assert last(iter_data()) == TE2(value=count - 1, uuu=UUU(xx=count - 1, yy=count - 1), value2=count - 1)


# serializing to db
# in-memory: 16 seconds
Expand Down

0 comments on commit 048df33

Please sign in to comment.