Skip to content

Commit

Permalink
perf(python): Pre-allocate size for the dictionary (apache#1949)
Browse files Browse the repository at this point in the history
## What does this PR do?

Pre-allocate the dictionary with `_PyDict_NewPresized`, sized from the element
count read from the buffer, to avoid repeated resizing and improve deserialization performance.

## Related issues

## Does this PR introduce any user-facing change?

- [ ] Does this PR introduce any public API change?
- [ ] Does this PR introduce any binary protocol compatibility change?

## Benchmark
```
# python format
fury_large_dict: Mean +- std dev: [dict_base] 548 us +- 33 us -> [dict_resize] 531 us +- 33 us: 1.03x faster

# xlang format
fury_large_dict: Mean +- std dev: [dict_xlang_base] 550 us +- 39 us -> [dict_xlang_resize] 527 us +- 35 us: 1.05x faster
```
  • Loading branch information
penguin-wwy authored Nov 20, 2024
1 parent 8add13c commit 9295e58
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 3 deletions.
4 changes: 4 additions & 0 deletions integration_tests/cpython_benchmark/fury_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@
"view_count": 7,
"zip": "",
}
LARGE_DICT = {str(i): i for i in range(2**10 + 1)}

TUPLE = (
[
Expand Down Expand Up @@ -177,6 +178,9 @@ def micro_benchmark():
runner.parse_args()
language = pyfury.Language.XLANG if args.xlang else pyfury.Language.PYTHON
runner.bench_func("fury_dict", fury_object, language, not args.no_ref, DICT)
runner.bench_func(
"fury_large_dict", fury_object, language, not args.no_ref, LARGE_DICT
)
runner.bench_func(
"fury_dict_group", fury_object, language, not args.no_ref, DICT_GROUP
)
Expand Down
7 changes: 4 additions & 3 deletions python/pyfury/_serialization.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ cdef extern from *:
"""
object int2obj(int64_t obj_addr)
int64_t obj2int(object obj_ref)
dict _PyDict_NewPresized(Py_ssize_t minused)


cdef int8_t NULL_FLAG = -3
Expand Down Expand Up @@ -2081,9 +2082,9 @@ cdef class MapSerializer(Serializer):
cpdef inline read(self, Buffer buffer):
cdef MapRefResolver ref_resolver = self.ref_resolver
cdef ClassResolver class_resolver = self.class_resolver
cdef dict map_ = {}
ref_resolver.reference(map_)
cdef int32_t len_ = buffer.read_varint32()
cdef dict map_ = _PyDict_NewPresized(len_)
ref_resolver.reference(map_)
cdef int32_t ref_id
cdef ClassInfo key_classinfo
cdef ClassInfo value_classinfo
Expand Down Expand Up @@ -2131,7 +2132,7 @@ cdef class MapSerializer(Serializer):

cpdef inline xread(self, Buffer buffer):
cdef int32_t len_ = buffer.read_varint32()
cdef dict map_ = {}
cdef dict map_ = _PyDict_NewPresized(len_)
self.fury.ref_resolver.reference(map_)
for i in range(len_):
k = self.fury.xdeserialize_ref(
Expand Down

0 comments on commit 9295e58

Please sign in to comment.