Skip to content

Commit

Permalink
PERF: Index._shallow_copy shares _cache with copies of self (pandas-dev#36840)
Browse files Browse the repository at this point in the history

* PERF: Index.equals when comparing to copies of self

* refactor _shallow_copy, add GH number

* PERF: share _cache, don't share _id

* rename tests

* fix memory usage test

Co-authored-by: Jeff Reback <[email protected]>
  • Loading branch information
2 people authored and jbrockmendel committed Oct 6, 2020
1 parent 6e34063 commit 2847337
Show file tree
Hide file tree
Showing 8 changed files with 40 additions and 45 deletions.
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,8 @@ Performance improvements
- ``Styler`` uuid method altered to compress data transmission over web whilst maintaining reasonably low table collision probability (:issue:`36345`)
- Performance improvement in :meth:`pd.to_datetime` with non-ns time unit for ``float`` ``dtype`` columns (:issue:`20445`)
- Performance improvement in setting values on a :class:`IntervalArray` (:issue:`36310`)
- The internal index method :meth:`~Index._shallow_copy` now makes the new index and the original index share cached attributes,
so an attribute that has already been computed on either index is not recomputed on the other. This can speed up operations that depend on creating copies of existing indexes (:issue:`36840`)
- Performance improvement in :meth:`RollingGroupby.count` (:issue:`35625`)

.. ---------------------------------------------------------------------------
Expand Down
10 changes: 5 additions & 5 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -561,12 +561,12 @@ def _shallow_copy(self, values=None, name: Label = no_default):
name : Label, defaults to self.name
"""
name = self.name if name is no_default else name
cache = self._cache.copy() if values is None else {}
if values is None:
values = self._values

result = self._simple_new(values, name=name)
result._cache = cache
if values is not None:
return self._simple_new(values, name=name)

result = self._simple_new(self._values, name=name)
result._cache = self._cache
return result

def is_(self, other) -> bool:
Expand Down
14 changes: 6 additions & 8 deletions pandas/core/indexes/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -673,17 +673,15 @@ def _with_freq(self, freq):

def _shallow_copy(self, values=None, name: Label = lib.no_default):
name = self.name if name is lib.no_default else name
cache = self._cache.copy() if values is None else {}

if values is None:
values = self._data

if isinstance(values, np.ndarray):
if values is not None:
# TODO: We would rather not get here
values = type(self._data)(values, dtype=self.dtype)
if isinstance(values, np.ndarray):
values = type(self._data)(values, dtype=self.dtype)
return self._simple_new(values, name=name)

result = type(self)._simple_new(values, name=name)
result._cache = cache
result = self._simple_new(self._data, name=name)
result._cache = self._cache
return result

# --------------------------------------------------------------------
Expand Down
10 changes: 5 additions & 5 deletions pandas/core/indexes/interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -335,12 +335,12 @@ def _shallow_copy(
self, values: Optional[IntervalArray] = None, name: Label = lib.no_default
):
name = self.name if name is lib.no_default else name
cache = self._cache.copy() if values is None else {}
if values is None:
values = self._data

result = self._simple_new(values, name=name)
result._cache = cache
if values is not None:
return self._simple_new(values, name=name)

result = self._simple_new(self._data, name=name)
result._cache = self._cache
return result

@cache_readonly
Expand Down
10 changes: 5 additions & 5 deletions pandas/core/indexes/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,12 +260,12 @@ def _has_complex_internals(self) -> bool:

def _shallow_copy(self, values=None, name: Label = no_default):
name = name if name is not no_default else self.name
cache = self._cache.copy() if values is None else {}
if values is None:
values = self._data

result = self._simple_new(values, name=name)
result._cache = cache
if values is not None:
return self._simple_new(values, name=name)

result = self._simple_new(self._data, name=name)
result._cache = self._cache
return result

def _maybe_convert_timedelta(self, other):
Expand Down
10 changes: 5 additions & 5 deletions pandas/core/indexes/range.py
Original file line number Diff line number Diff line change
Expand Up @@ -397,13 +397,13 @@ def __iter__(self):
def _shallow_copy(self, values=None, name: Label = no_default):
name = self.name if name is no_default else name

if values is None:
result = self._simple_new(self._range, name=name)
result._cache = self._cache.copy()
return result
else:
if values is not None:
return Int64Index._simple_new(values, name=name)

result = self._simple_new(self._range, name=name)
result._cache = self._cache
return result

@doc(Int64Index.copy)
def copy(self, name=None, deep=False, dtype=None, names=None):
name = self._validate_names(name=name, names=names, deep=deep)[0]
Expand Down
3 changes: 2 additions & 1 deletion pandas/tests/base/test_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,8 @@ def test_memory_usage(index_or_series_obj):
)

if len(obj) == 0:
assert res_deep == res == 0
expected = 0 if isinstance(obj, Index) else 80
assert res_deep == res == expected
elif is_object or is_categorical:
# only deep will pick them up
assert res_deep > res
Expand Down
26 changes: 10 additions & 16 deletions pandas/tests/indexes/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -935,28 +935,22 @@ def test_contains_requires_hashable_raises(self):
with pytest.raises(TypeError, match=msg):
{} in idx._engine

def test_copy_copies_cache(self):
# GH32898
def test_copy_shares_cache(self):
# GH32898, GH36840
idx = self.create_index()
idx.get_loc(idx[0]) # populates the _cache.
copy = idx.copy()

# check that the copied cache is a copy of the original
assert idx._cache == copy._cache
assert idx._cache is not copy._cache
# cache values should reference the same object
for key, val in idx._cache.items():
assert copy._cache[key] is val, key
assert copy._cache is idx._cache

def test_shallow_copy_copies_cache(self):
# GH32669
def test_shallow_copy_shares_cache(self):
# GH32669, GH36840
idx = self.create_index()
idx.get_loc(idx[0]) # populates the _cache.
shallow_copy = idx._shallow_copy()

# check that the shallow_copied cache is a copy of the original
assert idx._cache == shallow_copy._cache
assert idx._cache is not shallow_copy._cache
# cache values should reference the same object
for key, val in idx._cache.items():
assert shallow_copy._cache[key] is val, key
assert shallow_copy._cache is idx._cache

shallow_copy = idx._shallow_copy(idx._data)
assert shallow_copy._cache is not idx._cache
assert shallow_copy._cache == {}

0 comments on commit 2847337

Please sign in to comment.