class Difference:
    """asv benchmark timing ``MultiIndex.difference`` for several level dtypes.

    Each parametrized run builds a two-level ``MultiIndex`` (``self.left``)
    as the product of ``range(1000)`` and a second level of ``N // 1000``
    values whose type is selected by ``dtype``; ``self.right`` is the first
    five entries of ``self.left``.
    """

    params = [
        ("datetime", "int", "string", "ea_int"),
    ]
    param_names = ["dtype"]

    def setup(self, dtype):
        """Build ``self.left`` and ``self.right`` for the requested ``dtype``.

        Only the index for ``dtype`` is constructed; the other three variants
        are never used by a given run, so building them is wasted setup work.

        Parameters
        ----------
        dtype : str
            One of ``"datetime"``, ``"int"``, ``"string"`` or ``"ea_int"``.

        Raises
        ------
        KeyError
            If ``dtype`` is not one of the supported keys.
        """
        N = 10**4 * 2
        n_inner = N // 1000
        level1 = range(1000)

        def _datetime_level():
            return date_range(start="1/1/2000", periods=n_inner)

        def _int_level():
            return range(n_inner)

        def _ea_int_level():
            # Nullable integer level with a missing value in first position.
            level = Series(range(n_inner), dtype="Int64")
            level[0] = NA
            return level

        def _string_level():
            # NOTE(review): relies on the file-level ``tm`` helper module;
            # confirm ``makeStringIndex`` exists in the pandas under test.
            return tm.makeStringIndex(n_inner).values

        level2_builders = {
            "datetime": _datetime_level,
            "int": _int_level,
            "ea_int": _ea_int_level,
            "string": _string_level,
        }

        # Plain dict lookup so an unknown dtype still raises KeyError.
        level2 = level2_builders[dtype]()
        self.left = MultiIndex.from_product([level1, level2])
        self.right = self.left[:5]

    def time_difference(self, dtype):
        """Time ``left.difference(right)``; the result is discarded."""
        self.left.difference(self.right)
:meth:`.DataFrameGroupBy.mean`, :meth:`.SeriesGroupBy.mean`, :meth:`.DataFrameGroupBy.var`, and :meth:`.SeriesGroupBy.var` for extension array dtypes (:issue:`37493`) - Performance improvement in :meth:`MultiIndex.isin` when ``level=None`` (:issue:`48622`) - Performance improvement for :meth:`Series.value_counts` with nullable dtype (:issue:`48338`) @@ -210,6 +211,7 @@ Missing MultiIndex ^^^^^^^^^^ +- Bug in :meth:`MultiIndex.difference` losing extension array dtype (:issue:`48606`) - Bug in :class:`MultiIndex.set_levels` raising ``IndexError`` when setting empty level (:issue:`48636`) - Bug in :meth:`MultiIndex.unique` losing extension array dtype (:issue:`48335`) - Bug in :meth:`MultiIndex.intersection` losing extension array (:issue:`48604`) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index aaa840ef19650..7dc04474cbcd8 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3684,7 +3684,12 @@ def _difference(self, other, sort): indexer = indexer.take((indexer != -1).nonzero()[0]) label_diff = np.setdiff1d(np.arange(this.size), indexer, assume_unique=True) - the_diff = this._values.take(label_diff) + + the_diff: MultiIndex | ArrayLike + if isinstance(this, ABCMultiIndex): + the_diff = this.take(label_diff) + else: + the_diff = this._values.take(label_diff) the_diff = _maybe_try_sort(the_diff, sort) return the_diff diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index a399baeacfd6c..26dd957ff4d57 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3746,18 +3746,13 @@ def _wrap_intersection_result(self, other, result) -> MultiIndex: _, result_names = self._convert_can_do_setop(other) return result.set_names(result_names) - def _wrap_difference_result(self, other, result) -> MultiIndex: + def _wrap_difference_result(self, other, result: MultiIndex) -> MultiIndex: _, result_names = self._convert_can_do_setop(other) if len(result) == 0: - return MultiIndex( - 
@pytest.mark.parametrize("val", [pd.NA, 100])
def test_difference_keep_ea_dtypes(any_numeric_ea_dtype, val):
    """``MultiIndex.difference`` keeps the extension-array dtype of a level.

    Checks a non-empty difference and an empty one (an index minus a
    reordered copy of itself).
    """
    # GH#48606
    ea_dtype = any_numeric_ea_dtype
    left = MultiIndex.from_arrays(
        [Series([1, 2], dtype=ea_dtype), [2, 1]],
        names=["a", None],
    )
    right = MultiIndex.from_arrays(
        [Series([1, 2, val], dtype=ea_dtype), [1, 1, 3]]
    )

    # (2, 1) is present in both operands, so only (1, 2) is expected to
    # remain; the expected index is built without names.
    remaining = MultiIndex.from_arrays([Series([1], dtype=ea_dtype), [2]])
    tm.assert_index_equal(left.difference(right), remaining)

    # Subtracting the same entries in descending order should leave an
    # empty index that still carries the names of ``left``.
    reordered = left.sort_values(ascending=False)
    empty = MultiIndex.from_arrays(
        [Series([], dtype=ea_dtype), Series([], dtype=int)],
        names=["a", None],
    )
    tm.assert_index_equal(left.difference(reordered), empty)