diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst index 62a9b6396404a..4949dd580414f 100644 --- a/doc/source/user_guide/advanced.rst +++ b/doc/source/user_guide/advanced.rst @@ -783,27 +783,41 @@ values **not** in the categories, similarly to how you can reindex **any** panda .. ipython:: python - df2.reindex(['a', 'e']) - df2.reindex(['a', 'e']).index - df2.reindex(pd.Categorical(['a', 'e'], categories=list('abcde'))) - df2.reindex(pd.Categorical(['a', 'e'], categories=list('abcde'))).index + df3 = pd.DataFrame({'A': np.arange(3), + 'B': pd.Series(list('abc')).astype('category')}) + df3 = df3.set_index('B') + df3 + +.. ipython:: python + + df3.reindex(['a', 'e']) + df3.reindex(['a', 'e']).index + df3.reindex(pd.Categorical(['a', 'e'], categories=list('abe'))) + df3.reindex(pd.Categorical(['a', 'e'], categories=list('abe'))).index .. warning:: Reshaping and Comparison operations on a ``CategoricalIndex`` must have the same categories or a ``TypeError`` will be raised. - .. code-block:: ipython + .. ipython:: python - In [9]: df3 = pd.DataFrame({'A': np.arange(6), 'B': pd.Series(list('aabbca')).astype('category')}) + df4 = pd.DataFrame({'A': np.arange(2), + 'B': list('ba')}) + df4['B'] = df4['B'].astype(CategoricalDtype(list('ab'))) + df4 = df4.set_index('B') + df4.index - In [11]: df3 = df3.set_index('B') + df5 = pd.DataFrame({'A': np.arange(2), + 'B': list('bc')}) + df5['B'] = df5['B'].astype(CategoricalDtype(list('bc'))) + df5 = df5.set_index('B') + df5.index - In [11]: df3.index - Out[11]: CategoricalIndex(['a', 'a', 'b', 'b', 'c', 'a'], categories=['a', 'b', 'c'], ordered=False, name='B', dtype='category') + .. code-block:: ipython - In [12]: pd.concat([df2, df3]) - TypeError: categories must match existing categories when appending + In [1]: pd.concat([df4, df5]) + TypeError: categories must match existing categories when appending .. 
_indexing.rangeindex: diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 5b5098f7d2426..7c86ad0f029ed 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -223,6 +223,7 @@ Categorical - Added test to assert the :func:`fillna` raises the correct ValueError message when the value isn't a value from categories (:issue:`13628`) - Bug in :meth:`Categorical.astype` where ``NaN`` values were handled incorrectly when casting to int (:issue:`28406`) +- Bug in :meth:`DataFrame.reindex` with a :class:`CategoricalIndex` where reindexing failed when the target contained duplicates, and did not fail when the source contained duplicates (:issue:`28107`) - Bug in :meth:`Categorical.astype` not allowing for casting to extension dtypes (:issue:`28668`) - Bug where :func:`merge` was unable to join on categorical and extension dtype columns (:issue:`28668`) - :meth:`Categorical.searchsorted` and :meth:`CategoricalIndex.searchsorted` now work on unordered categoricals also (:issue:`21667`) @@ -292,6 +293,8 @@ Indexing - Bug in reindexing a :meth:`PeriodIndex` with another type of index that contained a `Period` (:issue:`28323`) (:issue:`28337`) - Fix assignment of column via `.loc` with numpy non-ns datetime type (:issue:`27395`) - Bug in :meth:`Float64Index.astype` where ``np.inf`` was not handled properly when casting to an integer dtype (:issue:`28475`) +- Bug in :meth:`Index.union` that could fail when the left-hand index contained duplicates (:issue:`28257`) +- Bug in :meth:`Index.get_indexer_non_unique` that could fail with ``TypeError`` in some cases, such as when searching for ints in a string index (:issue:`28257`) Missing ^^^^^^^ diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 22f7104debf10..144d555258c50 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -286,7 +286,7 @@ cdef class IndexEngine: cdef: ndarray values, x ndarray[int64_t] result, missing - set stargets + set stargets, remaining_stargets dict d = {} object val int 
count = 0, count_missing = 0 @@ -309,12 +309,20 @@ cdef class IndexEngine: if stargets and len(stargets) < 5 and self.is_monotonic_increasing: # if there are few enough stargets and the index is monotonically # increasing, then use binary search for each starget + remaining_stargets = set() for starget in stargets: - start = values.searchsorted(starget, side='left') - end = values.searchsorted(starget, side='right') - if start != end: - d[starget] = list(range(start, end)) - else: + try: + start = values.searchsorted(starget, side='left') + end = values.searchsorted(starget, side='right') + except TypeError: # e.g. if we tried to search for string in int array + remaining_stargets.add(starget) + else: + if start != end: + d[starget] = list(range(start, end)) + + stargets = remaining_stargets + + if stargets: # otherwise, map by iterating through all items in the index for i in range(n): val = values[i] diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 7dee3a17f8f9e..464cd49f135ae 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2493,8 +2493,12 @@ def _union(self, other, sort): value_set = set(lvals) result.extend([x for x in rvals if x not in value_set]) else: - indexer = self.get_indexer(other) - indexer, = (indexer == -1).nonzero() + # find indexes of things in "other" that are not in "self" + if self.is_unique: + indexer = self.get_indexer(other) + indexer = (indexer == -1).nonzero()[0] + else: + indexer = algos.unique1d(self.get_indexer_non_unique(other)[1]) if len(indexer) > 0: other_diff = algos.take_nd(rvals, indexer, allow_fill=False) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index b538c4df00e19..e5a8edb56e413 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -552,10 +552,6 @@ def get_value(self, series: AnyArrayLike, key: Any): # we might be a positional inexer return super().get_value(series, key) - def _can_reindex(self, 
indexer): - """ always allow reindexing """ - pass - @Substitution(klass="CategoricalIndex") @Appender(_shared_docs["searchsorted"]) def searchsorted(self, value, side="left", sorter=None): @@ -585,7 +581,6 @@ def reindex(self, target, method=None, level=None, limit=None, tolerance=None): Indices of output values in original index """ - if method is not None: raise NotImplementedError( "argument method is not implemented for CategoricalIndex.reindex" @@ -605,9 +600,6 @@ def reindex(self, target, method=None, level=None, limit=None, tolerance=None): indexer = None missing = [] else: - if not target.is_unique: - raise ValueError("cannot reindex with a non-unique indexer") - indexer, missing = self.get_indexer_non_unique(np.array(target)) if len(self.codes) and indexer is not None: diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 4326c3f8188fc..8ed7f1a890c39 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -599,15 +599,19 @@ def test_reindex_dtype(self): tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp)) def test_reindex_duplicate_target(self): - # See GH23963 - c = CategoricalIndex(["a", "b", "c", "a"], categories=["a", "b", "c", "d"]) - with pytest.raises(ValueError, match="non-unique indexer"): - c.reindex(["a", "a", "c"]) + # See GH25459 + cat = CategoricalIndex(["a", "b", "c"], categories=["a", "b", "c", "d"]) + res, indexer = cat.reindex(["a", "c", "c"]) + exp = Index(["a", "c", "c"], dtype="object") + tm.assert_index_equal(res, exp, exact=True) + tm.assert_numpy_array_equal(indexer, np.array([0, 2, 2], dtype=np.intp)) - with pytest.raises(ValueError, match="non-unique indexer"): - c.reindex( - CategoricalIndex(["a", "a", "c"], categories=["a", "b", "c", "d"]) - ) + res, indexer = cat.reindex( + CategoricalIndex(["a", "c", "c"], categories=["a", "b", "c", "d"]) + ) + exp = CategoricalIndex(["a", "c", "c"], categories=["a", "b", "c", "d"]) + 
tm.assert_index_equal(res, exp, exact=True) + tm.assert_numpy_array_equal(indexer, np.array([0, 2, 2], dtype=np.intp)) def test_reindex_empty_index(self): # See GH16770 diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index c365c985eb4b6..005a9a24dc597 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -561,26 +561,30 @@ def test_read_only_source(self): assert_frame_equal(rw_df.loc[1:3], ro_df.loc[1:3]) def test_reindexing(self): + df = DataFrame( + { + "A": np.arange(3, dtype="int64"), + "B": Series(list("abc")).astype(CDT(list("cabe"))), + } + ).set_index("B") # reindexing # convert to a regular index - result = self.df2.reindex(["a", "b", "e"]) - expected = DataFrame( - {"A": [0, 1, 5, 2, 3, np.nan], "B": Series(list("aaabbe"))} - ).set_index("B") + result = df.reindex(["a", "b", "e"]) + expected = DataFrame({"A": [0, 1, np.nan], "B": Series(list("abe"))}).set_index( + "B" + ) assert_frame_equal(result, expected, check_index_type=True) - result = self.df2.reindex(["a", "b"]) - expected = DataFrame( - {"A": [0, 1, 5, 2, 3], "B": Series(list("aaabb"))} - ).set_index("B") + result = df.reindex(["a", "b"]) + expected = DataFrame({"A": [0, 1], "B": Series(list("ab"))}).set_index("B") assert_frame_equal(result, expected, check_index_type=True) - result = self.df2.reindex(["e"]) + result = df.reindex(["e"]) expected = DataFrame({"A": [np.nan], "B": Series(["e"])}).set_index("B") assert_frame_equal(result, expected, check_index_type=True) - result = self.df2.reindex(["d"]) + result = df.reindex(["d"]) expected = DataFrame({"A": [np.nan], "B": Series(["d"])}).set_index("B") assert_frame_equal(result, expected, check_index_type=True) @@ -588,65 +592,58 @@ def test_reindexing(self): # then return a Categorical cats = list("cabe") - result = self.df2.reindex(Categorical(["a", "d"], categories=cats)) + result = df.reindex(Categorical(["a", "e"], categories=cats)) expected 
= DataFrame( - {"A": [0, 1, 5, np.nan], "B": Series(list("aaad")).astype(CDT(cats))} + {"A": [0, np.nan], "B": Series(list("ae")).astype(CDT(cats))} ).set_index("B") assert_frame_equal(result, expected, check_index_type=True) - result = self.df2.reindex(Categorical(["a"], categories=cats)) + result = df.reindex(Categorical(["a"], categories=cats)) expected = DataFrame( - {"A": [0, 1, 5], "B": Series(list("aaa")).astype(CDT(cats))} + {"A": [0], "B": Series(list("a")).astype(CDT(cats))} ).set_index("B") assert_frame_equal(result, expected, check_index_type=True) - result = self.df2.reindex(["a", "b", "e"]) - expected = DataFrame( - {"A": [0, 1, 5, 2, 3, np.nan], "B": Series(list("aaabbe"))} - ).set_index("B") + result = df.reindex(["a", "b", "e"]) + expected = DataFrame({"A": [0, 1, np.nan], "B": Series(list("abe"))}).set_index( + "B" + ) assert_frame_equal(result, expected, check_index_type=True) - result = self.df2.reindex(["a", "b"]) - expected = DataFrame( - {"A": [0, 1, 5, 2, 3], "B": Series(list("aaabb"))} - ).set_index("B") + result = df.reindex(["a", "b"]) + expected = DataFrame({"A": [0, 1], "B": Series(list("ab"))}).set_index("B") assert_frame_equal(result, expected, check_index_type=True) - result = self.df2.reindex(["e"]) + result = df.reindex(["e"]) expected = DataFrame({"A": [np.nan], "B": Series(["e"])}).set_index("B") assert_frame_equal(result, expected, check_index_type=True) # give back the type of categorical that we received - result = self.df2.reindex( - Categorical(["a", "d"], categories=cats, ordered=True) - ) + result = df.reindex(Categorical(["a", "e"], categories=cats, ordered=True)) expected = DataFrame( - { - "A": [0, 1, 5, np.nan], - "B": Series(list("aaad")).astype(CDT(cats, ordered=True)), - } + {"A": [0, np.nan], "B": Series(list("ae")).astype(CDT(cats, ordered=True))} ).set_index("B") assert_frame_equal(result, expected, check_index_type=True) - result = self.df2.reindex(Categorical(["a", "d"], categories=["a", "d"])) + result = 
df.reindex(Categorical(["a", "d"], categories=["a", "d"])) expected = DataFrame( - {"A": [0, 1, 5, np.nan], "B": Series(list("aaad")).astype(CDT(["a", "d"]))} + {"A": [0, np.nan], "B": Series(list("ad")).astype(CDT(["a", "d"]))} ).set_index("B") assert_frame_equal(result, expected, check_index_type=True) # passed duplicate indexers are not allowed - msg = "cannot reindex with a non-unique indexer" + msg = "cannot reindex from a duplicate axis" with pytest.raises(ValueError, match=msg): - self.df2.reindex(["a", "a"]) + self.df2.reindex(["a", "b"]) # args NotImplemented ATM msg = r"argument {} is not implemented for CategoricalIndex\.reindex" with pytest.raises(NotImplementedError, match=msg.format("method")): - self.df2.reindex(["a"], method="ffill") + df.reindex(["a"], method="ffill") with pytest.raises(NotImplementedError, match=msg.format("level")): - self.df2.reindex(["a"], level=1) + df.reindex(["a"], level=1) with pytest.raises(NotImplementedError, match=msg.format("limit")): - self.df2.reindex(["a"], limit=2) + df.reindex(["a"], limit=2) def test_loc_slice(self): # slicing diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index 467f2c177850a..6bfcc02ca633a 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -7,7 +7,6 @@ import pandas as pd from pandas import Categorical, DataFrame, Index, Series, bdate_range, date_range, isna from pandas.core import ops -from pandas.core.indexes.base import InvalidIndexError import pandas.core.nanops as nanops import pandas.util.testing as tm from pandas.util.testing import ( @@ -282,13 +281,27 @@ def test_logical_ops_with_index(self, op): result = op(ser, idx2) assert_series_equal(result, expected) + def test_reversed_xor_with_index_returns_index(self): + # GH#22092, GH#19792 + ser = Series([True, True, False, False]) + idx1 = Index([True, False, True, False]) + idx2 = Index([1, 0, 1, 0]) + + expected = Index.symmetric_difference(idx1, 
ser) + result = idx1 ^ ser + assert_index_equal(result, expected) + + expected = Index.symmetric_difference(idx2, ser) + result = idx2 ^ ser + assert_index_equal(result, expected) + @pytest.mark.parametrize( "op", [ pytest.param( ops.rand_, marks=pytest.mark.xfail( - reason="GH#22092 Index implementation returns Index", + reason="GH#22092 Index __and__ returns Index intersection", raises=AssertionError, strict=True, ), @@ -296,30 +309,26 @@ def test_logical_ops_with_index(self, op): pytest.param( ops.ror_, marks=pytest.mark.xfail( - reason="Index.get_indexer with non unique index", - raises=InvalidIndexError, + reason="GH#22092 Index __or__ returns Index union", + raises=AssertionError, strict=True, ), ), - ops.rxor, ], ) - def test_reversed_logical_ops_with_index(self, op): + def test_reversed_logical_op_with_index_returns_series(self, op): # GH#22092, GH#19792 ser = Series([True, True, False, False]) idx1 = Index([True, False, True, False]) idx2 = Index([1, 0, 1, 0]) - # symmetric_difference is only for rxor, but other 2 should fail - expected = idx1.symmetric_difference(ser) - + expected = pd.Series(op(idx1.values, ser.values)) result = op(ser, idx1) - assert_index_equal(result, expected) - - expected = idx2.symmetric_difference(ser) + assert_series_equal(result, expected) + expected = pd.Series(op(idx2.values, ser.values)) result = op(ser, idx2) - assert_index_equal(result, expected) + assert_series_equal(result, expected) @pytest.mark.parametrize( "op, expected", diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 483122a0eeaba..1f19f58e80f26 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -1009,6 +1009,12 @@ def test_bool_indexing(self, indexer_klass, indexer): s = pd.Series(idx) tm.assert_series_equal(s[indexer_klass(indexer)], s.iloc[exp_idx]) + def test_get_indexer_non_unique_dtype_mismatch(self): + # GH 25459 + indexes, missing = pd.Index(["A", "B"]).get_indexer_non_unique(pd.Index([0])) + 
tm.assert_numpy_array_equal(np.array([-1], dtype=np.intp), indexes) + tm.assert_numpy_array_equal(np.array([0], dtype=np.int64), missing) + class TestTranspose(Ops): errmsg = "the 'axes' parameter is not supported" diff --git a/pandas/util/testing.py b/pandas/util/testing.py index c8b41a87baa9d..4cf2776f5aa7c 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -1600,7 +1600,9 @@ def makeUnicodeIndex(k=10, name=None): def makeCategoricalIndex(k=10, n=3, name=None, **kwargs): """ make a length k index or n categories """ x = rands_array(nchars=4, size=n) - return CategoricalIndex(np.random.choice(x, k), name=name, **kwargs) + return CategoricalIndex( + Categorical.from_codes(np.arange(k) % n, categories=x), name=name, **kwargs + ) def makeIntervalIndex(k=10, name=None, **kwargs):