From 4be579ef70284867148b8e73c18e88511665787c Mon Sep 17 00:00:00 2001 From: Max Bolingbroke Date: Mon, 2 Sep 2019 22:59:17 +0100 Subject: [PATCH 01/22] BUG: CategoricalIndex allowed reindexing duplicate sources, but not duplicate targets: this is the wrong way around --- doc/source/whatsnew/v1.0.0.rst | 2 +- pandas/core/indexes/category.py | 8 --- pandas/tests/indexes/test_category.py | 51 +++++++++------ pandas/tests/indexing/test_categorical.py | 77 ++++++++++++----------- 4 files changed, 75 insertions(+), 63 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index a3d75d69e1e82..e6dbc22c1a859 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -165,7 +165,7 @@ Categorical - Added test to assert the :func:`fillna` raises the correct ValueError message when the value isn't a value from categories (:issue:`13628`) - Bug in :meth:`Categorical.astype` where ``NaN`` values were handled incorrectly when casting to int (:issue:`28406`) -- +- For :class:`CategoricalIndex`, `DataFrame.reindex` would fail when the targets contained duplicates, and wouldn't fail if the source contained duplicates (:issue:`28107`) - diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index c4321c993e638..c73e9af7d73b8 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -551,10 +551,6 @@ def get_value(self, series: AnyArrayLike, key: Any): # we might be a positional inexer return super().get_value(series, key) - def _can_reindex(self, indexer): - """ always allow reindexing """ - pass - @Appender(_index_shared_docs["where"]) def where(self, cond, other=None): # TODO: Investigate an alternative implementation with @@ -579,7 +575,6 @@ def reindex(self, target, method=None, level=None, limit=None, tolerance=None): Indices of output values in original index """ - if method is not None: raise NotImplementedError( "argument method is not implemented for CategoricalIndex.reindex" @@ -599,9 +594,6 @@ def reindex(self, target, method=None, level=None, limit=None, tolerance=None): indexer = None missing = [] else: - if not target.is_unique: - raise ValueError("cannot reindex with a non-unique indexer") - indexer, missing = self.get_indexer_non_unique(np.array(target)) if len(self.codes) and indexer is not None: diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 67bf9bd20e716..e496047b399c7 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -574,41 +574,56 @@ def test_reindexing(self): tm.assert_numpy_array_equal(expected, actual) def test_reindex_dtype(self): - c = CategoricalIndex(["a", "b", "c", "a"]) + c = CategoricalIndex(["a", "b", "c"]) res, indexer = c.reindex(["a", "c"]) - tm.assert_index_equal(res, Index(["a", "a", "c"]), exact=True) - tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp)) + tm.assert_index_equal(res, Index(["a", "c"]), exact=True) + tm.assert_numpy_array_equal(indexer, np.array([0, 2], dtype=np.intp)) - c = CategoricalIndex(["a", "b", "c", "a"]) + c = CategoricalIndex(["a", "b", "c"]) res, indexer = c.reindex(Categorical(["a", "c"])) - exp = CategoricalIndex(["a", "a", "c"], categories=["a", "c"]) + exp = CategoricalIndex(["a", "c"], categories=["a", "c"]) tm.assert_index_equal(res, exp, exact=True) - tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp)) + tm.assert_numpy_array_equal(indexer, np.array([0, 2], dtype=np.intp)) - c = CategoricalIndex(["a", "b", "c", "a"], categories=["a", "b", "c", "d"]) + c = CategoricalIndex(["a", "b", "c"], categories=["a", "b", "c", "d"]) res, indexer = c.reindex(["a", "c"]) - exp = Index(["a", "a", "c"], dtype="object") + exp = Index(["a", "c"], dtype="object") tm.assert_index_equal(res, exp, exact=True) - tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp)) + tm.assert_numpy_array_equal(indexer, np.array([0, 2], dtype=np.intp)) - c = CategoricalIndex(["a", "b", "c", "a"], categories=["a", "b", "c", "d"]) + c = CategoricalIndex(["a", "b", "c"], categories=["a", "b", "c", "d"]) res, indexer = c.reindex(Categorical(["a", "c"])) - exp = CategoricalIndex(["a", "a", "c"], categories=["a", "c"]) + exp = CategoricalIndex(["a", "c"], categories=["a", "c"]) tm.assert_index_equal(res, exp, exact=True) - tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp)) + tm.assert_numpy_array_equal(indexer, np.array([0, 2], dtype=np.intp)) - def test_reindex_duplicate_target(self): + def test_reindex_duplicate_source(self): # See GH23963 c = CategoricalIndex(["a", "b", "c", "a"], categories=["a", "b", "c", "d"]) - with pytest.raises(ValueError, match="non-unique indexer"): - c.reindex(["a", "a", "c"]) + with pytest.raises(ValueError, match="duplicate axis"): + c._can_reindex(["a", "c"]) - with pytest.raises(ValueError, match="non-unique indexer"): - c.reindex( - CategoricalIndex(["a", "a", "c"], categories=["a", "b", "c", "d"]) + with pytest.raises(ValueError, match="duplicate axis"): + c._can_reindex( + CategoricalIndex(["a", "c"], categories=["a", "b", "c", "d"]) ) + def test_reindex_duplicate_target(self): + # See GH25459 + c = CategoricalIndex(["a", "b", "c"], categories=["a", "b", "c", "d"]) + res, indexer = c.reindex(["a", "c", "c"]) + exp = Index(["a", "c", "c"], dtype="object") + tm.assert_index_equal(res, exp, exact=True) + tm.assert_numpy_array_equal(indexer, np.array([0, 2, 2], dtype=np.intp)) + + res, indexer = c.reindex( + CategoricalIndex(["a", "c", "c"], categories=["a", "b", "c", "d"]) + ) + exp = CategoricalIndex(["a", "c", "c"], categories=["a", "b", "c", "d"]) + tm.assert_index_equal(res, exp, exact=True) + tm.assert_numpy_array_equal(indexer, np.array([0, 2, 2], dtype=np.intp)) + def test_reindex_empty_index(self): # See GH16770 c = CategoricalIndex([]) diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index c365c985eb4b6..29e03246a5750 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -46,6 +46,18 @@ def setup_method(self, method): "B": (Series([1, 1, 2, 1, 3, 2]).astype(CDT([3, 2, 1], ordered=False))), } ).set_index("B") + self.df5 = DataFrame( + { + "A": np.arange(3, dtype="int64"), + "B": Series(list("abc")).astype(CDT(list("cabe"))), + } + ).set_index("B") + self.df6 = DataFrame( + { + "A": np.arange(3, dtype="int64"), + "B": (Series([1, 3, 2]).astype(CDT([3, 2, 1], ordered=False))), + } + ).set_index("B") def test_loc_scalar(self): result = self.df.loc["a"] @@ -564,23 +576,21 @@ def test_reindexing(self): # reindexing # convert to a regular index - result = self.df2.reindex(["a", "b", "e"]) - expected = DataFrame( - {"A": [0, 1, 5, 2, 3, np.nan], "B": Series(list("aaabbe"))} - ).set_index("B") + result = self.df5.reindex(["a", "b", "e"]) + expected = DataFrame({"A": [0, 1, np.nan], "B": Series(list("abe"))}).set_index( + "B" + ) assert_frame_equal(result, expected, check_index_type=True) - result = self.df2.reindex(["a", "b"]) - expected = DataFrame( - {"A": [0, 1, 5, 2, 3], "B": Series(list("aaabb"))} - ).set_index("B") + result = self.df5.reindex(["a", "b"]) + expected = DataFrame({"A": [0, 1], "B": Series(list("ab"))}).set_index("B") assert_frame_equal(result, expected, check_index_type=True) - result = self.df2.reindex(["e"]) + result = self.df5.reindex(["e"]) expected = DataFrame({"A": [np.nan], "B": Series(["e"])}).set_index("B") assert_frame_equal(result, expected, check_index_type=True) - result = self.df2.reindex(["d"]) + result = self.df5.reindex(["d"]) expected = DataFrame({"A": [np.nan], "B": Series(["d"])}).set_index("B") assert_frame_equal(result, expected, check_index_type=True) @@ -588,65 +598,60 @@ def test_reindexing(self): # then return a Categorical cats = list("cabe") - result = self.df2.reindex(Categorical(["a", "d"], categories=cats)) + result = self.df5.reindex(Categorical(["a", "e"], categories=cats)) expected = DataFrame( - {"A": [0, 1, 5, np.nan], "B": Series(list("aaad")).astype(CDT(cats))} + {"A": [0, np.nan], "B": Series(list("ae")).astype(CDT(cats))} ).set_index("B") assert_frame_equal(result, expected, check_index_type=True) - result = self.df2.reindex(Categorical(["a"], categories=cats)) + result = self.df5.reindex(Categorical(["a"], categories=cats)) expected = DataFrame( - {"A": [0, 1, 5], "B": Series(list("aaa")).astype(CDT(cats))} + {"A": [0], "B": Series(list("a")).astype(CDT(cats))} ).set_index("B") assert_frame_equal(result, expected, check_index_type=True) - result = self.df2.reindex(["a", "b", "e"]) - expected = DataFrame( - {"A": [0, 1, 5, 2, 3, np.nan], "B": Series(list("aaabbe"))} - ).set_index("B") + result = self.df5.reindex(["a", "b", "e"]) + expected = DataFrame({"A": [0, 1, np.nan], "B": Series(list("abe"))}).set_index( + "B" + ) assert_frame_equal(result, expected, check_index_type=True) - result = self.df2.reindex(["a", "b"]) - expected = DataFrame( - {"A": [0, 1, 5, 2, 3], "B": Series(list("aaabb"))} - ).set_index("B") + result = self.df5.reindex(["a", "b"]) + expected = DataFrame({"A": [0, 1], "B": Series(list("ab"))}).set_index("B") assert_frame_equal(result, expected, check_index_type=True) - result = self.df2.reindex(["e"]) + result = self.df5.reindex(["e"]) expected = DataFrame({"A": [np.nan], "B": Series(["e"])}).set_index("B") assert_frame_equal(result, expected, check_index_type=True) # give back the type of categorical that we received - result = self.df2.reindex( - Categorical(["a", "d"], categories=cats, ordered=True) + result = self.df5.reindex( + Categorical(["a", "e"], categories=cats, ordered=True) ) expected = DataFrame( - { - "A": [0, 1, 5, np.nan], - "B": Series(list("aaad")).astype(CDT(cats, ordered=True)), - } + {"A": [0, np.nan], "B": Series(list("ae")).astype(CDT(cats, ordered=True))} ).set_index("B") assert_frame_equal(result, expected, check_index_type=True) - result = self.df2.reindex(Categorical(["a", "d"], categories=["a", "d"])) + result = self.df5.reindex(Categorical(["a", "d"], categories=["a", "d"])) expected = DataFrame( - {"A": [0, 1, 5, np.nan], "B": Series(list("aaad")).astype(CDT(["a", "d"]))} + {"A": [0, np.nan], "B": Series(list("ad")).astype(CDT(["a", "d"]))} ).set_index("B") assert_frame_equal(result, expected, check_index_type=True) # passed duplicate indexers are not allowed - msg = "cannot reindex with a non-unique indexer" + msg = "cannot reindex from a duplicate axis" with pytest.raises(ValueError, match=msg): - self.df2.reindex(["a", "a"]) + self.df2.reindex(["a", "b"]) # args NotImplemented ATM msg = r"argument {} is not implemented for CategoricalIndex\.reindex" with pytest.raises(NotImplementedError, match=msg.format("method")): - self.df2.reindex(["a"], method="ffill") + self.df5.reindex(["a"], method="ffill") with pytest.raises(NotImplementedError, match=msg.format("level")): - self.df2.reindex(["a"], level=1) + self.df5.reindex(["a"], level=1) with pytest.raises(NotImplementedError, match=msg.format("limit")): - self.df2.reindex(["a"], limit=2) + self.df5.reindex(["a"], limit=2) def test_loc_slice(self): # slicing From cca2565f4b71b08ada9dde32ca8cc704dd9cfcb2 Mon Sep 17 00:00:00 2001 From: Max Bolingbroke Date: Tue, 3 Sep 2019 19:05:16 +0100 Subject: [PATCH 02/22] Restore original CategoricalIndex.reindex test --- pandas/tests/indexes/test_category.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index e496047b399c7..f3743a579a049 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -574,29 +574,29 @@ def test_reindexing(self): tm.assert_numpy_array_equal(expected, actual) def test_reindex_dtype(self): - c = CategoricalIndex(["a", "b", "c"]) + c = CategoricalIndex(["a", "b", "c", "a"]) res, indexer = c.reindex(["a", "c"]) - tm.assert_index_equal(res, Index(["a", "c"]), exact=True) - tm.assert_numpy_array_equal(indexer, np.array([0, 2], dtype=np.intp)) + tm.assert_index_equal(res, Index(["a", "a", "c"]), exact=True) + tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp)) - c = CategoricalIndex(["a", "b", "c"]) + c = CategoricalIndex(["a", "b", "c", "a"]) res, indexer = c.reindex(Categorical(["a", "c"])) - exp = CategoricalIndex(["a", "c"], categories=["a", "c"]) + exp = CategoricalIndex(["a", "a", "c"], categories=["a", "c"]) tm.assert_index_equal(res, exp, exact=True) - tm.assert_numpy_array_equal(indexer, np.array([0, 2], dtype=np.intp)) + tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp)) - c = CategoricalIndex(["a", "b", "c"], categories=["a", "b", "c", "d"]) + c = CategoricalIndex(["a", "b", "c", "a"], categories=["a", "b", "c", "d"]) res, indexer = c.reindex(["a", "c"]) - exp = Index(["a", "c"], dtype="object") + exp = Index(["a", "a", "c"], dtype="object") tm.assert_index_equal(res, exp, exact=True) - tm.assert_numpy_array_equal(indexer, np.array([0, 2], dtype=np.intp)) + tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp)) - c = CategoricalIndex(["a", "b", "c"], categories=["a", "b", "c", "d"]) + c = CategoricalIndex(["a", "b", "c", "a"], categories=["a", "b", "c", "d"]) res, indexer = c.reindex(Categorical(["a", "c"])) - exp = CategoricalIndex(["a", "c"], categories=["a", "c"]) + exp = CategoricalIndex(["a", "a", "c"], categories=["a", "c"]) tm.assert_index_equal(res, exp, exact=True) - tm.assert_numpy_array_equal(indexer, np.array([0, 2], dtype=np.intp)) + tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp)) def test_reindex_duplicate_source(self): # See GH23963 From 9fa3ed3696fa24def41c1608f749b1a8fcda2d6b Mon Sep 17 00:00:00 2001 From: Max Bolingbroke Date: Tue, 3 Sep 2019 19:05:40 +0100 Subject: [PATCH 03/22] Fix buggy code in doc, pytables test --- doc/source/user_guide/advanced.rst | 8 ++++---- pandas/util/testing.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst index 62a9b6396404a..5c38e2ec2bb3f 100644 --- a/doc/source/user_guide/advanced.rst +++ b/doc/source/user_guide/advanced.rst @@ -783,10 +783,10 @@ values **not** in the categories, similarly to how you can reindex **any** panda .. ipython:: python - df2.reindex(['a', 'e']) - df2.reindex(['a', 'e']).index - df2.reindex(pd.Categorical(['a', 'e'], categories=list('abcde'))) - df2.reindex(pd.Categorical(['a', 'e'], categories=list('abcde'))).index + df2.iloc[[0]].reindex(['a', 'e']) + df2.iloc[[0]].reindex(['a', 'e']).index + df2.iloc[[0]].reindex(pd.Categorical(['a', 'e'], categories=list('abcde'))) + df2.iloc[[0]].reindex(pd.Categorical(['a', 'e'], categories=list('abcde'))).index .. warning:: diff --git a/pandas/util/testing.py b/pandas/util/testing.py index aee58f808d9e6..005cad03e6e6b 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -1597,7 +1597,7 @@ def makeUnicodeIndex(k=10, name=None): def makeCategoricalIndex(k=10, n=3, name=None, **kwargs): """ make a length k index or n categories """ x = rands_array(nchars=4, size=n) - return CategoricalIndex(np.random.choice(x, k), name=name, **kwargs) + return CategoricalIndex(Categorical.from_codes(np.arange(k) % n, categories=x), name=name, **kwargs) def makeIntervalIndex(k=10, name=None, **kwargs): From 1c480c232b1b098b85c3a2de758e1e998f031b50 Mon Sep 17 00:00:00 2001 From: Max Bolingbroke Date: Tue, 3 Sep 2019 19:12:37 +0100 Subject: [PATCH 04/22] Fix formatting --- pandas/util/testing.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 005cad03e6e6b..7045d3ecfa7c9 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -1597,7 +1597,10 @@ def makeUnicodeIndex(k=10, name=None): def makeCategoricalIndex(k=10, n=3, name=None, **kwargs): """ make a length k index or n categories """ x = rands_array(nchars=4, size=n) - return CategoricalIndex(Categorical.from_codes(np.arange(k) % n, categories=x), name=name, **kwargs) + return CategoricalIndex( + Categorical.from_codes(np.arange(k) % n, categories=x), + name=name, **kwargs + ) def makeIntervalIndex(k=10, name=None, **kwargs): From 3162ce5c4cbde2bc27d323dfb5d4a6fa3c931460 Mon Sep 17 00:00:00 2001 From: Max Bolingbroke Date: Tue, 3 Sep 2019 21:48:28 +0100 Subject: [PATCH 05/22] Make docs shorter to shut up linter --- doc/source/user_guide/advanced.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst index 5c38e2ec2bb3f..6c8e6dcbb1db9 100644 --- a/doc/source/user_guide/advanced.rst +++ b/doc/source/user_guide/advanced.rst @@ -785,8 +785,8 @@ values **not** in the categories, similarly to how you can reindex **any** panda df2.iloc[[0]].reindex(['a', 'e']) df2.iloc[[0]].reindex(['a', 'e']).index - df2.iloc[[0]].reindex(pd.Categorical(['a', 'e'], categories=list('abcde'))) - df2.iloc[[0]].reindex(pd.Categorical(['a', 'e'], categories=list('abcde'))).index + df2.iloc[[0]].reindex(pd.Categorical(['a', 'e'], categories=list('abe'))) + df2.iloc[[0]].reindex(pd.Categorical(['a', 'e'], categories=list('abe'))).index .. warning:: From 6ff11059f96ffaf456967446a3608f834b9917e3 Mon Sep 17 00:00:00 2001 From: Max Bolingbroke Date: Tue, 3 Sep 2019 21:49:30 +0100 Subject: [PATCH 06/22] Fix Index.union and get_indexer_non_unique bugs exposed by my categorical index fixes --- pandas/_libs/index.pyx | 18 +++++++++++++----- pandas/core/indexes/base.py | 10 ++++++++-- pandas/tests/test_base.py | 5 +++++ 3 files changed, 26 insertions(+), 7 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 979dad6db0838..6d2f98ab6640e 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -304,12 +304,20 @@ cdef class IndexEngine: if stargets and len(stargets) < 5 and self.is_monotonic_increasing: # if there are few enough stargets and the index is monotonically # increasing, then use binary search for each starget + remaining_stargets = set() for starget in stargets: - start = values.searchsorted(starget, side='left') - end = values.searchsorted(starget, side='right') - if start != end: - d[starget] = list(range(start, end)) - else: + try: + start = values.searchsorted(starget, side='left') + end = values.searchsorted(starget, side='right') + except TypeError: # e.g. if we tried to search for string in int array + remaining_stargets.add(starget) + else: + if start != end: + d[starget] = list(range(start, end)) + + stargets = remaining_stargets + + if stargets: # otherwise, map by iterating through all items in the index for i in range(n): val = values[i] diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 62662edb692a7..2967e51a285de 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2489,8 +2489,14 @@ def _union(self, other, sort): value_set = set(lvals) result.extend([x for x in rvals if x not in value_set]) else: - indexer = self.get_indexer(other) - indexer, = (indexer == -1).nonzero() + # find indexes of things in "other" that are not in "self" + try: + indexer = self.get_indexer(other) + except InvalidIndexError: + # duplicates + indexer = algos.unique1d(self.get_indexer_non_unique(other)[1]) + else: + indexer, = (indexer == -1).nonzero() if len(indexer) > 0: other_diff = algos.take_nd(rvals, indexer, allow_fill=False) diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index c760c75e44f6b..8e57ec66f77c9 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -990,6 +990,11 @@ def test_bool_indexing(self, indexer_klass, indexer): s = pd.Series(idx) tm.assert_series_equal(s[indexer_klass(indexer)], s.iloc[exp_idx]) + def test_get_indexer_non_unique_dtype_mismatch(self): + indexes, missing = pd.Index(['A', 'B']).get_indexer_non_unique(pd.Index([0])) + tm.assert_numpy_array_equal(np.array([], dtype=np.intp), indexes) + tm.assert_numpy_array_equal(np.array([0, 1], dtype=np.intp), missing) + class TestTranspose(Ops): errmsg = "the 'axes' parameter is not supported" From 3051ce5bf5e684e65575de682708df405fdb0b13 Mon Sep 17 00:00:00 2001 From: Max Bolingbroke Date: Tue, 3 Sep 2019 22:34:38 +0100 Subject: [PATCH 07/22] Small fixes --- pandas/_libs/index.pyx | 4 ++-- pandas/tests/test_base.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 6d2f98ab6640e..0599e9c26b57c 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -309,14 +309,14 @@ cdef class IndexEngine: try: start = values.searchsorted(starget, side='left') end = values.searchsorted(starget, side='right') - except TypeError: # e.g. if we tried to search for string in int array + except TypeError: # e.g. if we tried to search for string in int array remaining_stargets.add(starget) else: if start != end: d[starget] = list(range(start, end)) stargets = remaining_stargets - + if stargets: # otherwise, map by iterating through all items in the index for i in range(n): diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 8e57ec66f77c9..47b6e572e3e0c 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -992,8 +992,8 @@ def test_bool_indexing(self, indexer_klass, indexer): def test_get_indexer_non_unique_dtype_mismatch(self): indexes, missing = pd.Index(['A', 'B']).get_indexer_non_unique(pd.Index([0])) - tm.assert_numpy_array_equal(np.array([], dtype=np.intp), indexes) - tm.assert_numpy_array_equal(np.array([0, 1], dtype=np.intp), missing) + tm.assert_numpy_array_equal(np.array([-1], dtype=np.intp), indexes) + tm.assert_numpy_array_equal(np.array([0], dtype=np.intp), missing) class TestTranspose(Ops): From 85fdd7d6c13c2a3d66f66d6543ca3ae42f56d3ac Mon Sep 17 00:00:00 2001 From: Max Bolingbroke Date: Tue, 3 Sep 2019 22:39:58 +0100 Subject: [PATCH 08/22] series | index still fails, but now only due to a dtype mismatch --- pandas/tests/series/test_operators.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index c2cf91e582c47..c8189714f7a4e 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -288,8 +288,8 @@ def test_logical_ops_with_index(self, op): pytest.param( ops.ror_, marks=pytest.mark.xfail( - reason="Index.get_indexer with non unique index", - raises=InvalidIndexError, + reason="GH#22092 Index implementation returns Index", + raises=AssertionError, strict=True, ), ), From 78faa9d5e78081987ec786b02eeeb6de49ecb884 Mon Sep 17 00:00:00 2001 From: Max Bolingbroke Date: Tue, 3 Sep 2019 23:24:37 +0100 Subject: [PATCH 09/22] Last small issues --- pandas/tests/series/test_operators.py | 1 - pandas/tests/test_base.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index c8189714f7a4e..f2bdf643f09c3 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -7,7 +7,6 @@ import pandas as pd from pandas import Categorical, DataFrame, Index, Series, bdate_range, date_range, isna from pandas.core import ops -from pandas.core.indexes.base import InvalidIndexError import pandas.core.nanops as nanops import pandas.util.testing as tm from pandas.util.testing import ( diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 47b6e572e3e0c..07c0ebef0750f 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -992,8 +992,8 @@ def test_bool_indexing(self, indexer_klass, indexer): def test_get_indexer_non_unique_dtype_mismatch(self): indexes, missing = pd.Index(['A', 'B']).get_indexer_non_unique(pd.Index([0])) - tm.assert_numpy_array_equal(np.array([-1], dtype=np.intp), indexes) - tm.assert_numpy_array_equal(np.array([0], dtype=np.intp), missing) + tm.assert_numpy_array_equal(np.array([-1], dtype=np.int64), indexes) + tm.assert_numpy_array_equal(np.array([0], dtype=np.int64), missing) class TestTranspose(Ops): From 5eaf83e6d80ed1bba0908b3f797127a3bf2c0cc8 Mon Sep 17 00:00:00 2001 From: Max Bolingbroke Date: Tue, 3 Sep 2019 23:47:26 +0100 Subject: [PATCH 10/22] More whatsnew --- doc/source/whatsnew/v1.0.0.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index e6dbc22c1a859..0980c3b640737 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -229,6 +229,9 @@ Indexing - Bug in reindexing a :meth:`PeriodIndex` with another type of index that contained a `Period` (:issue:`28323`) (:issue:`28337`) - Fix assignment of column via `.loc` with numpy non-ns datetime type (:issue:`27395`) - Bug in :meth:`Float64Index.astype` where ``np.inf`` was not handled properly when casting to an integer dtype (:issue:`28475`) +- :meth:`Index.union` could fail when the LHS contained duplicates (:issue:`28257`) +- :meth:`Index.get_indexer_non_unique` could fail with `TypeError` in some cases, such as when searching for ints in a string index (:issue:`28257`) +- Missing ^^^^^^^ From 5ea41c6da3b561527000f70844b9e2b8f008aa55 Mon Sep 17 00:00:00 2001 From: Max Bolingbroke Date: Tue, 3 Sep 2019 23:47:44 +0100 Subject: [PATCH 11/22] Blacker --- pandas/tests/test_base.py | 2 +- pandas/util/testing.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 07c0ebef0750f..af6f87f9a8348 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -991,7 +991,7 @@ def test_bool_indexing(self, indexer_klass, indexer): tm.assert_series_equal(s[indexer_klass(indexer)], s.iloc[exp_idx]) def test_get_indexer_non_unique_dtype_mismatch(self): - indexes, missing = pd.Index(['A', 'B']).get_indexer_non_unique(pd.Index([0])) + indexes, missing = pd.Index(["A", "B"]).get_indexer_non_unique(pd.Index([0])) tm.assert_numpy_array_equal(np.array([-1], dtype=np.int64), indexes) tm.assert_numpy_array_equal(np.array([0], dtype=np.int64), missing) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 7045d3ecfa7c9..af4962c9a0c55 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -1598,8 +1598,7 @@ def makeCategoricalIndex(k=10, n=3, name=None, **kwargs): """ make a length k index or n categories """ x = rands_array(nchars=4, size=n) return CategoricalIndex( - Categorical.from_codes(np.arange(k) % n, categories=x), - name=name, **kwargs + Categorical.from_codes(np.arange(k) % n, categories=x), name=name, **kwargs ) From b23e408829c1df11c3732f91ce72b4f9a72971a5 Mon Sep 17 00:00:00 2001 From: Max Bolingbroke Date: Tue, 3 Sep 2019 23:56:09 +0100 Subject: [PATCH 12/22] get_indexer_non_unique makes strange dtype choices --- pandas/tests/test_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index af6f87f9a8348..cd02c37ba83c3 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -992,7 +992,7 @@ def test_bool_indexing(self, indexer_klass, indexer): def test_get_indexer_non_unique_dtype_mismatch(self): indexes, missing = pd.Index(["A", "B"]).get_indexer_non_unique(pd.Index([0])) - tm.assert_numpy_array_equal(np.array([-1], dtype=np.int64), indexes) + tm.assert_numpy_array_equal(np.array([-1], dtype=np.intp), indexes) tm.assert_numpy_array_equal(np.array([0], dtype=np.int64), missing) From 06a6580d45f14f8cd923292912eda7a4c1188be5 Mon Sep 17 00:00:00 2001 From: Max Bolingbroke Date: Wed, 4 Sep 2019 19:30:31 +0100 Subject: [PATCH 13/22] Address some review comments --- pandas/_libs/index.pyx | 2 +- pandas/core/indexes/base.py | 8 ++-- pandas/tests/indexing/test_categorical.py | 46 ++++++++++------------- pandas/tests/series/test_operators.py | 31 ++++----------- pandas/tests/test_base.py | 1 + 5 files changed, 32 insertions(+), 56 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 0599e9c26b57c..4ce7a6f43a527 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -281,7 +281,7 @@ cdef class IndexEngine: cdef: ndarray values, x ndarray[int64_t] result, missing - set stargets + set stargets, remaining_stargets dict d = {} object val int count = 0, count_missing = 0 diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 2967e51a285de..942aa82906272 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2490,13 +2490,11 @@ def _union(self, other, sort): result.extend([x for x in rvals if x not in value_set]) else: # find indexes of things in "other" that are not in "self" - try: + if self.is_unique: indexer = self.get_indexer(other) - except InvalidIndexError: - # duplicates - indexer = algos.unique1d(self.get_indexer_non_unique(other)[1]) - else: indexer, = (indexer == -1).nonzero() + else: + indexer = algos.unique1d(self.get_indexer_non_unique(other)[1]) if len(indexer) > 0: other_diff = algos.take_nd(rvals, indexer, allow_fill=False) diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index 29e03246a5750..0d97680c37c29 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -46,18 +46,6 @@ def setup_method(self, method): "B": (Series([1, 1, 2, 1, 3, 2]).astype(CDT([3, 2, 1], ordered=False))), } ).set_index("B") - self.df5 = DataFrame( - { - "A": np.arange(3, dtype="int64"), - "B": Series(list("abc")).astype(CDT(list("cabe"))), - } - ).set_index("B") - self.df6 = DataFrame( - { - "A": np.arange(3, dtype="int64"), - "B": (Series([1, 3, 2]).astype(CDT([3, 2, 1], ordered=False))), - } - ).set_index("B") def test_loc_scalar(self): result = self.df.loc["a"] @@ -573,24 +561,30 @@ def test_read_only_source(self): assert_frame_equal(rw_df.loc[1:3], ro_df.loc[1:3]) def test_reindexing(self): + df = DataFrame( + { + "A": np.arange(3, dtype="int64"), + "B": Series(list("abc")).astype(CDT(list("cabe"))), + } + ).set_index("B") # reindexing # convert to a regular index - result = self.df5.reindex(["a", "b", "e"]) + result = df.reindex(["a", "b", "e"]) expected = DataFrame({"A": [0, 1, np.nan], "B": Series(list("abe"))}).set_index( "B" ) assert_frame_equal(result, expected, check_index_type=True) - result = self.df5.reindex(["a", "b"]) + result = df.reindex(["a", "b"]) expected = DataFrame({"A": [0, 1], "B": Series(list("ab"))}).set_index("B") assert_frame_equal(result, expected, check_index_type=True) - result = self.df5.reindex(["e"]) + result = df.reindex(["e"]) expected = DataFrame({"A": [np.nan], "B": Series(["e"])}).set_index("B") assert_frame_equal(result, expected, check_index_type=True) - result = self.df5.reindex(["d"]) + result = df.reindex(["d"]) expected = DataFrame({"A": [np.nan], "B": Series(["d"])}).set_index("B") assert_frame_equal(result, expected, check_index_type=True) @@ -598,34 +592,34 @@ def test_reindexing(self): # then return a Categorical cats = list("cabe") - result = self.df5.reindex(Categorical(["a", "e"], categories=cats)) + result = df.reindex(Categorical(["a", "e"], categories=cats)) expected = DataFrame( {"A": [0, np.nan], "B": Series(list("ae")).astype(CDT(cats))} ).set_index("B") assert_frame_equal(result, expected, check_index_type=True) - result = self.df5.reindex(Categorical(["a"], categories=cats)) + result = df.reindex(Categorical(["a"], categories=cats)) expected = DataFrame( {"A": [0], "B": Series(list("a")).astype(CDT(cats))} ).set_index("B") assert_frame_equal(result, expected, check_index_type=True) - result = self.df5.reindex(["a", "b", "e"]) + result = df.reindex(["a", "b", "e"]) expected = DataFrame({"A": [0, 1, np.nan], "B": Series(list("abe"))}).set_index( "B" ) assert_frame_equal(result, expected, check_index_type=True) - result = self.df5.reindex(["a", "b"]) + result = df.reindex(["a", "b"]) expected = DataFrame({"A": [0, 1], "B": Series(list("ab"))}).set_index("B") assert_frame_equal(result, expected, check_index_type=True) - result = self.df5.reindex(["e"]) + result = df.reindex(["e"]) expected = DataFrame({"A": [np.nan], "B": Series(["e"])}).set_index("B") assert_frame_equal(result, expected, check_index_type=True) # give back the type of categorical that we received - result = self.df5.reindex( + result = df.reindex( Categorical(["a", "e"], categories=cats, ordered=True) ) expected = DataFrame( @@ -633,7 +627,7 @@ def test_reindexing(self): ).set_index("B") assert_frame_equal(result, expected, check_index_type=True) - result = self.df5.reindex(Categorical(["a", "d"], categories=["a", "d"])) + result = df.reindex(Categorical(["a", "d"], categories=["a", "d"])) expected = DataFrame( {"A": [0, np.nan], "B": Series(list("ad")).astype(CDT(["a", "d"]))} ).set_index("B") @@ -647,11 +641,11 @@ def test_reindexing(self): # args NotImplemented ATM msg = r"argument {} is not implemented for CategoricalIndex\.reindex" with pytest.raises(NotImplementedError, match=msg.format("method")): - self.df5.reindex(["a"], method="ffill") + df.reindex(["a"], method="ffill") with pytest.raises(NotImplementedError, match=msg.format("level")): - self.df5.reindex(["a"], level=1) + df.reindex(["a"], level=1) with pytest.raises(NotImplementedError, match=msg.format("limit")): - self.df5.reindex(["a"], limit=2) + df.reindex(["a"], limit=2) def test_loc_slice(self): # slicing diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index f2bdf643f09c3..44bdae6cd6165 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -274,41 +274,24 @@ def test_logical_ops_with_index(self, op): assert_series_equal(result, expected) @pytest.mark.parametrize( - "op", + "op, index_op", [ - pytest.param( - ops.rand_, - marks=pytest.mark.xfail( - reason="GH#22092 Index implementation returns Index", - raises=AssertionError, - strict=True, - ), - ), - pytest.param( - ops.ror_, - marks=pytest.mark.xfail( - reason="GH#22092 Index implementation returns Index", - raises=AssertionError, - strict=True, - ), - ), - ops.rxor, + (ops.rand_, Index.intersection), + (ops.ror_, Index.union), + (ops.rxor, Index.symmetric_difference), ], ) - def test_reversed_logical_ops_with_index(self, op): + def test_reversed_logical_ops_with_index(self, op, index_op): # GH#22092, GH#19792 ser = Series([True, True, False, False]) idx1 = Index([True, False, True, False]) idx2 = Index([1, 0, 1, 0]) - # symmetric_difference is only for rxor, but other 2 should fail - expected = idx1.symmetric_difference(ser) - + expected = index_op(idx1, ser) result = op(ser, idx1) assert_index_equal(result, expected) - expected = idx2.symmetric_difference(ser) - + expected = index_op(idx2, ser) result = op(ser, idx2) assert_index_equal(result, expected) diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index cd02c37ba83c3..bde9c61f6f9ff 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -991,6 +991,7 @@ def test_bool_indexing(self, indexer_klass, indexer): tm.assert_series_equal(s[indexer_klass(indexer)], s.iloc[exp_idx]) def test_get_indexer_non_unique_dtype_mismatch(self): + # GH 25459 indexes, missing = pd.Index(["A", "B"]).get_indexer_non_unique(pd.Index([0])) tm.assert_numpy_array_equal(np.array([-1], dtype=np.intp), indexes) tm.assert_numpy_array_equal(np.array([0], dtype=np.int64), missing) From ad573ef13ce77ab99a17f77a914eacf647c71675 Mon Sep 17 00:00:00 2001 From: Max Bolingbroke Date: Wed, 4 Sep 2019 21:04:52 +0100 Subject: [PATCH 14/22] More blackening --- pandas/tests/indexing/test_categorical.py | 4 +--- pandas/tests/series/test_operators.py | 4 ++-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index 0d97680c37c29..005a9a24dc597 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -619,9 +619,7 @@ def test_reindexing(self): assert_frame_equal(result, expected, check_index_type=True) # give back the type of categorical that we received - result = df.reindex( - Categorical(["a", "e"], categories=cats, ordered=True) - ) + result = df.reindex(Categorical(["a", "e"], categories=cats, ordered=True)) expected = DataFrame( {"A": [0, np.nan], "B": Series(list("ae")).astype(CDT(cats, ordered=True))} ).set_index("B") diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index 44bdae6cd6165..ec2853890b3de 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -277,8 +277,8 @@ def test_logical_ops_with_index(self, op): "op, index_op", [ (ops.rand_, Index.intersection), - (ops.ror_, Index.union), - (ops.rxor, Index.symmetric_difference), + (ops.ror_, Index.union), + (ops.rxor, Index.symmetric_difference), ], ) def test_reversed_logical_ops_with_index(self, op, index_op): From e26026581308e4cb2094e22ace3c1bb7ddf912ec Mon Sep 17 00:00:00 2001 From: Max Bolingbroke Date: Tue, 10 Sep 2019 19:16:28 +0100 Subject: [PATCH 15/22] Doc changes --- doc/source/user_guide/advanced.rst | 10 ++++++---- doc/source/whatsnew/v1.0.0.rst | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst index 6c8e6dcbb1db9..5a86561fb5101 100644 --- a/doc/source/user_guide/advanced.rst +++ b/doc/source/user_guide/advanced.rst @@ -783,10 +783,12 @@ values **not** in the categories, similarly to how you can reindex **any** panda .. ipython:: python - df2.iloc[[0]].reindex(['a', 'e']) - df2.iloc[[0]].reindex(['a', 'e']).index - df2.iloc[[0]].reindex(pd.Categorical(['a', 'e'], categories=list('abe'))) - df2.iloc[[0]].reindex(pd.Categorical(['a', 'e'], categories=list('abe'))).index + df3 = pd.DataFrame({'A': np.arange(3), 'B': pd.Series(list('abc')).astype('category')}) + df3 = df3.set_index('B') + df3.reindex(['a', 'e']) + df3.reindex(['a', 'e']).index + df3.reindex(pd.Categorical(['a', 'e'], categories=list('abe'))) + df3.reindex(pd.Categorical(['a', 'e'], categories=list('abe'))).index .. warning:: diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 0980c3b640737..cdd9075eaab74 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -165,7 +165,7 @@ Categorical - Added test to assert the :func:`fillna` raises the correct ValueError message when the value isn't a value from categories (:issue:`13628`) - Bug in :meth:`Categorical.astype` where ``NaN`` values were handled incorrectly when casting to int (:issue:`28406`) -- For :class:`CategoricalIndex`, `DataFrame.reindex` would fail when the targets contained duplicates, and wouldn't fail if the source contained duplicates (:issue:`28107`) +- For :class:`CategoricalIndex`, :meth:`DataFrame.reindex` with a :class:`CategoricalIndex`, would fail when the targets contained duplicates, and wouldn't fail if the source contained duplicates (:issue:`28107`) - From 7df795a22ef7de1596bb8a0891757f5bfe8d0beb Mon Sep 17 00:00:00 2001 From: Max Bolingbroke Date: Tue, 10 Sep 2019 21:13:37 +0100 Subject: [PATCH 16/22] Docs docs --- doc/source/user_guide/advanced.rst | 3 ++- doc/source/whatsnew/v1.0.0.rst | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst index 5a86561fb5101..7b185e2faa128 100644 --- a/doc/source/user_guide/advanced.rst +++ b/doc/source/user_guide/advanced.rst @@ -783,7 +783,8 @@ values **not** in the categories, similarly to how you can reindex **any** panda .. ipython:: python - df3 = pd.DataFrame({'A': np.arange(3), 'B': pd.Series(list('abc')).astype('category')}) + df3 = pd.DataFrame({'A': np.arange(3), + 'B': pd.Series(list('abc')).astype('category')}) df3 = df3.set_index('B') df3.reindex(['a', 'e']) df3.reindex(['a', 'e']).index diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index cdd9075eaab74..25bcfb0bcb1e9 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -165,7 +165,7 @@ Categorical - Added test to assert the :func:`fillna` raises the correct ValueError message when the value isn't a value from categories (:issue:`13628`) - Bug in :meth:`Categorical.astype` where ``NaN`` values were handled incorrectly when casting to int (:issue:`28406`) -- For :class:`CategoricalIndex`, :meth:`DataFrame.reindex` with a :class:`CategoricalIndex`, would fail when the targets contained duplicates, and wouldn't fail if the source contained duplicates (:issue:`28107`) +- :meth:`DataFrame.reindex` with a :class:`CategoricalIndex` would fail when the targets contained duplicates, and wouldn't fail if the source contained duplicates (:issue:`28107`) - @@ -229,7 +229,7 @@ Indexing - Bug in reindexing a :meth:`PeriodIndex` with another type of index that contained a `Period` (:issue:`28323`) (:issue:`28337`) - Fix assignment of column via `.loc` with numpy non-ns datetime type (:issue:`27395`) - Bug in :meth:`Float64Index.astype` where ``np.inf`` was not handled properly when casting to an integer dtype (:issue:`28475`) -- :meth:`Index.union` could fail when the LHS contained duplicates (:issue:`28257`) +- :meth:`Index.union` could fail when the left contained duplicates (:issue:`28257`) - :meth:`Index.get_indexer_non_unique` could fail with `TypeError` in some cases, such as when searching for ints in a string index (:issue:`28257`) - From 5d3a861cac75bc3ecd790e63057d71f3b8c18499 Mon Sep 17 00:00:00 2001 From: Max Bolingbroke Date: Wed, 25 Sep 2019 19:42:12 +0100 Subject: [PATCH 17/22] Address review comments --- pandas/core/indexes/base.py | 2 +- pandas/tests/indexes/test_category.py | 12 +++---- pandas/tests/series/test_operators.py | 45 +++++++++++++++++++++------ 3 files changed, 43 insertions(+), 16 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 942aa82906272..3f3ba5224b8af 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2492,7 +2492,7 @@ def _union(self, other, sort): # find indexes of things in "other" that are not in "self" if self.is_unique: indexer = self.get_indexer(other) - indexer, = (indexer == -1).nonzero() + indexer = (indexer == -1).nonzero()[0] else: indexer = algos.unique1d(self.get_indexer_non_unique(other)[1]) diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index f3743a579a049..221da492ddc63 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -600,24 +600,24 @@ def test_reindex_dtype(self): def test_reindex_duplicate_source(self): # See GH23963 - c = CategoricalIndex(["a", "b", "c", "a"], categories=["a", "b", "c", "d"]) + cat = CategoricalIndex(["a", "b", "c", "a"], categories=["a", "b", "c", "d"]) with pytest.raises(ValueError, match="duplicate axis"): - c._can_reindex(["a", "c"]) + cat._can_reindex(["a", "c"]) with pytest.raises(ValueError, match="duplicate axis"): - c._can_reindex( + cat._can_reindex( CategoricalIndex(["a", "c"], categories=["a", "b", "c", "d"]) ) def test_reindex_duplicate_target(self): # See GH25459 - c = CategoricalIndex(["a", "b", "c"], categories=["a", "b", "c", "d"]) - res, indexer = c.reindex(["a", "c", "c"]) + cat = CategoricalIndex(["a", "b", "c"], categories=["a", "b", "c", "d"]) + res, indexer = cat.reindex(["a", "c", "c"]) exp = Index(["a", "c", "c"], dtype="object") tm.assert_index_equal(res, exp, exact=True) tm.assert_numpy_array_equal(indexer, np.array([0, 2, 2], dtype=np.intp)) - res, indexer = c.reindex( + res, indexer = cat.reindex( CategoricalIndex(["a", "c", "c"], categories=["a", "b", "c", "d"]) ) exp = CategoricalIndex(["a", "c", "c"], categories=["a", "b", "c", "d"]) diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index ec2853890b3de..d9b3fd27183ef 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -273,27 +273,54 @@ def test_logical_ops_with_index(self, op): result = op(ser, idx2) assert_series_equal(result, expected) + def test_reversed_xor_with_index_returns_index(self): + # GH#22092, GH#19792 + ser = Series([True, True, False, False]) + idx1 = Index([True, False, True, False]) + idx2 = Index([1, 0, 1, 0]) + + expected = Index.symmetric_difference(idx1, ser) + result = idx1 ^ ser + assert_index_equal(result, expected) + + expected = Index.symmetric_difference(idx2, ser) + result = idx2 ^ ser + assert_index_equal(result, expected) + @pytest.mark.parametrize( - "op, index_op", + "op", [ - (ops.rand_, Index.intersection), - (ops.ror_, Index.union), - (ops.rxor, Index.symmetric_difference), + pytest.param( + ops.rand_, + marks=pytest.mark.xfail( + reason="GH#22092 Index __and__ returns Index intersection", + raises=AssertionError, + strict=True, + ), + ), + pytest.param( + ops.ror_, + marks=pytest.mark.xfail( + reason="GH#22092 Index __or__ returns Index union", + raises=AssertionError, + strict=True, + ), + ), ], ) - def test_reversed_logical_ops_with_index(self, op, index_op): + def test_reversed_logical_op_with_index_returns_series(self, op): # GH#22092, GH#19792 ser = Series([True, True, False, False]) idx1 = Index([True, False, True, False]) idx2 = Index([1, 0, 1, 0]) - expected = index_op(idx1, ser) + expected = pd.Series(op(idx1.values, ser.values)) result = op(ser, idx1) - assert_index_equal(result, expected) + assert_series_equal(result, expected) - expected = index_op(idx2, ser) + expected = pd.Series(op(idx2.values, ser.values)) result = op(ser, idx2) - assert_index_equal(result, expected) + assert_series_equal(result, expected) @pytest.mark.parametrize( "op, expected", From 0ee4f892be48117707d1a22c39e902453077f7c2 Mon Sep 17 00:00:00 2001 From: Max Bolingbroke Date: Wed, 25 Sep 2019 19:49:34 +0100 Subject: [PATCH 18/22] Delete test that checks internal method --- pandas/tests/indexes/test_category.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 221da492ddc63..6ec7301c279f5 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -598,17 +598,6 @@ def test_reindex_dtype(self): tm.assert_index_equal(res, exp, exact=True) tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp)) - def test_reindex_duplicate_source(self): - # See GH23963 - cat = CategoricalIndex(["a", "b", "c", "a"], categories=["a", "b", "c", "d"]) - with pytest.raises(ValueError, match="duplicate axis"): - cat._can_reindex(["a", "c"]) - - with pytest.raises(ValueError, match="duplicate axis"): - cat._can_reindex( - CategoricalIndex(["a", "c"], categories=["a", "b", "c", "d"]) - ) - def test_reindex_duplicate_target(self): # See GH25459 cat = CategoricalIndex(["a", "b", "c"], categories=["a", "b", "c", "d"]) From f07faa3b57001d508476f42b7dacb509991de89e Mon Sep 17 00:00:00 2001 From: Max Bolingbroke Date: Wed, 25 Sep 2019 20:46:42 +0100 Subject: [PATCH 19/22] Strip trailing wspace --- pandas/tests/series/test_operators.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index d9b3fd27183ef..2b5479e322971 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -294,17 +294,17 @@ def test_reversed_xor_with_index_returns_index(self): ops.rand_, marks=pytest.mark.xfail( reason="GH#22092 Index __and__ returns Index intersection", - raises=AssertionError, - strict=True, - ), + raises=AssertionError, + strict=True, + ), ), - pytest.param( - ops.ror_, - marks=pytest.mark.xfail( + pytest.param( + ops.ror_, + marks=pytest.mark.xfail( reason="GH#22092 Index __or__ returns Index union", - raises=AssertionError, - strict=True, - ), + raises=AssertionError, + strict=True, + ), ), ], ) From 2b05a5553ae01eec6883de9a0fa4ba50b79234a2 Mon Sep 17 00:00:00 2001 From: Max Bolingbroke Date: Wed, 25 Sep 2019 20:46:54 +0100 Subject: [PATCH 20/22] Split docs into two blocks?? --- doc/source/user_guide/advanced.rst | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst index 7b185e2faa128..d8dc556b35b81 100644 --- a/doc/source/user_guide/advanced.rst +++ b/doc/source/user_guide/advanced.rst @@ -796,17 +796,15 @@ values **not** in the categories, similarly to how you can reindex **any** panda Reshaping and Comparison operations on a ``CategoricalIndex`` must have the same categories or a ``TypeError`` will be raised. - .. code-block:: ipython + .. ipython:: python - In [9]: df3 = pd.DataFrame({'A': np.arange(6), 'B': pd.Series(list('aabbca')).astype('category')}) + df3 = pd.DataFrame({'A': np.arange(6), 'B': pd.Series(list('aabbca')).astype('category')}) + df3 = df3.set_index('B') + df3.index + + .. ipython:: python - In [11]: df3 = df3.set_index('B') - - In [11]: df3.index - Out[11]: CategoricalIndex(['a', 'a', 'b', 'b', 'c', 'a'], categories=['a', 'b', 'c'], ordered=False, name='B', dtype='category') - - In [12]: pd.concat([df2, df3]) - TypeError: categories must match existing categories when appending + pd.concat([df2, df3]) .. _indexing.rangeindex: From 4ad347c6c2b1329e314aaea139b5f82ab2afd361 Mon Sep 17 00:00:00 2001 From: Max Bolingbroke Date: Wed, 25 Sep 2019 21:26:21 +0100 Subject: [PATCH 21/22] Check failures --- doc/source/user_guide/advanced.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst index d8dc556b35b81..2e27e2f7d2e73 100644 --- a/doc/source/user_guide/advanced.rst +++ b/doc/source/user_guide/advanced.rst @@ -798,10 +798,11 @@ values **not** in the categories, similarly to how you can reindex **any** panda .. ipython:: python - df3 = pd.DataFrame({'A': np.arange(6), 'B': pd.Series(list('aabbca')).astype('category')}) + df3 = pd.DataFrame({'A': np.arange(6), + 'B': pd.Series(list('aabbca')).astype('category')}) df3 = df3.set_index('B') df3.index - + .. ipython:: python pd.concat([df2, df3]) From ff13dea80cd106af3cbaee9b494e4b5ab4492a17 Mon Sep 17 00:00:00 2001 From: Max Bolingbroke Date: Thu, 26 Sep 2019 00:23:23 +0100 Subject: [PATCH 22/22] Maybe this is better docs? --- doc/source/user_guide/advanced.rst | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst index 2e27e2f7d2e73..4949dd580414f 100644 --- a/doc/source/user_guide/advanced.rst +++ b/doc/source/user_guide/advanced.rst @@ -786,6 +786,10 @@ values **not** in the categories, similarly to how you can reindex **any** panda df3 = pd.DataFrame({'A': np.arange(3), 'B': pd.Series(list('abc')).astype('category')}) df3 = df3.set_index('B') + df3 + +.. ipython:: python + df3.reindex(['a', 'e']) df3.reindex(['a', 'e']).index df3.reindex(pd.Categorical(['a', 'e'], categories=list('abe'))) @@ -798,14 +802,22 @@ values **not** in the categories, similarly to how you can reindex **any** panda .. ipython:: python - df3 = pd.DataFrame({'A': np.arange(6), - 'B': pd.Series(list('aabbca')).astype('category')}) - df3 = df3.set_index('B') - df3.index + df4 = pd.DataFrame({'A': np.arange(2), + 'B': list('ba')}) + df4['B'] = df4['B'].astype(CategoricalDtype(list('ab'))) + df4 = df4.set_index('B') + df4.index - .. ipython:: python + df5 = pd.DataFrame({'A': np.arange(2), + 'B': list('bc')}) + df5['B'] = df5['B'].astype(CategoricalDtype(list('bc'))) + df5 = df5.set_index('B') + df5.index + + .. code-block:: ipython - pd.concat([df2, df3]) + In [1]: pd.concat([df4, df5]) + TypeError: categories must match existing categories when appending .. _indexing.rangeindex: