Skip to content

Commit

Permalink
ENH: union_categorical supports identical categories with ordered
Browse files Browse the repository at this point in the history
xref #13410, #13524

Author: sinhrks <[email protected]>

Closes #13763 from sinhrks/union_categoricals_ordered and squashes the following commits:

9cadc4e [sinhrks] ENH: union_categorical supports identical categories with ordered
  • Loading branch information
sinhrks authored and jreback committed Jul 29, 2016
1 parent 54b2777 commit 59f2557
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 15 deletions.
7 changes: 4 additions & 3 deletions doc/source/categorical.rst
Original file line number Diff line number Diff line change
Expand Up @@ -669,9 +669,10 @@ will be the union of the categories being combined.
.. note::

`union_categoricals` only works with unordered categoricals
and will raise if any are ordered.

Apart from the "easy" case of combining two categoricals that share the same
categories and order information (e.g. what you could also ``append`` for),
``union_categoricals`` only works with unordered categoricals and will
raise if any of them are ordered.

Getting Data In/Out
-------------------
Expand Down
61 changes: 53 additions & 8 deletions pandas/tools/tests/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -872,23 +872,26 @@ def test_union_categorical(self):
# new categories ordered by appearance
s = Categorical(['x', 'y', 'z'])
s2 = Categorical(['a', 'b', 'c'])
result = union_categoricals([s, s2]).categories
expected = Index(['x', 'y', 'z', 'a', 'b', 'c'])
tm.assert_index_equal(result, expected)
result = union_categoricals([s, s2])
expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'],
categories=['x', 'y', 'z', 'a', 'b', 'c'])
tm.assert_categorical_equal(result, expected)

# can't be ordered
s = Categorical([0, 1.2, 2], ordered=True)
s2 = Categorical([0, 1.2, 2], ordered=True)
with tm.assertRaises(TypeError):
union_categoricals([s, s2])
result = union_categoricals([s, s2])
expected = Categorical([0, 1.2, 2, 0, 1.2, 2], ordered=True)
tm.assert_categorical_equal(result, expected)

# must exactly match types
s = Categorical([0, 1.2, 2])
s2 = Categorical([2, 3, 4])
with tm.assertRaises(TypeError):
msg = 'dtype of categories must be the same'
with tm.assertRaisesRegexp(TypeError, msg):
union_categoricals([s, s2])

with tm.assertRaises(ValueError):
msg = 'No Categoricals to union'
with tm.assertRaisesRegexp(ValueError, msg):
union_categoricals([])

def test_union_categoricals_nan(self):
Expand Down Expand Up @@ -944,6 +947,48 @@ def test_union_categoricals_empty(self):
pd.Categorical([])])
tm.assert_categorical_equal(res, nanc)

def test_union_categorical_same_category(self):
    """Union inputs that already share identical categories (fastpath)."""
    # numeric categories; the second input carries a missing value
    numeric_cats = [1, 2, 3, 4]
    left = Categorical([1, 2, 3, 4], categories=numeric_cats)
    right = Categorical([3, 2, 1, np.nan], categories=numeric_cats)
    expected = Categorical([1, 2, 3, 4, 3, 2, 1, np.nan],
                           categories=numeric_cats)
    tm.assert_categorical_equal(union_categoricals([left, right]), expected)

    # string categories; 'y' is never observed but must stay a category
    string_cats = ['x', 'y', 'z']
    left = Categorical(['z', 'z', 'z'], categories=string_cats)
    right = Categorical(['x', 'x', 'x'], categories=string_cats)
    expected = Categorical(['z', 'z', 'z', 'x', 'x', 'x'],
                           categories=string_cats)
    tm.assert_categorical_equal(union_categoricals([left, right]), expected)

def test_union_categoricals_ordered(self):
    """Exercise union_categoricals with ordered Categorical inputs."""
    ordered = Categorical([1, 2, 3], ordered=True)
    unordered = Categorical([1, 2, 3], ordered=False)

    # mixing an ordered input with an unordered one is rejected
    ordered_mismatch = 'Categorical.ordered must be the same'
    with tm.assertRaisesRegexp(TypeError, ordered_mismatch):
        union_categoricals([ordered, unordered])

    # an ordered input unioned with itself keeps ordered=True
    expected = Categorical([1, 2, 3, 1, 2, 3], ordered=True)
    tm.assert_categorical_equal(union_categoricals([ordered, ordered]),
                                expected)

    # same ordered categories; one input has NaN, the other omits a value
    with_nan = Categorical([1, 2, 3, np.nan], ordered=True)
    partial = Categorical([3, 2], categories=[1, 2, 3], ordered=True)

    expected = Categorical([1, 2, 3, np.nan, 3, 2], ordered=True)
    tm.assert_categorical_equal(union_categoricals([with_nan, partial]),
                                expected)

    # both ordered, but the category order differs -> rejected
    ascending = Categorical([1, 2, 3], ordered=True)
    descending = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True)

    categories_mismatch = ("to union ordered Categoricals, "
                           "all categories must be the same")
    with tm.assertRaisesRegexp(TypeError, categories_mismatch):
        union_categoricals([ascending, descending])

def test_concat_bug_1719(self):
ts1 = tm.makeTimeSeries()
ts2 = tm.makeTimeSeries()[::2]
Expand Down
23 changes: 19 additions & 4 deletions pandas/types/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,8 +231,9 @@ def union_categoricals(to_union):
Raises
------
TypeError
If any of the categoricals are ordered or all do not
have the same dtype
- all inputs do not have the same dtype
- all inputs do not have the same ordered property
- all inputs are ordered and their categories are not identical
ValueError
Empty list of categoricals passed
"""
Expand All @@ -242,13 +243,27 @@ def union_categoricals(to_union):
raise ValueError('No Categoricals to union')

first = to_union[0]
if any(c.ordered for c in to_union):
raise TypeError("Can only combine unordered Categoricals")

if not all(is_dtype_equal(c.categories.dtype, first.categories.dtype)
for c in to_union):
raise TypeError("dtype of categories must be the same")

if all(first.is_dtype_equal(other) for other in to_union[1:]):
return Categorical(np.concatenate([c.codes for c in to_union]),
categories=first.categories, ordered=first.ordered,
fastpath=True)
elif all(not c.ordered for c in to_union):
# not ordered
pass
else:
# to show a proper error message
if all(c.ordered for c in to_union):
msg = ("to union ordered Categoricals, "
"all categories must be the same")
raise TypeError(msg)
else:
raise TypeError('Categorical.ordered must be the same')

cats = first.categories
unique_cats = cats.append([c.categories for c in to_union[1:]]).unique()
categories = Index(unique_cats)
Expand Down

0 comments on commit 59f2557

Please sign in to comment.