Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

API: Add dtype parameter to Categorical.from_codes #24398

Merged
merged 10 commits into from
Jan 8, 2019
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -403,6 +403,7 @@ Other Enhancements
- :meth:`pandas.api.types.is_list_like` has gained a keyword ``allow_sets``, which is ``True`` by default; if ``False``,
instances of ``set`` will no longer be considered "list-like" (:issue:`23061`)
- :meth:`Index.to_frame` now supports overriding column name(s) (:issue:`22580`).
- :meth:`Categorical.from_codes` can now take a ``dtype`` parameter as an alternative to passing ``categories`` and ``ordered`` (:issue:`24398`).
- New attribute :attr:`__git_version__` will return git commit sha of current build (:issue:`21295`).
- Compatibility with Matplotlib 3.0 (:issue:`22790`).
- Added :meth:`Interval.overlaps`, :meth:`IntervalArray.overlaps`, and :meth:`IntervalIndex.overlaps` for determining overlaps between interval-like objects (:issue:`21998`)
Expand Down
67 changes: 39 additions & 28 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -603,13 +603,13 @@ def _from_inferred_categories(cls, inferred_categories, inferred_codes,
return cls(codes, dtype=dtype, fastpath=True)

@classmethod
def from_codes(cls, codes, categories, ordered=False):
def from_codes(cls, codes, categories=None, ordered=None, dtype=None):
jreback marked this conversation as resolved.
Show resolved Hide resolved
"""
Make a Categorical type from codes and categories arrays.
Make a Categorical type from codes and categories or dtype.

This constructor is useful if you already have codes and categories and
so do not need the (computation intensive) factorization step, which is
usually done on the constructor.
This constructor is useful if you already have codes and
categories/dtype and so do not need the (computation intensive)
factorization step, which is usually done on the constructor.

If your data does not follow this convention, please use the normal
constructor.
Expand All @@ -618,16 +618,38 @@ def from_codes(cls, codes, categories, ordered=False):
----------
codes : array-like, integers
An integer array, where each integer points to a category in
categories or -1 for NaN
categories : index-like
categories or dtype.categories, or else is -1 for NaN
TomAugspurger marked this conversation as resolved.
Show resolved Hide resolved
categories : index-like, optional
The categories for the categorical. Items need to be unique.
ordered : boolean, (default False)
Whether or not this categorical is treated as a ordered
categorical. If not given, the resulting categorical will be
unordered.
"""
dtype = CategoricalDtype._from_values_or_dtype(codes, categories,
ordered)
If the categories are not given here, then they must be provided
in `dtype`.
ordered : bool, optional
Whether or not this categorical is treated as an ordered
categorical. If not given here or in `dtype`, the resulting
categorical will be unordered.
dtype : CategoricalDtype or the string "category", optional
If :class:`CategoricalDtype`, cannot be used together with
`categories` or `ordered`.

.. versionadded:: 0.24.0

When `dtype` is provided, neither `categories` nor `ordered`
should be provided.

Examples
--------
>>> dtype = pd.CategoricalDtype(['a', 'b'], ordered=True)
>>> pd.Categorical.from_codes(codes=[0, 1, 0, 1], dtype=dtype)
[a, b, a, b]
Categories (2, object): [a < b]
"""
dtype = CategoricalDtype._from_values_or_dtype(categories=categories,
ordered=ordered,
dtype=dtype)
if dtype.categories is None:
msg = ("The categories must be provided in 'categories' or "
"'dtype'. Both were None.")
raise ValueError(msg)

codes = np.asarray(codes) # #21767
if not is_integer_dtype(codes):
Expand All @@ -642,12 +664,6 @@ def from_codes(cls, codes, categories, ordered=False):
if msg:
raise ValueError(msg)

try:
codes = coerce_indexer_dtype(codes, categories)
except (ValueError, TypeError):
raise ValueError(
"codes need to be convertible to an arrays of integers")

TomAugspurger marked this conversation as resolved.
Show resolved Hide resolved
if len(codes) and (
codes.max() >= len(dtype.categories) or codes.min() < -1):
raise ValueError("codes need to be between -1 and "
Expand Down Expand Up @@ -1265,8 +1281,7 @@ def shift(self, periods, fill_value=None):
else:
codes[periods:] = fill_value

return self.from_codes(codes, categories=self.categories,
ordered=self.ordered)
return self.from_codes(codes, dtype=self.dtype)

def __array__(self, dtype=None):
"""
Expand Down Expand Up @@ -1887,9 +1902,7 @@ def take_nd(self, indexer, allow_fill=None, fill_value=None):

codes = take(self._codes, indexer, allow_fill=allow_fill,
fill_value=fill_value)
result = type(self).from_codes(codes,
categories=dtype.categories,
ordered=dtype.ordered)
result = type(self).from_codes(codes, dtype=dtype)
return result

take = take_nd
Expand Down Expand Up @@ -2078,9 +2091,7 @@ def __setitem__(self, key, value):
new_codes = _recode_for_categories(
value.codes, value.categories, self.categories
)
value = Categorical.from_codes(new_codes,
categories=self.categories,
ordered=self.ordered)
value = Categorical.from_codes(new_codes, dtype=self.dtype)

rvalue = value if is_list_like(value) else [value]

Expand Down
3 changes: 1 addition & 2 deletions pandas/core/indexes/category.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,8 +148,7 @@ def _create_from_codes(self, codes, dtype=None, name=None):
dtype = self.dtype
if name is None:
name = self.name
cat = Categorical.from_codes(codes, categories=dtype.categories,
ordered=dtype.ordered)
cat = Categorical.from_codes(codes, dtype=dtype)
return CategoricalIndex(cat, name=name)

@classmethod
Expand Down
122 changes: 82 additions & 40 deletions pandas/tests/arrays/categorical/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,9 @@ def test_constructor_unsortable(self):
assert not factor.ordered

# this however will raise as cannot be sorted
with pytest.raises(TypeError):
msg = ("'values' is not ordered, please explicitly specify the "
"categories order by passing in a categories argument.")
with pytest.raises(TypeError, match=msg):
Categorical(arr, ordered=True)

def test_constructor_interval(self):
Expand All @@ -99,10 +101,11 @@ def test_constructor(self):
tm.assert_numpy_array_equal(c2.__array__(), exp_arr)

# categories must be unique
with pytest.raises(ValueError):
msg = "Categorical categories must be unique"
with pytest.raises(ValueError, match=msg):
Categorical([1, 2], [1, 2, 2])

with pytest.raises(ValueError):
with pytest.raises(ValueError, match=msg):
Categorical(["a", "b"], ["a", "b", "b"])

# The default should be unordered
Expand Down Expand Up @@ -211,21 +214,23 @@ def test_constructor(self):

def test_constructor_not_sequence(self):
# https://github.com/pandas-dev/pandas/issues/16022
with pytest.raises(TypeError):
msg = r"^Parameter 'categories' must be list-like, was"
with pytest.raises(TypeError, match=msg):
Categorical(['a', 'b'], categories='a')

def test_constructor_with_null(self):

# Cannot have NaN in categories
with pytest.raises(ValueError):
msg = "Categorial categories cannot be null"
with pytest.raises(ValueError, match=msg):
Categorical([np.nan, "a", "b", "c"],
categories=[np.nan, "a", "b", "c"])

with pytest.raises(ValueError):
with pytest.raises(ValueError, match=msg):
Categorical([None, "a", "b", "c"],
categories=[None, "a", "b", "c"])

with pytest.raises(ValueError):
with pytest.raises(ValueError, match=msg):
Categorical(DatetimeIndex(['nat', '20160101']),
categories=[NaT, Timestamp('20160101')])

Expand Down Expand Up @@ -347,13 +352,14 @@ def test_constructor_with_dtype(self, ordered):

def test_constructor_dtype_and_others_raises(self):
dtype = CategoricalDtype(['a', 'b'], ordered=True)
with pytest.raises(ValueError, match="Cannot"):
msg = "Cannot specify `categories` or `ordered` together with `dtype`."
with pytest.raises(ValueError, match=msg):
Categorical(['a', 'b'], categories=['a', 'b'], dtype=dtype)

with pytest.raises(ValueError, match="Cannot"):
with pytest.raises(ValueError, match=msg):
Categorical(['a', 'b'], ordered=True, dtype=dtype)

with pytest.raises(ValueError, match="Cannot"):
with pytest.raises(ValueError, match=msg):
Categorical(['a', 'b'], ordered=False, dtype=dtype)

@pytest.mark.parametrize('categories', [
Expand Down Expand Up @@ -417,33 +423,44 @@ def test_constructor_with_categorical_categories(self):
def test_from_codes(self):

# too few categories
with pytest.raises(ValueError):
Categorical.from_codes([1, 2], [1, 2])
dtype = CategoricalDtype(categories=[1, 2])
msg = "codes need to be between "
with pytest.raises(ValueError, match=msg):
Categorical.from_codes([1, 2], categories=dtype.categories)
with pytest.raises(ValueError, match=msg):
Categorical.from_codes([1, 2], dtype=dtype)

# no int codes
with pytest.raises(ValueError):
Categorical.from_codes(["a"], [1, 2])
msg = "codes need to be array-like integers"
with pytest.raises(ValueError, match=msg):
Categorical.from_codes(["a"], categories=dtype.categories)
with pytest.raises(ValueError, match=msg):
Categorical.from_codes(["a"], dtype=dtype)

# no unique categories
with pytest.raises(ValueError):
Categorical.from_codes([0, 1, 2], ["a", "a", "b"])
with pytest.raises(ValueError,
match="Categorical categories must be unique"):
Categorical.from_codes([0, 1, 2], categories=["a", "a", "b"])

# NaN categories included
with pytest.raises(ValueError):
Categorical.from_codes([0, 1, 2], ["a", "b", np.nan])
with pytest.raises(ValueError,
match="Categorial categories cannot be null"):
Categorical.from_codes([0, 1, 2], categories=["a", "b", np.nan])

# too negative
with pytest.raises(ValueError):
Categorical.from_codes([-2, 1, 2], ["a", "b", "c"])
dtype = CategoricalDtype(categories=["a", "b", "c"])
msg = r"codes need to be between -1 and len\(categories\)-1"
with pytest.raises(ValueError, match=msg):
Categorical.from_codes([-2, 1, 2], categories=dtype.categories)
with pytest.raises(ValueError, match=msg):
Categorical.from_codes([-2, 1, 2], dtype=dtype)

exp = Categorical(["a", "b", "c"], ordered=False)
res = Categorical.from_codes([0, 1, 2], ["a", "b", "c"])
res = Categorical.from_codes([0, 1, 2], categories=dtype.categories)
tm.assert_categorical_equal(exp, res)

# Not available in earlier numpy versions
if hasattr(np.random, "choice"):
codes = np.random.choice([0, 1], 5, p=[0.9, 0.1])
Categorical.from_codes(codes, categories=["train", "test"])
res = Categorical.from_codes([0, 1, 2], dtype=dtype)
tm.assert_categorical_equal(exp, res)

def test_from_codes_with_categorical_categories(self):
# GH17884
Expand All @@ -458,28 +475,56 @@ def test_from_codes_with_categorical_categories(self):
tm.assert_categorical_equal(result, expected)

# non-unique Categorical still raises
with pytest.raises(ValueError):
with pytest.raises(ValueError,
match="Categorical categories must be unique"):
Categorical.from_codes([0, 1], Categorical(['a', 'b', 'a']))

def test_from_codes_with_nan_code(self):
# GH21767
codes = [1, 2, np.nan]
categories = ['a', 'b', 'c']
with pytest.raises(ValueError):
Categorical.from_codes(codes, categories)
dtype = CategoricalDtype(categories=['a', 'b', 'c'])
with pytest.raises(ValueError,
match="codes need to be array-like integers"):
Categorical.from_codes(codes, categories=dtype.categories)
with pytest.raises(ValueError,
match="codes need to be array-like integers"):
Categorical.from_codes(codes, dtype=dtype)

def test_from_codes_with_float(self):
# GH21767
codes = [1.0, 2.0, 0] # integer, but in float dtype
categories = ['a', 'b', 'c']
dtype = CategoricalDtype(categories=['a', 'b', 'c'])

with tm.assert_produces_warning(FutureWarning):
cat = Categorical.from_codes(codes, dtype.categories)
tm.assert_numpy_array_equal(cat.codes, np.array([1, 2, 0], dtype='i1'))

with tm.assert_produces_warning(FutureWarning):
cat = Categorical.from_codes(codes, categories)
cat = Categorical.from_codes(codes, dtype=dtype)
tm.assert_numpy_array_equal(cat.codes, np.array([1, 2, 0], dtype='i1'))

codes = [1.1, 2.0, 0] # non-integer
with pytest.raises(ValueError):
Categorical.from_codes(codes, categories)
with pytest.raises(ValueError,
match="codes need to be array-like integers"):
Categorical.from_codes(codes, dtype.categories)
with pytest.raises(ValueError,
match="codes need to be array-like integers"):
Categorical.from_codes(codes, dtype=dtype)

def test_from_codes_with_dtype_raises(self):
# Passing `dtype` together with `categories` or `ordered` is expected
# to raise a ValueError whose message matches 'Cannot specify'.
msg = 'Cannot specify'
# Case 1: `categories` given alongside `dtype`.
with pytest.raises(ValueError, match=msg):
Categorical.from_codes([0, 1], categories=['a', 'b'],
dtype=CategoricalDtype(['a', 'b']))

# Case 2: `ordered` given alongside `dtype`.
with pytest.raises(ValueError, match=msg):
Categorical.from_codes([0, 1], ordered=True,
dtype=CategoricalDtype(['a', 'b']))

def test_from_codes_neither(self):
# Calling from_codes with only `codes` (no `categories`, no `dtype`)
# is expected to raise a ValueError matching "Both were None".
msg = "Both were None"
with pytest.raises(ValueError, match=msg):
Categorical.from_codes([0, 1])

@pytest.mark.parametrize('dtype', [None, 'category'])
def test_from_inferred_categories(self, dtype):
Expand Down Expand Up @@ -515,14 +560,11 @@ def test_from_inferred_categories_coerces(self):
expected = Categorical([1, 1, 2, np.nan])
tm.assert_categorical_equal(result, expected)

def test_construction_with_ordered(self):
@pytest.mark.parametrize('ordered', [None, True, False])
def test_construction_with_ordered(self, ordered):
# GH 9347, 9190
cat = Categorical([0, 1, 2])
assert not cat.ordered
cat = Categorical([0, 1, 2], ordered=False)
assert not cat.ordered
cat = Categorical([0, 1, 2], ordered=True)
assert cat.ordered
cat = Categorical([0, 1, 2], ordered=ordered)
assert cat.ordered == bool(ordered)

@pytest.mark.xfail(reason="Imaginary values not supported in Categorical")
def test_constructor_imaginary(self):
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/indexes/test_category.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ def test_construction_with_categorical_dtype(self):
tm.assert_index_equal(result, expected, exact=True)

# error when combining categories/ordered and dtype kwargs
msg = 'Cannot specify `categories` or `ordered` together with `dtype`.'
msg = "Cannot specify `categories` or `ordered` together with `dtype`."
with pytest.raises(ValueError, match=msg):
CategoricalIndex(data, categories=cats, dtype=dtype)

Expand Down