ENH: Add dropna in groupby to allow NaN in keys #30584
Changes from 17 commits
Diff: v1.0.0 release notes (whatsnew)
@@ -191,6 +191,16 @@ method on a :func:`pandas.api.indexers.BaseIndexer` subclass that will generate
indices used for each window during the rolling aggregation. For more details and example usage, see
the :ref:`custom window rolling documentation <stats.custom_rolling_window>`

.. _whatsnew_1000.groupby_key:

Allow NaN in groupby key
^^^^^^^^^^^^^^^^^^^^^^^^

We've added a ``dropna`` keyword to :meth:`DataFrame.groupby` and :meth:`Series.groupby` in order to
allow ``NaN`` values in group keys. Users can set ``dropna`` to ``False`` if they want to include
``NaN`` values in group keys. The default is ``True`` for ``dropna``, to keep backwards
compatibility (:issue:`3729`).
Review comment: add an example (like you have in the doc-strings); also add examples in groupby.rst (and point from here)
Reply: added in both!

.. _whatsnew_1000.enhancements.other:

Other enhancements
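For reference, a sketch of the kind of release-note example the reviewer asks for above. It mirrors the DataFrame.groupby doc-string examples added later in this diff; the wording that finally lands in the whatsnew and groupby.rst may differ.

>>> import pandas as pd
>>> df = pd.DataFrame([[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]],
...                   columns=["a", "b", "c"])
>>> df.groupby("b").sum()                 # default dropna=True drops the NaN key
     a  c
b
1.0  2  3
2.0  2  5
>>> df.groupby("b", dropna=False).sum()   # NaN keys are kept as their own group
     a  c
b
1.0  2  3
2.0  2  5
NaN  1  4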
Diff: factorize
@@ -596,7 +596,11 @@ def _factorize_array(
)
@Appender(_shared_docs["factorize"])
def factorize(
    values, sort: bool = False, na_sentinel: int = -1, size_hint: Optional[int] = None
    values,
    sort: bool = False,
    na_sentinel: int = -1,
    size_hint: Optional[int] = None,

Review comment: can you add some independent tests for factorize with dropna
Reply: emm, I think I added two test cases already in …

    dropna: bool = True,
) -> Tuple[np.ndarray, Union[np.ndarray, ABCIndex]]:
    # Implementation notes: This method is responsible for 3 things
    # 1.) coercing data to array-like (ndarray, Index, extension array)
@@ -630,6 +634,9 @@ def factorize(
        uniques, codes = safe_sort(
            uniques, codes, na_sentinel=na_sentinel, assume_unique=True, verify=False
        )
    if not dropna and (codes == na_sentinel).any():

Review comment: Can you assign …
Reply: done, thanks!

        uniques = np.append(uniques, [np.nan])

Review thread:
- ideally we push this down to cython, but ok here
- hmm, this should be a dtype appropriate for the dtype
- I think …
- right so instead of adding a null to the categories like you are doing, you just add the appropriate -1 entries in the codes which automatically handle the nullness
- emm, this has to be a …

        codes = np.where(codes == na_sentinel, len(uniques) - 1, codes)

    uniques = _reconstruct_data(uniques, dtype, original)
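To make the new branch concrete, here is a rough sketch of what factorize returns with the dropna keyword as it stands at this commit. The public signature was still under review, so treat this as an illustration of the hunk above rather than the final API; note that the NA unique is appended as np.nan regardless of the input dtype, which the thread above flags.

>>> import numpy as np
>>> import pandas as pd
>>> values = np.array([1.0, np.nan, 2.0, 1.0])
>>> pd.factorize(values)                  # default dropna=True: NaN maps to na_sentinel
(array([ 0, -1,  1,  0]), array([1., 2.]))
>>> pd.factorize(values, dropna=False)    # NaN gets its own code; np.nan appended to uniques
(array([0, 2, 1, 0]), array([ 1.,  2., nan]))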
Diff: DataFrame.to_markdown and DataFrame.groupby
@@ -1988,7 +1988,7 @@ def to_feather(self, path):
    @Substitution(klass="DataFrame")
    @Appender(_shared_docs["to_markdown"])
    def to_markdown(
        self, buf: Optional[IO[str]] = None, mode: Optional[str] = None, **kwargs,
        self, buf: Optional[IO[str]] = None, mode: Optional[str] = None, **kwargs
    ) -> Optional[str]:
        kwargs.setdefault("headers", "keys")
        kwargs.setdefault("tablefmt", "pipe")
@@ -5648,6 +5648,41 @@ def update(
Type
Captive      210.0
Wild         185.0

We can also choose to include NaN in group keys or not by defining

Review comment: defining -> setting
Reply: rephrased

`dropna` parameter:

Review comment: mention the default

>>> l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]]
>>> df = pd.DataFrame(l, columns=["a", "b", "c"])

>>> df.groupby(by=["b"]).sum()
     a  c
b
1.0  2  3
2.0  2  5

>>> df.groupby(by=["b"], dropna=False).sum()
     a  c
b
1.0  2  3
2.0  2  5
NaN  1  4

>>> l = [["a", 12, 12], [None, 12.3, 33.], ["b", 12.3, 123], ["a", 1, 1]]
>>> df = pd.DataFrame(l, columns=["a", "b", "c"])

>>> df.groupby(by="a").sum()
      b      c
a
a  13.0   13.0
b  12.3  123.0

>>> df.groupby(by="a", dropna=False).sum()
        b      c
a
a    13.0   13.0
b    12.3  123.0
NaN  12.3   33.0
"""
)
@Appender(_shared_docs["groupby"] % _shared_doc_kwargs)

@@ -5661,6 +5696,7 @@ def groupby(
        group_keys: bool = True,
        squeeze: bool = False,
        observed: bool = False,
        dropna: bool = True,
    ) -> "groupby_generic.DataFrameGroupBy":

        if level is None and by is None:

@@ -5677,6 +5713,7 @@ def groupby(
            group_keys=group_keys,
            squeeze=squeeze,
            observed=observed,
            dropna=dropna,
        )

    _shared_docs[
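A quick end-to-end sketch of the new keyword on DataFrameGroupBy, reusing the data from the doc-string examples above; the .groups output shown here assumes the behaviour of the feature as released rather than this exact commit.

>>> import pandas as pd
>>> df = pd.DataFrame([[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]],
...                   columns=["a", "b", "c"])
>>> list(df.groupby("b").groups)                # NaN key dropped by default
[1.0, 2.0]
>>> list(df.groupby("b", dropna=False).groups)  # NaN key kept as its own group
[1.0, 2.0, nan]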
Diff: shared groupby docstring (dropna parameter)
@@ -7346,6 +7346,12 @@ def clip(
    If False: show all values for categorical groupers.

    .. versionadded:: 0.23.0
dropna : bool, default True
    If True, and if group keys contain NaN values, NaN values together

Review comment: don't say NaN, say NA values (e.g. can also be NaT or the new NA scalar)
Reply: changed!

    with row/column will be dropped.
    If False, NaN values will also be treated as the key in groups

    .. versionadded:: 1.0.0

Review comment (suggested change): …

Returns
-------
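Since the review asks for "NA values" rather than "NaN", a small sketch of what that wording covers in practice: grouping on a datetime key that contains NaT. This reflects the behaviour of the feature as released; at this commit the NA unique is still appended as np.nan (see the factorize thread above), so handling of non-float keys was still being discussed.

>>> import pandas as pd
>>> df = pd.DataFrame({"when": pd.to_datetime(["2020-01-01", None, "2020-01-01"]),
...                    "value": [1, 2, 3]})
>>> df.groupby("when").sum()                  # NaT key dropped by default
            value
when
2020-01-01      4
>>> df.groupby("when", dropna=False).sum()    # NaT rows form their own group
            value
when
2020-01-01      4
NaT             2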
Diff: Series.to_markdown, Series.groupby, and Series._setup_axes
@@ -1433,7 +1433,7 @@ def to_string(
    @Substitution(klass="Series")
    @Appender(generic._shared_docs["to_markdown"])
    def to_markdown(
        self, buf: Optional[IO[str]] = None, mode: Optional[str] = None, **kwargs,
        self, buf: Optional[IO[str]] = None, mode: Optional[str] = None, **kwargs
    ) -> Optional[str]:
        return self.to_frame().to_markdown(buf, mode, **kwargs)

@@ -1620,6 +1620,34 @@ def _set_name(self, name, inplace=False):
Captive    210.0
Wild       185.0
Name: Max Speed, dtype: float64

We can also choose to include NaN in group keys or not by defining

Review comment: mention the default value
Reply: mentioned

`dropna` parameter:

>>> ser = pd.Series([1, 2, 3, 3], index=["a", 'a', 'b', np.nan])
>>> ser.groupby(level=0).sum()
a    3
b    3
dtype: int64

>>> ser.groupby(level=0, dropna=False).sum()
a      3
b      3
NaN    3
dtype: int64

>>> arrays = ['Falcon', 'Falcon', 'Parrot', 'Parrot']
>>> ser = pd.Series([390., 350., 30., 20.], index=arrays, name="Max Speed")
>>> ser.groupby(["a", "b", "a", np.nan]).mean()
a    210.0
b    350.0
Name: Max Speed, dtype: float64

>>> ser.groupby(["a", "b", "a", np.nan], dropna=False).mean()
a      210.0
b      350.0
NaN     20.0
Name: Max Speed, dtype: float64
"""
)
@Appender(generic._shared_docs["groupby"] % _shared_doc_kwargs)

@@ -1633,6 +1661,7 @@ def groupby(
        group_keys: bool = True,
        squeeze: bool = False,
        observed: bool = False,
        dropna: bool = True,
    ) -> "groupby_generic.SeriesGroupBy":

        if level is None and by is None:

@@ -1649,6 +1678,7 @@ def groupby(
            group_keys=group_keys,
            squeeze=squeeze,
            observed=observed,
            dropna=dropna,
        )

    # ----------------------------------------------------------------------
@@ -4478,9 +4508,7 @@ def to_period(self, freq=None, copy=True):
    hist = pandas.plotting.hist_series


Series._setup_axes(
    ["index"], docs={"index": "The index (axis labels) of the Series."},
)
Series._setup_axes(["index"], docs={"index": "The index (axis labels) of the Series."})
Series._add_numeric_operations()
Series._add_series_or_dataframe_operations()
Review comment: move to 1.1
Reply: moved