Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: DataFrameGroupBy.__getitem__ fails to propagate dropna #35078

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
23c05f7
add values.dtype.kind==f branch to array_with_unit_datetime
arw2019 Jun 27, 2020
f564d48
revert pandas/_libs/tslib.pyx
arw2019 Jun 29, 2020
afe1869
added cast_from_unit definition for float
arw2019 Jun 27, 2020
c2594e0
revert accidental changes
arw2019 Jun 29, 2020
ef36084
revert changes
arw2019 Aug 7, 2020
cd92bc7
revert accidental changes
arw2019 Jun 29, 2020
077bd8e
update Grouping.indicies to return for nan values
arw2019 Jul 1, 2020
72f66d4
updated _GroupBy._get_index to return for nan values
arw2019 Jul 1, 2020
c5c3a28
revert accidental changes
arw2019 Jul 1, 2020
0214470
revert accidental changes
arw2019 Jul 1, 2020
2a3a86b
revert accidental changes
arw2019 Jul 1, 2020
cf71b75
styling change
arw2019 Jul 1, 2020
9051166
added tests
arw2019 Jul 2, 2020
84e04c0
fixed groupby/groupby.py's _get_indicies
arw2019 Jul 6, 2020
86ce781
removed debug statement
arw2019 Jul 6, 2020
7090e2d
fixed naming error in test
arw2019 Jul 7, 2020
68d9b78
remove type coercion block
arw2019 Jul 7, 2020
9caba29
added missing values handing for _GroupBy.get_group method
arw2019 Jul 7, 2020
8e3a460
updated indicies for case dropna=True
arw2019 Jul 7, 2020
b2ed957
cleaned up syntax
arw2019 Jul 7, 2020
6d87151
cleaned up syntax
arw2019 Jul 7, 2020
c67e072
removed print statements
arw2019 Jul 7, 2020
ed338e1
_transform_general: add a check that we don't accidentally upcast
arw2019 Jul 7, 2020
9ffef45
_transform_general: add int32, float32 to upcasting check
arw2019 Jul 7, 2020
736ac69
rewrite for loop as list comprehension
arw2019 Jul 7, 2020
68902eb
rewrote if statement as dict comp + ternary
arw2019 Jul 7, 2020
c6668f0
fixed small bug in list comp in groupby/groupby.py
arw2019 Jul 7, 2020
46949ea
deleted debug statement in groupby/groupby.py
arw2019 Jul 7, 2020
e16a495
rewrite _get_index using next_iter to set default value
arw2019 Jul 7, 2020
e00d71d
update exepcted test_groupby_nat_exclude for new missing values handling
arw2019 Jul 7, 2020
6d5a441
remove print statement
arw2019 Jul 7, 2020
9c24cf2
reworked solution
arw2019 Jul 9, 2020
5637c3e
fixed PEP8 issue
arw2019 Jul 9, 2020
29c13f6
run pre-commit checks
arw2019 Jul 9, 2020
2ea68af
styling fix
arw2019 Jul 9, 2020
3f5c6d6
update whatnew + styling improvements
arw2019 Jul 9, 2020
10147b0
move NaN handling to _get_indicies
arw2019 Jul 22, 2020
c9f6f7e
removed 1.1 release note
arw2019 Jul 22, 2020
9b536dd
redo solution - modify SeriesGroupBy._transform_general only
arw2019 Jul 22, 2020
2c9de8e
Merge remote-tracking branch 'upstream/master' into groupby-getitem-d…
arw2019 Aug 7, 2020
8d991d5
rewrite case + rewrite tests w fixtures
arw2019 Aug 7, 2020
570ce21
fix mypy error
arw2019 Aug 7, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ Indexing
Missing
^^^^^^^

-
- Bug in :meth:`SeriesGroupBy.transform` where missing values were not handled correctly when ``dropna=False`` (:issue:`35014`)
-

MultiIndex
Expand Down
18 changes: 8 additions & 10 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,11 @@
from pandas.util._decorators import Appender, Substitution, doc

from pandas.core.dtypes.cast import (
find_common_type,
maybe_cast_result,
maybe_cast_result_dtype,
maybe_convert_objects,
maybe_downcast_numeric,
maybe_downcast_to_dtype,
)
from pandas.core.dtypes.common import (
ensure_int64,
Expand Down Expand Up @@ -513,7 +513,6 @@ def _transform_general(
"""
Transform with a non-str `func`.
"""

if maybe_use_numba(engine):
numba_func, cache_key = generate_numba_func(
func, engine_kwargs, kwargs, "groupby_transform"
Expand All @@ -535,24 +534,23 @@ def _transform_general(
if isinstance(res, (ABCDataFrame, ABCSeries)):
res = res._values

indexer = self._get_index(name)
ser = klass(res, indexer)
results.append(ser)
results.append(klass(res, index=group.index))

# check for empty "results" to avoid concat ValueError
if results:
from pandas.core.reshape.concat import concat

result = concat(results).sort_index()
concatenated = concat(results)
result = self._set_result_index_ordered(concatenated)
else:
result = self.obj._constructor(dtype=np.float64)

# we will only try to coerce the result type if
# we have a numeric dtype, as these are *always* user-defined funcs
# the cython take a different path (and casting)
dtype = self._selected_obj.dtype
if is_numeric_dtype(dtype):
result = maybe_downcast_to_dtype(result, dtype)
if is_numeric_dtype(result.dtype):
common_dtype = find_common_type([self._selected_obj.dtype, result.dtype])
if common_dtype is result.dtype:
jreback marked this conversation as resolved.
Show resolved Hide resolved
result = maybe_downcast_numeric(result, self._selected_obj.dtype)

result.name = self._selected_obj.name
result.index = self._selected_obj.index
Expand Down
34 changes: 34 additions & 0 deletions pandas/tests/groupby/test_groupby_dropna.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,40 @@ def test_groupby_dropna_series_by(dropna, expected):
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
    "dropna,df_expected,s_expected",
    [
        pytest.param(
            True,
            pd.DataFrame({"B": [2, 2, 1]}),
            pd.Series(data=[2, 2, 1], name="B"),
            marks=pytest.mark.xfail(raises=ValueError),
        ),
        (
            False,
            pd.DataFrame({"B": [2, 2, 1, 1]}),
            pd.Series(data=[2, 2, 1, 1], name="B"),
        ),
    ],
)
def test_slice_groupby_then_transform(dropna, df_expected, s_expected):
    """
    Check that ``transform(len)`` agrees before and after slicing a groupby.

    The frame is grouped by column "A", which contains a missing key, and
    ``transform(len)`` is compared against the expected output for three
    access paths: the unsliced groupby, the ``[["B"]]`` slice (frame result)
    and the ``["B"]`` slice (series result).

    Parameters
    ----------
    dropna : bool
        Forwarded to ``DataFrame.groupby``. The ``True`` case is marked
        ``xfail(raises=ValueError)`` in the parametrization.
    df_expected : DataFrame
        Expected result for the unsliced and ``[["B"]]``-sliced groupby.
    s_expected : Series
        Expected result for the ``["B"]``-sliced groupby.
    """
    # GH35014
    df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]})
    gb = df.groupby("A", dropna=dropna)

    # Unsliced groupby.
    res = gb.transform(len)
    tm.assert_frame_equal(res, df_expected)

    # List-of-columns slice keeps a frame-like groupby.
    gb_slice = gb[["B"]]
    res = gb_slice.transform(len)
    tm.assert_frame_equal(res, df_expected)

    # Single-column slice; transform the sliced object itself rather than
    # re-slicing ``gb``, so the variable assigned here is actually exercised.
    gb_slice = gb["B"]
    res = gb_slice.transform(len)
    tm.assert_series_equal(res, s_expected)


@pytest.mark.parametrize(
"dropna, tuples, outputs",
[
Expand Down