From 23c05f72566cd31816b3e1baba610dfb7fa9992f Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sat, 27 Jun 2020 05:24:03 +0000 Subject: [PATCH 01/41] add values.dtype.kind==f branch to array_with_unit_datetime --- pandas/_libs/tslib.pyx | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index e4128af62d06d..ac9c4cd524d04 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -227,7 +227,6 @@ def array_with_unit_to_datetime( m = cast_from_unit(None, unit) if is_raise: - # try a quick conversion to i8 # if we have nulls that are not type-compat # then need to iterate @@ -240,9 +239,17 @@ def array_with_unit_to_datetime( fvalues = iresult.astype('f8') * m need_to_iterate = False + # GH20445 + if values.dtype.kind == "f": + fresult = values.astype('f8', casting='same_kind', copy=False) + # fill by comparing to NPY_NAT constant + mask = fresult == NPY_NAT + fresult[mask] = 0.0 + fvalues = fvalues.astype('f8') * m # FIXME: this line segfaults rn + need_to_iterate = False + # check the bounds if not need_to_iterate: - if ((fvalues < Timestamp.min.value).any() or (fvalues > Timestamp.max.value).any()): raise OutOfBoundsDatetime(f"cannot convert input with unit '{unit}'") @@ -410,7 +417,6 @@ cpdef array_to_datetime( float offset_seconds, tz_offset set out_tzoffset_vals = set() bint string_to_dts_failed - # specify error conditions assert is_raise or is_ignore or is_coerce From f564d48e0fb43ed23294e0a1c15438571cd8b804 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Mon, 29 Jun 2020 01:06:23 +0000 Subject: [PATCH 02/41] revert pandas/_libs/tslib.pyx --- pandas/_libs/tslib.pyx | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index ac9c4cd524d04..e4128af62d06d 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -227,6 +227,7 @@ def array_with_unit_to_datetime( m = cast_from_unit(None, unit) if is_raise: + # try a quick conversion to i8 # if we have nulls that are not type-compat # then need to iterate @@ -239,17 +240,9 @@ def array_with_unit_to_datetime( fvalues = iresult.astype('f8') * m need_to_iterate = False - # GH20445 - if values.dtype.kind == "f": - fresult = values.astype('f8', casting='same_kind', copy=False) - # fill by comparing to NPY_NAT constant - mask = fresult == NPY_NAT - fresult[mask] = 0.0 - fvalues = fvalues.astype('f8') * m # FIXME: this line segfaults rn - need_to_iterate = False - # check the bounds if not need_to_iterate: + if ((fvalues < Timestamp.min.value).any() or (fvalues > Timestamp.max.value).any()): raise OutOfBoundsDatetime(f"cannot convert input with unit '{unit}'") @@ -417,6 +410,7 @@ cpdef array_to_datetime( float offset_seconds, tz_offset set out_tzoffset_vals = set() bint string_to_dts_failed + # specify error conditions assert is_raise or is_ignore or is_coerce From afe1869537ca5900b3be875fdecc3b0f81d4423e Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sat, 27 Jun 2020 05:48:15 +0000 Subject: [PATCH 03/41] added cast_from_unit definition for float --- pandas/_libs/tslibs/conversion.pyx | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index adf1dfbc1ac72..0f1804139aaad 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -119,6 +119,15 @@ cdef inline int64_t cast_from_unit(object ts, str unit) except? -1: frac = round(frac, p) return (base * m) + (frac * m) +cdef inline float cast_from_unit(object ts, str unit) except? -1: + """ return a casting of the unit represented to nanoseconds + round the fractional part of a float to our precision, p """ + cdef: + float m + int p + + # TO DO: fill in body + cpdef inline (int64_t, int) precision_from_unit(str unit): """ From c2594e09ef94440fefa76b831ef48219bfb60b23 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Mon, 29 Jun 2020 00:38:59 +0000 Subject: [PATCH 04/41] revert accidental changes --- pandas/_libs/tslibs/conversion.pyx | 9 --------- 1 file changed, 9 deletions(-) diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 0f1804139aaad..adf1dfbc1ac72 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -119,15 +119,6 @@ cdef inline int64_t cast_from_unit(object ts, str unit) except? -1: frac = round(frac, p) return (base * m) + (frac * m) -cdef inline float cast_from_unit(object ts, str unit) except? -1: - """ return a casting of the unit represented to nanoseconds - round the fractional part of a float to our precision, p """ - cdef: - float m - int p - - # TO DO: fill in body - cpdef inline (int64_t, int) precision_from_unit(str unit): """ From ef36084b48cd21a9ac215b1f23916c5a908b4d5e Mon Sep 17 00:00:00 2001 From: arw2019 Date: Fri, 7 Aug 2020 01:07:46 +0000 Subject: [PATCH 05/41] revert changes --- pandas/_libs/tslibs/conversion.pyx | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index adf1dfbc1ac72..4eb16c4efccf0 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -120,7 +120,11 @@ cdef inline int64_t cast_from_unit(object ts, str unit) except? -1: return (base * m) + (frac * m) +<<<<<<< HEAD cpdef inline (int64_t, int) precision_from_unit(str unit): +======= +cpdef inline object precision_from_unit(str unit): +>>>>>>> 6b9d4de82... revert changes """ Return a casting of the unit represented to nanoseconds + the precision to round the fractional part. From cd92bc719ee1afef4027f507d180196d19b3c714 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Mon, 29 Jun 2020 00:42:29 +0000 Subject: [PATCH 06/41] revert accidental changes --- pandas/_libs/tslib.pyx | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index e4128af62d06d..86e8661d62e43 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -246,11 +246,25 @@ def array_with_unit_to_datetime( if ((fvalues < Timestamp.min.value).any() or (fvalues > Timestamp.max.value).any()): raise OutOfBoundsDatetime(f"cannot convert input with unit '{unit}'") +<<<<<<< HEAD result = (iresult * m).astype('M8[ns]') iresult = result.view('i8') iresult[mask] = NPY_NAT return result, tz +======= + # GH20445 + if values.dtype.kind == 'i': + result = (iresult * m).astype('M8[ns]') + iresult = result.view('i8') + iresult[mask] = NPY_NAT + return result, tz + elif values.dtype.kind == 'f': + result = (fresult * m_as_float).astype('M8[ns]') + fresult = result.view('f8') + fresult[mask] = NPY_NAT + return result, tz +>>>>>>> f1ae8f562... _libs/tslib.pyx added comments result = np.empty(n, dtype='M8[ns]') iresult = result.view('i8') From 077bd8e99f7e92552a4c34e6cf32afb299f23d21 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Wed, 1 Jul 2020 02:43:36 +0000 Subject: [PATCH 07/41] update Grouping.indicies to return for nan values --- pandas/core/groupby/grouper.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 8239a792c65dd..7206e6db0b93d 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -20,6 +20,7 @@ ) from pandas.core.dtypes.generic import ABCSeries +import pandas as pd import pandas.core.algorithms as algorithms from pandas.core.arrays import Categorical, ExtensionArray import pandas.core.common as com @@ -557,7 +558,12 @@ def indices(self): return self.grouper.indices values = Categorical(self.grouper) - return values._reverse_indexer() + + # GH35014 + res = values._reverse_indexer() + res[np.nan] = [i for i, v in enumerate(values) if pd.isna(v)] + print(res) + return res @property def codes(self) -> np.ndarray: From 72f66d41a72988a2970b24437e716444bd310989 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Wed, 1 Jul 2020 02:46:43 +0000 Subject: [PATCH 08/41] updated _GroupBy._get_index to return for nan values --- pandas/core/groupby/groupby.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 6c8a780859939..bb140cf4737fe 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -629,7 +629,10 @@ def _get_index(self, name): """ Safe get index, translate keys for datelike to underlying repr. """ - return self._get_indices([name])[0] + if isna(name): + return [i for i, v in enumerate(self.indices) if isna(v)] + else: + return self._get_indices([name])[0] @cache_readonly def _selected_obj(self): @@ -901,6 +904,7 @@ def _iterate_slices(self) -> Iterable[Series]: raise AbstractMethodError(self) def transform(self, func, *args, **kwargs): + print(f"name={name}, group={group}") raise AbstractMethodError(self) def _cumcount_array(self, ascending: bool = True): From c5c3a28c4785d76366625c7282e7e428930a5a3e Mon Sep 17 00:00:00 2001 From: arw2019 Date: Wed, 1 Jul 2020 02:51:08 +0000 Subject: [PATCH 09/41] revert accidental changes --- pandas/_libs/tslib.pyx | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 86e8661d62e43..4976159eceb0d 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -247,11 +247,15 @@ def array_with_unit_to_datetime( or (fvalues > Timestamp.max.value).any()): raise OutOfBoundsDatetime(f"cannot convert input with unit '{unit}'") <<<<<<< HEAD +<<<<<<< HEAD +======= +>>>>>>> 7df44d10f... revert accidental changes result = (iresult * m).astype('M8[ns]') iresult = result.view('i8') iresult[mask] = NPY_NAT return result, tz +<<<<<<< HEAD ======= # GH20445 if values.dtype.kind == 'i': @@ -265,6 +269,8 @@ def array_with_unit_to_datetime( fresult[mask] = NPY_NAT return result, tz >>>>>>> f1ae8f562... _libs/tslib.pyx added comments +======= +>>>>>>> 7df44d10f... revert accidental changes result = np.empty(n, dtype='M8[ns]') iresult = result.view('i8') From 0214470986cc61645d8dd6b39c43710a4d5bef51 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Wed, 1 Jul 2020 02:52:48 +0000 Subject: [PATCH 10/41] revert accidental changes --- pandas/core/groupby/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index bb140cf4737fe..0b5be1a7717c9 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -904,7 +904,7 @@ def _iterate_slices(self) -> Iterable[Series]: raise AbstractMethodError(self) def transform(self, func, *args, **kwargs): - print(f"name={name}, group={group}") + raise AbstractMethodError(self) def _cumcount_array(self, ascending: bool = True): From 2a3a86ba7c32f1089b728e93b004f48a09a2910e Mon Sep 17 00:00:00 2001 From: arw2019 Date: Wed, 1 Jul 2020 02:53:20 +0000 Subject: [PATCH 11/41] revert accidental changes --- pandas/core/groupby/groupby.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 0b5be1a7717c9..ad213bf7a67d8 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -904,7 +904,6 @@ def _iterate_slices(self) -> Iterable[Series]: raise AbstractMethodError(self) def transform(self, func, *args, **kwargs): - raise AbstractMethodError(self) def _cumcount_array(self, ascending: bool = True): From cf71b757da91384641b7d2d750b5be44d59413db Mon Sep 17 00:00:00 2001 From: arw2019 Date: Wed, 1 Jul 2020 03:00:02 +0000 Subject: [PATCH 12/41] styling change --- pandas/core/groupby/grouper.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 7206e6db0b93d..f4c7f76bc5dcc 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -562,7 +562,6 @@ def indices(self): # GH35014 res = values._reverse_indexer() res[np.nan] = [i for i, v in enumerate(values) if pd.isna(v)] - print(res) return res @property From 9051166212a7a4be89ad63866c12f62de6bf9144 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Thu, 2 Jul 2020 18:27:58 +0000 Subject: [PATCH 13/41] added tests --- pandas/tests/groupby/test_groupby_dropna.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 1a525d306e9f5..4e65760df50aa 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -162,6 +162,27 @@ def test_groupby_dropna_series_by(dropna, expected): tm.assert_series_equal(result, expected) +def test_slice_groupby_then_transform(): + # GH35014 + + df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]}) + gb = df.groupby("A", dropna=False) + + res = gb.transform(len) + expected = pd.DataFrame({"B": [2, 2, 1, 1]}) + tm.assert_frame_equal(res, expected) + + gb_slice = gb[["B"]] + res = gb_slice.transform(len) + expected = pd.DataFrame({"B": [2, 2, 1, 1]}) + tm.assert_frame_equal(res, expected) + + gb_slice = gb["B"] + res = gb["B"].transform(len) + expected = pd.Series([2, 2, 1, 1]) + tm.assert_series_equal(res, expected) + + @pytest.mark.parametrize( "dropna, tuples, outputs", [ From 84e04c0e713bbeeadaf064b4c263dfdf7bf2b282 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Mon, 6 Jul 2020 20:27:22 +0000 Subject: [PATCH 14/41] fixed groupby/groupby.py's _get_indicies --- pandas/core/groupby/groupby.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index ad213bf7a67d8..90f4523e4ee0f 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -580,6 +580,7 @@ def _get_indices(self, names): Safe get multiple indices, translate keys for datelike to underlying repr. """ + print(f"names={names}") def get_converter(s): # possibly convert to the actual key types @@ -623,16 +624,20 @@ def get_converter(s): converter = get_converter(index_sample) names = (converter(name) for name in names) - return [self.indices.get(name, []) for name in names] + res = [] + for name in names: + if isna(name): + res += [v for k, v in self.indices.items() if isna(k)] + else: + res += [self.indices.get(name, [])] + + return res def _get_index(self, name): """ Safe get index, translate keys for datelike to underlying repr. """ - if isna(name): - return [i for i, v in enumerate(self.indices) if isna(v)] - else: - return self._get_indices([name])[0] + return self._get_indices([name])[0] @cache_readonly def _selected_obj(self): From 86ce781aebb5bc3382e9eb0e38a829d828e8daa4 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Mon, 6 Jul 2020 20:28:36 +0000 Subject: [PATCH 15/41] removed debug statement --- pandas/core/groupby/groupby.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 90f4523e4ee0f..fafd8c018d0d6 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -580,7 +580,6 @@ def _get_indices(self, names): Safe get multiple indices, translate keys for datelike to underlying repr. """ - print(f"names={names}") def get_converter(s): # possibly convert to the actual key types From 7090e2d671d9ca779165f9401ebf7600c184bcd6 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Tue, 7 Jul 2020 04:52:45 +0000 Subject: [PATCH 16/41] fixed naming error in test --- pandas/tests/groupby/test_groupby_dropna.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 4e65760df50aa..f81f6a1bf70c6 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -179,7 +179,9 @@ def test_slice_groupby_then_transform(): gb_slice = gb["B"] res = gb["B"].transform(len) - expected = pd.Series([2, 2, 1, 1]) + expected = pd.Series(data=[2, 2, 1, 1], name="B") + print(f"res={res}") + print(f"expected={expected}") tm.assert_series_equal(res, expected) From 68d9b780f09b27a5bd0550c9a42217f1a349e887 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Tue, 7 Jul 2020 05:14:06 +0000 Subject: [PATCH 17/41] remove type coercion block --- pandas/core/groupby/generic.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index c50b753cf3293..9287abfa39a62 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -39,7 +39,6 @@ maybe_cast_result_dtype, maybe_convert_objects, maybe_downcast_numeric, - maybe_downcast_to_dtype, ) from pandas.core.dtypes.common import ( ensure_int64, @@ -47,7 +46,6 @@ is_bool, is_integer_dtype, is_interval_dtype, - is_numeric_dtype, is_object_dtype, is_scalar, needs_i8_conversion, @@ -547,13 +545,6 @@ def _transform_general( else: result = self.obj._constructor(dtype=np.float64) - # we will only try to coerce the result type if - # we have a numeric dtype, as these are *always* user-defined funcs - # the cython take a different path (and casting) - dtype = self._selected_obj.dtype - if is_numeric_dtype(dtype): - result = maybe_downcast_to_dtype(result, dtype) - result.name = self._selected_obj.name result.index = self._selected_obj.index return result From 9caba29c27ab2186795ef1108ebfb4e50b1939fd Mon Sep 17 00:00:00 2001 From: arw2019 Date: Tue, 7 Jul 2020 05:41:55 +0000 Subject: [PATCH 18/41] added missing values handing for _GroupBy.get_group method --- pandas/core/groupby/groupby.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index fafd8c018d0d6..efed0c6f21d63 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -54,6 +54,7 @@ class providing the base-class of operations. ) from pandas.core.dtypes.missing import isna, notna +import pandas as pd from pandas.core import nanops import pandas.core.algorithms as algorithms from pandas.core.arrays import Categorical, DatetimeArray @@ -636,7 +637,8 @@ def _get_index(self, name): """ Safe get index, translate keys for datelike to underlying repr. """ - return self._get_indices([name])[0] + res = self._get_indices([name]) + return res[0] if res else [] @cache_readonly def _selected_obj(self): @@ -814,7 +816,10 @@ def get_group(self, name, obj=None): if obj is None: obj = self._selected_obj - inds = self._get_index(name) + if pd.isna(name): + inds = self._get_index(np.nan) + else: + inds = self._get_index(name) if not len(inds): raise KeyError(name) From 8e3a46017146d1f200cec175b25c91d384f1b65f Mon Sep 17 00:00:00 2001 From: arw2019 Date: Tue, 7 Jul 2020 05:43:44 +0000 Subject: [PATCH 19/41] updated indicies for case dropna=True --- pandas/core/groupby/grouper.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index f4c7f76bc5dcc..fe56cc36630ab 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -561,7 +561,10 @@ def indices(self): # GH35014 res = values._reverse_indexer() - res[np.nan] = [i for i, v in enumerate(values) if pd.isna(v)] + if self.dropna is False: + nan_locs = [i for i, v in enumerate(values) if pd.isna(v)] + if nan_locs: + res[np.nan] = nan_locs return res @property From b2ed9573bed09db7c910c23afcd5971006ee2655 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Tue, 7 Jul 2020 05:45:49 +0000 Subject: [PATCH 20/41] cleaned up syntax --- pandas/core/groupby/grouper.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index fe56cc36630ab..81bf8a174e555 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -561,10 +561,8 @@ def indices(self): # GH35014 res = values._reverse_indexer() - if self.dropna is False: - nan_locs = [i for i, v in enumerate(values) if pd.isna(v)] - if nan_locs: - res[np.nan] = nan_locs + if self.dropna is False and any(pd.isna(v) for v in values): + res[np.nan] = [i for i, v in enumerate(values) if pd.isna(v)] return res @property From 6d871510bc5c9d2a2af69797611586cfc83a984e Mon Sep 17 00:00:00 2001 From: arw2019 Date: Tue, 7 Jul 2020 05:48:12 +0000 Subject: [PATCH 21/41] cleaned up syntax --- pandas/core/groupby/groupby.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index efed0c6f21d63..6151df9edf78a 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -816,10 +816,7 @@ def get_group(self, name, obj=None): if obj is None: obj = self._selected_obj - if pd.isna(name): - inds = self._get_index(np.nan) - else: - inds = self._get_index(name) + inds = self._get_index(np.nan) if pd.isna(name) else self._get_index(name) if not len(inds): raise KeyError(name) From c67e0726bbc7862478e8b9134ee1823bbbdb3daa Mon Sep 17 00:00:00 2001 From: arw2019 Date: Tue, 7 Jul 2020 06:07:23 +0000 Subject: [PATCH 22/41] removed print statements --- pandas/tests/groupby/test_groupby_dropna.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index f81f6a1bf70c6..3f158e99fab31 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -180,8 +180,6 @@ def test_slice_groupby_then_transform(): gb_slice = gb["B"] res = gb["B"].transform(len) expected = pd.Series(data=[2, 2, 1, 1], name="B") - print(f"res={res}") - print(f"expected={expected}") tm.assert_series_equal(res, expected) From ed338e1d12faee6a833db20444bca7e2544d9c0a Mon Sep 17 00:00:00 2001 From: arw2019 Date: Tue, 7 Jul 2020 06:36:42 +0000 Subject: [PATCH 23/41] _transform_general: add a check that we don't accidentally upcast --- pandas/core/groupby/generic.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 9287abfa39a62..474a591c44183 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -39,6 +39,7 @@ maybe_cast_result_dtype, maybe_convert_objects, maybe_downcast_numeric, + maybe_downcast_to_dtype, ) from pandas.core.dtypes.common import ( ensure_int64, @@ -46,6 +47,7 @@ is_bool, is_integer_dtype, is_interval_dtype, + is_numeric_dtype, is_object_dtype, is_scalar, needs_i8_conversion, @@ -545,6 +547,15 @@ def _transform_general( else: result = self.obj._constructor(dtype=np.float64) + # we will only try to coerce the result type if + # we have a numeric dtype, as these are *always* user-defined funcs + # the cython take a different path (and casting) + # make sure we don't accidentally upcast (GH35014) + types = ["bool", "int64", "float64"] + dtype = self._selected_obj.dtype + if is_numeric_dtype(dtype) and types.index(dtype) < types.index(result.dtype): + result = maybe_downcast_to_dtype(result, dtype) + result.name = self._selected_obj.name result.index = self._selected_obj.index return result From 9ffef454f5cd329abae476aec65948ea67f5dbdb Mon Sep 17 00:00:00 2001 From: arw2019 Date: Tue, 7 Jul 2020 06:51:39 +0000 Subject: [PATCH 24/41] _transform_general: add int32, float32 to upcasting check --- pandas/core/groupby/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 474a591c44183..9828c4bec2833 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -551,7 +551,7 @@ def _transform_general( # we have a numeric dtype, as these are *always* user-defined funcs # the cython take a different path (and casting) # make sure we don't accidentally upcast (GH35014) - types = ["bool", "int64", "float64"] + types = ["bool", "int32", "int64", "float32", "float64"] dtype = self._selected_obj.dtype if is_numeric_dtype(dtype) and types.index(dtype) < types.index(result.dtype): result = maybe_downcast_to_dtype(result, dtype) From 736ac699634217aecefc8cc159747765aecf7c69 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Tue, 7 Jul 2020 06:55:35 +0000 Subject: [PATCH 25/41] rewrite for loop as list comprehension --- pandas/core/groupby/groupby.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 6151df9edf78a..6216d63de2164 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -624,12 +624,12 @@ def get_converter(s): converter = get_converter(index_sample) names = (converter(name) for name in names) - res = [] - for name in names: - if isna(name): - res += [v for k, v in self.indices.items() if isna(k)] - else: - res += [self.indices.get(name, [])] + res = [ + [v for k, v in self.indices.items() if isna(k)] + if isna(name) + else self.indices.get(name, []) + for name in names + ] return res From 68902eba1a358f6a44d65aa318f886e6108ed4e7 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Tue, 7 Jul 2020 07:05:14 +0000 Subject: [PATCH 26/41] rewrote if statement as dict comp + ternary --- pandas/core/groupby/grouper.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 81bf8a174e555..e873b72ba9df8 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -560,10 +560,12 @@ def indices(self): values = Categorical(self.grouper) # GH35014 - res = values._reverse_indexer() - if self.dropna is False and any(pd.isna(v) for v in values): - res[np.nan] = [i for i, v in enumerate(values) if pd.isna(v)] - return res + reverse_indexer = values._reverse_indexer() + return ( + {**reverse_indexer, pd.NaT: [i for i, v in enumerate(values) if pd.isna(v)]} + if not self.dropna and any(pd.isna(v) for v in values) + else reverse_indexer + ) @property def codes(self) -> np.ndarray: From c6668f0913a8191f2a6f0799cdf7fdb1b2211dba Mon Sep 17 00:00:00 2001 From: arw2019 Date: Tue, 7 Jul 2020 07:37:36 +0000 Subject: [PATCH 27/41] fixed small bug in list comp in groupby/groupby.py --- pandas/core/groupby/groupby.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 6216d63de2164..e25473c28ae05 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -627,10 +627,11 @@ def get_converter(s): res = [ [v for k, v in self.indices.items() if isna(k)] if isna(name) - else self.indices.get(name, []) + else [self.indices.get(name, [])] for name in names ] + print(f"groupby.py res={res}") return res def _get_index(self, name): @@ -816,7 +817,7 @@ def get_group(self, name, obj=None): if obj is None: obj = self._selected_obj - inds = self._get_index(np.nan) if pd.isna(name) else self._get_index(name) + inds = self._get_index(pd.NaT) if pd.isna(name) else self._get_index(name) if not len(inds): raise KeyError(name) From 46949eace843d041066e6648044a978956e41384 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Tue, 7 Jul 2020 07:40:17 +0000 Subject: [PATCH 28/41] deleted debug statement in groupby/groupby.py --- pandas/core/groupby/groupby.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index e25473c28ae05..cee6f97be06b8 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -631,7 +631,6 @@ def get_converter(s): for name in names ] - print(f"groupby.py res={res}") return res def _get_index(self, name): From e16a4956b1628b96af9baa1246381650d7efabb6 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Tue, 7 Jul 2020 07:52:44 +0000 Subject: [PATCH 29/41] rewrite _get_index using next_iter to set default value --- pandas/core/groupby/groupby.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index cee6f97be06b8..adc635ae195d4 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -637,8 +637,7 @@ def _get_index(self, name): """ Safe get index, translate keys for datelike to underlying repr. """ - res = self._get_indices([name]) - return res[0] if res else [] + return next(iter(self._get_indices([name])), []) @cache_readonly def _selected_obj(self): From e00d71d651e70a1bdc3fb6a5405a8d11de013118 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Tue, 7 Jul 2020 15:30:21 +0000 Subject: [PATCH 30/41] update exepcted test_groupby_nat_exclude for new missing values handling --- pandas/tests/groupby/test_groupby.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 8c51ebf89f5c0..1a2eb51c2d095 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1260,13 +1260,15 @@ def test_groupby_nat_exclude(): } for k in grouped.indices: + if pd.isna(k): + continue # GH 35014 tm.assert_numpy_array_equal(grouped.indices[k], expected[k]) tm.assert_frame_equal(grouped.get_group(Timestamp("2013-01-01")), df.iloc[[1, 7]]) tm.assert_frame_equal(grouped.get_group(Timestamp("2013-02-01")), df.iloc[[3, 5]]) - with pytest.raises(KeyError, match=r"^NaT$"): - grouped.get_group(pd.NaT) + # GH35014 + grouped.get_group(pd.NaT) nan_df = DataFrame( {"nan": [np.nan, np.nan, np.nan], "nat": [pd.NaT, pd.NaT, pd.NaT]} @@ -1276,6 +1278,7 @@ def test_groupby_nat_exclude(): for key in ["nan", "nat"]: grouped = nan_df.groupby(key) + print(f"grouped.__dict__={grouped.__dict__}") assert grouped.groups == {} assert grouped.ngroups == 0 assert grouped.indices == {} From 6d5a4414c41d0f074127313b099aff14af3566fd Mon Sep 17 00:00:00 2001 From: arw2019 Date: Tue, 7 Jul 2020 15:46:14 +0000 Subject: [PATCH 31/41] remove print statement --- pandas/tests/groupby/test_groupby.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 1a2eb51c2d095..a2097ed723f9d 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1278,7 +1278,6 @@ def test_groupby_nat_exclude(): for key in ["nan", "nat"]: grouped = nan_df.groupby(key) - print(f"grouped.__dict__={grouped.__dict__}") assert grouped.groups == {} assert grouped.ngroups == 0 assert grouped.indices == {} From 9c24cf286fd9e24374bc38f23ab91dd1d3eea8e5 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Thu, 9 Jul 2020 17:16:40 +0000 Subject: [PATCH 32/41] reworked solution --- pandas/core/groupby/groupby.py | 15 +++++---------- pandas/core/groupby/grouper.py | 8 +++----- pandas/tests/groupby/test_groupby.py | 5 ++--- 3 files changed, 10 insertions(+), 18 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index adc635ae195d4..10ce5c92e2d14 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -624,21 +624,16 @@ def get_converter(s): converter = get_converter(index_sample) names = (converter(name) for name in names) - res = [ - [v for k, v in self.indices.items() if isna(k)] - if isna(name) - else [self.indices.get(name, [])] - for name in names - ] - - return res + return [self.indices.get(name, []) for name in names] def _get_index(self, name): """ Safe get index, translate keys for datelike to underlying repr. """ - return next(iter(self._get_indices([name])), []) - + if isna(name): + return self._get_indices([pd.NaT])[0] + else: + return self._get_indices([name])[0] @cache_readonly def _selected_obj(self): # Note: _selected_obj is always just `self.obj` for SeriesGroupBy diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index e873b72ba9df8..ea545772d09db 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -561,11 +561,9 @@ def indices(self): # GH35014 reverse_indexer = values._reverse_indexer() - return ( - {**reverse_indexer, pd.NaT: [i for i, v in enumerate(values) if pd.isna(v)]} - if not self.dropna and any(pd.isna(v) for v in values) - else reverse_indexer - ) + res = {**reverse_indexer, pd.NaT: np.array([i for i, v in enumerate(values) if pd.isna(v)])} if not self.dropna and any(pd.isna(v) for v in values) else reverse_indexer + print(f"grouper.py Grouping.indices returns {res}") + return res @property def codes(self) -> np.ndarray: diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index a2097ed723f9d..f1316f1a28df6 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1260,15 +1260,14 @@ def test_groupby_nat_exclude(): } for k in grouped.indices: - if pd.isna(k): - continue # GH 35014 tm.assert_numpy_array_equal(grouped.indices[k], expected[k]) tm.assert_frame_equal(grouped.get_group(Timestamp("2013-01-01")), df.iloc[[1, 7]]) tm.assert_frame_equal(grouped.get_group(Timestamp("2013-02-01")), df.iloc[[3, 5]]) # GH35014 - grouped.get_group(pd.NaT) + with pytest.raises(KeyError): + grouped.get_group(pd.NaT) nan_df = DataFrame( {"nan": [np.nan, np.nan, np.nan], "nat": [pd.NaT, pd.NaT, pd.NaT]} From 5637c3e4f781d1ddedd7766e4c2d25bfb7f7063f Mon Sep 17 00:00:00 2001 From: arw2019 Date: Thu, 9 Jul 2020 17:20:38 +0000 Subject: [PATCH 33/41] fixed PEP8 issue --- pandas/core/groupby/grouper.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index ea545772d09db..fc0a71f50ba23 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -561,9 +561,13 @@ def indices(self): # GH35014 reverse_indexer = values._reverse_indexer() - res = {**reverse_indexer, pd.NaT: np.array([i for i, v in enumerate(values) if pd.isna(v)])} if not self.dropna and any(pd.isna(v) for v in values) else reverse_indexer - print(f"grouper.py Grouping.indices returns {res}") - return res + if not self.dropna and any(pd.isna(v) for v in values): + return { + **reverse_indexer, + pd.NaT: np.array([i for i, v in enumerate(values) if pd.isna(v)]) + } + else: + return reverse_indexer @property def codes(self) -> np.ndarray: From 29c13f682467f313d2a724692c789a8497f7f023 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Thu, 9 Jul 2020 17:55:42 +0000 Subject: [PATCH 34/41] run pre-commit checks --- pandas/tests/groupby/test_groupby.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index f1316f1a28df6..8c51ebf89f5c0 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1265,8 +1265,7 @@ def test_groupby_nat_exclude(): tm.assert_frame_equal(grouped.get_group(Timestamp("2013-01-01")), df.iloc[[1, 7]]) tm.assert_frame_equal(grouped.get_group(Timestamp("2013-02-01")), df.iloc[[3, 5]]) - # GH35014 - with pytest.raises(KeyError): + with pytest.raises(KeyError, match=r"^NaT$"): grouped.get_group(pd.NaT) nan_df = DataFrame( From 2ea68af8ab84cd358a5f4c3ba405b48e2e244258 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Thu, 9 Jul 2020 18:25:52 +0000 Subject: [PATCH 35/41] styling fix --- pandas/core/groupby/grouper.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index fc0a71f50ba23..c1e096cf47b8f 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -563,11 +563,11 @@ def indices(self): reverse_indexer = values._reverse_indexer() if not self.dropna and any(pd.isna(v) for v in values): return { - **reverse_indexer, - pd.NaT: np.array([i for i, v in enumerate(values) if pd.isna(v)]) - } + **reverse_indexer, + pd.NaT: np.array([i for i, v in enumerate(values) if pd.isna(v)]), + } else: - return reverse_indexer + return reverse_indexer @property def codes(self) -> np.ndarray: From 3f5c6d6632e518cc72349b06ac8c8202b58afe6e Mon Sep 17 00:00:00 2001 From: arw2019 Date: Thu, 9 Jul 2020 19:10:14 +0000 Subject: [PATCH 36/41] update whatnew + styling improvements --- doc/source/whatsnew/v1.1.0.rst | 11 +++++++++++ pandas/core/groupby/groupby.py | 1 + 2 files changed, 12 insertions(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index a49b29d691692..bcf0a77770b8c 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1017,6 +1017,7 @@ Indexing Missing ^^^^^^^ +<<<<<<< HEAD - Calling :meth:`fillna` on an empty :class:`Series` now correctly returns a shallow copied object. The behaviour is now consistent with :class:`Index`, :class:`DataFrame` and a non-empty :class:`Series` (:issue:`32543`). - Bug in :meth:`Series.replace` when argument ``to_replace`` is of type dict/list and is used on a :class:`Series` containing ```` was raising a ``TypeError``. The method now handles this by ignoring ```` values when doing the comparison for the replacement (:issue:`32621`) - Bug in :meth:`~Series.any` and :meth:`~Series.all` incorrectly returning ```` for all ``False`` or all ``True`` values using the nulllable Boolean dtype and with ``skipna=False`` (:issue:`33253`) @@ -1025,6 +1026,16 @@ Missing - Bug in :meth:`DataFrame.interpolate` when called on a :class:`DataFrame` with column names of string type was throwing a ValueError. The method is now independent of the type of the column names (:issue:`33956`) - Passing :class:`NA` into a format string using format specs will now work. For example ``"{:.1f}".format(pd.NA)`` would previously raise a ``ValueError``, but will now return the string ``""`` (:issue:`34740`) - Bug in :meth:`Series.map` not raising on invalid ``na_action`` (:issue:`32815`) +======= +- Calling :meth:`fillna` on an empty Series now correctly returns a shallow copied object. The behaviour is now consistent with :class:`Index`, :class:`DataFrame` and a non-empty :class:`Series` (:issue:`32543`). +- Bug in :meth:`replace` when argument ``to_replace`` is of type dict/list and is used on a :class:`Series` containing ```` was raising a ``TypeError``. The method now handles this by ignoring ```` values when doing the comparison for the replacement (:issue:`32621`) +- Bug in :meth:`~Series.any` and :meth:`~Series.all` incorrectly returning ```` for all ``False`` or all ``True`` values using the nulllable boolean dtype and with ``skipna=False`` (:issue:`33253`) +- Clarified documentation on interpolate with method =akima. The ``der`` parameter must be scalar or None (:issue:`33426`) +- :meth:`DataFrame.interpolate` uses the correct axis convention now. Previously interpolating along columns lead to interpolation along indices and vice versa. Furthermore interpolating with methods ``pad``, ``ffill``, ``bfill`` and ``backfill`` are identical to using these methods with :meth:`fillna` (:issue:`12918`, :issue:`29146`) +- Bug in :meth:`DataFrame.interpolate` when called on a DataFrame with column names of string type was throwing a ValueError. The method is no independing of the type of column names (:issue:`33956`) +- passing :class:`NA` will into a format string using format specs will now work. For example ``"{:.1f}".format(pd.NA)`` would previously raise a ``ValueError``, but will now return the string ``""`` (:issue:`34740`) +- Bug in :meth:`SeriesGroupBy.transform` now correctly handles missing values for `dropna=False` (:issue:`35014`) +>>>>>>> 90e9b6a10... update whatnew + styling improvements MultiIndex ^^^^^^^^^^ diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 10ce5c92e2d14..2fb75a0fd7160 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -634,6 +634,7 @@ def _get_index(self, name): return self._get_indices([pd.NaT])[0] else: return self._get_indices([name])[0] + @cache_readonly def _selected_obj(self): # Note: _selected_obj is always just `self.obj` for SeriesGroupBy From 10147b069a62176bc7bcda8b0a6ab6b61cd83e7d Mon Sep 17 00:00:00 2001 From: arw2019 Date: Wed, 22 Jul 2020 16:39:07 +0000 Subject: [PATCH 37/41] move NaN handling to _get_indicies --- pandas/core/groupby/groupby.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 2fb75a0fd7160..85be3f194680d 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -624,16 +624,18 @@ def get_converter(s): converter = get_converter(index_sample) names = (converter(name) for name in names) - return [self.indices.get(name, []) for name in names] + return [ + self.indices.get(name, []) + if not isna(name) + else self.indices.get(pd.NaT, []) + for name in names + ] def _get_index(self, name): """ Safe get index, translate keys for datelike to underlying repr. """ - if isna(name): - return self._get_indices([pd.NaT])[0] - else: - return self._get_indices([name])[0] + return self._get_indices([name])[0] @cache_readonly def _selected_obj(self): From c9f6f7e3991ad0dc1f65f83f27a9338b128af07f Mon Sep 17 00:00:00 2001 From: arw2019 Date: Wed, 22 Jul 2020 16:48:34 +0000 Subject: [PATCH 38/41] removed 1.1 release note --- doc/source/whatsnew/v1.1.0.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index bcf0a77770b8c..ffecb050d85ed 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1034,8 +1034,11 @@ Missing - :meth:`DataFrame.interpolate` uses the correct axis convention now. Previously interpolating along columns lead to interpolation along indices and vice versa. Furthermore interpolating with methods ``pad``, ``ffill``, ``bfill`` and ``backfill`` are identical to using these methods with :meth:`fillna` (:issue:`12918`, :issue:`29146`) - Bug in :meth:`DataFrame.interpolate` when called on a DataFrame with column names of string type was throwing a ValueError. The method is no independing of the type of column names (:issue:`33956`) - passing :class:`NA` will into a format string using format specs will now work. For example ``"{:.1f}".format(pd.NA)`` would previously raise a ``ValueError``, but will now return the string ``""`` (:issue:`34740`) +<<<<<<< HEAD - Bug in :meth:`SeriesGroupBy.transform` now correctly handles missing values for `dropna=False` (:issue:`35014`) >>>>>>> 90e9b6a10... update whatnew + styling improvements +======= +>>>>>>> 8c11b6072... removed 1.1 release note MultiIndex ^^^^^^^^^^ From 9b536dd2d281e564474a8de91b38f672cea3c4f5 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Wed, 22 Jul 2020 17:10:19 +0000 Subject: [PATCH 39/41] redo solution - modify SeriesGroupBy._transform_general only --- doc/source/whatsnew/v1.1.0.rst | 14 -------------- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/_libs/tslib.pyx | 20 -------------------- pandas/_libs/tslibs/conversion.pyx | 4 ---- pandas/core/groupby/generic.py | 20 +++++++++----------- pandas/core/groupby/groupby.py | 10 ++-------- pandas/core/groupby/grouper.py | 12 +----------- 7 files changed, 13 insertions(+), 69 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index ffecb050d85ed..a49b29d691692 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1017,7 +1017,6 @@ Indexing Missing ^^^^^^^ -<<<<<<< HEAD - Calling :meth:`fillna` on an empty :class:`Series` now correctly returns a shallow copied object. The behaviour is now consistent with :class:`Index`, :class:`DataFrame` and a non-empty :class:`Series` (:issue:`32543`). - Bug in :meth:`Series.replace` when argument ``to_replace`` is of type dict/list and is used on a :class:`Series` containing ```` was raising a ``TypeError``. The method now handles this by ignoring ```` values when doing the comparison for the replacement (:issue:`32621`) - Bug in :meth:`~Series.any` and :meth:`~Series.all` incorrectly returning ```` for all ``False`` or all ``True`` values using the nulllable Boolean dtype and with ``skipna=False`` (:issue:`33253`) @@ -1026,19 +1025,6 @@ Missing - Bug in :meth:`DataFrame.interpolate` when called on a :class:`DataFrame` with column names of string type was throwing a ValueError. The method is now independent of the type of the column names (:issue:`33956`) - Passing :class:`NA` into a format string using format specs will now work. For example ``"{:.1f}".format(pd.NA)`` would previously raise a ``ValueError``, but will now return the string ``""`` (:issue:`34740`) - Bug in :meth:`Series.map` not raising on invalid ``na_action`` (:issue:`32815`) -======= -- Calling :meth:`fillna` on an empty Series now correctly returns a shallow copied object. The behaviour is now consistent with :class:`Index`, :class:`DataFrame` and a non-empty :class:`Series` (:issue:`32543`). -- Bug in :meth:`replace` when argument ``to_replace`` is of type dict/list and is used on a :class:`Series` containing ```` was raising a ``TypeError``. The method now handles this by ignoring ```` values when doing the comparison for the replacement (:issue:`32621`) -- Bug in :meth:`~Series.any` and :meth:`~Series.all` incorrectly returning ```` for all ``False`` or all ``True`` values using the nulllable boolean dtype and with ``skipna=False`` (:issue:`33253`) -- Clarified documentation on interpolate with method =akima. The ``der`` parameter must be scalar or None (:issue:`33426`) -- :meth:`DataFrame.interpolate` uses the correct axis convention now. Previously interpolating along columns lead to interpolation along indices and vice versa. Furthermore interpolating with methods ``pad``, ``ffill``, ``bfill`` and ``backfill`` are identical to using these methods with :meth:`fillna` (:issue:`12918`, :issue:`29146`) -- Bug in :meth:`DataFrame.interpolate` when called on a DataFrame with column names of string type was throwing a ValueError. The method is no independing of the type of column names (:issue:`33956`) -- passing :class:`NA` will into a format string using format specs will now work. For example ``"{:.1f}".format(pd.NA)`` would previously raise a ``ValueError``, but will now return the string ``""`` (:issue:`34740`) -<<<<<<< HEAD -- Bug in :meth:`SeriesGroupBy.transform` now correctly handles missing values for `dropna=False` (:issue:`35014`) ->>>>>>> 90e9b6a10... update whatnew + styling improvements -======= ->>>>>>> 8c11b6072... removed 1.1 release note MultiIndex ^^^^^^^^^^ diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 6f173cb2fce12..59a968edcbb90 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -109,7 +109,7 @@ Indexing Missing ^^^^^^^ -- +- Bug in :meth:`SeriesGroupBy.transform` now correctly handles missing values for `dropna=False` (:issue:`35014`) - MultiIndex diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 4976159eceb0d..e4128af62d06d 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -246,31 +246,11 @@ def array_with_unit_to_datetime( if ((fvalues < Timestamp.min.value).any() or (fvalues > Timestamp.max.value).any()): raise OutOfBoundsDatetime(f"cannot convert input with unit '{unit}'") -<<<<<<< HEAD -<<<<<<< HEAD -======= ->>>>>>> 7df44d10f... revert accidental changes result = (iresult * m).astype('M8[ns]') iresult = result.view('i8') iresult[mask] = NPY_NAT return result, tz -<<<<<<< HEAD -======= - # GH20445 - if values.dtype.kind == 'i': - result = (iresult * m).astype('M8[ns]') - iresult = result.view('i8') - iresult[mask] = NPY_NAT - return result, tz - elif values.dtype.kind == 'f': - result = (fresult * m_as_float).astype('M8[ns]') - fresult = result.view('f8') - fresult[mask] = NPY_NAT - return result, tz ->>>>>>> f1ae8f562... _libs/tslib.pyx added comments -======= ->>>>>>> 7df44d10f... revert accidental changes result = np.empty(n, dtype='M8[ns]') iresult = result.view('i8') diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 4eb16c4efccf0..adf1dfbc1ac72 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -120,11 +120,7 @@ cdef inline int64_t cast_from_unit(object ts, str unit) except? -1: return (base * m) + (frac * m) -<<<<<<< HEAD cpdef inline (int64_t, int) precision_from_unit(str unit): -======= -cpdef inline object precision_from_unit(str unit): ->>>>>>> 6b9d4de82... revert changes """ Return a casting of the unit represented to nanoseconds + the precision to round the fractional part. diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 9828c4bec2833..5613588d0d2d8 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -39,7 +39,6 @@ maybe_cast_result_dtype, maybe_convert_objects, maybe_downcast_numeric, - maybe_downcast_to_dtype, ) from pandas.core.dtypes.common import ( ensure_int64, @@ -535,26 +534,25 @@ def _transform_general( if isinstance(res, (ABCDataFrame, ABCSeries)): res = res._values - indexer = self._get_index(name) - ser = klass(res, indexer) - results.append(ser) + results.append(klass(res, index=group.index)) # check for empty "results" to avoid concat ValueError if results: from pandas.core.reshape.concat import concat - result = concat(results).sort_index() + concatenated = concat(results) + result = self._set_result_index_ordered(concatenated) else: result = self.obj._constructor(dtype=np.float64) - # we will only try to coerce the result type if # we have a numeric dtype, as these are *always* user-defined funcs # the cython take a different path (and casting) - # make sure we don't accidentally upcast (GH35014) - types = ["bool", "int32", "int64", "float32", "float64"] - dtype = self._selected_obj.dtype - if is_numeric_dtype(dtype) and types.index(dtype) < types.index(result.dtype): - result = maybe_downcast_to_dtype(result, dtype) + if is_numeric_dtype(result.dtype): + common_dtype = np.find_common_type( + [self._selected_obj.dtype, result.dtype], [] + ) + if common_dtype is result.dtype: + result = maybe_downcast_numeric(result, self._selected_obj.dtype) result.name = self._selected_obj.name result.index = self._selected_obj.index diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 85be3f194680d..6c8a780859939 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -54,7 +54,6 @@ class providing the base-class of operations. ) from pandas.core.dtypes.missing import isna, notna -import pandas as pd from pandas.core import nanops import pandas.core.algorithms as algorithms from pandas.core.arrays import Categorical, DatetimeArray @@ -624,12 +623,7 @@ def get_converter(s): converter = get_converter(index_sample) names = (converter(name) for name in names) - return [ - self.indices.get(name, []) - if not isna(name) - else self.indices.get(pd.NaT, []) - for name in names - ] + return [self.indices.get(name, []) for name in names] def _get_index(self, name): """ @@ -813,7 +807,7 @@ def get_group(self, name, obj=None): if obj is None: obj = self._selected_obj - inds = self._get_index(pd.NaT) if pd.isna(name) else self._get_index(name) + inds = self._get_index(name) if not len(inds): raise KeyError(name) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index c1e096cf47b8f..8239a792c65dd 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -20,7 +20,6 @@ ) from pandas.core.dtypes.generic import ABCSeries -import pandas as pd import pandas.core.algorithms as algorithms from pandas.core.arrays import Categorical, ExtensionArray import pandas.core.common as com @@ -558,16 +557,7 @@ def indices(self): return self.grouper.indices values = Categorical(self.grouper) - - # GH35014 - reverse_indexer = values._reverse_indexer() - if not self.dropna and any(pd.isna(v) for v in values): - return { - **reverse_indexer, - pd.NaT: np.array([i for i, v in enumerate(values) if pd.isna(v)]), - } - else: - return reverse_indexer + return values._reverse_indexer() @property def codes(self) -> np.ndarray: From 8d991d5e8c99e1ab9075272d2a2eb0e6af4743ce Mon Sep 17 00:00:00 2001 From: arw2019 Date: Fri, 7 Aug 2020 18:55:36 +0000 Subject: [PATCH 40/41] rewrite case + rewrite tests w fixtures --- pandas/core/groupby/generic.py | 6 ++-- pandas/tests/groupby/test_groupby_dropna.py | 33 ++++++++++++++++----- 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index bada7060fb515..1fed193dba02c 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -35,6 +35,7 @@ from pandas.util._decorators import Appender, Substitution, doc from pandas.core.dtypes.cast import ( + find_common_type, maybe_cast_result, maybe_cast_result_dtype, maybe_convert_objects, @@ -512,7 +513,6 @@ def _transform_general( """ Transform with a non-str `func`. """ - if maybe_use_numba(engine): numba_func, cache_key = generate_numba_func( func, engine_kwargs, kwargs, "groupby_transform" @@ -548,9 +548,7 @@ def _transform_general( # we have a numeric dtype, as these are *always* user-defined funcs # the cython take a different path (and casting) if is_numeric_dtype(result.dtype): - common_dtype = np.find_common_type( - [self._selected_obj.dtype, result.dtype], [] - ) + common_dtype = find_common_type([self._selected_obj.dtype, result.dtype]) if common_dtype is result.dtype: result = maybe_downcast_numeric(result, self._selected_obj.dtype) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 3f158e99fab31..7e76ea9c0e081 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -162,25 +162,42 @@ def test_groupby_dropna_series_by(dropna, expected): tm.assert_series_equal(result, expected) -def test_slice_groupby_then_transform(): +@pytest.mark.parametrize( + "dropna,df_expected,s_expected", + [ + pytest.param( + True, + pd.DataFrame({"B": [2, 2, 1]}), + pd.Series(data=[2, 2, 1], name="B"), + marks=pytest.mark.xfail( + raises=ValueError, + msg="Length mismatch: Expected axis has 3 elements, " + "new values have 4 elements", + ), + ), + ( + False, + pd.DataFrame({"B": [2, 2, 1, 1]}), + pd.Series(data=[2, 2, 1, 1], name="B"), + ), + ], +) +def test_slice_groupby_then_transform(dropna, df_expected, s_expected): # GH35014 df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]}) - gb = df.groupby("A", dropna=False) + gb = df.groupby("A", dropna=dropna) res = gb.transform(len) - expected = pd.DataFrame({"B": [2, 2, 1, 1]}) - tm.assert_frame_equal(res, expected) + tm.assert_frame_equal(res, df_expected) gb_slice = gb[["B"]] res = gb_slice.transform(len) - expected = pd.DataFrame({"B": [2, 2, 1, 1]}) - tm.assert_frame_equal(res, expected) + tm.assert_frame_equal(res, df_expected) gb_slice = gb["B"] res = gb["B"].transform(len) - expected = pd.Series(data=[2, 2, 1, 1], name="B") - tm.assert_series_equal(res, expected) + tm.assert_series_equal(res, s_expected) @pytest.mark.parametrize( From 570ce21afcd82579a6a02102f47a0c227f0204db Mon Sep 17 00:00:00 2001 From: arw2019 Date: Fri, 7 Aug 2020 19:25:43 +0000 Subject: [PATCH 41/41] fix mypy error --- pandas/tests/groupby/test_groupby_dropna.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 7e76ea9c0e081..adf62c4723526 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -169,11 +169,7 @@ def test_groupby_dropna_series_by(dropna, expected): True, pd.DataFrame({"B": [2, 2, 1]}), pd.Series(data=[2, 2, 1], name="B"), - marks=pytest.mark.xfail( - raises=ValueError, - msg="Length mismatch: Expected axis has 3 elements, " - "new values have 4 elements", - ), + marks=pytest.mark.xfail(raises=ValueError), ), ( False,