From 8def64931af8a01f4af50d79a8d628fe3e63f00c Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 21 Apr 2018 18:26:27 -0400 Subject: [PATCH] TST: split test_groupby.py (#20781) closes #20696 --- .../tests/groupby/aggregate/test_aggregate.py | 71 +- pandas/tests/groupby/common.py | 62 - pandas/tests/groupby/conftest.py | 77 + pandas/tests/groupby/test_apply.py | 517 ++ pandas/tests/groupby/test_categorical.py | 1415 ++--- pandas/tests/groupby/test_filters.py | 1180 ++--- pandas/tests/groupby/test_function.py | 1120 ++++ pandas/tests/groupby/test_functional.py | 372 -- pandas/tests/groupby/test_groupby.py | 4606 ++++++----------- pandas/tests/groupby/test_grouping.py | 115 +- pandas/tests/groupby/test_nth.py | 618 +-- pandas/tests/groupby/test_rank.py | 254 + pandas/tests/groupby/test_transform.py | 1464 +++--- 13 files changed, 5983 insertions(+), 5888 deletions(-) delete mode 100644 pandas/tests/groupby/common.py create mode 100644 pandas/tests/groupby/conftest.py create mode 100644 pandas/tests/groupby/test_apply.py create mode 100644 pandas/tests/groupby/test_function.py delete mode 100644 pandas/tests/groupby/test_functional.py create mode 100644 pandas/tests/groupby/test_rank.py diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index d85719d328ff2..b2f18e11de8ee 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -15,51 +15,6 @@ import pandas.util.testing as tm -@pytest.fixture -def ts(): - return tm.makeTimeSeries() - - -@pytest.fixture -def tsframe(): - return DataFrame(tm.getTimeSeriesData()) - - -@pytest.fixture -def df(): - return DataFrame( - {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), - 'D': np.random.randn(8)}) - - -@pytest.fixture -def mframe(): - index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], - ['one', 'two', 'three']], - labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['first', 'second']) - return DataFrame(np.random.randn(10, 3), - index=index, - columns=['A', 'B', 'C']) - - -@pytest.fixture -def three_group(): - return DataFrame( - {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', - 'bar', 'bar', 'foo', 'foo', 'foo'], - 'B': ['one', 'one', 'one', 'two', 'one', 'one', - 'one', 'two', 'two', 'two', 'one'], - 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', - 'shiny', 'dull', 'shiny', 'shiny', 'shiny'], - 'D': np.random.randn(11), - 'E': np.random.randn(11), - 'F': np.random.randn(11)}) - - def test_agg_regression1(tsframe): grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month]) result = grouped.agg(np.mean) @@ -87,6 +42,32 @@ def test_agg_ser_multi_key(df): tm.assert_series_equal(results, expected) +def test_groupby_aggregation_mixed_dtype(): + + # GH 6212 + expected = DataFrame({ + 'v1': [5, 5, 7, np.nan, 3, 3, 4, 1], + 'v2': [55, 55, 77, np.nan, 33, 33, 44, 11]}, + index=MultiIndex.from_tuples([(1, 95), (1, 99), (2, 95), (2, 99), + ('big', 'damp'), + ('blue', 'dry'), + ('red', 'red'), ('red', 'wet')], + names=['by1', 'by2'])) + + df = DataFrame({ + 'v1': [1, 3, 5, 7, 8, 3, 5, np.nan, 4, 5, 7, 9], + 'v2': [11, 33, 55, 77, 88, 33, 55, np.nan, 44, 55, 77, 99], + 'by1': ["red", "blue", 1, 2, np.nan, "big", 1, 2, "red", 1, np.nan, + 12], + 'by2': ["wet", "dry", 99, 95, np.nan, "damp", 95, 99, "red", 99, + np.nan, np.nan] + }) + + g = df.groupby(['by1', 'by2']) + result = 
g[['v1', 'v2']].mean() + tm.assert_frame_equal(result, expected) + + def test_agg_apply_corner(ts, tsframe): # nothing to group, all NA grouped = ts.groupby(ts * np.nan) diff --git a/pandas/tests/groupby/common.py b/pandas/tests/groupby/common.py deleted file mode 100644 index 3e99e8211b4f8..0000000000000 --- a/pandas/tests/groupby/common.py +++ /dev/null @@ -1,62 +0,0 @@ -""" Base setup """ - -import pytest -import numpy as np -from pandas.util import testing as tm -from pandas import DataFrame, MultiIndex - - -@pytest.fixture -def mframe(): - index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', - 'three']], - labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['first', 'second']) - return DataFrame(np.random.randn(10, 3), index=index, - columns=['A', 'B', 'C']) - - -@pytest.fixture -def df(): - return DataFrame( - {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), - 'D': np.random.randn(8)}) - - -class MixIn(object): - - def setup_method(self, method): - self.ts = tm.makeTimeSeries() - - self.seriesd = tm.getSeriesData() - self.tsd = tm.getTimeSeriesData() - self.frame = DataFrame(self.seriesd) - self.tsframe = DataFrame(self.tsd) - - self.df = df() - self.df_mixed_floats = DataFrame( - {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), - 'D': np.array( - np.random.randn(8), dtype='float32')}) - - self.mframe = mframe() - - self.three_group = DataFrame( - {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', - 'foo', 'foo', 'foo'], - 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two', - 'two', 'two', 'one'], - 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny', - 'dull', 'shiny', 'shiny', 'shiny'], - 'D': np.random.randn(11), - 'E': np.random.randn(11), - 'F': np.random.randn(11)}) - - -def assert_fp_equal(a, b): - assert (np.abs(a - b) < 1e-12).all() diff --git a/pandas/tests/groupby/conftest.py b/pandas/tests/groupby/conftest.py new file mode 100644 index 0000000000000..877aa835ac6f5 --- /dev/null +++ b/pandas/tests/groupby/conftest.py @@ -0,0 +1,77 @@ +import pytest +import numpy as np +from pandas import MultiIndex, DataFrame +from pandas.util import testing as tm + + +@pytest.fixture +def mframe(): + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', + 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) + return DataFrame(np.random.randn(10, 3), index=index, + columns=['A', 'B', 'C']) + + +@pytest.fixture +def df(): + return DataFrame( + {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], + 'C': np.random.randn(8), + 'D': np.random.randn(8)}) + + +@pytest.fixture +def ts(): + return tm.makeTimeSeries() + + +@pytest.fixture +def seriesd(): + return tm.getSeriesData() + + +@pytest.fixture +def tsd(): + return tm.getTimeSeriesData() + + +@pytest.fixture +def frame(seriesd): + return DataFrame(seriesd) + + +@pytest.fixture +def tsframe(tsd): + return DataFrame(tsd) + + +@pytest.fixture +def df_mixed_floats(): + return DataFrame({'A': ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'three', + 'two', 'two', 'one', 'three'], + 'C': np.random.randn(8), + 'D': np.array( + np.random.randn(8), 
dtype='float32')}) + + +@pytest.fixture +def three_group(): + return DataFrame({'A': ['foo', 'foo', 'foo', + 'foo', 'bar', 'bar', + 'bar', 'bar', + 'foo', 'foo', 'foo'], + 'B': ['one', 'one', 'one', + 'two', 'one', 'one', 'one', 'two', + 'two', 'two', 'one'], + 'C': ['dull', 'dull', 'shiny', + 'dull', 'dull', 'shiny', 'shiny', + 'dull', 'shiny', 'shiny', 'shiny'], + 'D': np.random.randn(11), + 'E': np.random.randn(11), + 'F': np.random.randn(11)}) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py new file mode 100644 index 0000000000000..5ca10fe1af9d1 --- /dev/null +++ b/pandas/tests/groupby/test_apply.py @@ -0,0 +1,517 @@ +import pytest +import numpy as np +import pandas as pd +from datetime import datetime +from pandas.util import testing as tm +from pandas import DataFrame, MultiIndex, compat, Series, bdate_range, Index + + +def test_apply_issues(): + # GH 5788 + + s = """2011.05.16,00:00,1.40893 +2011.05.16,01:00,1.40760 +2011.05.16,02:00,1.40750 +2011.05.16,03:00,1.40649 +2011.05.17,02:00,1.40893 +2011.05.17,03:00,1.40760 +2011.05.17,04:00,1.40750 +2011.05.17,05:00,1.40649 +2011.05.18,02:00,1.40893 +2011.05.18,03:00,1.40760 +2011.05.18,04:00,1.40750 +2011.05.18,05:00,1.40649""" + + df = pd.read_csv( + compat.StringIO(s), header=None, names=['date', 'time', 'value'], + parse_dates=[['date', 'time']]) + df = df.set_index('date_time') + + expected = df.groupby(df.index.date).idxmax() + result = df.groupby(df.index.date).apply(lambda x: x.idxmax()) + tm.assert_frame_equal(result, expected) + + # GH 5789 + # don't auto coerce dates + df = pd.read_csv( + compat.StringIO(s), header=None, names=['date', 'time', 'value']) + exp_idx = pd.Index( + ['2011.05.16', '2011.05.17', '2011.05.18' + ], dtype=object, name='date') + expected = Series(['00:00', '02:00', '02:00'], index=exp_idx) + result = df.groupby('date').apply( + lambda x: x['time'][x['value'].idxmax()]) + tm.assert_series_equal(result, expected) + + +def test_apply_trivial(): + # GH 20066 + # trivial apply: ignore input and return a constant dataframe. + df = pd.DataFrame({'key': ['a', 'a', 'b', 'b', 'a'], + 'data': [1.0, 2.0, 3.0, 4.0, 5.0]}, + columns=['key', 'data']) + expected = pd.concat([df.iloc[1:], df.iloc[1:]], + axis=1, keys=['float64', 'object']) + result = df.groupby([str(x) for x in df.dtypes], + axis=1).apply(lambda x: df.iloc[1:]) + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.xfail(reason=("GH 20066; function passed into apply " + "returns a DataFrame with the same index " + "as the one to create GroupBy object.")) +def test_apply_trivial_fail(): + # GH 20066 + # trivial apply fails if the constant dataframe has the same index + # with the one used to create GroupBy object. 
+    df = pd.DataFrame({'key': ['a', 'a', 'b', 'b', 'a'],
+                       'data': [1.0, 2.0, 3.0, 4.0, 5.0]},
+                      columns=['key', 'data'])
+    expected = pd.concat([df, df],
+                         axis=1, keys=['float64', 'object'])
+    result = df.groupby([str(x) for x in df.dtypes],
+                        axis=1).apply(lambda x: df)
+
+    tm.assert_frame_equal(result, expected)
+
+
+def test_fast_apply():
+    # make sure that fast apply is correctly called
+    # rather than raising any kind of error;
+    # otherwise the python path will be called,
+    # which slows things down
+    N = 1000
+    labels = np.random.randint(0, 2000, size=N)
+    labels2 = np.random.randint(0, 3, size=N)
+    df = DataFrame({'key': labels,
+                    'key2': labels2,
+                    'value1': np.random.randn(N),
+                    'value2': ['foo', 'bar', 'baz', 'qux'] * (N // 4)})
+
+    def f(g):
+        return 1
+
+    g = df.groupby(['key', 'key2'])
+
+    grouper = g.grouper
+
+    splitter = grouper._get_splitter(g._selected_obj, axis=g.axis)
+    group_keys = grouper._get_group_keys()
+
+    values, mutated = splitter.fast_apply(f, group_keys)
+    assert not mutated
+
+
+def test_apply_with_mixed_dtype():
+    # GH3480, apply with mixed dtype on axis=1 breaks in 0.11
+    df = DataFrame({'foo1': np.random.randn(6),
+                    'foo2': ['one', 'two', 'two', 'three', 'one', 'two']})
+    result = df.apply(lambda x: x, axis=1)
+    tm.assert_series_equal(df.get_dtype_counts(), result.get_dtype_counts())
+
+    # GH 3610 incorrect dtype conversion with as_index=False
+    df = DataFrame({"c1": [1, 2, 6, 6, 8]})
+    df["c2"] = df.c1 / 2.0
+    result1 = df.groupby("c2").mean().reset_index().c2
+    result2 = df.groupby("c2", as_index=False).mean().c2
+    tm.assert_series_equal(result1, result2)
+
+
+def test_groupby_as_index_apply(df):
+    # GH #4648 and #3417
+    df = DataFrame({'item_id': ['b', 'b', 'a', 'c', 'a', 'b'],
+                    'user_id': [1, 2, 1, 1, 3, 1],
+                    'time': range(6)})
+
+    g_as = df.groupby('user_id', as_index=True)
+    g_not_as = df.groupby('user_id', as_index=False)
+
+    res_as = g_as.head(2).index
+    res_not_as = g_not_as.head(2).index
+    exp = Index([0, 1, 2, 4])
+    tm.assert_index_equal(res_as, exp)
+    tm.assert_index_equal(res_not_as, exp)
+
+    res_as_apply = g_as.apply(lambda x: x.head(2)).index
+    res_not_as_apply = g_not_as.apply(lambda x: x.head(2)).index
+
+    # apply doesn't maintain the original ordering
+    # changed in GH5610 as the as_index=False returns a MI here
+    exp_not_as_apply = MultiIndex.from_tuples([(0, 0), (0, 2), (1, 1),
+                                               (2, 4)])
+    tp = [(1, 0), (1, 2), (2, 1), (3, 4)]
+    exp_as_apply = MultiIndex.from_tuples(tp, names=['user_id', None])
+
+    tm.assert_index_equal(res_as_apply, exp_as_apply)
+    tm.assert_index_equal(res_not_as_apply, exp_not_as_apply)
+
+    ind = Index(list('abcde'))
+    df = DataFrame([[1, 2], [2, 3], [1, 4], [1, 5], [2, 6]], index=ind)
+    res = df.groupby(0, as_index=False).apply(lambda x: x).index
+    tm.assert_index_equal(res, ind)
+
+
+def test_apply_concat_preserve_names(three_group):
+    grouped = three_group.groupby(['A', 'B'])
+
+    def desc(group):
+        result = group.describe()
+        result.index.name = 'stat'
+        return result
+
+    def desc2(group):
+        result = group.describe()
+        result.index.name = 'stat'
+        result = result[:len(group)]
+        # weirdo
+        return result
+
+    def desc3(group):
+        result = group.describe()
+
+        # names are different
+        result.index.name = 'stat_%d' % len(group)
+
+        result = result[:len(group)]
+        # weirdo
+        return result
+
+    result = grouped.apply(desc)
+    assert result.index.names == ('A', 'B', 'stat')
+
+    result2 = grouped.apply(desc2)
+    assert result2.index.names == ('A', 'B', 'stat')
+
+    result3 = grouped.apply(desc3)
+    assert result3.index.names 
== ('A', 'B', None) + + +def test_apply_series_to_frame(): + def f(piece): + with np.errstate(invalid='ignore'): + logged = np.log(piece) + return DataFrame({'value': piece, + 'demeaned': piece - piece.mean(), + 'logged': logged}) + + dr = bdate_range('1/1/2000', periods=100) + ts = Series(np.random.randn(100), index=dr) + + grouped = ts.groupby(lambda x: x.month) + result = grouped.apply(f) + + assert isinstance(result, DataFrame) + tm.assert_index_equal(result.index, ts.index) + + +def test_apply_series_yield_constant(df): + result = df.groupby(['A', 'B'])['C'].apply(len) + assert result.index.names[:2] == ('A', 'B') + + +def test_apply_frame_yield_constant(df): + # GH13568 + result = df.groupby(['A', 'B']).apply(len) + assert isinstance(result, Series) + assert result.name is None + + result = df.groupby(['A', 'B'])[['C', 'D']].apply(len) + assert isinstance(result, Series) + assert result.name is None + + +def test_apply_frame_to_series(df): + grouped = df.groupby(['A', 'B']) + result = grouped.apply(len) + expected = grouped.count()['C'] + tm.assert_index_equal(result.index, expected.index) + tm.assert_numpy_array_equal(result.values, expected.values) + + +def test_apply_frame_concat_series(): + def trans(group): + return group.groupby('B')['C'].sum().sort_values()[:2] + + def trans2(group): + grouped = group.groupby(df.reindex(group.index)['B']) + return grouped.sum().sort_values()[:2] + + df = DataFrame({'A': np.random.randint(0, 5, 1000), + 'B': np.random.randint(0, 5, 1000), + 'C': np.random.randn(1000)}) + + result = df.groupby('A').apply(trans) + exp = df.groupby('A')['C'].apply(trans2) + tm.assert_series_equal(result, exp, check_names=False) + assert result.name == 'C' + + +def test_apply_transform(ts): + grouped = ts.groupby(lambda x: x.month) + result = grouped.apply(lambda x: x * 2) + expected = grouped.transform(lambda x: x * 2) + tm.assert_series_equal(result, expected) + + +def test_apply_multikey_corner(tsframe): + grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month]) + + def f(group): + return group.sort_values('A')[-5:] + + result = grouped.apply(f) + for key, group in grouped: + tm.assert_frame_equal(result.loc[key], f(group)) + + +def test_apply_chunk_view(): + # Low level tinkering could be unsafe, make sure not + df = DataFrame({'key': [1, 1, 1, 2, 2, 2, 3, 3, 3], + 'value': compat.lrange(9)}) + + # return view + f = lambda x: x[:2] + + result = df.groupby('key', group_keys=False).apply(f) + expected = df.take([0, 1, 3, 4, 6, 7]) + tm.assert_frame_equal(result, expected) + + +def test_apply_no_name_column_conflict(): + df = DataFrame({'name': [1, 1, 1, 1, 1, 1, 2, 2, 2, 2], + 'name2': [0, 0, 0, 1, 1, 1, 0, 0, 1, 1], + 'value': compat.lrange(10)[::-1]}) + + # it works! 
#2605 + grouped = df.groupby(['name', 'name2']) + grouped.apply(lambda x: x.sort_values('value', inplace=True)) + + +def test_apply_typecast_fail(): + df = DataFrame({'d': [1., 1., 1., 2., 2., 2.], + 'c': np.tile( + ['a', 'b', 'c'], 2), + 'v': np.arange(1., 7.)}) + + def f(group): + v = group['v'] + group['v2'] = (v - v.min()) / (v.max() - v.min()) + return group + + result = df.groupby('d').apply(f) + + expected = df.copy() + expected['v2'] = np.tile([0., 0.5, 1], 2) + + tm.assert_frame_equal(result, expected) + + +def test_apply_multiindex_fail(): + index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3] + ]) + df = DataFrame({'d': [1., 1., 1., 2., 2., 2.], + 'c': np.tile(['a', 'b', 'c'], 2), + 'v': np.arange(1., 7.)}, index=index) + + def f(group): + v = group['v'] + group['v2'] = (v - v.min()) / (v.max() - v.min()) + return group + + result = df.groupby('d').apply(f) + + expected = df.copy() + expected['v2'] = np.tile([0., 0.5, 1], 2) + + tm.assert_frame_equal(result, expected) + + +def test_apply_corner(tsframe): + result = tsframe.groupby(lambda x: x.year).apply(lambda x: x * 2) + expected = tsframe * 2 + tm.assert_frame_equal(result, expected) + + +def test_apply_without_copy(): + # GH 5545 + # returning a non-copy in an applied function fails + + data = DataFrame({'id_field': [100, 100, 200, 300], + 'category': ['a', 'b', 'c', 'c'], + 'value': [1, 2, 3, 4]}) + + def filt1(x): + if x.shape[0] == 1: + return x.copy() + else: + return x[x.category == 'c'] + + def filt2(x): + if x.shape[0] == 1: + return x + else: + return x[x.category == 'c'] + + expected = data.groupby('id_field').apply(filt1) + result = data.groupby('id_field').apply(filt2) + tm.assert_frame_equal(result, expected) + + +def test_apply_corner_cases(): + # #535, can't use sliding iterator + + N = 1000 + labels = np.random.randint(0, 100, size=N) + df = DataFrame({'key': labels, + 'value1': np.random.randn(N), + 'value2': ['foo', 'bar', 'baz', 'qux'] * (N // 4)}) + + grouped = df.groupby('key') + + def f(g): + g['value3'] = g['value1'] * 2 + return g + + result = grouped.apply(f) + assert 'value3' in result + + +def test_apply_numeric_coercion_when_datetime(): + # In the past, group-by/apply operations have been over-eager + # in converting dtypes to numeric, in the presence of datetime + # columns. Various GH issues were filed, the reproductions + # for which are here. 
+
+    # GH 15670
+    df = pd.DataFrame({'Number': [1, 2],
+                       'Date': ["2017-03-02"] * 2,
+                       'Str': ["foo", "inf"]})
+    expected = df.groupby(['Number']).apply(lambda x: x.iloc[0])
+    df.Date = pd.to_datetime(df.Date)
+    result = df.groupby(['Number']).apply(lambda x: x.iloc[0])
+    tm.assert_series_equal(result['Str'], expected['Str'])
+
+    # GH 15421
+    df = pd.DataFrame({'A': [10, 20, 30],
+                       'B': ['foo', '3', '4'],
+                       'T': [pd.Timestamp("12:31:22")] * 3})
+
+    def get_B(g):
+        return g.iloc[0][['B']]
+    result = df.groupby('A').apply(get_B)['B']
+    expected = df.B
+    expected.index = df.A
+    tm.assert_series_equal(result, expected)
+
+    # GH 14423
+    def predictions(tool):
+        out = pd.Series(index=['p1', 'p2', 'useTime'], dtype=object)
+        if 'step1' in list(tool.State):
+            out['p1'] = str(tool[tool.State == 'step1'].Machine.values[0])
+        if 'step2' in list(tool.State):
+            out['p2'] = str(tool[tool.State == 'step2'].Machine.values[0])
+            out['useTime'] = str(
+                tool[tool.State == 'step2'].oTime.values[0])
+        return out
+    df1 = pd.DataFrame({'Key': ['B', 'B', 'A', 'A'],
+                        'State': ['step1', 'step2', 'step1', 'step2'],
+                        'oTime': ['', '2016-09-19 05:24:33',
+                                  '', '2016-09-19 23:59:04'],
+                        'Machine': ['23', '36L', '36R', '36R']})
+    df2 = df1.copy()
+    df2.oTime = pd.to_datetime(df2.oTime)
+    expected = df1.groupby('Key').apply(predictions).p1
+    result = df2.groupby('Key').apply(predictions).p1
+    tm.assert_series_equal(expected, result)
+
+
+def test_time_field_bug():
+    # Test a fix for the following error related to GH issue 11324: when
+    # non-key fields in a group-by dataframe contained time-based fields
+    # that were not returned by the apply function, an exception would be
+    # raised.
+
+    df = pd.DataFrame({'a': 1, 'b': [datetime.now() for nn in range(10)]})
+
+    def func_with_no_date(batch):
+        return pd.Series({'c': 2})
+
+    def func_with_date(batch):
+        return pd.Series({'b': datetime(2015, 1, 1), 'c': 2})
+
+    dfg_no_conversion = df.groupby(by=['a']).apply(func_with_no_date)
+    dfg_no_conversion_expected = pd.DataFrame({'c': 2}, index=[1])
+    dfg_no_conversion_expected.index.name = 'a'
+
+    dfg_conversion = df.groupby(by=['a']).apply(func_with_date)
+    dfg_conversion_expected = pd.DataFrame(
+        {'b': datetime(2015, 1, 1),
+         'c': 2}, index=[1])
+    dfg_conversion_expected.index.name = 'a'
+
+    tm.assert_frame_equal(dfg_no_conversion, dfg_no_conversion_expected)
+    tm.assert_frame_equal(dfg_conversion, dfg_conversion_expected)
+
+
+def test_gb_apply_list_of_unequal_len_arrays():
+
+    # GH1738
+    df = DataFrame({'group1': ['a', 'a', 'a', 'b', 'b', 'b', 'a', 'a', 'a',
+                               'b', 'b', 'b'],
+                    'group2': ['c', 'c', 'd', 'd', 'd', 'e', 'c', 'c', 'd',
+                               'd', 'd', 'e'],
+                    'weight': [1.1, 2, 3, 4, 5, 6, 2, 4, 6, 8, 1, 2],
+                    'value': [7.1, 8, 9, 10, 11, 12, 8, 7, 6, 5, 4, 3]})
+    df = df.set_index(['group1', 'group2'])
+    df_grouped = df.groupby(level=['group1', 'group2'], sort=True)
+
+    def noddy(value, weight):
+        out = np.array(value * weight).repeat(3)
+        return out
+
+    # the kernel function returns arrays of unequal length;
+    # pandas sniffs the first one, sees it's an array and not
+    # a list, assumes the rest are of equal length,
+    # and so tries a vstack
+
+    # don't die
+    df_grouped.apply(lambda x: noddy(x.value, x.weight))
+
+
+def test_groupby_apply_all_none():
+    # Tests to make sure no errors occur if the apply function returns all
+    # None values. Issue 9684. 
+ test_df = DataFrame({'groups': [0, 0, 1, 1], + 'random_vars': [8, 7, 4, 5]}) + + def test_func(x): + pass + + result = test_df.groupby('groups').apply(test_func) + expected = DataFrame() + tm.assert_frame_equal(result, expected) + + +def test_groupby_apply_none_first(): + # GH 12824. Tests if apply returns None first. + test_df1 = DataFrame({'groups': [1, 1, 1, 2], 'vars': [0, 1, 2, 3]}) + test_df2 = DataFrame({'groups': [1, 2, 2, 2], 'vars': [0, 1, 2, 3]}) + + def test_func(x): + if x.shape[0] < 2: + return None + return x.iloc[[0, -1]] + + result1 = test_df1.groupby('groups').apply(test_func) + result2 = test_df2.groupby('groups').apply(test_func) + index1 = MultiIndex.from_arrays([[1, 1], [0, 2]], + names=['groups', None]) + index2 = MultiIndex.from_arrays([[2, 2], [1, 3]], + names=['groups', None]) + expected1 = DataFrame({'groups': [1, 1], 'vars': [0, 2]}, + index=index1) + expected2 = DataFrame({'groups': [2, 2], 'vars': [1, 3]}, + index=index2) + tm.assert_frame_equal(result1, expected1) + tm.assert_frame_equal(result2, expected2) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index bcd0da28b5a34..160b60e69f39d 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -9,710 +9,725 @@ import pandas as pd from pandas import (Index, MultiIndex, CategoricalIndex, - DataFrame, Categorical, Series, Interval) + DataFrame, Categorical, Series, Interval, qcut) from pandas.util.testing import assert_frame_equal, assert_series_equal import pandas.util.testing as tm -from .common import MixIn - - -class TestGroupByCategorical(MixIn): - - def test_groupby(self): - - cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"], - categories=["a", "b", "c", "d"], ordered=True) - data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats}) - - exp_index = CategoricalIndex(list('abcd'), name='b', ordered=True) - expected = DataFrame({'a': [1, 2, 4, np.nan]}, index=exp_index) - result = data.groupby("b").mean() - tm.assert_frame_equal(result, expected) - - raw_cat1 = Categorical(["a", "a", "b", "b"], - categories=["a", "b", "z"], ordered=True) - raw_cat2 = Categorical(["c", "d", "c", "d"], - categories=["c", "d", "y"], ordered=True) - df = DataFrame({"A": raw_cat1, "B": raw_cat2, "values": [1, 2, 3, 4]}) - - # single grouper - gb = df.groupby("A") - exp_idx = CategoricalIndex(['a', 'b', 'z'], name='A', ordered=True) - expected = DataFrame({'values': Series([3, 7, 0], index=exp_idx)}) - result = gb.sum() - tm.assert_frame_equal(result, expected) - - # multiple groupers - gb = df.groupby(['A', 'B']) - exp_index = pd.MultiIndex.from_product( - [Categorical(["a", "b", "z"], ordered=True), - Categorical(["c", "d", "y"], ordered=True)], - names=['A', 'B']) - expected = DataFrame({'values': [1, 2, np.nan, 3, 4, np.nan, - np.nan, np.nan, np.nan]}, - index=exp_index) - result = gb.sum() - tm.assert_frame_equal(result, expected) - - # multiple groupers with a non-cat - df = df.copy() - df['C'] = ['foo', 'bar'] * 2 - gb = df.groupby(['A', 'B', 'C']) - exp_index = pd.MultiIndex.from_product( - [Categorical(["a", "b", "z"], ordered=True), - Categorical(["c", "d", "y"], ordered=True), - ['foo', 'bar']], - names=['A', 'B', 'C']) - expected = DataFrame({'values': Series( - np.nan, index=exp_index)}).sort_index() - expected.iloc[[1, 2, 7, 8], 0] = [1, 2, 3, 4] - result = gb.sum() - tm.assert_frame_equal(result, expected) - - # GH 8623 - x = DataFrame([[1, 'John P. Doe'], [2, 'Jane Dove'], - [1, 'John P. 
Doe']], - columns=['person_id', 'person_name']) - x['person_name'] = Categorical(x.person_name) - - g = x.groupby(['person_id']) - result = g.transform(lambda x: x) - tm.assert_frame_equal(result, x[['person_name']]) - - result = x.drop_duplicates('person_name') - expected = x.iloc[[0, 1]] - tm.assert_frame_equal(result, expected) - - def f(x): - return x.drop_duplicates('person_name').iloc[0] - - result = g.apply(f) - expected = x.iloc[[0, 1]].copy() - expected.index = Index([1, 2], name='person_id') - expected['person_name'] = expected['person_name'].astype('object') - tm.assert_frame_equal(result, expected) - - # GH 9921 - # Monotonic - df = DataFrame({"a": [5, 15, 25]}) - c = pd.cut(df.a, bins=[0, 10, 20, 30, 40]) - - result = df.a.groupby(c).transform(sum) - tm.assert_series_equal(result, df['a']) - - tm.assert_series_equal( - df.a.groupby(c).transform(lambda xs: np.sum(xs)), df['a']) - tm.assert_frame_equal(df.groupby(c).transform(sum), df[['a']]) - tm.assert_frame_equal( - df.groupby(c).transform(lambda xs: np.max(xs)), df[['a']]) - - # Filter - tm.assert_series_equal(df.a.groupby(c).filter(np.all), df['a']) - tm.assert_frame_equal(df.groupby(c).filter(np.all), df) - - # Non-monotonic - df = DataFrame({"a": [5, 15, 25, -5]}) - c = pd.cut(df.a, bins=[-10, 0, 10, 20, 30, 40]) - - result = df.a.groupby(c).transform(sum) - tm.assert_series_equal(result, df['a']) - - tm.assert_series_equal( - df.a.groupby(c).transform(lambda xs: np.sum(xs)), df['a']) - tm.assert_frame_equal(df.groupby(c).transform(sum), df[['a']]) - tm.assert_frame_equal( - df.groupby(c).transform(lambda xs: np.sum(xs)), df[['a']]) - - # GH 9603 - df = DataFrame({'a': [1, 0, 0, 0]}) - c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=Categorical(list('abcd'))) - result = df.groupby(c).apply(len) - - exp_index = CategoricalIndex( - c.values.categories, ordered=c.values.ordered) - expected = Series([1, 0, 0, 0], index=exp_index) - expected.index.name = 'a' - tm.assert_series_equal(result, expected) - - def test_groupby_sort(self): - - # http://stackoverflow.com/questions/23814368/sorting-pandas-categorical-labels-after-groupby - # This should result in a properly sorted Series so that the plot - # has a sorted x axis - # self.cat.groupby(['value_group'])['value_group'].count().plot(kind='bar') - - df = DataFrame({'value': np.random.randint(0, 10000, 100)}) - labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)] - cat_labels = Categorical(labels, labels) - - df = df.sort_values(by=['value'], ascending=True) - df['value_group'] = pd.cut(df.value, range(0, 10500, 500), - right=False, labels=cat_labels) - - res = df.groupby(['value_group'])['value_group'].count() - exp = res[sorted(res.index, key=lambda x: float(x.split()[0]))] - exp.index = CategoricalIndex(exp.index, name=exp.index.name) - tm.assert_series_equal(res, exp) - - def test_level_groupby_get_group(self): - # GH15155 - df = DataFrame(data=np.arange(2, 22, 2), - index=MultiIndex( - levels=[pd.CategoricalIndex(["a", "b"]), range(10)], - labels=[[0] * 5 + [1] * 5, range(10)], - names=["Index1", "Index2"])) - g = df.groupby(level=["Index1"]) - - # expected should equal test.loc[["a"]] - # GH15166 - expected = DataFrame(data=np.arange(2, 12, 2), - index=pd.MultiIndex(levels=[pd.CategoricalIndex( - ["a", "b"]), range(5)], - labels=[[0] * 5, range(5)], - names=["Index1", "Index2"])) - result = g.get_group('a') - assert_frame_equal(result, expected) - - def test_apply_use_categorical_name(self): - from pandas import qcut - cats = qcut(self.df.C, 4) - - def 
get_stats(group): - return {'min': group.min(), - 'max': group.max(), - 'count': group.count(), - 'mean': group.mean()} - - result = self.df.groupby(cats).D.apply(get_stats) - assert result.index.names[0] == 'C' - - def test_apply_categorical_data(self): - # GH 10138 - for ordered in [True, False]: - dense = Categorical(list('abc'), ordered=ordered) - # 'b' is in the categories but not in the list - missing = Categorical( - list('aaa'), categories=['a', 'b'], ordered=ordered) - values = np.arange(len(dense)) - df = DataFrame({'missing': missing, - 'dense': dense, - 'values': values}) - grouped = df.groupby(['missing', 'dense']) - - # missing category 'b' should still exist in the output index - idx = MultiIndex.from_product( - [Categorical(['a', 'b'], ordered=ordered), - Categorical(['a', 'b', 'c'], ordered=ordered)], - names=['missing', 'dense']) - expected = DataFrame([0, 1, 2, np.nan, np.nan, np.nan], - index=idx, - columns=['values']) - - assert_frame_equal(grouped.apply(lambda x: np.mean(x)), expected) - assert_frame_equal(grouped.mean(), expected) - assert_frame_equal(grouped.agg(np.mean), expected) - - # but for transform we should still get back the original index - idx = MultiIndex.from_product([['a'], ['a', 'b', 'c']], - names=['missing', 'dense']) - expected = Series(1, index=idx) - assert_series_equal(grouped.apply(lambda x: 1), expected) - - def test_groupby_categorical(self): - levels = ['foo', 'bar', 'baz', 'qux'] - codes = np.random.randint(0, 4, size=100) - - cats = Categorical.from_codes(codes, levels, ordered=True) - - data = DataFrame(np.random.randn(100, 4)) - - result = data.groupby(cats).mean() - - expected = data.groupby(np.asarray(cats)).mean() - exp_idx = CategoricalIndex(levels, categories=cats.categories, - ordered=True) - expected = expected.reindex(exp_idx) - - assert_frame_equal(result, expected) - - grouped = data.groupby(cats) - desc_result = grouped.describe() - - idx = cats.codes.argsort() - ord_labels = np.asarray(cats).take(idx) - ord_data = data.take(idx) - - exp_cats = Categorical(ord_labels, ordered=True, - categories=['foo', 'bar', 'baz', 'qux']) - expected = ord_data.groupby(exp_cats, sort=False).describe() - assert_frame_equal(desc_result, expected) - - # GH 10460 - expc = Categorical.from_codes(np.arange(4).repeat(8), - levels, ordered=True) - exp = CategoricalIndex(expc) - tm.assert_index_equal((desc_result.stack().index - .get_level_values(0)), exp) - exp = Index(['count', 'mean', 'std', 'min', '25%', '50%', - '75%', 'max'] * 4) - tm.assert_index_equal((desc_result.stack().index - .get_level_values(1)), exp) - - def test_groupby_datetime_categorical(self): - # GH9049: ensure backward compatibility - levels = pd.date_range('2014-01-01', periods=4) - codes = np.random.randint(0, 4, size=100) - - cats = Categorical.from_codes(codes, levels, ordered=True) - - data = DataFrame(np.random.randn(100, 4)) - result = data.groupby(cats).mean() - - expected = data.groupby(np.asarray(cats)).mean() - expected = expected.reindex(levels) - expected.index = CategoricalIndex(expected.index, - categories=expected.index, - ordered=True) - - assert_frame_equal(result, expected) - - grouped = data.groupby(cats) - desc_result = grouped.describe() - - idx = cats.codes.argsort() - ord_labels = cats.take_nd(idx) - ord_data = data.take(idx) - expected = ord_data.groupby(ord_labels).describe() - assert_frame_equal(desc_result, expected) - tm.assert_index_equal(desc_result.index, expected.index) - tm.assert_index_equal( - desc_result.index.get_level_values(0), - 
expected.index.get_level_values(0)) - - # GH 10460 - expc = Categorical.from_codes( - np.arange(4).repeat(8), levels, ordered=True) - exp = CategoricalIndex(expc) - tm.assert_index_equal((desc_result.stack().index - .get_level_values(0)), exp) - exp = Index(['count', 'mean', 'std', 'min', '25%', '50%', - '75%', 'max'] * 4) - tm.assert_index_equal((desc_result.stack().index - .get_level_values(1)), exp) - - def test_groupby_categorical_index(self): - - s = np.random.RandomState(12345) - levels = ['foo', 'bar', 'baz', 'qux'] - codes = s.randint(0, 4, size=20) - cats = Categorical.from_codes(codes, levels, ordered=True) - df = DataFrame( - np.repeat( - np.arange(20), 4).reshape(-1, 4), columns=list('abcd')) - df['cats'] = cats - - # with a cat index - result = df.set_index('cats').groupby(level=0).sum() - expected = df[list('abcd')].groupby(cats.codes).sum() - expected.index = CategoricalIndex( - Categorical.from_codes( - [0, 1, 2, 3], levels, ordered=True), name='cats') - assert_frame_equal(result, expected) - # with a cat column, should produce a cat index - result = df.groupby('cats').sum() - expected = df[list('abcd')].groupby(cats.codes).sum() - expected.index = CategoricalIndex( - Categorical.from_codes( - [0, 1, 2, 3], levels, ordered=True), name='cats') - assert_frame_equal(result, expected) - - def test_groupby_describe_categorical_columns(self): - # GH 11558 - cats = pd.CategoricalIndex(['qux', 'foo', 'baz', 'bar'], - categories=['foo', 'bar', 'baz', 'qux'], - ordered=True) - df = DataFrame(np.random.randn(20, 4), columns=cats) - result = df.groupby([1, 2, 3, 4] * 5).describe() - - tm.assert_index_equal(result.stack().columns, cats) - tm.assert_categorical_equal(result.stack().columns.values, cats.values) - - def test_groupby_unstack_categorical(self): - # GH11558 (example is taken from the original issue) - df = pd.DataFrame({'a': range(10), - 'medium': ['A', 'B'] * 5, - 'artist': list('XYXXY') * 2}) - df['medium'] = df['medium'].astype('category') - - gcat = df.groupby(['artist', 'medium'])['a'].count().unstack() - result = gcat.describe() - - exp_columns = pd.CategoricalIndex(['A', 'B'], ordered=False, - name='medium') - tm.assert_index_equal(result.columns, exp_columns) - tm.assert_categorical_equal(result.columns.values, exp_columns.values) - - result = gcat['A'] + gcat['B'] - expected = pd.Series([6, 4], index=pd.Index(['X', 'Y'], name='artist')) - tm.assert_series_equal(result, expected) - - def test_groupby_bins_unequal_len(self): - # GH3011 - series = Series([np.nan, np.nan, 1, 1, 2, 2, 3, 3, 4, 4]) - bins = pd.cut(series.dropna().values, 4) - - # len(bins) != len(series) here - def f(): - series.groupby(bins).mean() - pytest.raises(ValueError, f) - - def test_groupby_multi_categorical_as_index(self): - # GH13204 - df = DataFrame({'cat': Categorical([1, 2, 2], [1, 2, 3]), - 'A': [10, 11, 11], - 'B': [101, 102, 103]}) - result = df.groupby(['cat', 'A'], as_index=False).sum() - expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), - 'A': [10, 11, 10, 11, 10, 11], - 'B': [101.0, nan, nan, 205.0, nan, nan]}, - columns=['cat', 'A', 'B']) - tm.assert_frame_equal(result, expected) - - # function grouper - f = lambda r: df.loc[r, 'A'] - result = df.groupby(['cat', f], as_index=False).sum() - expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), - 'A': [10.0, nan, nan, 22.0, nan, nan], - 'B': [101.0, nan, nan, 205.0, nan, nan]}, - columns=['cat', 'A', 'B']) - tm.assert_frame_equal(result, expected) - - # another not in-axis grouper - s = Series(['a', 'b', 'b'], 
name='cat2') - result = df.groupby(['cat', s], as_index=False).sum() - expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), - 'A': [10.0, nan, nan, 22.0, nan, nan], - 'B': [101.0, nan, nan, 205.0, nan, nan]}, - columns=['cat', 'A', 'B']) - tm.assert_frame_equal(result, expected) - - # GH18872: conflicting names in desired index - pytest.raises(ValueError, lambda: df.groupby(['cat', - s.rename('cat')]).sum()) - - # is original index dropped? - expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), - 'A': [10, 11, 10, 11, 10, 11], - 'B': [101.0, nan, nan, 205.0, nan, nan]}, - columns=['cat', 'A', 'B']) - - group_columns = ['cat', 'A'] - - for name in [None, 'X', 'B', 'cat']: - df.index = Index(list("abc"), name=name) - - if name in group_columns and name in df.index.names: - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result = df.groupby(group_columns, as_index=False).sum() - - else: +def test_groupby(): + + cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"], + categories=["a", "b", "c", "d"], ordered=True) + data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats}) + + exp_index = CategoricalIndex(list('abcd'), name='b', ordered=True) + expected = DataFrame({'a': [1, 2, 4, np.nan]}, index=exp_index) + result = data.groupby("b").mean() + tm.assert_frame_equal(result, expected) + + raw_cat1 = Categorical(["a", "a", "b", "b"], + categories=["a", "b", "z"], ordered=True) + raw_cat2 = Categorical(["c", "d", "c", "d"], + categories=["c", "d", "y"], ordered=True) + df = DataFrame({"A": raw_cat1, "B": raw_cat2, "values": [1, 2, 3, 4]}) + + # single grouper + gb = df.groupby("A") + exp_idx = CategoricalIndex(['a', 'b', 'z'], name='A', ordered=True) + expected = DataFrame({'values': Series([3, 7, 0], index=exp_idx)}) + result = gb.sum() + tm.assert_frame_equal(result, expected) + + # multiple groupers + gb = df.groupby(['A', 'B']) + exp_index = pd.MultiIndex.from_product( + [Categorical(["a", "b", "z"], ordered=True), + Categorical(["c", "d", "y"], ordered=True)], + names=['A', 'B']) + expected = DataFrame({'values': [1, 2, np.nan, 3, 4, np.nan, + np.nan, np.nan, np.nan]}, + index=exp_index) + result = gb.sum() + tm.assert_frame_equal(result, expected) + + # multiple groupers with a non-cat + df = df.copy() + df['C'] = ['foo', 'bar'] * 2 + gb = df.groupby(['A', 'B', 'C']) + exp_index = pd.MultiIndex.from_product( + [Categorical(["a", "b", "z"], ordered=True), + Categorical(["c", "d", "y"], ordered=True), + ['foo', 'bar']], + names=['A', 'B', 'C']) + expected = DataFrame({'values': Series( + np.nan, index=exp_index)}).sort_index() + expected.iloc[[1, 2, 7, 8], 0] = [1, 2, 3, 4] + result = gb.sum() + tm.assert_frame_equal(result, expected) + + # GH 8623 + x = DataFrame([[1, 'John P. Doe'], [2, 'Jane Dove'], + [1, 'John P. 
Doe']], + columns=['person_id', 'person_name']) + x['person_name'] = Categorical(x.person_name) + + g = x.groupby(['person_id']) + result = g.transform(lambda x: x) + tm.assert_frame_equal(result, x[['person_name']]) + + result = x.drop_duplicates('person_name') + expected = x.iloc[[0, 1]] + tm.assert_frame_equal(result, expected) + + def f(x): + return x.drop_duplicates('person_name').iloc[0] + + result = g.apply(f) + expected = x.iloc[[0, 1]].copy() + expected.index = Index([1, 2], name='person_id') + expected['person_name'] = expected['person_name'].astype('object') + tm.assert_frame_equal(result, expected) + + # GH 9921 + # Monotonic + df = DataFrame({"a": [5, 15, 25]}) + c = pd.cut(df.a, bins=[0, 10, 20, 30, 40]) + + result = df.a.groupby(c).transform(sum) + tm.assert_series_equal(result, df['a']) + + tm.assert_series_equal( + df.a.groupby(c).transform(lambda xs: np.sum(xs)), df['a']) + tm.assert_frame_equal(df.groupby(c).transform(sum), df[['a']]) + tm.assert_frame_equal( + df.groupby(c).transform(lambda xs: np.max(xs)), df[['a']]) + + # Filter + tm.assert_series_equal(df.a.groupby(c).filter(np.all), df['a']) + tm.assert_frame_equal(df.groupby(c).filter(np.all), df) + + # Non-monotonic + df = DataFrame({"a": [5, 15, 25, -5]}) + c = pd.cut(df.a, bins=[-10, 0, 10, 20, 30, 40]) + + result = df.a.groupby(c).transform(sum) + tm.assert_series_equal(result, df['a']) + + tm.assert_series_equal( + df.a.groupby(c).transform(lambda xs: np.sum(xs)), df['a']) + tm.assert_frame_equal(df.groupby(c).transform(sum), df[['a']]) + tm.assert_frame_equal( + df.groupby(c).transform(lambda xs: np.sum(xs)), df[['a']]) + + # GH 9603 + df = DataFrame({'a': [1, 0, 0, 0]}) + c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=Categorical(list('abcd'))) + result = df.groupby(c).apply(len) + + exp_index = CategoricalIndex( + c.values.categories, ordered=c.values.ordered) + expected = Series([1, 0, 0, 0], index=exp_index) + expected.index.name = 'a' + tm.assert_series_equal(result, expected) + + +def test_groupby_sort(): + + # http://stackoverflow.com/questions/23814368/sorting-pandas-categorical-labels-after-groupby + # This should result in a properly sorted Series so that the plot + # has a sorted x axis + # self.cat.groupby(['value_group'])['value_group'].count().plot(kind='bar') + + df = DataFrame({'value': np.random.randint(0, 10000, 100)}) + labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)] + cat_labels = Categorical(labels, labels) + + df = df.sort_values(by=['value'], ascending=True) + df['value_group'] = pd.cut(df.value, range(0, 10500, 500), + right=False, labels=cat_labels) + + res = df.groupby(['value_group'])['value_group'].count() + exp = res[sorted(res.index, key=lambda x: float(x.split()[0]))] + exp.index = CategoricalIndex(exp.index, name=exp.index.name) + tm.assert_series_equal(res, exp) + + +def test_level_groupby_get_group(): + # GH15155 + df = DataFrame(data=np.arange(2, 22, 2), + index=MultiIndex( + levels=[pd.CategoricalIndex(["a", "b"]), range(10)], + labels=[[0] * 5 + [1] * 5, range(10)], + names=["Index1", "Index2"])) + g = df.groupby(level=["Index1"]) + + # expected should equal test.loc[["a"]] + # GH15166 + expected = DataFrame(data=np.arange(2, 12, 2), + index=pd.MultiIndex(levels=[pd.CategoricalIndex( + ["a", "b"]), range(5)], + labels=[[0] * 5, range(5)], + names=["Index1", "Index2"])) + result = g.get_group('a') + + assert_frame_equal(result, expected) + + +def test_apply_use_categorical_name(df): + cats = qcut(df.C, 4) + + def get_stats(group): + return {'min': 
group.min(), + 'max': group.max(), + 'count': group.count(), + 'mean': group.mean()} + + result = df.groupby(cats).D.apply(get_stats) + assert result.index.names[0] == 'C' + + +def test_apply_categorical_data(): + # GH 10138 + for ordered in [True, False]: + dense = Categorical(list('abc'), ordered=ordered) + # 'b' is in the categories but not in the list + missing = Categorical( + list('aaa'), categories=['a', 'b'], ordered=ordered) + values = np.arange(len(dense)) + df = DataFrame({'missing': missing, + 'dense': dense, + 'values': values}) + grouped = df.groupby(['missing', 'dense']) + + # missing category 'b' should still exist in the output index + idx = MultiIndex.from_product( + [Categorical(['a', 'b'], ordered=ordered), + Categorical(['a', 'b', 'c'], ordered=ordered)], + names=['missing', 'dense']) + expected = DataFrame([0, 1, 2, np.nan, np.nan, np.nan], + index=idx, + columns=['values']) + + assert_frame_equal(grouped.apply(lambda x: np.mean(x)), expected) + assert_frame_equal(grouped.mean(), expected) + assert_frame_equal(grouped.agg(np.mean), expected) + + # but for transform we should still get back the original index + idx = MultiIndex.from_product([['a'], ['a', 'b', 'c']], + names=['missing', 'dense']) + expected = Series(1, index=idx) + assert_series_equal(grouped.apply(lambda x: 1), expected) + + +def test_groupby_categorical(): + levels = ['foo', 'bar', 'baz', 'qux'] + codes = np.random.randint(0, 4, size=100) + + cats = Categorical.from_codes(codes, levels, ordered=True) + + data = DataFrame(np.random.randn(100, 4)) + + result = data.groupby(cats).mean() + + expected = data.groupby(np.asarray(cats)).mean() + exp_idx = CategoricalIndex(levels, categories=cats.categories, + ordered=True) + expected = expected.reindex(exp_idx) + + assert_frame_equal(result, expected) + + grouped = data.groupby(cats) + desc_result = grouped.describe() + + idx = cats.codes.argsort() + ord_labels = np.asarray(cats).take(idx) + ord_data = data.take(idx) + + exp_cats = Categorical(ord_labels, ordered=True, + categories=['foo', 'bar', 'baz', 'qux']) + expected = ord_data.groupby(exp_cats, sort=False).describe() + assert_frame_equal(desc_result, expected) + + # GH 10460 + expc = Categorical.from_codes(np.arange(4).repeat(8), + levels, ordered=True) + exp = CategoricalIndex(expc) + tm.assert_index_equal((desc_result.stack().index + .get_level_values(0)), exp) + exp = Index(['count', 'mean', 'std', 'min', '25%', '50%', + '75%', 'max'] * 4) + tm.assert_index_equal((desc_result.stack().index + .get_level_values(1)), exp) + + +def test_groupby_datetime_categorical(): + # GH9049: ensure backward compatibility + levels = pd.date_range('2014-01-01', periods=4) + codes = np.random.randint(0, 4, size=100) + + cats = Categorical.from_codes(codes, levels, ordered=True) + + data = DataFrame(np.random.randn(100, 4)) + result = data.groupby(cats).mean() + + expected = data.groupby(np.asarray(cats)).mean() + expected = expected.reindex(levels) + expected.index = CategoricalIndex(expected.index, + categories=expected.index, + ordered=True) + + assert_frame_equal(result, expected) + + grouped = data.groupby(cats) + desc_result = grouped.describe() + + idx = cats.codes.argsort() + ord_labels = cats.take_nd(idx) + ord_data = data.take(idx) + expected = ord_data.groupby(ord_labels).describe() + assert_frame_equal(desc_result, expected) + tm.assert_index_equal(desc_result.index, expected.index) + tm.assert_index_equal( + desc_result.index.get_level_values(0), + expected.index.get_level_values(0)) + + # GH 10460 + expc = 
Categorical.from_codes( + np.arange(4).repeat(8), levels, ordered=True) + exp = CategoricalIndex(expc) + tm.assert_index_equal((desc_result.stack().index + .get_level_values(0)), exp) + exp = Index(['count', 'mean', 'std', 'min', '25%', '50%', + '75%', 'max'] * 4) + tm.assert_index_equal((desc_result.stack().index + .get_level_values(1)), exp) + + +def test_groupby_categorical_index(): + + s = np.random.RandomState(12345) + levels = ['foo', 'bar', 'baz', 'qux'] + codes = s.randint(0, 4, size=20) + cats = Categorical.from_codes(codes, levels, ordered=True) + df = DataFrame( + np.repeat( + np.arange(20), 4).reshape(-1, 4), columns=list('abcd')) + df['cats'] = cats + + # with a cat index + result = df.set_index('cats').groupby(level=0).sum() + expected = df[list('abcd')].groupby(cats.codes).sum() + expected.index = CategoricalIndex( + Categorical.from_codes( + [0, 1, 2, 3], levels, ordered=True), name='cats') + assert_frame_equal(result, expected) + + # with a cat column, should produce a cat index + result = df.groupby('cats').sum() + expected = df[list('abcd')].groupby(cats.codes).sum() + expected.index = CategoricalIndex( + Categorical.from_codes( + [0, 1, 2, 3], levels, ordered=True), name='cats') + assert_frame_equal(result, expected) + + +def test_groupby_describe_categorical_columns(): + # GH 11558 + cats = pd.CategoricalIndex(['qux', 'foo', 'baz', 'bar'], + categories=['foo', 'bar', 'baz', 'qux'], + ordered=True) + df = DataFrame(np.random.randn(20, 4), columns=cats) + result = df.groupby([1, 2, 3, 4] * 5).describe() + + tm.assert_index_equal(result.stack().columns, cats) + tm.assert_categorical_equal(result.stack().columns.values, cats.values) + + +def test_groupby_unstack_categorical(): + # GH11558 (example is taken from the original issue) + df = pd.DataFrame({'a': range(10), + 'medium': ['A', 'B'] * 5, + 'artist': list('XYXXY') * 2}) + df['medium'] = df['medium'].astype('category') + + gcat = df.groupby(['artist', 'medium'])['a'].count().unstack() + result = gcat.describe() + + exp_columns = pd.CategoricalIndex(['A', 'B'], ordered=False, + name='medium') + tm.assert_index_equal(result.columns, exp_columns) + tm.assert_categorical_equal(result.columns.values, exp_columns.values) + + result = gcat['A'] + gcat['B'] + expected = pd.Series([6, 4], index=pd.Index(['X', 'Y'], name='artist')) + tm.assert_series_equal(result, expected) + + +def test_groupby_bins_unequal_len(): + # GH3011 + series = Series([np.nan, np.nan, 1, 1, 2, 2, 3, 3, 4, 4]) + bins = pd.cut(series.dropna().values, 4) + + # len(bins) != len(series) here + def f(): + series.groupby(bins).mean() + pytest.raises(ValueError, f) + + +def test_groupby_multi_categorical_as_index(): + # GH13204 + df = DataFrame({'cat': Categorical([1, 2, 2], [1, 2, 3]), + 'A': [10, 11, 11], + 'B': [101, 102, 103]}) + result = df.groupby(['cat', 'A'], as_index=False).sum() + expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), + 'A': [10, 11, 10, 11, 10, 11], + 'B': [101.0, nan, nan, 205.0, nan, nan]}, + columns=['cat', 'A', 'B']) + tm.assert_frame_equal(result, expected) + + # function grouper + f = lambda r: df.loc[r, 'A'] + result = df.groupby(['cat', f], as_index=False).sum() + expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), + 'A': [10.0, nan, nan, 22.0, nan, nan], + 'B': [101.0, nan, nan, 205.0, nan, nan]}, + columns=['cat', 'A', 'B']) + tm.assert_frame_equal(result, expected) + + # another not in-axis grouper + s = Series(['a', 'b', 'b'], name='cat2') + result = df.groupby(['cat', s], as_index=False).sum() + 
expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), + 'A': [10.0, nan, nan, 22.0, nan, nan], + 'B': [101.0, nan, nan, 205.0, nan, nan]}, + columns=['cat', 'A', 'B']) + tm.assert_frame_equal(result, expected) + + # GH18872: conflicting names in desired index + pytest.raises(ValueError, lambda: df.groupby(['cat', + s.rename('cat')]).sum()) + + # is original index dropped? + expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), + 'A': [10, 11, 10, 11, 10, 11], + 'B': [101.0, nan, nan, 205.0, nan, nan]}, + columns=['cat', 'A', 'B']) + + group_columns = ['cat', 'A'] + + for name in [None, 'X', 'B', 'cat']: + df.index = Index(list("abc"), name=name) + + if name in group_columns and name in df.index.names: + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): result = df.groupby(group_columns, as_index=False).sum() - tm.assert_frame_equal(result, expected, check_index_type=True) - - def test_groupby_preserve_categories(self): - # GH-13179 - categories = list('abc') - - # ordered=True - df = DataFrame({'A': pd.Categorical(list('ba'), - categories=categories, + else: + result = df.groupby(group_columns, as_index=False).sum() + + tm.assert_frame_equal(result, expected, check_index_type=True) + + +def test_groupby_preserve_categories(): + # GH-13179 + categories = list('abc') + + # ordered=True + df = DataFrame({'A': pd.Categorical(list('ba'), + categories=categories, + ordered=True)}) + index = pd.CategoricalIndex(categories, categories, ordered=True) + tm.assert_index_equal(df.groupby('A', sort=True).first().index, index) + tm.assert_index_equal(df.groupby('A', sort=False).first().index, index) + + # ordered=False + df = DataFrame({'A': pd.Categorical(list('ba'), + categories=categories, + ordered=False)}) + sort_index = pd.CategoricalIndex(categories, categories, ordered=False) + nosort_index = pd.CategoricalIndex(list('bac'), list('bac'), + ordered=False) + tm.assert_index_equal(df.groupby('A', sort=True).first().index, + sort_index) + tm.assert_index_equal(df.groupby('A', sort=False).first().index, + nosort_index) + + +def test_groupby_preserve_categorical_dtype(): + # GH13743, GH13854 + df = DataFrame({'A': [1, 2, 1, 1, 2], + 'B': [10, 16, 22, 28, 34], + 'C1': Categorical(list("abaab"), + categories=list("bac"), + ordered=False), + 'C2': Categorical(list("abaab"), + categories=list("bac"), + ordered=True)}) + # single grouper + exp_full = DataFrame({'A': [2.0, 1.0, np.nan], + 'B': [25.0, 20.0, np.nan], + 'C1': Categorical(list("bac"), + categories=list("bac"), + ordered=False), + 'C2': Categorical(list("bac"), + categories=list("bac"), ordered=True)}) - index = pd.CategoricalIndex(categories, categories, ordered=True) - tm.assert_index_equal(df.groupby('A', sort=True).first().index, index) - tm.assert_index_equal(df.groupby('A', sort=False).first().index, index) - - # ordered=False - df = DataFrame({'A': pd.Categorical(list('ba'), - categories=categories, - ordered=False)}) - sort_index = pd.CategoricalIndex(categories, categories, ordered=False) - nosort_index = pd.CategoricalIndex(list('bac'), list('bac'), - ordered=False) - tm.assert_index_equal(df.groupby('A', sort=True).first().index, - sort_index) - tm.assert_index_equal(df.groupby('A', sort=False).first().index, - nosort_index) - - def test_groupby_preserve_categorical_dtype(self): - # GH13743, GH13854 - df = DataFrame({'A': [1, 2, 1, 1, 2], - 'B': [10, 16, 22, 28, 34], - 'C1': Categorical(list("abaab"), - categories=list("bac"), - ordered=False), - 'C2': Categorical(list("abaab"), - 
categories=list("bac"), - ordered=True)}) - # single grouper - exp_full = DataFrame({'A': [2.0, 1.0, np.nan], - 'B': [25.0, 20.0, np.nan], - 'C1': Categorical(list("bac"), - categories=list("bac"), - ordered=False), - 'C2': Categorical(list("bac"), - categories=list("bac"), - ordered=True)}) - for col in ['C1', 'C2']: - result1 = df.groupby(by=col, as_index=False).mean() - result2 = df.groupby(by=col, as_index=True).mean().reset_index() - expected = exp_full.reindex(columns=result1.columns) - tm.assert_frame_equal(result1, expected) - tm.assert_frame_equal(result2, expected) - - # multiple grouper - exp_full = DataFrame({'A': [1, 1, 1, 2, 2, 2], - 'B': [np.nan, 20.0, np.nan, 25.0, np.nan, - np.nan], - 'C1': Categorical(list("bacbac"), - categories=list("bac"), - ordered=False), - 'C2': Categorical(list("bacbac"), - categories=list("bac"), - ordered=True)}) - for cols in [['A', 'C1'], ['A', 'C2']]: - result1 = df.groupby(by=cols, as_index=False).mean() - result2 = df.groupby(by=cols, as_index=True).mean().reset_index() - expected = exp_full.reindex(columns=result1.columns) - tm.assert_frame_equal(result1, expected) - tm.assert_frame_equal(result2, expected) - - def test_groupby_categorical_no_compress(self): - data = Series(np.random.randn(9)) - - codes = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]) - cats = Categorical.from_codes(codes, [0, 1, 2], ordered=True) - - result = data.groupby(cats).mean() - exp = data.groupby(codes).mean() - - exp.index = CategoricalIndex(exp.index, categories=cats.categories, - ordered=cats.ordered) - assert_series_equal(result, exp) - - codes = np.array([0, 0, 0, 1, 1, 1, 3, 3, 3]) - cats = Categorical.from_codes(codes, [0, 1, 2, 3], ordered=True) - - result = data.groupby(cats).mean() - exp = data.groupby(codes).mean().reindex(cats.categories) - exp.index = CategoricalIndex(exp.index, categories=cats.categories, - ordered=cats.ordered) - assert_series_equal(result, exp) - - cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"], - categories=["a", "b", "c", "d"], ordered=True) - data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats}) - - result = data.groupby("b").mean() - result = result["a"].values - exp = np.array([1, 2, 4, np.nan]) - tm.assert_numpy_array_equal(result, exp) - - def test_groupby_sort_categorical(self): - # dataframe groupby sort was being ignored # GH 8868 - df = DataFrame([['(7.5, 10]', 10, 10], - ['(7.5, 10]', 8, 20], - ['(2.5, 5]', 5, 30], - ['(5, 7.5]', 6, 40], - ['(2.5, 5]', 4, 50], - ['(0, 2.5]', 1, 60], - ['(5, 7.5]', 7, 70]], columns=['range', 'foo', 'bar']) - df['range'] = Categorical(df['range'], ordered=True) - index = CategoricalIndex(['(0, 2.5]', '(2.5, 5]', '(5, 7.5]', - '(7.5, 10]'], name='range', ordered=True) - result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]], - columns=['foo', 'bar'], index=index) - - col = 'range' - assert_frame_equal(result_sort, df.groupby(col, sort=True).first()) - # when categories is ordered, group is ordered by category's order - assert_frame_equal(result_sort, df.groupby(col, sort=False).first()) - - df['range'] = Categorical(df['range'], ordered=False) - index = CategoricalIndex(['(0, 2.5]', '(2.5, 5]', '(5, 7.5]', - '(7.5, 10]'], name='range') - result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]], - columns=['foo', 'bar'], index=index) - - index = CategoricalIndex(['(7.5, 10]', '(2.5, 5]', '(5, 7.5]', - '(0, 2.5]'], - categories=['(7.5, 10]', '(2.5, 5]', - '(5, 7.5]', '(0, 2.5]'], - name='range') - result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]], 
- index=index, columns=['foo', 'bar']) - - col = 'range' - # this is an unordered categorical, but we allow this #### - assert_frame_equal(result_sort, df.groupby(col, sort=True).first()) - assert_frame_equal(result_nosort, df.groupby(col, sort=False).first()) - - def test_groupby_sort_categorical_datetimelike(self): - # GH10505 - - # use same data as test_groupby_sort_categorical, which category is - # corresponding to datetime.month - df = DataFrame({'dt': [datetime(2011, 7, 1), datetime(2011, 7, 1), - datetime(2011, 2, 1), datetime(2011, 5, 1), - datetime(2011, 2, 1), datetime(2011, 1, 1), - datetime(2011, 5, 1)], - 'foo': [10, 8, 5, 6, 4, 1, 7], - 'bar': [10, 20, 30, 40, 50, 60, 70]}, - columns=['dt', 'foo', 'bar']) - - # ordered=True - df['dt'] = Categorical(df['dt'], ordered=True) - index = [datetime(2011, 1, 1), datetime(2011, 2, 1), - datetime(2011, 5, 1), datetime(2011, 7, 1)] - result_sort = DataFrame( - [[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar']) - result_sort.index = CategoricalIndex(index, name='dt', ordered=True) - - index = [datetime(2011, 7, 1), datetime(2011, 2, 1), - datetime(2011, 5, 1), datetime(2011, 1, 1)] - result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]], - columns=['foo', 'bar']) - result_nosort.index = CategoricalIndex(index, categories=index, - name='dt', ordered=True) - - col = 'dt' - assert_frame_equal(result_sort, df.groupby(col, sort=True).first()) - # when categories is ordered, group is ordered by category's order - assert_frame_equal(result_sort, df.groupby(col, sort=False).first()) - - # ordered = False - df['dt'] = Categorical(df['dt'], ordered=False) - index = [datetime(2011, 1, 1), datetime(2011, 2, 1), - datetime(2011, 5, 1), datetime(2011, 7, 1)] - result_sort = DataFrame( - [[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar']) - result_sort.index = CategoricalIndex(index, name='dt') - - index = [datetime(2011, 7, 1), datetime(2011, 2, 1), - datetime(2011, 5, 1), datetime(2011, 1, 1)] - result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]], - columns=['foo', 'bar']) - result_nosort.index = CategoricalIndex(index, categories=index, - name='dt') - - col = 'dt' - assert_frame_equal(result_sort, df.groupby(col, sort=True).first()) - assert_frame_equal(result_nosort, df.groupby(col, sort=False).first()) - - def test_groupby_categorical_two_columns(self): - - # https://github.com/pandas-dev/pandas/issues/8138 - d = {'cat': - pd.Categorical(["a", "b", "a", "b"], categories=["a", "b", "c"], - ordered=True), - 'ints': [1, 1, 2, 2], - 'val': [10, 20, 30, 40]} - test = pd.DataFrame(d) - - # Grouping on a single column - groups_single_key = test.groupby("cat") - res = groups_single_key.agg('mean') - - exp_index = pd.CategoricalIndex(["a", "b", "c"], name="cat", - ordered=True) - exp = DataFrame({"ints": [1.5, 1.5, np.nan], "val": [20, 30, np.nan]}, - index=exp_index) - tm.assert_frame_equal(res, exp) - - # Grouping on two columns - groups_double_key = test.groupby(["cat", "ints"]) - res = groups_double_key.agg('mean') - exp = DataFrame({"val": [10, 30, 20, 40, np.nan, np.nan], - "cat": pd.Categorical(["a", "a", "b", "b", "c", "c"], - ordered=True), - "ints": [1, 2, 1, 2, 1, 2]}).set_index(["cat", "ints" - ]) - tm.assert_frame_equal(res, exp) - - # GH 10132 - for key in [('a', 1), ('b', 2), ('b', 1), ('a', 2)]: - c, i = key - result = groups_double_key.get_group(key) - expected = test[(test.cat == c) & (test.ints == i)] - assert_frame_equal(result, expected) - - d = {'C1': [3, 3, 4, 5], 'C2': [1, 2, 3, 4], 
'C3': [10, 100, 200, 34]} - test = pd.DataFrame(d) - values = pd.cut(test['C1'], [1, 2, 3, 6]) - values.name = "cat" - groups_double_key = test.groupby([values, 'C2']) - - res = groups_double_key.agg('mean') - nan = np.nan - idx = MultiIndex.from_product( - [Categorical([Interval(1, 2), Interval(2, 3), - Interval(3, 6)], ordered=True), - [1, 2, 3, 4]], - names=["cat", "C2"]) - exp = DataFrame({"C1": [nan, nan, nan, nan, 3, 3, - nan, nan, nan, nan, 4, 5], - "C3": [nan, nan, nan, nan, 10, 100, - nan, nan, nan, nan, 200, 34]}, index=idx) - tm.assert_frame_equal(res, exp) - - def test_empty_sum(self): - # https://github.com/pandas-dev/pandas/issues/18678 - df = pd.DataFrame({"A": pd.Categorical(['a', 'a', 'b'], - categories=['a', 'b', 'c']), - 'B': [1, 2, 1]}) - expected_idx = pd.CategoricalIndex(['a', 'b', 'c'], name='A') - - # 0 by default - result = df.groupby("A").B.sum() - expected = pd.Series([3, 1, 0], expected_idx, name='B') - tm.assert_series_equal(result, expected) - - # min_count=0 - result = df.groupby("A").B.sum(min_count=0) - expected = pd.Series([3, 1, 0], expected_idx, name='B') - tm.assert_series_equal(result, expected) - - # min_count=1 - result = df.groupby("A").B.sum(min_count=1) - expected = pd.Series([3, 1, np.nan], expected_idx, name='B') - tm.assert_series_equal(result, expected) - - # min_count>1 - result = df.groupby("A").B.sum(min_count=2) - expected = pd.Series([3, np.nan, np.nan], expected_idx, name='B') - tm.assert_series_equal(result, expected) - - def test_empty_prod(self): - # https://github.com/pandas-dev/pandas/issues/18678 - df = pd.DataFrame({"A": pd.Categorical(['a', 'a', 'b'], - categories=['a', 'b', 'c']), - 'B': [1, 2, 1]}) - - expected_idx = pd.CategoricalIndex(['a', 'b', 'c'], name='A') - - # 1 by default - result = df.groupby("A").B.prod() - expected = pd.Series([2, 1, 1], expected_idx, name='B') - tm.assert_series_equal(result, expected) - - # min_count=0 - result = df.groupby("A").B.prod(min_count=0) - expected = pd.Series([2, 1, 1], expected_idx, name='B') - tm.assert_series_equal(result, expected) - - # min_count=1 - result = df.groupby("A").B.prod(min_count=1) - expected = pd.Series([2, 1, np.nan], expected_idx, name='B') - tm.assert_series_equal(result, expected) + for col in ['C1', 'C2']: + result1 = df.groupby(by=col, as_index=False).mean() + result2 = df.groupby(by=col, as_index=True).mean().reset_index() + expected = exp_full.reindex(columns=result1.columns) + tm.assert_frame_equal(result1, expected) + tm.assert_frame_equal(result2, expected) + + # multiple grouper + exp_full = DataFrame({'A': [1, 1, 1, 2, 2, 2], + 'B': [np.nan, 20.0, np.nan, 25.0, np.nan, + np.nan], + 'C1': Categorical(list("bacbac"), + categories=list("bac"), + ordered=False), + 'C2': Categorical(list("bacbac"), + categories=list("bac"), + ordered=True)}) + for cols in [['A', 'C1'], ['A', 'C2']]: + result1 = df.groupby(by=cols, as_index=False).mean() + result2 = df.groupby(by=cols, as_index=True).mean().reset_index() + expected = exp_full.reindex(columns=result1.columns) + tm.assert_frame_equal(result1, expected) + tm.assert_frame_equal(result2, expected) + + +def test_groupby_categorical_no_compress(): + data = Series(np.random.randn(9)) + + codes = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]) + cats = Categorical.from_codes(codes, [0, 1, 2], ordered=True) + + result = data.groupby(cats).mean() + exp = data.groupby(codes).mean() + + exp.index = CategoricalIndex(exp.index, categories=cats.categories, + ordered=cats.ordered) + assert_series_equal(result, exp) + + codes = 
np.array([0, 0, 0, 1, 1, 1, 3, 3, 3])
+    cats = Categorical.from_codes(codes, [0, 1, 2, 3], ordered=True)
+
+    result = data.groupby(cats).mean()
+    exp = data.groupby(codes).mean().reindex(cats.categories)
+    exp.index = CategoricalIndex(exp.index, categories=cats.categories,
+                                 ordered=cats.ordered)
+    assert_series_equal(result, exp)
+
+    cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"],
+                       categories=["a", "b", "c", "d"], ordered=True)
+    data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats})
+
+    result = data.groupby("b").mean()
+    result = result["a"].values
+    exp = np.array([1, 2, 4, np.nan])
+    tm.assert_numpy_array_equal(result, exp)
+
+
+def test_groupby_sort_categorical():
+    # dataframe groupby sort was being ignored # GH 8868
+    df = DataFrame([['(7.5, 10]', 10, 10],
+                    ['(7.5, 10]', 8, 20],
+                    ['(2.5, 5]', 5, 30],
+                    ['(5, 7.5]', 6, 40],
+                    ['(2.5, 5]', 4, 50],
+                    ['(0, 2.5]', 1, 60],
+                    ['(5, 7.5]', 7, 70]], columns=['range', 'foo', 'bar'])
+    df['range'] = Categorical(df['range'], ordered=True)
+    index = CategoricalIndex(['(0, 2.5]', '(2.5, 5]', '(5, 7.5]',
+                              '(7.5, 10]'], name='range', ordered=True)
+    result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]],
+                            columns=['foo', 'bar'], index=index)
+
+    col = 'range'
+    assert_frame_equal(result_sort, df.groupby(col, sort=True).first())
+    # when categories is ordered, group is ordered by category's order
+    assert_frame_equal(result_sort, df.groupby(col, sort=False).first())
+
+    df['range'] = Categorical(df['range'], ordered=False)
+    index = CategoricalIndex(['(0, 2.5]', '(2.5, 5]', '(5, 7.5]',
+                              '(7.5, 10]'], name='range')
+    result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]],
+                            columns=['foo', 'bar'], index=index)
+
+    index = CategoricalIndex(['(7.5, 10]', '(2.5, 5]', '(5, 7.5]',
+                              '(0, 2.5]'],
+                             categories=['(7.5, 10]', '(2.5, 5]',
+                                         '(5, 7.5]', '(0, 2.5]'],
+                             name='range')
+    result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]],
+                              index=index, columns=['foo', 'bar'])
+
+    col = 'range'
+    # this is an unordered categorical, but we allow this
+    assert_frame_equal(result_sort, df.groupby(col, sort=True).first())
+    assert_frame_equal(result_nosort, df.groupby(col, sort=False).first())
+
+
+def test_groupby_sort_categorical_datetimelike():
+    # GH10505
+
+    # use the same data as test_groupby_sort_categorical; the categories
+    # correspond to datetime.month
+    df = DataFrame({'dt': [datetime(2011, 7, 1), datetime(2011, 7, 1),
+                           datetime(2011, 2, 1), datetime(2011, 5, 1),
+                           datetime(2011, 2, 1), datetime(2011, 1, 1),
+                           datetime(2011, 5, 1)],
+                    'foo': [10, 8, 5, 6, 4, 1, 7],
+                    'bar': [10, 20, 30, 40, 50, 60, 70]},
+                   columns=['dt', 'foo', 'bar'])
+
+    # ordered=True
+    df['dt'] = Categorical(df['dt'], ordered=True)
+    index = [datetime(2011, 1, 1), datetime(2011, 2, 1),
+             datetime(2011, 5, 1), datetime(2011, 7, 1)]
+    result_sort = DataFrame(
+        [[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar'])
+    result_sort.index = CategoricalIndex(index, name='dt', ordered=True)
+
+    index = [datetime(2011, 7, 1), datetime(2011, 2, 1),
+             datetime(2011, 5, 1), datetime(2011, 1, 1)]
+    result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]],
+                              columns=['foo', 'bar'])
+    result_nosort.index = CategoricalIndex(index, categories=index,
+                                           name='dt', ordered=True)
+
+    col = 'dt'
+    assert_frame_equal(result_sort, df.groupby(col, sort=True).first())
+    # when categories is ordered, group is ordered by category's order
+    assert_frame_equal(result_sort, df.groupby(col, sort=False).first())
+
+    # ordered = False
+    
df['dt'] = Categorical(df['dt'], ordered=False) + index = [datetime(2011, 1, 1), datetime(2011, 2, 1), + datetime(2011, 5, 1), datetime(2011, 7, 1)] + result_sort = DataFrame( + [[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar']) + result_sort.index = CategoricalIndex(index, name='dt') + + index = [datetime(2011, 7, 1), datetime(2011, 2, 1), + datetime(2011, 5, 1), datetime(2011, 1, 1)] + result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]], + columns=['foo', 'bar']) + result_nosort.index = CategoricalIndex(index, categories=index, + name='dt') + + col = 'dt' + assert_frame_equal(result_sort, df.groupby(col, sort=True).first()) + assert_frame_equal(result_nosort, df.groupby(col, sort=False).first()) + + +def test_groupby_categorical_two_columns(): + + # https://github.com/pandas-dev/pandas/issues/8138 + d = {'cat': + pd.Categorical(["a", "b", "a", "b"], categories=["a", "b", "c"], + ordered=True), + 'ints': [1, 1, 2, 2], + 'val': [10, 20, 30, 40]} + test = pd.DataFrame(d) + + # Grouping on a single column + groups_single_key = test.groupby("cat") + res = groups_single_key.agg('mean') + + exp_index = pd.CategoricalIndex(["a", "b", "c"], name="cat", + ordered=True) + exp = DataFrame({"ints": [1.5, 1.5, np.nan], "val": [20, 30, np.nan]}, + index=exp_index) + tm.assert_frame_equal(res, exp) + + # Grouping on two columns + groups_double_key = test.groupby(["cat", "ints"]) + res = groups_double_key.agg('mean') + exp = DataFrame({"val": [10, 30, 20, 40, np.nan, np.nan], + "cat": pd.Categorical(["a", "a", "b", "b", "c", "c"], + ordered=True), + "ints": [1, 2, 1, 2, 1, 2]}).set_index(["cat", "ints" + ]) + tm.assert_frame_equal(res, exp) + + # GH 10132 + for key in [('a', 1), ('b', 2), ('b', 1), ('a', 2)]: + c, i = key + result = groups_double_key.get_group(key) + expected = test[(test.cat == c) & (test.ints == i)] + assert_frame_equal(result, expected) + + d = {'C1': [3, 3, 4, 5], 'C2': [1, 2, 3, 4], 'C3': [10, 100, 200, 34]} + test = pd.DataFrame(d) + values = pd.cut(test['C1'], [1, 2, 3, 6]) + values.name = "cat" + groups_double_key = test.groupby([values, 'C2']) + + res = groups_double_key.agg('mean') + nan = np.nan + idx = MultiIndex.from_product( + [Categorical([Interval(1, 2), Interval(2, 3), + Interval(3, 6)], ordered=True), + [1, 2, 3, 4]], + names=["cat", "C2"]) + exp = DataFrame({"C1": [nan, nan, nan, nan, 3, 3, + nan, nan, nan, nan, 4, 5], + "C3": [nan, nan, nan, nan, 10, 100, + nan, nan, nan, nan, 200, 34]}, index=idx) + tm.assert_frame_equal(res, exp) + + +def test_empty_sum(): + # https://github.com/pandas-dev/pandas/issues/18678 + df = pd.DataFrame({"A": pd.Categorical(['a', 'a', 'b'], + categories=['a', 'b', 'c']), + 'B': [1, 2, 1]}) + expected_idx = pd.CategoricalIndex(['a', 'b', 'c'], name='A') + + # 0 by default + result = df.groupby("A").B.sum() + expected = pd.Series([3, 1, 0], expected_idx, name='B') + tm.assert_series_equal(result, expected) + + # min_count=0 + result = df.groupby("A").B.sum(min_count=0) + expected = pd.Series([3, 1, 0], expected_idx, name='B') + tm.assert_series_equal(result, expected) + + # min_count=1 + result = df.groupby("A").B.sum(min_count=1) + expected = pd.Series([3, 1, np.nan], expected_idx, name='B') + tm.assert_series_equal(result, expected) + + # min_count>1 + result = df.groupby("A").B.sum(min_count=2) + expected = pd.Series([3, np.nan, np.nan], expected_idx, name='B') + tm.assert_series_equal(result, expected) + + +def test_empty_prod(): + # https://github.com/pandas-dev/pandas/issues/18678 + df = pd.DataFrame({"A": 
pd.Categorical(['a', 'a', 'b'], + categories=['a', 'b', 'c']), + 'B': [1, 2, 1]}) + + expected_idx = pd.CategoricalIndex(['a', 'b', 'c'], name='A') + + # 1 by default + result = df.groupby("A").B.prod() + expected = pd.Series([2, 1, 1], expected_idx, name='B') + tm.assert_series_equal(result, expected) + + # min_count=0 + result = df.groupby("A").B.prod(min_count=0) + expected = pd.Series([2, 1, 1], expected_idx, name='B') + tm.assert_series_equal(result, expected) + + # min_count=1 + result = df.groupby("A").B.prod(min_count=1) + expected = pd.Series([2, 1, np.nan], expected_idx, name='B') + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_filters.py b/pandas/tests/groupby/test_filters.py index cac6b46af8f87..873d9f6076b69 100644 --- a/pandas/tests/groupby/test_filters.py +++ b/pandas/tests/groupby/test_filters.py @@ -1,622 +1,576 @@ # -*- coding: utf-8 -*- from __future__ import print_function -from numpy import nan - import pytest -from pandas import Timestamp -from pandas.core.index import MultiIndex -from pandas.core.api import DataFrame - -from pandas.core.series import Series - -from pandas.util.testing import (assert_frame_equal, assert_series_equal - ) -from pandas.compat import (lmap) - -from pandas import compat - -import pandas.core.common as com import numpy as np - import pandas.util.testing as tm +from pandas import Timestamp, DataFrame, Series import pandas as pd -class TestGroupByFilter(object): - - def setup_method(self, method): - self.ts = tm.makeTimeSeries() - - self.seriesd = tm.getSeriesData() - self.tsd = tm.getTimeSeriesData() - self.frame = DataFrame(self.seriesd) - self.tsframe = DataFrame(self.tsd) - - self.df = DataFrame( - {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), - 'D': np.random.randn(8)}) - - self.df_mixed_floats = DataFrame( - {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), - 'D': np.array( - np.random.randn(8), dtype='float32')}) - - index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', - 'three']], - labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['first', 'second']) - self.mframe = DataFrame(np.random.randn(10, 3), index=index, - columns=['A', 'B', 'C']) - - self.three_group = DataFrame( - {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', - 'foo', 'foo', 'foo'], - 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two', - 'two', 'two', 'one'], - 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny', - 'dull', 'shiny', 'shiny', 'shiny'], - 'D': np.random.randn(11), - 'E': np.random.randn(11), - 'F': np.random.randn(11)}) - - def test_filter_series(self): - s = pd.Series([1, 3, 20, 5, 22, 24, 7]) - expected_odd = pd.Series([1, 3, 5, 7], index=[0, 1, 3, 6]) - expected_even = pd.Series([20, 22, 24], index=[2, 4, 5]) - grouper = s.apply(lambda x: x % 2) - grouped = s.groupby(grouper) - assert_series_equal( - grouped.filter(lambda x: x.mean() < 10), expected_odd) - assert_series_equal( - grouped.filter(lambda x: x.mean() > 10), expected_even) - # Test dropna=False. 
- assert_series_equal( - grouped.filter(lambda x: x.mean() < 10, dropna=False), - expected_odd.reindex(s.index)) - assert_series_equal( - grouped.filter(lambda x: x.mean() > 10, dropna=False), - expected_even.reindex(s.index)) - - def test_filter_single_column_df(self): - df = pd.DataFrame([1, 3, 20, 5, 22, 24, 7]) - expected_odd = pd.DataFrame([1, 3, 5, 7], index=[0, 1, 3, 6]) - expected_even = pd.DataFrame([20, 22, 24], index=[2, 4, 5]) - grouper = df[0].apply(lambda x: x % 2) - grouped = df.groupby(grouper) - assert_frame_equal( - grouped.filter(lambda x: x.mean() < 10), expected_odd) - assert_frame_equal( - grouped.filter(lambda x: x.mean() > 10), expected_even) - # Test dropna=False. - assert_frame_equal( - grouped.filter(lambda x: x.mean() < 10, dropna=False), - expected_odd.reindex(df.index)) - assert_frame_equal( - grouped.filter(lambda x: x.mean() > 10, dropna=False), - expected_even.reindex(df.index)) - - def test_filter_multi_column_df(self): - df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': [1, 1, 1, 1]}) - grouper = df['A'].apply(lambda x: x % 2) - grouped = df.groupby(grouper) - expected = pd.DataFrame({'A': [12, 12], 'B': [1, 1]}, index=[1, 2]) - assert_frame_equal( - grouped.filter(lambda x: x['A'].sum() - x['B'].sum() > 10), - expected) - - def test_filter_mixed_df(self): - df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()}) - grouper = df['A'].apply(lambda x: x % 2) - grouped = df.groupby(grouper) - expected = pd.DataFrame({'A': [12, 12], 'B': ['b', 'c']}, index=[1, 2]) - assert_frame_equal( - grouped.filter(lambda x: x['A'].sum() > 10), expected) - - def test_filter_out_all_groups(self): - s = pd.Series([1, 3, 20, 5, 22, 24, 7]) - grouper = s.apply(lambda x: x % 2) - grouped = s.groupby(grouper) - assert_series_equal(grouped.filter(lambda x: x.mean() > 1000), s[[]]) - df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()}) - grouper = df['A'].apply(lambda x: x % 2) - grouped = df.groupby(grouper) - assert_frame_equal( - grouped.filter(lambda x: x['A'].sum() > 1000), df.loc[[]]) - - def test_filter_out_no_groups(self): - s = pd.Series([1, 3, 20, 5, 22, 24, 7]) - grouper = s.apply(lambda x: x % 2) - grouped = s.groupby(grouper) - filtered = grouped.filter(lambda x: x.mean() > 0) - assert_series_equal(filtered, s) - df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()}) - grouper = df['A'].apply(lambda x: x % 2) - grouped = df.groupby(grouper) - filtered = grouped.filter(lambda x: x['A'].mean() > 0) - assert_frame_equal(filtered, df) - - def test_filter_out_all_groups_in_df(self): - # GH12768 - df = pd.DataFrame({'a': [1, 1, 2], 'b': [1, 2, 0]}) - res = df.groupby('a') - res = res.filter(lambda x: x['b'].sum() > 5, dropna=False) - expected = pd.DataFrame({'a': [nan] * 3, 'b': [nan] * 3}) - assert_frame_equal(expected, res) - - df = pd.DataFrame({'a': [1, 1, 2], 'b': [1, 2, 0]}) - res = df.groupby('a') - res = res.filter(lambda x: x['b'].sum() > 5, dropna=True) - expected = pd.DataFrame({'a': [], 'b': []}, dtype="int64") - assert_frame_equal(expected, res) - - def test_filter_condition_raises(self): - def raise_if_sum_is_zero(x): - if x.sum() == 0: - raise ValueError - else: - return x.sum() > 0 - - s = pd.Series([-1, 0, 1, 2]) - grouper = s.apply(lambda x: x % 2) - grouped = s.groupby(grouper) - pytest.raises(TypeError, - lambda: grouped.filter(raise_if_sum_is_zero)) - - def test_filter_with_axis_in_groupby(self): - # issue 11041 - index = pd.MultiIndex.from_product([range(10), [0, 1]]) - data = pd.DataFrame( - np.arange(100).reshape(-1, 20), 
columns=index, dtype='int64') - result = data.groupby(level=0, - axis=1).filter(lambda x: x.iloc[0, 0] > 10) - expected = data.iloc[:, 12:20] - assert_frame_equal(result, expected) - - def test_filter_bad_shapes(self): - df = DataFrame({'A': np.arange(8), - 'B': list('aabbbbcc'), - 'C': np.arange(8)}) - s = df['B'] - g_df = df.groupby('B') - g_s = s.groupby(s) - - f = lambda x: x - pytest.raises(TypeError, lambda: g_df.filter(f)) - pytest.raises(TypeError, lambda: g_s.filter(f)) - - f = lambda x: x == 1 - pytest.raises(TypeError, lambda: g_df.filter(f)) - pytest.raises(TypeError, lambda: g_s.filter(f)) - - f = lambda x: np.outer(x, x) - pytest.raises(TypeError, lambda: g_df.filter(f)) - pytest.raises(TypeError, lambda: g_s.filter(f)) - - def test_filter_nan_is_false(self): - df = DataFrame({'A': np.arange(8), - 'B': list('aabbbbcc'), - 'C': np.arange(8)}) - s = df['B'] - g_df = df.groupby(df['B']) - g_s = s.groupby(s) - - f = lambda x: np.nan - assert_frame_equal(g_df.filter(f), df.loc[[]]) - assert_series_equal(g_s.filter(f), s[[]]) - - def test_filter_against_workaround(self): - np.random.seed(0) - # Series of ints - s = Series(np.random.randint(0, 100, 1000)) - grouper = s.apply(lambda x: np.round(x, -1)) - grouped = s.groupby(grouper) - f = lambda x: x.mean() > 10 - - old_way = s[grouped.transform(f).astype('bool')] - new_way = grouped.filter(f) - assert_series_equal(new_way.sort_values(), old_way.sort_values()) - - # Series of floats - s = 100 * Series(np.random.random(1000)) - grouper = s.apply(lambda x: np.round(x, -1)) - grouped = s.groupby(grouper) - f = lambda x: x.mean() > 10 - old_way = s[grouped.transform(f).astype('bool')] - new_way = grouped.filter(f) - assert_series_equal(new_way.sort_values(), old_way.sort_values()) - - # Set up DataFrame of ints, floats, strings. - from string import ascii_lowercase - letters = np.array(list(ascii_lowercase)) - N = 1000 - random_letters = letters.take(np.random.randint(0, 26, N)) - df = DataFrame({'ints': Series(np.random.randint(0, 100, N)), - 'floats': N / 10 * Series(np.random.random(N)), - 'letters': Series(random_letters)}) - - # Group by ints; filter on floats. - grouped = df.groupby('ints') - old_way = df[grouped.floats. - transform(lambda x: x.mean() > N / 20).astype('bool')] - new_way = grouped.filter(lambda x: x['floats'].mean() > N / 20) - assert_frame_equal(new_way, old_way) - - # Group by floats (rounded); filter on strings. - grouper = df.floats.apply(lambda x: np.round(x, -1)) - grouped = df.groupby(grouper) - old_way = df[grouped.letters. - transform(lambda x: len(x) < N / 10).astype('bool')] - new_way = grouped.filter(lambda x: len(x.letters) < N / 10) - assert_frame_equal(new_way, old_way) - - # Group by strings; filter on ints. - grouped = df.groupby('letters') - old_way = df[grouped.ints. - transform(lambda x: x.mean() > N / 20).astype('bool')] - new_way = grouped.filter(lambda x: x['ints'].mean() > N / 20) - assert_frame_equal(new_way, old_way) - - def test_filter_using_len(self): - # BUG GH4447 - df = DataFrame({'A': np.arange(8), - 'B': list('aabbbbcc'), - 'C': np.arange(8)}) - grouped = df.groupby('B') - actual = grouped.filter(lambda x: len(x) > 2) - expected = DataFrame( - {'A': np.arange(2, 6), - 'B': list('bbbb'), - 'C': np.arange(2, 6)}, index=np.arange(2, 6)) - assert_frame_equal(actual, expected) - - actual = grouped.filter(lambda x: len(x) > 4) - expected = df.loc[[]] - assert_frame_equal(actual, expected) - - # Series have always worked properly, but we'll test anyway. 
- s = df['B'] - grouped = s.groupby(s) - actual = grouped.filter(lambda x: len(x) > 2) - expected = Series(4 * ['b'], index=np.arange(2, 6), name='B') - assert_series_equal(actual, expected) - - actual = grouped.filter(lambda x: len(x) > 4) - expected = s[[]] - assert_series_equal(actual, expected) - - def test_filter_maintains_ordering(self): - # Simple case: index is sequential. #4621 - df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], - 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}) - s = df['pid'] - grouped = df.groupby('tag') - actual = grouped.filter(lambda x: len(x) > 1) - expected = df.iloc[[1, 2, 4, 7]] - assert_frame_equal(actual, expected) - - grouped = s.groupby(df['tag']) - actual = grouped.filter(lambda x: len(x) > 1) - expected = s.iloc[[1, 2, 4, 7]] - assert_series_equal(actual, expected) - - # Now index is sequentially decreasing. - df.index = np.arange(len(df) - 1, -1, -1) - s = df['pid'] - grouped = df.groupby('tag') - actual = grouped.filter(lambda x: len(x) > 1) - expected = df.iloc[[1, 2, 4, 7]] - assert_frame_equal(actual, expected) - - grouped = s.groupby(df['tag']) - actual = grouped.filter(lambda x: len(x) > 1) - expected = s.iloc[[1, 2, 4, 7]] - assert_series_equal(actual, expected) - - # Index is shuffled. - SHUFFLED = [4, 6, 7, 2, 1, 0, 5, 3] - df.index = df.index[SHUFFLED] - s = df['pid'] - grouped = df.groupby('tag') - actual = grouped.filter(lambda x: len(x) > 1) - expected = df.iloc[[1, 2, 4, 7]] - assert_frame_equal(actual, expected) - - grouped = s.groupby(df['tag']) - actual = grouped.filter(lambda x: len(x) > 1) - expected = s.iloc[[1, 2, 4, 7]] - assert_series_equal(actual, expected) - - def test_filter_multiple_timestamp(self): - # GH 10114 - df = DataFrame({'A': np.arange(5, dtype='int64'), - 'B': ['foo', 'bar', 'foo', 'bar', 'bar'], - 'C': Timestamp('20130101')}) - - grouped = df.groupby(['B', 'C']) - - result = grouped['A'].filter(lambda x: True) - assert_series_equal(df['A'], result) - - result = grouped['A'].transform(len) - expected = Series([2, 3, 2, 3, 3], name='A') - assert_series_equal(result, expected) - - result = grouped.filter(lambda x: True) - assert_frame_equal(df, result) - - result = grouped.transform('sum') - expected = DataFrame({'A': [2, 8, 2, 8, 8]}) - assert_frame_equal(result, expected) - - result = grouped.transform(len) - expected = DataFrame({'A': [2, 3, 2, 3, 3]}) - assert_frame_equal(result, expected) - - def test_filter_and_transform_with_non_unique_int_index(self): - # GH4620 - index = [1, 1, 1, 2, 1, 1, 0, 1] - df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], - 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index) - grouped_df = df.groupby('tag') - ser = df['pid'] - grouped_ser = ser.groupby(df['tag']) - expected_indexes = [1, 2, 4, 7] - - # Filter DataFrame - actual = grouped_df.filter(lambda x: len(x) > 1) - expected = df.iloc[expected_indexes] - assert_frame_equal(actual, expected) - - actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) - expected = df.copy() - expected.iloc[[0, 3, 5, 6]] = np.nan - assert_frame_equal(actual, expected) - - # Filter Series - actual = grouped_ser.filter(lambda x: len(x) > 1) - expected = ser.take(expected_indexes) - assert_series_equal(actual, expected) - - actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) - NA = np.nan - expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid') - # ^ made manually because this can get confusing! 
- assert_series_equal(actual, expected) - - # Transform Series - actual = grouped_ser.transform(len) - expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid') - assert_series_equal(actual, expected) - - # Transform (a column from) DataFrameGroupBy - actual = grouped_df.pid.transform(len) - assert_series_equal(actual, expected) - - def test_filter_and_transform_with_multiple_non_unique_int_index(self): - # GH4620 - index = [1, 1, 1, 2, 0, 0, 0, 1] - df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], - 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index) - grouped_df = df.groupby('tag') - ser = df['pid'] - grouped_ser = ser.groupby(df['tag']) - expected_indexes = [1, 2, 4, 7] - - # Filter DataFrame - actual = grouped_df.filter(lambda x: len(x) > 1) - expected = df.iloc[expected_indexes] - assert_frame_equal(actual, expected) - - actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) - expected = df.copy() - expected.iloc[[0, 3, 5, 6]] = np.nan - assert_frame_equal(actual, expected) - - # Filter Series - actual = grouped_ser.filter(lambda x: len(x) > 1) - expected = ser.take(expected_indexes) - assert_series_equal(actual, expected) - - actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) - NA = np.nan - expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid') - # ^ made manually because this can get confusing! - assert_series_equal(actual, expected) - - # Transform Series - actual = grouped_ser.transform(len) - expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid') - assert_series_equal(actual, expected) - - # Transform (a column from) DataFrameGroupBy - actual = grouped_df.pid.transform(len) - assert_series_equal(actual, expected) - - def test_filter_and_transform_with_non_unique_float_index(self): - # GH4620 - index = np.array([1, 1, 1, 2, 1, 1, 0, 1], dtype=float) - df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], - 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index) - grouped_df = df.groupby('tag') - ser = df['pid'] - grouped_ser = ser.groupby(df['tag']) - expected_indexes = [1, 2, 4, 7] - - # Filter DataFrame - actual = grouped_df.filter(lambda x: len(x) > 1) - expected = df.iloc[expected_indexes] - assert_frame_equal(actual, expected) - - actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) - expected = df.copy() - expected.iloc[[0, 3, 5, 6]] = np.nan - assert_frame_equal(actual, expected) - - # Filter Series - actual = grouped_ser.filter(lambda x: len(x) > 1) - expected = ser.take(expected_indexes) - assert_series_equal(actual, expected) - - actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) - NA = np.nan - expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid') - # ^ made manually because this can get confusing! 
- assert_series_equal(actual, expected) - - # Transform Series - actual = grouped_ser.transform(len) - expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid') - assert_series_equal(actual, expected) - - # Transform (a column from) DataFrameGroupBy - actual = grouped_df.pid.transform(len) - assert_series_equal(actual, expected) - - def test_filter_and_transform_with_non_unique_timestamp_index(self): - # GH4620 - t0 = Timestamp('2013-09-30 00:05:00') - t1 = Timestamp('2013-10-30 00:05:00') - t2 = Timestamp('2013-11-30 00:05:00') - index = [t1, t1, t1, t2, t1, t1, t0, t1] - df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], - 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index) - grouped_df = df.groupby('tag') - ser = df['pid'] - grouped_ser = ser.groupby(df['tag']) - expected_indexes = [1, 2, 4, 7] - - # Filter DataFrame - actual = grouped_df.filter(lambda x: len(x) > 1) - expected = df.iloc[expected_indexes] - assert_frame_equal(actual, expected) - - actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) - expected = df.copy() - expected.iloc[[0, 3, 5, 6]] = np.nan - assert_frame_equal(actual, expected) - - # Filter Series - actual = grouped_ser.filter(lambda x: len(x) > 1) - expected = ser.take(expected_indexes) - assert_series_equal(actual, expected) - - actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) - NA = np.nan - expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid') - # ^ made manually because this can get confusing! - assert_series_equal(actual, expected) - - # Transform Series - actual = grouped_ser.transform(len) - expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid') - assert_series_equal(actual, expected) - - # Transform (a column from) DataFrameGroupBy - actual = grouped_df.pid.transform(len) - assert_series_equal(actual, expected) - - def test_filter_and_transform_with_non_unique_string_index(self): - # GH4620 - index = list('bbbcbbab') - df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], - 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index) - grouped_df = df.groupby('tag') - ser = df['pid'] - grouped_ser = ser.groupby(df['tag']) - expected_indexes = [1, 2, 4, 7] - - # Filter DataFrame - actual = grouped_df.filter(lambda x: len(x) > 1) - expected = df.iloc[expected_indexes] - assert_frame_equal(actual, expected) - - actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) - expected = df.copy() - expected.iloc[[0, 3, 5, 6]] = np.nan - assert_frame_equal(actual, expected) - - # Filter Series - actual = grouped_ser.filter(lambda x: len(x) > 1) - expected = ser.take(expected_indexes) - assert_series_equal(actual, expected) - - actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) - NA = np.nan - expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid') - # ^ made manually because this can get confusing! - assert_series_equal(actual, expected) - - # Transform Series - actual = grouped_ser.transform(len) - expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid') - assert_series_equal(actual, expected) - - # Transform (a column from) DataFrameGroupBy - actual = grouped_df.pid.transform(len) - assert_series_equal(actual, expected) - - def test_filter_has_access_to_grouped_cols(self): - df = DataFrame([[1, 2], [1, 3], [5, 6]], columns=['A', 'B']) - g = df.groupby('A') - # previously didn't have access to col A #???? 
- filt = g.filter(lambda x: x['A'].sum() == 2) - assert_frame_equal(filt, df.iloc[[0, 1]]) - - def test_filter_enforces_scalarness(self): - df = pd.DataFrame([ - ['best', 'a', 'x'], - ['worst', 'b', 'y'], - ['best', 'c', 'x'], - ['best', 'd', 'y'], - ['worst', 'd', 'y'], - ['worst', 'd', 'y'], - ['best', 'd', 'z'], - ], columns=['a', 'b', 'c']) - with tm.assert_raises_regex(TypeError, - 'filter function returned a.*'): - df.groupby('c').filter(lambda g: g['a'] == 'best') - - def test_filter_non_bool_raises(self): - df = pd.DataFrame([ - ['best', 'a', 1], - ['worst', 'b', 1], - ['best', 'c', 1], - ['best', 'd', 1], - ['worst', 'd', 1], - ['worst', 'd', 1], - ['best', 'd', 1], - ], columns=['a', 'b', 'c']) - with tm.assert_raises_regex(TypeError, - 'filter function returned a.*'): - df.groupby('a').filter(lambda g: g.c.mean()) - - def test_filter_dropna_with_empty_groups(self): - # GH 10780 - data = pd.Series(np.random.rand(9), index=np.repeat([1, 2, 3], 3)) - groupped = data.groupby(level=0) - result_false = groupped.filter(lambda x: x.mean() > 1, dropna=False) - expected_false = pd.Series([np.nan] * 9, - index=np.repeat([1, 2, 3], 3)) - tm.assert_series_equal(result_false, expected_false) - - result_true = groupped.filter(lambda x: x.mean() > 1, dropna=True) - expected_true = pd.Series(index=pd.Index([], dtype=int)) - tm.assert_series_equal(result_true, expected_true) - - -def assert_fp_equal(a, b): - assert (np.abs(a - b) < 1e-12).all() - - -def _check_groupby(df, result, keys, field, f=lambda x: x.sum()): - tups = lmap(tuple, df[keys].values) - tups = com._asarray_tuplesafe(tups) - expected = f(df.groupby(tups)[field]) - for k, v in compat.iteritems(expected): - assert (result[k] == v) +def test_filter_series(): + s = pd.Series([1, 3, 20, 5, 22, 24, 7]) + expected_odd = pd.Series([1, 3, 5, 7], index=[0, 1, 3, 6]) + expected_even = pd.Series([20, 22, 24], index=[2, 4, 5]) + grouper = s.apply(lambda x: x % 2) + grouped = s.groupby(grouper) + tm.assert_series_equal( + grouped.filter(lambda x: x.mean() < 10), expected_odd) + tm.assert_series_equal( + grouped.filter(lambda x: x.mean() > 10), expected_even) + # Test dropna=False. + tm.assert_series_equal( + grouped.filter(lambda x: x.mean() < 10, dropna=False), + expected_odd.reindex(s.index)) + tm.assert_series_equal( + grouped.filter(lambda x: x.mean() > 10, dropna=False), + expected_even.reindex(s.index)) + + +def test_filter_single_column_df(): + df = pd.DataFrame([1, 3, 20, 5, 22, 24, 7]) + expected_odd = pd.DataFrame([1, 3, 5, 7], index=[0, 1, 3, 6]) + expected_even = pd.DataFrame([20, 22, 24], index=[2, 4, 5]) + grouper = df[0].apply(lambda x: x % 2) + grouped = df.groupby(grouper) + tm.assert_frame_equal( + grouped.filter(lambda x: x.mean() < 10), expected_odd) + tm.assert_frame_equal( + grouped.filter(lambda x: x.mean() > 10), expected_even) + # Test dropna=False. 
+ tm.assert_frame_equal( + grouped.filter(lambda x: x.mean() < 10, dropna=False), + expected_odd.reindex(df.index)) + tm.assert_frame_equal( + grouped.filter(lambda x: x.mean() > 10, dropna=False), + expected_even.reindex(df.index)) + + +def test_filter_multi_column_df(): + df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': [1, 1, 1, 1]}) + grouper = df['A'].apply(lambda x: x % 2) + grouped = df.groupby(grouper) + expected = pd.DataFrame({'A': [12, 12], 'B': [1, 1]}, index=[1, 2]) + tm.assert_frame_equal( + grouped.filter(lambda x: x['A'].sum() - x['B'].sum() > 10), + expected) + + +def test_filter_mixed_df(): + df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()}) + grouper = df['A'].apply(lambda x: x % 2) + grouped = df.groupby(grouper) + expected = pd.DataFrame({'A': [12, 12], 'B': ['b', 'c']}, index=[1, 2]) + tm.assert_frame_equal( + grouped.filter(lambda x: x['A'].sum() > 10), expected) + + +def test_filter_out_all_groups(): + s = pd.Series([1, 3, 20, 5, 22, 24, 7]) + grouper = s.apply(lambda x: x % 2) + grouped = s.groupby(grouper) + tm.assert_series_equal(grouped.filter(lambda x: x.mean() > 1000), s[[]]) + df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()}) + grouper = df['A'].apply(lambda x: x % 2) + grouped = df.groupby(grouper) + tm.assert_frame_equal( + grouped.filter(lambda x: x['A'].sum() > 1000), df.loc[[]]) + + +def test_filter_out_no_groups(): + s = pd.Series([1, 3, 20, 5, 22, 24, 7]) + grouper = s.apply(lambda x: x % 2) + grouped = s.groupby(grouper) + filtered = grouped.filter(lambda x: x.mean() > 0) + tm.assert_series_equal(filtered, s) + df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()}) + grouper = df['A'].apply(lambda x: x % 2) + grouped = df.groupby(grouper) + filtered = grouped.filter(lambda x: x['A'].mean() > 0) + tm.assert_frame_equal(filtered, df) + + +def test_filter_out_all_groups_in_df(): + # GH12768 + df = pd.DataFrame({'a': [1, 1, 2], 'b': [1, 2, 0]}) + res = df.groupby('a') + res = res.filter(lambda x: x['b'].sum() > 5, dropna=False) + expected = pd.DataFrame({'a': [np.nan] * 3, 'b': [np.nan] * 3}) + tm.assert_frame_equal(expected, res) + + df = pd.DataFrame({'a': [1, 1, 2], 'b': [1, 2, 0]}) + res = df.groupby('a') + res = res.filter(lambda x: x['b'].sum() > 5, dropna=True) + expected = pd.DataFrame({'a': [], 'b': []}, dtype="int64") + tm.assert_frame_equal(expected, res) + + +def test_filter_condition_raises(): + def raise_if_sum_is_zero(x): + if x.sum() == 0: + raise ValueError + else: + return x.sum() > 0 + + s = pd.Series([-1, 0, 1, 2]) + grouper = s.apply(lambda x: x % 2) + grouped = s.groupby(grouper) + pytest.raises(TypeError, + lambda: grouped.filter(raise_if_sum_is_zero)) + + +def test_filter_with_axis_in_groupby(): + # issue 11041 + index = pd.MultiIndex.from_product([range(10), [0, 1]]) + data = pd.DataFrame( + np.arange(100).reshape(-1, 20), columns=index, dtype='int64') + result = data.groupby(level=0, + axis=1).filter(lambda x: x.iloc[0, 0] > 10) + expected = data.iloc[:, 12:20] + tm.assert_frame_equal(result, expected) + + +def test_filter_bad_shapes(): + df = DataFrame({'A': np.arange(8), + 'B': list('aabbbbcc'), + 'C': np.arange(8)}) + s = df['B'] + g_df = df.groupby('B') + g_s = s.groupby(s) + + f = lambda x: x + pytest.raises(TypeError, lambda: g_df.filter(f)) + pytest.raises(TypeError, lambda: g_s.filter(f)) + + f = lambda x: x == 1 + pytest.raises(TypeError, lambda: g_df.filter(f)) + pytest.raises(TypeError, lambda: g_s.filter(f)) + + f = lambda x: np.outer(x, x) + pytest.raises(TypeError, lambda: 
g_df.filter(f)) + pytest.raises(TypeError, lambda: g_s.filter(f)) + + +def test_filter_nan_is_false(): + df = DataFrame({'A': np.arange(8), + 'B': list('aabbbbcc'), + 'C': np.arange(8)}) + s = df['B'] + g_df = df.groupby(df['B']) + g_s = s.groupby(s) + + f = lambda x: np.nan + tm.assert_frame_equal(g_df.filter(f), df.loc[[]]) + tm.assert_series_equal(g_s.filter(f), s[[]]) + + +def test_filter_against_workaround(): + np.random.seed(0) + # Series of ints + s = Series(np.random.randint(0, 100, 1000)) + grouper = s.apply(lambda x: np.round(x, -1)) + grouped = s.groupby(grouper) + f = lambda x: x.mean() > 10 + + old_way = s[grouped.transform(f).astype('bool')] + new_way = grouped.filter(f) + tm.assert_series_equal(new_way.sort_values(), old_way.sort_values()) + + # Series of floats + s = 100 * Series(np.random.random(1000)) + grouper = s.apply(lambda x: np.round(x, -1)) + grouped = s.groupby(grouper) + f = lambda x: x.mean() > 10 + old_way = s[grouped.transform(f).astype('bool')] + new_way = grouped.filter(f) + tm.assert_series_equal(new_way.sort_values(), old_way.sort_values()) + + # Set up DataFrame of ints, floats, strings. + from string import ascii_lowercase + letters = np.array(list(ascii_lowercase)) + N = 1000 + random_letters = letters.take(np.random.randint(0, 26, N)) + df = DataFrame({'ints': Series(np.random.randint(0, 100, N)), + 'floats': N / 10 * Series(np.random.random(N)), + 'letters': Series(random_letters)}) + + # Group by ints; filter on floats. + grouped = df.groupby('ints') + old_way = df[grouped.floats. + transform(lambda x: x.mean() > N / 20).astype('bool')] + new_way = grouped.filter(lambda x: x['floats'].mean() > N / 20) + tm.assert_frame_equal(new_way, old_way) + + # Group by floats (rounded); filter on strings. + grouper = df.floats.apply(lambda x: np.round(x, -1)) + grouped = df.groupby(grouper) + old_way = df[grouped.letters. + transform(lambda x: len(x) < N / 10).astype('bool')] + new_way = grouped.filter(lambda x: len(x.letters) < N / 10) + tm.assert_frame_equal(new_way, old_way) + + # Group by strings; filter on ints. + grouped = df.groupby('letters') + old_way = df[grouped.ints. + transform(lambda x: x.mean() > N / 20).astype('bool')] + new_way = grouped.filter(lambda x: x['ints'].mean() > N / 20) + tm.assert_frame_equal(new_way, old_way) + + +def test_filter_using_len(): + # BUG GH4447 + df = DataFrame({'A': np.arange(8), + 'B': list('aabbbbcc'), + 'C': np.arange(8)}) + grouped = df.groupby('B') + actual = grouped.filter(lambda x: len(x) > 2) + expected = DataFrame( + {'A': np.arange(2, 6), + 'B': list('bbbb'), + 'C': np.arange(2, 6)}, index=np.arange(2, 6)) + tm.assert_frame_equal(actual, expected) + + actual = grouped.filter(lambda x: len(x) > 4) + expected = df.loc[[]] + tm.assert_frame_equal(actual, expected) + + # Series have always worked properly, but we'll test anyway. + s = df['B'] + grouped = s.groupby(s) + actual = grouped.filter(lambda x: len(x) > 2) + expected = Series(4 * ['b'], index=np.arange(2, 6), name='B') + tm.assert_series_equal(actual, expected) + + actual = grouped.filter(lambda x: len(x) > 4) + expected = s[[]] + tm.assert_series_equal(actual, expected) + + +def test_filter_maintains_ordering(): + # Simple case: index is sequential. 
#4621 + df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], + 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}) + s = df['pid'] + grouped = df.groupby('tag') + actual = grouped.filter(lambda x: len(x) > 1) + expected = df.iloc[[1, 2, 4, 7]] + tm.assert_frame_equal(actual, expected) + + grouped = s.groupby(df['tag']) + actual = grouped.filter(lambda x: len(x) > 1) + expected = s.iloc[[1, 2, 4, 7]] + tm.assert_series_equal(actual, expected) + + # Now index is sequentially decreasing. + df.index = np.arange(len(df) - 1, -1, -1) + s = df['pid'] + grouped = df.groupby('tag') + actual = grouped.filter(lambda x: len(x) > 1) + expected = df.iloc[[1, 2, 4, 7]] + tm.assert_frame_equal(actual, expected) + + grouped = s.groupby(df['tag']) + actual = grouped.filter(lambda x: len(x) > 1) + expected = s.iloc[[1, 2, 4, 7]] + tm.assert_series_equal(actual, expected) + + # Index is shuffled. + SHUFFLED = [4, 6, 7, 2, 1, 0, 5, 3] + df.index = df.index[SHUFFLED] + s = df['pid'] + grouped = df.groupby('tag') + actual = grouped.filter(lambda x: len(x) > 1) + expected = df.iloc[[1, 2, 4, 7]] + tm.assert_frame_equal(actual, expected) + + grouped = s.groupby(df['tag']) + actual = grouped.filter(lambda x: len(x) > 1) + expected = s.iloc[[1, 2, 4, 7]] + tm.assert_series_equal(actual, expected) + + +def test_filter_multiple_timestamp(): + # GH 10114 + df = DataFrame({'A': np.arange(5, dtype='int64'), + 'B': ['foo', 'bar', 'foo', 'bar', 'bar'], + 'C': Timestamp('20130101')}) + + grouped = df.groupby(['B', 'C']) + + result = grouped['A'].filter(lambda x: True) + tm.assert_series_equal(df['A'], result) + + result = grouped['A'].transform(len) + expected = Series([2, 3, 2, 3, 3], name='A') + tm.assert_series_equal(result, expected) + + result = grouped.filter(lambda x: True) + tm.assert_frame_equal(df, result) + + result = grouped.transform('sum') + expected = DataFrame({'A': [2, 8, 2, 8, 8]}) + tm.assert_frame_equal(result, expected) + + result = grouped.transform(len) + expected = DataFrame({'A': [2, 3, 2, 3, 3]}) + tm.assert_frame_equal(result, expected) + + +def test_filter_and_transform_with_non_unique_int_index(): + # GH4620 + index = [1, 1, 1, 2, 1, 1, 0, 1] + df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], + 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index) + grouped_df = df.groupby('tag') + ser = df['pid'] + grouped_ser = ser.groupby(df['tag']) + expected_indexes = [1, 2, 4, 7] + + # Filter DataFrame + actual = grouped_df.filter(lambda x: len(x) > 1) + expected = df.iloc[expected_indexes] + tm.assert_frame_equal(actual, expected) + + actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) + expected = df.copy() + expected.iloc[[0, 3, 5, 6]] = np.nan + tm.assert_frame_equal(actual, expected) + + # Filter Series + actual = grouped_ser.filter(lambda x: len(x) > 1) + expected = ser.take(expected_indexes) + tm.assert_series_equal(actual, expected) + + actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) + NA = np.nan + expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid') + # ^ made manually because this can get confusing! 
+ tm.assert_series_equal(actual, expected) + + # Transform Series + actual = grouped_ser.transform(len) + expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid') + tm.assert_series_equal(actual, expected) + + # Transform (a column from) DataFrameGroupBy + actual = grouped_df.pid.transform(len) + tm.assert_series_equal(actual, expected) + + +def test_filter_and_transform_with_multiple_non_unique_int_index(): + # GH4620 + index = [1, 1, 1, 2, 0, 0, 0, 1] + df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], + 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index) + grouped_df = df.groupby('tag') + ser = df['pid'] + grouped_ser = ser.groupby(df['tag']) + expected_indexes = [1, 2, 4, 7] + + # Filter DataFrame + actual = grouped_df.filter(lambda x: len(x) > 1) + expected = df.iloc[expected_indexes] + tm.assert_frame_equal(actual, expected) + + actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) + expected = df.copy() + expected.iloc[[0, 3, 5, 6]] = np.nan + tm.assert_frame_equal(actual, expected) + + # Filter Series + actual = grouped_ser.filter(lambda x: len(x) > 1) + expected = ser.take(expected_indexes) + tm.assert_series_equal(actual, expected) + + actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) + NA = np.nan + expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid') + # ^ made manually because this can get confusing! + tm.assert_series_equal(actual, expected) + + # Transform Series + actual = grouped_ser.transform(len) + expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid') + tm.assert_series_equal(actual, expected) + + # Transform (a column from) DataFrameGroupBy + actual = grouped_df.pid.transform(len) + tm.assert_series_equal(actual, expected) + + +def test_filter_and_transform_with_non_unique_float_index(): + # GH4620 + index = np.array([1, 1, 1, 2, 1, 1, 0, 1], dtype=float) + df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], + 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index) + grouped_df = df.groupby('tag') + ser = df['pid'] + grouped_ser = ser.groupby(df['tag']) + expected_indexes = [1, 2, 4, 7] + + # Filter DataFrame + actual = grouped_df.filter(lambda x: len(x) > 1) + expected = df.iloc[expected_indexes] + tm.assert_frame_equal(actual, expected) + + actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) + expected = df.copy() + expected.iloc[[0, 3, 5, 6]] = np.nan + tm.assert_frame_equal(actual, expected) + + # Filter Series + actual = grouped_ser.filter(lambda x: len(x) > 1) + expected = ser.take(expected_indexes) + tm.assert_series_equal(actual, expected) + + actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) + NA = np.nan + expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid') + # ^ made manually because this can get confusing! 
+ tm.assert_series_equal(actual, expected) + + # Transform Series + actual = grouped_ser.transform(len) + expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid') + tm.assert_series_equal(actual, expected) + + # Transform (a column from) DataFrameGroupBy + actual = grouped_df.pid.transform(len) + tm.assert_series_equal(actual, expected) + + +def test_filter_and_transform_with_non_unique_timestamp_index(): + # GH4620 + t0 = Timestamp('2013-09-30 00:05:00') + t1 = Timestamp('2013-10-30 00:05:00') + t2 = Timestamp('2013-11-30 00:05:00') + index = [t1, t1, t1, t2, t1, t1, t0, t1] + df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], + 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index) + grouped_df = df.groupby('tag') + ser = df['pid'] + grouped_ser = ser.groupby(df['tag']) + expected_indexes = [1, 2, 4, 7] + + # Filter DataFrame + actual = grouped_df.filter(lambda x: len(x) > 1) + expected = df.iloc[expected_indexes] + tm.assert_frame_equal(actual, expected) + + actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) + expected = df.copy() + expected.iloc[[0, 3, 5, 6]] = np.nan + tm.assert_frame_equal(actual, expected) + + # Filter Series + actual = grouped_ser.filter(lambda x: len(x) > 1) + expected = ser.take(expected_indexes) + tm.assert_series_equal(actual, expected) + + actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) + NA = np.nan + expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid') + # ^ made manually because this can get confusing! + tm.assert_series_equal(actual, expected) + + # Transform Series + actual = grouped_ser.transform(len) + expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid') + tm.assert_series_equal(actual, expected) + + # Transform (a column from) DataFrameGroupBy + actual = grouped_df.pid.transform(len) + tm.assert_series_equal(actual, expected) + + +def test_filter_and_transform_with_non_unique_string_index(): + # GH4620 + index = list('bbbcbbab') + df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], + 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index) + grouped_df = df.groupby('tag') + ser = df['pid'] + grouped_ser = ser.groupby(df['tag']) + expected_indexes = [1, 2, 4, 7] + + # Filter DataFrame + actual = grouped_df.filter(lambda x: len(x) > 1) + expected = df.iloc[expected_indexes] + tm.assert_frame_equal(actual, expected) + + actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) + expected = df.copy() + expected.iloc[[0, 3, 5, 6]] = np.nan + tm.assert_frame_equal(actual, expected) + + # Filter Series + actual = grouped_ser.filter(lambda x: len(x) > 1) + expected = ser.take(expected_indexes) + tm.assert_series_equal(actual, expected) + + actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) + NA = np.nan + expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid') + # ^ made manually because this can get confusing! + tm.assert_series_equal(actual, expected) + + # Transform Series + actual = grouped_ser.transform(len) + expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid') + tm.assert_series_equal(actual, expected) + + # Transform (a column from) DataFrameGroupBy + actual = grouped_df.pid.transform(len) + tm.assert_series_equal(actual, expected) + + +def test_filter_has_access_to_grouped_cols(): + df = DataFrame([[1, 2], [1, 3], [5, 6]], columns=['A', 'B']) + g = df.groupby('A') + # previously didn't have access to col A #???? 
+    filt = g.filter(lambda x: x['A'].sum() == 2)
+    tm.assert_frame_equal(filt, df.iloc[[0, 1]])
+
+
+def test_filter_enforces_scalarness():
+    df = pd.DataFrame([
+        ['best', 'a', 'x'],
+        ['worst', 'b', 'y'],
+        ['best', 'c', 'x'],
+        ['best', 'd', 'y'],
+        ['worst', 'd', 'y'],
+        ['worst', 'd', 'y'],
+        ['best', 'd', 'z'],
+    ], columns=['a', 'b', 'c'])
+    with tm.assert_raises_regex(TypeError,
+                                'filter function returned a.*'):
+        df.groupby('c').filter(lambda g: g['a'] == 'best')
+
+
+def test_filter_non_bool_raises():
+    df = pd.DataFrame([
+        ['best', 'a', 1],
+        ['worst', 'b', 1],
+        ['best', 'c', 1],
+        ['best', 'd', 1],
+        ['worst', 'd', 1],
+        ['worst', 'd', 1],
+        ['best', 'd', 1],
+    ], columns=['a', 'b', 'c'])
+    with tm.assert_raises_regex(TypeError,
+                                'filter function returned a.*'):
+        df.groupby('a').filter(lambda g: g.c.mean())
+
+
+def test_filter_dropna_with_empty_groups():
+    # GH 10780
+    data = pd.Series(np.random.rand(9), index=np.repeat([1, 2, 3], 3))
+    grouped = data.groupby(level=0)
+    result_false = grouped.filter(lambda x: x.mean() > 1, dropna=False)
+    expected_false = pd.Series([np.nan] * 9,
+                               index=np.repeat([1, 2, 3], 3))
+    tm.assert_series_equal(result_false, expected_false)
+
+    result_true = grouped.filter(lambda x: x.mean() > 1, dropna=True)
+    expected_true = pd.Series(index=pd.Index([], dtype=int))
+    tm.assert_series_equal(result_true, expected_true)
diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
new file mode 100644
index 0000000000000..ba1371fe9f931
--- /dev/null
+++ b/pandas/tests/groupby/test_function.py
@@ -0,0 +1,1120 @@
+import pytest
+
+import numpy as np
+import pandas as pd
+from pandas import (DataFrame, Index, compat, isna,
+                    Series, MultiIndex, Timestamp, date_range)
+from pandas.errors import UnsupportedFunctionCall
+from pandas.util import testing as tm
+import pandas.core.nanops as nanops
+from string import ascii_lowercase
+from pandas.compat import product as cart_product
+
+
+@pytest.mark.parametrize("agg_func", ['any', 'all'])
+@pytest.mark.parametrize("skipna", [True, False])
+@pytest.mark.parametrize("vals", [
+    ['foo', 'bar', 'baz'], ['foo', '', ''], ['', '', ''],
+    [1, 2, 3], [1, 0, 0], [0, 0, 0],
+    [1., 2., 3.], [1., 0., 0.], [0., 0., 0.],
+    [True, True, True], [True, False, False], [False, False, False],
+    [np.nan, np.nan, np.nan]
+])
+def test_groupby_bool_aggs(agg_func, skipna, vals):
+    df = DataFrame({'key': ['a'] * 3 + ['b'] * 3, 'val': vals * 2})
+
+    # Figure out expectation using Python builtin
+    exp = getattr(compat.builtins, agg_func)(vals)
+
+    # edge case for missing data with skipna and 'any'
+    if skipna and all(isna(vals)) and agg_func == 'any':
+        exp = False
+
+    exp_df = DataFrame([exp] * 2, columns=['val'], index=Index(
+        ['a', 'b'], name='key'))
+    result = getattr(df.groupby('key'), agg_func)(skipna=skipna)
+    tm.assert_frame_equal(result, exp_df)
+
+
+def test_max_min_non_numeric():
+    # #2700
+    aa = DataFrame({'nn': [11, 11, 22, 22],
+                    'ii': [1, 2, 3, 4],
+                    'ss': 4 * ['mama']})
+
+    result = aa.groupby('nn').max()
+    assert 'ss' in result
+
+    result = aa.groupby('nn').max(numeric_only=False)
+    assert 'ss' in result
+
+    result = aa.groupby('nn').min()
+    assert 'ss' in result
+
+    result = aa.groupby('nn').min(numeric_only=False)
+    assert 'ss' in result
+
+
+def test_intercept_builtin_sum():
+    s = Series([1., 2., np.nan, 3.])
+    grouped = s.groupby([0, 1, 2, 2])
+
+    result = grouped.agg(compat.builtins.sum)
+    result2 = grouped.apply(compat.builtins.sum)
+    expected = grouped.sum()
+    
tm.assert_series_equal(result, expected)
+    tm.assert_series_equal(result2, expected)
+
+
+def test_builtins_apply():  # GH8155
+    df = pd.DataFrame(np.random.randint(1, 50, (1000, 2)),
+                      columns=['jim', 'joe'])
+    df['jolie'] = np.random.randn(1000)
+
+    for keys in ['jim', ['jim', 'joe']]:  # single key & multi-key
+        for f in [max, min, sum]:
+            fname = f.__name__
+            result = df.groupby(keys).apply(f)
+            ngroups = len(df.drop_duplicates(subset=keys))
+            assert result.shape == (ngroups, 3), 'invalid frame shape: '\
+                '{} (expected ({}, 3))'.format(result.shape, ngroups)
+
+            tm.assert_frame_equal(result,  # numpy's equivalent function
+                                  df.groupby(keys).apply(getattr(np, fname)))
+
+            if f != sum:
+                expected = df.groupby(keys).agg(fname).reset_index()
+                expected.set_index(keys, inplace=True, drop=False)
+                tm.assert_frame_equal(result, expected, check_dtype=False)
+
+            tm.assert_series_equal(getattr(result, fname)(),
+                                   getattr(df, fname)())
+
+
+def test_arg_passthru():
+    # make sure that we are passing thru kwargs
+    # to our agg functions
+
+    # GH3668
+    # GH5724
+    df = pd.DataFrame(
+        {'group': [1, 1, 2],
+         'int': [1, 2, 3],
+         'float': [4., 5., 6.],
+         'string': list('abc'),
+         'category_string': pd.Series(list('abc')).astype('category'),
+         'category_int': [7, 8, 9],
+         'datetime': pd.date_range('20130101', periods=3),
+         'datetimetz': pd.date_range('20130101',
+                                     periods=3,
+                                     tz='US/Eastern'),
+         'timedelta': pd.timedelta_range('1 s', periods=3, freq='s')},
+        columns=['group', 'int', 'float', 'string',
+                 'category_string', 'category_int',
+                 'datetime', 'datetimetz',
+                 'timedelta'])
+
+    expected_columns_numeric = Index(['int', 'float', 'category_int'])
+
+    # mean / median
+    expected = pd.DataFrame(
+        {'category_int': [7.5, 9],
+         'float': [4.5, 6.],
+         'timedelta': [pd.Timedelta('1.5s'),
+                       pd.Timedelta('3s')],
+         'int': [1.5, 3],
+         'datetime': [pd.Timestamp('2013-01-01 12:00:00'),
+                      pd.Timestamp('2013-01-03 00:00:00')],
+         'datetimetz': [
+             pd.Timestamp('2013-01-01 12:00:00', tz='US/Eastern'),
+             pd.Timestamp('2013-01-03 00:00:00', tz='US/Eastern')]},
+        index=Index([1, 2], name='group'),
+        columns=['int', 'float', 'category_int',
+                 'datetime', 'datetimetz', 'timedelta'])
+    for attr in ['mean', 'median']:
+        f = getattr(df.groupby('group'), attr)
+        result = f()
+        tm.assert_index_equal(result.columns, expected_columns_numeric)
+
+        result = f(numeric_only=False)
+        tm.assert_frame_equal(result.reindex_like(expected), expected)
+
+    # TODO: min, max *should* handle
+    # categorical (ordered) dtype
+    expected_columns = Index(['int', 'float', 'string',
+                              'category_int',
+                              'datetime', 'datetimetz',
+                              'timedelta'])
+    for attr in ['min', 'max']:
+        f = getattr(df.groupby('group'), attr)
+        result = f()
+        tm.assert_index_equal(result.columns, expected_columns)
+
+        result = f(numeric_only=False)
+        tm.assert_index_equal(result.columns, expected_columns)
+
+    expected_columns = Index(['int', 'float', 'string',
+                              'category_string', 'category_int',
+                              'datetime', 'datetimetz',
+                              'timedelta'])
+    for attr in ['first', 'last']:
+        f = getattr(df.groupby('group'), attr)
+        result = f()
+        tm.assert_index_equal(result.columns, expected_columns)
+
+        result = f(numeric_only=False)
+        tm.assert_index_equal(result.columns, expected_columns)
+
+    expected_columns = Index(['int', 'float', 'string',
+                              'category_int', 'timedelta'])
+    for attr in ['sum']:
+        f = getattr(df.groupby('group'), attr)
+        result = f()
+        tm.assert_index_equal(result.columns, expected_columns_numeric)
+
+        result = f(numeric_only=False)
+        tm.assert_index_equal(result.columns, expected_columns)
+
+    expected_columns = Index(['int', 'float', 'category_int'])
+    for attr in ['prod', 'cumprod']:
+        f = getattr(df.groupby('group'), attr)
+        result = f()
+        tm.assert_index_equal(result.columns, expected_columns_numeric)
+
+        result = f(numeric_only=False)
+        tm.assert_index_equal(result.columns, expected_columns)
+
+    # like min, max, but don't include strings
+    expected_columns = Index(['int', 'float',
+                              'category_int',
+                              'datetime', 'datetimetz',
+                              'timedelta'])
+    for attr in ['cummin', 'cummax']:
+        f = getattr(df.groupby('group'), attr)
+        result = f()
+        # GH 15561: numeric_only=False set by default like min/max
+        tm.assert_index_equal(result.columns, expected_columns)
+
+        result = f(numeric_only=False)
+        tm.assert_index_equal(result.columns, expected_columns)
+
+    expected_columns = Index(['int', 'float', 'category_int',
+                              'timedelta'])
+    for attr in ['cumsum']:
+        f = getattr(df.groupby('group'), attr)
+        result = f()
+        tm.assert_index_equal(result.columns, expected_columns_numeric)
+
+        result = f(numeric_only=False)
+        tm.assert_index_equal(result.columns, expected_columns)
+
+
+def test_non_cython_api():
+
+    # GH5610
+    # non-cython calls should not include the grouper
+
+    df = DataFrame(
+        [[1, 2, 'foo'],
+         [1, np.nan, 'bar'],
+         [3, np.nan, 'baz']],
+        columns=['A', 'B', 'C'])
+    g = df.groupby('A')
+    gni = df.groupby('A', as_index=False)
+
+    # mad
+    expected = DataFrame([[0], [np.nan]], columns=['B'], index=[1, 3])
+    expected.index.name = 'A'
+    result = g.mad()
+    tm.assert_frame_equal(result, expected)
+
+    expected = DataFrame([[0., 0.], [0, np.nan]], columns=['A', 'B'],
+                         index=[0, 1])
+    result = gni.mad()
+    tm.assert_frame_equal(result, expected)
+
+    # describe
+    expected_index = pd.Index([1, 3], name='A')
+    expected_col = pd.MultiIndex(levels=[['B'],
+                                         ['count', 'mean', 'std', 'min',
+                                          '25%', '50%', '75%', 'max']],
+                                 labels=[[0] * 8, list(range(8))])
+    expected = pd.DataFrame([[1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0],
+                             [0.0, np.nan, np.nan, np.nan, np.nan, np.nan,
+                              np.nan, np.nan]],
+                            index=expected_index,
+                            columns=expected_col)
+    result = g.describe()
+    tm.assert_frame_equal(result, expected)
+
+    expected = pd.concat([df[df.A == 1].describe().unstack().to_frame().T,
+                          df[df.A == 3].describe().unstack().to_frame().T])
+    expected.index = pd.Index([0, 1])
+    result = gni.describe()
+    tm.assert_frame_equal(result, expected)
+
+    # any
+    expected = DataFrame([[True, True], [False, True]], columns=['B', 'C'],
+                         index=[1, 3])
+    expected.index.name = 'A'
+    result = g.any()
+    tm.assert_frame_equal(result, expected)
+
+    # idxmax
+    expected = DataFrame([[0.0], [np.nan]], columns=['B'], index=[1, 3])
+    expected.index.name = 'A'
+    result = g.idxmax()
+    tm.assert_frame_equal(result, expected)
+
+
+def test_cython_api2():
+
+    # this takes the fast apply path
+
+    # cumsum (GH5614)
+    df = DataFrame(
+        [[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9]
+         ], columns=['A', 'B', 'C'])
+    expected = DataFrame(
+        [[2, np.nan], [np.nan, 9], [4, 9]], columns=['B', 'C'])
+    result = df.groupby('A').cumsum()
+    tm.assert_frame_equal(result, expected)
+
+    # GH 5755 - cumsum is a transformer and should ignore as_index
+    result = df.groupby('A', as_index=False).cumsum()
+    tm.assert_frame_equal(result, expected)
+
+    # GH 13994
+    result = df.groupby('A').cumsum(axis=1)
+    expected = df.cumsum(axis=1)
+    tm.assert_frame_equal(result, expected)
+    result = df.groupby('A').cumprod(axis=1)
+    expected = df.cumprod(axis=1)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_cython_median():
+    df = DataFrame(np.random.randn(1000))
+    df.values[::2] = np.nan
+
+    labels = np.random.randint(0, 50, size=1000).astype(float)
+    labels[::17] = np.nan
+
+    result = df.groupby(labels).median()
+    exp = df.groupby(labels).agg(nanops.nanmedian)
+    tm.assert_frame_equal(result, exp)
+
+    df = DataFrame(np.random.randn(1000, 5))
+    rs = df.groupby(labels).agg(np.median)
+    xp = df.groupby(labels).median()
+    tm.assert_frame_equal(rs, xp)
+
+
+def test_median_empty_bins():
+    df = pd.DataFrame(np.random.randint(0, 44, 500))
+
+    grps = range(0, 55, 5)
+    bins = pd.cut(df[0], grps)
+
+    result = df.groupby(bins).median()
+    expected = df.groupby(bins).agg(lambda x: x.median())
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("dtype", [
+    'int8', 'int16', 'int32', 'int64', 'float32', 'float64'])
+@pytest.mark.parametrize("method,data", [
+    ('first', {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}),
+    ('last', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}),
+    ('min', {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}),
+    ('max', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}),
+    ('nth', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}],
+             'args': [1]}),
+    ('count', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 2}],
+               'out_type': 'int64'})
+])
+def test_groupby_non_arithmetic_agg_types(dtype, method, data):
+    # GH9311, GH6620
+    df = pd.DataFrame(
+        [{'a': 1, 'b': 1},
+         {'a': 1, 'b': 2},
+         {'a': 2, 'b': 3},
+         {'a': 2, 'b': 4}])
+
+    df['b'] = df.b.astype(dtype)
+
+    if 'args' not in data:
+        data['args'] = []
+
+    if 'out_type' in data:
+        out_type = data['out_type']
+    else:
+        out_type = dtype
+
+    exp = data['df']
+    df_out = pd.DataFrame(exp)
+
+    df_out['b'] = df_out.b.astype(out_type)
+    df_out.set_index('a', inplace=True)
+
+    grpd = df.groupby('a')
+    t = getattr(grpd, method)(*data['args'])
+    tm.assert_frame_equal(t, df_out)
+
+
+def test_groupby_non_arithmetic_agg_intlike_precision():
+    # GH9311, GH6620
+    c = 24650000000000000
+
+    inputs = ((Timestamp('2011-01-15 12:50:28.502376'),
+               Timestamp('2011-01-20 12:50:28.593448')), (1 + c, 2 + c))
+
+    for i in inputs:
+        df = pd.DataFrame([{'a': 1, 'b': i[0]}, {'a': 1, 'b': i[1]}])
+
+        grp_exp = {'first': {'expected': i[0]},
+                   'last': {'expected': i[1]},
+                   'min': {'expected': i[0]},
+                   'max': {'expected': i[1]},
+                   'nth': {'expected': i[1],
+                           'args': [1]},
+                   'count': {'expected': 2}}
+
+        for method, data in compat.iteritems(grp_exp):
+            if 'args' not in data:
+                data['args'] = []
+
+            grpd = df.groupby('a')
+            res = getattr(grpd, method)(*data['args'])
+            assert res.iloc[0].b == data['expected']
+
+
+def test_fill_consistency():
+
+    # GH9221
+    # keyword arguments passed thru to the generated wrapper
+    # are set only if the passed kw is None
+    df = DataFrame(index=pd.MultiIndex.from_product(
+        [['value1', 'value2'], date_range('2014-01-01', '2014-01-06')]),
+        columns=Index(
+        ['1', '2'], name='id'))
+    df['1'] = [np.nan, 1, np.nan, np.nan, 11, np.nan, np.nan, 2, np.nan,
+               np.nan, 22, np.nan]
+    df['2'] = [np.nan, 3, np.nan, np.nan, 33, np.nan, np.nan, 4, np.nan,
+               np.nan, 44, np.nan]
+
+    expected = df.groupby(level=0, axis=0).fillna(method='ffill')
+    result = df.T.groupby(level=0, axis=1).fillna(method='ffill').T
+    tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_cumprod():
+    # GH 4095
+    df = pd.DataFrame({'key': ['b'] * 10, 'value': 2})
+
+    actual = df.groupby('key')['value'].cumprod()
+    expected = df.groupby('key')['value'].apply(lambda x: x.cumprod())
+    expected.name = 'value'
+    tm.assert_series_equal(actual, expected)
+
+    df =
pd.DataFrame({'key': ['b'] * 100, 'value': 2}) + actual = df.groupby('key')['value'].cumprod() + # if overflows, groupby product casts to float + # while numpy passes back invalid values + df['value'] = df['value'].astype(float) + expected = df.groupby('key')['value'].apply(lambda x: x.cumprod()) + expected.name = 'value' + tm.assert_series_equal(actual, expected) + + +def test_ops_general(): + ops = [('mean', np.mean), + ('median', np.median), + ('std', np.std), + ('var', np.var), + ('sum', np.sum), + ('prod', np.prod), + ('min', np.min), + ('max', np.max), + ('first', lambda x: x.iloc[0]), + ('last', lambda x: x.iloc[-1]), + ('count', np.size), ] + try: + from scipy.stats import sem + except ImportError: + pass + else: + ops.append(('sem', sem)) + df = DataFrame(np.random.randn(1000)) + labels = np.random.randint(0, 50, size=1000).astype(float) + + for op, targop in ops: + result = getattr(df.groupby(labels), op)().astype(float) + expected = df.groupby(labels).agg(targop) + try: + tm.assert_frame_equal(result, expected) + except BaseException as exc: + exc.args += ('operation: %s' % op, ) + raise + + +def test_max_nan_bug(): + raw = """,Date,app,File +-04-23,2013-04-23 00:00:00,,log080001.log +-05-06,2013-05-06 00:00:00,,log.log +-05-07,2013-05-07 00:00:00,OE,xlsx""" + + df = pd.read_csv(compat.StringIO(raw), parse_dates=[0]) + gb = df.groupby('Date') + r = gb[['File']].max() + e = gb['File'].max().to_frame() + tm.assert_frame_equal(r, e) + assert not r['File'].isna().any() + + +def test_nlargest(): + a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10]) + b = Series(list('a' * 5 + 'b' * 5)) + gb = a.groupby(b) + r = gb.nlargest(3) + e = Series([ + 7, 5, 3, 10, 9, 6 + ], index=MultiIndex.from_arrays([list('aaabbb'), [3, 2, 1, 9, 5, 8]])) + tm.assert_series_equal(r, e) + + a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0]) + gb = a.groupby(b) + e = Series([ + 3, 2, 1, 3, 3, 2 + ], index=MultiIndex.from_arrays([list('aaabbb'), [2, 3, 1, 6, 5, 7]])) + tm.assert_series_equal(gb.nlargest(3, keep='last'), e) + + +def test_nsmallest(): + a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10]) + b = Series(list('a' * 5 + 'b' * 5)) + gb = a.groupby(b) + r = gb.nsmallest(3) + e = Series([ + 1, 2, 3, 0, 4, 6 + ], index=MultiIndex.from_arrays([list('aaabbb'), [0, 4, 1, 6, 7, 8]])) + tm.assert_series_equal(r, e) + + a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0]) + gb = a.groupby(b) + e = Series([ + 0, 1, 1, 0, 1, 2 + ], index=MultiIndex.from_arrays([list('aaabbb'), [4, 1, 0, 9, 8, 7]])) + tm.assert_series_equal(gb.nsmallest(3, keep='last'), e) + + +def test_numpy_compat(): + # see gh-12811 + df = pd.DataFrame({'A': [1, 2, 1], 'B': [1, 2, 3]}) + g = df.groupby('A') + + msg = "numpy operations are not valid with groupby" + + for func in ('mean', 'var', 'std', 'cumprod', 'cumsum'): + tm.assert_raises_regex(UnsupportedFunctionCall, msg, + getattr(g, func), 1, 2, 3) + tm.assert_raises_regex(UnsupportedFunctionCall, msg, + getattr(g, func), foo=1) + + +def test_cummin_cummax(): + # GH 15048 + num_types = [np.int32, np.int64, np.float32, np.float64] + num_mins = [np.iinfo(np.int32).min, np.iinfo(np.int64).min, + np.finfo(np.float32).min, np.finfo(np.float64).min] + num_max = [np.iinfo(np.int32).max, np.iinfo(np.int64).max, + np.finfo(np.float32).max, np.finfo(np.float64).max] + base_df = pd.DataFrame({'A': [1, 1, 1, 1, 2, 2, 2, 2], + 'B': [3, 4, 3, 2, 2, 3, 2, 1]}) + expected_mins = [3, 3, 3, 2, 2, 2, 2, 1] + expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3] + + for dtype, min_val, max_val in zip(num_types, num_mins, num_max): + df = 
base_df.astype(dtype) + + # cummin + expected = pd.DataFrame({'B': expected_mins}).astype(dtype) + result = df.groupby('A').cummin() + tm.assert_frame_equal(result, expected) + result = df.groupby('A').B.apply(lambda x: x.cummin()).to_frame() + tm.assert_frame_equal(result, expected) + + # Test cummin w/ min value for dtype + df.loc[[2, 6], 'B'] = min_val + expected.loc[[2, 3, 6, 7], 'B'] = min_val + result = df.groupby('A').cummin() + tm.assert_frame_equal(result, expected) + expected = df.groupby('A').B.apply(lambda x: x.cummin()).to_frame() + tm.assert_frame_equal(result, expected) + + # cummax + expected = pd.DataFrame({'B': expected_maxs}).astype(dtype) + result = df.groupby('A').cummax() + tm.assert_frame_equal(result, expected) + result = df.groupby('A').B.apply(lambda x: x.cummax()).to_frame() + tm.assert_frame_equal(result, expected) + + # Test cummax w/ max value for dtype + df.loc[[2, 6], 'B'] = max_val + expected.loc[[2, 3, 6, 7], 'B'] = max_val + result = df.groupby('A').cummax() + tm.assert_frame_equal(result, expected) + expected = df.groupby('A').B.apply(lambda x: x.cummax()).to_frame() + tm.assert_frame_equal(result, expected) + + # Test nan in some values + base_df.loc[[0, 2, 4, 6], 'B'] = np.nan + expected = pd.DataFrame({'B': [np.nan, 4, np.nan, 2, + np.nan, 3, np.nan, 1]}) + result = base_df.groupby('A').cummin() + tm.assert_frame_equal(result, expected) + expected = (base_df.groupby('A') + .B + .apply(lambda x: x.cummin()) + .to_frame()) + tm.assert_frame_equal(result, expected) + + expected = pd.DataFrame({'B': [np.nan, 4, np.nan, 4, + np.nan, 3, np.nan, 3]}) + result = base_df.groupby('A').cummax() + tm.assert_frame_equal(result, expected) + expected = (base_df.groupby('A') + .B + .apply(lambda x: x.cummax()) + .to_frame()) + tm.assert_frame_equal(result, expected) + + # Test nan in entire column + base_df['B'] = np.nan + expected = pd.DataFrame({'B': [np.nan] * 8}) + result = base_df.groupby('A').cummin() + tm.assert_frame_equal(expected, result) + result = base_df.groupby('A').B.apply(lambda x: x.cummin()).to_frame() + tm.assert_frame_equal(expected, result) + result = base_df.groupby('A').cummax() + tm.assert_frame_equal(expected, result) + result = base_df.groupby('A').B.apply(lambda x: x.cummax()).to_frame() + tm.assert_frame_equal(expected, result) + + # GH 15561 + df = pd.DataFrame(dict(a=[1], b=pd.to_datetime(['2001']))) + expected = pd.Series(pd.to_datetime('2001'), index=[0], name='b') + for method in ['cummax', 'cummin']: + result = getattr(df.groupby('a')['b'], method)() + tm.assert_series_equal(expected, result) + + # GH 15635 + df = pd.DataFrame(dict(a=[1, 2, 1], b=[2, 1, 1])) + result = df.groupby('a').b.cummax() + expected = pd.Series([2, 1, 2], name='b') + tm.assert_series_equal(result, expected) + + df = pd.DataFrame(dict(a=[1, 2, 1], b=[1, 2, 2])) + result = df.groupby('a').b.cummin() + expected = pd.Series([1, 2, 1], name='b') + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize('in_vals, out_vals', [ + + # Basics: strictly increasing (T), strictly decreasing (F), + # abs val increasing (F), non-strictly increasing (T) + ([1, 2, 5, 3, 2, 0, 4, 5, -6, 1, 1], + [True, False, False, True]), + + # Test with inf vals + ([1, 2.1, np.inf, 3, 2, np.inf, -np.inf, 5, 11, 1, -np.inf], + [True, False, True, False]), + + # Test with nan vals; should always be False + ([1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan], + [False, False, False, False]), +]) +def test_is_monotonic_increasing(in_vals, out_vals): + # GH 17015 + 
source_dict = { + 'A': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'], + 'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'], + 'C': in_vals} + df = pd.DataFrame(source_dict) + result = df.groupby('B').C.is_monotonic_increasing + index = Index(list('abcd'), name='B') + expected = pd.Series(index=index, data=out_vals, name='C') + tm.assert_series_equal(result, expected) + + # Also check result equal to manually taking x.is_monotonic_increasing. + expected = ( + df.groupby(['B']).C.apply(lambda x: x.is_monotonic_increasing)) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize('in_vals, out_vals', [ + # Basics: strictly decreasing (T), strictly increasing (F), + # abs val decreasing (F), non-strictly increasing (T) + ([10, 9, 7, 3, 4, 5, -3, 2, 0, 1, 1], + [True, False, False, True]), + + # Test with inf vals + ([np.inf, 1, -np.inf, np.inf, 2, -3, -np.inf, 5, -3, -np.inf, -np.inf], + [True, True, False, True]), + + # Test with nan vals; should always be False + ([1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan], + [False, False, False, False]), +]) +def test_is_monotonic_decreasing(in_vals, out_vals): + # GH 17015 + source_dict = { + 'A': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'], + 'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'], + 'C': in_vals} + + df = pd.DataFrame(source_dict) + result = df.groupby('B').C.is_monotonic_decreasing + index = Index(list('abcd'), name='B') + expected = pd.Series(index=index, data=out_vals, name='C') + tm.assert_series_equal(result, expected) + + +# describe +# -------------------------------- + +def test_apply_describe_bug(mframe): + grouped = mframe.groupby(level='first') + grouped.describe() # it works! + + +def test_series_describe_multikey(): + ts = tm.makeTimeSeries() + grouped = ts.groupby([lambda x: x.year, lambda x: x.month]) + result = grouped.describe() + tm.assert_series_equal(result['mean'], grouped.mean(), + check_names=False) + tm.assert_series_equal(result['std'], grouped.std(), check_names=False) + tm.assert_series_equal(result['min'], grouped.min(), check_names=False) + + +def test_series_describe_single(): + ts = tm.makeTimeSeries() + grouped = ts.groupby(lambda x: x.month) + result = grouped.apply(lambda x: x.describe()) + expected = grouped.describe().stack() + tm.assert_series_equal(result, expected) + + +def test_series_index_name(df): + grouped = df.loc[:, ['C']].groupby(df['A']) + result = grouped.agg(lambda x: x.mean()) + assert result.index.name == 'A' + + +def test_frame_describe_multikey(tsframe): + grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month]) + result = grouped.describe() + desc_groups = [] + for col in tsframe: + group = grouped[col].describe() + # GH 17464 - Remove duplicate MultiIndex levels + group_col = pd.MultiIndex( + levels=[[col], group.columns], + labels=[[0] * len(group.columns), range(len(group.columns))]) + group = pd.DataFrame(group.values, + columns=group_col, + index=group.index) + desc_groups.append(group) + expected = pd.concat(desc_groups, axis=1) + tm.assert_frame_equal(result, expected) + + groupedT = tsframe.groupby({'A': 0, 'B': 0, + 'C': 1, 'D': 1}, axis=1) + result = groupedT.describe() + expected = tsframe.describe().T + expected.index = pd.MultiIndex( + levels=[[0, 1], expected.index], + labels=[[0, 0, 1, 1], range(len(expected.index))]) + tm.assert_frame_equal(result, expected) + + +def test_frame_describe_tupleindex(): + + # GH 14848 - regression from 0.19.0 to 0.19.1 + df1 = DataFrame({'x': [1, 2, 3, 4, 
5] * 3, + 'y': [10, 20, 30, 40, 50] * 3, + 'z': [100, 200, 300, 400, 500] * 3}) + df1['k'] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5 + df2 = df1.rename(columns={'k': 'key'}) + pytest.raises(ValueError, lambda: df1.groupby('k').describe()) + pytest.raises(ValueError, lambda: df2.groupby('key').describe()) + + +def test_frame_describe_unstacked_format(): + # GH 4792 + prices = {pd.Timestamp('2011-01-06 10:59:05', tz=None): 24990, + pd.Timestamp('2011-01-06 12:43:33', tz=None): 25499, + pd.Timestamp('2011-01-06 12:54:09', tz=None): 25499} + volumes = {pd.Timestamp('2011-01-06 10:59:05', tz=None): 1500000000, + pd.Timestamp('2011-01-06 12:43:33', tz=None): 5000000000, + pd.Timestamp('2011-01-06 12:54:09', tz=None): 100000000} + df = pd.DataFrame({'PRICE': prices, + 'VOLUME': volumes}) + result = df.groupby('PRICE').VOLUME.describe() + data = [df[df.PRICE == 24990].VOLUME.describe().values.tolist(), + df[df.PRICE == 25499].VOLUME.describe().values.tolist()] + expected = pd.DataFrame(data, + index=pd.Index([24990, 25499], name='PRICE'), + columns=['count', 'mean', 'std', 'min', + '25%', '50%', '75%', 'max']) + tm.assert_frame_equal(result, expected) + + +# nunique +# -------------------------------- + +@pytest.mark.parametrize("n, m", cart_product(10 ** np.arange(2, 6), + (10, 100, 1000))) +@pytest.mark.parametrize("sort, dropna", cart_product((False, True), repeat=2)) +def test_series_groupby_nunique(n, m, sort, dropna): + + def check_nunique(df, keys, as_index=True): + gr = df.groupby(keys, as_index=as_index, sort=sort) + left = gr['julie'].nunique(dropna=dropna) + + gr = df.groupby(keys, as_index=as_index, sort=sort) + right = gr['julie'].apply(Series.nunique, dropna=dropna) + if not as_index: + right = right.reset_index(drop=True) + + tm.assert_series_equal(left, right, check_names=False) + + days = date_range('2015-08-23', periods=10) + + frame = DataFrame({'jim': np.random.choice(list(ascii_lowercase), n), + 'joe': np.random.choice(days, n), + 'julie': np.random.randint(0, m, n)}) + + check_nunique(frame, ['jim']) + check_nunique(frame, ['jim', 'joe']) + + frame.loc[1::17, 'jim'] = None + frame.loc[3::37, 'joe'] = None + frame.loc[7::19, 'julie'] = None + frame.loc[8::19, 'julie'] = None + frame.loc[9::19, 'julie'] = None + + check_nunique(frame, ['jim']) + check_nunique(frame, ['jim', 'joe']) + check_nunique(frame, ['jim'], as_index=False) + check_nunique(frame, ['jim', 'joe'], as_index=False) + + +def test_nunique(): + df = DataFrame({ + 'A': list('abbacc'), + 'B': list('abxacc'), + 'C': list('abbacx'), + }) + + expected = DataFrame({'A': [1] * 3, 'B': [1, 2, 1], 'C': [1, 1, 2]}) + result = df.groupby('A', as_index=False).nunique() + tm.assert_frame_equal(result, expected) + + # as_index + expected.index = list('abc') + expected.index.name = 'A' + result = df.groupby('A').nunique() + tm.assert_frame_equal(result, expected) + + # with na + result = df.replace({'x': None}).groupby('A').nunique(dropna=False) + tm.assert_frame_equal(result, expected) + + # dropna + expected = DataFrame({'A': [1] * 3, 'B': [1] * 3, 'C': [1] * 3}, + index=list('abc')) + expected.index.name = 'A' + result = df.replace({'x': None}).groupby('A').nunique() + tm.assert_frame_equal(result, expected) + + +def test_nunique_with_object(): + # GH 11077 + data = pd.DataFrame( + [[100, 1, 'Alice'], + [200, 2, 'Bob'], + [300, 3, 'Charlie'], + [-400, 4, 'Dan'], + [500, 5, 'Edith']], + columns=['amount', 'id', 'name'] + ) + + result = data.groupby(['id', 'amount'])['name'].nunique() + index = MultiIndex.from_arrays([data.id, 
data.amount]) + expected = pd.Series([1] * 5, name='name', index=index) + tm.assert_series_equal(result, expected) + + +def test_nunique_with_empty_series(): + # GH 12553 + data = pd.Series(name='name') + result = data.groupby(level=0).nunique() + expected = pd.Series(name='name', dtype='int64') + tm.assert_series_equal(result, expected) + + +def test_nunique_with_timegrouper(): + # GH 13453 + test = pd.DataFrame({ + 'time': [Timestamp('2016-06-28 09:35:35'), + Timestamp('2016-06-28 16:09:30'), + Timestamp('2016-06-28 16:46:28')], + 'data': ['1', '2', '3']}).set_index('time') + result = test.groupby(pd.Grouper(freq='h'))['data'].nunique() + expected = test.groupby( + pd.Grouper(freq='h') + )['data'].apply(pd.Series.nunique) + tm.assert_series_equal(result, expected) + + +# count +# -------------------------------- + +def test_groupby_timedelta_cython_count(): + df = DataFrame({'g': list('ab' * 2), + 'delt': np.arange(4).astype('timedelta64[ns]')}) + expected = Series([ + 2, 2 + ], index=pd.Index(['a', 'b'], name='g'), name='delt') + result = df.groupby('g').delt.count() + tm.assert_series_equal(expected, result) + + +def test_count(): + n = 1 << 15 + dr = date_range('2015-08-30', periods=n // 10, freq='T') + + df = DataFrame({ + '1st': np.random.choice( + list(ascii_lowercase), n), + '2nd': np.random.randint(0, 5, n), + '3rd': np.random.randn(n).round(3), + '4th': np.random.randint(-10, 10, n), + '5th': np.random.choice(dr, n), + '6th': np.random.randn(n).round(3), + '7th': np.random.randn(n).round(3), + '8th': np.random.choice(dr, n) - np.random.choice(dr, 1), + '9th': np.random.choice( + list(ascii_lowercase), n) + }) + + for col in df.columns.drop(['1st', '2nd', '4th']): + df.loc[np.random.choice(n, n // 10), col] = np.nan + + df['9th'] = df['9th'].astype('category') + + for key in '1st', '2nd', ['1st', '2nd']: + left = df.groupby(key).count() + right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1) + tm.assert_frame_equal(left, right) + + # GH5610 + # count counts non-nulls + df = pd.DataFrame([[1, 2, 'foo'], + [1, np.nan, 'bar'], + [3, np.nan, np.nan]], + columns=['A', 'B', 'C']) + + count_as = df.groupby('A').count() + count_not_as = df.groupby('A', as_index=False).count() + + expected = DataFrame([[1, 2], [0, 0]], columns=['B', 'C'], + index=[1, 3]) + expected.index.name = 'A' + tm.assert_frame_equal(count_not_as, expected.reset_index()) + tm.assert_frame_equal(count_as, expected) + + count_B = df.groupby('A')['B'].count() + tm.assert_series_equal(count_B, expected['B']) + + +def test_count_object(): + df = pd.DataFrame({'a': ['a'] * 3 + ['b'] * 3, 'c': [2] * 3 + [3] * 3}) + result = df.groupby('c').a.count() + expected = pd.Series([ + 3, 3 + ], index=pd.Index([2, 3], name='c'), name='a') + tm.assert_series_equal(result, expected) + + df = pd.DataFrame({'a': ['a', np.nan, np.nan] + ['b'] * 3, + 'c': [2] * 3 + [3] * 3}) + result = df.groupby('c').a.count() + expected = pd.Series([ + 1, 3 + ], index=pd.Index([2, 3], name='c'), name='a') + tm.assert_series_equal(result, expected) + + +def test_count_cross_type(): + # GH8169 + vals = np.hstack((np.random.randint(0, 5, (100, 2)), np.random.randint( + 0, 2, (100, 2)))) + + df = pd.DataFrame(vals, columns=['a', 'b', 'c', 'd']) + df[df == 2] = np.nan + expected = df.groupby(['c', 'd']).count() + + for t in ['float32', 'object']: + df['a'] = df['a'].astype(t) + df['b'] = df['b'].astype(t) + result = df.groupby(['c', 'd']).count() + tm.assert_frame_equal(result, expected) + + +def test_lower_int_prec_count(): + df = DataFrame({'a': 
np.array( + [0, 1, 2, 100], np.int8), + 'b': np.array( + [1, 2, 3, 6], np.uint32), + 'c': np.array( + [4, 5, 6, 8], np.int16), + 'grp': list('ab' * 2)}) + result = df.groupby('grp').count() + expected = DataFrame({'a': [2, 2], + 'b': [2, 2], + 'c': [2, 2]}, index=pd.Index(list('ab'), + name='grp')) + tm.assert_frame_equal(result, expected) + + +def test_count_uses_size_on_exception(): + class RaisingObjectException(Exception): + pass + + class RaisingObject(object): + + def __init__(self, msg='I will raise inside Cython'): + super(RaisingObject, self).__init__() + self.msg = msg + + def __eq__(self, other): + # gets called in Cython to check that raising calls the method + raise RaisingObjectException(self.msg) + + df = DataFrame({'a': [RaisingObject() for _ in range(4)], + 'grp': list('ab' * 2)}) + result = df.groupby('grp').count() + expected = DataFrame({'a': [2, 2]}, index=pd.Index( + list('ab'), name='grp')) + tm.assert_frame_equal(result, expected) + + +# size +# -------------------------------- + +def test_size(df): + grouped = df.groupby(['A', 'B']) + result = grouped.size() + for key, group in grouped: + assert result[key] == len(group) + + grouped = df.groupby('A') + result = grouped.size() + for key, group in grouped: + assert result[key] == len(group) + + grouped = df.groupby('B') + result = grouped.size() + for key, group in grouped: + assert result[key] == len(group) + + df = DataFrame(np.random.choice(20, (1000, 3)), columns=list('abc')) + for sort, key in cart_product((False, True), ('a', 'b', ['a', 'b'])): + left = df.groupby(key, sort=sort).size() + right = df.groupby(key, sort=sort)['c'].apply(lambda a: a.shape[0]) + tm.assert_series_equal(left, right, check_names=False) + + # GH11699 + df = DataFrame([], columns=['A', 'B']) + out = Series([], dtype='int64', index=Index([], name='A')) + tm.assert_series_equal(df.groupby('A').size(), out) + + +# pipe +# -------------------------------- + +def test_pipe(): + # Test the pipe method of DataFrameGroupBy. + # Issue #17871 + + random_state = np.random.RandomState(1234567890) + + df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'foo', 'foo'], + 'B': random_state.randn(8), + 'C': random_state.randn(8)}) + + def f(dfgb): + return dfgb.B.max() - dfgb.C.min().min() + + def square(srs): + return srs ** 2 + + # Note that the transformations are + # GroupBy -> Series + # Series -> Series + # This then chains the GroupBy.pipe and the + # NDFrame.pipe methods + result = df.groupby('A').pipe(f).pipe(square) + + index = Index([u'bar', u'foo'], dtype='object', name=u'A') + expected = pd.Series([8.99110003361, 8.17516964785], name='B', + index=index) + + tm.assert_series_equal(expected, result) + + +def test_pipe_args(): + # Test passing args to the pipe method of DataFrameGroupBy. 
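+    # pipe should forward extra arguments through to the piped callable.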
+ # Issue #17871 + + df = pd.DataFrame({'group': ['A', 'A', 'B', 'B', 'C'], + 'x': [1.0, 2.0, 3.0, 2.0, 5.0], + 'y': [10.0, 100.0, 1000.0, -100.0, -1000.0]}) + + def f(dfgb, arg1): + return (dfgb.filter(lambda grp: grp.y.mean() > arg1, dropna=False) + .groupby(dfgb.grouper)) + + def g(dfgb, arg2): + return dfgb.sum() / dfgb.sum().sum() + arg2 + + def h(df, arg3): + return df.x + df.y - arg3 + + result = (df + .groupby('group') + .pipe(f, 0) + .pipe(g, 10) + .pipe(h, 100)) + + # Assert the results here + index = pd.Index(['A', 'B', 'C'], name='group') + expected = pd.Series([-79.5160891089, -78.4839108911, -80], + index=index) + + tm.assert_series_equal(expected, result) + + # test SeriesGroupby.pipe + ser = pd.Series([1, 1, 2, 2, 3, 3]) + result = ser.groupby(ser).pipe(lambda grp: grp.sum() * grp.count()) + + expected = pd.Series([4, 8, 12], index=pd.Int64Index([1, 2, 3])) + + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_functional.py b/pandas/tests/groupby/test_functional.py deleted file mode 100644 index b9718663570bd..0000000000000 --- a/pandas/tests/groupby/test_functional.py +++ /dev/null @@ -1,372 +0,0 @@ -# -*- coding: utf-8 -*- - -""" test function application """ - -import pytest - -from string import ascii_lowercase -from pandas import (date_range, Timestamp, - Index, MultiIndex, DataFrame, Series) -from pandas.util.testing import assert_frame_equal, assert_series_equal -from pandas.compat import product as cart_product - -import numpy as np - -import pandas.util.testing as tm -import pandas as pd -from .common import MixIn - - -# describe -# -------------------------------- - -class TestDescribe(MixIn): - - def test_apply_describe_bug(self): - grouped = self.mframe.groupby(level='first') - grouped.describe() # it works! 
- - def test_series_describe_multikey(self): - ts = tm.makeTimeSeries() - grouped = ts.groupby([lambda x: x.year, lambda x: x.month]) - result = grouped.describe() - assert_series_equal(result['mean'], grouped.mean(), check_names=False) - assert_series_equal(result['std'], grouped.std(), check_names=False) - assert_series_equal(result['min'], grouped.min(), check_names=False) - - def test_series_describe_single(self): - ts = tm.makeTimeSeries() - grouped = ts.groupby(lambda x: x.month) - result = grouped.apply(lambda x: x.describe()) - expected = grouped.describe().stack() - assert_series_equal(result, expected) - - def test_series_index_name(self): - grouped = self.df.loc[:, ['C']].groupby(self.df['A']) - result = grouped.agg(lambda x: x.mean()) - assert result.index.name == 'A' - - def test_frame_describe_multikey(self): - grouped = self.tsframe.groupby([lambda x: x.year, lambda x: x.month]) - result = grouped.describe() - desc_groups = [] - for col in self.tsframe: - group = grouped[col].describe() - # GH 17464 - Remove duplicate MultiIndex levels - group_col = pd.MultiIndex( - levels=[[col], group.columns], - labels=[[0] * len(group.columns), range(len(group.columns))]) - group = pd.DataFrame(group.values, - columns=group_col, - index=group.index) - desc_groups.append(group) - expected = pd.concat(desc_groups, axis=1) - tm.assert_frame_equal(result, expected) - - groupedT = self.tsframe.groupby({'A': 0, 'B': 0, - 'C': 1, 'D': 1}, axis=1) - result = groupedT.describe() - expected = self.tsframe.describe().T - expected.index = pd.MultiIndex( - levels=[[0, 1], expected.index], - labels=[[0, 0, 1, 1], range(len(expected.index))]) - tm.assert_frame_equal(result, expected) - - def test_frame_describe_tupleindex(self): - - # GH 14848 - regression from 0.19.0 to 0.19.1 - df1 = DataFrame({'x': [1, 2, 3, 4, 5] * 3, - 'y': [10, 20, 30, 40, 50] * 3, - 'z': [100, 200, 300, 400, 500] * 3}) - df1['k'] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5 - df2 = df1.rename(columns={'k': 'key'}) - pytest.raises(ValueError, lambda: df1.groupby('k').describe()) - pytest.raises(ValueError, lambda: df2.groupby('key').describe()) - - def test_frame_describe_unstacked_format(self): - # GH 4792 - prices = {pd.Timestamp('2011-01-06 10:59:05', tz=None): 24990, - pd.Timestamp('2011-01-06 12:43:33', tz=None): 25499, - pd.Timestamp('2011-01-06 12:54:09', tz=None): 25499} - volumes = {pd.Timestamp('2011-01-06 10:59:05', tz=None): 1500000000, - pd.Timestamp('2011-01-06 12:43:33', tz=None): 5000000000, - pd.Timestamp('2011-01-06 12:54:09', tz=None): 100000000} - df = pd.DataFrame({'PRICE': prices, - 'VOLUME': volumes}) - result = df.groupby('PRICE').VOLUME.describe() - data = [df[df.PRICE == 24990].VOLUME.describe().values.tolist(), - df[df.PRICE == 25499].VOLUME.describe().values.tolist()] - expected = pd.DataFrame(data, - index=pd.Index([24990, 25499], name='PRICE'), - columns=['count', 'mean', 'std', 'min', - '25%', '50%', '75%', 'max']) - tm.assert_frame_equal(result, expected) - - -# nunique -# -------------------------------- - -class TestNUnique(MixIn): - - def test_series_groupby_nunique(self): - - def check_nunique(df, keys, as_index=True): - for sort, dropna in cart_product((False, True), repeat=2): - gr = df.groupby(keys, as_index=as_index, sort=sort) - left = gr['julie'].nunique(dropna=dropna) - - gr = df.groupby(keys, as_index=as_index, sort=sort) - right = gr['julie'].apply(Series.nunique, dropna=dropna) - if not as_index: - right = right.reset_index(drop=True) - - assert_series_equal(left, right, check_names=False) 
- - days = date_range('2015-08-23', periods=10) - - for n, m in cart_product(10 ** np.arange(2, 6), (10, 100, 1000)): - frame = DataFrame({ - 'jim': np.random.choice( - list(ascii_lowercase), n), - 'joe': np.random.choice(days, n), - 'julie': np.random.randint(0, m, n) - }) - - check_nunique(frame, ['jim']) - check_nunique(frame, ['jim', 'joe']) - - frame.loc[1::17, 'jim'] = None - frame.loc[3::37, 'joe'] = None - frame.loc[7::19, 'julie'] = None - frame.loc[8::19, 'julie'] = None - frame.loc[9::19, 'julie'] = None - - check_nunique(frame, ['jim']) - check_nunique(frame, ['jim', 'joe']) - check_nunique(frame, ['jim'], as_index=False) - check_nunique(frame, ['jim', 'joe'], as_index=False) - - def test_nunique(self): - df = DataFrame({ - 'A': list('abbacc'), - 'B': list('abxacc'), - 'C': list('abbacx'), - }) - - expected = DataFrame({'A': [1] * 3, 'B': [1, 2, 1], 'C': [1, 1, 2]}) - result = df.groupby('A', as_index=False).nunique() - tm.assert_frame_equal(result, expected) - - # as_index - expected.index = list('abc') - expected.index.name = 'A' - result = df.groupby('A').nunique() - tm.assert_frame_equal(result, expected) - - # with na - result = df.replace({'x': None}).groupby('A').nunique(dropna=False) - tm.assert_frame_equal(result, expected) - - # dropna - expected = DataFrame({'A': [1] * 3, 'B': [1] * 3, 'C': [1] * 3}, - index=list('abc')) - expected.index.name = 'A' - result = df.replace({'x': None}).groupby('A').nunique() - tm.assert_frame_equal(result, expected) - - def test_nunique_with_object(self): - # GH 11077 - data = pd.DataFrame( - [[100, 1, 'Alice'], - [200, 2, 'Bob'], - [300, 3, 'Charlie'], - [-400, 4, 'Dan'], - [500, 5, 'Edith']], - columns=['amount', 'id', 'name'] - ) - - result = data.groupby(['id', 'amount'])['name'].nunique() - index = MultiIndex.from_arrays([data.id, data.amount]) - expected = pd.Series([1] * 5, name='name', index=index) - tm.assert_series_equal(result, expected) - - def test_nunique_with_empty_series(self): - # GH 12553 - data = pd.Series(name='name') - result = data.groupby(level=0).nunique() - expected = pd.Series(name='name', dtype='int64') - tm.assert_series_equal(result, expected) - - def test_nunique_with_timegrouper(self): - # GH 13453 - test = pd.DataFrame({ - 'time': [Timestamp('2016-06-28 09:35:35'), - Timestamp('2016-06-28 16:09:30'), - Timestamp('2016-06-28 16:46:28')], - 'data': ['1', '2', '3']}).set_index('time') - result = test.groupby(pd.Grouper(freq='h'))['data'].nunique() - expected = test.groupby( - pd.Grouper(freq='h') - )['data'].apply(pd.Series.nunique) - tm.assert_series_equal(result, expected) - - -# count -# -------------------------------- - -class TestCount(MixIn): - - def test_groupby_timedelta_cython_count(self): - df = DataFrame({'g': list('ab' * 2), - 'delt': np.arange(4).astype('timedelta64[ns]')}) - expected = Series([ - 2, 2 - ], index=pd.Index(['a', 'b'], name='g'), name='delt') - result = df.groupby('g').delt.count() - tm.assert_series_equal(expected, result) - - def test_count(self): - n = 1 << 15 - dr = date_range('2015-08-30', periods=n // 10, freq='T') - - df = DataFrame({ - '1st': np.random.choice( - list(ascii_lowercase), n), - '2nd': np.random.randint(0, 5, n), - '3rd': np.random.randn(n).round(3), - '4th': np.random.randint(-10, 10, n), - '5th': np.random.choice(dr, n), - '6th': np.random.randn(n).round(3), - '7th': np.random.randn(n).round(3), - '8th': np.random.choice(dr, n) - np.random.choice(dr, 1), - '9th': np.random.choice( - list(ascii_lowercase), n) - }) - - for col in df.columns.drop(['1st', '2nd', 
'4th']): - df.loc[np.random.choice(n, n // 10), col] = np.nan - - df['9th'] = df['9th'].astype('category') - - for key in '1st', '2nd', ['1st', '2nd']: - left = df.groupby(key).count() - right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1) - assert_frame_equal(left, right) - - # GH5610 - # count counts non-nulls - df = pd.DataFrame([[1, 2, 'foo'], - [1, np.nan, 'bar'], - [3, np.nan, np.nan]], - columns=['A', 'B', 'C']) - - count_as = df.groupby('A').count() - count_not_as = df.groupby('A', as_index=False).count() - - expected = DataFrame([[1, 2], [0, 0]], columns=['B', 'C'], - index=[1, 3]) - expected.index.name = 'A' - assert_frame_equal(count_not_as, expected.reset_index()) - assert_frame_equal(count_as, expected) - - count_B = df.groupby('A')['B'].count() - assert_series_equal(count_B, expected['B']) - - def test_count_object(self): - df = pd.DataFrame({'a': ['a'] * 3 + ['b'] * 3, 'c': [2] * 3 + [3] * 3}) - result = df.groupby('c').a.count() - expected = pd.Series([ - 3, 3 - ], index=pd.Index([2, 3], name='c'), name='a') - tm.assert_series_equal(result, expected) - - df = pd.DataFrame({'a': ['a', np.nan, np.nan] + ['b'] * 3, - 'c': [2] * 3 + [3] * 3}) - result = df.groupby('c').a.count() - expected = pd.Series([ - 1, 3 - ], index=pd.Index([2, 3], name='c'), name='a') - tm.assert_series_equal(result, expected) - - def test_count_cross_type(self): # GH8169 - vals = np.hstack((np.random.randint(0, 5, (100, 2)), np.random.randint( - 0, 2, (100, 2)))) - - df = pd.DataFrame(vals, columns=['a', 'b', 'c', 'd']) - df[df == 2] = np.nan - expected = df.groupby(['c', 'd']).count() - - for t in ['float32', 'object']: - df['a'] = df['a'].astype(t) - df['b'] = df['b'].astype(t) - result = df.groupby(['c', 'd']).count() - tm.assert_frame_equal(result, expected) - - def test_lower_int_prec_count(self): - df = DataFrame({'a': np.array( - [0, 1, 2, 100], np.int8), - 'b': np.array( - [1, 2, 3, 6], np.uint32), - 'c': np.array( - [4, 5, 6, 8], np.int16), - 'grp': list('ab' * 2)}) - result = df.groupby('grp').count() - expected = DataFrame({'a': [2, 2], - 'b': [2, 2], - 'c': [2, 2]}, index=pd.Index(list('ab'), - name='grp')) - tm.assert_frame_equal(result, expected) - - def test_count_uses_size_on_exception(self): - class RaisingObjectException(Exception): - pass - - class RaisingObject(object): - - def __init__(self, msg='I will raise inside Cython'): - super(RaisingObject, self).__init__() - self.msg = msg - - def __eq__(self, other): - # gets called in Cython to check that raising calls the method - raise RaisingObjectException(self.msg) - - df = DataFrame({'a': [RaisingObject() for _ in range(4)], - 'grp': list('ab' * 2)}) - result = df.groupby('grp').count() - expected = DataFrame({'a': [2, 2]}, index=pd.Index( - list('ab'), name='grp')) - tm.assert_frame_equal(result, expected) - - -# size -# -------------------------------- - -class TestSize(MixIn): - - def test_size(self): - grouped = self.df.groupby(['A', 'B']) - result = grouped.size() - for key, group in grouped: - assert result[key] == len(group) - - grouped = self.df.groupby('A') - result = grouped.size() - for key, group in grouped: - assert result[key] == len(group) - - grouped = self.df.groupby('B') - result = grouped.size() - for key, group in grouped: - assert result[key] == len(group) - - df = DataFrame(np.random.choice(20, (1000, 3)), columns=list('abc')) - for sort, key in cart_product((False, True), ('a', 'b', ['a', 'b'])): - left = df.groupby(key, sort=sort).size() - right = df.groupby(key, sort=sort)['c'].apply(lambda a: 
a.shape[0]) - assert_series_equal(left, right, check_names=False) - - # GH11699 - df = DataFrame([], columns=['A', 'B']) - out = Series([], dtype='int64', index=Index([], name='A')) - assert_series_equal(df.groupby('A').size(), out) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index c3400b6b710e5..bb892f92f213e 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -5,3090 +5,1672 @@ from warnings import catch_warnings from datetime import datetime +from decimal import Decimal -from pandas import (date_range, bdate_range, Timestamp, +from pandas import (date_range, Timestamp, Index, MultiIndex, DataFrame, Series, - concat, Panel, DatetimeIndex, read_csv) -from pandas.core.dtypes.missing import isna -from pandas.errors import UnsupportedFunctionCall, PerformanceWarning -from pandas.util.testing import (assert_frame_equal, assert_index_equal, + Panel, DatetimeIndex, read_csv) +from pandas.errors import PerformanceWarning +from pandas.util.testing import (assert_frame_equal, assert_series_equal, assert_almost_equal) from pandas.compat import (range, lrange, StringIO, lmap, lzip, map, zip, - builtins, OrderedDict) + OrderedDict) from pandas import compat from collections import defaultdict import pandas.core.common as com import numpy as np -import pandas.core.nanops as nanops import pandas.util.testing as tm import pandas as pd -from .common import MixIn -class TestGrouper(object): +def test_repr(): + # GH18203 + result = repr(pd.Grouper(key='A', level='B')) + expected = "Grouper(key='A', level='B', axis=0, sort=False)" + assert result == expected - def test_repr(self): - # GH18203 - result = repr(pd.Grouper(key='A', level='B')) - expected = "Grouper(key='A', level='B', axis=0, sort=False)" - assert result == expected +@pytest.mark.parametrize('dtype', ['int64', 'int32', 'float64', 'float32']) +def test_basic(dtype): -class TestGroupBy(MixIn): + data = Series(np.arange(9) // 3, index=np.arange(9), dtype=dtype) - def test_basic(self): - def checkit(dtype): - data = Series(np.arange(9) // 3, index=np.arange(9), dtype=dtype) + index = np.arange(9) + np.random.shuffle(index) + data = data.reindex(index) - index = np.arange(9) - np.random.shuffle(index) - data = data.reindex(index) + grouped = data.groupby(lambda x: x // 3) - grouped = data.groupby(lambda x: x // 3) + for k, v in grouped: + assert len(v) == 3 - for k, v in grouped: - assert len(v) == 3 + agged = grouped.aggregate(np.mean) + assert agged[1] == 1 - agged = grouped.aggregate(np.mean) - assert agged[1] == 1 + assert_series_equal(agged, grouped.agg(np.mean)) # shorthand + assert_series_equal(agged, grouped.mean()) + assert_series_equal(grouped.agg(np.sum), grouped.sum()) - assert_series_equal(agged, grouped.agg(np.mean)) # shorthand - assert_series_equal(agged, grouped.mean()) - assert_series_equal(grouped.agg(np.sum), grouped.sum()) + expected = grouped.apply(lambda x: x * x.sum()) + transformed = grouped.transform(lambda x: x * x.sum()) + assert transformed[7] == 12 + assert_series_equal(transformed, expected) - expected = grouped.apply(lambda x: x * x.sum()) - transformed = grouped.transform(lambda x: x * x.sum()) - assert transformed[7] == 12 - assert_series_equal(transformed, expected) + value_grouped = data.groupby(data) + assert_series_equal(value_grouped.aggregate(np.mean), agged, + check_index_type=False) - value_grouped = data.groupby(data) - assert_series_equal(value_grouped.aggregate(np.mean), agged, - check_index_type=False) + # complex 
agg + agged = grouped.aggregate([np.mean, np.std]) - # complex agg - agged = grouped.aggregate([np.mean, np.std]) + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + agged = grouped.aggregate({'one': np.mean, 'two': np.std}) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - agged = grouped.aggregate({'one': np.mean, 'two': np.std}) + group_constants = {0: 10, 1: 20, 2: 30} + agged = grouped.agg(lambda x: group_constants[x.name] + x.mean()) + assert agged[1] == 21 - group_constants = {0: 10, 1: 20, 2: 30} - agged = grouped.agg(lambda x: group_constants[x.name] + x.mean()) - assert agged[1] == 21 + # corner cases + pytest.raises(Exception, grouped.aggregate, lambda x: x * 2) - # corner cases - pytest.raises(Exception, grouped.aggregate, lambda x: x * 2) - for dtype in ['int64', 'int32', 'float64', 'float32']: - checkit(dtype) +def test_groupby_nonobject_dtype(mframe, df_mixed_floats): + key = mframe.index.labels[0] + grouped = mframe.groupby(key) + result = grouped.sum() - def test_groupby_nonobject_dtype(self): - key = self.mframe.index.labels[0] - grouped = self.mframe.groupby(key) - result = grouped.sum() + expected = mframe.groupby(key.astype('O')).sum() + assert_frame_equal(result, expected) - expected = self.mframe.groupby(key.astype('O')).sum() - assert_frame_equal(result, expected) + # GH 3911, mixed frame non-conversion + df = df_mixed_floats.copy() + df['value'] = lrange(len(df)) - # GH 3911, mixed frame non-conversion - df = self.df_mixed_floats.copy() - df['value'] = lrange(len(df)) + def max_value(group): + return group.loc[group['value'].idxmax()] - def max_value(group): - return group.loc[group['value'].idxmax()] + applied = df.groupby('A').apply(max_value) + result = applied.get_dtype_counts().sort_values() + expected = Series({'float64': 2, + 'int64': 1, + 'object': 2}).sort_values() + assert_series_equal(result, expected) - applied = df.groupby('A').apply(max_value) - result = applied.get_dtype_counts().sort_values() - expected = Series({'float64': 2, - 'int64': 1, - 'object': 2}).sort_values() - assert_series_equal(result, expected) - def test_groupby_return_type(self): +def test_groupby_return_type(): - # GH2893, return a reduced type - df1 = DataFrame( - [{"val1": 1, "val2": 20}, - {"val1": 1, "val2": 19}, - {"val1": 2, "val2": 27}, - {"val1": 2, "val2": 12} - ]) + # GH2893, return a reduced type + df1 = DataFrame( + [{"val1": 1, "val2": 20}, + {"val1": 1, "val2": 19}, + {"val1": 2, "val2": 27}, + {"val1": 2, "val2": 12} + ]) - def func(dataf): - return dataf["val2"] - dataf["val2"].mean() + def func(dataf): + return dataf["val2"] - dataf["val2"].mean() - result = df1.groupby("val1", squeeze=True).apply(func) - assert isinstance(result, Series) + result = df1.groupby("val1", squeeze=True).apply(func) + assert isinstance(result, Series) - df2 = DataFrame( - [{"val1": 1, "val2": 20}, - {"val1": 1, "val2": 19}, - {"val1": 1, "val2": 27}, - {"val1": 1, "val2": 12} - ]) + df2 = DataFrame( + [{"val1": 1, "val2": 20}, + {"val1": 1, "val2": 19}, + {"val1": 1, "val2": 27}, + {"val1": 1, "val2": 12} + ]) - def func(dataf): - return dataf["val2"] - dataf["val2"].mean() + def func(dataf): + return dataf["val2"] - dataf["val2"].mean() + + result = df2.groupby("val1", squeeze=True).apply(func) + assert isinstance(result, Series) - result = df2.groupby("val1", squeeze=True).apply(func) - assert isinstance(result, Series) + # GH3596, return a consistent type (regression in 0.11 from 0.10.1) + df = DataFrame([[1, 1], [1, 1]], 
columns=['X', 'Y'])
+    result = df.groupby('X', squeeze=False).count()
+    assert isinstance(result, DataFrame)
+
+    # GH5592
+    # inconsistent return type
+    df = DataFrame(dict(A=['Tiger', 'Tiger', 'Tiger', 'Lamb', 'Lamb',
+                           'Pony', 'Pony'], B=Series(
+        np.arange(7), dtype='int64'), C=date_range(
+        '20130101', periods=7)))
+
+    def f(grp):
+        return grp.iloc[0]
+
+    expected = df.groupby('A').first()[['B']]
+    result = df.groupby('A').apply(f)[['B']]
+    assert_frame_equal(result, expected)
+
+    def f(grp):
+        if grp.name == 'Tiger':
+            return None
+        return grp.iloc[0]
+
+    result = df.groupby('A').apply(f)[['B']]
+    e = expected.copy()
+    e.loc['Tiger'] = np.nan
+    assert_frame_equal(result, e)
+
+    def f(grp):
+        if grp.name == 'Pony':
+            return None
+        return grp.iloc[0]
+
+    result = df.groupby('A').apply(f)[['B']]
+    e = expected.copy()
+    e.loc['Pony'] = np.nan
+    assert_frame_equal(result, e)
+
+    # 5592 revisited, with datetimes
+    def f(grp):
+        if grp.name == 'Pony':
+            return None
+        return grp.iloc[0]
+
+    result = df.groupby('A').apply(f)[['C']]
+    e = df.groupby('A').first()[['C']]
+    e.loc['Pony'] = pd.NaT
+    assert_frame_equal(result, e)
+
+    # scalar outputs
+    def f(grp):
+        if grp.name == 'Pony':
+            return None
+        return grp.iloc[0].loc['C']
+
+    result = df.groupby('A').apply(f)
+    e = df.groupby('A').first()['C'].copy()
+    e.loc['Pony'] = np.nan
+    e.name = None
+    assert_series_equal(result, e)
-        # GH3596, return a consistent type (regression in 0.11 from 0.10.1)
-        df = DataFrame([[1, 1], [1, 1]], columns=['X', 'Y'])
-        result = df.groupby('X', squeeze=False).count()
-        assert isinstance(result, DataFrame)
-        # GH5592
-        # inconcistent return type
-        df = DataFrame(dict(A=['Tiger', 'Tiger', 'Tiger', 'Lamb', 'Lamb',
-                               'Pony', 'Pony'], B=Series(
-            np.arange(7), dtype='int64'), C=date_range(
-            '20130101', periods=7)))
+
+def test_pass_args_kwargs(ts, tsframe):
+
+    def f(x, q=None, axis=0):
+        return np.percentile(x, q, axis=axis)
+
+    g = lambda x: np.percentile(x, 80, axis=0)
+
+    # Series
+    ts_grouped = ts.groupby(lambda x: x.month)
+    agg_result = ts_grouped.agg(np.percentile, 80, axis=0)
+    apply_result = ts_grouped.apply(np.percentile, 80, axis=0)
+    trans_result = ts_grouped.transform(np.percentile, 80, axis=0)
+
+    agg_expected = ts_grouped.quantile(.8)
+    trans_expected = ts_grouped.transform(g)
+
+    assert_series_equal(apply_result, agg_expected)
+    assert_series_equal(agg_result, agg_expected, check_names=False)
+    assert_series_equal(trans_result, trans_expected)
+
+    agg_result = ts_grouped.agg(f, q=80)
+    apply_result = ts_grouped.apply(f, q=80)
+    trans_result = ts_grouped.transform(f, q=80)
+    assert_series_equal(agg_result, agg_expected)
+    assert_series_equal(apply_result, agg_expected)
+    assert_series_equal(trans_result, trans_expected)
+
+    # DataFrame
+    df_grouped = tsframe.groupby(lambda x: x.month)
+    agg_result = df_grouped.agg(np.percentile, 80, axis=0)
+    apply_result = df_grouped.apply(DataFrame.quantile, .8)
+    expected = df_grouped.quantile(.8)
+    assert_frame_equal(apply_result, expected)
+    assert_frame_equal(agg_result, expected, check_names=False)
+
+    agg_result = df_grouped.agg(f, q=80)
+    apply_result = df_grouped.apply(DataFrame.quantile, q=.8)
+    assert_frame_equal(agg_result, expected, check_names=False)
+    assert_frame_equal(apply_result, expected)
+
+
+def test_len():
+    df =
tm.makeTimeDataFrame() + grouped = df.groupby([lambda x: x.year, lambda x: x.month, + lambda x: x.day]) + assert len(grouped) == len(df) - result = df.groupby('A').apply(f)[['B']] - e = expected.copy() - e.loc['Tiger'] = np.nan - assert_frame_equal(result, e) + grouped = df.groupby([lambda x: x.year, lambda x: x.month]) + expected = len({(x.year, x.month) for x in df.index}) + assert len(grouped) == expected - def f(grp): - if grp.name == 'Pony': - return None - return grp.iloc[0] + # issue 11016 + df = pd.DataFrame(dict(a=[np.nan] * 3, b=[1, 2, 3])) + assert len(df.groupby(('a'))) == 0 + assert len(df.groupby(('b'))) == 3 + assert len(df.groupby(['a', 'b'])) == 3 + + +def test_basic_regression(): + # regression + T = [1.0 * x for x in lrange(1, 10) * 10][:1095] + result = Series(T, lrange(0, len(T))) - result = df.groupby('A').apply(f)[['B']] - e = expected.copy() - e.loc['Pony'] = np.nan - assert_frame_equal(result, e) + groupings = np.random.random((1100, )) + groupings = Series(groupings, lrange(0, len(groupings))) * 10. - # 5592 revisited, with datetimes - def f(grp): - if grp.name == 'Pony': - return None - return grp.iloc[0] + grouped = result.groupby(groupings) + grouped.mean() - result = df.groupby('A').apply(f)[['C']] - e = df.groupby('A').first()[['C']] - e.loc['Pony'] = pd.NaT - assert_frame_equal(result, e) - # scalar outputs - def f(grp): - if grp.name == 'Pony': - return None - return grp.iloc[0].loc['C'] - - result = df.groupby('A').apply(f) - e = df.groupby('A').first()['C'].copy() - e.loc['Pony'] = np.nan - e.name = None - assert_series_equal(result, e) - - def test_apply_issues(self): - # GH 5788 - - s = """2011.05.16,00:00,1.40893 -2011.05.16,01:00,1.40760 -2011.05.16,02:00,1.40750 -2011.05.16,03:00,1.40649 -2011.05.17,02:00,1.40893 -2011.05.17,03:00,1.40760 -2011.05.17,04:00,1.40750 -2011.05.17,05:00,1.40649 -2011.05.18,02:00,1.40893 -2011.05.18,03:00,1.40760 -2011.05.18,04:00,1.40750 -2011.05.18,05:00,1.40649""" - - df = pd.read_csv( - StringIO(s), header=None, names=['date', 'time', 'value'], - parse_dates=[['date', 'time']]) - df = df.set_index('date_time') - - expected = df.groupby(df.index.date).idxmax() - result = df.groupby(df.index.date).apply(lambda x: x.idxmax()) - assert_frame_equal(result, expected) - - # GH 5789 - # don't auto coerce dates - df = pd.read_csv( - StringIO(s), header=None, names=['date', 'time', 'value']) - exp_idx = pd.Index( - ['2011.05.16', '2011.05.17', '2011.05.18' - ], dtype=object, name='date') - expected = Series(['00:00', '02:00', '02:00'], index=exp_idx) - result = df.groupby('date').apply( - lambda x: x['time'][x['value'].idxmax()]) - assert_series_equal(result, expected) - - def test_apply_trivial(self): - # GH 20066 - # trivial apply: ignore input and return a constant dataframe. - df = pd.DataFrame({'key': ['a', 'a', 'b', 'b', 'a'], - 'data': [1.0, 2.0, 3.0, 4.0, 5.0]}, - columns=['key', 'data']) - expected = pd.concat([df.iloc[1:], df.iloc[1:]], - axis=1, keys=['float64', 'object']) - result = df.groupby([str(x) for x in df.dtypes], - axis=1).apply(lambda x: df.iloc[1:]) - - assert_frame_equal(result, expected) - - @pytest.mark.xfail(reason=("GH 20066; function passed into apply " - "returns a DataFrame with the same index " - "as the one to create GroupBy object.")) - def test_apply_trivial_fail(self): - # GH 20066 - # trivial apply fails if the constant dataframe has the same index - # with the one used to create GroupBy object. 
- df = pd.DataFrame({'key': ['a', 'a', 'b', 'b', 'a'], - 'data': [1.0, 2.0, 3.0, 4.0, 5.0]}, - columns=['key', 'data']) - expected = pd.concat([df, df], - axis=1, keys=['float64', 'object']) - result = df.groupby([str(x) for x in df.dtypes], - axis=1).apply(lambda x: df) - - assert_frame_equal(result, expected) - - def test_time_field_bug(self): - # Test a fix for the following error related to GH issue 11324 When - # non-key fields in a group-by dataframe contained time-based fields - # that were not returned by the apply function, an exception would be - # raised. - - df = pd.DataFrame({'a': 1, 'b': [datetime.now() for nn in range(10)]}) - - def func_with_no_date(batch): - return pd.Series({'c': 2}) - - def func_with_date(batch): - return pd.Series({'b': datetime(2015, 1, 1), 'c': 2}) - - dfg_no_conversion = df.groupby(by=['a']).apply(func_with_no_date) - dfg_no_conversion_expected = pd.DataFrame({'c': 2}, index=[1]) - dfg_no_conversion_expected.index.name = 'a' - - dfg_conversion = df.groupby(by=['a']).apply(func_with_date) - dfg_conversion_expected = pd.DataFrame( - {'b': datetime(2015, 1, 1), - 'c': 2}, index=[1]) - dfg_conversion_expected.index.name = 'a' - - tm.assert_frame_equal(dfg_no_conversion, dfg_no_conversion_expected) - tm.assert_frame_equal(dfg_conversion, dfg_conversion_expected) - - def test_len(self): - df = tm.makeTimeDataFrame() - grouped = df.groupby([lambda x: x.year, lambda x: x.month, - lambda x: x.day]) - assert len(grouped) == len(df) - - grouped = df.groupby([lambda x: x.year, lambda x: x.month]) - expected = len({(x.year, x.month) for x in df.index}) - assert len(grouped) == expected - - # issue 11016 - df = pd.DataFrame(dict(a=[np.nan] * 3, b=[1, 2, 3])) - assert len(df.groupby(('a'))) == 0 - assert len(df.groupby(('b'))) == 3 - assert len(df.groupby(['a', 'b'])) == 3 - - def test_basic_regression(self): - # regression - T = [1.0 * x for x in lrange(1, 10) * 10][:1095] - result = Series(T, lrange(0, len(T))) - - groupings = np.random.random((1100, )) - groupings = Series(groupings, lrange(0, len(groupings))) * 10. 
- - grouped = result.groupby(groupings) - grouped.mean() - - def test_with_na_groups(self): - index = Index(np.arange(10)) - - for dtype in ['float64', 'float32', 'int64', 'int32', 'int16', 'int8']: - values = Series(np.ones(10), index, dtype=dtype) - labels = Series([np.nan, 'foo', 'bar', 'bar', np.nan, np.nan, - 'bar', 'bar', np.nan, 'foo'], index=index) - - # this SHOULD be an int - grouped = values.groupby(labels) - agged = grouped.agg(len) - expected = Series([4, 2], index=['bar', 'foo']) - - assert_series_equal(agged, expected, check_dtype=False) - - # assert issubclass(agged.dtype.type, np.integer) - - # explicitly return a float from my function - def f(x): - return float(len(x)) - - agged = grouped.agg(f) - expected = Series([4, 2], index=['bar', 'foo']) - - assert_series_equal(agged, expected, check_dtype=False) - assert issubclass(agged.dtype.type, np.dtype(dtype).type) - - def test_indices_concatenation_order(self): - - # GH 2808 - - def f1(x): - y = x[(x.b % 2) == 1] ** 2 - if y.empty: - multiindex = MultiIndex(levels=[[]] * 2, labels=[[]] * 2, - names=['b', 'c']) - res = DataFrame(None, columns=['a'], index=multiindex) - return res - else: - y = y.set_index(['b', 'c']) - return y - - def f2(x): - y = x[(x.b % 2) == 1] ** 2 - if y.empty: - return DataFrame() - else: - y = y.set_index(['b', 'c']) - return y - - def f3(x): - y = x[(x.b % 2) == 1] ** 2 - if y.empty: - multiindex = MultiIndex(levels=[[]] * 2, labels=[[]] * 2, - names=['foo', 'bar']) - res = DataFrame(None, columns=['a', 'b'], index=multiindex) - return res - else: - return y - - df = DataFrame({'a': [1, 2, 2, 2], 'b': lrange(4), 'c': lrange(5, 9)}) - - df2 = DataFrame({'a': [3, 2, 2, 2], 'b': lrange(4), 'c': lrange(5, 9)}) - - # correct result - result1 = df.groupby('a').apply(f1) - result2 = df2.groupby('a').apply(f1) - assert_frame_equal(result1, result2) - - # should fail (not the same number of levels) - pytest.raises(AssertionError, df.groupby('a').apply, f2) - pytest.raises(AssertionError, df2.groupby('a').apply, f2) - - # should fail (incorrect shape) - pytest.raises(AssertionError, df.groupby('a').apply, f3) - pytest.raises(AssertionError, df2.groupby('a').apply, f3) - - def test_attr_wrapper(self): - grouped = self.ts.groupby(lambda x: x.weekday()) - - result = grouped.std() - expected = grouped.agg(lambda x: np.std(x, ddof=1)) - assert_series_equal(result, expected) - - # this is pretty cool - result = grouped.describe() - expected = {} - for name, gp in grouped: - expected[name] = gp.describe() - expected = DataFrame(expected).T - assert_frame_equal(result, expected) - - # get attribute - result = grouped.dtype - expected = grouped.agg(lambda x: x.dtype) - - # make sure raises error - pytest.raises(AttributeError, getattr, grouped, 'foo') - - def test_frame_groupby(self): - grouped = self.tsframe.groupby(lambda x: x.weekday()) - - # aggregate - aggregated = grouped.aggregate(np.mean) - assert len(aggregated) == 5 - assert len(aggregated.columns) == 4 - - # by string - tscopy = self.tsframe.copy() - tscopy['weekday'] = [x.weekday() for x in tscopy.index] - stragged = tscopy.groupby('weekday').aggregate(np.mean) - assert_frame_equal(stragged, aggregated, check_names=False) - - # transform - grouped = self.tsframe.head(30).groupby(lambda x: x.weekday()) - transformed = grouped.transform(lambda x: x - x.mean()) - assert len(transformed) == 30 - assert len(transformed.columns) == 4 - - # transform propagate - transformed = grouped.transform(lambda x: x.mean()) - for name, group in grouped: - mean = 
group.mean() - for idx in group.index: - tm.assert_series_equal(transformed.xs(idx), mean, - check_names=False) - - # iterate - for weekday, group in grouped: - assert group.index[0].weekday() == weekday - - # groups / group_indices - groups = grouped.groups - indices = grouped.indices - - for k, v in compat.iteritems(groups): - samething = self.tsframe.index.take(indices[k]) - assert (samething == v).all() - - def test_frame_groupby_columns(self): - mapping = {'A': 0, 'B': 0, 'C': 1, 'D': 1} - grouped = self.tsframe.groupby(mapping, axis=1) - - # aggregate - aggregated = grouped.aggregate(np.mean) - assert len(aggregated) == len(self.tsframe) - assert len(aggregated.columns) == 2 - - # transform - tf = lambda x: x - x.mean() - groupedT = self.tsframe.T.groupby(mapping, axis=0) - assert_frame_equal(groupedT.transform(tf).T, grouped.transform(tf)) - - # iterate - for k, v in grouped: - assert len(v.columns) == 2 - - def test_frame_set_name_single(self): - grouped = self.df.groupby('A') - - result = grouped.mean() - assert result.index.name == 'A' - - result = self.df.groupby('A', as_index=False).mean() - assert result.index.name != 'A' - - result = grouped.agg(np.mean) - assert result.index.name == 'A' - - result = grouped.agg({'C': np.mean, 'D': np.std}) - assert result.index.name == 'A' - - result = grouped['C'].mean() - assert result.index.name == 'A' - result = grouped['C'].agg(np.mean) - assert result.index.name == 'A' - result = grouped['C'].agg([np.mean, np.std]) - assert result.index.name == 'A' - - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result = grouped['C'].agg({'foo': np.mean, 'bar': np.std}) - assert result.index.name == 'A' - - def test_multi_func(self): - col1 = self.df['A'] - col2 = self.df['B'] - - grouped = self.df.groupby([col1.get, col2.get]) - agged = grouped.mean() - expected = self.df.groupby(['A', 'B']).mean() - - # TODO groupby get drops names - assert_frame_equal(agged.loc[:, ['C', 'D']], - expected.loc[:, ['C', 'D']], - check_names=False) - - # some "groups" with no data - df = DataFrame({'v1': np.random.randn(6), - 'v2': np.random.randn(6), - 'k1': np.array(['b', 'b', 'b', 'a', 'a', 'a']), - 'k2': np.array(['1', '1', '1', '2', '2', '2'])}, - index=['one', 'two', 'three', 'four', 'five', 'six']) - # only verify that it works for now - grouped = df.groupby(['k1', 'k2']) - grouped.agg(np.sum) - - def test_multi_key_multiple_functions(self): - grouped = self.df.groupby(['A', 'B'])['C'] - - agged = grouped.agg([np.mean, np.std]) - expected = DataFrame({'mean': grouped.agg(np.mean), - 'std': grouped.agg(np.std)}) - assert_frame_equal(agged, expected) - - def test_frame_multi_key_function_list(self): - data = DataFrame( - {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', - 'foo', 'foo', 'foo'], - 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two', - 'two', 'two', 'one'], - 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny', - 'dull', 'shiny', 'shiny', 'shiny'], - 'D': np.random.randn(11), - 'E': np.random.randn(11), - 'F': np.random.randn(11)}) - - grouped = data.groupby(['A', 'B']) - funcs = [np.mean, np.std] - agged = grouped.agg(funcs) - expected = concat([grouped['D'].agg(funcs), grouped['E'].agg(funcs), - grouped['F'].agg(funcs)], - keys=['D', 'E', 'F'], axis=1) - assert (isinstance(agged.index, MultiIndex)) - assert (isinstance(expected.index, MultiIndex)) - assert_frame_equal(agged, expected) - - def test_groupby_multiple_columns(self): - data = self.df - grouped = data.groupby(['A', 'B']) - 
- def _check_op(op): - - with catch_warnings(record=True): - result1 = op(grouped) - - expected = defaultdict(dict) - for n1, gp1 in data.groupby('A'): - for n2, gp2 in gp1.groupby('B'): - expected[n1][n2] = op(gp2.loc[:, ['C', 'D']]) - expected = dict((k, DataFrame(v)) - for k, v in compat.iteritems(expected)) - expected = Panel.fromDict(expected).swapaxes(0, 1) - expected.major_axis.name, expected.minor_axis.name = 'A', 'B' - - # a little bit crude - for col in ['C', 'D']: - result_col = op(grouped[col]) - exp = expected[col] - pivoted = result1[col].unstack() - pivoted2 = result_col.unstack() - assert_frame_equal(pivoted.reindex_like(exp), exp) - assert_frame_equal(pivoted2.reindex_like(exp), exp) - - _check_op(lambda x: x.sum()) - _check_op(lambda x: x.mean()) - - # test single series works the same - result = data['C'].groupby([data['A'], data['B']]).mean() - expected = data.groupby(['A', 'B']).mean()['C'] - - assert_series_equal(result, expected) - - def test_groupby_as_index_agg(self): - grouped = self.df.groupby('A', as_index=False) - - # single-key - - result = grouped.agg(np.mean) - expected = grouped.mean() - assert_frame_equal(result, expected) - - result2 = grouped.agg(OrderedDict([['C', np.mean], ['D', np.sum]])) - expected2 = grouped.mean() - expected2['D'] = grouped.sum()['D'] - assert_frame_equal(result2, expected2) - - grouped = self.df.groupby('A', as_index=True) - expected3 = grouped['C'].sum() - expected3 = DataFrame(expected3).rename(columns={'C': 'Q'}) - - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result3 = grouped['C'].agg({'Q': np.sum}) - assert_frame_equal(result3, expected3) - - # multi-key - - grouped = self.df.groupby(['A', 'B'], as_index=False) - - result = grouped.agg(np.mean) - expected = grouped.mean() - assert_frame_equal(result, expected) - - result2 = grouped.agg(OrderedDict([['C', np.mean], ['D', np.sum]])) - expected2 = grouped.mean() - expected2['D'] = grouped.sum()['D'] - assert_frame_equal(result2, expected2) - - expected3 = grouped['C'].sum() - expected3 = DataFrame(expected3).rename(columns={'C': 'Q'}) +@pytest.mark.parametrize('dtype', ['float64', 'float32', 'int64', + 'int32', 'int16', 'int8']) +def test_with_na_groups(dtype): + index = Index(np.arange(10)) + values = Series(np.ones(10), index, dtype=dtype) + labels = Series([np.nan, 'foo', 'bar', 'bar', np.nan, np.nan, + 'bar', 'bar', np.nan, 'foo'], index=index) + + # this SHOULD be an int + grouped = values.groupby(labels) + agged = grouped.agg(len) + expected = Series([4, 2], index=['bar', 'foo']) + + assert_series_equal(agged, expected, check_dtype=False) + + # assert issubclass(agged.dtype.type, np.integer) + + # explicitly return a float from my function + def f(x): + return float(len(x)) + + agged = grouped.agg(f) + expected = Series([4, 2], index=['bar', 'foo']) + + assert_series_equal(agged, expected, check_dtype=False) + assert issubclass(agged.dtype.type, np.dtype(dtype).type) + + +def test_indices_concatenation_order(): + + # GH 2808 + + def f1(x): + y = x[(x.b % 2) == 1] ** 2 + if y.empty: + multiindex = MultiIndex(levels=[[]] * 2, labels=[[]] * 2, + names=['b', 'c']) + res = DataFrame(None, columns=['a'], index=multiindex) + return res + else: + y = y.set_index(['b', 'c']) + return y + + def f2(x): + y = x[(x.b % 2) == 1] ** 2 + if y.empty: + return DataFrame() + else: + y = y.set_index(['b', 'c']) + return y + + def f3(x): + y = x[(x.b % 2) == 1] ** 2 + if y.empty: + multiindex = MultiIndex(levels=[[]] * 2, labels=[[]] * 2, + names=['foo', 
'bar']) + res = DataFrame(None, columns=['a', 'b'], index=multiindex) + return res + else: + return y + + df = DataFrame({'a': [1, 2, 2, 2], 'b': lrange(4), 'c': lrange(5, 9)}) + + df2 = DataFrame({'a': [3, 2, 2, 2], 'b': lrange(4), 'c': lrange(5, 9)}) + + # correct result + result1 = df.groupby('a').apply(f1) + result2 = df2.groupby('a').apply(f1) + assert_frame_equal(result1, result2) + + # should fail (not the same number of levels) + pytest.raises(AssertionError, df.groupby('a').apply, f2) + pytest.raises(AssertionError, df2.groupby('a').apply, f2) + + # should fail (incorrect shape) + pytest.raises(AssertionError, df.groupby('a').apply, f3) + pytest.raises(AssertionError, df2.groupby('a').apply, f3) + + +def test_attr_wrapper(ts): + grouped = ts.groupby(lambda x: x.weekday()) + + result = grouped.std() + expected = grouped.agg(lambda x: np.std(x, ddof=1)) + assert_series_equal(result, expected) + + # this is pretty cool + result = grouped.describe() + expected = {} + for name, gp in grouped: + expected[name] = gp.describe() + expected = DataFrame(expected).T + assert_frame_equal(result, expected) + + # get attribute + result = grouped.dtype + expected = grouped.agg(lambda x: x.dtype) + + # make sure raises error + pytest.raises(AttributeError, getattr, grouped, 'foo') + + +def test_frame_groupby(tsframe): + grouped = tsframe.groupby(lambda x: x.weekday()) + + # aggregate + aggregated = grouped.aggregate(np.mean) + assert len(aggregated) == 5 + assert len(aggregated.columns) == 4 + + # by string + tscopy = tsframe.copy() + tscopy['weekday'] = [x.weekday() for x in tscopy.index] + stragged = tscopy.groupby('weekday').aggregate(np.mean) + assert_frame_equal(stragged, aggregated, check_names=False) + + # transform + grouped = tsframe.head(30).groupby(lambda x: x.weekday()) + transformed = grouped.transform(lambda x: x - x.mean()) + assert len(transformed) == 30 + assert len(transformed.columns) == 4 + + # transform propagate + transformed = grouped.transform(lambda x: x.mean()) + for name, group in grouped: + mean = group.mean() + for idx in group.index: + tm.assert_series_equal(transformed.xs(idx), mean, + check_names=False) + + # iterate + for weekday, group in grouped: + assert group.index[0].weekday() == weekday + + # groups / group_indices + groups = grouped.groups + indices = grouped.indices + + for k, v in compat.iteritems(groups): + samething = tsframe.index.take(indices[k]) + assert (samething == v).all() + + +def test_frame_groupby_columns(tsframe): + mapping = {'A': 0, 'B': 0, 'C': 1, 'D': 1} + grouped = tsframe.groupby(mapping, axis=1) + + # aggregate + aggregated = grouped.aggregate(np.mean) + assert len(aggregated) == len(tsframe) + assert len(aggregated.columns) == 2 + + # transform + tf = lambda x: x - x.mean() + groupedT = tsframe.T.groupby(mapping, axis=0) + assert_frame_equal(groupedT.transform(tf).T, grouped.transform(tf)) + + # iterate + for k, v in grouped: + assert len(v.columns) == 2 + + +def test_frame_set_name_single(df): + grouped = df.groupby('A') + + result = grouped.mean() + assert result.index.name == 'A' + + result = df.groupby('A', as_index=False).mean() + assert result.index.name != 'A' + + result = grouped.agg(np.mean) + assert result.index.name == 'A' + + result = grouped.agg({'C': np.mean, 'D': np.std}) + assert result.index.name == 'A' + + result = grouped['C'].mean() + assert result.index.name == 'A' + result = grouped['C'].agg(np.mean) + assert result.index.name == 'A' + result = grouped['C'].agg([np.mean, np.std]) + assert result.index.name == 'A' 
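+
+    # the dict-of-names renaming form of SeriesGroupBy.agg used below is
+    # deprecated, so the call is wrapped to check for its FutureWarning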
+ + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + result = grouped['C'].agg({'foo': np.mean, 'bar': np.std}) + assert result.index.name == 'A' + + +def test_multi_func(df): + col1 = df['A'] + col2 = df['B'] + + grouped = df.groupby([col1.get, col2.get]) + agged = grouped.mean() + expected = df.groupby(['A', 'B']).mean() + + # TODO groupby get drops names + assert_frame_equal(agged.loc[:, ['C', 'D']], + expected.loc[:, ['C', 'D']], + check_names=False) + + # some "groups" with no data + df = DataFrame({'v1': np.random.randn(6), + 'v2': np.random.randn(6), + 'k1': np.array(['b', 'b', 'b', 'a', 'a', 'a']), + 'k2': np.array(['1', '1', '1', '2', '2', '2'])}, + index=['one', 'two', 'three', 'four', 'five', 'six']) + # only verify that it works for now + grouped = df.groupby(['k1', 'k2']) + grouped.agg(np.sum) + + +def test_multi_key_multiple_functions(df): + grouped = df.groupby(['A', 'B'])['C'] + + agged = grouped.agg([np.mean, np.std]) + expected = DataFrame({'mean': grouped.agg(np.mean), + 'std': grouped.agg(np.std)}) + assert_frame_equal(agged, expected) + + +def test_frame_multi_key_function_list(): + data = DataFrame( + {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', + 'foo', 'foo', 'foo'], + 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two', + 'two', 'two', 'one'], + 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny', + 'dull', 'shiny', 'shiny', 'shiny'], + 'D': np.random.randn(11), + 'E': np.random.randn(11), + 'F': np.random.randn(11)}) + + grouped = data.groupby(['A', 'B']) + funcs = [np.mean, np.std] + agged = grouped.agg(funcs) + expected = pd.concat([grouped['D'].agg(funcs), grouped['E'].agg(funcs), + grouped['F'].agg(funcs)], + keys=['D', 'E', 'F'], axis=1) + assert (isinstance(agged.index, MultiIndex)) + assert (isinstance(expected.index, MultiIndex)) + assert_frame_equal(agged, expected) + + +@pytest.mark.parametrize('op', [lambda x: x.sum(), lambda x: x.mean()]) +def test_groupby_multiple_columns(df, op): + data = df + grouped = data.groupby(['A', 'B']) + + with catch_warnings(record=True): + result1 = op(grouped) + + expected = defaultdict(dict) + for n1, gp1 in data.groupby('A'): + for n2, gp2 in gp1.groupby('B'): + expected[n1][n2] = op(gp2.loc[:, ['C', 'D']]) + expected = dict((k, DataFrame(v)) + for k, v in compat.iteritems(expected)) + expected = Panel.fromDict(expected).swapaxes(0, 1) + expected.major_axis.name, expected.minor_axis.name = 'A', 'B' + + # a little bit crude + for col in ['C', 'D']: + result_col = op(grouped[col]) + exp = expected[col] + pivoted = result1[col].unstack() + pivoted2 = result_col.unstack() + assert_frame_equal(pivoted.reindex_like(exp), exp) + assert_frame_equal(pivoted2.reindex_like(exp), exp) + + # test single series works the same + result = data['C'].groupby([data['A'], data['B']]).mean() + expected = data.groupby(['A', 'B']).mean()['C'] + + assert_series_equal(result, expected) + + +def test_groupby_as_index_agg(df): + grouped = df.groupby('A', as_index=False) + + # single-key + + result = grouped.agg(np.mean) + expected = grouped.mean() + assert_frame_equal(result, expected) + + result2 = grouped.agg(OrderedDict([['C', np.mean], ['D', np.sum]])) + expected2 = grouped.mean() + expected2['D'] = grouped.sum()['D'] + assert_frame_equal(result2, expected2) + + grouped = df.groupby('A', as_index=True) + expected3 = grouped['C'].sum() + expected3 = DataFrame(expected3).rename(columns={'C': 'Q'}) + + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): 
result3 = grouped['C'].agg({'Q': np.sum}) - assert_frame_equal(result3, expected3) - - # GH7115 & GH8112 & GH8582 - df = DataFrame(np.random.randint(0, 100, (50, 3)), - columns=['jim', 'joe', 'jolie']) - ts = Series(np.random.randint(5, 10, 50), name='jim') - - gr = df.groupby(ts) - gr.nth(0) # invokes set_selection_from_grouper internally - assert_frame_equal(gr.apply(sum), df.groupby(ts).apply(sum)) - - for attr in ['mean', 'max', 'count', 'idxmax', 'cumsum', 'all']: - gr = df.groupby(ts, as_index=False) - left = getattr(gr, attr)() - - gr = df.groupby(ts.values, as_index=True) - right = getattr(gr, attr)().reset_index(drop=True) - - assert_frame_equal(left, right) - - def test_as_index_series_return_frame(self): - grouped = self.df.groupby('A', as_index=False) - grouped2 = self.df.groupby(['A', 'B'], as_index=False) - - result = grouped['C'].agg(np.sum) - expected = grouped.agg(np.sum).loc[:, ['A', 'C']] - assert isinstance(result, DataFrame) - assert_frame_equal(result, expected) - - result2 = grouped2['C'].agg(np.sum) - expected2 = grouped2.agg(np.sum).loc[:, ['A', 'B', 'C']] - assert isinstance(result2, DataFrame) - assert_frame_equal(result2, expected2) - - result = grouped['C'].sum() - expected = grouped.sum().loc[:, ['A', 'C']] - assert isinstance(result, DataFrame) - assert_frame_equal(result, expected) - - result2 = grouped2['C'].sum() - expected2 = grouped2.sum().loc[:, ['A', 'B', 'C']] - assert isinstance(result2, DataFrame) - assert_frame_equal(result2, expected2) - - # corner case - pytest.raises(Exception, grouped['C'].__getitem__, 'D') - - def test_groupby_as_index_cython(self): - data = self.df - - # single-key - grouped = data.groupby('A', as_index=False) - result = grouped.mean() - expected = data.groupby(['A']).mean() - expected.insert(0, 'A', expected.index) - expected.index = np.arange(len(expected)) - assert_frame_equal(result, expected) - - # multi-key - grouped = data.groupby(['A', 'B'], as_index=False) - result = grouped.mean() - expected = data.groupby(['A', 'B']).mean() - - arrays = lzip(*expected.index.values) - expected.insert(0, 'A', arrays[0]) - expected.insert(1, 'B', arrays[1]) - expected.index = np.arange(len(expected)) - assert_frame_equal(result, expected) - - def test_groupby_as_index_series_scalar(self): - grouped = self.df.groupby(['A', 'B'], as_index=False) - - # GH #421 - - result = grouped['C'].agg(len) - expected = grouped.agg(len).loc[:, ['A', 'B', 'C']] - assert_frame_equal(result, expected) - - def test_groupby_as_index_corner(self): - pytest.raises(TypeError, self.ts.groupby, lambda x: x.weekday(), - as_index=False) - - pytest.raises(ValueError, self.df.groupby, lambda x: x.lower(), - as_index=False, axis=1) - - def test_groupby_as_index_apply(self): - # GH #4648 and #3417 - df = DataFrame({'item_id': ['b', 'b', 'a', 'c', 'a', 'b'], - 'user_id': [1, 2, 1, 1, 3, 1], - 'time': range(6)}) - - g_as = df.groupby('user_id', as_index=True) - g_not_as = df.groupby('user_id', as_index=False) - - res_as = g_as.head(2).index - res_not_as = g_not_as.head(2).index - exp = Index([0, 1, 2, 4]) - assert_index_equal(res_as, exp) - assert_index_equal(res_not_as, exp) - - res_as_apply = g_as.apply(lambda x: x.head(2)).index - res_not_as_apply = g_not_as.apply(lambda x: x.head(2)).index - - # apply doesn't maintain the original ordering - # changed in GH5610 as the as_index=False returns a MI here - exp_not_as_apply = MultiIndex.from_tuples([(0, 0), (0, 2), (1, 1), ( - 2, 4)]) - tp = [(1, 0), (1, 2), (2, 1), (3, 4)] - exp_as_apply = MultiIndex.from_tuples(tp, 
names=['user_id', None]) - - assert_index_equal(res_as_apply, exp_as_apply) - assert_index_equal(res_not_as_apply, exp_not_as_apply) - - ind = Index(list('abcde')) - df = DataFrame([[1, 2], [2, 3], [1, 4], [1, 5], [2, 6]], index=ind) - res = df.groupby(0, as_index=False).apply(lambda x: x).index - assert_index_equal(res, ind) - - def test_groupby_multiple_key(self): - df = tm.makeTimeDataFrame() - grouped = df.groupby([lambda x: x.year, lambda x: x.month, - lambda x: x.day]) - agged = grouped.sum() - assert_almost_equal(df.values, agged.values) - - grouped = df.T.groupby([lambda x: x.year, - lambda x: x.month, - lambda x: x.day], axis=1) - - agged = grouped.agg(lambda x: x.sum()) - tm.assert_index_equal(agged.index, df.columns) - assert_almost_equal(df.T.values, agged.values) - - agged = grouped.agg(lambda x: x.sum()) - assert_almost_equal(df.T.values, agged.values) - - def test_groupby_multi_corner(self): - # test that having an all-NA column doesn't mess you up - df = self.df.copy() - df['bad'] = np.nan - agged = df.groupby(['A', 'B']).mean() - - expected = self.df.groupby(['A', 'B']).mean() - expected['bad'] = np.nan - - assert_frame_equal(agged, expected) - - def test_omit_nuisance(self): - grouped = self.df.groupby('A') - - result = grouped.mean() - expected = self.df.loc[:, ['A', 'C', 'D']].groupby('A').mean() - assert_frame_equal(result, expected) - - agged = grouped.agg(np.mean) - exp = grouped.mean() - assert_frame_equal(agged, exp) - - df = self.df.loc[:, ['A', 'C', 'D']] - df['E'] = datetime.now() - grouped = df.groupby('A') - result = grouped.agg(np.sum) - expected = grouped.sum() - assert_frame_equal(result, expected) - - # won't work with axis = 1 - grouped = df.groupby({'A': 0, 'C': 0, 'D': 1, 'E': 1}, axis=1) - result = pytest.raises(TypeError, grouped.agg, - lambda x: x.sum(0, numeric_only=False)) - - def test_omit_nuisance_python_multiple(self): - grouped = self.three_group.groupby(['A', 'B']) - - agged = grouped.agg(np.mean) - exp = grouped.mean() - assert_frame_equal(agged, exp) - - def test_empty_groups_corner(self): - # handle empty groups - df = DataFrame({'k1': np.array(['b', 'b', 'b', 'a', 'a', 'a']), - 'k2': np.array(['1', '1', '1', '2', '2', '2']), - 'k3': ['foo', 'bar'] * 3, - 'v1': np.random.randn(6), - 'v2': np.random.randn(6)}) - - grouped = df.groupby(['k1', 'k2']) - result = grouped.agg(np.mean) - expected = grouped.mean() - assert_frame_equal(result, expected) - - grouped = self.mframe[3:5].groupby(level=0) - agged = grouped.apply(lambda x: x.mean()) - agged_A = grouped['A'].apply(np.mean) - assert_series_equal(agged['A'], agged_A) - assert agged.index.name == 'first' - - def test_apply_concat_preserve_names(self): - grouped = self.three_group.groupby(['A', 'B']) - - def desc(group): - result = group.describe() - result.index.name = 'stat' - return result - - def desc2(group): - result = group.describe() - result.index.name = 'stat' - result = result[:len(group)] - # weirdo - return result - - def desc3(group): - result = group.describe() - - # names are different - result.index.name = 'stat_%d' % len(group) - - result = result[:len(group)] - # weirdo - return result - - result = grouped.apply(desc) - assert result.index.names == ('A', 'B', 'stat') - - result2 = grouped.apply(desc2) - assert result2.index.names == ('A', 'B', 'stat') - - result3 = grouped.apply(desc3) - assert result3.index.names == ('A', 'B', None) - - def test_nonsense_func(self): - df = DataFrame([0]) - pytest.raises(Exception, df.groupby, lambda x: x + 'foo') - - def 
test_builtins_apply(self): # GH8155 - df = pd.DataFrame(np.random.randint(1, 50, (1000, 2)), - columns=['jim', 'joe']) - df['jolie'] = np.random.randn(1000) - - for keys in ['jim', ['jim', 'joe']]: # single key & multi-key - if keys == 'jim': - continue - for f in [max, min, sum]: - fname = f.__name__ - result = df.groupby(keys).apply(f) - result.shape - ngroups = len(df.drop_duplicates(subset=keys)) - assert result.shape == (ngroups, 3), 'invalid frame shape: '\ - '{} (expected ({}, 3))'.format(result.shape, ngroups) - - assert_frame_equal(result, # numpy's equivalent function - df.groupby(keys).apply(getattr(np, fname))) - - if f != sum: - expected = df.groupby(keys).agg(fname).reset_index() - expected.set_index(keys, inplace=True, drop=False) - assert_frame_equal(result, expected, check_dtype=False) - - assert_series_equal(getattr(result, fname)(), - getattr(df, fname)()) - - def test_max_min_non_numeric(self): - # #2700 - aa = DataFrame({'nn': [11, 11, 22, 22], - 'ii': [1, 2, 3, 4], - 'ss': 4 * ['mama']}) - - result = aa.groupby('nn').max() - assert 'ss' in result - - result = aa.groupby('nn').max(numeric_only=False) - assert 'ss' in result - - result = aa.groupby('nn').min() - assert 'ss' in result - - result = aa.groupby('nn').min(numeric_only=False) - assert 'ss' in result - - def test_arg_passthru(self): - # make sure that we are passing thru kwargs - # to our agg functions - - # GH3668 - # GH5724 - df = pd.DataFrame( - {'group': [1, 1, 2], - 'int': [1, 2, 3], - 'float': [4., 5., 6.], - 'string': list('abc'), - 'category_string': pd.Series(list('abc')).astype('category'), - 'category_int': [7, 8, 9], - 'datetime': pd.date_range('20130101', periods=3), - 'datetimetz': pd.date_range('20130101', - periods=3, - tz='US/Eastern'), - 'timedelta': pd.timedelta_range('1 s', periods=3, freq='s')}, - columns=['group', 'int', 'float', 'string', - 'category_string', 'category_int', - 'datetime', 'datetimetz', - 'timedelta']) - - expected_columns_numeric = Index(['int', 'float', 'category_int']) - - # mean / median - expected = pd.DataFrame( - {'category_int': [7.5, 9], - 'float': [4.5, 6.], - 'timedelta': [pd.Timedelta('1.5s'), - pd.Timedelta('3s')], - 'int': [1.5, 3], - 'datetime': [pd.Timestamp('2013-01-01 12:00:00'), - pd.Timestamp('2013-01-03 00:00:00')], - 'datetimetz': [ - pd.Timestamp('2013-01-01 12:00:00', tz='US/Eastern'), - pd.Timestamp('2013-01-03 00:00:00', tz='US/Eastern')]}, - index=Index([1, 2], name='group'), - columns=['int', 'float', 'category_int', - 'datetime', 'datetimetz', 'timedelta']) - for attr in ['mean', 'median']: - f = getattr(df.groupby('group'), attr) - result = f() - tm.assert_index_equal(result.columns, expected_columns_numeric) - - result = f(numeric_only=False) - assert_frame_equal(result.reindex_like(expected), expected) - - # TODO: min, max *should* handle - # categorical (ordered) dtype - expected_columns = Index(['int', 'float', 'string', - 'category_int', - 'datetime', 'datetimetz', - 'timedelta']) - for attr in ['min', 'max']: - f = getattr(df.groupby('group'), attr) - result = f() - tm.assert_index_equal(result.columns, expected_columns) - - result = f(numeric_only=False) - tm.assert_index_equal(result.columns, expected_columns) - - expected_columns = Index(['int', 'float', 'string', - 'category_string', 'category_int', - 'datetime', 'datetimetz', - 'timedelta']) - for attr in ['first', 'last']: - f = getattr(df.groupby('group'), attr) - result = f() - tm.assert_index_equal(result.columns, expected_columns) - - result = f(numeric_only=False) - 
tm.assert_index_equal(result.columns, expected_columns) - - expected_columns = Index(['int', 'float', 'string', - 'category_int', 'timedelta']) - for attr in ['sum']: - f = getattr(df.groupby('group'), attr) - result = f() - tm.assert_index_equal(result.columns, expected_columns_numeric) - - result = f(numeric_only=False) - tm.assert_index_equal(result.columns, expected_columns) - - expected_columns = Index(['int', 'float', 'category_int']) - for attr in ['prod', 'cumprod']: - f = getattr(df.groupby('group'), attr) - result = f() - tm.assert_index_equal(result.columns, expected_columns_numeric) - - result = f(numeric_only=False) - tm.assert_index_equal(result.columns, expected_columns) - - # like min, max, but don't include strings - expected_columns = Index(['int', 'float', - 'category_int', - 'datetime', 'datetimetz', - 'timedelta']) - for attr in ['cummin', 'cummax']: - f = getattr(df.groupby('group'), attr) - result = f() - # GH 15561: numeric_only=False set by default like min/max - tm.assert_index_equal(result.columns, expected_columns) - - result = f(numeric_only=False) - tm.assert_index_equal(result.columns, expected_columns) - - expected_columns = Index(['int', 'float', 'category_int', - 'timedelta']) - for attr in ['cumsum']: - f = getattr(df.groupby('group'), attr) - result = f() - tm.assert_index_equal(result.columns, expected_columns_numeric) - - result = f(numeric_only=False) - tm.assert_index_equal(result.columns, expected_columns) - - def test_wrap_aggregated_output_multindex(self): - df = self.mframe.T - df['baz', 'two'] = 'peekaboo' - - keys = [np.array([0, 0, 1]), np.array([0, 0, 1])] - agged = df.groupby(keys).agg(np.mean) - assert isinstance(agged.columns, MultiIndex) - - def aggfun(ser): - if ser.name == ('foo', 'one'): - raise TypeError - else: - return ser.sum() - - agged2 = df.groupby(keys).aggregate(aggfun) - assert len(agged2.columns) + 1 == len(df.columns) - - def test_groupby_level_apply(self): - frame = self.mframe - - result = frame.groupby(level=0).count() - assert result.index.name == 'first' - result = frame.groupby(level=1).count() - assert result.index.name == 'second' - - result = frame['A'].groupby(level=0).count() - assert result.index.name == 'first' - - def test_groupby_level_mapper(self): - frame = self.mframe - deleveled = frame.reset_index() - - mapper0 = {'foo': 0, 'bar': 0, 'baz': 1, 'qux': 1} - mapper1 = {'one': 0, 'two': 0, 'three': 1} - - result0 = frame.groupby(mapper0, level=0).sum() - result1 = frame.groupby(mapper1, level=1).sum() - - mapped_level0 = np.array([mapper0.get(x) for x in deleveled['first']]) - mapped_level1 = np.array([mapper1.get(x) for x in deleveled['second']]) - expected0 = frame.groupby(mapped_level0).sum() - expected1 = frame.groupby(mapped_level1).sum() - expected0.index.name, expected1.index.name = 'first', 'second' - - assert_frame_equal(result0, expected0) - assert_frame_equal(result1, expected1) - - def test_groupby_level_nonmulti(self): - # GH 1313, GH 13901 - s = Series([1, 2, 3, 10, 4, 5, 20, 6], - Index([1, 2, 3, 1, 4, 5, 2, 6], name='foo')) - expected = Series([11, 22, 3, 4, 5, 6], - Index(range(1, 7), name='foo')) - - result = s.groupby(level=0).sum() - tm.assert_series_equal(result, expected) - result = s.groupby(level=[0]).sum() - tm.assert_series_equal(result, expected) - result = s.groupby(level=-1).sum() - tm.assert_series_equal(result, expected) - result = s.groupby(level=[-1]).sum() - tm.assert_series_equal(result, expected) - - pytest.raises(ValueError, s.groupby, level=1) - pytest.raises(ValueError, 
s.groupby, level=-2) - pytest.raises(ValueError, s.groupby, level=[]) - pytest.raises(ValueError, s.groupby, level=[0, 0]) - pytest.raises(ValueError, s.groupby, level=[0, 1]) - pytest.raises(ValueError, s.groupby, level=[1]) - - def test_groupby_complex(self): - # GH 12902 - a = Series(data=np.arange(4) * (1 + 2j), index=[0, 0, 1, 1]) - expected = Series((1 + 2j, 5 + 10j)) - - result = a.groupby(level=0).sum() - assert_series_equal(result, expected) - - result = a.sum(level=0) - assert_series_equal(result, expected) - - def test_apply_series_to_frame(self): - def f(piece): - with np.errstate(invalid='ignore'): - logged = np.log(piece) - return DataFrame({'value': piece, - 'demeaned': piece - piece.mean(), - 'logged': logged}) - - dr = bdate_range('1/1/2000', periods=100) - ts = Series(np.random.randn(100), index=dr) - - grouped = ts.groupby(lambda x: x.month) - result = grouped.apply(f) - - assert isinstance(result, DataFrame) - tm.assert_index_equal(result.index, ts.index) - - def test_apply_series_yield_constant(self): - result = self.df.groupby(['A', 'B'])['C'].apply(len) - assert result.index.names[:2] == ('A', 'B') - - def test_apply_frame_yield_constant(self): - # GH13568 - result = self.df.groupby(['A', 'B']).apply(len) - assert isinstance(result, Series) - assert result.name is None - - result = self.df.groupby(['A', 'B'])[['C', 'D']].apply(len) - assert isinstance(result, Series) - assert result.name is None - - def test_apply_frame_to_series(self): - grouped = self.df.groupby(['A', 'B']) - result = grouped.apply(len) - expected = grouped.count()['C'] - tm.assert_index_equal(result.index, expected.index) - tm.assert_numpy_array_equal(result.values, expected.values) - - def test_apply_frame_concat_series(self): - def trans(group): - return group.groupby('B')['C'].sum().sort_values()[:2] - - def trans2(group): - grouped = group.groupby(df.reindex(group.index)['B']) - return grouped.sum().sort_values()[:2] - - df = DataFrame({'A': np.random.randint(0, 5, 1000), - 'B': np.random.randint(0, 5, 1000), - 'C': np.random.randn(1000)}) - - result = df.groupby('A').apply(trans) - exp = df.groupby('A')['C'].apply(trans2) - assert_series_equal(result, exp, check_names=False) - assert result.name == 'C' - - def test_apply_transform(self): - grouped = self.ts.groupby(lambda x: x.month) - result = grouped.apply(lambda x: x * 2) - expected = grouped.transform(lambda x: x * 2) - assert_series_equal(result, expected) - - def test_apply_multikey_corner(self): - grouped = self.tsframe.groupby([lambda x: x.year, lambda x: x.month]) - - def f(group): - return group.sort_values('A')[-5:] - - result = grouped.apply(f) - for key, group in grouped: - assert_frame_equal(result.loc[key], f(group)) - - def test_mutate_groups(self): - - # GH3380 - - mydf = DataFrame({ - 'cat1': ['a'] * 8 + ['b'] * 6, - 'cat2': ['c'] * 2 + ['d'] * 2 + ['e'] * 2 + ['f'] * 2 + ['c'] * 2 + - ['d'] * 2 + ['e'] * 2, - 'cat3': lmap(lambda x: 'g%s' % x, lrange(1, 15)), - 'val': np.random.randint(100, size=14), - }) - - def f_copy(x): - x = x.copy() - x['rank'] = x.val.rank(method='min') - return x.groupby('cat2')['rank'].min() - - def f_no_copy(x): - x['rank'] = x.val.rank(method='min') - return x.groupby('cat2')['rank'].min() - - grpby_copy = mydf.groupby('cat1').apply(f_copy) - grpby_no_copy = mydf.groupby('cat1').apply(f_no_copy) - assert_series_equal(grpby_copy, grpby_no_copy) - - def test_no_mutate_but_looks_like(self): - - # GH 8467 - # first show's mutation indicator - # second does not, but should yield the same results - df = 
DataFrame({'key': [1, 1, 1, 2, 2, 2, 3, 3, 3], 'value': range(9)}) - - result1 = df.groupby('key', group_keys=True).apply(lambda x: x[:].key) - result2 = df.groupby('key', group_keys=True).apply(lambda x: x.key) - assert_series_equal(result1, result2) - - def test_apply_chunk_view(self): - # Low level tinkering could be unsafe, make sure not - df = DataFrame({'key': [1, 1, 1, 2, 2, 2, 3, 3, 3], - 'value': lrange(9)}) - - # return view - f = lambda x: x[:2] - - result = df.groupby('key', group_keys=False).apply(f) - expected = df.take([0, 1, 3, 4, 6, 7]) - assert_frame_equal(result, expected) - - def test_apply_no_name_column_conflict(self): - df = DataFrame({'name': [1, 1, 1, 1, 1, 1, 2, 2, 2, 2], - 'name2': [0, 0, 0, 1, 1, 1, 0, 0, 1, 1], - 'value': lrange(10)[::-1]}) - - # it works! #2605 - grouped = df.groupby(['name', 'name2']) - grouped.apply(lambda x: x.sort_values('value', inplace=True)) - - def test_groupby_series_indexed_differently(self): - s1 = Series([5.0, -9.0, 4.0, 100., -5., 55., 6.7], - index=Index(['a', 'b', 'c', 'd', 'e', 'f', 'g'])) - s2 = Series([1.0, 1.0, 4.0, 5.0, 5.0, 7.0], - index=Index(['a', 'b', 'd', 'f', 'g', 'h'])) - - grouped = s1.groupby(s2) - agged = grouped.mean() - exp = s1.groupby(s2.reindex(s1.index).get).mean() - assert_series_equal(agged, exp) - - def test_groupby_with_hier_columns(self): - tuples = list(zip(*[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', - 'qux'], ['one', 'two', 'one', 'two', 'one', 'two', - 'one', 'two']])) - index = MultiIndex.from_tuples(tuples) - columns = MultiIndex.from_tuples([('A', 'cat'), ('B', 'dog'), ( - 'B', 'cat'), ('A', 'dog')]) - df = DataFrame(np.random.randn(8, 4), index=index, columns=columns) - - result = df.groupby(level=0).mean() - tm.assert_index_equal(result.columns, columns) - - result = df.groupby(level=0, axis=1).mean() - tm.assert_index_equal(result.index, df.index) - - result = df.groupby(level=0).agg(np.mean) - tm.assert_index_equal(result.columns, columns) - - result = df.groupby(level=0).apply(lambda x: x.mean()) - tm.assert_index_equal(result.columns, columns) - - result = df.groupby(level=0, axis=1).agg(lambda x: x.mean(1)) - tm.assert_index_equal(result.columns, Index(['A', 'B'])) - tm.assert_index_equal(result.index, df.index) - - # add a nuisance column - sorted_columns, _ = columns.sortlevel(0) - df['A', 'foo'] = 'bar' - result = df.groupby(level=0).mean() - tm.assert_index_equal(result.columns, df.columns[:-1]) - - def test_pass_args_kwargs(self): - from numpy import percentile - - def f(x, q=None, axis=0): - return percentile(x, q, axis=axis) - - g = lambda x: percentile(x, 80, axis=0) - - # Series - ts_grouped = self.ts.groupby(lambda x: x.month) - agg_result = ts_grouped.agg(percentile, 80, axis=0) - apply_result = ts_grouped.apply(percentile, 80, axis=0) - trans_result = ts_grouped.transform(percentile, 80, axis=0) - - agg_expected = ts_grouped.quantile(.8) - trans_expected = ts_grouped.transform(g) - - assert_series_equal(apply_result, agg_expected) - assert_series_equal(agg_result, agg_expected, check_names=False) - assert_series_equal(trans_result, trans_expected) - - agg_result = ts_grouped.agg(f, q=80) - apply_result = ts_grouped.apply(f, q=80) - trans_result = ts_grouped.transform(f, q=80) - assert_series_equal(agg_result, agg_expected) - assert_series_equal(apply_result, agg_expected) - assert_series_equal(trans_result, trans_expected) - - # DataFrame - df_grouped = self.tsframe.groupby(lambda x: x.month) - agg_result = df_grouped.agg(percentile, 80, axis=0) - apply_result = 
df_grouped.apply(DataFrame.quantile, .8) - expected = df_grouped.quantile(.8) - assert_frame_equal(apply_result, expected) - assert_frame_equal(agg_result, expected, check_names=False) - - agg_result = df_grouped.agg(f, q=80) - apply_result = df_grouped.apply(DataFrame.quantile, q=.8) - assert_frame_equal(agg_result, expected, check_names=False) - assert_frame_equal(apply_result, expected) - - def test_non_cython_api(self): - - # GH5610 - # non-cython calls should not include the grouper - - df = DataFrame( - [[1, 2, 'foo'], - [1, np.nan, 'bar'], - [3, np.nan, 'baz']], - columns=['A', 'B', 'C']) - g = df.groupby('A') - gni = df.groupby('A', as_index=False) - - # mad - expected = DataFrame([[0], [np.nan]], columns=['B'], index=[1, 3]) - expected.index.name = 'A' - result = g.mad() - assert_frame_equal(result, expected) - - expected = DataFrame([[0., 0.], [0, np.nan]], columns=['A', 'B'], - index=[0, 1]) - result = gni.mad() - assert_frame_equal(result, expected) - - # describe - expected_index = pd.Index([1, 3], name='A') - expected_col = pd.MultiIndex(levels=[['B'], - ['count', 'mean', 'std', 'min', - '25%', '50%', '75%', 'max']], - labels=[[0] * 8, list(range(8))]) - expected = pd.DataFrame([[1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0], - [0.0, np.nan, np.nan, np.nan, np.nan, np.nan, - np.nan, np.nan]], - index=expected_index, - columns=expected_col) - result = g.describe() - assert_frame_equal(result, expected) - - expected = pd.concat([df[df.A == 1].describe().unstack().to_frame().T, - df[df.A == 3].describe().unstack().to_frame().T]) - expected.index = pd.Index([0, 1]) - result = gni.describe() - assert_frame_equal(result, expected) - - # any - expected = DataFrame([[True, True], [False, True]], columns=['B', 'C'], - index=[1, 3]) - expected.index.name = 'A' - result = g.any() - assert_frame_equal(result, expected) - - # idxmax - expected = DataFrame([[0.0], [np.nan]], columns=['B'], index=[1, 3]) - expected.index.name = 'A' - result = g.idxmax() - assert_frame_equal(result, expected) - - def test_cython_api2(self): - - # this takes the fast apply path - - # cumsum (GH5614) - df = DataFrame( - [[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9] - ], columns=['A', 'B', 'C']) - expected = DataFrame( - [[2, np.nan], [np.nan, 9], [4, 9]], columns=['B', 'C']) - result = df.groupby('A').cumsum() - assert_frame_equal(result, expected) - - # GH 5755 - cumsum is a transformer and should ignore as_index - result = df.groupby('A', as_index=False).cumsum() - assert_frame_equal(result, expected) - - # GH 13994 - result = df.groupby('A').cumsum(axis=1) - expected = df.cumsum(axis=1) - assert_frame_equal(result, expected) - result = df.groupby('A').cumprod(axis=1) - expected = df.cumprod(axis=1) - assert_frame_equal(result, expected) - - def test_grouping_ndarray(self): - grouped = self.df.groupby(self.df['A'].values) - - result = grouped.sum() - expected = self.df.groupby('A').sum() - assert_frame_equal(result, expected, check_names=False - ) # Note: no names when grouping by value - - def test_apply_typecast_fail(self): - df = DataFrame({'d': [1., 1., 1., 2., 2., 2.], - 'c': np.tile( - ['a', 'b', 'c'], 2), - 'v': np.arange(1., 7.)}) - - def f(group): - v = group['v'] - group['v2'] = (v - v.min()) / (v.max() - v.min()) - return group - - result = df.groupby('d').apply(f) - - expected = df.copy() - expected['v2'] = np.tile([0., 0.5, 1], 2) - - assert_frame_equal(result, expected) - - def test_apply_multiindex_fail(self): - index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3] - ]) - df = 
DataFrame({'d': [1., 1., 1., 2., 2., 2.], - 'c': np.tile(['a', 'b', 'c'], 2), - 'v': np.arange(1., 7.)}, index=index) - - def f(group): - v = group['v'] - group['v2'] = (v - v.min()) / (v.max() - v.min()) - return group - - result = df.groupby('d').apply(f) - - expected = df.copy() - expected['v2'] = np.tile([0., 0.5, 1], 2) - - assert_frame_equal(result, expected) - - def test_apply_corner(self): - result = self.tsframe.groupby(lambda x: x.year).apply(lambda x: x * 2) - expected = self.tsframe * 2 - assert_frame_equal(result, expected) - - def test_apply_without_copy(self): - # GH 5545 - # returning a non-copy in an applied function fails - - data = DataFrame({'id_field': [100, 100, 200, 300], - 'category': ['a', 'b', 'c', 'c'], - 'value': [1, 2, 3, 4]}) - - def filt1(x): - if x.shape[0] == 1: - return x.copy() - else: - return x[x.category == 'c'] - - def filt2(x): - if x.shape[0] == 1: - return x - else: - return x[x.category == 'c'] - - expected = data.groupby('id_field').apply(filt1) - result = data.groupby('id_field').apply(filt2) - assert_frame_equal(result, expected) - - def test_apply_corner_cases(self): - # #535, can't use sliding iterator - - N = 1000 - labels = np.random.randint(0, 100, size=N) - df = DataFrame({'key': labels, - 'value1': np.random.randn(N), - 'value2': ['foo', 'bar', 'baz', 'qux'] * (N // 4)}) - - grouped = df.groupby('key') - - def f(g): - g['value3'] = g['value1'] * 2 - return g - - result = grouped.apply(f) - assert 'value3' in result + assert_frame_equal(result3, expected3) + + # multi-key + + grouped = df.groupby(['A', 'B'], as_index=False) + + result = grouped.agg(np.mean) + expected = grouped.mean() + assert_frame_equal(result, expected) + + result2 = grouped.agg(OrderedDict([['C', np.mean], ['D', np.sum]])) + expected2 = grouped.mean() + expected2['D'] = grouped.sum()['D'] + assert_frame_equal(result2, expected2) + + expected3 = grouped['C'].sum() + expected3 = DataFrame(expected3).rename(columns={'C': 'Q'}) + result3 = grouped['C'].agg({'Q': np.sum}) + assert_frame_equal(result3, expected3) + + # GH7115 & GH8112 & GH8582 + df = DataFrame(np.random.randint(0, 100, (50, 3)), + columns=['jim', 'joe', 'jolie']) + ts = Series(np.random.randint(5, 10, 50), name='jim') + + gr = df.groupby(ts) + gr.nth(0) # invokes set_selection_from_grouper internally + assert_frame_equal(gr.apply(sum), df.groupby(ts).apply(sum)) + + for attr in ['mean', 'max', 'count', 'idxmax', 'cumsum', 'all']: + gr = df.groupby(ts, as_index=False) + left = getattr(gr, attr)() + + gr = df.groupby(ts.values, as_index=True) + right = getattr(gr, attr)().reset_index(drop=True) + + assert_frame_equal(left, right) + + +def test_as_index_series_return_frame(df): + grouped = df.groupby('A', as_index=False) + grouped2 = df.groupby(['A', 'B'], as_index=False) + + result = grouped['C'].agg(np.sum) + expected = grouped.agg(np.sum).loc[:, ['A', 'C']] + assert isinstance(result, DataFrame) + assert_frame_equal(result, expected) + + result2 = grouped2['C'].agg(np.sum) + expected2 = grouped2.agg(np.sum).loc[:, ['A', 'B', 'C']] + assert isinstance(result2, DataFrame) + assert_frame_equal(result2, expected2) + + result = grouped['C'].sum() + expected = grouped.sum().loc[:, ['A', 'C']] + assert isinstance(result, DataFrame) + assert_frame_equal(result, expected) + + result2 = grouped2['C'].sum() + expected2 = grouped2.sum().loc[:, ['A', 'B', 'C']] + assert isinstance(result2, DataFrame) + assert_frame_equal(result2, expected2) + + # corner case + pytest.raises(Exception, grouped['C'].__getitem__, 'D') + + 
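+# A minimal illustrative sketch, not one of the ported tests: it assumes the
+# shared ``df`` fixture from pandas/tests/groupby/conftest.py and shows the
+# round trip that the as_index cases above rely on.
+def test_as_index_round_trip_sketch(df):
+    # as_index=False aggregation should match as_index=True + reset_index()
+    left = df.groupby('A', as_index=False)['C'].sum()
+    right = df.groupby('A')['C'].sum().reset_index()
+    assert_frame_equal(left, right)
+
+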
+def test_groupby_as_index_cython(df): + data = df + + # single-key + grouped = data.groupby('A', as_index=False) + result = grouped.mean() + expected = data.groupby(['A']).mean() + expected.insert(0, 'A', expected.index) + expected.index = np.arange(len(expected)) + assert_frame_equal(result, expected) + + # multi-key + grouped = data.groupby(['A', 'B'], as_index=False) + result = grouped.mean() + expected = data.groupby(['A', 'B']).mean() + + arrays = lzip(*expected.index.values) + expected.insert(0, 'A', arrays[0]) + expected.insert(1, 'B', arrays[1]) + expected.index = np.arange(len(expected)) + assert_frame_equal(result, expected) + + +def test_groupby_as_index_series_scalar(df): + grouped = df.groupby(['A', 'B'], as_index=False) + + # GH #421 + + result = grouped['C'].agg(len) + expected = grouped.agg(len).loc[:, ['A', 'B', 'C']] + assert_frame_equal(result, expected) + + +def test_groupby_as_index_corner(df, ts): + pytest.raises(TypeError, ts.groupby, lambda x: x.weekday(), + as_index=False) + + pytest.raises(ValueError, df.groupby, lambda x: x.lower(), + as_index=False, axis=1) + + +def test_groupby_multiple_key(df): + df = tm.makeTimeDataFrame() + grouped = df.groupby([lambda x: x.year, lambda x: x.month, + lambda x: x.day]) + agged = grouped.sum() + assert_almost_equal(df.values, agged.values) + + grouped = df.T.groupby([lambda x: x.year, + lambda x: x.month, + lambda x: x.day], axis=1) + + agged = grouped.agg(lambda x: x.sum()) + tm.assert_index_equal(agged.index, df.columns) + assert_almost_equal(df.T.values, agged.values) + + agged = grouped.agg(lambda x: x.sum()) + assert_almost_equal(df.T.values, agged.values) + + +def test_groupby_multi_corner(df): + # test that having an all-NA column doesn't mess you up + df = df.copy() + df['bad'] = np.nan + agged = df.groupby(['A', 'B']).mean() + + expected = df.groupby(['A', 'B']).mean() + expected['bad'] = np.nan + + assert_frame_equal(agged, expected) + + +def test_omit_nuisance(df): + grouped = df.groupby('A') + + result = grouped.mean() + expected = df.loc[:, ['A', 'C', 'D']].groupby('A').mean() + assert_frame_equal(result, expected) + + agged = grouped.agg(np.mean) + exp = grouped.mean() + assert_frame_equal(agged, exp) + + df = df.loc[:, ['A', 'C', 'D']] + df['E'] = datetime.now() + grouped = df.groupby('A') + result = grouped.agg(np.sum) + expected = grouped.sum() + assert_frame_equal(result, expected) + + # won't work with axis = 1 + grouped = df.groupby({'A': 0, 'C': 0, 'D': 1, 'E': 1}, axis=1) + result = pytest.raises(TypeError, grouped.agg, + lambda x: x.sum(0, numeric_only=False)) + + +def test_omit_nuisance_python_multiple(three_group): + grouped = three_group.groupby(['A', 'B']) + + agged = grouped.agg(np.mean) + exp = grouped.mean() + assert_frame_equal(agged, exp) + + +def test_empty_groups_corner(mframe): + # handle empty groups + df = DataFrame({'k1': np.array(['b', 'b', 'b', 'a', 'a', 'a']), + 'k2': np.array(['1', '1', '1', '2', '2', '2']), + 'k3': ['foo', 'bar'] * 3, + 'v1': np.random.randn(6), + 'v2': np.random.randn(6)}) + + grouped = df.groupby(['k1', 'k2']) + result = grouped.agg(np.mean) + expected = grouped.mean() + assert_frame_equal(result, expected) + + grouped = mframe[3:5].groupby(level=0) + agged = grouped.apply(lambda x: x.mean()) + agged_A = grouped['A'].apply(np.mean) + assert_series_equal(agged['A'], agged_A) + assert agged.index.name == 'first' + + +def test_nonsense_func(): + df = DataFrame([0]) + pytest.raises(Exception, df.groupby, lambda x: x + 'foo') + + +def 
test_wrap_aggregated_output_multindex(mframe):
+    df = mframe.T
+    df['baz', 'two'] = 'peekaboo'
+
+    keys = [np.array([0, 0, 1]), np.array([0, 0, 1])]
+    agged = df.groupby(keys).agg(np.mean)
+    assert isinstance(agged.columns, MultiIndex)
+
+    def aggfun(ser):
+        if ser.name == ('foo', 'one'):
+            raise TypeError
+        else:
+            return ser.sum()
+
+    agged2 = df.groupby(keys).aggregate(aggfun)
+    assert len(agged2.columns) + 1 == len(df.columns)
+
+
+def test_groupby_level_apply(mframe):
+
+    result = mframe.groupby(level=0).count()
+    assert result.index.name == 'first'
+    result = mframe.groupby(level=1).count()
+    assert result.index.name == 'second'
+
+    result = mframe['A'].groupby(level=0).count()
+    assert result.index.name == 'first'
+
+
+def test_groupby_level_mapper(mframe):
+    deleveled = mframe.reset_index()
+
+    mapper0 = {'foo': 0, 'bar': 0, 'baz': 1, 'qux': 1}
+    mapper1 = {'one': 0, 'two': 0, 'three': 1}
+
+    result0 = mframe.groupby(mapper0, level=0).sum()
+    result1 = mframe.groupby(mapper1, level=1).sum()
+
+    mapped_level0 = np.array([mapper0.get(x) for x in deleveled['first']])
+    mapped_level1 = np.array([mapper1.get(x) for x in deleveled['second']])
+    expected0 = mframe.groupby(mapped_level0).sum()
+    expected1 = mframe.groupby(mapped_level1).sum()
+    expected0.index.name, expected1.index.name = 'first', 'second'
+
+    assert_frame_equal(result0, expected0)
+    assert_frame_equal(result1, expected1)
+
+
+def test_groupby_level_nonmulti():
+    # GH 1313, GH 13901
+    s = Series([1, 2, 3, 10, 4, 5, 20, 6],
+               Index([1, 2, 3, 1, 4, 5, 2, 6], name='foo'))
+    expected = Series([11, 22, 3, 4, 5, 6],
+                      Index(range(1, 7), name='foo'))
+
+    result = s.groupby(level=0).sum()
+    tm.assert_series_equal(result, expected)
+    result = s.groupby(level=[0]).sum()
+    tm.assert_series_equal(result, expected)
+    result = s.groupby(level=-1).sum()
+    tm.assert_series_equal(result, expected)
+    result = s.groupby(level=[-1]).sum()
+    tm.assert_series_equal(result, expected)
+
+    pytest.raises(ValueError, s.groupby, level=1)
+    pytest.raises(ValueError, s.groupby, level=-2)
+    pytest.raises(ValueError, s.groupby, level=[])
+    pytest.raises(ValueError, s.groupby, level=[0, 0])
+    pytest.raises(ValueError, s.groupby, level=[0, 1])
+    pytest.raises(ValueError, s.groupby, level=[1])
+
+
+def test_groupby_complex():
+    # GH 12902
+    a = Series(data=np.arange(4) * (1 + 2j), index=[0, 0, 1, 1])
+    expected = Series((1 + 2j, 5 + 10j))
+
+    result = a.groupby(level=0).sum()
+    assert_series_equal(result, expected)
+
+    result = a.sum(level=0)
+    assert_series_equal(result, expected)
+
+
+def test_mutate_groups():
+
+    # GH3380
+
+    df = DataFrame({
+        'cat1': ['a'] * 8 + ['b'] * 6,
+        'cat2': ['c'] * 2 + ['d'] * 2 + ['e'] * 2 + ['f'] * 2 + ['c'] * 2 +
+        ['d'] * 2 + ['e'] * 2,
+        'cat3': lmap(lambda x: 'g%s' % x, lrange(1, 15)),
+        'val': np.random.randint(100, size=14),
+    })
+
+    def f_copy(x):
+        x = x.copy()
+        x['rank'] = x.val.rank(method='min')
+        return x.groupby('cat2')['rank'].min()
+
+    def f_no_copy(x):
+        x['rank'] = x.val.rank(method='min')
+        return x.groupby('cat2')['rank'].min()
+
+    grpby_copy = df.groupby('cat1').apply(f_copy)
+    grpby_no_copy = df.groupby('cat1').apply(f_no_copy)
+    assert_series_equal(grpby_copy, grpby_no_copy)
+
+
+def test_no_mutate_but_looks_like():
+
+    # GH 8467
+    # the first shows the mutation indicator
+    # the second does not, but should yield the same results
+    df = DataFrame({'key': [1, 1, 1, 2, 2, 2, 3, 3, 3], 'value': range(9)})
+
+    result1 = df.groupby('key', group_keys=True).apply(lambda x: x[:].key)
+    result2 = df.groupby('key',
group_keys=True).apply(lambda x: x.key) + assert_series_equal(result1, result2) + + +def test_groupby_series_indexed_differently(): + s1 = Series([5.0, -9.0, 4.0, 100., -5., 55., 6.7], + index=Index(['a', 'b', 'c', 'd', 'e', 'f', 'g'])) + s2 = Series([1.0, 1.0, 4.0, 5.0, 5.0, 7.0], + index=Index(['a', 'b', 'd', 'f', 'g', 'h'])) + + grouped = s1.groupby(s2) + agged = grouped.mean() + exp = s1.groupby(s2.reindex(s1.index).get).mean() + assert_series_equal(agged, exp) + + +def test_groupby_with_hier_columns(): + tuples = list(zip(*[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', + 'qux'], ['one', 'two', 'one', 'two', 'one', 'two', + 'one', 'two']])) + index = MultiIndex.from_tuples(tuples) + columns = MultiIndex.from_tuples([('A', 'cat'), ('B', 'dog'), ( + 'B', 'cat'), ('A', 'dog')]) + df = DataFrame(np.random.randn(8, 4), index=index, columns=columns) + + result = df.groupby(level=0).mean() + tm.assert_index_equal(result.columns, columns) + + result = df.groupby(level=0, axis=1).mean() + tm.assert_index_equal(result.index, df.index) + + result = df.groupby(level=0).agg(np.mean) + tm.assert_index_equal(result.columns, columns) + + result = df.groupby(level=0).apply(lambda x: x.mean()) + tm.assert_index_equal(result.columns, columns) + + result = df.groupby(level=0, axis=1).agg(lambda x: x.mean(1)) + tm.assert_index_equal(result.columns, Index(['A', 'B'])) + tm.assert_index_equal(result.index, df.index) + + # add a nuisance column + sorted_columns, _ = columns.sortlevel(0) + df['A', 'foo'] = 'bar' + result = df.groupby(level=0).mean() + tm.assert_index_equal(result.columns, df.columns[:-1]) - def test_groupby_wrong_multi_labels(self): - data = """index,foo,bar,baz,spam,data + +def test_grouping_ndarray(df): + grouped = df.groupby(df['A'].values) + + result = grouped.sum() + expected = df.groupby('A').sum() + assert_frame_equal(result, expected, check_names=False + ) # Note: no names when grouping by value + + +def test_groupby_wrong_multi_labels(): + data = """index,foo,bar,baz,spam,data 0,foo1,bar1,baz1,spam2,20 1,foo1,bar2,baz1,spam3,30 2,foo2,bar2,baz1,spam2,40 3,foo1,bar1,baz2,spam1,50 4,foo3,bar1,baz2,spam1,60""" - data = read_csv(StringIO(data), index_col=0) - - grouped = data.groupby(['foo', 'bar', 'baz', 'spam']) - - result = grouped.agg(np.mean) - expected = grouped.mean() - assert_frame_equal(result, expected) - - def test_groupby_series_with_name(self): - result = self.df.groupby(self.df['A']).mean() - result2 = self.df.groupby(self.df['A'], as_index=False).mean() - assert result.index.name == 'A' - assert 'A' in result2 - - result = self.df.groupby([self.df['A'], self.df['B']]).mean() - result2 = self.df.groupby([self.df['A'], self.df['B']], - as_index=False).mean() - assert result.index.names == ('A', 'B') - assert 'A' in result2 - assert 'B' in result2 - - def test_seriesgroupby_name_attr(self): - # GH 6265 - result = self.df.groupby('A')['C'] - assert result.count().name == 'C' - assert result.mean().name == 'C' - - testFunc = lambda x: np.sum(x) * 2 - assert result.agg(testFunc).name == 'C' - - def test_consistency_name(self): - # GH 12363 - - df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'two', - 'two', 'two', 'one', 'two'], - 'C': np.random.randn(8) + 1.0, - 'D': np.arange(8)}) - - expected = df.groupby(['A']).B.count() - result = df.B.groupby(df.A).count() - assert_series_equal(result, expected) - - def test_groupby_name_propagation(self): - # GH 6124 - def summarize(df, name=None): - return Series({'count': 
1, 'mean': 2, 'omissions': 3, }, name=name) - - def summarize_random_name(df): - # Provide a different name for each Series. In this case, groupby - # should not attempt to propagate the Series name since they are - # inconsistent. - return Series({ - 'count': 1, - 'mean': 2, - 'omissions': 3, - }, name=df.iloc[0]['A']) - - metrics = self.df.groupby('A').apply(summarize) - assert metrics.columns.name is None - metrics = self.df.groupby('A').apply(summarize, 'metrics') - assert metrics.columns.name == 'metrics' - metrics = self.df.groupby('A').apply(summarize_random_name) - assert metrics.columns.name is None - - def test_groupby_nonstring_columns(self): - df = DataFrame([np.arange(10) for x in range(10)]) - grouped = df.groupby(0) - result = grouped.mean() - expected = df.groupby(df[0]).mean() - assert_frame_equal(result, expected) - - def test_groupby_mixed_type_columns(self): - # GH 13432, unorderable types in py3 - df = DataFrame([[0, 1, 2]], columns=['A', 'B', 0]) - expected = DataFrame([[1, 2]], columns=['B', 0], - index=Index([0], name='A')) - - result = df.groupby('A').first() - tm.assert_frame_equal(result, expected) - - result = df.groupby('A').sum() - tm.assert_frame_equal(result, expected) - - def test_cython_grouper_series_bug_noncontig(self): - arr = np.empty((100, 100)) - arr.fill(np.nan) - obj = Series(arr[:, 0], index=lrange(100)) - inds = np.tile(lrange(10), 10) - - result = obj.groupby(inds).agg(Series.median) - assert result.isna().all() - - def test_series_grouper_noncontig_index(self): - index = Index(tm.rands_array(10, 100)) - - values = Series(np.random.randn(50), index=index[::2]) - labels = np.random.randint(0, 5, 50) - - # it works! - grouped = values.groupby(labels) - - # accessing the index elements causes segfault - f = lambda x: len(set(map(id, x.index))) - grouped.agg(f) - - def test_convert_objects_leave_decimal_alone(self): - - from decimal import Decimal - - s = Series(lrange(5)) - labels = np.array(['a', 'b', 'c', 'd', 'e'], dtype='O') - - def convert_fast(x): - return Decimal(str(x.mean())) - - def convert_force_pure(x): - # base will be length 0 - assert (len(x.base) > 0) - return Decimal(str(x.mean())) - - grouped = s.groupby(labels) - - result = grouped.agg(convert_fast) - assert result.dtype == np.object_ - assert isinstance(result[0], Decimal) - - result = grouped.agg(convert_force_pure) - assert result.dtype == np.object_ - assert isinstance(result[0], Decimal) - - def test_fast_apply(self): - # make sure that fast apply is correctly called - # rather than raising any kind of error - # otherwise the python path will be callsed - # which slows things down - N = 1000 - labels = np.random.randint(0, 2000, size=N) - labels2 = np.random.randint(0, 3, size=N) - df = DataFrame({'key': labels, - 'key2': labels2, - 'value1': np.random.randn(N), - 'value2': ['foo', 'bar', 'baz', 'qux'] * (N // 4)}) - - def f(g): - return 1 - - g = df.groupby(['key', 'key2']) - - grouper = g.grouper - - splitter = grouper._get_splitter(g._selected_obj, axis=g.axis) - group_keys = grouper._get_group_keys() - - values, mutated = splitter.fast_apply(f, group_keys) - assert not mutated - - def test_apply_with_mixed_dtype(self): - # GH3480, apply with mixed dtype on axis=1 breaks in 0.11 - df = DataFrame({'foo1': np.random.randn(6), - 'foo2': ['one', 'two', 'two', 'three', 'one', 'two']}) - result = df.apply(lambda x: x, axis=1) - assert_series_equal(df.get_dtype_counts(), result.get_dtype_counts()) - - # GH 3610 incorrect dtype conversion with as_index=False - df = 
DataFrame({"c1": [1, 2, 6, 6, 8]}) - df["c2"] = df.c1 / 2.0 - result1 = df.groupby("c2").mean().reset_index().c2 - result2 = df.groupby("c2", as_index=False).mean().c2 - assert_series_equal(result1, result2) - - def test_groupby_aggregation_mixed_dtype(self): - - # GH 6212 - expected = DataFrame({ - 'v1': [5, 5, 7, np.nan, 3, 3, 4, 1], - 'v2': [55, 55, 77, np.nan, 33, 33, 44, 11]}, - index=MultiIndex.from_tuples([(1, 95), (1, 99), (2, 95), (2, 99), - ('big', 'damp'), - ('blue', 'dry'), - ('red', 'red'), ('red', 'wet')], - names=['by1', 'by2'])) - - df = DataFrame({ - 'v1': [1, 3, 5, 7, 8, 3, 5, np.nan, 4, 5, 7, 9], - 'v2': [11, 33, 55, 77, 88, 33, 55, np.nan, 44, 55, 77, 99], - 'by1': ["red", "blue", 1, 2, np.nan, "big", 1, 2, "red", 1, np.nan, - 12], - 'by2': ["wet", "dry", 99, 95, np.nan, "damp", 95, 99, "red", 99, - np.nan, np.nan] - }) - - g = df.groupby(['by1', 'by2']) - result = g[['v1', 'v2']].mean() - assert_frame_equal(result, expected) - - def test_groupby_dtype_inference_empty(self): - # GH 6733 - df = DataFrame({'x': [], 'range': np.arange(0, dtype='int64')}) - assert df['x'].dtype == np.float64 - - result = df.groupby('x').first() - exp_index = Index([], name='x', dtype=np.float64) - expected = DataFrame({'range': Series( - [], index=exp_index, dtype='int64')}) - assert_frame_equal(result, expected, by_blocks=True) - - def test_groupby_list_infer_array_like(self): - result = self.df.groupby(list(self.df['A'])).mean() - expected = self.df.groupby(self.df['A']).mean() - assert_frame_equal(result, expected, check_names=False) - - pytest.raises(Exception, self.df.groupby, list(self.df['A'][:-1])) - - # pathological case of ambiguity - df = DataFrame({'foo': [0, 1], - 'bar': [3, 4], - 'val': np.random.randn(2)}) - - result = df.groupby(['foo', 'bar']).mean() - expected = df.groupby([df['foo'], df['bar']]).mean()[['val']] - - def test_groupby_keys_same_size_as_index(self): - # GH 11185 - freq = 's' - index = pd.date_range(start=pd.Timestamp('2015-09-29T11:34:44-0700'), - periods=2, freq=freq) - df = pd.DataFrame([['A', 10], ['B', 15]], columns=[ - 'metric', 'values' - ], index=index) - result = df.groupby([pd.Grouper(level=0, freq=freq), 'metric']).mean() - expected = df.set_index([df.index, 'metric']) - - assert_frame_equal(result, expected) - - def test_groupby_one_row(self): - # GH 11741 - df1 = pd.DataFrame(np.random.randn(1, 4), columns=list('ABCD')) - pytest.raises(KeyError, df1.groupby, 'Z') - df2 = pd.DataFrame(np.random.randn(2, 4), columns=list('ABCD')) - pytest.raises(KeyError, df2.groupby, 'Z') - - def test_groupby_nat_exclude(self): - # GH 6992 - df = pd.DataFrame( - {'values': np.random.randn(8), - 'dt': [np.nan, pd.Timestamp('2013-01-01'), np.nan, pd.Timestamp( - '2013-02-01'), np.nan, pd.Timestamp('2013-02-01'), np.nan, - pd.Timestamp('2013-01-01')], - 'str': [np.nan, 'a', np.nan, 'a', np.nan, 'a', np.nan, 'b']}) - grouped = df.groupby('dt') - - expected = [pd.Index([1, 7]), pd.Index([3, 5])] - keys = sorted(grouped.groups.keys()) - assert len(keys) == 2 - for k, e in zip(keys, expected): - # grouped.groups keys are np.datetime64 with system tz - # not to be affected by tz, only compare values - tm.assert_index_equal(grouped.groups[k], e) - - # confirm obj is not filtered - tm.assert_frame_equal(grouped.grouper.groupings[0].obj, df) - assert grouped.ngroups == 2 - - expected = { - Timestamp('2013-01-01 00:00:00'): np.array([1, 7], dtype=np.int64), - Timestamp('2013-02-01 00:00:00'): np.array([3, 5], dtype=np.int64) - } - - for k in grouped.indices: - 
tm.assert_numpy_array_equal(grouped.indices[k], expected[k]) - - tm.assert_frame_equal( - grouped.get_group(Timestamp('2013-01-01')), df.iloc[[1, 7]]) - tm.assert_frame_equal( - grouped.get_group(Timestamp('2013-02-01')), df.iloc[[3, 5]]) + data = read_csv(StringIO(data), index_col=0) + + grouped = data.groupby(['foo', 'bar', 'baz', 'spam']) + + result = grouped.agg(np.mean) + expected = grouped.mean() + assert_frame_equal(result, expected) + + +def test_groupby_series_with_name(df): + result = df.groupby(df['A']).mean() + result2 = df.groupby(df['A'], as_index=False).mean() + assert result.index.name == 'A' + assert 'A' in result2 + + result = df.groupby([df['A'], df['B']]).mean() + result2 = df.groupby([df['A'], df['B']], + as_index=False).mean() + assert result.index.names == ('A', 'B') + assert 'A' in result2 + assert 'B' in result2 + + +def test_seriesgroupby_name_attr(df): + # GH 6265 + result = df.groupby('A')['C'] + assert result.count().name == 'C' + assert result.mean().name == 'C' + + testFunc = lambda x: np.sum(x) * 2 + assert result.agg(testFunc).name == 'C' + + +def test_consistency_name(): + # GH 12363 + + df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'two', + 'two', 'two', 'one', 'two'], + 'C': np.random.randn(8) + 1.0, + 'D': np.arange(8)}) + + expected = df.groupby(['A']).B.count() + result = df.B.groupby(df.A).count() + assert_series_equal(result, expected) + + +def test_groupby_name_propagation(df): + # GH 6124 + def summarize(df, name=None): + return Series({'count': 1, 'mean': 2, 'omissions': 3, }, name=name) + + def summarize_random_name(df): + # Provide a different name for each Series. In this case, groupby + # should not attempt to propagate the Series name since they are + # inconsistent. + return Series({ + 'count': 1, + 'mean': 2, + 'omissions': 3, + }, name=df.iloc[0]['A']) + + metrics = df.groupby('A').apply(summarize) + assert metrics.columns.name is None + metrics = df.groupby('A').apply(summarize, 'metrics') + assert metrics.columns.name == 'metrics' + metrics = df.groupby('A').apply(summarize_random_name) + assert metrics.columns.name is None + + +def test_groupby_nonstring_columns(): + df = DataFrame([np.arange(10) for x in range(10)]) + grouped = df.groupby(0) + result = grouped.mean() + expected = df.groupby(df[0]).mean() + assert_frame_equal(result, expected) + + +def test_groupby_mixed_type_columns(): + # GH 13432, unorderable types in py3 + df = DataFrame([[0, 1, 2]], columns=['A', 'B', 0]) + expected = DataFrame([[1, 2]], columns=['B', 0], + index=Index([0], name='A')) + + result = df.groupby('A').first() + tm.assert_frame_equal(result, expected) + + result = df.groupby('A').sum() + tm.assert_frame_equal(result, expected) + + +def test_cython_grouper_series_bug_noncontig(): + arr = np.empty((100, 100)) + arr.fill(np.nan) + obj = Series(arr[:, 0], index=lrange(100)) + inds = np.tile(lrange(10), 10) + + result = obj.groupby(inds).agg(Series.median) + assert result.isna().all() + + +def test_series_grouper_noncontig_index(): + index = Index(tm.rands_array(10, 100)) + + values = Series(np.random.randn(50), index=index[::2]) + labels = np.random.randint(0, 5, 50) + + # it works! 
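+    # (regression guard: values.index is a non-contiguous slice view of the
+    # original index, which is what used to trigger the segfault)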
+    grouped = values.groupby(labels)
+
+    # accessing the index elements causes segfault
+    f = lambda x: len(set(map(id, x.index)))
+    grouped.agg(f)
+
+
+def test_convert_objects_leave_decimal_alone():
+
+    s = Series(lrange(5))
+    labels = np.array(['a', 'b', 'c', 'd', 'e'], dtype='O')
+
+    def convert_fast(x):
+        return Decimal(str(x.mean()))
+
+    def convert_force_pure(x):
+        # base will be length 0
+        assert (len(x.base) > 0)
+        return Decimal(str(x.mean()))
+
+    grouped = s.groupby(labels)
+
+    result = grouped.agg(convert_fast)
+    assert result.dtype == np.object_
+    assert isinstance(result[0], Decimal)
+
+    result = grouped.agg(convert_force_pure)
+    assert result.dtype == np.object_
+    assert isinstance(result[0], Decimal)
+
+
+def test_groupby_dtype_inference_empty():
+    # GH 6733
+    df = DataFrame({'x': [], 'range': np.arange(0, dtype='int64')})
+    assert df['x'].dtype == np.float64
+
+    result = df.groupby('x').first()
+    exp_index = Index([], name='x', dtype=np.float64)
+    expected = DataFrame({'range': Series(
+        [], index=exp_index, dtype='int64')})
+    assert_frame_equal(result, expected, by_blocks=True)
+
+
+def test_groupby_list_infer_array_like(df):
+    result = df.groupby(list(df['A'])).mean()
+    expected = df.groupby(df['A']).mean()
+    assert_frame_equal(result, expected, check_names=False)
+
+    pytest.raises(Exception, df.groupby, list(df['A'][:-1]))
+
+    # pathological case of ambiguity
+    df = DataFrame({'foo': [0, 1],
+                    'bar': [3, 4],
+                    'val': np.random.randn(2)})
+
+    result = df.groupby(['foo', 'bar']).mean()
+    expected = df.groupby([df['foo'], df['bar']]).mean()[['val']]
+    assert_frame_equal(result, expected)
+
+
+def test_groupby_keys_same_size_as_index():
+    # GH 11185
+    freq = 's'
+    index = pd.date_range(start=pd.Timestamp('2015-09-29T11:34:44-0700'),
+                          periods=2, freq=freq)
+    df = pd.DataFrame([['A', 10], ['B', 15]], columns=[
+        'metric', 'values'
+    ], index=index)
+    result = df.groupby([pd.Grouper(level=0, freq=freq), 'metric']).mean()
+    expected = df.set_index([df.index, 'metric'])
+
+    assert_frame_equal(result, expected)
+
+
+def test_groupby_one_row():
+    # GH 11741
+    df1 = pd.DataFrame(np.random.randn(1, 4), columns=list('ABCD'))
+    pytest.raises(KeyError, df1.groupby, 'Z')
+    df2 = pd.DataFrame(np.random.randn(2, 4), columns=list('ABCD'))
+    pytest.raises(KeyError, df2.groupby, 'Z')
+
+
+def test_groupby_nat_exclude():
+    # GH 6992
+    df = pd.DataFrame(
+        {'values': np.random.randn(8),
+         'dt': [np.nan, pd.Timestamp('2013-01-01'), np.nan, pd.Timestamp(
+             '2013-02-01'), np.nan, pd.Timestamp('2013-02-01'), np.nan,
+             pd.Timestamp('2013-01-01')],
+         'str': [np.nan, 'a', np.nan, 'a', np.nan, 'a', np.nan, 'b']})
+    grouped = df.groupby('dt')
+
+    expected = [pd.Index([1, 7]), pd.Index([3, 5])]
+    keys = sorted(grouped.groups.keys())
+    assert len(keys) == 2
+    for k, e in zip(keys, expected):
+        # grouped.groups keys are np.datetime64 with system tz
+        # not to be affected by tz, only compare values
+        tm.assert_index_equal(grouped.groups[k], e)
+
+    # confirm obj is not filtered
+    tm.assert_frame_equal(grouped.grouper.groupings[0].obj, df)
+    assert grouped.ngroups == 2
+
+    expected = {
+        Timestamp('2013-01-01 00:00:00'): np.array([1, 7], dtype=np.int64),
+        Timestamp('2013-02-01 00:00:00'): np.array([3, 5], dtype=np.int64)
+    }
+
+    for k in grouped.indices:
+        tm.assert_numpy_array_equal(grouped.indices[k], expected[k])
+
+    tm.assert_frame_equal(
+        grouped.get_group(Timestamp('2013-01-01')), df.iloc[[1, 7]])
+    tm.assert_frame_equal(
+        grouped.get_group(Timestamp('2013-02-01')), df.iloc[[3, 5]])
+
+    pytest.raises(KeyError, grouped.get_group, pd.NaT)
+
+    
nan_df = DataFrame({'nan': [np.nan, np.nan, np.nan], + 'nat': [pd.NaT, pd.NaT, pd.NaT]}) + assert nan_df['nan'].dtype == 'float64' + assert nan_df['nat'].dtype == 'datetime64[ns]' + + for key in ['nan', 'nat']: + grouped = nan_df.groupby(key) + assert grouped.groups == {} + assert grouped.ngroups == 0 + assert grouped.indices == {} + pytest.raises(KeyError, grouped.get_group, np.nan) pytest.raises(KeyError, grouped.get_group, pd.NaT) - nan_df = DataFrame({'nan': [np.nan, np.nan, np.nan], - 'nat': [pd.NaT, pd.NaT, pd.NaT]}) - assert nan_df['nan'].dtype == 'float64' - assert nan_df['nat'].dtype == 'datetime64[ns]' - - for key in ['nan', 'nat']: - grouped = nan_df.groupby(key) - assert grouped.groups == {} - assert grouped.ngroups == 0 - assert grouped.indices == {} - pytest.raises(KeyError, grouped.get_group, np.nan) - pytest.raises(KeyError, grouped.get_group, pd.NaT) - - def test_sparse_friendly(self): - sdf = self.df[['C', 'D']].to_sparse() - with catch_warnings(record=True): - panel = tm.makePanel() - tm.add_nans(panel) - - def _check_work(gp): - gp.mean() - gp.agg(np.mean) - dict(iter(gp)) - - # it works! - _check_work(sdf.groupby(lambda x: x // 2)) - _check_work(sdf['C'].groupby(lambda x: x // 2)) - _check_work(sdf.groupby(self.df['A'])) - - # do this someday - # _check_work(panel.groupby(lambda x: x.month, axis=1)) - - def test_panel_groupby(self): - with catch_warnings(record=True): - self.panel = tm.makePanel() - tm.add_nans(self.panel) - grouped = self.panel.groupby({'ItemA': 0, 'ItemB': 0, 'ItemC': 1}, - axis='items') - agged = grouped.mean() - agged2 = grouped.agg(lambda x: x.mean('items')) - - tm.assert_panel_equal(agged, agged2) - - tm.assert_index_equal(agged.items, Index([0, 1])) - - grouped = self.panel.groupby(lambda x: x.month, axis='major') - agged = grouped.mean() - - exp = Index(sorted(list(set(self.panel.major_axis.month)))) - tm.assert_index_equal(agged.major_axis, exp) - - grouped = self.panel.groupby({'A': 0, 'B': 0, 'C': 1, 'D': 1}, - axis='minor') - agged = grouped.mean() - tm.assert_index_equal(agged.minor_axis, Index([0, 1])) - - def test_groupby_2d_malformed(self): - d = DataFrame(index=lrange(2)) - d['group'] = ['g1', 'g2'] - d['zeros'] = [0, 0] - d['ones'] = [1, 1] - d['label'] = ['l1', 'l2'] - tmp = d.groupby(['group']).mean() - res_values = np.array([[0, 1], [0, 1]], dtype=np.int64) - tm.assert_index_equal(tmp.columns, Index(['zeros', 'ones'])) - tm.assert_numpy_array_equal(tmp.values, res_values) - - def test_int32_overflow(self): - B = np.concatenate((np.arange(10000), np.arange(10000), np.arange(5000) - )) - A = np.arange(25000) - df = DataFrame({'A': A, - 'B': B, - 'C': A, - 'D': B, - 'E': np.random.randn(25000)}) - - left = df.groupby(['A', 'B', 'C', 'D']).sum() - right = df.groupby(['D', 'C', 'B', 'A']).sum() - assert len(left) == len(right) - - def test_groupby_sort_multi(self): - df = DataFrame({'a': ['foo', 'bar', 'baz'], - 'b': [3, 2, 1], - 'c': [0, 1, 2], - 'd': np.random.randn(3)}) - - tups = lmap(tuple, df[['a', 'b', 'c']].values) - tups = com._asarray_tuplesafe(tups) - result = df.groupby(['a', 'b', 'c'], sort=True).sum() - tm.assert_numpy_array_equal(result.index.values, tups[[1, 2, 0]]) - tups = lmap(tuple, df[['c', 'a', 'b']].values) - tups = com._asarray_tuplesafe(tups) - result = df.groupby(['c', 'a', 'b'], sort=True).sum() - tm.assert_numpy_array_equal(result.index.values, tups) +def test_sparse_friendly(df): + sdf = df[['C', 'D']].to_sparse() + with catch_warnings(record=True): + panel = tm.makePanel() + tm.add_nans(panel) + + def 
_check_work(gp): + gp.mean() + gp.agg(np.mean) + dict(iter(gp)) + + # it works! + _check_work(sdf.groupby(lambda x: x // 2)) + _check_work(sdf['C'].groupby(lambda x: x // 2)) + _check_work(sdf.groupby(df['A'])) + + # do this someday + # _check_work(panel.groupby(lambda x: x.month, axis=1)) + + +def test_panel_groupby(): + with catch_warnings(record=True): + panel = tm.makePanel() + tm.add_nans(panel) + grouped = panel.groupby({'ItemA': 0, 'ItemB': 0, 'ItemC': 1}, + axis='items') + agged = grouped.mean() + agged2 = grouped.agg(lambda x: x.mean('items')) + + tm.assert_panel_equal(agged, agged2) + + tm.assert_index_equal(agged.items, Index([0, 1])) - tups = lmap(tuple, df[['b', 'c', 'a']].values) + grouped = panel.groupby(lambda x: x.month, axis='major') + agged = grouped.mean() + + exp = Index(sorted(list(set(panel.major_axis.month)))) + tm.assert_index_equal(agged.major_axis, exp) + + grouped = panel.groupby({'A': 0, 'B': 0, 'C': 1, 'D': 1}, + axis='minor') + agged = grouped.mean() + tm.assert_index_equal(agged.minor_axis, Index([0, 1])) + + +def test_groupby_2d_malformed(): + d = DataFrame(index=lrange(2)) + d['group'] = ['g1', 'g2'] + d['zeros'] = [0, 0] + d['ones'] = [1, 1] + d['label'] = ['l1', 'l2'] + tmp = d.groupby(['group']).mean() + res_values = np.array([[0, 1], [0, 1]], dtype=np.int64) + tm.assert_index_equal(tmp.columns, Index(['zeros', 'ones'])) + tm.assert_numpy_array_equal(tmp.values, res_values) + + +def test_int32_overflow(): + B = np.concatenate((np.arange(10000), np.arange(10000), np.arange(5000) + )) + A = np.arange(25000) + df = DataFrame({'A': A, + 'B': B, + 'C': A, + 'D': B, + 'E': np.random.randn(25000)}) + + left = df.groupby(['A', 'B', 'C', 'D']).sum() + right = df.groupby(['D', 'C', 'B', 'A']).sum() + assert len(left) == len(right) + + +def test_groupby_sort_multi(): + df = DataFrame({'a': ['foo', 'bar', 'baz'], + 'b': [3, 2, 1], + 'c': [0, 1, 2], + 'd': np.random.randn(3)}) + + tups = lmap(tuple, df[['a', 'b', 'c']].values) + tups = com._asarray_tuplesafe(tups) + result = df.groupby(['a', 'b', 'c'], sort=True).sum() + tm.assert_numpy_array_equal(result.index.values, tups[[1, 2, 0]]) + + tups = lmap(tuple, df[['c', 'a', 'b']].values) + tups = com._asarray_tuplesafe(tups) + result = df.groupby(['c', 'a', 'b'], sort=True).sum() + tm.assert_numpy_array_equal(result.index.values, tups) + + tups = lmap(tuple, df[['b', 'c', 'a']].values) + tups = com._asarray_tuplesafe(tups) + result = df.groupby(['b', 'c', 'a'], sort=True).sum() + tm.assert_numpy_array_equal(result.index.values, tups[[2, 1, 0]]) + + df = DataFrame({'a': [0, 1, 2, 0, 1, 2], + 'b': [0, 0, 0, 1, 1, 1], + 'd': np.random.randn(6)}) + grouped = df.groupby(['a', 'b'])['d'] + result = grouped.sum() + + def _check_groupby(df, result, keys, field, f=lambda x: x.sum()): + tups = lmap(tuple, df[keys].values) tups = com._asarray_tuplesafe(tups) - result = df.groupby(['b', 'c', 'a'], sort=True).sum() - tm.assert_numpy_array_equal(result.index.values, tups[[2, 1, 0]]) - - df = DataFrame({'a': [0, 1, 2, 0, 1, 2], - 'b': [0, 0, 0, 1, 1, 1], - 'd': np.random.randn(6)}) - grouped = df.groupby(['a', 'b'])['d'] - result = grouped.sum() - _check_groupby(df, result, ['a', 'b'], 'd') - - def test_intercept_builtin_sum(self): - s = Series([1., 2., np.nan, 3.]) - grouped = s.groupby([0, 1, 2, 2]) - - result = grouped.agg(builtins.sum) - result2 = grouped.apply(builtins.sum) - expected = grouped.sum() - assert_series_equal(result, expected) - assert_series_equal(result2, expected) - - def test_rank_apply(self): - lev1 = 
tm.rands_array(10, 100) - lev2 = tm.rands_array(10, 130) - lab1 = np.random.randint(0, 100, size=500) - lab2 = np.random.randint(0, 130, size=500) - - df = DataFrame({'value': np.random.randn(500), - 'key1': lev1.take(lab1), - 'key2': lev2.take(lab2)}) - - result = df.groupby(['key1', 'key2']).value.rank() - - expected = [] - for key, piece in df.groupby(['key1', 'key2']): - expected.append(piece.value.rank()) - expected = concat(expected, axis=0) - expected = expected.reindex(result.index) - assert_series_equal(result, expected) - - result = df.groupby(['key1', 'key2']).value.rank(pct=True) - - expected = [] - for key, piece in df.groupby(['key1', 'key2']): - expected.append(piece.value.rank(pct=True)) - expected = concat(expected, axis=0) - expected = expected.reindex(result.index) - assert_series_equal(result, expected) - - @pytest.mark.parametrize("grps", [ - ['qux'], ['qux', 'quux']]) - @pytest.mark.parametrize("vals", [ - [2, 2, 8, 2, 6], - [pd.Timestamp('2018-01-02'), pd.Timestamp('2018-01-02'), - pd.Timestamp('2018-01-08'), pd.Timestamp('2018-01-02'), - pd.Timestamp('2018-01-06')]]) - @pytest.mark.parametrize("ties_method,ascending,pct,exp", [ - ('average', True, False, [2., 2., 5., 2., 4.]), - ('average', True, True, [0.4, 0.4, 1.0, 0.4, 0.8]), - ('average', False, False, [4., 4., 1., 4., 2.]), - ('average', False, True, [.8, .8, .2, .8, .4]), - ('min', True, False, [1., 1., 5., 1., 4.]), - ('min', True, True, [0.2, 0.2, 1.0, 0.2, 0.8]), - ('min', False, False, [3., 3., 1., 3., 2.]), - ('min', False, True, [.6, .6, .2, .6, .4]), - ('max', True, False, [3., 3., 5., 3., 4.]), - ('max', True, True, [0.6, 0.6, 1.0, 0.6, 0.8]), - ('max', False, False, [5., 5., 1., 5., 2.]), - ('max', False, True, [1., 1., .2, 1., .4]), - ('first', True, False, [1., 2., 5., 3., 4.]), - ('first', True, True, [0.2, 0.4, 1.0, 0.6, 0.8]), - ('first', False, False, [3., 4., 1., 5., 2.]), - ('first', False, True, [.6, .8, .2, 1., .4]), - ('dense', True, False, [1., 1., 3., 1., 2.]), - ('dense', True, True, [0.2, 0.2, 0.6, 0.2, 0.4]), - ('dense', False, False, [3., 3., 1., 3., 2.]), - ('dense', False, True, [.6, .6, .2, .6, .4]), - ]) - def test_rank_args(self, grps, vals, ties_method, ascending, pct, exp): - key = np.repeat(grps, len(vals)) - vals = vals * len(grps) - df = DataFrame({'key': key, 'val': vals}) - result = df.groupby('key').rank(method=ties_method, - ascending=ascending, pct=pct) - - exp_df = DataFrame(exp * len(grps), columns=['val']) - assert_frame_equal(result, exp_df) - - @pytest.mark.parametrize("grps", [ - ['qux'], ['qux', 'quux']]) - @pytest.mark.parametrize("vals", [ - [-np.inf, -np.inf, np.nan, 1., np.nan, np.inf, np.inf], - ]) - @pytest.mark.parametrize("ties_method,ascending,na_option,exp", [ - ('average', True, 'keep', [1.5, 1.5, np.nan, 3, np.nan, 4.5, 4.5]), - ('average', True, 'top', [3.5, 3.5, 1.5, 5., 1.5, 6.5, 6.5]), - ('average', True, 'bottom', [1.5, 1.5, 6.5, 3., 6.5, 4.5, 4.5]), - ('average', False, 'keep', [4.5, 4.5, np.nan, 3, np.nan, 1.5, 1.5]), - ('average', False, 'top', [6.5, 6.5, 1.5, 5., 1.5, 3.5, 3.5]), - ('average', False, 'bottom', [4.5, 4.5, 6.5, 3., 6.5, 1.5, 1.5]), - ('min', True, 'keep', [1., 1., np.nan, 3., np.nan, 4., 4.]), - ('min', True, 'top', [3., 3., 1., 5., 1., 6., 6.]), - ('min', True, 'bottom', [1., 1., 6., 3., 6., 4., 4.]), - ('min', False, 'keep', [4., 4., np.nan, 3., np.nan, 1., 1.]), - ('min', False, 'top', [6., 6., 1., 5., 1., 3., 3.]), - ('min', False, 'bottom', [4., 4., 6., 3., 6., 1., 1.]), - ('max', True, 'keep', [2., 2., np.nan, 3., 
np.nan, 5., 5.]), - ('max', True, 'top', [4., 4., 2., 5., 2., 7., 7.]), - ('max', True, 'bottom', [2., 2., 7., 3., 7., 5., 5.]), - ('max', False, 'keep', [5., 5., np.nan, 3., np.nan, 2., 2.]), - ('max', False, 'top', [7., 7., 2., 5., 2., 4., 4.]), - ('max', False, 'bottom', [5., 5., 7., 3., 7., 2., 2.]), - ('first', True, 'keep', [1., 2., np.nan, 3., np.nan, 4., 5.]), - ('first', True, 'top', [3., 4., 1., 5., 2., 6., 7.]), - ('first', True, 'bottom', [1., 2., 6., 3., 7., 4., 5.]), - ('first', False, 'keep', [4., 5., np.nan, 3., np.nan, 1., 2.]), - ('first', False, 'top', [6., 7., 1., 5., 2., 3., 4.]), - ('first', False, 'bottom', [4., 5., 6., 3., 7., 1., 2.]), - ('dense', True, 'keep', [1., 1., np.nan, 2., np.nan, 3., 3.]), - ('dense', True, 'top', [2., 2., 1., 3., 1., 4., 4.]), - ('dense', True, 'bottom', [1., 1., 4., 2., 4., 3., 3.]), - ('dense', False, 'keep', [3., 3., np.nan, 2., np.nan, 1., 1.]), - ('dense', False, 'top', [4., 4., 1., 3., 1., 2., 2.]), - ('dense', False, 'bottom', [3., 3., 4., 2., 4., 1., 1.]) - ]) - def test_infs_n_nans(self, grps, vals, ties_method, ascending, na_option, - exp): - # GH 20561 - key = np.repeat(grps, len(vals)) - vals = vals * len(grps) - df = DataFrame({'key': key, 'val': vals}) - result = df.groupby('key').rank(method=ties_method, - ascending=ascending, - na_option=na_option) - exp_df = DataFrame(exp * len(grps), columns=['val']) - assert_frame_equal(result, exp_df) - - @pytest.mark.parametrize("grps", [ - ['qux'], ['qux', 'quux']]) - @pytest.mark.parametrize("vals", [ - [2, 2, np.nan, 8, 2, 6, np.nan, np.nan], # floats - [pd.Timestamp('2018-01-02'), pd.Timestamp('2018-01-02'), np.nan, - pd.Timestamp('2018-01-08'), pd.Timestamp('2018-01-02'), - pd.Timestamp('2018-01-06'), np.nan, np.nan] - ]) - @pytest.mark.parametrize("ties_method,ascending,na_option,pct,exp", [ - ('average', True, 'keep', False, - [2., 2., np.nan, 5., 2., 4., np.nan, np.nan]), - ('average', True, 'keep', True, - [0.4, 0.4, np.nan, 1.0, 0.4, 0.8, np.nan, np.nan]), - ('average', False, 'keep', False, - [4., 4., np.nan, 1., 4., 2., np.nan, np.nan]), - ('average', False, 'keep', True, - [.8, 0.8, np.nan, 0.2, 0.8, 0.4, np.nan, np.nan]), - ('min', True, 'keep', False, - [1., 1., np.nan, 5., 1., 4., np.nan, np.nan]), - ('min', True, 'keep', True, - [0.2, 0.2, np.nan, 1.0, 0.2, 0.8, np.nan, np.nan]), - ('min', False, 'keep', False, - [3., 3., np.nan, 1., 3., 2., np.nan, np.nan]), - ('min', False, 'keep', True, - [.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan]), - ('max', True, 'keep', False, - [3., 3., np.nan, 5., 3., 4., np.nan, np.nan]), - ('max', True, 'keep', True, - [0.6, 0.6, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan]), - ('max', False, 'keep', False, - [5., 5., np.nan, 1., 5., 2., np.nan, np.nan]), - ('max', False, 'keep', True, - [1., 1., np.nan, 0.2, 1., 0.4, np.nan, np.nan]), - ('first', True, 'keep', False, - [1., 2., np.nan, 5., 3., 4., np.nan, np.nan]), - ('first', True, 'keep', True, - [0.2, 0.4, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan]), - ('first', False, 'keep', False, - [3., 4., np.nan, 1., 5., 2., np.nan, np.nan]), - ('first', False, 'keep', True, - [.6, 0.8, np.nan, 0.2, 1., 0.4, np.nan, np.nan]), - ('dense', True, 'keep', False, - [1., 1., np.nan, 3., 1., 2., np.nan, np.nan]), - ('dense', True, 'keep', True, - [0.2, 0.2, np.nan, 0.6, 0.2, 0.4, np.nan, np.nan]), - ('dense', False, 'keep', False, - [3., 3., np.nan, 1., 3., 2., np.nan, np.nan]), - ('dense', False, 'keep', True, - [.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan]), - ('average', True, 'no_na', False, [2., 2., 
7., 5., 2., 4., 7., 7.]), - ('average', True, 'no_na', True, - [0.25, 0.25, 0.875, 0.625, 0.25, 0.5, 0.875, 0.875]), - ('average', False, 'no_na', False, [4., 4., 7., 1., 4., 2., 7., 7.]), - ('average', False, 'no_na', True, - [0.5, 0.5, 0.875, 0.125, 0.5, 0.25, 0.875, 0.875]), - ('min', True, 'no_na', False, [1., 1., 6., 5., 1., 4., 6., 6.]), - ('min', True, 'no_na', True, - [0.125, 0.125, 0.75, 0.625, 0.125, 0.5, 0.75, 0.75]), - ('min', False, 'no_na', False, [3., 3., 6., 1., 3., 2., 6., 6.]), - ('min', False, 'no_na', True, - [0.375, 0.375, 0.75, 0.125, 0.375, 0.25, 0.75, 0.75]), - ('max', True, 'no_na', False, [3., 3., 8., 5., 3., 4., 8., 8.]), - ('max', True, 'no_na', True, - [0.375, 0.375, 1., 0.625, 0.375, 0.5, 1., 1.]), - ('max', False, 'no_na', False, [5., 5., 8., 1., 5., 2., 8., 8.]), - ('max', False, 'no_na', True, - [0.625, 0.625, 1., 0.125, 0.625, 0.25, 1., 1.]), - ('first', True, 'no_na', False, [1., 2., 6., 5., 3., 4., 7., 8.]), - ('first', True, 'no_na', True, - [0.125, 0.25, 0.75, 0.625, 0.375, 0.5, 0.875, 1.]), - ('first', False, 'no_na', False, [3., 4., 6., 1., 5., 2., 7., 8.]), - ('first', False, 'no_na', True, - [0.375, 0.5, 0.75, 0.125, 0.625, 0.25, 0.875, 1.]), - ('dense', True, 'no_na', False, [1., 1., 4., 3., 1., 2., 4., 4.]), - ('dense', True, 'no_na', True, - [0.125, 0.125, 0.5, 0.375, 0.125, 0.25, 0.5, 0.5]), - ('dense', False, 'no_na', False, [3., 3., 4., 1., 3., 2., 4., 4.]), - ('dense', False, 'no_na', True, - [0.375, 0.375, 0.5, 0.125, 0.375, 0.25, 0.5, 0.5]) - ]) - def test_rank_args_missing(self, grps, vals, ties_method, ascending, - na_option, pct, exp): - key = np.repeat(grps, len(vals)) - vals = vals * len(grps) - df = DataFrame({'key': key, 'val': vals}) - result = df.groupby('key').rank(method=ties_method, - ascending=ascending, - na_option=na_option, pct=pct) - - exp_df = DataFrame(exp * len(grps), columns=['val']) - assert_frame_equal(result, exp_df) - - @pytest.mark.parametrize("pct,exp", [ - (False, [3., 3., 3., 3., 3.]), - (True, [.6, .6, .6, .6, .6])]) - def test_rank_resets_each_group(self, pct, exp): - df = DataFrame( - {'key': ['a', 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'b'], - 'val': [1] * 10} - ) - result = df.groupby('key').rank(pct=pct) - exp_df = DataFrame(exp * 2, columns=['val']) - assert_frame_equal(result, exp_df) - - def test_rank_avg_even_vals(self): - df = DataFrame({'key': ['a'] * 4, 'val': [1] * 4}) - result = df.groupby('key').rank() - exp_df = DataFrame([2.5, 2.5, 2.5, 2.5], columns=['val']) - assert_frame_equal(result, exp_df) - - @pytest.mark.parametrize("ties_method", [ - 'average', 'min', 'max', 'first', 'dense']) - @pytest.mark.parametrize("ascending", [True, False]) - @pytest.mark.parametrize("na_option", ["keep", "top", "bottom"]) - @pytest.mark.parametrize("pct", [True, False]) - @pytest.mark.parametrize("vals", [ - ['bar', 'bar', 'foo', 'bar', 'baz'], - ['bar', np.nan, 'foo', np.nan, 'baz'] - ]) - def test_rank_object_raises(self, ties_method, ascending, na_option, - pct, vals): - df = DataFrame({'key': ['foo'] * 5, 'val': vals}) - with tm.assert_raises_regex(TypeError, "not callable"): - df.groupby('key').rank(method=ties_method, - ascending=ascending, - na_option=na_option, pct=pct) - - @pytest.mark.parametrize("agg_func", ['any', 'all']) - @pytest.mark.parametrize("skipna", [True, False]) - @pytest.mark.parametrize("vals", [ - ['foo', 'bar', 'baz'], ['foo', '', ''], ['', '', ''], - [1, 2, 3], [1, 0, 0], [0, 0, 0], - [1., 2., 3.], [1., 0., 0.], [0., 0., 0.], - [True, True, True], [True, False, False], [False, 
False, False], - [np.nan, np.nan, np.nan] - ]) - def test_groupby_bool_aggs(self, agg_func, skipna, vals): - df = DataFrame({'key': ['a'] * 3 + ['b'] * 3, 'val': vals * 2}) - - # Figure out expectation using Python builtin - exp = getattr(compat.builtins, agg_func)(vals) - - # edge case for missing data with skipna and 'any' - if skipna and all(isna(vals)) and agg_func == 'any': - exp = False - - exp_df = DataFrame([exp] * 2, columns=['val'], index=pd.Index( - ['a', 'b'], name='key')) - result = getattr(df.groupby('key'), agg_func)(skipna=skipna) - assert_frame_equal(result, exp_df) - - def test_dont_clobber_name_column(self): - df = DataFrame({'key': ['a', 'a', 'a', 'b', 'b', 'b'], - 'name': ['foo', 'bar', 'baz'] * 2}) - - result = df.groupby('key').apply(lambda x: x) - assert_frame_equal(result, df) - - def test_skip_group_keys(self): - from pandas import concat - - tsf = tm.makeTimeDataFrame() - - grouped = tsf.groupby(lambda x: x.month, group_keys=False) - result = grouped.apply(lambda x: x.sort_values(by='A')[:3]) - - pieces = [] - for key, group in grouped: - pieces.append(group.sort_values(by='A')[:3]) - - expected = concat(pieces) - assert_frame_equal(result, expected) - - grouped = tsf['A'].groupby(lambda x: x.month, group_keys=False) - result = grouped.apply(lambda x: x.sort_values()[:3]) - - pieces = [] - for key, group in grouped: - pieces.append(group.sort_values()[:3]) - - expected = concat(pieces) - assert_series_equal(result, expected) - - def test_no_nonsense_name(self): - # GH #995 - s = self.frame['C'].copy() - s.name = None - - result = s.groupby(self.frame['A']).agg(np.sum) - assert result.name is None - - def test_multifunc_sum_bug(self): - # GH #1065 - x = DataFrame(np.arange(9).reshape(3, 3)) - x['test'] = 0 - x['fl'] = [1.3, 1.5, 1.6] - - grouped = x.groupby('test') - result = grouped.agg({'fl': 'sum', 2: 'size'}) - assert result['fl'].dtype == np.float64 - - def test_handle_dict_return_value(self): - def f(group): - return {'max': group.max(), 'min': group.min()} - - def g(group): - return Series({'max': group.max(), 'min': group.min()}) - - result = self.df.groupby('A')['C'].apply(f) - expected = self.df.groupby('A')['C'].apply(g) - - assert isinstance(result, Series) - assert_series_equal(result, expected) - - def test_set_group_name(self): - def f(group): - assert group.name is not None - return group - - def freduce(group): - assert group.name is not None - return group.sum() - - def foo(x): - return freduce(x) - - def _check_all(grouped): - # make sure all these work - grouped.apply(f) - grouped.aggregate(freduce) - grouped.aggregate({'C': freduce, 'D': freduce}) - grouped.transform(f) - - grouped['C'].apply(f) - grouped['C'].aggregate(freduce) - grouped['C'].aggregate([freduce, foo]) - grouped['C'].transform(f) + expected = f(df.groupby(tups)[field]) + for k, v in compat.iteritems(expected): + assert (result[k] == v) - _check_all(self.df.groupby('A')) - _check_all(self.df.groupby(['A', 'B'])) - - def test_group_name_available_in_inference_pass(self): - # gh-15062 - df = pd.DataFrame({'a': [0, 0, 1, 1, 2, 2], 'b': np.arange(6)}) - - names = [] - - def f(group): - names.append(group.name) - return group.copy() - - df.groupby('a', sort=False, group_keys=False).apply(f) - # we expect 2 zeros because we call ``f`` once to see if a faster route - # can be used. 
- expected_names = [0, 0, 1, 2] - assert names == expected_names - - def test_no_dummy_key_names(self): - # see gh-1291 - result = self.df.groupby(self.df['A'].values).sum() - assert result.index.name is None - - result = self.df.groupby([self.df['A'].values, self.df['B'].values - ]).sum() - assert result.index.names == (None, None) - - def test_groupby_sort_multiindex_series(self): - # series multiindex groupby sort argument was not being passed through - # _compress_group_index - # GH 9444 - index = MultiIndex(levels=[[1, 2], [1, 2]], - labels=[[0, 0, 0, 0, 1, 1], [1, 1, 0, 0, 0, 0]], - names=['a', 'b']) - mseries = Series([0, 1, 2, 3, 4, 5], index=index) - index = MultiIndex(levels=[[1, 2], [1, 2]], - labels=[[0, 0, 1], [1, 0, 0]], names=['a', 'b']) - mseries_result = Series([0, 2, 4], index=index) - - result = mseries.groupby(level=['a', 'b'], sort=False).first() - assert_series_equal(result, mseries_result) - result = mseries.groupby(level=['a', 'b'], sort=True).first() - assert_series_equal(result, mseries_result.sort_index()) - - def test_groupby_reindex_inside_function(self): - - periods = 1000 - ind = DatetimeIndex(start='2012/1/1', freq='5min', periods=periods) - df = DataFrame({'high': np.arange( - periods), 'low': np.arange(periods)}, index=ind) - - def agg_before(hour, func, fix=False): - """ - Run an aggregate func on the subset of data. - """ - - def _func(data): - d = data.loc[data.index.map( - lambda x: x.hour < 11)].dropna() - if fix: - data[data.index[0]] - if len(d) == 0: - return None - return func(d) - - return _func - - def afunc(data): - d = data.select(lambda x: x.hour < 11).dropna() - return np.max(d) - - grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day)) - closure_bad = grouped.agg({'high': agg_before(11, np.max)}) - closure_good = grouped.agg({'high': agg_before(11, np.max, True)}) - - assert_frame_equal(closure_bad, closure_good) - - def test_cython_median(self): - df = DataFrame(np.random.randn(1000)) - df.values[::2] = np.nan - - labels = np.random.randint(0, 50, size=1000).astype(float) - labels[::17] = np.nan - - result = df.groupby(labels).median() - exp = df.groupby(labels).agg(nanops.nanmedian) - assert_frame_equal(result, exp) - - df = DataFrame(np.random.randn(1000, 5)) - rs = df.groupby(labels).agg(np.median) - xp = df.groupby(labels).median() - assert_frame_equal(rs, xp) - - def test_median_empty_bins(self): - df = pd.DataFrame(np.random.randint(0, 44, 500)) - - grps = range(0, 55, 5) - bins = pd.cut(df[0], grps) - - result = df.groupby(bins).median() - expected = df.groupby(bins).agg(lambda x: x.median()) - assert_frame_equal(result, expected) - - @pytest.mark.parametrize("dtype", [ - 'int8', 'int16', 'int32', 'int64', 'float32', 'float64']) - @pytest.mark.parametrize("method,data", [ - ('first', {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}), - ('last', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}), - ('min', {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}), - ('max', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}), - ('nth', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}], - 'args': [1]}), - ('count', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 2}], - 'out_type': 'int64'}) - ]) - def test_groupby_non_arithmetic_agg_types(self, dtype, method, data): - # GH9311, GH6620 - df = pd.DataFrame( - [{'a': 1, 'b': 1}, - {'a': 1, 'b': 2}, - {'a': 2, 'b': 3}, - {'a': 2, 'b': 4}]) - - df['b'] = df.b.astype(dtype) - - if 'args' not in data: - data['args'] = [] - - if 'out_type' in data: - out_type = data['out_type'] - else: - out_type = dtype - - exp = 
data['df'] - df_out = pd.DataFrame(exp) - - df_out['b'] = df_out.b.astype(out_type) - df_out.set_index('a', inplace=True) - - grpd = df.groupby('a') - t = getattr(grpd, method)(*data['args']) - assert_frame_equal(t, df_out) - - def test_groupby_non_arithmetic_agg_intlike_precision(self): - # GH9311, GH6620 - c = 24650000000000000 - - inputs = ((Timestamp('2011-01-15 12:50:28.502376'), - Timestamp('2011-01-20 12:50:28.593448')), (1 + c, 2 + c)) - - for i in inputs: - df = pd.DataFrame([{'a': 1, 'b': i[0]}, {'a': 1, 'b': i[1]}]) - - grp_exp = {'first': {'expected': i[0]}, - 'last': {'expected': i[1]}, - 'min': {'expected': i[0]}, - 'max': {'expected': i[1]}, - 'nth': {'expected': i[1], - 'args': [1]}, - 'count': {'expected': 2}} - - for method, data in compat.iteritems(grp_exp): - if 'args' not in data: - data['args'] = [] - - grpd = df.groupby('a') - res = getattr(grpd, method)(*data['args']) - assert res.iloc[0].b == data['expected'] - - def test_groupby_multiindex_missing_pair(self): - # GH9049 - df = DataFrame({'group1': ['a', 'a', 'a', 'b'], - 'group2': ['c', 'c', 'd', 'c'], - 'value': [1, 1, 1, 5]}) - df = df.set_index(['group1', 'group2']) - df_grouped = df.groupby(level=['group1', 'group2'], sort=True) - - res = df_grouped.agg('sum') - idx = MultiIndex.from_tuples( - [('a', 'c'), ('a', 'd'), ('b', 'c')], names=['group1', 'group2']) - exp = DataFrame([[2], [1], [5]], index=idx, columns=['value']) - - tm.assert_frame_equal(res, exp) - - def test_groupby_multiindex_not_lexsorted(self): - # GH 11640 - - # define the lexsorted version - lexsorted_mi = MultiIndex.from_tuples( - [('a', ''), ('b1', 'c1'), ('b2', 'c2')], names=['b', 'c']) - lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi) - assert lexsorted_df.columns.is_lexsorted() - - # define the non-lexsorted version - not_lexsorted_df = DataFrame(columns=['a', 'b', 'c', 'd'], - data=[[1, 'b1', 'c1', 3], - [1, 'b2', 'c2', 4]]) - not_lexsorted_df = not_lexsorted_df.pivot_table( - index='a', columns=['b', 'c'], values='d') - not_lexsorted_df = not_lexsorted_df.reset_index() - assert not not_lexsorted_df.columns.is_lexsorted() - - # compare the results - tm.assert_frame_equal(lexsorted_df, not_lexsorted_df) - - expected = lexsorted_df.groupby('a').mean() - with tm.assert_produces_warning(PerformanceWarning): - result = not_lexsorted_df.groupby('a').mean() - tm.assert_frame_equal(expected, result) - - # a transforming function should work regardless of sort - # GH 14776 - df = DataFrame({'x': ['a', 'a', 'b', 'a'], - 'y': [1, 1, 2, 2], - 'z': [1, 2, 3, 4]}).set_index(['x', 'y']) - assert not df.index.is_lexsorted() - - for level in [0, 1, [0, 1]]: - for sort in [False, True]: - result = df.groupby(level=level, sort=sort).apply( - DataFrame.drop_duplicates) - expected = df - tm.assert_frame_equal(expected, result) - - result = df.sort_index().groupby(level=level, sort=sort).apply( - DataFrame.drop_duplicates) - expected = df.sort_index() - tm.assert_frame_equal(expected, result) - - def test_gb_apply_list_of_unequal_len_arrays(self): - - # GH1738 - df = DataFrame({'group1': ['a', 'a', 'a', 'b', 'b', 'b', 'a', 'a', 'a', - 'b', 'b', 'b'], - 'group2': ['c', 'c', 'd', 'd', 'd', 'e', 'c', 'c', 'd', - 'd', 'd', 'e'], - 'weight': [1.1, 2, 3, 4, 5, 6, 2, 4, 6, 8, 1, 2], - 'value': [7.1, 8, 9, 10, 11, 12, 8, 7, 6, 5, 4, 3]}) - df = df.set_index(['group1', 'group2']) - df_grouped = df.groupby(level=['group1', 'group2'], sort=True) - - def noddy(value, weight): - out = np.array(value * weight).repeat(3) - return out - - # the kernel function 
returns arrays of unequal length - # pandas sniffs the first one, sees it's an array and not - # a list, and assumed the rest are of equal length - # and so tries a vstack - - # don't die - df_grouped.apply(lambda x: noddy(x.value, x.weight)) - - def test_fill_constistency(self): - - # GH9221 - # pass thru keyword arguments to the generated wrapper - # are set if the passed kw is None (only) - df = DataFrame(index=pd.MultiIndex.from_product( - [['value1', 'value2'], date_range('2014-01-01', '2014-01-06')]), - columns=Index( - ['1', '2'], name='id')) - df['1'] = [np.nan, 1, np.nan, np.nan, 11, np.nan, np.nan, 2, np.nan, - np.nan, 22, np.nan] - df['2'] = [np.nan, 3, np.nan, np.nan, 33, np.nan, np.nan, 4, np.nan, - np.nan, 44, np.nan] - - expected = df.groupby(level=0, axis=0).fillna(method='ffill') - result = df.T.groupby(level=0, axis=1).fillna(method='ffill').T - assert_frame_equal(result, expected) - - def test_index_label_overlaps_location(self): - # checking we don't have any label/location confusion in the - # the wake of GH5375 - df = DataFrame(list('ABCDE'), index=[2, 0, 2, 1, 1]) - g = df.groupby(list('ababb')) - actual = g.filter(lambda x: len(x) > 2) - expected = df.iloc[[1, 3, 4]] - assert_frame_equal(actual, expected) - - ser = df[0] - g = ser.groupby(list('ababb')) - actual = g.filter(lambda x: len(x) > 2) - expected = ser.take([1, 3, 4]) - assert_series_equal(actual, expected) - - # ... and again, with a generic Index of floats - df.index = df.index.astype(float) - g = df.groupby(list('ababb')) - actual = g.filter(lambda x: len(x) > 2) - expected = df.iloc[[1, 3, 4]] - assert_frame_equal(actual, expected) - - ser = df[0] - g = ser.groupby(list('ababb')) - actual = g.filter(lambda x: len(x) > 2) - expected = ser.take([1, 3, 4]) - assert_series_equal(actual, expected) - - def test_groupby_cumprod(self): - # GH 4095 - df = pd.DataFrame({'key': ['b'] * 10, 'value': 2}) - - actual = df.groupby('key')['value'].cumprod() - expected = df.groupby('key')['value'].apply(lambda x: x.cumprod()) - expected.name = 'value' - tm.assert_series_equal(actual, expected) - - df = pd.DataFrame({'key': ['b'] * 100, 'value': 2}) - actual = df.groupby('key')['value'].cumprod() - # if overflows, groupby product casts to float - # while numpy passes back invalid values - df['value'] = df['value'].astype(float) - expected = df.groupby('key')['value'].apply(lambda x: x.cumprod()) - expected.name = 'value' - tm.assert_series_equal(actual, expected) - - def test_ops_general(self): - ops = [('mean', np.mean), - ('median', np.median), - ('std', np.std), - ('var', np.var), - ('sum', np.sum), - ('prod', np.prod), - ('min', np.min), - ('max', np.max), - ('first', lambda x: x.iloc[0]), - ('last', lambda x: x.iloc[-1]), - ('count', np.size), ] - try: - from scipy.stats import sem - except ImportError: - pass - else: - ops.append(('sem', sem)) - df = DataFrame(np.random.randn(1000)) - labels = np.random.randint(0, 50, size=1000).astype(float) - - for op, targop in ops: - result = getattr(df.groupby(labels), op)().astype(float) - expected = df.groupby(labels).agg(targop) - try: - tm.assert_frame_equal(result, expected) - except BaseException as exc: - exc.args += ('operation: %s' % op, ) - raise - - def test_max_nan_bug(self): - raw = """,Date,app,File -2013-04-23,2013-04-23 00:00:00,,log080001.log -2013-05-06,2013-05-06 00:00:00,,log.log -2013-05-07,2013-05-07 00:00:00,OE,xlsx""" - - df = pd.read_csv(StringIO(raw), parse_dates=[0]) - gb = df.groupby('Date') - r = gb[['File']].max() - e = gb['File'].max().to_frame() 
- tm.assert_frame_equal(r, e) - assert not r['File'].isna().any() - - def test_nlargest(self): - a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10]) - b = Series(list('a' * 5 + 'b' * 5)) - gb = a.groupby(b) - r = gb.nlargest(3) - e = Series([ - 7, 5, 3, 10, 9, 6 - ], index=MultiIndex.from_arrays([list('aaabbb'), [3, 2, 1, 9, 5, 8]])) - tm.assert_series_equal(r, e) - - a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0]) - gb = a.groupby(b) - e = Series([ - 3, 2, 1, 3, 3, 2 - ], index=MultiIndex.from_arrays([list('aaabbb'), [2, 3, 1, 6, 5, 7]])) - assert_series_equal(gb.nlargest(3, keep='last'), e) - - def test_nsmallest(self): - a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10]) - b = Series(list('a' * 5 + 'b' * 5)) - gb = a.groupby(b) - r = gb.nsmallest(3) - e = Series([ - 1, 2, 3, 0, 4, 6 - ], index=MultiIndex.from_arrays([list('aaabbb'), [0, 4, 1, 6, 7, 8]])) - tm.assert_series_equal(r, e) - - a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0]) - gb = a.groupby(b) - e = Series([ - 0, 1, 1, 0, 1, 2 - ], index=MultiIndex.from_arrays([list('aaabbb'), [4, 1, 0, 9, 8, 7]])) - assert_series_equal(gb.nsmallest(3, keep='last'), e) - - def test_transform_doesnt_clobber_ints(self): - # GH 7972 - n = 6 - x = np.arange(n) - df = DataFrame({'a': x // 2, 'b': 2.0 * x, 'c': 3.0 * x}) - df2 = DataFrame({'a': x // 2 * 1.0, 'b': 2.0 * x, 'c': 3.0 * x}) - - gb = df.groupby('a') - result = gb.transform('mean') - - gb2 = df2.groupby('a') - expected = gb2.transform('mean') - tm.assert_frame_equal(result, expected) - - def test_groupby_apply_all_none(self): - # Tests to make sure no errors if apply function returns all None - # values. Issue 9684. - test_df = DataFrame({'groups': [0, 0, 1, 1], - 'random_vars': [8, 7, 4, 5]}) - - def test_func(x): - pass - - result = test_df.groupby('groups').apply(test_func) - expected = DataFrame() - tm.assert_frame_equal(result, expected) - - def test_groupby_apply_none_first(self): - # GH 12824. Tests if apply returns None first. 
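Both apply tests here pin down the same contract: groups for which the applied
function returns None are simply dropped from the result. A minimal,
self-contained sketch of that behaviour (the helper name is illustrative):

    import pandas as pd

    df = pd.DataFrame({'groups': [1, 1, 1, 2], 'vars': [0, 1, 2, 3]})

    def first_and_last(x):
        # returning None for a too-small group drops it from the output
        if x.shape[0] < 2:
            return None
        return x.iloc[[0, -1]]

    # only the rows from group 1 survive; group 2 is absent entirely
    print(df.groupby('groups').apply(first_and_last))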
- test_df1 = DataFrame({'groups': [1, 1, 1, 2], 'vars': [0, 1, 2, 3]}) - test_df2 = DataFrame({'groups': [1, 2, 2, 2], 'vars': [0, 1, 2, 3]}) - - def test_func(x): - if x.shape[0] < 2: + _check_groupby(df, result, ['a', 'b'], 'd') + + +def test_dont_clobber_name_column(): + df = DataFrame({'key': ['a', 'a', 'a', 'b', 'b', 'b'], + 'name': ['foo', 'bar', 'baz'] * 2}) + + result = df.groupby('key').apply(lambda x: x) + assert_frame_equal(result, df) + + +def test_skip_group_keys(): + + tsf = tm.makeTimeDataFrame() + + grouped = tsf.groupby(lambda x: x.month, group_keys=False) + result = grouped.apply(lambda x: x.sort_values(by='A')[:3]) + + pieces = [] + for key, group in grouped: + pieces.append(group.sort_values(by='A')[:3]) + + expected = pd.concat(pieces) + assert_frame_equal(result, expected) + + grouped = tsf['A'].groupby(lambda x: x.month, group_keys=False) + result = grouped.apply(lambda x: x.sort_values()[:3]) + + pieces = [] + for key, group in grouped: + pieces.append(group.sort_values()[:3]) + + expected = pd.concat(pieces) + assert_series_equal(result, expected) + + +def test_no_nonsense_name(frame): + # GH #995 + s = frame['C'].copy() + s.name = None + + result = s.groupby(frame['A']).agg(np.sum) + assert result.name is None + + +def test_multifunc_sum_bug(): + # GH #1065 + x = DataFrame(np.arange(9).reshape(3, 3)) + x['test'] = 0 + x['fl'] = [1.3, 1.5, 1.6] + + grouped = x.groupby('test') + result = grouped.agg({'fl': 'sum', 2: 'size'}) + assert result['fl'].dtype == np.float64 + + +def test_handle_dict_return_value(df): + def f(group): + return {'max': group.max(), 'min': group.min()} + + def g(group): + return Series({'max': group.max(), 'min': group.min()}) + + result = df.groupby('A')['C'].apply(f) + expected = df.groupby('A')['C'].apply(g) + + assert isinstance(result, Series) + assert_series_equal(result, expected) + + +@pytest.mark.parametrize('grouper', ['A', ['A', 'B']]) +def test_set_group_name(df, grouper): + def f(group): + assert group.name is not None + return group + + def freduce(group): + assert group.name is not None + return group.sum() + + def foo(x): + return freduce(x) + + grouped = df.groupby(grouper) + + # make sure all these work + grouped.apply(f) + grouped.aggregate(freduce) + grouped.aggregate({'C': freduce, 'D': freduce}) + grouped.transform(f) + + grouped['C'].apply(f) + grouped['C'].aggregate(freduce) + grouped['C'].aggregate([freduce, foo]) + grouped['C'].transform(f) + + +def test_group_name_available_in_inference_pass(): + # gh-15062 + df = pd.DataFrame({'a': [0, 0, 1, 1, 2, 2], 'b': np.arange(6)}) + + names = [] + + def f(group): + names.append(group.name) + return group.copy() + + df.groupby('a', sort=False, group_keys=False).apply(f) + # we expect 2 zeros because we call ``f`` once to see if a faster route + # can be used. 
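A standalone sketch of the double call described above: the function passed to
``apply`` runs on the first group an extra time while groupby probes for a
faster route, so any side effects there fire twice:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'a': [0, 0, 1, 1, 2, 2], 'b': np.arange(6)})
    seen = []

    def f(group):
        seen.append(group.name)  # side effect records each invocation
        return group.copy()

    df.groupby('a', sort=False, group_keys=False).apply(f)
    print(seen)  # [0, 0, 1, 2] -- first group evaluated twice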
+ expected_names = [0, 0, 1, 2] + assert names == expected_names + + +def test_no_dummy_key_names(df): + # see gh-1291 + result = df.groupby(df['A'].values).sum() + assert result.index.name is None + + result = df.groupby([df['A'].values, df['B'].values]).sum() + assert result.index.names == (None, None) + + +def test_groupby_sort_multiindex_series(): + # series multiindex groupby sort argument was not being passed through + # _compress_group_index + # GH 9444 + index = MultiIndex(levels=[[1, 2], [1, 2]], + labels=[[0, 0, 0, 0, 1, 1], [1, 1, 0, 0, 0, 0]], + names=['a', 'b']) + mseries = Series([0, 1, 2, 3, 4, 5], index=index) + index = MultiIndex(levels=[[1, 2], [1, 2]], + labels=[[0, 0, 1], [1, 0, 0]], names=['a', 'b']) + mseries_result = Series([0, 2, 4], index=index) + + result = mseries.groupby(level=['a', 'b'], sort=False).first() + assert_series_equal(result, mseries_result) + result = mseries.groupby(level=['a', 'b'], sort=True).first() + assert_series_equal(result, mseries_result.sort_index()) + + +def test_groupby_reindex_inside_function(): + + periods = 1000 + ind = DatetimeIndex(start='2012/1/1', freq='5min', periods=periods) + df = DataFrame({'high': np.arange( + periods), 'low': np.arange(periods)}, index=ind) + + def agg_before(hour, func, fix=False): + """ + Run an aggregate func on the subset of data. + """ + + def _func(data): + d = data.loc[data.index.map( + lambda x: x.hour < 11)].dropna() + if fix: + data[data.index[0]] + if len(d) == 0: return None - return x.iloc[[0, -1]] - - result1 = test_df1.groupby('groups').apply(test_func) - result2 = test_df2.groupby('groups').apply(test_func) - index1 = MultiIndex.from_arrays([[1, 1], [0, 2]], - names=['groups', None]) - index2 = MultiIndex.from_arrays([[2, 2], [1, 3]], - names=['groups', None]) - expected1 = DataFrame({'groups': [1, 1], 'vars': [0, 2]}, - index=index1) - expected2 = DataFrame({'groups': [2, 2], 'vars': [1, 3]}, - index=index2) - tm.assert_frame_equal(result1, expected1) - tm.assert_frame_equal(result2, expected2) - - def test_groupby_preserves_sort(self): - # Test to ensure that groupby always preserves sort order of original - # object. Issue #8588 and #9651 - - df = DataFrame( - {'int_groups': [3, 1, 0, 1, 0, 3, 3, 3], - 'string_groups': ['z', 'a', 'z', 'a', 'a', 'g', 'g', 'g'], - 'ints': [8, 7, 4, 5, 2, 9, 1, 1], - 'floats': [2.3, 5.3, 6.2, -2.4, 2.2, 1.1, 1.1, 5], - 'strings': ['z', 'd', 'a', 'e', 'word', 'word2', '42', '47']}) - - # Try sorting on different types and with different group types - for sort_column in ['ints', 'floats', 'strings', ['ints', 'floats'], - ['ints', 'strings']]: - for group_column in ['int_groups', 'string_groups', - ['int_groups', 'string_groups']]: - - df = df.sort_values(by=sort_column) - - g = df.groupby(group_column) - - def test_sort(x): - assert_frame_equal(x, x.sort_values(by=sort_column)) - - g.apply(test_sort) - - def test_numpy_compat(self): - # see gh-12811 - df = pd.DataFrame({'A': [1, 2, 1], 'B': [1, 2, 3]}) - g = df.groupby('A') - - msg = "numpy operations are not valid with groupby" - - for func in ('mean', 'var', 'std', 'cumprod', 'cumsum'): - tm.assert_raises_regex(UnsupportedFunctionCall, msg, - getattr(g, func), 1, 2, 3) - tm.assert_raises_regex(UnsupportedFunctionCall, msg, - getattr(g, func), foo=1) - - def test_group_shift_with_null_key(self): - # This test is designed to replicate the segfault in issue #13813. - n_rows = 1200 - - # Generate a moderately large dataframe with occasional missing - # values in column `B`, and then group by [`A`, `B`]. 
This should - # force `-1` in `labels` array of `g.grouper.group_info` exactly - # at those places, where the group-by key is partially missing. - df = DataFrame([(i % 12, i % 3 if i % 3 else np.nan, i) - for i in range(n_rows)], dtype=float, - columns=["A", "B", "Z"], index=None) - g = df.groupby(["A", "B"]) - - expected = DataFrame([(i + 12 if i % 3 and i < n_rows - 12 - else np.nan) - for i in range(n_rows)], dtype=float, - columns=["Z"], index=None) - result = g.shift(-1) - - assert_frame_equal(result, expected) - - def test_pivot_table_values_key_error(self): - # This test is designed to replicate the error in issue #14938 - df = pd.DataFrame({'eventDate': - pd.date_range(pd.datetime.today(), - periods=20, freq='M').tolist(), - 'thename': range(0, 20)}) - - df['year'] = df.set_index('eventDate').index.year - df['month'] = df.set_index('eventDate').index.month - - with pytest.raises(KeyError): - df.reset_index().pivot_table(index='year', columns='month', - values='badname', aggfunc='count') - - def test_cummin_cummax(self): - # GH 15048 - num_types = [np.int32, np.int64, np.float32, np.float64] - num_mins = [np.iinfo(np.int32).min, np.iinfo(np.int64).min, - np.finfo(np.float32).min, np.finfo(np.float64).min] - num_max = [np.iinfo(np.int32).max, np.iinfo(np.int64).max, - np.finfo(np.float32).max, np.finfo(np.float64).max] - base_df = pd.DataFrame({'A': [1, 1, 1, 1, 2, 2, 2, 2], - 'B': [3, 4, 3, 2, 2, 3, 2, 1]}) - expected_mins = [3, 3, 3, 2, 2, 2, 2, 1] - expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3] - - for dtype, min_val, max_val in zip(num_types, num_mins, num_max): - df = base_df.astype(dtype) - - # cummin - expected = pd.DataFrame({'B': expected_mins}).astype(dtype) - result = df.groupby('A').cummin() - tm.assert_frame_equal(result, expected) - result = df.groupby('A').B.apply(lambda x: x.cummin()).to_frame() - tm.assert_frame_equal(result, expected) - - # Test cummin w/ min value for dtype - df.loc[[2, 6], 'B'] = min_val - expected.loc[[2, 3, 6, 7], 'B'] = min_val - result = df.groupby('A').cummin() - tm.assert_frame_equal(result, expected) - expected = df.groupby('A').B.apply(lambda x: x.cummin()).to_frame() - tm.assert_frame_equal(result, expected) - - # cummax - expected = pd.DataFrame({'B': expected_maxs}).astype(dtype) - result = df.groupby('A').cummax() - tm.assert_frame_equal(result, expected) - result = df.groupby('A').B.apply(lambda x: x.cummax()).to_frame() - tm.assert_frame_equal(result, expected) - - # Test cummax w/ max value for dtype - df.loc[[2, 6], 'B'] = max_val - expected.loc[[2, 3, 6, 7], 'B'] = max_val - result = df.groupby('A').cummax() - tm.assert_frame_equal(result, expected) - expected = df.groupby('A').B.apply(lambda x: x.cummax()).to_frame() - tm.assert_frame_equal(result, expected) - - # Test nan in some values - base_df.loc[[0, 2, 4, 6], 'B'] = np.nan - expected = pd.DataFrame({'B': [np.nan, 4, np.nan, 2, - np.nan, 3, np.nan, 1]}) - result = base_df.groupby('A').cummin() - tm.assert_frame_equal(result, expected) - expected = (base_df.groupby('A') - .B - .apply(lambda x: x.cummin()) - .to_frame()) - tm.assert_frame_equal(result, expected) - - expected = pd.DataFrame({'B': [np.nan, 4, np.nan, 4, - np.nan, 3, np.nan, 3]}) - result = base_df.groupby('A').cummax() - tm.assert_frame_equal(result, expected) - expected = (base_df.groupby('A') - .B - .apply(lambda x: x.cummax()) - .to_frame()) - tm.assert_frame_equal(result, expected) - - # Test nan in entire column - base_df['B'] = np.nan - expected = pd.DataFrame({'B': [np.nan] * 8}) - result = 
base_df.groupby('A').cummin() - tm.assert_frame_equal(expected, result) - result = base_df.groupby('A').B.apply(lambda x: x.cummin()).to_frame() - tm.assert_frame_equal(expected, result) - result = base_df.groupby('A').cummax() - tm.assert_frame_equal(expected, result) - result = base_df.groupby('A').B.apply(lambda x: x.cummax()).to_frame() - tm.assert_frame_equal(expected, result) - - # GH 15561 - df = pd.DataFrame(dict(a=[1], b=pd.to_datetime(['2001']))) - expected = pd.Series(pd.to_datetime('2001'), index=[0], name='b') - for method in ['cummax', 'cummin']: - result = getattr(df.groupby('a')['b'], method)() - tm.assert_series_equal(expected, result) - - # GH 15635 - df = pd.DataFrame(dict(a=[1, 2, 1], b=[2, 1, 1])) - result = df.groupby('a').b.cummax() - expected = pd.Series([2, 1, 2], name='b') - tm.assert_series_equal(result, expected) - - df = pd.DataFrame(dict(a=[1, 2, 1], b=[1, 2, 2])) - result = df.groupby('a').b.cummin() - expected = pd.Series([1, 2, 1], name='b') - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize('in_vals, out_vals', [ - - # Basics: strictly increasing (T), strictly decreasing (F), - # abs val increasing (F), non-strictly increasing (T) - ([1, 2, 5, 3, 2, 0, 4, 5, -6, 1, 1], - [True, False, False, True]), - - # Test with inf vals - ([1, 2.1, np.inf, 3, 2, np.inf, -np.inf, 5, 11, 1, -np.inf], - [True, False, True, False]), - - # Test with nan vals; should always be False - ([1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan], - [False, False, False, False]), - ]) - def test_is_monotonic_increasing(self, in_vals, out_vals): - # GH 17015 - source_dict = { - 'A': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'], - 'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'], - 'C': in_vals} - df = pd.DataFrame(source_dict) - result = df.groupby('B').C.is_monotonic_increasing - index = Index(list('abcd'), name='B') - expected = pd.Series(index=index, data=out_vals, name='C') - tm.assert_series_equal(result, expected) - - # Also check result equal to manually taking x.is_monotonic_increasing. - expected = ( - df.groupby(['B']).C.apply(lambda x: x.is_monotonic_increasing)) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize('in_vals, out_vals', [ - # Basics: strictly decreasing (T), strictly increasing (F), - # abs val decreasing (F), non-strictly increasing (T) - ([10, 9, 7, 3, 4, 5, -3, 2, 0, 1, 1], - [True, False, False, True]), - - # Test with inf vals - ([np.inf, 1, -np.inf, np.inf, 2, -3, -np.inf, 5, -3, -np.inf, -np.inf], - [True, True, False, True]), - - # Test with nan vals; should always be False - ([1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan], - [False, False, False, False]), - ]) - def test_is_monotonic_decreasing(self, in_vals, out_vals): - # GH 17015 - source_dict = { - 'A': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'], - 'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'], - 'C': in_vals} - - df = pd.DataFrame(source_dict) - result = df.groupby('B').C.is_monotonic_decreasing - index = Index(list('abcd'), name='B') - expected = pd.Series(index=index, data=out_vals, name='C') - tm.assert_series_equal(result, expected) - - def test_apply_numeric_coercion_when_datetime(self): - # In the past, group-by/apply operations have been over-eager - # in converting dtypes to numeric, in the presence of datetime - # columns. Various GH issues were filed, the reproductions - # for which are here. 
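The coercion being guarded against is easy to reproduce in isolation; a
minimal sketch of the GH 15670 shape that follows, where 'Str' should keep
object dtype even though a datetime column sits beside it:

    import pandas as pd

    df = pd.DataFrame({'Number': [1, 2],
                       'Date': ['2017-03-02'] * 2,
                       'Str': ['foo', 'inf']})
    df.Date = pd.to_datetime(df.Date)

    result = df.groupby('Number').apply(lambda x: x.iloc[0])
    print(result['Str'].dtype)  # object dtype preserved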
- - # GH 15670 - df = pd.DataFrame({'Number': [1, 2], - 'Date': ["2017-03-02"] * 2, - 'Str': ["foo", "inf"]}) - expected = df.groupby(['Number']).apply(lambda x: x.iloc[0]) - df.Date = pd.to_datetime(df.Date) - result = df.groupby(['Number']).apply(lambda x: x.iloc[0]) - tm.assert_series_equal(result['Str'], expected['Str']) - - # GH 15421 - df = pd.DataFrame({'A': [10, 20, 30], - 'B': ['foo', '3', '4'], - 'T': [pd.Timestamp("12:31:22")] * 3}) - - def get_B(g): - return g.iloc[0][['B']] - result = df.groupby('A').apply(get_B)['B'] - expected = df.B - expected.index = df.A - tm.assert_series_equal(result, expected) - - # GH 14423 - def predictions(tool): - out = pd.Series(index=['p1', 'p2', 'useTime'], dtype=object) - if 'step1' in list(tool.State): - out['p1'] = str(tool[tool.State == 'step1'].Machine.values[0]) - if 'step2' in list(tool.State): - out['p2'] = str(tool[tool.State == 'step2'].Machine.values[0]) - out['useTime'] = str( - tool[tool.State == 'step2'].oTime.values[0]) - return out - df1 = pd.DataFrame({'Key': ['B', 'B', 'A', 'A'], - 'State': ['step1', 'step2', 'step1', 'step2'], - 'oTime': ['', '2016-09-19 05:24:33', - '', '2016-09-19 23:59:04'], - 'Machine': ['23', '36L', '36R', '36R']}) - df2 = df1.copy() - df2.oTime = pd.to_datetime(df2.oTime) - expected = df1.groupby('Key').apply(predictions).p1 - result = df2.groupby('Key').apply(predictions).p1 - tm.assert_series_equal(expected, result) - - def test_pipe(self): - # Test the pipe method of DataFrameGroupBy. - # Issue #17871 - - random_state = np.random.RandomState(1234567890) - - df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], - 'B': random_state.randn(8), - 'C': random_state.randn(8)}) - - def f(dfgb): - return dfgb.B.max() - dfgb.C.min().min() - - def square(srs): - return srs ** 2 - - # Note that the transformations are - # GroupBy -> Series - # Series -> Series - # This then chains the GroupBy.pipe and the - # NDFrame.pipe methods - result = df.groupby('A').pipe(f).pipe(square) - - index = Index([u'bar', u'foo'], dtype='object', name=u'A') - expected = pd.Series([8.99110003361, 8.17516964785], name='B', - index=index) - - assert_series_equal(expected, result) - - def test_pipe_args(self): - # Test passing args to the pipe method of DataFrameGroupBy. 
- # Issue #17871 - - df = pd.DataFrame({'group': ['A', 'A', 'B', 'B', 'C'], - 'x': [1.0, 2.0, 3.0, 2.0, 5.0], - 'y': [10.0, 100.0, 1000.0, -100.0, -1000.0]}) - - def f(dfgb, arg1): - return (dfgb.filter(lambda grp: grp.y.mean() > arg1, dropna=False) - .groupby(dfgb.grouper)) - - def g(dfgb, arg2): - return dfgb.sum() / dfgb.sum().sum() + arg2 - - def h(df, arg3): - return df.x + df.y - arg3 - - result = (df - .groupby('group') - .pipe(f, 0) - .pipe(g, 10) - .pipe(h, 100)) - - # Assert the results here - index = pd.Index(['A', 'B', 'C'], name='group') - expected = pd.Series([-79.5160891089, -78.4839108911, -80], - index=index) - - assert_series_equal(expected, result) - - # test SeriesGroupby.pipe - ser = pd.Series([1, 1, 2, 2, 3, 3]) - result = ser.groupby(ser).pipe(lambda grp: grp.sum() * grp.count()) - - expected = pd.Series([4, 8, 12], index=pd.Int64Index([1, 2, 3])) - - assert_series_equal(result, expected) - - def test_empty_dataframe_groupby(self): - # GH8093 - df = DataFrame(columns=['A', 'B', 'C']) - - result = df.groupby('A').sum() - expected = DataFrame(columns=['B', 'C'], dtype=np.float64) - expected.index.name = 'A' - - assert_frame_equal(result, expected) - - def test_tuple_warns(self): - # https://github.com/pandas-dev/pandas/issues/18314 - df = pd.DataFrame({('a', 'b'): [1, 1, 2, 2], 'a': [1, 1, 1, 2], - 'b': [1, 2, 2, 2], 'c': [1, 1, 1, 1]}) - with tm.assert_produces_warning(FutureWarning) as w: - df[['a', 'b', 'c']].groupby(('a', 'b')).c.mean() - - assert "Interpreting tuple 'by' as a list" in str(w[0].message) + return func(d) + + return _func + + def afunc(data): + d = data.select(lambda x: x.hour < 11).dropna() + return np.max(d) + + grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day)) + closure_bad = grouped.agg({'high': agg_before(11, np.max)}) + closure_good = grouped.agg({'high': agg_before(11, np.max, True)}) + + assert_frame_equal(closure_bad, closure_good) + + +def test_groupby_multiindex_missing_pair(): + # GH9049 + df = DataFrame({'group1': ['a', 'a', 'a', 'b'], + 'group2': ['c', 'c', 'd', 'c'], + 'value': [1, 1, 1, 5]}) + df = df.set_index(['group1', 'group2']) + df_grouped = df.groupby(level=['group1', 'group2'], sort=True) + + res = df_grouped.agg('sum') + idx = MultiIndex.from_tuples( + [('a', 'c'), ('a', 'd'), ('b', 'c')], names=['group1', 'group2']) + exp = DataFrame([[2], [1], [5]], index=idx, columns=['value']) + + tm.assert_frame_equal(res, exp) + + +def test_groupby_multiindex_not_lexsorted(): + # GH 11640 + + # define the lexsorted version + lexsorted_mi = MultiIndex.from_tuples( + [('a', ''), ('b1', 'c1'), ('b2', 'c2')], names=['b', 'c']) + lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi) + assert lexsorted_df.columns.is_lexsorted() + + # define the non-lexsorted version + not_lexsorted_df = DataFrame(columns=['a', 'b', 'c', 'd'], + data=[[1, 'b1', 'c1', 3], + [1, 'b2', 'c2', 4]]) + not_lexsorted_df = not_lexsorted_df.pivot_table( + index='a', columns=['b', 'c'], values='d') + not_lexsorted_df = not_lexsorted_df.reset_index() + assert not not_lexsorted_df.columns.is_lexsorted() + + # compare the results + tm.assert_frame_equal(lexsorted_df, not_lexsorted_df) + + expected = lexsorted_df.groupby('a').mean() + with tm.assert_produces_warning(PerformanceWarning): + result = not_lexsorted_df.groupby('a').mean() + tm.assert_frame_equal(expected, result) + + # a transforming function should work regardless of sort + # GH 14776 + df = DataFrame({'x': ['a', 'a', 'b', 'a'], + 'y': [1, 1, 2, 2], + 'z': [1, 2, 3, 4]}).set_index(['x', 
'y'])
+    assert not df.index.is_lexsorted()
+
+    for level in [0, 1, [0, 1]]:
+        for sort in [False, True]:
+            result = df.groupby(level=level, sort=sort).apply(
+                DataFrame.drop_duplicates)
+            expected = df
+            tm.assert_frame_equal(expected, result)
+
+            result = df.sort_index().groupby(level=level, sort=sort).apply(
+                DataFrame.drop_duplicates)
+            expected = df.sort_index()
+            tm.assert_frame_equal(expected, result)
+
+
+def test_index_label_overlaps_location():
+    # checking we don't have any label/location confusion in
+    # the wake of GH5375
+    df = DataFrame(list('ABCDE'), index=[2, 0, 2, 1, 1])
+    g = df.groupby(list('ababb'))
+    actual = g.filter(lambda x: len(x) > 2)
+    expected = df.iloc[[1, 3, 4]]
+    assert_frame_equal(actual, expected)
+
+    ser = df[0]
+    g = ser.groupby(list('ababb'))
+    actual = g.filter(lambda x: len(x) > 2)
+    expected = ser.take([1, 3, 4])
+    assert_series_equal(actual, expected)
+
+    # ... and again, with a generic Index of floats
+    df.index = df.index.astype(float)
+    g = df.groupby(list('ababb'))
+    actual = g.filter(lambda x: len(x) > 2)
+    expected = df.iloc[[1, 3, 4]]
+    assert_frame_equal(actual, expected)
+
+    ser = df[0]
+    g = ser.groupby(list('ababb'))
+    actual = g.filter(lambda x: len(x) > 2)
+    expected = ser.take([1, 3, 4])
+    assert_series_equal(actual, expected)
+
+
+def test_transform_doesnt_clobber_ints():
+    # GH 7972
+    n = 6
+    x = np.arange(n)
+    df = DataFrame({'a': x // 2, 'b': 2.0 * x, 'c': 3.0 * x})
+    df2 = DataFrame({'a': x // 2 * 1.0, 'b': 2.0 * x, 'c': 3.0 * x})
+
+    gb = df.groupby('a')
+    result = gb.transform('mean')
+
+    gb2 = df2.groupby('a')
+    expected = gb2.transform('mean')
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize('sort_column', ['ints', 'floats', 'strings',
+                                         ['ints', 'floats'],
+                                         ['ints', 'strings']])
+@pytest.mark.parametrize('group_column', ['int_groups', 'string_groups',
+                                          ['int_groups', 'string_groups']])
+def test_groupby_preserves_sort(sort_column, group_column):
+    # Test to ensure that groupby always preserves sort order of original
+    # object. Issue #8588 and #9651
+
+    df = DataFrame(
+        {'int_groups': [3, 1, 0, 1, 0, 3, 3, 3],
+         'string_groups': ['z', 'a', 'z', 'a', 'a', 'g', 'g', 'g'],
+         'ints': [8, 7, 4, 5, 2, 9, 1, 1],
+         'floats': [2.3, 5.3, 6.2, -2.4, 2.2, 1.1, 1.1, 5],
+         'strings': ['z', 'd', 'a', 'e', 'word', 'word2', '42', '47']})
+
+    # Try sorting on different types and with different group types
+
+    df = df.sort_values(by=sort_column)
+    g = df.groupby(group_column)
+
+    def test_sort(x):
+        assert_frame_equal(x, x.sort_values(by=sort_column))
+
+    g.apply(test_sort)
+
+
+def test_group_shift_with_null_key():
+    # This test is designed to replicate the segfault in issue #13813.
+    n_rows = 1200
+
+    # Generate a moderately large dataframe with occasional missing
+    # values in column `B`, and then group by [`A`, `B`]. This should
+    # force `-1` in `labels` array of `g.grouper.group_info` exactly
+    # at those places where the group-by key is partially missing.
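A small sketch of the same shape, easier to eyeball than the 1200-row frame
below: rows whose key is partially missing belong to no group and come back
as NaN from the shift:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'A': [1, 1, 1, 2],
                       'B': [1.0, np.nan, 1.0, 2.0],
                       'Z': [10, 20, 30, 40]})

    # row 1 has a NaN key, so it is excluded from every group
    print(df.groupby(['A', 'B']).Z.shift(-1))  # [30.0, NaN, NaN, NaN]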
+ df = DataFrame([(i % 12, i % 3 if i % 3 else np.nan, i) + for i in range(n_rows)], dtype=float, + columns=["A", "B", "Z"], index=None) + g = df.groupby(["A", "B"]) + + expected = DataFrame([(i + 12 if i % 3 and i < n_rows - 12 + else np.nan) + for i in range(n_rows)], dtype=float, + columns=["Z"], index=None) + result = g.shift(-1) + + assert_frame_equal(result, expected) + - with tm.assert_produces_warning(None): - df.groupby(('a', 'b')).c.mean() +def test_pivot_table_values_key_error(): + # This test is designed to replicate the error in issue #14938 + df = pd.DataFrame({'eventDate': + pd.date_range(pd.datetime.today(), + periods=20, freq='M').tolist(), + 'thename': range(0, 20)}) - def test_tuple_warns_unhashable(self): - # https://github.com/pandas-dev/pandas/issues/18314 - business_dates = date_range(start='4/1/2014', end='6/30/2014', - freq='B') - df = DataFrame(1, index=business_dates, columns=['a', 'b']) + df['year'] = df.set_index('eventDate').index.year + df['month'] = df.set_index('eventDate').index.month + + with pytest.raises(KeyError): + df.reset_index().pivot_table(index='year', columns='month', + values='badname', aggfunc='count') - with tm.assert_produces_warning(FutureWarning) as w: - df.groupby((df.index.year, df.index.month)).nth([0, 3, -1]) - assert "Interpreting tuple 'by' as a list" in str(w[0].message) +def test_empty_dataframe_groupby(): + # GH8093 + df = DataFrame(columns=['A', 'B', 'C']) - def test_tuple_correct_keyerror(self): - # https://github.com/pandas-dev/pandas/issues/18798 - df = pd.DataFrame(1, index=range(3), - columns=pd.MultiIndex.from_product([[1, 2], - [3, 4]])) - with tm.assert_raises_regex(KeyError, "(7, 8)"): - df.groupby((7, 8)).mean() + result = df.groupby('A').sum() + expected = DataFrame(columns=['B', 'C'], dtype=np.float64) + expected.index.name = 'A' + assert_frame_equal(result, expected) -def _check_groupby(df, result, keys, field, f=lambda x: x.sum()): - tups = lmap(tuple, df[keys].values) - tups = com._asarray_tuplesafe(tups) - expected = f(df.groupby(tups)[field]) - for k, v in compat.iteritems(expected): - assert (result[k] == v) + +def test_tuple_warns(): + # https://github.com/pandas-dev/pandas/issues/18314 + df = pd.DataFrame({('a', 'b'): [1, 1, 2, 2], 'a': [1, 1, 1, 2], + 'b': [1, 2, 2, 2], 'c': [1, 1, 1, 1]}) + with tm.assert_produces_warning(FutureWarning) as w: + df[['a', 'b', 'c']].groupby(('a', 'b')).c.mean() + + assert "Interpreting tuple 'by' as a list" in str(w[0].message) + + with tm.assert_produces_warning(None): + df.groupby(('a', 'b')).c.mean() + + +def test_tuple_warns_unhashable(): + # https://github.com/pandas-dev/pandas/issues/18314 + business_dates = date_range(start='4/1/2014', end='6/30/2014', + freq='B') + df = DataFrame(1, index=business_dates, columns=['a', 'b']) + + with tm.assert_produces_warning(FutureWarning) as w: + df.groupby((df.index.year, df.index.month)).nth([0, 3, -1]) + + assert "Interpreting tuple 'by' as a list" in str(w[0].message) + + +def test_tuple_correct_keyerror(): + # https://github.com/pandas-dev/pandas/issues/18798 + df = pd.DataFrame(1, index=range(3), + columns=pd.MultiIndex.from_product([[1, 2], + [3, 4]])) + with tm.assert_raises_regex(KeyError, "(7, 8)"): + df.groupby((7, 8)).mean() diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 57becd342d370..743237f5b386c 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -9,6 +9,7 @@ Index, MultiIndex, DataFrame, Series, CategoricalIndex) from 
pandas.util.testing import (assert_panel_equal, assert_frame_equal, assert_series_equal, assert_almost_equal) +from pandas.core.groupby.groupby import Grouping from pandas.compat import lrange, long from pandas import compat @@ -16,13 +17,12 @@ import pandas.util.testing as tm import pandas as pd -from .common import MixIn # selection # -------------------------------- -class TestSelection(MixIn): +class TestSelection(): def test_select_bad_cols(self): df = DataFrame([[1, 2]], columns=['A', 'B']) @@ -48,14 +48,14 @@ def test_groupby_duplicated_column_errormsg(self): assert c.columns.nlevels == 1 assert c.columns.size == 3 - def test_column_select_via_attr(self): - result = self.df.groupby('A').C.sum() - expected = self.df.groupby('A')['C'].sum() + def test_column_select_via_attr(self, df): + result = df.groupby('A').C.sum() + expected = df.groupby('A')['C'].sum() assert_series_equal(result, expected) - self.df['mean'] = 1.5 - result = self.df.groupby('A').mean() - expected = self.df.groupby('A').agg(np.mean) + df['mean'] = 1.5 + result = df.groupby('A').mean() + expected = df.groupby('A').agg(np.mean) assert_frame_equal(result, expected) def test_getitem_list_of_columns(self): @@ -96,7 +96,7 @@ def test_getitem_numeric_column_names(self): # grouping # -------------------------------- -class TestGrouping(MixIn): +class TestGrouping(): def test_grouper_index_types(self): # related GH5375 @@ -291,17 +291,17 @@ def test_grouper_getting_correct_binner(self): names=['one', 'two'])) assert_frame_equal(result, expected) - def test_grouper_iter(self): - assert sorted(self.df.groupby('A').grouper) == ['bar', 'foo'] + def test_grouper_iter(self, df): + assert sorted(df.groupby('A').grouper) == ['bar', 'foo'] - def test_empty_groups(self): + def test_empty_groups(self, df): # see gh-1048 - pytest.raises(ValueError, self.df.groupby, []) + pytest.raises(ValueError, df.groupby, []) - def test_groupby_grouper(self): - grouped = self.df.groupby('A') + def test_groupby_grouper(self, df): + grouped = df.groupby('A') - result = self.df.groupby(grouped.grouper).mean() + result = df.groupby(grouped.grouper).mean() expected = grouped.mean() tm.assert_frame_equal(result, expected) @@ -339,10 +339,9 @@ def test_groupby_grouper_f_sanity_checked(self): pytest.raises(AssertionError, ts.groupby, lambda key: key[0:6]) - def test_grouping_error_on_multidim_input(self): - from pandas.core.groupby.groupby import Grouping + def test_grouping_error_on_multidim_input(self, df): pytest.raises(ValueError, - Grouping, self.df.index, self.df[['A', 'A']]) + Grouping, df.index, df[['A', 'A']]) def test_multiindex_passthru(self): @@ -354,26 +353,25 @@ def test_multiindex_passthru(self): result = df.groupby(axis=1, level=[0, 1]).first() assert_frame_equal(result, df) - def test_multiindex_negative_level(self): + def test_multiindex_negative_level(self, mframe): # GH 13901 - result = self.mframe.groupby(level=-1).sum() - expected = self.mframe.groupby(level='second').sum() + result = mframe.groupby(level=-1).sum() + expected = mframe.groupby(level='second').sum() assert_frame_equal(result, expected) - result = self.mframe.groupby(level=-2).sum() - expected = self.mframe.groupby(level='first').sum() + result = mframe.groupby(level=-2).sum() + expected = mframe.groupby(level='first').sum() assert_frame_equal(result, expected) - result = self.mframe.groupby(level=[-2, -1]).sum() - expected = self.mframe + result = mframe.groupby(level=[-2, -1]).sum() + expected = mframe assert_frame_equal(result, expected) - result = 
self.mframe.groupby(level=[-1, 'first']).sum() - expected = self.mframe.groupby(level=['second', 'first']).sum() + result = mframe.groupby(level=[-1, 'first']).sum() + expected = mframe.groupby(level=['second', 'first']).sum() assert_frame_equal(result, expected) - def test_multifunc_select_col_integer_cols(self): - df = self.df + def test_multifunc_select_col_integer_cols(self, df): df.columns = np.arange(len(df.columns)) # it works! @@ -428,9 +426,9 @@ def test_groupby_multiindex_tuple(self): tm.assert_dict_equal(expected, result) @pytest.mark.parametrize('sort', [True, False]) - def test_groupby_level(self, sort): + def test_groupby_level(self, sort, mframe, df): # GH 17537 - frame = self.mframe + frame = mframe deleveled = frame.reset_index() result0 = frame.groupby(level=0, sort=sort).sum() @@ -464,7 +462,7 @@ def test_groupby_level(self, sort): assert_frame_equal(result1, expected1.T) # raise exception for non-MultiIndex - pytest.raises(ValueError, self.df.groupby, level=1) + pytest.raises(ValueError, df.groupby, level=1) def test_groupby_level_index_names(self): # GH4014 this used to raise ValueError since 'exp'>1 (in py2) @@ -496,9 +494,9 @@ def test_groupby_level_with_nas(self, sort): expected = Series([6., 18.], index=[0.0, 1.0]) assert_series_equal(result, expected) - def test_groupby_args(self): + def test_groupby_args(self, mframe): # PR8618 and issue 8015 - frame = self.mframe + frame = mframe def j(): frame.groupby() @@ -516,14 +514,14 @@ def k(): [True, [2, 2, 2, 0, 0, 1, 1, 3, 3, 3]], [False, [0, 0, 0, 1, 1, 2, 2, 3, 3, 3]] ]) - def test_level_preserve_order(self, sort, labels): + def test_level_preserve_order(self, sort, labels, mframe): # GH 17537 - grouped = self.mframe.groupby(level=0, sort=sort) + grouped = mframe.groupby(level=0, sort=sort) exp_labels = np.array(labels, np.intp) assert_almost_equal(grouped.grouper.labels[0], exp_labels) - def test_grouping_labels(self): - grouped = self.mframe.groupby(self.mframe.index.get_level_values(0)) + def test_grouping_labels(self, mframe): + grouped = mframe.groupby(mframe.index.get_level_values(0)) exp_labels = np.array([2, 2, 2, 0, 0, 1, 1, 3, 3, 3], dtype=np.intp) assert_almost_equal(grouped.grouper.labels[0], exp_labels) @@ -531,7 +529,7 @@ def test_grouping_labels(self): # get_group # -------------------------------- -class TestGetGroup(MixIn): +class TestGetGroup(): def test_get_group(self): with catch_warnings(record=True): @@ -638,29 +636,28 @@ def test_gb_key_len_equal_axis_len(self): # groups & iteration # -------------------------------- -class TestIteration(MixIn): +class TestIteration(): - def test_groups(self): - grouped = self.df.groupby(['A']) + def test_groups(self, df): + grouped = df.groupby(['A']) groups = grouped.groups assert groups is grouped.groups # caching works for k, v in compat.iteritems(grouped.groups): - assert (self.df.loc[v]['A'] == k).all() + assert (df.loc[v]['A'] == k).all() - grouped = self.df.groupby(['A', 'B']) + grouped = df.groupby(['A', 'B']) groups = grouped.groups assert groups is grouped.groups # caching works for k, v in compat.iteritems(grouped.groups): - assert (self.df.loc[v]['A'] == k[0]).all() - assert (self.df.loc[v]['B'] == k[1]).all() + assert (df.loc[v]['A'] == k[0]).all() + assert (df.loc[v]['B'] == k[1]).all() - def test_grouping_is_iterable(self): + def test_grouping_is_iterable(self, tsframe): # this code path isn't used anywhere else # not sure it's useful - grouped = self.tsframe.groupby([lambda x: x.weekday(), lambda x: x.year - ]) + grouped = 
tsframe.groupby([lambda x: x.weekday(), lambda x: x.year]) # test it works for g in grouped.grouper.groupings[0]: @@ -682,7 +679,7 @@ def test_multi_iter(self): assert e2 == two assert_series_equal(three, e3) - def test_multi_iter_frame(self): + def test_multi_iter_frame(self, three_group): k1 = np.array(['b', 'b', 'b', 'a', 'a', 'a']) k2 = np.array(['1', '2', '1', '2', '1', '2']) df = DataFrame({'v1': np.random.randn(6), @@ -715,7 +712,7 @@ def test_multi_iter_frame(self): assert len(groups) == 2 # axis = 1 - three_levels = self.three_group.groupby(['A', 'B', 'C']).mean() + three_levels = three_group.groupby(['A', 'B', 'C']).mean() grouped = three_levels.T.groupby(axis=1, level=(1, 2)) for key, group in grouped: pass @@ -733,13 +730,13 @@ def test_multi_iter_panel(self): expected = wp.reindex(major=exp_axis) assert_panel_equal(group, expected) - def test_dictify(self): - dict(iter(self.df.groupby('A'))) - dict(iter(self.df.groupby(['A', 'B']))) - dict(iter(self.df['C'].groupby(self.df['A']))) - dict(iter(self.df['C'].groupby([self.df['A'], self.df['B']]))) - dict(iter(self.df.groupby('A')['C'])) - dict(iter(self.df.groupby(['A', 'B'])['C'])) + def test_dictify(self, df): + dict(iter(df.groupby('A'))) + dict(iter(df.groupby(['A', 'B']))) + dict(iter(df['C'].groupby(df['A']))) + dict(iter(df['C'].groupby([df['A'], df['B']]))) + dict(iter(df.groupby('A')['C'])) + dict(iter(df.groupby(['A', 'B'])['C'])) def test_groupby_with_small_elem(self): # GH 8542 diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index ccde545b5b8e9..a32ba9ad76f14 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -7,314 +7,316 @@ assert_produces_warning, assert_series_equal) -from .common import MixIn - - -class TestNth(MixIn): - - def test_first_last_nth(self): - # tests for first / last / nth - grouped = self.df.groupby('A') - first = grouped.first() - expected = self.df.loc[[1, 0], ['B', 'C', 'D']] - expected.index = Index(['bar', 'foo'], name='A') - expected = expected.sort_index() - assert_frame_equal(first, expected) - - nth = grouped.nth(0) - assert_frame_equal(nth, expected) - - last = grouped.last() - expected = self.df.loc[[5, 7], ['B', 'C', 'D']] - expected.index = Index(['bar', 'foo'], name='A') - assert_frame_equal(last, expected) - - nth = grouped.nth(-1) - assert_frame_equal(nth, expected) - - nth = grouped.nth(1) - expected = self.df.loc[[2, 3], ['B', 'C', 'D']].copy() - expected.index = Index(['foo', 'bar'], name='A') - expected = expected.sort_index() - assert_frame_equal(nth, expected) - - # it works! 
- grouped['B'].first() - grouped['B'].last() - grouped['B'].nth(0) - - self.df.loc[self.df['A'] == 'foo', 'B'] = np.nan - assert isna(grouped['B'].first()['foo']) - assert isna(grouped['B'].last()['foo']) - assert isna(grouped['B'].nth(0)['foo']) - - # v0.14.0 whatsnew - df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) - g = df.groupby('A') - result = g.first() - expected = df.iloc[[1, 2]].set_index('A') - assert_frame_equal(result, expected) - - expected = df.iloc[[1, 2]].set_index('A') - result = g.nth(0, dropna='any') - assert_frame_equal(result, expected) - - def test_first_last_nth_dtypes(self): - - df = self.df_mixed_floats.copy() - df['E'] = True - df['F'] = 1 - - # tests for first / last / nth - grouped = df.groupby('A') - first = grouped.first() - expected = df.loc[[1, 0], ['B', 'C', 'D', 'E', 'F']] - expected.index = Index(['bar', 'foo'], name='A') - expected = expected.sort_index() - assert_frame_equal(first, expected) - - last = grouped.last() - expected = df.loc[[5, 7], ['B', 'C', 'D', 'E', 'F']] - expected.index = Index(['bar', 'foo'], name='A') - expected = expected.sort_index() - assert_frame_equal(last, expected) - - nth = grouped.nth(1) - expected = df.loc[[3, 2], ['B', 'C', 'D', 'E', 'F']] - expected.index = Index(['bar', 'foo'], name='A') - expected = expected.sort_index() - assert_frame_equal(nth, expected) - - # GH 2763, first/last shifting dtypes - idx = lrange(10) - idx.append(9) - s = Series(data=lrange(11), index=idx, name='IntCol') - assert s.dtype == 'int64' - f = s.groupby(level=0).first() - assert f.dtype == 'int64' - - def test_nth(self): - df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) - g = df.groupby('A') - - assert_frame_equal(g.nth(0), df.iloc[[0, 2]].set_index('A')) - assert_frame_equal(g.nth(1), df.iloc[[1]].set_index('A')) - assert_frame_equal(g.nth(2), df.loc[[]].set_index('A')) - assert_frame_equal(g.nth(-1), df.iloc[[1, 2]].set_index('A')) - assert_frame_equal(g.nth(-2), df.iloc[[0]].set_index('A')) - assert_frame_equal(g.nth(-3), df.loc[[]].set_index('A')) - assert_series_equal(g.B.nth(0), df.set_index('A').B.iloc[[0, 2]]) - assert_series_equal(g.B.nth(1), df.set_index('A').B.iloc[[1]]) - assert_frame_equal(g[['B']].nth(0), - df.loc[[0, 2], ['A', 'B']].set_index('A')) - - exp = df.set_index('A') - assert_frame_equal(g.nth(0, dropna='any'), exp.iloc[[1, 2]]) - assert_frame_equal(g.nth(-1, dropna='any'), exp.iloc[[1, 2]]) - - exp['B'] = np.nan - assert_frame_equal(g.nth(7, dropna='any'), exp.iloc[[1, 2]]) - assert_frame_equal(g.nth(2, dropna='any'), exp.iloc[[1, 2]]) - - # out of bounds, regression from 0.13.1 - # GH 6621 - df = DataFrame({'color': {0: 'green', - 1: 'green', - 2: 'red', - 3: 'red', - 4: 'red'}, - 'food': {0: 'ham', - 1: 'eggs', - 2: 'eggs', - 3: 'ham', - 4: 'pork'}, - 'two': {0: 1.5456590000000001, - 1: -0.070345000000000005, - 2: -2.4004539999999999, - 3: 0.46206000000000003, - 4: 0.52350799999999997}, - 'one': {0: 0.56573799999999996, - 1: -0.9742360000000001, - 2: 1.033801, - 3: -0.78543499999999999, - 4: 0.70422799999999997}}).set_index(['color', - 'food']) - - result = df.groupby(level=0, as_index=False).nth(2) - expected = df.iloc[[-1]] - assert_frame_equal(result, expected) - - result = df.groupby(level=0, as_index=False).nth(3) - expected = df.loc[[]] - assert_frame_equal(result, expected) - - # GH 7559 - # from the vbench - df = DataFrame(np.random.randint(1, 10, (100, 2)), dtype='int64') - s = df[1] - g = df[0] - expected = s.groupby(g).first() - expected2 = s.groupby(g).apply(lambda x: 
x.iloc[0]) - assert_series_equal(expected2, expected, check_names=False) - assert expected.name == 1 - assert expected2.name == 1 - - # validate first - v = s[g == 1].iloc[0] - assert expected.iloc[0] == v - assert expected2.iloc[0] == v - - # this is NOT the same as .first (as sorted is default!) - # as it keeps the order in the series (and not the group order) - # related GH 7287 - expected = s.groupby(g, sort=False).first() - result = s.groupby(g, sort=False).nth(0, dropna='all') - assert_series_equal(result, expected) - - # doc example - df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) - g = df.groupby('A') - # PR 17493, related to issue 11038 - # test Series.nth with True for dropna produces FutureWarning - with assert_produces_warning(FutureWarning): - result = g.B.nth(0, dropna=True) - expected = g.B.first() - assert_series_equal(result, expected) - - # test multiple nth values - df = DataFrame([[1, np.nan], [1, 3], [1, 4], [5, 6], [5, 7]], - columns=['A', 'B']) - g = df.groupby('A') - - assert_frame_equal(g.nth(0), df.iloc[[0, 3]].set_index('A')) - assert_frame_equal(g.nth([0]), df.iloc[[0, 3]].set_index('A')) - assert_frame_equal(g.nth([0, 1]), df.iloc[[0, 1, 3, 4]].set_index('A')) - assert_frame_equal( - g.nth([0, -1]), df.iloc[[0, 2, 3, 4]].set_index('A')) - assert_frame_equal( - g.nth([0, 1, 2]), df.iloc[[0, 1, 2, 3, 4]].set_index('A')) - assert_frame_equal( - g.nth([0, 1, -1]), df.iloc[[0, 1, 2, 3, 4]].set_index('A')) - assert_frame_equal(g.nth([2]), df.iloc[[2]].set_index('A')) - assert_frame_equal(g.nth([3, 4]), df.loc[[]].set_index('A')) - - business_dates = pd.date_range(start='4/1/2014', end='6/30/2014', - freq='B') - df = DataFrame(1, index=business_dates, columns=['a', 'b']) - # get the first, fourth and last two business days for each month - key = [df.index.year, df.index.month] - result = df.groupby(key, as_index=False).nth([0, 3, -2, -1]) - expected_dates = pd.to_datetime( - ['2014/4/1', '2014/4/4', '2014/4/29', '2014/4/30', '2014/5/1', - '2014/5/6', '2014/5/29', '2014/5/30', '2014/6/2', '2014/6/5', - '2014/6/27', '2014/6/30']) - expected = DataFrame(1, columns=['a', 'b'], index=expected_dates) - assert_frame_equal(result, expected) - - def test_nth_multi_index(self): - # PR 9090, related to issue 8979 - # test nth on MultiIndex, should match .first() - grouped = self.three_group.groupby(['A', 'B']) - result = grouped.nth(0) - expected = grouped.first() - assert_frame_equal(result, expected) - - def test_nth_multi_index_as_expected(self): - # PR 9090, related to issue 8979 - # test nth on MultiIndex - three_group = DataFrame( - {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', - 'foo', 'foo', 'foo'], - 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two', - 'two', 'two', 'one'], - 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny', - 'dull', 'shiny', 'shiny', 'shiny']}) - grouped = three_group.groupby(['A', 'B']) - result = grouped.nth(0) - expected = DataFrame( - {'C': ['dull', 'dull', 'dull', 'dull']}, - index=MultiIndex.from_arrays([['bar', 'bar', 'foo', 'foo'], - ['one', 'two', 'one', 'two']], - names=['A', 'B'])) - assert_frame_equal(result, expected) - - def test_groupby_head_tail(self): - df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) - g_as = df.groupby('A', as_index=True) - g_not_as = df.groupby('A', as_index=False) - - # as_index= False, much easier - assert_frame_equal(df.loc[[0, 2]], g_not_as.head(1)) - assert_frame_equal(df.loc[[1, 2]], g_not_as.tail(1)) - - empty_not_as = 
DataFrame(columns=df.columns, - index=pd.Index([], dtype=df.index.dtype)) - empty_not_as['A'] = empty_not_as['A'].astype(df.A.dtype) - empty_not_as['B'] = empty_not_as['B'].astype(df.B.dtype) - assert_frame_equal(empty_not_as, g_not_as.head(0)) - assert_frame_equal(empty_not_as, g_not_as.tail(0)) - assert_frame_equal(empty_not_as, g_not_as.head(-1)) - assert_frame_equal(empty_not_as, g_not_as.tail(-1)) - - assert_frame_equal(df, g_not_as.head(7)) # contains all - assert_frame_equal(df, g_not_as.tail(7)) - - # as_index=True, (used to be different) - df_as = df - - assert_frame_equal(df_as.loc[[0, 2]], g_as.head(1)) - assert_frame_equal(df_as.loc[[1, 2]], g_as.tail(1)) - - empty_as = DataFrame(index=df_as.index[:0], columns=df.columns) - empty_as['A'] = empty_not_as['A'].astype(df.A.dtype) - empty_as['B'] = empty_not_as['B'].astype(df.B.dtype) - assert_frame_equal(empty_as, g_as.head(0)) - assert_frame_equal(empty_as, g_as.tail(0)) - assert_frame_equal(empty_as, g_as.head(-1)) - assert_frame_equal(empty_as, g_as.tail(-1)) - - assert_frame_equal(df_as, g_as.head(7)) # contains all - assert_frame_equal(df_as, g_as.tail(7)) - - # test with selection - assert_frame_equal(g_as[[]].head(1), df_as.loc[[0, 2], []]) - assert_frame_equal(g_as[['A']].head(1), df_as.loc[[0, 2], ['A']]) - assert_frame_equal(g_as[['B']].head(1), df_as.loc[[0, 2], ['B']]) - assert_frame_equal(g_as[['A', 'B']].head(1), df_as.loc[[0, 2]]) - - assert_frame_equal(g_not_as[[]].head(1), df_as.loc[[0, 2], []]) - assert_frame_equal(g_not_as[['A']].head(1), df_as.loc[[0, 2], ['A']]) - assert_frame_equal(g_not_as[['B']].head(1), df_as.loc[[0, 2], ['B']]) - assert_frame_equal(g_not_as[['A', 'B']].head(1), df_as.loc[[0, 2]]) - - def test_group_selection_cache(self): - # GH 12839 nth, head, and tail should return same result consistently - df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) - expected = df.iloc[[0, 2]].set_index('A') - - g = df.groupby('A') - result1 = g.head(n=2) - result2 = g.nth(0) - assert_frame_equal(result1, df) - assert_frame_equal(result2, expected) - - g = df.groupby('A') - result1 = g.tail(n=2) - result2 = g.nth(0) - assert_frame_equal(result1, df) - assert_frame_equal(result2, expected) - - g = df.groupby('A') - result1 = g.nth(0) - result2 = g.head(n=2) - assert_frame_equal(result1, expected) - assert_frame_equal(result2, df) - - g = df.groupby('A') - result1 = g.nth(0) - result2 = g.tail(n=2) - assert_frame_equal(result1, expected) - assert_frame_equal(result2, df) + +def test_first_last_nth(df): + # tests for first / last / nth + grouped = df.groupby('A') + first = grouped.first() + expected = df.loc[[1, 0], ['B', 'C', 'D']] + expected.index = Index(['bar', 'foo'], name='A') + expected = expected.sort_index() + assert_frame_equal(first, expected) + + nth = grouped.nth(0) + assert_frame_equal(nth, expected) + + last = grouped.last() + expected = df.loc[[5, 7], ['B', 'C', 'D']] + expected.index = Index(['bar', 'foo'], name='A') + assert_frame_equal(last, expected) + + nth = grouped.nth(-1) + assert_frame_equal(nth, expected) + + nth = grouped.nth(1) + expected = df.loc[[2, 3], ['B', 'C', 'D']].copy() + expected.index = Index(['foo', 'bar'], name='A') + expected = expected.sort_index() + assert_frame_equal(nth, expected) + + # it works! 
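+    # (smoke tests: the SeriesGroupBy variants should simply not raise)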
+ grouped['B'].first() + grouped['B'].last() + grouped['B'].nth(0) + + df.loc[df['A'] == 'foo', 'B'] = np.nan + assert isna(grouped['B'].first()['foo']) + assert isna(grouped['B'].last()['foo']) + assert isna(grouped['B'].nth(0)['foo']) + + # v0.14.0 whatsnew + df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) + g = df.groupby('A') + result = g.first() + expected = df.iloc[[1, 2]].set_index('A') + assert_frame_equal(result, expected) + + expected = df.iloc[[1, 2]].set_index('A') + result = g.nth(0, dropna='any') + assert_frame_equal(result, expected) + + +def test_first_last_nth_dtypes(df_mixed_floats): + + df = df_mixed_floats.copy() + df['E'] = True + df['F'] = 1 + + # tests for first / last / nth + grouped = df.groupby('A') + first = grouped.first() + expected = df.loc[[1, 0], ['B', 'C', 'D', 'E', 'F']] + expected.index = Index(['bar', 'foo'], name='A') + expected = expected.sort_index() + assert_frame_equal(first, expected) + + last = grouped.last() + expected = df.loc[[5, 7], ['B', 'C', 'D', 'E', 'F']] + expected.index = Index(['bar', 'foo'], name='A') + expected = expected.sort_index() + assert_frame_equal(last, expected) + + nth = grouped.nth(1) + expected = df.loc[[3, 2], ['B', 'C', 'D', 'E', 'F']] + expected.index = Index(['bar', 'foo'], name='A') + expected = expected.sort_index() + assert_frame_equal(nth, expected) + + # GH 2763, first/last shifting dtypes + idx = lrange(10) + idx.append(9) + s = Series(data=lrange(11), index=idx, name='IntCol') + assert s.dtype == 'int64' + f = s.groupby(level=0).first() + assert f.dtype == 'int64' + + +def test_nth(): + df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) + g = df.groupby('A') + + assert_frame_equal(g.nth(0), df.iloc[[0, 2]].set_index('A')) + assert_frame_equal(g.nth(1), df.iloc[[1]].set_index('A')) + assert_frame_equal(g.nth(2), df.loc[[]].set_index('A')) + assert_frame_equal(g.nth(-1), df.iloc[[1, 2]].set_index('A')) + assert_frame_equal(g.nth(-2), df.iloc[[0]].set_index('A')) + assert_frame_equal(g.nth(-3), df.loc[[]].set_index('A')) + assert_series_equal(g.B.nth(0), df.set_index('A').B.iloc[[0, 2]]) + assert_series_equal(g.B.nth(1), df.set_index('A').B.iloc[[1]]) + assert_frame_equal(g[['B']].nth(0), + df.loc[[0, 2], ['A', 'B']].set_index('A')) + + exp = df.set_index('A') + assert_frame_equal(g.nth(0, dropna='any'), exp.iloc[[1, 2]]) + assert_frame_equal(g.nth(-1, dropna='any'), exp.iloc[[1, 2]]) + + exp['B'] = np.nan + assert_frame_equal(g.nth(7, dropna='any'), exp.iloc[[1, 2]]) + assert_frame_equal(g.nth(2, dropna='any'), exp.iloc[[1, 2]]) + + # out of bounds, regression from 0.13.1 + # GH 6621 + df = DataFrame({'color': {0: 'green', + 1: 'green', + 2: 'red', + 3: 'red', + 4: 'red'}, + 'food': {0: 'ham', + 1: 'eggs', + 2: 'eggs', + 3: 'ham', + 4: 'pork'}, + 'two': {0: 1.5456590000000001, + 1: -0.070345000000000005, + 2: -2.4004539999999999, + 3: 0.46206000000000003, + 4: 0.52350799999999997}, + 'one': {0: 0.56573799999999996, + 1: -0.9742360000000001, + 2: 1.033801, + 3: -0.78543499999999999, + 4: 0.70422799999999997}}).set_index(['color', + 'food']) + + result = df.groupby(level=0, as_index=False).nth(2) + expected = df.iloc[[-1]] + assert_frame_equal(result, expected) + + result = df.groupby(level=0, as_index=False).nth(3) + expected = df.loc[[]] + assert_frame_equal(result, expected) + + # GH 7559 + # from the vbench + df = DataFrame(np.random.randint(1, 10, (100, 2)), dtype='int64') + s = df[1] + g = df[0] + expected = s.groupby(g).first() + expected2 = s.groupby(g).apply(lambda x: 
x.iloc[0]) + assert_series_equal(expected2, expected, check_names=False) + assert expected.name == 1 + assert expected2.name == 1 + + # validate first + v = s[g == 1].iloc[0] + assert expected.iloc[0] == v + assert expected2.iloc[0] == v + + # this is NOT the same as .first (as sorted is default!) + # as it keeps the order in the series (and not the group order) + # related GH 7287 + expected = s.groupby(g, sort=False).first() + result = s.groupby(g, sort=False).nth(0, dropna='all') + assert_series_equal(result, expected) + + # doc example + df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) + g = df.groupby('A') + # PR 17493, related to issue 11038 + # test Series.nth with True for dropna produces FutureWarning + with assert_produces_warning(FutureWarning): + result = g.B.nth(0, dropna=True) + expected = g.B.first() + assert_series_equal(result, expected) + + # test multiple nth values + df = DataFrame([[1, np.nan], [1, 3], [1, 4], [5, 6], [5, 7]], + columns=['A', 'B']) + g = df.groupby('A') + + assert_frame_equal(g.nth(0), df.iloc[[0, 3]].set_index('A')) + assert_frame_equal(g.nth([0]), df.iloc[[0, 3]].set_index('A')) + assert_frame_equal(g.nth([0, 1]), df.iloc[[0, 1, 3, 4]].set_index('A')) + assert_frame_equal( + g.nth([0, -1]), df.iloc[[0, 2, 3, 4]].set_index('A')) + assert_frame_equal( + g.nth([0, 1, 2]), df.iloc[[0, 1, 2, 3, 4]].set_index('A')) + assert_frame_equal( + g.nth([0, 1, -1]), df.iloc[[0, 1, 2, 3, 4]].set_index('A')) + assert_frame_equal(g.nth([2]), df.iloc[[2]].set_index('A')) + assert_frame_equal(g.nth([3, 4]), df.loc[[]].set_index('A')) + + business_dates = pd.date_range(start='4/1/2014', end='6/30/2014', + freq='B') + df = DataFrame(1, index=business_dates, columns=['a', 'b']) + # get the first, fourth and last two business days for each month + key = [df.index.year, df.index.month] + result = df.groupby(key, as_index=False).nth([0, 3, -2, -1]) + expected_dates = pd.to_datetime( + ['2014/4/1', '2014/4/4', '2014/4/29', '2014/4/30', '2014/5/1', + '2014/5/6', '2014/5/29', '2014/5/30', '2014/6/2', '2014/6/5', + '2014/6/27', '2014/6/30']) + expected = DataFrame(1, columns=['a', 'b'], index=expected_dates) + assert_frame_equal(result, expected) + + +def test_nth_multi_index(three_group): + # PR 9090, related to issue 8979 + # test nth on MultiIndex, should match .first() + grouped = three_group.groupby(['A', 'B']) + result = grouped.nth(0) + expected = grouped.first() + assert_frame_equal(result, expected) + + +def test_nth_multi_index_as_expected(): + # PR 9090, related to issue 8979 + # test nth on MultiIndex + three_group = DataFrame( + {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', + 'foo', 'foo', 'foo'], + 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two', + 'two', 'two', 'one'], + 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny', + 'dull', 'shiny', 'shiny', 'shiny']}) + grouped = three_group.groupby(['A', 'B']) + result = grouped.nth(0) + expected = DataFrame( + {'C': ['dull', 'dull', 'dull', 'dull']}, + index=MultiIndex.from_arrays([['bar', 'bar', 'foo', 'foo'], + ['one', 'two', 'one', 'two']], + names=['A', 'B'])) + assert_frame_equal(result, expected) + + +def test_groupby_head_tail(): + df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) + g_as = df.groupby('A', as_index=True) + g_not_as = df.groupby('A', as_index=False) + + # as_index= False, much easier + assert_frame_equal(df.loc[[0, 2]], g_not_as.head(1)) + assert_frame_equal(df.loc[[1, 2]], g_not_as.tail(1)) + + empty_not_as = 
DataFrame(columns=df.columns, + index=pd.Index([], dtype=df.index.dtype)) + empty_not_as['A'] = empty_not_as['A'].astype(df.A.dtype) + empty_not_as['B'] = empty_not_as['B'].astype(df.B.dtype) + assert_frame_equal(empty_not_as, g_not_as.head(0)) + assert_frame_equal(empty_not_as, g_not_as.tail(0)) + assert_frame_equal(empty_not_as, g_not_as.head(-1)) + assert_frame_equal(empty_not_as, g_not_as.tail(-1)) + + assert_frame_equal(df, g_not_as.head(7)) # contains all + assert_frame_equal(df, g_not_as.tail(7)) + + # as_index=True, (used to be different) + df_as = df + + assert_frame_equal(df_as.loc[[0, 2]], g_as.head(1)) + assert_frame_equal(df_as.loc[[1, 2]], g_as.tail(1)) + + empty_as = DataFrame(index=df_as.index[:0], columns=df.columns) + empty_as['A'] = empty_not_as['A'].astype(df.A.dtype) + empty_as['B'] = empty_not_as['B'].astype(df.B.dtype) + assert_frame_equal(empty_as, g_as.head(0)) + assert_frame_equal(empty_as, g_as.tail(0)) + assert_frame_equal(empty_as, g_as.head(-1)) + assert_frame_equal(empty_as, g_as.tail(-1)) + + assert_frame_equal(df_as, g_as.head(7)) # contains all + assert_frame_equal(df_as, g_as.tail(7)) + + # test with selection + assert_frame_equal(g_as[[]].head(1), df_as.loc[[0, 2], []]) + assert_frame_equal(g_as[['A']].head(1), df_as.loc[[0, 2], ['A']]) + assert_frame_equal(g_as[['B']].head(1), df_as.loc[[0, 2], ['B']]) + assert_frame_equal(g_as[['A', 'B']].head(1), df_as.loc[[0, 2]]) + + assert_frame_equal(g_not_as[[]].head(1), df_as.loc[[0, 2], []]) + assert_frame_equal(g_not_as[['A']].head(1), df_as.loc[[0, 2], ['A']]) + assert_frame_equal(g_not_as[['B']].head(1), df_as.loc[[0, 2], ['B']]) + assert_frame_equal(g_not_as[['A', 'B']].head(1), df_as.loc[[0, 2]]) + + +def test_group_selection_cache(): + # GH 12839 nth, head, and tail should return same result consistently + df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) + expected = df.iloc[[0, 2]].set_index('A') + + g = df.groupby('A') + result1 = g.head(n=2) + result2 = g.nth(0) + assert_frame_equal(result1, df) + assert_frame_equal(result2, expected) + + g = df.groupby('A') + result1 = g.tail(n=2) + result2 = g.nth(0) + assert_frame_equal(result1, df) + assert_frame_equal(result2, expected) + + g = df.groupby('A') + result1 = g.nth(0) + result2 = g.head(n=2) + assert_frame_equal(result1, expected) + assert_frame_equal(result2, df) + + g = df.groupby('A') + result1 = g.nth(0) + result2 = g.tail(n=2) + assert_frame_equal(result1, expected) + assert_frame_equal(result2, df) def test_nth_empty(): diff --git a/pandas/tests/groupby/test_rank.py b/pandas/tests/groupby/test_rank.py new file mode 100644 index 0000000000000..6ad8b4905abff --- /dev/null +++ b/pandas/tests/groupby/test_rank.py @@ -0,0 +1,254 @@ +import pytest +import numpy as np +import pandas as pd +from pandas import DataFrame, concat +from pandas.util import testing as tm + + +def test_rank_apply(): + lev1 = tm.rands_array(10, 100) + lev2 = tm.rands_array(10, 130) + lab1 = np.random.randint(0, 100, size=500) + lab2 = np.random.randint(0, 130, size=500) + + df = DataFrame({'value': np.random.randn(500), + 'key1': lev1.take(lab1), + 'key2': lev2.take(lab2)}) + + result = df.groupby(['key1', 'key2']).value.rank() + + expected = [] + for key, piece in df.groupby(['key1', 'key2']): + expected.append(piece.value.rank()) + expected = concat(expected, axis=0) + expected = expected.reindex(result.index) + tm.assert_series_equal(result, expected) + + result = df.groupby(['key1', 'key2']).value.rank(pct=True) + + expected = [] + for key, piece in 
df.groupby(['key1', 'key2']): + expected.append(piece.value.rank(pct=True)) + expected = concat(expected, axis=0) + expected = expected.reindex(result.index) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("grps", [ + ['qux'], ['qux', 'quux']]) +@pytest.mark.parametrize("vals", [ + [2, 2, 8, 2, 6], + [pd.Timestamp('2018-01-02'), pd.Timestamp('2018-01-02'), + pd.Timestamp('2018-01-08'), pd.Timestamp('2018-01-02'), + pd.Timestamp('2018-01-06')]]) +@pytest.mark.parametrize("ties_method,ascending,pct,exp", [ + ('average', True, False, [2., 2., 5., 2., 4.]), + ('average', True, True, [0.4, 0.4, 1.0, 0.4, 0.8]), + ('average', False, False, [4., 4., 1., 4., 2.]), + ('average', False, True, [.8, .8, .2, .8, .4]), + ('min', True, False, [1., 1., 5., 1., 4.]), + ('min', True, True, [0.2, 0.2, 1.0, 0.2, 0.8]), + ('min', False, False, [3., 3., 1., 3., 2.]), + ('min', False, True, [.6, .6, .2, .6, .4]), + ('max', True, False, [3., 3., 5., 3., 4.]), + ('max', True, True, [0.6, 0.6, 1.0, 0.6, 0.8]), + ('max', False, False, [5., 5., 1., 5., 2.]), + ('max', False, True, [1., 1., .2, 1., .4]), + ('first', True, False, [1., 2., 5., 3., 4.]), + ('first', True, True, [0.2, 0.4, 1.0, 0.6, 0.8]), + ('first', False, False, [3., 4., 1., 5., 2.]), + ('first', False, True, [.6, .8, .2, 1., .4]), + ('dense', True, False, [1., 1., 3., 1., 2.]), + ('dense', True, True, [0.2, 0.2, 0.6, 0.2, 0.4]), + ('dense', False, False, [3., 3., 1., 3., 2.]), + ('dense', False, True, [.6, .6, .2, .6, .4]), +]) +def test_rank_args(grps, vals, ties_method, ascending, pct, exp): + key = np.repeat(grps, len(vals)) + vals = vals * len(grps) + df = DataFrame({'key': key, 'val': vals}) + result = df.groupby('key').rank(method=ties_method, + ascending=ascending, pct=pct) + + exp_df = DataFrame(exp * len(grps), columns=['val']) + tm.assert_frame_equal(result, exp_df) + + +@pytest.mark.parametrize("grps", [ + ['qux'], ['qux', 'quux']]) +@pytest.mark.parametrize("vals", [ + [-np.inf, -np.inf, np.nan, 1., np.nan, np.inf, np.inf], +]) +@pytest.mark.parametrize("ties_method,ascending,na_option,exp", [ + ('average', True, 'keep', [1.5, 1.5, np.nan, 3, np.nan, 4.5, 4.5]), + ('average', True, 'top', [3.5, 3.5, 1.5, 5., 1.5, 6.5, 6.5]), + ('average', True, 'bottom', [1.5, 1.5, 6.5, 3., 6.5, 4.5, 4.5]), + ('average', False, 'keep', [4.5, 4.5, np.nan, 3, np.nan, 1.5, 1.5]), + ('average', False, 'top', [6.5, 6.5, 1.5, 5., 1.5, 3.5, 3.5]), + ('average', False, 'bottom', [4.5, 4.5, 6.5, 3., 6.5, 1.5, 1.5]), + ('min', True, 'keep', [1., 1., np.nan, 3., np.nan, 4., 4.]), + ('min', True, 'top', [3., 3., 1., 5., 1., 6., 6.]), + ('min', True, 'bottom', [1., 1., 6., 3., 6., 4., 4.]), + ('min', False, 'keep', [4., 4., np.nan, 3., np.nan, 1., 1.]), + ('min', False, 'top', [6., 6., 1., 5., 1., 3., 3.]), + ('min', False, 'bottom', [4., 4., 6., 3., 6., 1., 1.]), + ('max', True, 'keep', [2., 2., np.nan, 3., np.nan, 5., 5.]), + ('max', True, 'top', [4., 4., 2., 5., 2., 7., 7.]), + ('max', True, 'bottom', [2., 2., 7., 3., 7., 5., 5.]), + ('max', False, 'keep', [5., 5., np.nan, 3., np.nan, 2., 2.]), + ('max', False, 'top', [7., 7., 2., 5., 2., 4., 4.]), + ('max', False, 'bottom', [5., 5., 7., 3., 7., 2., 2.]), + ('first', True, 'keep', [1., 2., np.nan, 3., np.nan, 4., 5.]), + ('first', True, 'top', [3., 4., 1., 5., 2., 6., 7.]), + ('first', True, 'bottom', [1., 2., 6., 3., 7., 4., 5.]), + ('first', False, 'keep', [4., 5., np.nan, 3., np.nan, 1., 2.]), + ('first', False, 'top', [6., 7., 1., 5., 2., 3., 4.]), + ('first', False, 'bottom', [4., 5., 
6., 3., 7., 1., 2.]), + ('dense', True, 'keep', [1., 1., np.nan, 2., np.nan, 3., 3.]), + ('dense', True, 'top', [2., 2., 1., 3., 1., 4., 4.]), + ('dense', True, 'bottom', [1., 1., 4., 2., 4., 3., 3.]), + ('dense', False, 'keep', [3., 3., np.nan, 2., np.nan, 1., 1.]), + ('dense', False, 'top', [4., 4., 1., 3., 1., 2., 2.]), + ('dense', False, 'bottom', [3., 3., 4., 2., 4., 1., 1.]) +]) +def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp): + # GH 20561 + key = np.repeat(grps, len(vals)) + vals = vals * len(grps) + df = DataFrame({'key': key, 'val': vals}) + result = df.groupby('key').rank(method=ties_method, + ascending=ascending, + na_option=na_option) + exp_df = DataFrame(exp * len(grps), columns=['val']) + tm.assert_frame_equal(result, exp_df) + + +@pytest.mark.parametrize("grps", [ + ['qux'], ['qux', 'quux']]) +@pytest.mark.parametrize("vals", [ + [2, 2, np.nan, 8, 2, 6, np.nan, np.nan], # floats + [pd.Timestamp('2018-01-02'), pd.Timestamp('2018-01-02'), np.nan, + pd.Timestamp('2018-01-08'), pd.Timestamp('2018-01-02'), + pd.Timestamp('2018-01-06'), np.nan, np.nan] +]) +@pytest.mark.parametrize("ties_method,ascending,na_option,pct,exp", [ + ('average', True, 'keep', False, + [2., 2., np.nan, 5., 2., 4., np.nan, np.nan]), + ('average', True, 'keep', True, + [0.4, 0.4, np.nan, 1.0, 0.4, 0.8, np.nan, np.nan]), + ('average', False, 'keep', False, + [4., 4., np.nan, 1., 4., 2., np.nan, np.nan]), + ('average', False, 'keep', True, + [.8, 0.8, np.nan, 0.2, 0.8, 0.4, np.nan, np.nan]), + ('min', True, 'keep', False, + [1., 1., np.nan, 5., 1., 4., np.nan, np.nan]), + ('min', True, 'keep', True, + [0.2, 0.2, np.nan, 1.0, 0.2, 0.8, np.nan, np.nan]), + ('min', False, 'keep', False, + [3., 3., np.nan, 1., 3., 2., np.nan, np.nan]), + ('min', False, 'keep', True, + [.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan]), + ('max', True, 'keep', False, + [3., 3., np.nan, 5., 3., 4., np.nan, np.nan]), + ('max', True, 'keep', True, + [0.6, 0.6, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan]), + ('max', False, 'keep', False, + [5., 5., np.nan, 1., 5., 2., np.nan, np.nan]), + ('max', False, 'keep', True, + [1., 1., np.nan, 0.2, 1., 0.4, np.nan, np.nan]), + ('first', True, 'keep', False, + [1., 2., np.nan, 5., 3., 4., np.nan, np.nan]), + ('first', True, 'keep', True, + [0.2, 0.4, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan]), + ('first', False, 'keep', False, + [3., 4., np.nan, 1., 5., 2., np.nan, np.nan]), + ('first', False, 'keep', True, + [.6, 0.8, np.nan, 0.2, 1., 0.4, np.nan, np.nan]), + ('dense', True, 'keep', False, + [1., 1., np.nan, 3., 1., 2., np.nan, np.nan]), + ('dense', True, 'keep', True, + [0.2, 0.2, np.nan, 0.6, 0.2, 0.4, np.nan, np.nan]), + ('dense', False, 'keep', False, + [3., 3., np.nan, 1., 3., 2., np.nan, np.nan]), + ('dense', False, 'keep', True, + [.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan]), + ('average', True, 'no_na', False, [2., 2., 7., 5., 2., 4., 7., 7.]), + ('average', True, 'no_na', True, + [0.25, 0.25, 0.875, 0.625, 0.25, 0.5, 0.875, 0.875]), + ('average', False, 'no_na', False, [4., 4., 7., 1., 4., 2., 7., 7.]), + ('average', False, 'no_na', True, + [0.5, 0.5, 0.875, 0.125, 0.5, 0.25, 0.875, 0.875]), + ('min', True, 'no_na', False, [1., 1., 6., 5., 1., 4., 6., 6.]), + ('min', True, 'no_na', True, + [0.125, 0.125, 0.75, 0.625, 0.125, 0.5, 0.75, 0.75]), + ('min', False, 'no_na', False, [3., 3., 6., 1., 3., 2., 6., 6.]), + ('min', False, 'no_na', True, + [0.375, 0.375, 0.75, 0.125, 0.375, 0.25, 0.75, 0.75]), + ('max', True, 'no_na', False, [3., 3., 8., 5., 3., 4., 8., 8.]), 
+ ('max', True, 'no_na', True, + [0.375, 0.375, 1., 0.625, 0.375, 0.5, 1., 1.]), + ('max', False, 'no_na', False, [5., 5., 8., 1., 5., 2., 8., 8.]), + ('max', False, 'no_na', True, + [0.625, 0.625, 1., 0.125, 0.625, 0.25, 1., 1.]), + ('first', True, 'no_na', False, [1., 2., 6., 5., 3., 4., 7., 8.]), + ('first', True, 'no_na', True, + [0.125, 0.25, 0.75, 0.625, 0.375, 0.5, 0.875, 1.]), + ('first', False, 'no_na', False, [3., 4., 6., 1., 5., 2., 7., 8.]), + ('first', False, 'no_na', True, + [0.375, 0.5, 0.75, 0.125, 0.625, 0.25, 0.875, 1.]), + ('dense', True, 'no_na', False, [1., 1., 4., 3., 1., 2., 4., 4.]), + ('dense', True, 'no_na', True, + [0.125, 0.125, 0.5, 0.375, 0.125, 0.25, 0.5, 0.5]), + ('dense', False, 'no_na', False, [3., 3., 4., 1., 3., 2., 4., 4.]), + ('dense', False, 'no_na', True, + [0.375, 0.375, 0.5, 0.125, 0.375, 0.25, 0.5, 0.5]) +]) +def test_rank_args_missing(grps, vals, ties_method, ascending, + na_option, pct, exp): + key = np.repeat(grps, len(vals)) + vals = vals * len(grps) + df = DataFrame({'key': key, 'val': vals}) + result = df.groupby('key').rank(method=ties_method, + ascending=ascending, + na_option=na_option, pct=pct) + + exp_df = DataFrame(exp * len(grps), columns=['val']) + tm.assert_frame_equal(result, exp_df) + + +@pytest.mark.parametrize("pct,exp", [ + (False, [3., 3., 3., 3., 3.]), + (True, [.6, .6, .6, .6, .6])]) +def test_rank_resets_each_group(pct, exp): + df = DataFrame( + {'key': ['a', 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'b'], + 'val': [1] * 10} + ) + result = df.groupby('key').rank(pct=pct) + exp_df = DataFrame(exp * 2, columns=['val']) + tm.assert_frame_equal(result, exp_df) + + +def test_rank_avg_even_vals(): + df = DataFrame({'key': ['a'] * 4, 'val': [1] * 4}) + result = df.groupby('key').rank() + exp_df = DataFrame([2.5, 2.5, 2.5, 2.5], columns=['val']) + tm.assert_frame_equal(result, exp_df) + + +@pytest.mark.parametrize("ties_method", [ + 'average', 'min', 'max', 'first', 'dense']) +@pytest.mark.parametrize("ascending", [True, False]) +@pytest.mark.parametrize("na_option", ["keep", "top", "bottom"]) +@pytest.mark.parametrize("pct", [True, False]) +@pytest.mark.parametrize("vals", [ + ['bar', 'bar', 'foo', 'bar', 'baz'], + ['bar', np.nan, 'foo', np.nan, 'baz'] +]) +def test_rank_object_raises(ties_method, ascending, na_option, + pct, vals): + df = DataFrame({'key': ['foo'] * 5, 'val': vals}) + with tm.assert_raises_regex(TypeError, "not callable"): + df.groupby('key').rank(method=ties_method, + ascending=ascending, + na_option=na_option, pct=pct) diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index 390b99d0fab1c..626057c1ea760 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -10,728 +10,758 @@ _ensure_platform_int, is_timedelta64_dtype) from pandas.compat import StringIO from pandas._libs import groupby -from .common import MixIn, assert_fp_equal from pandas.util.testing import assert_frame_equal, assert_series_equal from pandas.core.groupby.groupby import DataError from pandas.core.config import option_context -class TestGroupBy(MixIn): - - def test_transform(self): - data = Series(np.arange(9) // 3, index=np.arange(9)) - - index = np.arange(9) - np.random.shuffle(index) - data = data.reindex(index) - - grouped = data.groupby(lambda x: x // 3) - - transformed = grouped.transform(lambda x: x * x.sum()) - assert transformed[7] == 12 - - # GH 8046 - # make sure that we preserve the input order - - df = DataFrame( - np.arange(6, 
dtype='int64').reshape( - 3, 2), columns=["a", "b"], index=[0, 2, 1]) - key = [0, 0, 1] - expected = df.sort_index().groupby(key).transform( - lambda x: x - x.mean()).groupby(key).mean() - result = df.groupby(key).transform(lambda x: x - x.mean()).groupby( - key).mean() - assert_frame_equal(result, expected) - - def demean(arr): - return arr - arr.mean() - - people = DataFrame(np.random.randn(5, 5), - columns=['a', 'b', 'c', 'd', 'e'], - index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis']) - key = ['one', 'two', 'one', 'two', 'one'] - result = people.groupby(key).transform(demean).groupby(key).mean() - expected = people.groupby(key).apply(demean).groupby(key).mean() - assert_frame_equal(result, expected) - - # GH 8430 - df = tm.makeTimeDataFrame() - g = df.groupby(pd.Grouper(freq='M')) - g.transform(lambda x: x - 1) - - # GH 9700 - df = DataFrame({'a': range(5, 10), 'b': range(5)}) - result = df.groupby('a').transform(max) - expected = DataFrame({'b': range(5)}) - tm.assert_frame_equal(result, expected) - - def test_transform_fast(self): - - df = DataFrame({'id': np.arange(100000) / 3, - 'val': np.random.randn(100000)}) - - grp = df.groupby('id')['val'] - - values = np.repeat(grp.mean().values, - _ensure_platform_int(grp.count().values)) - expected = pd.Series(values, index=df.index, name='val') - - result = grp.transform(np.mean) - assert_series_equal(result, expected) - - result = grp.transform('mean') - assert_series_equal(result, expected) - - # GH 12737 - df = pd.DataFrame({'grouping': [0, 1, 1, 3], 'f': [1.1, 2.1, 3.1, 4.5], - 'd': pd.date_range('2014-1-1', '2014-1-4'), - 'i': [1, 2, 3, 4]}, - columns=['grouping', 'f', 'i', 'd']) - result = df.groupby('grouping').transform('first') - - dates = [pd.Timestamp('2014-1-1'), pd.Timestamp('2014-1-2'), - pd.Timestamp('2014-1-2'), pd.Timestamp('2014-1-4')] - expected = pd.DataFrame({'f': [1.1, 2.1, 2.1, 4.5], - 'd': dates, - 'i': [1, 2, 2, 4]}, - columns=['f', 'i', 'd']) - assert_frame_equal(result, expected) - - # selection - result = df.groupby('grouping')[['f', 'i']].transform('first') - expected = expected[['f', 'i']] - assert_frame_equal(result, expected) - - # dup columns - df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['g', 'a', 'a']) - result = df.groupby('g').transform('first') - expected = df.drop('g', axis=1) - assert_frame_equal(result, expected) - - def test_transform_broadcast(self): - grouped = self.ts.groupby(lambda x: x.month) - result = grouped.transform(np.mean) - - tm.assert_index_equal(result.index, self.ts.index) - for _, gp in grouped: - assert_fp_equal(result.reindex(gp.index), gp.mean()) - - grouped = self.tsframe.groupby(lambda x: x.month) - result = grouped.transform(np.mean) - tm.assert_index_equal(result.index, self.tsframe.index) - for _, gp in grouped: - agged = gp.mean() - res = result.reindex(gp.index) - for col in self.tsframe: - assert_fp_equal(res[col], agged[col]) - - # group columns - grouped = self.tsframe.groupby({'A': 0, 'B': 0, 'C': 1, 'D': 1}, - axis=1) - result = grouped.transform(np.mean) - tm.assert_index_equal(result.index, self.tsframe.index) - tm.assert_index_equal(result.columns, self.tsframe.columns) - for _, gp in grouped: - agged = gp.mean(1) - res = result.reindex(columns=gp.columns) - for idx in gp.index: - assert_fp_equal(res.xs(idx), agged[idx]) - - def test_transform_axis(self): - - # make sure that we are setting the axes - # correctly when on axis=0 or 1 - # in the presence of a non-monotonic indexer - # GH12713 - - base = self.tsframe.iloc[0:5] - r = len(base.index) - c = 
len(base.columns) - tso = DataFrame(np.random.randn(r, c), - index=base.index, - columns=base.columns, - dtype='float64') - # monotonic - ts = tso - grouped = ts.groupby(lambda x: x.weekday()) - result = ts - grouped.transform('mean') - expected = grouped.apply(lambda x: x - x.mean()) - assert_frame_equal(result, expected) - - ts = ts.T - grouped = ts.groupby(lambda x: x.weekday(), axis=1) - result = ts - grouped.transform('mean') - expected = grouped.apply(lambda x: (x.T - x.mean(1)).T) - assert_frame_equal(result, expected) - - # non-monotonic - ts = tso.iloc[[1, 0] + list(range(2, len(base)))] - grouped = ts.groupby(lambda x: x.weekday()) - result = ts - grouped.transform('mean') - expected = grouped.apply(lambda x: x - x.mean()) - assert_frame_equal(result, expected) - - ts = ts.T - grouped = ts.groupby(lambda x: x.weekday(), axis=1) - result = ts - grouped.transform('mean') - expected = grouped.apply(lambda x: (x.T - x.mean(1)).T) - assert_frame_equal(result, expected) - - def test_transform_dtype(self): - # GH 9807 - # Check transform dtype output is preserved - df = DataFrame([[1, 3], [2, 3]]) - result = df.groupby(1).transform('mean') - expected = DataFrame([[1.5], [1.5]]) - assert_frame_equal(result, expected) - - def test_transform_bug(self): - # GH 5712 - # transforming on a datetime column - df = DataFrame(dict(A=Timestamp('20130101'), B=np.arange(5))) - result = df.groupby('A')['B'].transform( - lambda x: x.rank(ascending=False)) - expected = Series(np.arange(5, 0, step=-1), name='B') - assert_series_equal(result, expected) - - def test_transform_numeric_to_boolean(self): - # GH 16875 - # inconsistency in transforming boolean values - expected = pd.Series([True, True], name='A') - - df = pd.DataFrame({'A': [1.1, 2.2], 'B': [1, 2]}) - result = df.groupby('B').A.transform(lambda x: True) - assert_series_equal(result, expected) - - df = pd.DataFrame({'A': [1, 2], 'B': [1, 2]}) - result = df.groupby('B').A.transform(lambda x: True) - assert_series_equal(result, expected) - - def test_transform_datetime_to_timedelta(self): - # GH 15429 - # transforming a datetime to timedelta - df = DataFrame(dict(A=Timestamp('20130101'), B=np.arange(5))) - expected = pd.Series([ - Timestamp('20130101') - Timestamp('20130101')] * 5, name='A') - - # this does date math without changing result type in transform - base_time = df['A'][0] - result = df.groupby('A')['A'].transform( - lambda x: x.max() - x.min() + base_time) - base_time - assert_series_equal(result, expected) - - # this does date math and causes the transform to return timedelta - result = df.groupby('A')['A'].transform(lambda x: x.max() - x.min()) - assert_series_equal(result, expected) - - def test_transform_datetime_to_numeric(self): - # GH 10972 - # convert dt to float - df = DataFrame({ - 'a': 1, 'b': date_range('2015-01-01', periods=2, freq='D')}) - result = df.groupby('a').b.transform( - lambda x: x.dt.dayofweek - x.dt.dayofweek.mean()) - - expected = Series([-0.5, 0.5], name='b') - assert_series_equal(result, expected) - - # convert dt to int - df = DataFrame({ - 'a': 1, 'b': date_range('2015-01-01', periods=2, freq='D')}) - result = df.groupby('a').b.transform( - lambda x: x.dt.dayofweek - x.dt.dayofweek.min()) - - expected = Series([0, 1], name='b') - assert_series_equal(result, expected) - - def test_transform_casting(self): - # 13046 - data = """ - idx A ID3 DATETIME - 0 B-028 b76cd912ff "2014-10-08 13:43:27" - 1 B-054 4a57ed0b02 "2014-10-08 14:26:19" - 2 B-076 1a682034f8 "2014-10-08 14:29:01" - 3 B-023 b76cd912ff "2014-10-08 
18:39:34" - 4 B-023 f88g8d7sds "2014-10-08 18:40:18" - 5 B-033 b76cd912ff "2014-10-08 18:44:30" - 6 B-032 b76cd912ff "2014-10-08 18:46:00" - 7 B-037 b76cd912ff "2014-10-08 18:52:15" - 8 B-046 db959faf02 "2014-10-08 18:59:59" - 9 B-053 b76cd912ff "2014-10-08 19:17:48" - 10 B-065 b76cd912ff "2014-10-08 19:21:38" - """ - df = pd.read_csv(StringIO(data), sep=r'\s+', - index_col=[0], parse_dates=['DATETIME']) - - result = df.groupby('ID3')['DATETIME'].transform(lambda x: x.diff()) - assert is_timedelta64_dtype(result.dtype) - - result = df[['ID3', 'DATETIME']].groupby('ID3').transform( - lambda x: x.diff()) - assert is_timedelta64_dtype(result.DATETIME.dtype) - - def test_transform_multiple(self): - grouped = self.ts.groupby([lambda x: x.year, lambda x: x.month]) - - grouped.transform(lambda x: x * 2) - grouped.transform(np.mean) - - def test_dispatch_transform(self): - df = self.tsframe[::5].reindex(self.tsframe.index) - - grouped = df.groupby(lambda x: x.month) - - filled = grouped.fillna(method='pad') - fillit = lambda x: x.fillna(method='pad') - expected = df.groupby(lambda x: x.month).transform(fillit) - assert_frame_equal(filled, expected) - - def test_transform_select_columns(self): - f = lambda x: x.mean() - result = self.df.groupby('A')['C', 'D'].transform(f) - - selection = self.df[['C', 'D']] - expected = selection.groupby(self.df['A']).transform(f) - - assert_frame_equal(result, expected) - - def test_transform_exclude_nuisance(self): - - # this also tests orderings in transform between - # series/frame to make sure it's consistent - expected = {} - grouped = self.df.groupby('A') - expected['C'] = grouped['C'].transform(np.mean) - expected['D'] = grouped['D'].transform(np.mean) - expected = DataFrame(expected) - result = self.df.groupby('A').transform(np.mean) - - assert_frame_equal(result, expected) - - def test_transform_function_aliases(self): - result = self.df.groupby('A').transform('mean') - expected = self.df.groupby('A').transform(np.mean) - assert_frame_equal(result, expected) - - result = self.df.groupby('A')['C'].transform('mean') - expected = self.df.groupby('A')['C'].transform(np.mean) - assert_series_equal(result, expected) - - def test_series_fast_transform_date(self): - # GH 13191 - df = pd.DataFrame({'grouping': [np.nan, 1, 1, 3], - 'd': pd.date_range('2014-1-1', '2014-1-4')}) - result = df.groupby('grouping')['d'].transform('first') - dates = [pd.NaT, pd.Timestamp('2014-1-2'), pd.Timestamp('2014-1-2'), - pd.Timestamp('2014-1-4')] - expected = pd.Series(dates, name='d') - assert_series_equal(result, expected) - - def test_transform_length(self): - # GH 9697 - df = pd.DataFrame({'col1': [1, 1, 2, 2], 'col2': [1, 2, 3, np.nan]}) - expected = pd.Series([3.0] * 4) - - def nsum(x): - return np.nansum(x) - - results = [df.groupby('col1').transform(sum)['col2'], - df.groupby('col1')['col2'].transform(sum), - df.groupby('col1').transform(nsum)['col2'], - df.groupby('col1')['col2'].transform(nsum)] - for result in results: - assert_series_equal(result, expected, check_names=False) - - def test_transform_coercion(self): - - # 14457 - # when we are transforming be sure to not coerce - # via assignment - df = pd.DataFrame(dict(A=['a', 'a'], B=[0, 1])) - g = df.groupby('A') - - expected = g.transform(np.mean) - result = g.transform(lambda x: np.mean(x)) - assert_frame_equal(result, expected) - - def test_groupby_transform_with_int(self): - - # GH 3740, make sure that we might upcast on item-by-item transform - - # floats - df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=Series(1, 
dtype='float64'), - C=Series( - [1, 2, 3, 1, 2, 3], dtype='float64'), D='foo')) - with np.errstate(all='ignore'): - result = df.groupby('A').transform( - lambda x: (x - x.mean()) / x.std()) - expected = DataFrame(dict(B=np.nan, C=Series( - [-1, 0, 1, -1, 0, 1], dtype='float64'))) - assert_frame_equal(result, expected) - - # int case - df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=1, - C=[1, 2, 3, 1, 2, 3], D='foo')) - with np.errstate(all='ignore'): - result = df.groupby('A').transform( - lambda x: (x - x.mean()) / x.std()) - expected = DataFrame(dict(B=np.nan, C=[-1, 0, 1, -1, 0, 1])) - assert_frame_equal(result, expected) - - # int that needs float conversion - s = Series([2, 3, 4, 10, 5, -1]) - df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=1, C=s, D='foo')) - with np.errstate(all='ignore'): - result = df.groupby('A').transform( - lambda x: (x - x.mean()) / x.std()) - - s1 = s.iloc[0:3] - s1 = (s1 - s1.mean()) / s1.std() - s2 = s.iloc[3:6] - s2 = (s2 - s2.mean()) / s2.std() - expected = DataFrame(dict(B=np.nan, C=concat([s1, s2]))) - assert_frame_equal(result, expected) - - # int downcasting - result = df.groupby('A').transform(lambda x: x * 2 / 2) - expected = DataFrame(dict(B=1, C=[2, 3, 4, 10, 5, -1])) - assert_frame_equal(result, expected) - - def test_groupby_transform_with_nan_group(self): - # GH 9941 - df = pd.DataFrame({'a': range(10), - 'b': [1, 1, 2, 3, np.nan, 4, 4, 5, 5, 5]}) - result = df.groupby(df.b)['a'].transform(max) - expected = pd.Series([1., 1., 2., 3., np.nan, 6., 6., 9., 9., 9.], - name='a') - assert_series_equal(result, expected) - - def test_transform_mixed_type(self): - index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3] - ]) - df = DataFrame({'d': [1., 1., 1., 2., 2., 2.], - 'c': np.tile(['a', 'b', 'c'], 2), - 'v': np.arange(1., 7.)}, index=index) - - def f(group): - group['g'] = group['d'] * 2 - return group[:1] - - grouped = df.groupby('c') - result = grouped.apply(f) - - assert result['d'].dtype == np.float64 - - # this is by definition a mutating operation! 
-        with option_context('mode.chained_assignment', None):
-            for key, group in grouped:
-                res = f(group)
-                assert_frame_equal(res, result.loc[key])
-
-    def test_cython_group_transform_algos(self):
-        # GH 4095
-        dtypes = [np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint32,
-                  np.uint64, np.float32, np.float64]
-
-        ops = [(groupby.group_cumprod_float64, np.cumproduct, [np.float64]),
-               (groupby.group_cumsum, np.cumsum, dtypes)]
-
-        is_datetimelike = False
-        for pd_op, np_op, dtypes in ops:
-            for dtype in dtypes:
-                data = np.array([[1], [2], [3], [4]], dtype=dtype)
-                ans = np.zeros_like(data)
-                labels = np.array([0, 0, 0, 0], dtype=np.int64)
-                pd_op(ans, data, labels, is_datetimelike)
-                tm.assert_numpy_array_equal(np_op(data), ans[:, 0],
-                                            check_dtype=False)
-
-        # with nans
-        labels = np.array([0, 0, 0, 0, 0], dtype=np.int64)
-
-        data = np.array([[1], [2], [3], [np.nan], [4]], dtype='float64')
-        actual = np.zeros_like(data)
-        actual.fill(np.nan)
-        groupby.group_cumprod_float64(actual, data, labels, is_datetimelike)
-        expected = np.array([1, 2, 6, np.nan, 24], dtype='float64')
-        tm.assert_numpy_array_equal(actual[:, 0], expected)
-
-        actual = np.zeros_like(data)
-        actual.fill(np.nan)
-        groupby.group_cumsum(actual, data, labels, is_datetimelike)
-        expected = np.array([1, 3, 6, np.nan, 10], dtype='float64')
-        tm.assert_numpy_array_equal(actual[:, 0], expected)
-
-        # timedelta
-        is_datetimelike = True
-        data = np.array([np.timedelta64(1, 'ns')] * 5, dtype='m8[ns]')[:, None]
-        actual = np.zeros_like(data, dtype='int64')
-        groupby.group_cumsum(actual, data.view('int64'), labels,
-                             is_datetimelike)
-        expected = np.array([np.timedelta64(1, 'ns'), np.timedelta64(
-            2, 'ns'), np.timedelta64(3, 'ns'), np.timedelta64(4, 'ns'),
-            np.timedelta64(5, 'ns')])
-        tm.assert_numpy_array_equal(actual[:, 0].view('m8[ns]'), expected)
-
-    @pytest.mark.parametrize(
-        "op, args, targop",
-        [('cumprod', (), lambda x: x.cumprod()),
-         ('cumsum', (), lambda x: x.cumsum()),
-         ('shift', (-1, ), lambda x: x.shift(-1)),
-         ('shift', (1, ), lambda x: x.shift())])
-    def test_cython_transform_series(self, op, args, targop):
-        # GH 4095
-        s = Series(np.random.randn(1000))
-        s_missing = s.copy()
-        s_missing.iloc[2:10] = np.nan
-        labels = np.random.randint(0, 50, size=1000).astype(float)
-
-        # series
-        for data in [s, s_missing]:
-            # print(data.head())
-            expected = data.groupby(labels).transform(targop)
-
-            tm.assert_series_equal(
+def assert_fp_equal(a, b):
+    assert (np.abs(a - b) < 1e-12).all()
+
+
+def test_transform():
+    data = Series(np.arange(9) // 3, index=np.arange(9))
+
+    index = np.arange(9)
+    np.random.shuffle(index)
+    data = data.reindex(index)
+
+    grouped = data.groupby(lambda x: x // 3)
+
+    transformed = grouped.transform(lambda x: x * x.sum())
+    assert transformed[7] == 12
+
+    # GH 8046
+    # make sure that we preserve the input order
+
+    df = DataFrame(
+        np.arange(6, dtype='int64').reshape(
+            3, 2), columns=["a", "b"], index=[0, 2, 1])
+    key = [0, 0, 1]
+    expected = df.sort_index().groupby(key).transform(
+        lambda x: x - x.mean()).groupby(key).mean()
+    result = df.groupby(key).transform(lambda x: x - x.mean()).groupby(
+        key).mean()
+    assert_frame_equal(result, expected)
+
+    def demean(arr):
+        return arr - arr.mean()
+
+    people = DataFrame(np.random.randn(5, 5),
+                       columns=['a', 'b', 'c', 'd', 'e'],
+                       index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
+    key = ['one', 'two', 'one', 'two', 'one']
+    result = people.groupby(key).transform(demean).groupby(key).mean()
+    expected = people.groupby(key).apply(demean).groupby(key).mean()
+    assert_frame_equal(result, expected)
+
+    # GH 8430
+    df = tm.makeTimeDataFrame()
+    g = df.groupby(pd.Grouper(freq='M'))
+    g.transform(lambda x: x - 1)
+
+    # GH 9700
+    df = DataFrame({'a': range(5, 10), 'b': range(5)})
+    result = df.groupby('a').transform(max)
+    expected = DataFrame({'b': range(5)})
+    tm.assert_frame_equal(result, expected)
+
+
+def test_transform_fast():
+
+    df = DataFrame({'id': np.arange(100000) / 3,
+                    'val': np.random.randn(100000)})
+
+    grp = df.groupby('id')['val']
+
+    values = np.repeat(grp.mean().values,
+                       _ensure_platform_int(grp.count().values))
+    expected = pd.Series(values, index=df.index, name='val')
+
+    result = grp.transform(np.mean)
+    assert_series_equal(result, expected)
+
+    result = grp.transform('mean')
+    assert_series_equal(result, expected)
+
+    # GH 12737
+    df = pd.DataFrame({'grouping': [0, 1, 1, 3], 'f': [1.1, 2.1, 3.1, 4.5],
+                       'd': pd.date_range('2014-1-1', '2014-1-4'),
+                       'i': [1, 2, 3, 4]},
+                      columns=['grouping', 'f', 'i', 'd'])
+    result = df.groupby('grouping').transform('first')
+
+    dates = [pd.Timestamp('2014-1-1'), pd.Timestamp('2014-1-2'),
+             pd.Timestamp('2014-1-2'), pd.Timestamp('2014-1-4')]
+    expected = pd.DataFrame({'f': [1.1, 2.1, 2.1, 4.5],
+                             'd': dates,
+                             'i': [1, 2, 2, 4]},
+                            columns=['f', 'i', 'd'])
+    assert_frame_equal(result, expected)
+
+    # selection
+    result = df.groupby('grouping')[['f', 'i']].transform('first')
+    expected = expected[['f', 'i']]
+    assert_frame_equal(result, expected)
+
+    # dup columns
+    df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['g', 'a', 'a'])
+    result = df.groupby('g').transform('first')
+    expected = df.drop('g', axis=1)
+    assert_frame_equal(result, expected)
+
+
+def test_transform_broadcast(tsframe, ts):
+    grouped = ts.groupby(lambda x: x.month)
+    result = grouped.transform(np.mean)
+
+    tm.assert_index_equal(result.index, ts.index)
+    for _, gp in grouped:
+        assert_fp_equal(result.reindex(gp.index), gp.mean())
+
+    grouped = tsframe.groupby(lambda x: x.month)
+    result = grouped.transform(np.mean)
+    tm.assert_index_equal(result.index, tsframe.index)
+    for _, gp in grouped:
+        agged = gp.mean()
+        res = result.reindex(gp.index)
+        for col in tsframe:
+            assert_fp_equal(res[col], agged[col])
+
+    # group columns
+    grouped = tsframe.groupby({'A': 0, 'B': 0, 'C': 1, 'D': 1},
+                              axis=1)
+    result = grouped.transform(np.mean)
+    tm.assert_index_equal(result.index, tsframe.index)
+    tm.assert_index_equal(result.columns, tsframe.columns)
+    for _, gp in grouped:
+        agged = gp.mean(1)
+        res = result.reindex(columns=gp.columns)
+        for idx in gp.index:
+            assert_fp_equal(res.xs(idx), agged[idx])
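The broadcast test above relies on transform returning a result indexed like its input, with each group's aggregate repeated across that group's rows. A minimal standalone sketch of that contract (not part of the patch; the frame is made up for illustration):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'key': ['a', 'a', 'b'], 'val': [1.0, 3.0, 10.0]})
    # transform broadcasts each per-group mean back onto the group's rows
    out = df.groupby('key')['val'].transform(np.mean)
    assert out.tolist() == [2.0, 2.0, 10.0]  # 'a' -> 2.0, 'b' -> 10.0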
+
+
+def test_transform_axis(tsframe):
+
+    # make sure that we are setting the axes
+    # correctly when on axis=0 or 1
+    # in the presence of a non-monotonic indexer
+    # GH12713
+
+    base = tsframe.iloc[0:5]
+    r = len(base.index)
+    c = len(base.columns)
+    tso = DataFrame(np.random.randn(r, c),
+                    index=base.index,
+                    columns=base.columns,
+                    dtype='float64')
+    # monotonic
+    ts = tso
+    grouped = ts.groupby(lambda x: x.weekday())
+    result = ts - grouped.transform('mean')
+    expected = grouped.apply(lambda x: x - x.mean())
+    assert_frame_equal(result, expected)
+
+    ts = ts.T
+    grouped = ts.groupby(lambda x: x.weekday(), axis=1)
+    result = ts - grouped.transform('mean')
+    expected = grouped.apply(lambda x: (x.T - x.mean(1)).T)
+    assert_frame_equal(result, expected)
+
+    # non-monotonic
+    ts = tso.iloc[[1, 0] + list(range(2, len(base)))]
+    grouped = ts.groupby(lambda x: x.weekday())
+    result = ts - grouped.transform('mean')
+    expected = grouped.apply(lambda x: x - x.mean())
+    assert_frame_equal(result, expected)
+
+    ts = ts.T
+    grouped = ts.groupby(lambda x: x.weekday(), axis=1)
+    result = ts - grouped.transform('mean')
+    expected = grouped.apply(lambda x: (x.T - x.mean(1)).T)
+    assert_frame_equal(result, expected)
+
+
+def test_transform_dtype():
+    # GH 9807
+    # Check transform dtype output is preserved
+    df = DataFrame([[1, 3], [2, 3]])
+    result = df.groupby(1).transform('mean')
+    expected = DataFrame([[1.5], [1.5]])
+    assert_frame_equal(result, expected)
+
+
+def test_transform_bug():
+    # GH 5712
+    # transforming on a datetime column
+    df = DataFrame(dict(A=Timestamp('20130101'), B=np.arange(5)))
+    result = df.groupby('A')['B'].transform(
+        lambda x: x.rank(ascending=False))
+    expected = Series(np.arange(5, 0, step=-1), name='B')
+    assert_series_equal(result, expected)
+
+
+def test_transform_numeric_to_boolean():
+    # GH 16875
+    # inconsistency in transforming boolean values
+    expected = pd.Series([True, True], name='A')
+
+    df = pd.DataFrame({'A': [1.1, 2.2], 'B': [1, 2]})
+    result = df.groupby('B').A.transform(lambda x: True)
+    assert_series_equal(result, expected)
+
+    df = pd.DataFrame({'A': [1, 2], 'B': [1, 2]})
+    result = df.groupby('B').A.transform(lambda x: True)
+    assert_series_equal(result, expected)
+
+
+def test_transform_datetime_to_timedelta():
+    # GH 15429
+    # transforming a datetime to timedelta
+    df = DataFrame(dict(A=Timestamp('20130101'), B=np.arange(5)))
+    expected = pd.Series([
+        Timestamp('20130101') - Timestamp('20130101')] * 5, name='A')
+
+    # this does date math without changing result type in transform
+    base_time = df['A'][0]
+    result = df.groupby('A')['A'].transform(
+        lambda x: x.max() - x.min() + base_time) - base_time
+    assert_series_equal(result, expected)
+
+    # this does date math and causes the transform to return timedelta
+    result = df.groupby('A')['A'].transform(lambda x: x.max() - x.min())
+    assert_series_equal(result, expected)
+
+
+def test_transform_datetime_to_numeric():
+    # GH 10972
+    # convert dt to float
+    df = DataFrame({
+        'a': 1, 'b': date_range('2015-01-01', periods=2, freq='D')})
+    result = df.groupby('a').b.transform(
+        lambda x: x.dt.dayofweek - x.dt.dayofweek.mean())
+
+    expected = Series([-0.5, 0.5], name='b')
+    assert_series_equal(result, expected)
+
+    # convert dt to int
+    df = DataFrame({
+        'a': 1, 'b': date_range('2015-01-01', periods=2, freq='D')})
+    result = df.groupby('a').b.transform(
+        lambda x: x.dt.dayofweek - x.dt.dayofweek.min())
+
+    expected = Series([0, 1], name='b')
+    assert_series_equal(result, expected)
+
+
+def test_transform_casting():
+    # 13046
+    data = """
+    idx     A         ID3              DATETIME
+    0   B-028  b76cd912ff "2014-10-08 13:43:27"
+    1   B-054  4a57ed0b02 "2014-10-08 14:26:19"
+    2   B-076  1a682034f8 "2014-10-08 14:29:01"
+    3   B-023  b76cd912ff "2014-10-08 18:39:34"
+    4   B-023  f88g8d7sds "2014-10-08 18:40:18"
+    5   B-033  b76cd912ff "2014-10-08 18:44:30"
+    6   B-032  b76cd912ff "2014-10-08 18:46:00"
+    7   B-037  b76cd912ff "2014-10-08 18:52:15"
+    8   B-046  db959faf02 "2014-10-08 18:59:59"
+    9   B-053  b76cd912ff "2014-10-08 19:17:48"
+    10  B-065  b76cd912ff "2014-10-08 19:21:38"
+    """
+    df = pd.read_csv(StringIO(data), sep=r'\s+',
+                     index_col=[0], parse_dates=['DATETIME'])
+
+    result = df.groupby('ID3')['DATETIME'].transform(lambda x: x.diff())
+    assert is_timedelta64_dtype(result.dtype)
+
+    result = df[['ID3', 'DATETIME']].groupby('ID3').transform(
+        lambda x: x.diff())
+    assert is_timedelta64_dtype(result.DATETIME.dtype)
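test_transform_casting above checks that transform takes its output dtype from what the UDF returns rather than from the input column: diffing datetimes within a group comes back as timedelta64. A standalone sketch of the same behaviour (hypothetical data, not the test's CSV):

    import pandas as pd
    from pandas.api.types import is_timedelta64_dtype

    df = pd.DataFrame({'g': ['x', 'x', 'y'],
                       't': pd.to_datetime(['2014-10-08 13:43:27',
                                            '2014-10-08 14:26:19',
                                            '2014-10-08 14:29:01'])})
    # datetime minus datetime yields timedelta64 within each group
    res = df.groupby('g')['t'].transform(lambda s: s.diff())
    assert is_timedelta64_dtype(res.dtype)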
+
+
+def test_transform_multiple(ts):
+    grouped = ts.groupby([lambda x: x.year, lambda x: x.month])
+
+    grouped.transform(lambda x: x * 2)
+    grouped.transform(np.mean)
+
+
+def test_dispatch_transform(tsframe):
+    df = tsframe[::5].reindex(tsframe.index)
+
+    grouped = df.groupby(lambda x: x.month)
+
+    filled = grouped.fillna(method='pad')
+    fillit = lambda x: x.fillna(method='pad')
+    expected = df.groupby(lambda x: x.month).transform(fillit)
+    assert_frame_equal(filled, expected)
+
+
+def test_transform_select_columns(df):
+    f = lambda x: x.mean()
+    result = df.groupby('A')['C', 'D'].transform(f)
+
+    selection = df[['C', 'D']]
+    expected = selection.groupby(df['A']).transform(f)
+
+    assert_frame_equal(result, expected)
+
+
+def test_transform_exclude_nuisance(df):
+
+    # this also tests orderings in transform between
+    # series/frame to make sure it's consistent
+    expected = {}
+    grouped = df.groupby('A')
+    expected['C'] = grouped['C'].transform(np.mean)
+    expected['D'] = grouped['D'].transform(np.mean)
+    expected = DataFrame(expected)
+    result = df.groupby('A').transform(np.mean)
+
+    assert_frame_equal(result, expected)
+
+
+def test_transform_function_aliases(df):
+    result = df.groupby('A').transform('mean')
+    expected = df.groupby('A').transform(np.mean)
+    assert_frame_equal(result, expected)
+
+    result = df.groupby('A')['C'].transform('mean')
+    expected = df.groupby('A')['C'].transform(np.mean)
+    assert_series_equal(result, expected)
+
+
+def test_series_fast_transform_date():
+    # GH 13191
+    df = pd.DataFrame({'grouping': [np.nan, 1, 1, 3],
+                       'd': pd.date_range('2014-1-1', '2014-1-4')})
+    result = df.groupby('grouping')['d'].transform('first')
+    dates = [pd.NaT, pd.Timestamp('2014-1-2'), pd.Timestamp('2014-1-2'),
+             pd.Timestamp('2014-1-4')]
+    expected = pd.Series(dates, name='d')
+    assert_series_equal(result, expected)
+
+
+def test_transform_length():
+    # GH 9697
+    df = pd.DataFrame({'col1': [1, 1, 2, 2], 'col2': [1, 2, 3, np.nan]})
+    expected = pd.Series([3.0] * 4)
+
+    def nsum(x):
+        return np.nansum(x)
+
+    results = [df.groupby('col1').transform(sum)['col2'],
+               df.groupby('col1')['col2'].transform(sum),
+               df.groupby('col1').transform(nsum)['col2'],
+               df.groupby('col1')['col2'].transform(nsum)]
+    for result in results:
+        assert_series_equal(result, expected, check_names=False)
+
+
+def test_transform_coercion():
+
+    # 14457
+    # when we are transforming be sure to not coerce
+    # via assignment
+    df = pd.DataFrame(dict(A=['a', 'a'], B=[0, 1]))
+    g = df.groupby('A')
+
+    expected = g.transform(np.mean)
+    result = g.transform(lambda x: np.mean(x))
+    assert_frame_equal(result, expected)
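test_groupby_transform_with_int below exercises item-by-item upcasting: a standardisation like (x - x.mean()) / x.std() is float-valued, so integer columns have to be promoted. A minimal sketch of the idea (hypothetical frame, not the test fixture):

    import pandas as pd

    df = pd.DataFrame({'A': [1, 1, 2, 2], 'C': [1, 2, 3, 5]})
    # the float-valued UDF forces the int column C up to float64
    res = df.groupby('A')['C'].transform(lambda x: (x - x.mean()) / x.std())
    assert res.dtype == 'float64'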
+
+
+def test_groupby_transform_with_int():
+
+    # GH 3740, make sure that we might upcast on item-by-item transform
+
+    # floats
+    df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=Series(1, dtype='float64'),
+                        C=Series(
+                            [1, 2, 3, 1, 2, 3], dtype='float64'), D='foo'))
+    with np.errstate(all='ignore'):
+        result = df.groupby('A').transform(
+            lambda x: (x - x.mean()) / x.std())
+    expected = DataFrame(dict(B=np.nan, C=Series(
+        [-1, 0, 1, -1, 0, 1], dtype='float64')))
+    assert_frame_equal(result, expected)
+
+    # int case
+    df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=1,
+                        C=[1, 2, 3, 1, 2, 3], D='foo'))
+    with np.errstate(all='ignore'):
+        result = df.groupby('A').transform(
+            lambda x: (x - x.mean()) / x.std())
+    expected = DataFrame(dict(B=np.nan, C=[-1, 0, 1, -1, 0, 1]))
+    assert_frame_equal(result, expected)
+
+    # int that needs float conversion
+    s = Series([2, 3, 4, 10, 5, -1])
+    df = DataFrame(dict(A=[1, 1, 1, 2, 2, 2], B=1, C=s, D='foo'))
+    with np.errstate(all='ignore'):
+        result = df.groupby('A').transform(
+            lambda x: (x - x.mean()) / x.std())
+
+    s1 = s.iloc[0:3]
+    s1 = (s1 - s1.mean()) / s1.std()
+    s2 = s.iloc[3:6]
+    s2 = (s2 - s2.mean()) / s2.std()
+    expected = DataFrame(dict(B=np.nan, C=concat([s1, s2])))
+    assert_frame_equal(result, expected)
+
+    # int downcasting
+    result = df.groupby('A').transform(lambda x: x * 2 / 2)
+    expected = DataFrame(dict(B=1, C=[2, 3, 4, 10, 5, -1]))
+    assert_frame_equal(result, expected)
+
+
+def test_groupby_transform_with_nan_group():
+    # GH 9941
+    df = pd.DataFrame({'a': range(10),
+                       'b': [1, 1, 2, 3, np.nan, 4, 4, 5, 5, 5]})
+    result = df.groupby(df.b)['a'].transform(max)
+    expected = pd.Series([1., 1., 2., 3., np.nan, 6., 6., 9., 9., 9.],
+                         name='a')
+    assert_series_equal(result, expected)
+
+
+def test_transform_mixed_type():
+    index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3]
+                                    ])
+    df = DataFrame({'d': [1., 1., 1., 2., 2., 2.],
+                    'c': np.tile(['a', 'b', 'c'], 2),
+                    'v': np.arange(1., 7.)}, index=index)
+
+    def f(group):
+        group['g'] = group['d'] * 2
+        return group[:1]
+
+    grouped = df.groupby('c')
+    result = grouped.apply(f)
+
+    assert result['d'].dtype == np.float64
+
+    # this is by definition a mutating operation!
+    with option_context('mode.chained_assignment', None):
+        for key, group in grouped:
+            res = f(group)
+            assert_frame_equal(res, result.loc[key])
+
+
+def test_cython_group_transform_algos():
+    # GH 4095
+    dtypes = [np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint32,
+              np.uint64, np.float32, np.float64]
+
+    ops = [(groupby.group_cumprod_float64, np.cumproduct, [np.float64]),
+           (groupby.group_cumsum, np.cumsum, dtypes)]
+
+    is_datetimelike = False
+    for pd_op, np_op, dtypes in ops:
+        for dtype in dtypes:
+            data = np.array([[1], [2], [3], [4]], dtype=dtype)
+            ans = np.zeros_like(data)
+            labels = np.array([0, 0, 0, 0], dtype=np.int64)
+            pd_op(ans, data, labels, is_datetimelike)
+            tm.assert_numpy_array_equal(np_op(data), ans[:, 0],
+                                        check_dtype=False)
+
+    # with nans
+    labels = np.array([0, 0, 0, 0, 0], dtype=np.int64)
+
+    data = np.array([[1], [2], [3], [np.nan], [4]], dtype='float64')
+    actual = np.zeros_like(data)
+    actual.fill(np.nan)
+    groupby.group_cumprod_float64(actual, data, labels, is_datetimelike)
+    expected = np.array([1, 2, 6, np.nan, 24], dtype='float64')
+    tm.assert_numpy_array_equal(actual[:, 0], expected)
+
+    actual = np.zeros_like(data)
+    actual.fill(np.nan)
+    groupby.group_cumsum(actual, data, labels, is_datetimelike)
+    expected = np.array([1, 3, 6, np.nan, 10], dtype='float64')
+    tm.assert_numpy_array_equal(actual[:, 0], expected)
+
+    # timedelta
+    is_datetimelike = True
+    data = np.array([np.timedelta64(1, 'ns')] * 5, dtype='m8[ns]')[:, None]
+    actual = np.zeros_like(data, dtype='int64')
+    groupby.group_cumsum(actual, data.view('int64'), labels,
+                         is_datetimelike)
+    expected = np.array([np.timedelta64(1, 'ns'), np.timedelta64(
+        2, 'ns'), np.timedelta64(3, 'ns'), np.timedelta64(4, 'ns'),
+        np.timedelta64(5, 'ns')])
+    tm.assert_numpy_array_equal(actual[:, 0].view('m8[ns]'), expected)
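The kernel checks above assert that the private group_cumsum/group_cumprod paths agree with the plain numpy cumulative ops. The same equivalence can be sketched through the public API when every row falls in one group (illustrative only; uses pd.testing, which exists in this pandas era):

    import numpy as np
    import pandas as pd

    s = pd.Series([1.0, 2.0, 3.0, 4.0])
    labels = np.zeros(len(s), dtype=np.int64)  # a single group
    # the cythonized groupby cumsum should match numpy's cumsum here
    result = s.groupby(labels).cumsum()
    expected = pd.Series(np.cumsum(s.values))
    pd.testing.assert_series_equal(result, expected)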
+
+
+@pytest.mark.parametrize(
+    "op, args, targop",
+    [('cumprod', (), lambda x: x.cumprod()),
+     ('cumsum', (), lambda x: x.cumsum()),
+     ('shift', (-1, ), lambda x: x.shift(-1)),
+     ('shift', (1, ), lambda x: x.shift())])
+def test_cython_transform_series(op, args, targop):
+    # GH 4095
+    s = Series(np.random.randn(1000))
+    s_missing = s.copy()
+    s_missing.iloc[2:10] = np.nan
+    labels = np.random.randint(0, 50, size=1000).astype(float)
+
+    # series
+    for data in [s, s_missing]:
+        # print(data.head())
+        expected = data.groupby(labels).transform(targop)
+
+        tm.assert_series_equal(
+            expected,
+            data.groupby(labels).transform(op, *args))
+        tm.assert_series_equal(expected, getattr(
+            data.groupby(labels), op)(*args))
+
+
+@pytest.mark.parametrize("op", ['cumprod', 'cumsum'])
+@pytest.mark.parametrize("skipna", [False, True])
+@pytest.mark.parametrize('input, exp', [
+    # When everything is NaN
+    ({'key': ['b'] * 10, 'value': np.nan},
+     pd.Series([np.nan] * 10, name='value')),
+    # When there is a single NaN
+    ({'key': ['b'] * 10 + ['a'] * 2,
+      'value': [3] * 3 + [np.nan] + [3] * 8},
+     {('cumprod', False): [3.0, 9.0, 27.0] + [np.nan] * 7 + [3.0, 9.0],
+      ('cumprod', True): [3.0, 9.0, 27.0, np.nan, 81., 243., 729.,
+                          2187., 6561., 19683., 3.0, 9.0],
+      ('cumsum', False): [3.0, 6.0, 9.0] + [np.nan] * 7 + [3.0, 6.0],
+      ('cumsum', True): [3.0, 6.0, 9.0, np.nan, 12., 15., 18.,
+                         21., 24., 27., 3.0, 6.0]})])
+def test_groupby_cum_skipna(op, skipna, input, exp):
+    df = pd.DataFrame(input)
+    result = df.groupby('key')['value'].transform(op, skipna=skipna)
+    if isinstance(exp, dict):
+        expected = exp[(op, skipna)]
+    else:
+        expected = exp
+    expected = pd.Series(expected, name='value')
+    tm.assert_series_equal(expected, result)
+
+
+@pytest.mark.parametrize(
+    "op, args, targop",
+    [('cumprod', (), lambda x: x.cumprod()),
+     ('cumsum', (), lambda x: x.cumsum()),
+     ('shift', (-1, ), lambda x: x.shift(-1)),
+     ('shift', (1, ), lambda x: x.shift())])
+def test_cython_transform_frame(op, args, targop):
+    s = Series(np.random.randn(1000))
+    s_missing = s.copy()
+    s_missing.iloc[2:10] = np.nan
+    labels = np.random.randint(0, 50, size=1000).astype(float)
+    strings = list('qwertyuiopasdfghjklz')
+    strings_missing = strings[:]
+    strings_missing[5] = np.nan
+    df = DataFrame({'float': s,
+                    'float_missing': s_missing,
+                    'int': [1, 1, 1, 1, 2] * 200,
+                    'datetime': pd.date_range('1990-1-1', periods=1000),
+                    'timedelta': pd.timedelta_range(1, freq='s',
+                                                    periods=1000),
+                    'string': strings * 50,
+                    'string_missing': strings_missing * 50},
+                   columns=['float', 'float_missing', 'int', 'datetime',
+                            'timedelta', 'string', 'string_missing'])
+    df['cat'] = df['string'].astype('category')
+
+    df2 = df.copy()
+    df2.index = pd.MultiIndex.from_product([range(100), range(10)])
+
+    # DataFrame - Single and MultiIndex,
+    # group by values, index level, columns
+    for df in [df, df2]:
+        for gb_target in [dict(by=labels), dict(level=0), dict(by='string')
+                          ]:  # dict(by='string_missing')]:
+            # dict(by=['int','string'])]:
+
+            gb = df.groupby(**gb_target)
+            # whitelisted methods set the selection before applying
+            # bit of a hack to make sure the cythonized shift
+            # is equivalent to pre 0.17.1 behavior
+            if op == 'shift':
+                gb._set_group_selection()
+
+            if op != 'shift' and 'int' not in gb_target:
+                # numeric apply fastpath promotes dtype so have
+                # to apply separately and concat
+                i = gb[['int']].apply(targop)
+                f = gb[['float', 'float_missing']].apply(targop)
+                expected = pd.concat([f, i], axis=1)
+            else:
+                expected = gb.apply(targop)
+
+            expected = expected.sort_index(axis=1)
+            tm.assert_frame_equal(expected,
+                                  gb.transform(op, *args).sort_index(
+                                      axis=1))
+            tm.assert_frame_equal(
                 expected,
-                data.groupby(labels).transform(op, *args))
-            tm.assert_series_equal(expected, getattr(
-                data.groupby(labels), op)(*args))
-
-    @pytest.mark.parametrize("op", ['cumprod', 'cumsum'])
-    @pytest.mark.parametrize("skipna", [False, True])
-    @pytest.mark.parametrize('input, exp', [
-        # When everything is NaN
-        ({'key': ['b'] * 10, 'value': np.nan},
-         pd.Series([np.nan] * 10, name='value')),
-        # When there is a single NaN
-        ({'key': ['b'] * 10 + ['a'] * 2,
-          'value': [3] * 3 + [np.nan] + [3] * 8},
-         {('cumprod', False): [3.0, 9.0, 27.0] + [np.nan] * 7 + [3.0, 9.0],
-          ('cumprod', True): [3.0, 9.0, 27.0, np.nan, 81., 243., 729.,
-                              2187., 6561., 19683., 3.0, 9.0],
-          ('cumsum', False): [3.0, 6.0, 9.0] + [np.nan] * 7 + [3.0, 6.0],
-          ('cumsum', True): [3.0, 6.0, 9.0, np.nan, 12., 15., 18.,
-                             21., 24., 27., 3.0, 6.0]})])
-    def test_groupby_cum_skipna(self, op, skipna, input, exp):
-        df = pd.DataFrame(input)
-        result = df.groupby('key')['value'].transform(op, skipna=skipna)
-        if isinstance(exp, dict):
-            expected = exp[(op, skipna)]
-        else:
-            expected = exp
-        expected = pd.Series(expected, name='value')
-        tm.assert_series_equal(expected, result)
-
-    @pytest.mark.parametrize(
-        "op, args, targop",
-        [('cumprod', (), lambda x: x.cumprod()),
-         ('cumsum', (), lambda x: x.cumsum()),
-         ('shift', (-1, ), lambda x: x.shift(-1)),
-         ('shift', (1, ), lambda x: x.shift())])
-    def test_cython_transform_frame(self, op, args, targop):
-        s = Series(np.random.randn(1000))
-        s_missing = s.copy()
-        s_missing.iloc[2:10] = np.nan
-        labels = np.random.randint(0, 50, size=1000).astype(float)
-        strings = list('qwertyuiopasdfghjklz')
-        strings_missing = strings[:]
-        strings_missing[5] = np.nan
-        df = DataFrame({'float': s,
-                        'float_missing': s_missing,
-                        'int': [1, 1, 1, 1, 2] * 200,
-                        'datetime': pd.date_range('1990-1-1', periods=1000),
-                        'timedelta': pd.timedelta_range(1, freq='s',
-                                                        periods=1000),
-                        'string': strings * 50,
-                        'string_missing': strings_missing * 50},
-                       columns=['float', 'float_missing', 'int', 'datetime',
-                                'timedelta', 'string', 'string_missing'])
-        df['cat'] = df['string'].astype('category')
-
-        df2 = df.copy()
-        df2.index = pd.MultiIndex.from_product([range(100), range(10)])
-
-        # DataFrame - Single and MultiIndex,
-        # group by values, index level, columns
-        for df in [df, df2]:
-            for gb_target in [dict(by=labels), dict(level=0), dict(by='string')
-                              ]:  # dict(by='string_missing')]:
-                # dict(by=['int','string'])]:
-
-                gb = df.groupby(**gb_target)
-                # whitelisted methods set the selection before applying
-                # bit a of hack to make sure the cythonized shift
-                # is equivalent to pre 0.17.1 behavior
-                if op == 'shift':
-                    gb._set_group_selection()
-
-                if op != 'shift' and 'int' not in gb_target:
-                    # numeric apply fastpath promotes dtype so have
-                    # to apply separately and concat
-                    i = gb[['int']].apply(targop)
-                    f = gb[['float', 'float_missing']].apply(targop)
-                    expected = pd.concat([f, i], axis=1)
+                getattr(gb, op)(*args).sort_index(axis=1))
+            # individual columns
+            for c in df:
+                if c not in ['float', 'int', 'float_missing'
+                             ] and op != 'shift':
+                    pytest.raises(DataError, gb[c].transform, op)
+                    pytest.raises(DataError, getattr(gb[c], op))
                 else:
-                    expected = gb.apply(targop)
-
-                expected = expected.sort_index(axis=1)
-                tm.assert_frame_equal(expected,
-                                      gb.transform(op, *args).sort_index(
-                                          axis=1))
-                tm.assert_frame_equal(
-                    expected,
-                    getattr(gb, op)(*args).sort_index(axis=1))
-                # individual columns
-                for c in df:
-                    if c not in ['float', 'int', 'float_missing'
-                                 ] and op != 'shift':
-                        pytest.raises(DataError, gb[c].transform, op)
-                        pytest.raises(DataError, getattr(gb[c], op))
-                    else:
-                        expected = gb[c].apply(targop)
-                        expected.name = c
-                        tm.assert_series_equal(expected,
-                                               gb[c].transform(op, *args))
-                        tm.assert_series_equal(expected,
-                                               getattr(gb[c], op)(*args))
-
-    def test_transform_with_non_scalar_group(self):
-        # GH 10165
-        cols = pd.MultiIndex.from_tuples([
-            ('syn', 'A'), ('mis', 'A'), ('non', 'A'),
-            ('syn', 'C'), ('mis', 'C'), ('non', 'C'),
-            ('syn', 'T'), ('mis', 'T'), ('non', 'T'),
-            ('syn', 'G'), ('mis', 'G'), ('non', 'G')])
-        df = pd.DataFrame(np.random.randint(1, 10, (4, 12)),
-                          columns=cols,
-                          index=['A', 'C', 'G', 'T'])
-        tm.assert_raises_regex(ValueError, 'transform must return '
-                               'a scalar value for each '
-                               'group.*',
-                               df.groupby(axis=1, level=1).transform,
-                               lambda z: z.div(z.sum(axis=1), axis=0))
-
-    @pytest.mark.parametrize('cols,exp,comp_func', [
-        ('a', pd.Series([1, 1, 1], name='a'), tm.assert_series_equal),
-        (['a', 'c'], pd.DataFrame({'a': [1, 1, 1], 'c': [1, 1, 1]}),
-         tm.assert_frame_equal)
-    ])
-    @pytest.mark.parametrize('agg_func', [
-        'count', 'rank', 'size'])
-    def test_transform_numeric_ret(self, cols, exp, comp_func, agg_func):
-        if agg_func == 'size' and isinstance(cols, list):
-            pytest.xfail("'size' transformation not supported with "
-                         "NDFrameGroupy")
-
-        # GH 19200
-        df = pd.DataFrame(
-            {'a': pd.date_range('2018-01-01', periods=3),
-             'b': range(3),
-             'c': range(7, 10)})
-
-        result = df.groupby('b')[cols].transform(agg_func)
-
-        if agg_func == 'rank':
-            exp = exp.astype('float')
-
-        comp_func(result, exp)
-
-    @pytest.mark.parametrize("mix_groupings", [True, False])
-    @pytest.mark.parametrize("as_series", [True, False])
-    @pytest.mark.parametrize("val1,val2", [
-        ('foo', 'bar'), (1, 2), (1., 2.)])
-    @pytest.mark.parametrize("fill_method,limit,exp_vals", [
-        ("ffill", None,
-         [np.nan, np.nan, 'val1', 'val1', 'val1', 'val2', 'val2', 'val2']),
-        ("ffill", 1,
-         [np.nan, np.nan, 'val1', 'val1', np.nan, 'val2', 'val2', np.nan]),
-        ("bfill", None,
-         ['val1', 'val1', 'val1', 'val2', 'val2', 'val2', np.nan, np.nan]),
-        ("bfill", 1,
-         [np.nan, 'val1', 'val1', np.nan, 'val2', 'val2', np.nan, np.nan])
-    ])
-    def test_group_fill_methods(self, mix_groupings, as_series, val1, val2,
-                                fill_method, limit, exp_vals):
-        vals = [np.nan, np.nan, val1, np.nan, np.nan, val2, np.nan, np.nan]
-        _exp_vals = list(exp_vals)
-        # Overwrite placeholder values
-        for index, exp_val in enumerate(_exp_vals):
-            if exp_val == 'val1':
-                _exp_vals[index] = val1
-            elif exp_val == 'val2':
-                _exp_vals[index] = val2
-
-        # Need to modify values and expectations depending on the
-        # Series / DataFrame that we ultimately want to generate
-        if mix_groupings:  # ['a', 'b', 'a, 'b', ...]
-            keys = ['a', 'b'] * len(vals)
-
-            def interweave(list_obj):
-                temp = list()
-                for x in list_obj:
-                    temp.extend([x, x])
-
-                return temp
-
-            _exp_vals = interweave(_exp_vals)
-            vals = interweave(vals)
-        else:  # ['a', 'a', 'a', ... 'b', 'b', 'b']
-            keys = ['a'] * len(vals) + ['b'] * len(vals)
-            _exp_vals = _exp_vals * 2
-            vals = vals * 2
-
-        df = DataFrame({'key': keys, 'val': vals})
-        if as_series:
-            result = getattr(
-                df.groupby('key')['val'], fill_method)(limit=limit)
-            exp = Series(_exp_vals, name='val')
-            assert_series_equal(result, exp)
-        else:
-            result = getattr(df.groupby('key'), fill_method)(limit=limit)
-            exp = DataFrame({'key': keys, 'val': _exp_vals})
-            assert_frame_equal(result, exp)
-
-    @pytest.mark.parametrize("test_series", [True, False])
-    @pytest.mark.parametrize("periods,fill_method,limit", [
-        (1, 'ffill', None), (1, 'ffill', 1),
-        (1, 'bfill', None), (1, 'bfill', 1),
-        (-1, 'ffill', None), (-1, 'ffill', 1),
-        (-1, 'bfill', None), (-1, 'bfill', 1)])
-    def test_pct_change(self, test_series, periods, fill_method, limit):
-        vals = [np.nan, np.nan, 1, 2, 4, 10, np.nan, np.nan]
-        exp_vals = Series(vals).pct_change(periods=periods,
-                                           fill_method=fill_method,
-                                           limit=limit).tolist()
-
-        df = DataFrame({'key': ['a'] * len(vals) + ['b'] * len(vals),
-                        'vals': vals * 2})
-        grp = df.groupby('key')
-
-        def get_result(grp_obj):
-            return grp_obj.pct_change(periods=periods,
-                                      fill_method=fill_method,
-                                      limit=limit)
-
-        if test_series:
-            exp = pd.Series(exp_vals * 2)
-            exp.name = 'vals'
-            grp = grp['vals']
-            result = get_result(grp)
-            tm.assert_series_equal(result, exp)
-        else:
-            exp = DataFrame({'vals': exp_vals * 2})
-            result = get_result(grp)
-            tm.assert_frame_equal(result, exp)
-
-    @pytest.mark.parametrize("func", [np.any, np.all])
-    def test_any_all_np_func(self, func):
-        # GH 20653
-        df = pd.DataFrame([['foo', True],
-                           [np.nan, True],
-                           ['foo', True]], columns=['key', 'val'])
-
-        exp = pd.Series([True, np.nan, True], name='val')
-
-        res = df.groupby('key')['val'].transform(func)
-        tm.assert_series_equal(res, exp)
+                    expected = gb[c].apply(targop)
+                    expected.name = c
+                    tm.assert_series_equal(expected,
+                                           gb[c].transform(op, *args))
+                    tm.assert_series_equal(expected,
+                                           getattr(gb[c], op)(*args))
+
+
+def test_transform_with_non_scalar_group():
+    # GH 10165
+    cols = pd.MultiIndex.from_tuples([
+        ('syn', 'A'), ('mis', 'A'), ('non', 'A'),
+        ('syn', 'C'), ('mis', 'C'), ('non', 'C'),
+        ('syn', 'T'), ('mis', 'T'), ('non', 'T'),
+        ('syn', 'G'), ('mis', 'G'), ('non', 'G')])
+    df = pd.DataFrame(np.random.randint(1, 10, (4, 12)),
+                      columns=cols,
+                      index=['A', 'C', 'G', 'T'])
+    tm.assert_raises_regex(ValueError, 'transform must return '
+                           'a scalar value for each '
+                           'group.*',
+                           df.groupby(axis=1, level=1).transform,
+                           lambda z: z.div(z.sum(axis=1), axis=0))
+
+
+@pytest.mark.parametrize('cols,exp,comp_func', [
+    ('a', pd.Series([1, 1, 1], name='a'), tm.assert_series_equal),
+    (['a', 'c'], pd.DataFrame({'a': [1, 1, 1], 'c': [1, 1, 1]}),
+     tm.assert_frame_equal)
+])
+@pytest.mark.parametrize('agg_func', [
+    'count', 'rank', 'size'])
+def test_transform_numeric_ret(cols, exp, comp_func, agg_func):
+    if agg_func == 'size' and isinstance(cols, list):
+        pytest.xfail("'size' transformation not supported with "
+                     "NDFrameGroupBy")
+
+    # GH 19200
+    df = pd.DataFrame(
+        {'a': pd.date_range('2018-01-01', periods=3),
+         'b': range(3),
+         'c': range(7, 10)})
+
+    result = df.groupby('b')[cols].transform(agg_func)
+
+    if agg_func == 'rank':
+        exp = exp.astype('float')
+
+    comp_func(result, exp)
+@pytest.mark.parametrize("fill_method,limit,exp_vals", [ + ("ffill", None, + [np.nan, np.nan, 'val1', 'val1', 'val1', 'val2', 'val2', 'val2']), + ("ffill", 1, + [np.nan, np.nan, 'val1', 'val1', np.nan, 'val2', 'val2', np.nan]), + ("bfill", None, + ['val1', 'val1', 'val1', 'val2', 'val2', 'val2', np.nan, np.nan]), + ("bfill", 1, + [np.nan, 'val1', 'val1', np.nan, 'val2', 'val2', np.nan, np.nan]) +]) +def test_group_fill_methods(mix_groupings, as_series, val1, val2, + fill_method, limit, exp_vals): + vals = [np.nan, np.nan, val1, np.nan, np.nan, val2, np.nan, np.nan] + _exp_vals = list(exp_vals) + # Overwrite placeholder values + for index, exp_val in enumerate(_exp_vals): + if exp_val == 'val1': + _exp_vals[index] = val1 + elif exp_val == 'val2': + _exp_vals[index] = val2 + + # Need to modify values and expectations depending on the + # Series / DataFrame that we ultimately want to generate + if mix_groupings: # ['a', 'b', 'a, 'b', ...] + keys = ['a', 'b'] * len(vals) + + def interweave(list_obj): + temp = list() + for x in list_obj: + temp.extend([x, x]) + + return temp + + _exp_vals = interweave(_exp_vals) + vals = interweave(vals) + else: # ['a', 'a', 'a', ... 'b', 'b', 'b'] + keys = ['a'] * len(vals) + ['b'] * len(vals) + _exp_vals = _exp_vals * 2 + vals = vals * 2 + + df = DataFrame({'key': keys, 'val': vals}) + if as_series: + result = getattr( + df.groupby('key')['val'], fill_method)(limit=limit) + exp = Series(_exp_vals, name='val') + assert_series_equal(result, exp) + else: + result = getattr(df.groupby('key'), fill_method)(limit=limit) + exp = DataFrame({'key': keys, 'val': _exp_vals}) + assert_frame_equal(result, exp) + + +@pytest.mark.parametrize("test_series", [True, False]) +@pytest.mark.parametrize("periods,fill_method,limit", [ + (1, 'ffill', None), (1, 'ffill', 1), + (1, 'bfill', None), (1, 'bfill', 1), + (-1, 'ffill', None), (-1, 'ffill', 1), + (-1, 'bfill', None), (-1, 'bfill', 1)]) +def test_pct_change(test_series, periods, fill_method, limit): + vals = [np.nan, np.nan, 1, 2, 4, 10, np.nan, np.nan] + exp_vals = Series(vals).pct_change(periods=periods, + fill_method=fill_method, + limit=limit).tolist() + + df = DataFrame({'key': ['a'] * len(vals) + ['b'] * len(vals), + 'vals': vals * 2}) + grp = df.groupby('key') + + def get_result(grp_obj): + return grp_obj.pct_change(periods=periods, + fill_method=fill_method, + limit=limit) + + if test_series: + exp = pd.Series(exp_vals * 2) + exp.name = 'vals' + grp = grp['vals'] + result = get_result(grp) + tm.assert_series_equal(result, exp) + else: + exp = DataFrame({'vals': exp_vals * 2}) + result = get_result(grp) + tm.assert_frame_equal(result, exp) + + +@pytest.mark.parametrize("func", [np.any, np.all]) +def test_any_all_np_func(func): + # GH 20653 + df = pd.DataFrame([['foo', True], + [np.nan, True], + ['foo', True]], columns=['key', 'val']) + + exp = pd.Series([True, np.nan, True], name='val') + + res = df.groupby('key')['val'].transform(func) + tm.assert_series_equal(res, exp)