REF: Add tests.groupby.methods (#55312)
* REF: Add tests.groupby.methods

* Merge cleanup

* Refactor

* Refactor

* Show value of ymin

* fixup

* Revert

* Revert
rhshadrach authored Oct 12, 2023
1 parent ae177e8 commit 9de2a19
Showing 19 changed files with 1,567 additions and 1,566 deletions.
Empty file.
24 changes: 24 additions & 0 deletions pandas/tests/groupby/methods/test_corrwith.py
@@ -0,0 +1,24 @@
import numpy as np

from pandas import (
    DataFrame,
    Index,
    Series,
)
import pandas._testing as tm


def test_corrwith_with_1_axis():
    # GH 47723
    df = DataFrame({"a": [1, 1, 2], "b": [3, 7, 4]})
    gb = df.groupby("a")

    msg = "DataFrameGroupBy.corrwith with axis=1 is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        result = gb.corrwith(df, axis=1)
    index = Index(
        data=[(1, 0), (1, 1), (1, 2), (2, 2), (2, 0), (2, 1)],
        name=("a", None),
    )
    expected = Series([np.nan] * 6, index=index)
    tm.assert_series_equal(result, expected)
221 changes: 221 additions & 0 deletions pandas/tests/groupby/methods/test_describe.py
@@ -0,0 +1,221 @@
import numpy as np
import pytest

import pandas as pd
from pandas import (
    DataFrame,
    Index,
    MultiIndex,
    Timestamp,
)
import pandas._testing as tm


def test_apply_describe_bug(mframe):
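    # Smoke test: describe() on a frame grouped by its first index level should not raise.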
    grouped = mframe.groupby(level="first")
    grouped.describe()  # it works!


def test_series_describe_multikey():
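    # Each describe() column should match the corresponding direct aggregation.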
    ts = tm.makeTimeSeries()
    grouped = ts.groupby([lambda x: x.year, lambda x: x.month])
    result = grouped.describe()
    tm.assert_series_equal(result["mean"], grouped.mean(), check_names=False)
    tm.assert_series_equal(result["std"], grouped.std(), check_names=False)
    tm.assert_series_equal(result["min"], grouped.min(), check_names=False)


def test_series_describe_single():
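    # apply(describe) per group should match the stacked grouped describe().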
    ts = tm.makeTimeSeries()
    grouped = ts.groupby(lambda x: x.month)
    result = grouped.apply(lambda x: x.describe())
    expected = grouped.describe().stack(future_stack=True)
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("keys", ["key1", ["key1", "key2"]])
def test_series_describe_as_index(as_index, keys):
    # GH#49256
    df = DataFrame(
        {
            "key1": ["one", "two", "two", "three", "two"],
            "key2": ["one", "two", "two", "three", "two"],
            "foo2": [1, 2, 4, 4, 6],
        }
    )
    gb = df.groupby(keys, as_index=as_index)["foo2"]
    result = gb.describe()
    expected = DataFrame(
        {
            "key1": ["one", "three", "two"],
            "count": [1.0, 1.0, 3.0],
            "mean": [1.0, 4.0, 4.0],
            "std": [np.nan, np.nan, 2.0],
            "min": [1.0, 4.0, 2.0],
            "25%": [1.0, 4.0, 3.0],
            "50%": [1.0, 4.0, 4.0],
            "75%": [1.0, 4.0, 5.0],
            "max": [1.0, 4.0, 6.0],
        }
    )
    if len(keys) == 2:
        expected.insert(1, "key2", expected["key1"])
    if as_index:
        expected = expected.set_index(keys)
    tm.assert_frame_equal(result, expected)


def test_frame_describe_multikey(tsframe):
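    # Concatenating per-column describe() results should reproduce the frame-level result.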
    grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month])
    result = grouped.describe()
    desc_groups = []
    for col in tsframe:
        group = grouped[col].describe()
        # GH 17464 - Remove duplicate MultiIndex levels
        group_col = MultiIndex(
            levels=[[col], group.columns],
            codes=[[0] * len(group.columns), range(len(group.columns))],
        )
        group = DataFrame(group.values, columns=group_col, index=group.index)
        desc_groups.append(group)
    expected = pd.concat(desc_groups, axis=1)
    tm.assert_frame_equal(result, expected)

    msg = "DataFrame.groupby with axis=1 is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        groupedT = tsframe.groupby({"A": 0, "B": 0, "C": 1, "D": 1}, axis=1)
    result = groupedT.describe()
    expected = tsframe.describe().T
    # reverting the change from https://github.com/pandas-dev/pandas/pull/35441/
    expected.index = MultiIndex(
        levels=[[0, 1], expected.index],
        codes=[[0, 0, 1, 1], range(len(expected.index))],
    )
    tm.assert_frame_equal(result, expected)


def test_frame_describe_tupleindex():
    # GH 14848 - regression from 0.19.0 to 0.19.1
    df1 = DataFrame(
        {
            "x": [1, 2, 3, 4, 5] * 3,
            "y": [10, 20, 30, 40, 50] * 3,
            "z": [100, 200, 300, 400, 500] * 3,
        }
    )
    df1["k"] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5
    df2 = df1.rename(columns={"k": "key"})
    msg = "Names should be list-like for a MultiIndex"
    with pytest.raises(ValueError, match=msg):
        df1.groupby("k").describe()
    with pytest.raises(ValueError, match=msg):
        df2.groupby("key").describe()


def test_frame_describe_unstacked_format():
    # GH 4792
    prices = {
        Timestamp("2011-01-06 10:59:05", tz=None): 24990,
        Timestamp("2011-01-06 12:43:33", tz=None): 25499,
        Timestamp("2011-01-06 12:54:09", tz=None): 25499,
    }
    volumes = {
        Timestamp("2011-01-06 10:59:05", tz=None): 1500000000,
        Timestamp("2011-01-06 12:43:33", tz=None): 5000000000,
        Timestamp("2011-01-06 12:54:09", tz=None): 100000000,
    }
    df = DataFrame({"PRICE": prices, "VOLUME": volumes})
    result = df.groupby("PRICE").VOLUME.describe()
    data = [
        df[df.PRICE == 24990].VOLUME.describe().values.tolist(),
        df[df.PRICE == 25499].VOLUME.describe().values.tolist(),
    ]
    expected = DataFrame(
        data,
        index=Index([24990, 25499], name="PRICE"),
        columns=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
    )
    tm.assert_frame_equal(result, expected)


@pytest.mark.filterwarnings(
"ignore:"
"indexing past lexsort depth may impact performance:"
"pandas.errors.PerformanceWarning"
)
@pytest.mark.parametrize("as_index", [True, False])
@pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]])
def test_describe_with_duplicate_output_column_names(as_index, keys):
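    # The frame deliberately carries two columns labeled "b"; describe() must keep both.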
    # GH 35314
    df = DataFrame(
        {
            "a1": [99, 99, 99, 88, 88, 88],
            "a2": [99, 99, 99, 88, 88, 88],
            "b": [1, 2, 3, 4, 5, 6],
            "c": [10, 20, 30, 40, 50, 60],
        },
        columns=["a1", "a2", "b", "b"],
        copy=False,
    )
    if keys == ["a1"]:
        df = df.drop(columns="a2")

    expected = (
        DataFrame.from_records(
            [
                ("b", "count", 3.0, 3.0),
                ("b", "mean", 5.0, 2.0),
                ("b", "std", 1.0, 1.0),
                ("b", "min", 4.0, 1.0),
                ("b", "25%", 4.5, 1.5),
                ("b", "50%", 5.0, 2.0),
                ("b", "75%", 5.5, 2.5),
                ("b", "max", 6.0, 3.0),
                ("b", "count", 3.0, 3.0),
                ("b", "mean", 5.0, 2.0),
                ("b", "std", 1.0, 1.0),
                ("b", "min", 4.0, 1.0),
                ("b", "25%", 4.5, 1.5),
                ("b", "50%", 5.0, 2.0),
                ("b", "75%", 5.5, 2.5),
                ("b", "max", 6.0, 3.0),
            ],
        )
        .set_index([0, 1])
        .T
    )
    expected.columns.names = [None, None]
    if len(keys) == 2:
        expected.index = MultiIndex(
            levels=[[88, 99], [88, 99]], codes=[[0, 1], [0, 1]], names=["a1", "a2"]
        )
    else:
        expected.index = Index([88, 99], name="a1")

    if not as_index:
        expected = expected.reset_index()

    result = df.groupby(keys, as_index=as_index).describe()

    tm.assert_frame_equal(result, expected)


def test_describe_duplicate_columns():
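    # The frame has a duplicate column label 0; the describe() output should keep both 0-columns.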
    # GH#50806
    df = DataFrame([[0, 1, 2, 3]])
    df.columns = [0, 1, 2, 0]
    gb = df.groupby(df[1])
    result = gb.describe(percentiles=[])

    columns = ["count", "mean", "std", "min", "50%", "max"]
    frames = [
        DataFrame([[1.0, val, np.nan, val, val, val]], index=[1], columns=columns)
        for val in (0.0, 2.0, 3.0)
    ]
    expected = pd.concat(frames, axis=1)
    expected.columns = MultiIndex(
        levels=[[0, 2], columns],
        codes=[6 * [0] + 6 * [1] + 6 * [0], 3 * list(range(6))],
    )
    expected.index.names = [1]
    tm.assert_frame_equal(result, expected)
78 changes: 78 additions & 0 deletions pandas/tests/groupby/methods/test_is_monotonic.py
@@ -0,0 +1,78 @@
import numpy as np
import pytest

from pandas import (
    DataFrame,
    Index,
    Series,
)
import pandas._testing as tm


@pytest.mark.parametrize(
"in_vals, out_vals",
[
# Basics: strictly increasing (T), strictly decreasing (F),
# abs val increasing (F), non-strictly increasing (T)
([1, 2, 5, 3, 2, 0, 4, 5, -6, 1, 1], [True, False, False, True]),
# Test with inf vals
(
[1, 2.1, np.inf, 3, 2, np.inf, -np.inf, 5, 11, 1, -np.inf],
[True, False, True, False],
),
# Test with nan vals; should always be False
(
[1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan],
[False, False, False, False],
),
],
)
def test_is_monotonic_increasing(in_vals, out_vals):
    # GH 17015
    source_dict = {
        "A": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"],
        "B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"],
        "C": in_vals,
    }
    df = DataFrame(source_dict)
    result = df.groupby("B").C.is_monotonic_increasing
    index = Index(list("abcd"), name="B")
    expected = Series(index=index, data=out_vals, name="C")
    tm.assert_series_equal(result, expected)

    # Also check result equal to manually taking x.is_monotonic_increasing.
    expected = df.groupby(["B"]).C.apply(lambda x: x.is_monotonic_increasing)
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
"in_vals, out_vals",
[
# Basics: strictly decreasing (T), strictly increasing (F),
# abs val decreasing (F), non-strictly increasing (T)
([10, 9, 7, 3, 4, 5, -3, 2, 0, 1, 1], [True, False, False, True]),
# Test with inf vals
(
[np.inf, 1, -np.inf, np.inf, 2, -3, -np.inf, 5, -3, -np.inf, -np.inf],
[True, True, False, True],
),
# Test with nan vals; should always be False
(
[1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan],
[False, False, False, False],
),
],
)
def test_is_monotonic_decreasing(in_vals, out_vals):
    # GH 17015
    source_dict = {
        "A": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"],
        "B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"],
        "C": in_vals,
    }

    df = DataFrame(source_dict)
    result = df.groupby("B").C.is_monotonic_decreasing
    index = Index(list("abcd"), name="B")
    expected = Series(index=index, data=out_vals, name="C")
    tm.assert_series_equal(result, expected)