From 5511af08e9bd8b0610d3c68d98a9c781460f6e4e Mon Sep 17 00:00:00 2001 From: richard Date: Mon, 30 May 2022 23:32:31 -0400 Subject: [PATCH 1/2] DOC: Add to docs on group_keys in groupby.apply --- pandas/core/groupby/groupby.py | 61 ++++++++++++++++++++++++++++------ pandas/core/shared_docs.py | 8 +++-- 2 files changed, 56 insertions(+), 13 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index ffbee0bf21a66..5ce0657ca7d47 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -188,21 +188,33 @@ class providing the base-class of operations. >>> df = pd.DataFrame({'A': 'a a b'.split(), ... 'B': [1,2,3], ... 'C': [4,6,5]}) - >>> g = df.groupby('A') + >>> g1 = df.groupby('A', group_keys=False) + >>> g2 = df.groupby('A', group_keys=True) - Notice that ``g`` has two groups, ``a`` and ``b``. - Calling `apply` in various ways, we can get different grouping results: + Notice that ``g1`` and ``g2`` have two groups, ``a`` and ``b``, and only + differ in their ``group_keys`` argument. Calling `apply` in various ways, + we can get different grouping results: Example 1: below the function passed to `apply` takes a DataFrame as its argument and returns a DataFrame. `apply` combines the result for each group together into a new DataFrame: - >>> g[['B', 'C']].apply(lambda x: x / x.sum()) + >>> g1[['B', 'C']].apply(lambda x: x / x.sum()) B C 0 0.333333 0.4 1 0.666667 0.6 2 1.000000 1.0 + In the above, the groups are not part of the index. We can have them included + by using ``g2`` where ``group_keys=True``: + + >>> g2[['B', 'C']].apply(lambda x: x / x.sum()) + B C + A + a 0 0.333333 0.4 + 1 0.666667 0.6 + b 2 1.000000 1.0 + Example 2: The function passed to `apply` takes a DataFrame as its argument and returns a Series. `apply` combines the result for each group together into a new DataFrame. @@ -211,28 +223,40 @@ class providing the base-class of operations. 
The resulting dtype will reflect the return value of the passed ``func``. - >>> g[['B', 'C']].apply(lambda x: x.astype(float).max() - x.min()) + >>> g1[['B', 'C']].apply(lambda x: x.astype(float).max() - x.min()) + B C + A + a 1.0 2.0 + b 0.0 0.0 + + >>> g2[['B', 'C']].apply(lambda x: x.astype(float).max() - x.min()) B C A a 1.0 2.0 b 0.0 0.0 + The ``group_keys`` argument has no effect here because the result is not + like-indexed when compared to the input. + Example 3: The function passed to `apply` takes a DataFrame as its argument and returns a scalar. `apply` combines the result for each group together into a Series, including setting the index as appropriate: - >>> g.apply(lambda x: x.C.max() - x.B.min()) + >>> g1.apply(lambda x: x.C.max() - x.B.min()) A a 5 b 2 dtype: int64""", "series_examples": """ >>> s = pd.Series([0, 1, 2], index='a a b'.split()) - >>> g = s.groupby(s.index) + >>> g1 = s.groupby(s.index, group_keys=False) + >>> g2 = s.groupby(s.index, group_keys=True) From ``s`` above we can see that ``g`` has two groups, ``a`` and ``b``. - Calling `apply` in various ways, we can get different grouping results: + Notice that ``g1`` and ``g2`` have two groups, ``a`` and ``b``, and only + differ in their ``group_keys`` argument. Calling `apply` in various ways, + we can get different grouping results: Example 1: The function passed to `apply` takes a Series as its argument and returns a Series. `apply` combines the result for @@ -242,18 +266,35 @@ class providing the base-class of operations. The resulting dtype will reflect the return value of the passed ``func``. - >>> g.apply(lambda x: x*2 if x.name == 'a' else x/2) + >>> g1.apply(lambda x: x*2 if x.name == 'a' else x/2) a 0.0 a 2.0 b 1.0 dtype: float64 + In the above, the groups are not part of the index. 
We can have them included + by using ``g2`` where ``group_keys=True``: + + >>> g2.apply(lambda x: x*2 if x.name == 'a' else x/2) + a a 0.0 + a 2.0 + b b 1.0 + dtype: float64 + Example 2: The function passed to `apply` takes a Series as its argument and returns a scalar. `apply` combines the result for each group together into a Series, including setting the index as appropriate: - >>> g.apply(lambda x: x.max() - x.min()) + >>> g1.apply(lambda x: x.max() - x.min()) + a 1 + b 0 + dtype: int64 + + The ``group_keys`` argument has no effect here because the result is not + like-indexed when compared to the input. + + >>> g2.apply(lambda x: x.max() - x.min()) a 1 b 0 dtype: int64""", diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index 3750a8a3ceed9..bc9c7764d094a 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -115,9 +115,11 @@ Note this does not influence the order of observations within each group. Groupby preserves the order of rows within each group. group_keys : bool, optional - When calling apply, add group keys to index to identify pieces. - By default group keys are not included when the result's index - (and column) labels match the inputs, and are included otherwise. + When calling apply and the ``by`` argument produces a like-indexed (transformed) + result, add group keys to index to identify pieces. By default group keys are not + included when the result's index (and column) labels match the inputs, and + are included otherwise. This argument has no effect if the result produced + is not like-indexed with respect to the input. .. 
versionchanged:: 1.5.0 From ea190c86c6636e1550eeccd04c6c9922c1ee9110 Mon Sep 17 00:00:00 2001 From: richard Date: Wed, 1 Jun 2022 21:35:11 -0400 Subject: [PATCH 2/2] Add link to user guide --- pandas/core/groupby/groupby.py | 6 ++++-- pandas/core/shared_docs.py | 7 ++++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 5ce0657ca7d47..af2a5579bf1cd 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -236,7 +236,8 @@ class providing the base-class of operations. b 0.0 0.0 The ``group_keys`` argument has no effect here because the result is not - like-indexed when compared to the input. + like-indexed (i.e. :ref:`a transform <groupby.transform>`) when compared + to the input. Example 3: The function passed to `apply` takes a DataFrame as its argument and returns a scalar. `apply` combines the result for @@ -292,7 +293,8 @@ class providing the base-class of operations. dtype: int64 The ``group_keys`` argument has no effect here because the result is not - like-indexed when compared to the input. + like-indexed (i.e. :ref:`a transform <groupby.transform>`) when compared + to the input. >>> g2.apply(lambda x: x.max() - x.min()) a 1 diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index bc9c7764d094a..3a8a95865d10e 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -115,9 +115,10 @@ Note this does not influence the order of observations within each group. Groupby preserves the order of rows within each group. group_keys : bool, optional - When calling apply and the ``by`` argument produces a like-indexed (transformed) - result, add group keys to index to identify pieces. By default group keys are not - included when the result's index (and column) labels match the inputs, and + When calling apply and the ``by`` argument produces a like-indexed + (i.e. :ref:`a transform <groupby.transform>`) result, add group keys to + index to identify pieces. 
By default group keys are not included + when the result's index (and column) labels match the inputs, and are included otherwise. This argument has no effect if the result produced is not like-indexed with respect to the input.