def count_nested(df, nested, by=None, join=True) -> NestedFrame:
    """Count the rows held in each entry of a nested column.

    Parameters
    ----------
    df: NestedFrame
        A NestedFrame that contains the desired `nested` series
        to count.
    nested: str
        The label of the nested series to count. It selects the column
        in `df` and is also used to build the output column names.
    by: str, optional
        A column within `nested` to count by. When given, one count
        column is produced for each unique value found in `by`, named
        ``n_{nested}_{value}``. When omitted, a single ``n_{nested}``
        column holds the total row count.
    join: bool, optional
        If True (default), join the count columns to `df` and return the
        result; otherwise return only the count columns.

    Returns
    -------
    NestedFrame
        `df` with the count columns joined when `join` is True, otherwise
        a frame containing only the count columns.
    """
    # Select the column by the caller-supplied label. Indexing with the
    # literal "nested" only works when the column carries that exact name.
    nested_series = df[nested]

    if by is None:
        counts = nested_series.apply(len).rename(f"n_{nested}")
    else:
        # Each row yields a value_counts() Series; apply expands these into
        # a frame with one column per distinct value of `by`.
        counts = nested_series.apply(lambda x: x[by].value_counts())
        counts = counts.rename(columns={colname: f"n_{nested}_{colname}" for colname in counts.columns})

    if join:
        return df.join(counts)

    # Not joining: hand back only the counts, wrapped as a frame.
    if isinstance(counts, pd.Series):  # by=None produces a Series
        counts = NestedFrame(counts.to_frame())
    return counts
@pytest.mark.parametrize("join", [True, False])
def test_count_nested(join):
    """Check count_nested totals and per-label counts, joined and standalone."""

    # Three base rows; the nested table has three rows per base index value.
    # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
    base = NestedFrame(data={"a": [1, 2, 3], "b": [2, np.nan, 6]}, index=[0, 1, 2])
    nested = pd.DataFrame(
        data={
            "c": [0, 2, 4, 1, np.nan, 3, 1, 4, 1],
            "d": [5, 4, 7, 5, 3, 1, 9, 3, 4],
            "label": ["a", "a", "b", "b", "a", "a", "b", "a", "b"],
        },
        index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
    )
    base = base.add_nested(nested, "nested")

    # Total count: every base row is expected to report three nested rows
    total_counts = count_nested(base, "nested", join=join)
    assert all(total_counts["n_nested"].values == 3)

    # Count split by the values of the "label" column
    label_counts = count_nested(base, "nested", by="label", join=join)
    assert all(label_counts["n_nested_a"].values == [2, 2, 1])
    assert all(label_counts["n_nested_b"].values == [1, 1, 2])

    # join=True appends the count columns to the base columns;
    # join=False returns only the count columns
    if join:
        assert total_counts.columns.tolist() == base.columns.tolist() + ["n_nested"]
        assert label_counts.columns.tolist() == base.columns.tolist() + ["n_nested_a", "n_nested_b"]
    else:
        assert total_counts.columns.tolist() == ["n_nested"]
        assert label_counts.columns.tolist() == ["n_nested_a", "n_nested_b"]