From b56786632ee706d5e9cc90a1adf03822c785a30b Mon Sep 17 00:00:00 2001 From: Doug Branton Date: Tue, 13 Aug 2024 13:53:56 -0700 Subject: [PATCH 1/5] add pack_lists class function --- src/nested_pandas/nestedframe/core.py | 63 +++++++++++++++++++ .../nestedframe/test_nestedframe.py | 33 ++++++++++ 2 files changed, 96 insertions(+) diff --git a/src/nested_pandas/nestedframe/core.py b/src/nested_pandas/nestedframe/core.py index a7ec7dc..768fbeb 100644 --- a/src/nested_pandas/nestedframe/core.py +++ b/src/nested_pandas/nestedframe/core.py @@ -214,6 +214,69 @@ def from_flat(cls, df, base_columns, nested_columns=None, index=None, name="nest nested_columns = [col for col in df.columns if col not in base_columns] return out_df.add_nested(df[nested_columns], name=name) + @classmethod + def from_lists(cls, df, base_columns=None, list_columns=None, name="nested"): + """Creates a NestedFrame with base and nested columns from a + dataframe with list-valued columns. + + Parameters + ---------- + df: pd.DataFrame or NestedFrame + A dataframe with list columns. + base_columns: list-like, or None + Any columns that have non-list values in the input df. These will + simply be kept as identical columns in the result. + list_columns: list-like, or None + The list-value columns that should be packed into a nested column. + All columns in the list will attempt to be packed into a single + nested column with the name provided in `name`. If None, it is + defined as all columns not in `base_columns`. + name: str + The name of the output column the `list_columns` are packed into. + + Returns + ------- + NestedFrame + A NestedFrame with the specified nesting structure. + + Examples + -------- + + >>> nf = NestedFrame({"c":[1,2,3], "d":[2,4,6], + ... "e":[[1,2,3], [4,5,6], [7,8,9]]}, + ... 
index=[0,1,2]) + + + >>> NestedFrame.from_lists(nf, base_columns=["c","d"]) + """ + + # Resolve base and list columns + if base_columns is None: + if list_columns is None: + # with no inputs, assume all columns are list-valued + list_columns = df.columns + else: + # if list_columns are defined, assume everything else is base + base_columns = [col for col in df.columns if col not in list_columns] + else: + if list_columns is None: + # with defined base_columns, assume everything else is list + list_columns = [col for col in df.columns if col not in base_columns] + + if len(list_columns) == 0: + raise ValueError("No columns were assigned as list columns.") + + # Pack list columns into a nested column + packed_df = packer.pack_lists(df[list_columns]) + packed_df.name = name + + # join the nested column to the base_column df + if base_columns is not None: + return df[base_columns].join(packed_df) + # or just return the packed_df as a nestedframe if no base cols + else: + return packed_df.to_frame() + def _split_query(self, expr) -> dict: """Splits a pandas query into multiple subqueries for nested and base layers""" # Ensure query has needed spacing for upcoming split diff --git a/tests/nested_pandas/nestedframe/test_nestedframe.py b/tests/nested_pandas/nestedframe/test_nestedframe.py index 3dbf34b..ef40179 100644 --- a/tests/nested_pandas/nestedframe/test_nestedframe.py +++ b/tests/nested_pandas/nestedframe/test_nestedframe.py @@ -320,6 +320,39 @@ def test_recover_from_flat(): assert nf2.equals(nf) +def test_from_lists(): + """Test NestedFrame.from_lists behavior""" + nf = NestedFrame( + {"c": [1, 2, 3], "d": [2, 4, 6], "e": [[1, 2, 3], [4, 5, 6], [7, 8, 9]]}, index=[0, 1, 2] + ) + + # Test a few combinations + res = NestedFrame.from_lists(nf, base_columns=["c", "d"], name="nested_e") + assert list(res.columns) == ["c", "d", "nested_e"] + assert list(res.nested_columns) == ["nested_e"] + + res = NestedFrame.from_lists(nf, base_columns=["c", "d"], list_columns=["e"]) + 
assert list(res.columns) == ["c", "d", "nested"] + assert list(res.nested_columns) == ["nested"] + + res = NestedFrame.from_lists(nf, list_columns=["e"]) + assert list(res.columns) == ["c", "d", "nested"] + assert list(res.nested_columns) == ["nested"] + + # Check for the no list columns error + with pytest.raises(ValueError): + res = NestedFrame.from_lists(nf, base_columns=["c", "d", "e"]) + + # Check for subsetting + res = NestedFrame.from_lists(nf, base_columns=["c"], list_columns=["e"]) + assert list(res.columns) == ["c", "nested"] + assert list(res.nested_columns) == ["nested"] + + res = NestedFrame.from_lists(nf, base_columns=[], list_columns=["e"]) + assert list(res.columns) == ["nested"] + assert list(res.nested_columns) == ["nested"] + + def test_query(): """Test that NestedFrame.query handles nested queries correctly""" From 862cae9b4673ba91ebe463bca93d08901a8fa46d Mon Sep 17 00:00:00 2001 From: Doug Branton Date: Tue, 13 Aug 2024 14:04:17 -0700 Subject: [PATCH 2/5] add extra test --- tests/nested_pandas/nestedframe/test_nestedframe.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/nested_pandas/nestedframe/test_nestedframe.py b/tests/nested_pandas/nestedframe/test_nestedframe.py index ef40179..e00b215 100644 --- a/tests/nested_pandas/nestedframe/test_nestedframe.py +++ b/tests/nested_pandas/nestedframe/test_nestedframe.py @@ -352,6 +352,10 @@ def test_from_lists(): assert list(res.columns) == ["nested"] assert list(res.nested_columns) == ["nested"] + res = NestedFrame.from_lists(nf, base_columns=None, list_columns=["e"]) + assert list(res.columns) == ["nested"] + assert list(res.nested_columns) == ["nested"] + def test_query(): """Test that NestedFrame.query handles nested queries correctly""" From 5eef6a5c9d624f95eff6716573208cae49eb8d31 Mon Sep 17 00:00:00 2001 From: Doug Branton Date: Tue, 13 Aug 2024 14:07:30 -0700 Subject: [PATCH 3/5] fix added test --- tests/nested_pandas/nestedframe/test_nestedframe.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/tests/nested_pandas/nestedframe/test_nestedframe.py b/tests/nested_pandas/nestedframe/test_nestedframe.py index e00b215..a6a1a36 100644 --- a/tests/nested_pandas/nestedframe/test_nestedframe.py +++ b/tests/nested_pandas/nestedframe/test_nestedframe.py @@ -352,7 +352,7 @@ def test_from_lists(): assert list(res.columns) == ["nested"] assert list(res.nested_columns) == ["nested"] - res = NestedFrame.from_lists(nf, base_columns=None, list_columns=["e"]) + res = NestedFrame.from_lists(nf[["e"]], base_columns=None, list_columns=["e"]) assert list(res.columns) == ["nested"] assert list(res.nested_columns) == ["nested"] From 30e5859c505483144a76dfe338082e0fee0ffe38 Mon Sep 17 00:00:00 2001 From: Doug Branton Date: Tue, 13 Aug 2024 14:57:31 -0700 Subject: [PATCH 4/5] actually fix test --- src/nested_pandas/nestedframe/core.py | 2 +- tests/nested_pandas/nestedframe/test_nestedframe.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nested_pandas/nestedframe/core.py b/src/nested_pandas/nestedframe/core.py index 768fbeb..cb743fc 100644 --- a/src/nested_pandas/nestedframe/core.py +++ b/src/nested_pandas/nestedframe/core.py @@ -275,7 +275,7 @@ def from_lists(cls, df, base_columns=None, list_columns=None, name="nested"): return df[base_columns].join(packed_df) # or just return the packed_df as a nestedframe if no base cols else: - return packed_df.to_frame() + return NestedFrame(packed_df.to_frame()) def _split_query(self, expr) -> dict: """Splits a pandas query into multiple subqueries for nested and base layers""" diff --git a/tests/nested_pandas/nestedframe/test_nestedframe.py b/tests/nested_pandas/nestedframe/test_nestedframe.py index a6a1a36..6cf44bf 100644 --- a/tests/nested_pandas/nestedframe/test_nestedframe.py +++ b/tests/nested_pandas/nestedframe/test_nestedframe.py @@ -352,7 +352,7 @@ def test_from_lists(): assert list(res.columns) == ["nested"] assert list(res.nested_columns) == ["nested"] - res = 
NestedFrame.from_lists(nf[["e"]], base_columns=None, list_columns=["e"]) + res = NestedFrame.from_lists(nf[["e"]], base_columns=None, list_columns=None) assert list(res.columns) == ["nested"] assert list(res.nested_columns) == ["nested"] From b087283a84f9890e6fdd5bcd7917047f64338979 Mon Sep 17 00:00:00 2001 From: Doug Branton Date: Mon, 19 Aug 2024 11:30:14 -0700 Subject: [PATCH 5/5] add tests --- .../nestedframe/test_nestedframe.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/nested_pandas/nestedframe/test_nestedframe.py b/tests/nested_pandas/nestedframe/test_nestedframe.py index 6cf44bf..7c4d2fc 100644 --- a/tests/nested_pandas/nestedframe/test_nestedframe.py +++ b/tests/nested_pandas/nestedframe/test_nestedframe.py @@ -343,6 +343,22 @@ def test_from_lists(): with pytest.raises(ValueError): res = NestedFrame.from_lists(nf, base_columns=["c", "d", "e"]) + # Multiple list columns (of uneven length) + nf2 = NestedFrame( + { + "c": [1, 2, 3], + "d": [2, 4, 6], + "e": [[1, 2, 3], [4, 5, 6, 7], [8, 9]], + "f": [[10, 20, 30], [40, 50, 60, 70], [80, 90]], + }, + index=[0, 1, 2], + ) + + res = NestedFrame.from_lists(nf2, list_columns=["e", "f"]) + assert list(res.columns) == ["c", "d", "nested"] + assert list(res.nested_columns) == ["nested"] + assert list(res.nested.nest.fields) == ["e", "f"] + # Check for subsetting res = NestedFrame.from_lists(nf, base_columns=["c"], list_columns=["e"]) assert list(res.columns) == ["c", "nested"]