lincc-frameworks · dougbrn · Apr 4, 2024 · Apr 3, 2024 · Apr 3, 2024 · hombit
diff --git a/src/nested_pandas/__init__.py b/src/nested_pandas/__init__.py
@@ -1,4 +1,5 @@
 from .example_module import greetings, meaning
+from .nestedframe import NestedFrame  # noqa
 
 # Import for registering
 from .series.accessor import NestSeriesAccessor  # noqa: F401

diff --git a/src/nested_pandas/nestedframe/__init__.py b/src/nested_pandas/nestedframe/__init__.py
@@ -0,0 +1 @@
+# Re-export only the public class; a wildcard import would also leak
+# core's own imports (e.g. `pd`, `packer`) into this package namespace.
+from .core import NestedFrame  # noqa: F401
diff --git a/src/nested_pandas/nestedframe/core.py b/src/nested_pandas/nestedframe/core.py
@@ -0,0 +1,61 @@
+# typing.Self and "|" union syntax don't exist in Python 3.9
+from __future__ import annotations
+
+import pandas as pd
+
+from nested_pandas.series import packer
+
+
+class NestedFrame(pd.DataFrame):
+    """A Pandas Dataframe extension with support for nested structure.
+
+    See https://pandas.pydata.org/docs/development/extending.html#subclassing-pandas-data-structures
+    """
+
+    # normal properties
+    _metadata = ["added_property"]
+
+    @property
+    def _constructor(self) -> Self:  # type: ignore[name-defined] # noqa: F821
+        """Class pandas should use for results with the same dimensionality.
+
+        Part of the pandas subclassing protocol linked in the class docstring.
+        """
+        # NOTE(review): this returns the class itself, not an instance, so
+        # `type[Self]` would describe it more precisely than `Self` - confirm.
+        return NestedFrame
+
+    @property
+    def _constructor_expanddim(self) -> Self:  # type: ignore[name-defined] # noqa: F821
+        """Class pandas should use for results one dimension higher.
+
+        Part of the pandas subclassing protocol linked in the class docstring.
+        """
+        # NOTE(review): a DataFrame is already two-dimensional, so presumably
+        # this hook is never exercised - confirm whether the override is needed.
+        return NestedFrame
+
+    @property
+    def all_columns(self) -> dict:
+        """Map "base" and each nested column name to its column labels.
+
+        Returns
+        -------
+        dict
+            Starts as ``{"base": self.columns}``; one more entry, keyed by the
+            column name, is added for every column that exposes a ``nest``
+            accessor.
+        """
+        all_columns = {"base": self.columns}
+        for column in self.columns:
+            if hasattr(self[column], "nest"):
+                nest_cols = self[column].iloc[0].columns  # TODO: Improve access to columns
-                nest_cols = self[column].iloc[0].columns  # TODO: Improve access to columns
+                nest_cols = self[column].nest.fields()
-                nest_cols = self[column].iloc[0].columns  # TODO: Improve access to columns
+                nest_cols = self[column].nest.fields()
+                all_columns[column] = nest_cols
+        return all_columns
+
+    @property
+    def nested_columns(self) -> list:
+        """Return the names of the columns of this frame that are nested.
+
+        Returns
+        -------
+        list
+            Column names, in ``self.columns`` order, that pass the nested check.
+        """
+        nest_cols = []
+        for column in self.columns:
+            # NOTE(review): both the original `hasattr` check and the suggested
+            # `NestedDtype` check appear below; `NestedDtype` is not imported in
+            # this module as shown - confirm the import when resolving.
+            if hasattr(self[column], "nest"):
-            if hasattr(self[column], "nest"):
+            if isinstance(self[column].dtype, NestedDtype):
-            if hasattr(self[column], "nest"):
+            if isinstance(self[column].dtype, NestedDtype):
+                nest_cols.append(column)
+        return nest_cols
+
+    def _is_known_hierarchical_column(self, colname) -> bool:
+        """Determine whether a label names a field of a known nested column.
+
+        A hierarchical label has the form ``<nested column>.<field>``.
+
+        Parameters
+        ----------
+        colname : str
+            Label to check. Only the first "." separates the nested column
+            from the field, so a label containing several dots is evaluated
+            instead of raising ``ValueError``.
+
+        Returns
+        -------
+        bool
+            True if the part before the first "." is in ``nested_columns`` and
+            the remainder is listed for it in ``all_columns``; False otherwise,
+            including for non-string labels and labels without a ".".
+        """
+        # Column labels need not be strings (e.g. integers); such labels can
+        # never be hierarchical, so answer False rather than failing on `in`.
+        if not isinstance(colname, str):
+            return False
+        left, sep, right = colname.partition(".")
+        if not sep or left not in self.nested_columns:
+            return False
+        return right in self.all_columns[left]
+
+    def add_nested(self, nested, name) -> Self:  # type: ignore[name-defined] # noqa: F821
+        """Pack a flat dataframe and attach it to this frame as a nested column.
+
+        ``nested`` is packed with ``packer.pack_flat`` and the packed series is
+        assigned under its own name; the result of ``self.assign`` is returned.
+        """
+        packed_series = packer.pack_flat(nested, name=name)
+        new_column = {f"{packed_series.name}": packed_series}
+        return self.assign(**new_column)
diff --git a/tests/nested_pandas/nestedframe/test_nestedframe.py b/tests/nested_pandas/nestedframe/test_nestedframe.py
@@ -0,0 +1,76 @@
+import pandas as pd
+from nested_pandas import NestedFrame
+
+
+def test_nestedframe_construction():
+    """Building a NestedFrame from plain column data yields a NestedFrame."""
+    columns = {"a": [1, 2, 3], "b": [2, 4, 6]}
+    frame = NestedFrame(data=columns, index=[0, 1, 2])
+    assert isinstance(frame, NestedFrame)
+
+
+def test_all_columns():
+    """all_columns reports the base columns, plus a nest's columns once added."""
+    frame = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])
+
+    # Before nesting, only the "base" entry exists and mirrors the frame's columns.
+    columns_before = frame.all_columns
+    assert list(columns_before) == ["base"]
+    assert list(columns_before["base"]) == list(frame.columns)
+
+    flat = pd.DataFrame(
+        data={"c": [0, 2, 4, 1, 4, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
+        index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
+    )
+    frame = frame.add_nested(flat, "nested")
+
+    # After nesting, a second entry keyed by the nested column name appears.
+    columns_after = frame.all_columns
+    assert list(columns_after) == ["base", "nested"]
+    assert list(columns_after["nested"]) == list(flat.columns)
+
+
+def test_nested_columns():
+    """nested_columns lists only the column created by add_nested."""
+    frame = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])
+    flat = pd.DataFrame(
+        data={"c": [0, 2, 4, 1, 4, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
+        index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
+    )
+
+    with_nest = frame.add_nested(flat, "nested")
+
+    assert with_nest.nested_columns == ["nested"]
+
+
+def test_is_known_hierarchical_column():
+    """Only labels naming a field of a nested column count as hierarchical."""
+    frame = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])
+    flat = pd.DataFrame(
+        data={"c": [0, 2, 4, 1, 4, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
+        index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
+    )
+    frame = frame.add_nested(flat, "nested")
+
+    # Cases: a real nested field, a base column name used as a field, and the
+    # "base" prefix, which is not itself a nested column.
+    is_known = frame._is_known_hierarchical_column
+    assert is_known("nested.c")
+    assert not is_known("nested.b")
+    assert not is_known("base.a")
+
+
+def test_add_nested():
+    """add_nested adds a column whose flattened form equals the packed frame."""
+    frame = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])
+    flat = pd.DataFrame(
+        data={"c": [0, 2, 4, 1, 4, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
+        index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
+    )
+    frame = frame.add_nested(flat, "nested")
+
+    assert "nested" in frame.columns
+    # Flattening the nested column must give back the frame that was packed.
+    round_tripped = frame.nested.nest.to_flat()
+    assert round_tripped.equals(flat)
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		from .core import * # noqa
Review comment by hombit (Collaborator), Apr 3, 2024: Do we really need a star-import here? It would also export `pd` and other names we don't really need.