Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

initialize nestedframe #5

Merged
merged 2 commits into from
Apr 4, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/nested_pandas/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from .example_module import greetings, meaning
from .nestedframe import NestedFrame # noqa
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's remove noqa and add to __all__?


# Import for registering
from .series.accessor import NestSeriesAccessor # noqa: F401
Expand Down
1 change: 1 addition & 0 deletions src/nested_pandas/nestedframe/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .core import * # noqa
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we really need a star import here? It would also re-export `pd` and other names that we don't need.

61 changes: 61 additions & 0 deletions src/nested_pandas/nestedframe/core.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# typing.Self and "|" union syntax don't exist in Python 3.9
from __future__ import annotations

import pandas as pd

from nested_pandas.series import packer


class NestedFrame(pd.DataFrame):
"""A Pandas Dataframe extension with support for nested structure.

See https://pandas.pydata.org/docs/development/extending.html#subclassing-pandas-data-structures
"""

# normal properties
_metadata = ["added_property"]

@property
def _constructor(self) -> type[NestedFrame]:
    """Return the class pandas uses to build results of the same dimension.

    Pandas consults this hook when a manipulation produces a result with the
    same dimensionality as the original object; returning ``NestedFrame``
    keeps such results from degrading to a plain ``pd.DataFrame``.

    Returns
    -------
    type[NestedFrame]
        The ``NestedFrame`` class itself (not an instance).
    """
    return NestedFrame

@property
def _constructor_expanddim(self) -> type[NestedFrame]:
    """Return the class pandas uses for results with one higher dimension.

    NOTE(review): pandas documents this hook for results that gain a
    dimension (e.g. ``Series.to_frame()``); confirm that returning
    ``NestedFrame`` from a DataFrame subclass is the intended behaviour.

    Returns
    -------
    type[NestedFrame]
        The ``NestedFrame`` class itself (not an instance).
    """
    return NestedFrame

@property
def all_columns(self) -> dict:
"""Return a dictionary of columns for the base frame and each nested column.

The "base" key holds ``self.columns``. Every column that exposes a ``nest``
attribute is added under its own name, mapped to the ``columns`` of that
column's first element.
"""
all_columns = {"base": self.columns}
for column in self.columns:
if hasattr(self[column], "nest"):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would check column's dtype instead, it should be NestedDtype. No code suggestion here, because we need to import it first

from nested_pandas import NestedDtype

if isinstance(self[column].dtype, NestedDtype):

nest_cols = self[column].iloc[0].columns # TODO: Improve access to columns
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
nest_cols = self[column].iloc[0].columns # TODO: Improve access to columns
nest_cols = self[column].nest.fields()

all_columns[column] = nest_cols
return all_columns

@property
def nested_columns(self) -> list:
"""Return the names of the columns that hold nested dataframes.

A column of ``self.columns`` is included when it exposes a ``nest``
attribute; names are collected in column order.
"""
nest_cols = []
for column in self.columns:
if hasattr(self[column], "nest"):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if hasattr(self[column], "nest"):
if isinstance(self[column].dtype, NestedDtype):

nest_cols.append(column)
return nest_cols

def _is_known_hierarchical_column(self, colname) -> bool:
    """Determine whether a string is a known hierarchical column name.

    A hierarchical name has the form ``"<nested column>.<field>"``: the part
    before the first ``"."`` must be one of ``self.nested_columns`` and the
    remainder must be listed in ``self.all_columns`` for that nested column.

    Parameters
    ----------
    colname : str
        The column label to check.

    Returns
    -------
    bool
        True if ``colname`` names a field of a known nested column, False
        otherwise (including labels without a ``"."`` and non-string labels).
    """
    # Non-string labels (e.g. integer column names) cannot be hierarchical,
    # and testing `"." in colname` on them would raise a TypeError.
    if not isinstance(colname, str):
        return False

    # Split on the first "." only: unpacking the result of a plain
    # `split(".")` raises ValueError for names containing several dots.
    nested_name, separator, field_name = colname.partition(".")
    if not separator or nested_name not in self.nested_columns:
        return False
    return field_name in self.all_columns[nested_name]

def add_nested(self, nested, name) -> Self:  # type: ignore[name-defined] # noqa: F821
    """Pack a flat dataframe into a nested column and attach it to this frame.

    ``nested`` is packed with ``packer.pack_flat`` under ``name``; the result
    of ``self.assign`` with the packed column, keyed by the packed column's
    own name, is returned.
    """
    packed_column = packer.pack_flat(nested, name=name)
    # Keyword names passed to `assign` must be strings, hence the conversion.
    new_columns = {str(packed_column.name): packed_column}
    return self.assign(**new_columns)
76 changes: 76 additions & 0 deletions tests/nested_pandas/nestedframe/test_nestedframe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
import pandas as pd
from nested_pandas import NestedFrame


def test_nestedframe_construction():
    """Check that the constructor produces a NestedFrame instance."""
    column_data = {"a": [1, 2, 3], "b": [2, 4, 6]}
    frame = NestedFrame(data=column_data, index=[0, 1, 2])

    assert isinstance(frame, NestedFrame)


def test_all_columns():
    """Check all_columns before and after a nested column is added."""
    frame = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])

    # Without any nesting only the "base" entry is reported.
    assert list(frame.all_columns.keys()) == ["base"]
    assert list(frame.all_columns["base"]) == list(frame.columns)

    flat_data = {"c": [0, 2, 4, 1, 4, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]}
    flat = pd.DataFrame(data=flat_data, index=[0, 0, 0, 1, 1, 1, 2, 2, 2])

    frame = frame.add_nested(flat, "nested")

    # After packing, the nested column gets its own entry listing its fields.
    assert list(frame.all_columns.keys()) == ["base", "nested"]
    assert list(frame.all_columns["nested"]) == list(flat.columns)


def test_nested_columns():
    """Check that nested_columns lists exactly the packed column names."""
    frame = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])

    flat_data = {"c": [0, 2, 4, 1, 4, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]}
    flat = pd.DataFrame(data=flat_data, index=[0, 0, 0, 1, 1, 1, 2, 2, 2])

    frame = frame.add_nested(flat, "nested")

    assert frame.nested_columns == ["nested"]


def test_is_known_hierarchical_column():
    """Check recognition of "<nested>.<field>" style column labels."""
    frame = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])

    flat_data = {"c": [0, 2, 4, 1, 4, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]}
    flat = pd.DataFrame(data=flat_data, index=[0, 0, 0, 1, 1, 1, 2, 2, 2])

    frame = frame.add_nested(flat, "nested")

    # A field that exists inside the nested column is recognised ...
    assert frame._is_known_hierarchical_column("nested.c")
    # ... a base column name used as a nested field is not ...
    assert not frame._is_known_hierarchical_column("nested.b")
    # ... and neither is a prefix that is not a nested column.
    assert not frame._is_known_hierarchical_column("base.a")


def test_add_nested():
    """Check that add_nested attaches a nested column that round-trips to flat."""
    frame = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])

    flat_data = {"c": [0, 2, 4, 1, 4, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]}
    flat = pd.DataFrame(data=flat_data, index=[0, 0, 0, 1, 1, 1, 2, 2, 2])

    frame = frame.add_nested(flat, "nested")

    assert "nested" in frame.columns
    # Unpacking the nested column must reproduce the original flat frame.
    round_tripped = frame.nested.nest.to_flat()
    assert round_tripped.equals(flat)
Loading