Skip to content

Commit

Permalink
pack_seq and df.add_nested([df1,df2])
Browse files Browse the repository at this point in the history
  • Loading branch information
hombit committed May 3, 2024
1 parent 109c16c commit 442321b
Show file tree
Hide file tree
Showing 4 changed files with 280 additions and 26 deletions.
36 changes: 33 additions & 3 deletions src/nested_pandas/nestedframe/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import numpy as np
import pandas as pd
import pyarrow as pa
from pandas._libs import lib
from pandas._typing import Any, AnyAll, Axis, IndexLabel
from pandas.api.extensions import no_default
Expand Down Expand Up @@ -62,10 +63,39 @@ def _is_known_column(self, colname) -> bool:
"""Determine whether a string is a known column name"""
return colname in self.columns or self._is_known_hierarchical_column(colname)

def add_nested(self, nested, name) -> Self: # type: ignore[name-defined] # noqa: F821
"""Packs a dataframe into a nested column"""
def add_nested(
    self, obj, name: str, *, dtype: NestedDtype | pd.ArrowDtype | pa.DataType | None = None
) -> Self:  # type: ignore[name-defined] # noqa: F821
    """Pack the input object into a nested column and add it to the NestedFrame.

    This method returns a new NestedFrame with the added nested column.

    Parameters
    ----------
    obj : pd.DataFrame or a sequence of items convertible to nested structures
        The object to be packed into a nested pd.Series and added to
        the NestedFrame. If a DataFrame is passed, its rows are grouped by
        index value, so the index is expected to contain repeated values.
        If a sequence of elements is passed, it is packed into a nested
        pd.Series element by element. Sequence elements may be individual
        pd.DataFrames, dictionaries (keys are nested column names, values
        are arrays of the same length), or any other object convertible to
        pa.StructArray. Additionally, None and pd.NA are allowed as
        elements to represent missing values.
    name : str
        The name of the nested column to be added to the NestedFrame.
    dtype : dtype or None
        NestedDtype to use for the nested column; pd.ArrowDtype or
        pa.DataType can also be used to specify the nested dtype. If None,
        the dtype is inferred from the input object.

    Returns
    -------
    NestedFrame
        A new NestedFrame with the added nested column.
    """
    # A single packing call handles both flat dataframes and sequences;
    # the packer dispatches on the type of `obj`.
    packed = packer.pack(obj, name=name, dtype=dtype)
    # Use the name carried by the packed series as the new column label.
    label = packed.name
    return self.assign(**{f"{label}": packed})

Expand Down
76 changes: 58 additions & 18 deletions src/nested_pandas/series/packer.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,47 @@
from nested_pandas.series.dtype import NestedDtype
from nested_pandas.series.ext_array import NestedExtensionArray

__all__ = ["pack_flat", "pack_lists", "pack_dfs"]
# Names exported by `from ... import *` for this module.
__all__ = ["pack", "pack_flat", "pack_lists", "pack_seq"]


# Row cap used as a slice bound (`.iloc[:N_ROWS_INFER_DTYPE]`) when sampling
# column values for Arrow dtype inference.
# NOTE(review): confirm which helpers in this module still rely on it.
N_ROWS_INFER_DTYPE = 1000


def pack(
    obj,
    name: str | None = None,
    *,
    index=None,
    dtype: NestedDtype | pd.ArrowDtype | pa.DataType | None = None,
) -> pd.Series:
    """Pack a "flat" dataframe or a sequence of dataframes into a "nested" series.

    Dispatches on the type of ``obj``: a pd.DataFrame is packed with
    ``pack_flat``, anything else is handed to ``pack_seq``.

    Parameters
    ----------
    obj : pd.DataFrame or Sequence
        Input dataframe with repeated index values, or a sequence of
        dataframes and/or missing values.
    name : str, optional
        Name of the output series.
    index : convertible to pd.Index, optional
        Index of the output series. If obj is a pd.DataFrame, it is always
        nested by its original index first, and this value then replaces
        the index of the nested result.
    dtype : dtype or None
        NestedDtype of the output series, or another type to derive it from.
        If None, the dtype is inferred from the input. Only forwarded on the
        sequence path; it is not passed to ``pack_flat`` for a pd.DataFrame.

    Returns
    -------
    pd.Series
        Output series.
    """
    # Anything that is not a flat dataframe goes through the sequence packer.
    if not isinstance(obj, pd.DataFrame):
        return pack_seq(obj, name=name, index=index, dtype=dtype)

    packed = pack_flat(obj, name=name)
    if index is not None:
        # Override the index only after nesting by the original one.
        packed.index = index
    return packed


def pack_flat_into_df(df: pd.DataFrame, name=None) -> pd.DataFrame:
"""Pack a "flat" dataframe into a "nested" dataframe.
Expand Down Expand Up @@ -86,35 +121,40 @@ def pack_flat(df: pd.DataFrame, name: str | None = None) -> pd.Series:
return pack_sorted_df_into_struct(flat, name=name)


def pack_dfs(dfs: Sequence[pd.DataFrame], index: object = None, name: str | None = None) -> pd.Series:
def pack_seq(
    sequence: Sequence,
    name: str | None = None,
    *,
    index: object = None,
    dtype: NestedDtype | pd.ArrowDtype | pa.DataType | None = None,
) -> pd.Series:
    """Pack a sequence of "flat" dataframes into a "nested" series.

    Parameters
    ----------
    sequence : Sequence of pd.DataFrame or None or pd.NA or convertible to pa.StructScalar
        Input sequence of dataframes or missing values.
    name : str, optional
        Name of the output series. If omitted and `sequence` is a pd.Series,
        the name of that series is used.
    index : pd.Index, optional
        Index of the output series. If omitted and `sequence` is a pd.Series,
        the index of that series is used.
    dtype : dtype or None
        NestedDtype of the output series, or another type to derive it from.
        If None, the dtype is inferred from the first non-missing dataframe.

    Returns
    -------
    pd.Series
        Output series.
    """
    if isinstance(sequence, pd.Series):
        # Inherit labels from an input series unless the caller overrides them.
        if index is None:
            index = sequence.index
        if name is None:
            name = sequence.name

    # Conversion of the elements (and dtype inference when `dtype` is None)
    # is delegated to the extension array constructor.
    ext_array = NestedExtensionArray.from_sequence(sequence, dtype=dtype)
    return pd.Series(ext_array, index=index, name=name, copy=False)


Expand Down
39 changes: 37 additions & 2 deletions tests/nested_pandas/nestedframe/test_nestedframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import pandas as pd
import pytest
from nested_pandas import NestedFrame
from pandas.testing import assert_frame_equal


def test_nestedframe_construction():
Expand Down Expand Up @@ -62,7 +63,7 @@ def test_is_known_hierarchical_column():
assert not base._is_known_hierarchical_column("base.a")


def test_add_nested():
def test_add_nested_with_flat_df():
"""Test that add_nested correctly adds a nested column to the base df"""

base = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])
Expand All @@ -78,7 +79,7 @@ def test_add_nested():
assert base.nested.nest.to_flat().equals(nested)


def test_add_nested_with_mismatched_index():
def test_add_nested_with_flat_df_and_mismatched_index():
"""Test add_nested when index values of base are missing matches in nested"""

base = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])
Expand All @@ -94,6 +95,40 @@ def test_add_nested_with_mismatched_index():
assert pd.isna(base.loc[2]["nested"])


def test_add_nested_with_series():
    """Test that add_nested packs a series of dataframes into a nested column"""

    frames = [
        pd.DataFrame({"c": [0, 1]}),
        pd.DataFrame({"c": [1, 2]}),
        pd.DataFrame({"c": [2, 3]}),
    ]
    nested = pd.Series(data=frames, index=[0, 1, 2], name="c")

    base = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])
    base = base.add_nested(nested, "nested")

    assert "nested" in base.columns
    # Each row of the nested column must round-trip to its source dataframe.
    for position, (_, expected) in enumerate(nested.items()):
        actual = base.iloc[position]["nested"]
        assert_frame_equal(actual, expected)


def test_add_nested_with_series_and_mismatched_index():
    """Test add_nested with a series whose index lacks some of the base index values"""

    # The series deliberately has no entry for index value 1.
    frames = [pd.DataFrame({"c": [0, 1]}), pd.DataFrame({"c": [2, 3]})]
    nested = pd.Series(data=frames, index=[0, 2], name="c")

    base = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])
    base = base.add_nested(nested, "nested")

    assert "nested" in base.columns
    # The unmatched base row must hold a missing value in the nested column.
    unmatched_row = base.loc[1]
    assert pd.isna(unmatched_row["nested"])


def test_query():
"""Test that NestedFrame.query handles nested queries correctly"""

Expand Down
Loading

0 comments on commit 442321b

Please sign in to comment.