pack_seq and df.add_nested([df1,df2])

lincc-frameworks · May 3, 2024 · 442321b · 442321b
1 parent 109c16c
commit 442321b
Show file tree

Hide file tree

Showing 4 changed files with 280 additions and 26 deletions.
diff --git a/src/nested_pandas/nestedframe/core.py b/src/nested_pandas/nestedframe/core.py
@@ -3,6 +3,7 @@
 
 import numpy as np
 import pandas as pd
+import pyarrow as pa
 from pandas._libs import lib
 from pandas._typing import Any, AnyAll, Axis, IndexLabel
 from pandas.api.extensions import no_default
@@ -62,10 +63,39 @@ def _is_known_column(self, colname) -> bool:
         """Determine whether a string is a known column name"""
         return colname in self.columns or self._is_known_hierarchical_column(colname)
 
-    def add_nested(self, nested, name) -> Self:  # type: ignore[name-defined] # noqa: F821
-        """Packs a dataframe into a nested column"""
+    def add_nested(
+        self, obj, name: str, *, dtype: NestedDtype | pd.ArrowDtype | pa.DataType | None = None
+    ) -> Self:  # type: ignore[name-defined] # noqa: F821
+        """Packs the input object into a nested column and adds it to the NestedFrame
+
+        This method returns a new NestedFrame with the added nested column.
+
+        Parameters
+        ----------
+        obj : pd.DataFrame or a sequence of items convertible to nested structures
+            The object to be packed into nested pd.Series and added to
+            the NestedFrame. If a DataFrame is passed, it must have non-unique
+            index values, which are used to pack the DataFrame. If a sequence
+            of elements is passed, it is packed into a nested pd.Series.
+            Sequence elements may be individual pd.DataFrames, dictionaries
+            (keys are nested column names, values are arrays of the same
+            length), or any other object convertible to pa.StructArray.
+            Additionally, None and pd.NA are allowed as elements to represent
+            missing values.
+        name : str
+            The name of the nested column to be added to the NestedFrame.
+        dtype : dtype or None
+            NestedDtype to use for the nested column; pd.ArrowDtype or
+            pa.DataType can also be used to specify the nested dtype. If None,
+            the dtype is inferred from the input object.
+
+        Returns
+        -------
+        NestedFrame
+            A new NestedFrame with the added nested column.
+        """
         # Add sources to objects
-        packed = packer.pack_flat(nested, name=name)
+        packed = packer.pack(obj, name=name, dtype=dtype)
         label = packed.name
         return self.assign(**{f"{label}": packed})
 

diff --git a/src/nested_pandas/series/packer.py b/src/nested_pandas/series/packer.py
@@ -16,12 +16,47 @@
 from nested_pandas.series.dtype import NestedDtype
 from nested_pandas.series.ext_array import NestedExtensionArray
 
-__all__ = ["pack_flat", "pack_lists", "pack_dfs"]
+__all__ = ["pack", "pack_flat", "pack_lists", "pack_seq"]
 
 
 N_ROWS_INFER_DTYPE = 1000
 
 
+def pack(
+    obj,
+    name: str | None = None,
+    *,
+    index=None,
+    dtype: NestedDtype | pd.ArrowDtype | pa.DataType | None = None,
+) -> pd.Series:
+    """Pack a "flat" dataframe or a sequence of dataframes into a "nested" series.
+
+    Parameters
+    ----------
+    obj : pd.DataFrame or Sequence of pd.DataFrame or None or pd.NA
+        Input dataframe, with repeated indexes, or a sequence of dataframes or missing values.
+    name : str, optional
+        Name of the output series.
+    index : convertible to pd.Index, optional
+        Index of the output series. If obj is a pd.DataFrame, it is always nested by the original index,
+        and this value is used to override the index after the nesting.
+    dtype : dtype or None
+        NestedDtype of the output series, or other type to derive from. If None,
+        the dtype is inferred from the first non-missing dataframe.
+
+    Returns
+    -------
+    pd.Series
+        Output series.
+    """
+    if isinstance(obj, pd.DataFrame):
+        nested = pack_flat(obj, name=name)
+        if index is not None:
+            nested.index = index
+        return nested
+    return pack_seq(obj, name=name, index=index, dtype=dtype)
+
+
 def pack_flat_into_df(df: pd.DataFrame, name=None) -> pd.DataFrame:
     """Pack a "flat" dataframe into a "nested" dataframe.
 
@@ -86,35 +121,40 @@ def pack_flat(df: pd.DataFrame, name: str | None = None) -> pd.Series:
     return pack_sorted_df_into_struct(flat, name=name)
 
 
-def pack_dfs(dfs: Sequence[pd.DataFrame], index: object = None, name: str | None = None) -> pd.Series:
+def pack_seq(
+    sequence: Sequence,
+    name: str | None = None,
+    *,
+    index: object = None,
+    dtype: NestedDtype | pd.ArrowDtype | pa.DataType | None = None,
+) -> pd.Series:
     """Pack a sequence of "flat" dataframes into a "nested" series.
 
     Parameters
     ----------
-    dfs : Sequence[pd.DataFrame]
-        Input sequence of dataframes.
-    index : pd.Index, optional
-        Index of the output series.
+    sequence : Sequence of pd.DataFrame or None or pd.NA or convertible to pa.StructScalar
+        Input sequence of dataframes or missing values.
     name : str, optional
         Name of the output series.
+    index : pd.Index, optional
+        Index of the output series.
+    dtype : dtype or None
+        NestedDtype of the output series, or other type to derive from. If None,
+        the dtype is inferred from the first non-missing dataframe.
 
     Returns
     -------
     pd.Series
         Output series.
     """
-    if isinstance(dfs, pd.Series) and index is None:
-        index = dfs.index
-
-    first_df = dfs.iloc[0] if hasattr(dfs, "iloc") else dfs[0]
-
-    field_types = {
-        column: pa.array(first_df[column].iloc[:N_ROWS_INFER_DTYPE]).type for column in first_df.columns
-    }
-    dtype = NestedDtype.from_fields(field_types)
-    dummy_value: dict[str, list] = {column: [] for column in first_df.columns}
-    series = pd.Series([dummy_value] * len(dfs), dtype=dtype, index=index, name=name)
-    series[:] = dfs
+    if isinstance(sequence, pd.Series):
+        if index is None:
+            index = sequence.index
+        if name is None:
+            name = sequence.name
+
+    ext_array = NestedExtensionArray.from_sequence(sequence, dtype=dtype)
+    series = pd.Series(ext_array, index=index, name=name, copy=False)
     return series
 
 

diff --git a/tests/nested_pandas/nestedframe/test_nestedframe.py b/tests/nested_pandas/nestedframe/test_nestedframe.py
@@ -2,6 +2,7 @@
 import pandas as pd
 import pytest
 from nested_pandas import NestedFrame
+from pandas.testing import assert_frame_equal
 
 
 def test_nestedframe_construction():
@@ -62,7 +63,7 @@ def test_is_known_hierarchical_column():
     assert not base._is_known_hierarchical_column("base.a")
 
 
-def test_add_nested():
+def test_add_nested_with_flat_df():
     """Test that add_nested correctly adds a nested column to the base df"""
 
     base = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])
@@ -78,7 +79,7 @@ def test_add_nested():
     assert base.nested.nest.to_flat().equals(nested)
 
 
-def test_add_nested_with_mismatched_index():
+def test_add_nested_with_flat_df_and_mismatched_index():
     """Test add_nested when index values of base are missing matches in nested"""
 
     base = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])
@@ -94,6 +95,40 @@ def test_add_nested_with_mismatched_index():
     assert pd.isna(base.loc[2]["nested"])
 
 
+def test_add_nested_with_series():
+    """Test that add_nested correctly adds a nested column from a series of dataframes"""
+
+    base = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])
+
+    nested = pd.Series(
+        data=[pd.DataFrame({"c": [0, 1]}), pd.DataFrame({"c": [1, 2]}), pd.DataFrame({"c": [2, 3]})],
+        index=[0, 1, 2],
+        name="c",
+    )
+
+    base = base.add_nested(nested, "nested")
+
+    assert "nested" in base.columns
+    for i in range(3):
+        assert_frame_equal(base.iloc[i]["nested"], nested[i])
+
+
+def test_add_nested_with_series_and_mismatched_index():
+    """Test add_nested when index values of base are missing matches in nested"""
+    base = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])
+
+    nested = pd.Series(
+        data=[pd.DataFrame({"c": [0, 1]}), pd.DataFrame({"c": [2, 3]})],
+        index=[0, 2],  # no data for index value of "1"
+        name="c",
+    )
+
+    base = base.add_nested(nested, "nested")
+
+    assert "nested" in base.columns
+    assert pd.isna(base.loc[1]["nested"])
+
+
 def test_query():
     """Test that NestedFrame.query handles nested queries correctly"""