From 442321b309583ccac978d60b31444cde298bd093 Mon Sep 17 00:00:00 2001
From: Konstantin Malanchev <hombit@gmail.com>
Date: Fri, 3 May 2024 12:59:19 -0400
Subject: [PATCH] pack_seq and df.add_nested([df1,df2])

---
 src/nested_pandas/nestedframe/core.py         |  36 +++-
 src/nested_pandas/series/packer.py            |  76 +++++++--
 .../nestedframe/test_nestedframe.py           |  39 ++++-
 tests/nested_pandas/series/test_packer.py     | 155 +++++++++++++++++-
 4 files changed, 280 insertions(+), 26 deletions(-)

diff --git a/src/nested_pandas/nestedframe/core.py b/src/nested_pandas/nestedframe/core.py
index 1c8e182..71dd33f 100644
--- a/src/nested_pandas/nestedframe/core.py
+++ b/src/nested_pandas/nestedframe/core.py
@@ -3,6 +3,7 @@
 
 import numpy as np
 import pandas as pd
+import pyarrow as pa
 from pandas._libs import lib
 from pandas._typing import Any, AnyAll, Axis, IndexLabel
 from pandas.api.extensions import no_default
@@ -62,10 +63,39 @@ def _is_known_column(self, colname) -> bool:
         """Determine whether a string is a known column name"""
         return colname in self.columns or self._is_known_hierarchical_column(colname)
 
-    def add_nested(self, nested, name) -> Self:  # type: ignore[name-defined] # noqa: F821
-        """Packs a dataframe into a nested column"""
+    def add_nested(
+        self, obj, name: str, *, dtype: NestedDtype | pd.ArrowDtype | pa.DataType | None = None
+    ) -> Self:  # type: ignore[name-defined] # noqa: F821
+        """Packs input object to a nested column and adds it to the NestedFrame
+
+        This method returns a new NestedFrame with the added nested column.
+
+        Parameters
+        ----------
+        obj : pd.DataFrame or a sequence of items convertible to nested structures
+            The object to be packed into nested pd.Series and added to
+            the NestedFrame. If a DataFrame is passed, it must have non-unique
+            index values, which are used to pack the DataFrame. If a sequence
+            of elements is passed, it is packed into a nested pd.Series.
+            Sequence elements may be individual pd.DataFrames, dictionaries
+            (keys are nested column names, values are arrays of the same
+            length), or any other object convertible to pa.StructArray.
+            Additionally, None and pd.NA are allowed as elements to represent
+            missing values.
+        name : str
+            The name of the nested column to be added to the NestedFrame.
+        dtype : dtype or None
+            NestedDtype to use for the nested column; pd.ArrowDtype or
+            pa.DataType can also be used to specify the nested dtype. If None,
+            the dtype is inferred from the input object.
+
+        Returns
+        -------
+        NestedFrame
+            A new NestedFrame with the added nested column.
+        """
         # Add sources to objects
-        packed = packer.pack_flat(nested, name=name)
+        packed = packer.pack(obj, name=name, dtype=dtype)
         label = packed.name
         return self.assign(**{f"{label}": packed})
 
diff --git a/src/nested_pandas/series/packer.py b/src/nested_pandas/series/packer.py
index 0bb25c4..15b155c 100644
--- a/src/nested_pandas/series/packer.py
+++ b/src/nested_pandas/series/packer.py
@@ -16,12 +16,47 @@
 from nested_pandas.series.dtype import NestedDtype
 from nested_pandas.series.ext_array import NestedExtensionArray
 
-__all__ = ["pack_flat", "pack_lists", "pack_dfs"]
+__all__ = ["pack", "pack_flat", "pack_lists", "pack_seq"]
 
 
 N_ROWS_INFER_DTYPE = 1000
 
 
+def pack(
+    obj,
+    name: str | None = None,
+    *,
+    index=None,
+    dtype: NestedDtype | pd.ArrowDtype | pa.DataType | None = None,
+) -> pd.Series:
+    """Pack a "flat" dataframe or a sequence of dataframes into a "nested" series.
+
+    Parameters
+    ----------
+    obj : pd.DataFrame or Sequence
+        Input dataframe, with repeated indexes, or a sequence of dataframes or missing values.
+    name : str, optional
+        Name of the output series.
+    index : convertible to pd.Index, optional
+        Index of the output series. If obj is a pd.DataFrame, it is always nested by the original index,
+        and this value is used to override the index after the nesting.
+    dtype : dtype or None
+        NestedDtype of the output series, or other type to derive from; ignored if obj is a
+        pd.DataFrame. If None, the dtype is inferred from the first non-missing dataframe.
+
+    Returns
+    -------
+    pd.Series
+        Output series.
+    """
+    if isinstance(obj, pd.DataFrame):
+        nested = pack_flat(obj, name=name)
+        if index is not None:
+            nested.index = index
+        return nested
+    return pack_seq(obj, name=name, index=index, dtype=dtype)
+
+
 def pack_flat_into_df(df: pd.DataFrame, name=None) -> pd.DataFrame:
     """Pack a "flat" dataframe into a "nested" dataframe.
 
@@ -86,35 +121,40 @@ def pack_flat(df: pd.DataFrame, name: str | None = None) -> pd.Series:
     return pack_sorted_df_into_struct(flat, name=name)
 
 
-def pack_dfs(dfs: Sequence[pd.DataFrame], index: object = None, name: str | None = None) -> pd.Series:
+def pack_seq(
+    sequence: Sequence,
+    name: str | None = None,
+    *,
+    index: object = None,
+    dtype: NestedDtype | pd.ArrowDtype | pa.DataType | None = None,
+) -> pd.Series:
     """Pack a sequence of "flat" dataframes into a "nested" series.
 
     Parameters
     ----------
-    dfs : Sequence[pd.DataFrame]
-        Input sequence of dataframes.
-    index : pd.Index, optional
-        Index of the output series.
+    sequence : Sequence of pd.DataFrame or None or pd.NA or convertible to pa.StructScalar
+        Input sequence of dataframes or missing values.
     name : str, optional
         Name of the output series.
+    index : pd.Index, optional
+        Index of the output series.
+    dtype : dtype or None
+        NestedDtype of the output series, or other type to derive from. If None,
+        the dtype is inferred from the first non-missing dataframe.
 
     Returns
     -------
     pd.Series
         Output series.
     """
-    if isinstance(dfs, pd.Series) and index is None:
-        index = dfs.index
-
-    first_df = dfs.iloc[0] if hasattr(dfs, "iloc") else dfs[0]
-
-    field_types = {
-        column: pa.array(first_df[column].iloc[:N_ROWS_INFER_DTYPE]).type for column in first_df.columns
-    }
-    dtype = NestedDtype.from_fields(field_types)
-    dummy_value: dict[str, list] = {column: [] for column in first_df.columns}
-    series = pd.Series([dummy_value] * len(dfs), dtype=dtype, index=index, name=name)
-    series[:] = dfs
+    if isinstance(sequence, pd.Series):
+        if index is None:
+            index = sequence.index
+        if name is None:
+            name = sequence.name
+
+    ext_array = NestedExtensionArray.from_sequence(sequence, dtype=dtype)
+    series = pd.Series(ext_array, index=index, name=name, copy=False)
     return series
 
 
diff --git a/tests/nested_pandas/nestedframe/test_nestedframe.py b/tests/nested_pandas/nestedframe/test_nestedframe.py
index 077140f..cff96e4 100644
--- a/tests/nested_pandas/nestedframe/test_nestedframe.py
+++ b/tests/nested_pandas/nestedframe/test_nestedframe.py
@@ -2,6 +2,7 @@
 import pandas as pd
 import pytest
 from nested_pandas import NestedFrame
+from pandas.testing import assert_frame_equal
 
 
 def test_nestedframe_construction():
@@ -62,7 +63,7 @@ def test_is_known_hierarchical_column():
     assert not base._is_known_hierarchical_column("base.a")
 
 
-def test_add_nested():
+def test_add_nested_with_flat_df():
     """Test that add_nested correctly adds a nested column to the base df"""
 
     base = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])
@@ -78,7 +79,7 @@ def test_add_nested():
     assert base.nested.nest.to_flat().equals(nested)
 
 
-def test_add_nested_with_mismatched_index():
+def test_add_nested_with_flat_df_and_mismatched_index():
     """Test add_nested when index values of base are missing matches in nested"""
 
     base = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])
@@ -94,6 +95,40 @@ def test_add_nested_with_mismatched_index():
     assert pd.isna(base.loc[2]["nested"])
 
 
+def test_add_nested_with_series():
+    """Test that add_nested packs a pd.Series of dataframes into a nested column of the base df"""
+
+    base = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])
+
+    nested = pd.Series(
+        data=[pd.DataFrame({"c": [0, 1]}), pd.DataFrame({"c": [1, 2]}), pd.DataFrame({"c": [2, 3]})],
+        index=[0, 1, 2],
+        name="c",
+    )
+
+    base = base.add_nested(nested, "nested")
+
+    assert "nested" in base.columns
+    for i in range(3):
+        assert_frame_equal(base.iloc[i]["nested"], nested[i])
+
+
+def test_add_nested_with_series_and_mismatched_index():
+    """Test add_nested when index values of base are missing matches in nested"""
+    base = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])
+
+    nested = pd.Series(
+        data=[pd.DataFrame({"c": [0, 1]}), pd.DataFrame({"c": [2, 3]})],
+        index=[0, 2],  # no data for index value of "1"
+        name="c",
+    )
+
+    base = base.add_nested(nested, "nested")
+
+    assert "nested" in base.columns
+    assert pd.isna(base.loc[1]["nested"])
+
+
 def test_query():
     """Test that NestedFrame.query handles nested queries correctly"""
 
diff --git a/tests/nested_pandas/series/test_packer.py b/tests/nested_pandas/series/test_packer.py
index 351572b..23c0698 100644
--- a/tests/nested_pandas/series/test_packer.py
+++ b/tests/nested_pandas/series/test_packer.py
@@ -7,6 +7,86 @@
 from pandas.testing import assert_frame_equal, assert_series_equal
 
 
+def test_pack_with_flat_df():
+    """Test pack(pd.DataFrame)."""
+    df = pd.DataFrame(
+        data={
+            "a": [1, 2, 3, 4],
+            "b": [0, 1, 0, 1],
+        },
+        index=[1, 2, 1, 2],
+    )
+    series = packer.pack(df, name="series")
+
+    desired = pd.Series(
+        data=[
+            (np.array([1, 3]), np.array([0, 0])),
+            (np.array([2, 4]), np.array([1, 1])),
+        ],
+        index=[1, 2],
+        dtype=NestedDtype.from_fields(dict(a=pa.int64(), b=pa.int64())),
+        name="series",
+    )
+    assert_series_equal(series, desired)
+
+
+def test_pack_with_flat_df_and_index():
+    """Test pack(pd.DataFrame, index=...)."""
+    df = pd.DataFrame(
+        data={
+            "a": [1, 2, 3, 4],
+            "b": [0, 1, 0, 1],
+        },
+        index=[1, 2, 1, 2],
+    )
+    series = packer.pack(df, name="series", index=[101, 102])
+
+    desired = pd.Series(
+        data=[
+            (np.array([1, 3]), np.array([0, 0])),
+            (np.array([2, 4]), np.array([1, 1])),
+        ],
+        index=[101, 102],
+        dtype=NestedDtype.from_fields(dict(a=pa.int64(), b=pa.int64())),
+        name="series",
+    )
+    assert_series_equal(series, desired)
+
+
+def test_pack_with_series_of_dfs():
+    """Test pack(pd.Series([pd.DataFrame(), ...]))."""
+    input_series = pd.Series(
+        [
+            pd.DataFrame(
+                {
+                    "a": [1, 2],
+                    "b": [0, 1],
+                },
+            ),
+            pd.DataFrame(
+                {
+                    "a": [3, 4],
+                    "b": [0, 1],
+                },
+            ),
+        ],
+        index=[1, 2],
+        name="series",
+    )
+    series = packer.pack(input_series, name="nested")
+
+    desired = pd.Series(
+        data=[
+            (np.array([1, 2]), np.array([0, 1])),
+            (np.array([3, 4]), np.array([0, 1])),
+        ],
+        index=[1, 2],
+        name="nested",
+        dtype=NestedDtype.from_fields(dict(a=pa.int64(), b=pa.int64())),
+    )
+    assert_series_equal(series, desired)
+
+
 def test_pack_flat_into_df():
     """Test pack_flat_into_df()."""
     df = pd.DataFrame(
@@ -132,8 +212,8 @@ def test_pack_lists():
         assert_series_equal(series.nest.get_list_series(field_name), packed_df[field_name])
 
 
-def test_pack_dfs():
-    """Test pack_dfs()."""
+def test_pack_seq_with_dfs_and_index():
+    """Test pack_seq()."""
     dfs = [
         pd.DataFrame(
             data={
@@ -164,7 +244,7 @@ def test_pack_dfs():
             index=[103, 103, 103],
         ),
     ]
-    series = packer.pack_dfs(dfs, index=[100, 101, 102, 103])
+    series = packer.pack_seq(dfs, index=[100, 101, 102, 103])
 
     desired = pd.Series(
         data=[
@@ -179,6 +259,75 @@ def test_pack_dfs():
     assert_series_equal(series, desired)
 
 
+def test_pack_seq_with_different_elements_and_index():
+    """Test pack_seq() with different elements and index"""
+    seq = [
+        pd.DataFrame(
+            data={
+                "a": [1, 2],
+                "b": [0, 1],
+            },
+        ),
+        None,
+        {"a": [3, 4], "b": [-1, 0]},
+        pd.NA,
+    ]
+    series = packer.pack_seq(seq, index=[100, 101, 102, 103])
+
+    desired = pd.Series(
+        data=[
+            (np.array([1, 2]), np.array([0, 1])),
+            None,
+            (np.array([3, 4]), np.array([-1, 0])),
+            pd.NA,
+        ],
+        index=[100, 101, 102, 103],
+        dtype=NestedDtype.from_fields(dict(a=pa.int64(), b=pa.int64())),
+    )
+    assert_series_equal(series, desired)
+
+
+def test_pack_seq_with_series_of_dfs():
+    """Test pack_seq(pd.Series([pd.DataFrame(), ...]))."""
+    input_series = pd.Series(
+        [
+            pd.DataFrame(
+                {
+                    "a": [1, 2],
+                    "b": [0, 1],
+                },
+            ),
+            pd.DataFrame(
+                {
+                    "a": [3, 4],
+                    "b": [0, 1],
+                },
+            ),
+            pd.DataFrame(
+                {
+                    "a": [5, 6],
+                    "b": [0, 1],
+                },
+            ),
+        ],
+        index=[100, 101, 102],
+        name="series",
+    )
+    series = packer.pack_seq(input_series)
+
+    desired = pd.Series(
+        data=[
+            (np.array([1, 2]), np.array([0, 1])),
+            (np.array([3, 4]), np.array([0, 1])),
+            (np.array([5, 6]), np.array([0, 1])),
+        ],
+        index=[100, 101, 102],
+        dtype=NestedDtype.from_fields(dict(a=pa.int64(), b=pa.int64())),
+        name="series",
+    )
+    assert_series_equal(series, desired)
+
+
 def test_view_sorted_df_as_list_arrays():
     """Test view_sorted_df_as_list_arrays()."""
     flat_df = pd.DataFrame(