From 442321b309583ccac978d60b31444cde298bd093 Mon Sep 17 00:00:00 2001 From: Konstantin Malanchev Date: Fri, 3 May 2024 12:59:19 -0400 Subject: [PATCH] pack_seq and df.add_nested([df1,df2]) --- src/nested_pandas/nestedframe/core.py | 36 +++- src/nested_pandas/series/packer.py | 76 +++++++-- .../nestedframe/test_nestedframe.py | 39 ++++- tests/nested_pandas/series/test_packer.py | 155 +++++++++++++++++- 4 files changed, 280 insertions(+), 26 deletions(-) diff --git a/src/nested_pandas/nestedframe/core.py b/src/nested_pandas/nestedframe/core.py index 1c8e182..71dd33f 100644 --- a/src/nested_pandas/nestedframe/core.py +++ b/src/nested_pandas/nestedframe/core.py @@ -3,6 +3,7 @@ import numpy as np import pandas as pd +import pyarrow as pa from pandas._libs import lib from pandas._typing import Any, AnyAll, Axis, IndexLabel from pandas.api.extensions import no_default @@ -62,10 +63,39 @@ def _is_known_column(self, colname) -> bool: """Determine whether a string is a known column name""" return colname in self.columns or self._is_known_hierarchical_column(colname) - def add_nested(self, nested, name) -> Self: # type: ignore[name-defined] # noqa: F821 - """Packs a dataframe into a nested column""" + def add_nested( + self, obj, name: str, *, dtype: NestedDtype | pd.ArrowDtype | pa.DataType | None = None + ) -> Self: # type: ignore[name-defined] # noqa: F821 + """Packs input object to a nested column and adds it to the NestedFrame + + This method returns a new NestedFrame with the added nested column. + + Parameters + ---------- + obj : pd.DataFrame or a sequence of items convertible to nested structures + The object to be packed into nested pd.Series and added to + the NestedFrame. If a DataFrame is passed, it must have non-unique + index values, which are used to pack the DataFrame. If a sequence + of elements is passed, it is packed into a nested pd.Series. 
+ Sequence elements may be individual pd.DataFrames, dictionaries + (keys are nested column names, values are arrays of the same + length), or any other object convertible to pa.StructArray. + Additionally, None and pd.NA are allowed as elements to represent + missing values. + name : str + The name of the nested column to be added to the NestedFrame. + dtype : dtype or None + NestedDtype to use for the nested column; pd.ArrowDtype or + pa.DataType can also be used to specify the nested dtype. If None, + the dtype is inferred from the input object. + + Returns + ------- + NestedFrame + A new NestedFrame with the added nested column. + """ # Add sources to objects - packed = packer.pack_flat(nested, name=name) + packed = packer.pack(obj, name=name, dtype=dtype) label = packed.name return self.assign(**{f"{label}": packed}) diff --git a/src/nested_pandas/series/packer.py b/src/nested_pandas/series/packer.py index 0bb25c4..15b155c 100644 --- a/src/nested_pandas/series/packer.py +++ b/src/nested_pandas/series/packer.py @@ -16,12 +16,47 @@ from nested_pandas.series.dtype import NestedDtype from nested_pandas.series.ext_array import NestedExtensionArray -__all__ = ["pack_flat", "pack_lists", "pack_dfs"] +__all__ = ["pack", "pack_flat", "pack_lists", "pack_seq"] N_ROWS_INFER_DTYPE = 1000 +def pack( + obj, + name: str | None = None, + *, + index=None, + dtype: NestedDtype | pd.ArrowDtype | pa.DataType | None = None, +) -> pd.Series: + """Pack a "flat" dataframe or a sequence of dataframes into a "nested" series. + + Parameters + ---------- + obj : pd.DataFrame or Sequence of pd.DataFrame or None or pd.NA or convertible to pa.StructScalar + Input dataframe, with repeated indexes, or a sequence of dataframes or missing values. + name : str, optional + Name of the output series. + index : convertible to pd.Index, optional + Index of the output series. If obj is a pd.DataFrame, it is always nested by the original index, + and this value is used to override the index after the nesting. 
+ dtype : dtype or None + NestedDtype of the output series, or other type to derive from. If None, + the dtype is inferred from the first non-missing dataframe. Not used when obj is a pd.DataFrame. + + Returns + ------- + pd.Series + Output series. + """ + if isinstance(obj, pd.DataFrame): + nested = pack_flat(obj, name=name) + if index is not None: + nested.index = index + return nested + return pack_seq(obj, name=name, index=index, dtype=dtype) + + def pack_flat_into_df(df: pd.DataFrame, name=None) -> pd.DataFrame: """Pack a "flat" dataframe into a "nested" dataframe. @@ -86,35 +121,40 @@ def pack_flat(df: pd.DataFrame, name: str | None = None) -> pd.Series: return pack_sorted_df_into_struct(flat, name=name) -def pack_dfs(dfs: Sequence[pd.DataFrame], index: object = None, name: str | None = None) -> pd.Series: +def pack_seq( + sequence: Sequence, + name: str | None = None, + *, + index: object = None, + dtype: NestedDtype | pd.ArrowDtype | pa.DataType | None = None, +) -> pd.Series: """Pack a sequence of "flat" dataframes into a "nested" series. Parameters ---------- - dfs : Sequence[pd.DataFrame] - Input sequence of dataframes. - index : pd.Index, optional - Index of the output series. + sequence : Sequence of pd.DataFrame or None or pd.NA or convertible to pa.StructScalar + Input sequence of dataframes or missing values. name : str, optional Name of the output series. + index : pd.Index, optional + Index of the output series. + dtype : dtype or None + NestedDtype of the output series, or other type to derive from. If None, + the dtype is inferred from the first non-missing dataframe. Returns ------- pd.Series Output series. 
""" - if isinstance(dfs, pd.Series) and index is None: - index = dfs.index - - first_df = dfs.iloc[0] if hasattr(dfs, "iloc") else dfs[0] - - field_types = { - column: pa.array(first_df[column].iloc[:N_ROWS_INFER_DTYPE]).type for column in first_df.columns - } - dtype = NestedDtype.from_fields(field_types) - dummy_value: dict[str, list] = {column: [] for column in first_df.columns} - series = pd.Series([dummy_value] * len(dfs), dtype=dtype, index=index, name=name) - series[:] = dfs + if isinstance(sequence, pd.Series): + if index is None: + index = sequence.index + if name is None: + name = sequence.name + + ext_array = NestedExtensionArray.from_sequence(sequence, dtype=dtype) + series = pd.Series(ext_array, index=index, name=name, copy=False) return series diff --git a/tests/nested_pandas/nestedframe/test_nestedframe.py b/tests/nested_pandas/nestedframe/test_nestedframe.py index 077140f..cff96e4 100644 --- a/tests/nested_pandas/nestedframe/test_nestedframe.py +++ b/tests/nested_pandas/nestedframe/test_nestedframe.py @@ -2,6 +2,7 @@ import pandas as pd import pytest from nested_pandas import NestedFrame +from pandas.testing import assert_frame_equal def test_nestedframe_construction(): @@ -62,7 +63,7 @@ def test_is_known_hierarchical_column(): assert not base._is_known_hierarchical_column("base.a") -def test_add_nested(): +def test_add_nested_with_flat_df(): """Test that add_nested correctly adds a nested column to the base df""" base = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2]) @@ -78,7 +79,7 @@ def test_add_nested(): assert base.nested.nest.to_flat().equals(nested) -def test_add_nested_with_mismatched_index(): +def test_add_nested_with_flat_df_and_mismatched_index(): """Test add_nested when index values of base are missing matches in nested""" base = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2]) @@ -94,6 +95,40 @@ def test_add_nested_with_mismatched_index(): assert pd.isna(base.loc[2]["nested"]) +def 
test_add_nested_with_series(): + """Test that add_nested correctly adds a nested column to the base df""" + + base = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2]) + + nested = pd.Series( + data=[pd.DataFrame({"c": [0, 1]}), pd.DataFrame({"c": [1, 2]}), pd.DataFrame({"c": [2, 3]})], + index=[0, 1, 2], + name="c", + ) + + base = base.add_nested(nested, "nested") + + assert "nested" in base.columns + for i in range(3): + assert_frame_equal(base.iloc[i]["nested"], nested[i]) + + +def test_add_nested_with_series_and_mismatched_index(): + """Test add_nested when index values of base are missing matches in nested""" + base = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2]) + + nested = pd.Series( + data=[pd.DataFrame({"c": [0, 1]}), pd.DataFrame({"c": [2, 3]})], + index=[0, 2], # no data for index value of "1" + name="c", + ) + + base = base.add_nested(nested, "nested") + + assert "nested" in base.columns + assert pd.isna(base.loc[1]["nested"]) + + def test_query(): """Test that NestedFrame.query handles nested queries correctly""" diff --git a/tests/nested_pandas/series/test_packer.py b/tests/nested_pandas/series/test_packer.py index 351572b..23c0698 100644 --- a/tests/nested_pandas/series/test_packer.py +++ b/tests/nested_pandas/series/test_packer.py @@ -7,6 +7,86 @@ from pandas.testing import assert_frame_equal, assert_series_equal +def test_pack_with_flat_df(): + """Test pack(pd.DataFrame).""" + df = pd.DataFrame( + data={ + "a": [1, 2, 3, 4], + "b": [0, 1, 0, 1], + }, + index=[1, 2, 1, 2], + ) + series = packer.pack(df, name="series") + + desired = pd.Series( + data=[ + (np.array([1, 3]), np.array([0, 0])), + (np.array([2, 4]), np.array([1, 1])), + ], + index=[1, 2], + dtype=NestedDtype.from_fields(dict(a=pa.int64(), b=pa.int64())), + name="series", + ) + assert_series_equal(series, desired) + + +def test_pack_with_flat_df_and_index(): + """Test pack(pd.DataFrame).""" + df = pd.DataFrame( + data={ + "a": [1, 2, 3, 4], + "b": 
[0, 1, 0, 1], + }, + index=[1, 2, 1, 2], + ) + series = packer.pack(df, name="series", index=[101, 102]) + + desired = pd.Series( + data=[ + (np.array([1, 3]), np.array([0, 0])), + (np.array([2, 4]), np.array([1, 1])), + ], + index=[101, 102], + dtype=NestedDtype.from_fields(dict(a=pa.int64(), b=pa.int64())), + name="series", + ) + assert_series_equal(series, desired) + + +def test_pack_with_series_of_dfs(): + """Test pack(pd.Series([pd.DataFrame(), ...])).""" + input_series = pd.Series( + [ + pd.DataFrame( + { + "a": [1, 2], + "b": [0, 1], + }, + ), + pd.DataFrame( + { + "a": [3, 4], + "b": [0, 1], + }, + ), + ], + index=[1, 2], + name="series", + ) + series = packer.pack(input_series, name="nested") + + desired = pd.Series( + data=[ + (np.array([1, 2]), np.array([0, 1])), + (np.array([3, 4]), np.array([0, 1])), + ], + index=[1, 2], + name="nested", + dtype=NestedDtype.from_fields(dict(a=pa.int64(), b=pa.int64())), + ) + assert_series_equal(series, desired) + + def test_pack_flat_into_df(): """Test pack_flat_into_df().""" df = pd.DataFrame( @@ -132,8 +212,8 @@ def test_pack_lists(): assert_series_equal(series.nest.get_list_series(field_name), packed_df[field_name]) -def test_pack_dfs(): - """Test pack_dfs().""" +def test_pack_seq_with_dfs_and_index(): + """Test pack_seq().""" dfs = [ pd.DataFrame( data={ @@ -164,7 +244,7 @@ def test_pack_dfs(): index=[103, 103, 103], ), ] - series = packer.pack_dfs(dfs, index=[100, 101, 102, 103]) + series = packer.pack_seq(dfs, index=[100, 101, 102, 103]) desired = pd.Series( data=[ @@ -179,6 +259,75 @@ def test_pack_dfs(): assert_series_equal(series, desired) +def test_pack_seq_with_different_elements_and_index(): + """Test pack_seq() with different elements and index""" + seq = [ + pd.DataFrame( + data={ + "a": [1, 2], + "b": [0, 1], + }, + ), + None, + {"a": [3, 4], "b": [-1, 0]}, + pd.NA, + ] + series = packer.pack_seq(seq, index=[100, 101, 102, 103]) + + desired = pd.Series( + data=[ + (np.array([1, 2]), np.array([0, 1])), + 
None, + (np.array([3, 4]), np.array([-1, 0])), + pd.NA, + ], + index=[100, 101, 102, 103], + dtype=NestedDtype.from_fields(dict(a=pa.int64(), b=pa.int64())), + ) + assert_series_equal(series, desired) + + +def test_pack_seq_with_series_of_dfs(): + """Test pack_seq(pd.Series([pd.DataFrame(), ...])).""" + input_series = pd.Series( + [ + pd.DataFrame( + { + "a": [1, 2], + "b": [0, 1], + }, + ), + pd.DataFrame( + { + "a": [3, 4], + "b": [0, 1], + }, + ), + pd.DataFrame( + { + "a": [5, 6], + "b": [0, 1], + }, + ), + ], + index=[100, 101, 102], + name="series", + ) + series = packer.pack_seq(input_series) + + desired = pd.Series( + data=[ + (np.array([1, 2]), np.array([0, 1])), + (np.array([3, 4]), np.array([0, 1])), + (np.array([5, 6]), np.array([0, 1])), + ], + index=[100, 101, 102], + dtype=NestedDtype.from_fields(dict(a=pa.int64(), b=pa.int64())), + name="series", + ) + assert_series_equal(series, desired) + + def test_view_sorted_df_as_list_arrays(): """Test view_sorted_df_as_list_arrays().""" flat_df = pd.DataFrame(