Skip to content

Commit

Permalink
Merge b63a49c into b822a14
Browse files Browse the repository at this point in the history
  • Loading branch information
dougbrn authored Apr 11, 2024
2 parents b822a14 + b63a49c commit 28f018d
Show file tree
Hide file tree
Showing 2 changed files with 261 additions and 0 deletions.
156 changes: 156 additions & 0 deletions src/nested_pandas/nestedframe/core.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
# typing.Self and "|" union syntax don't exist in Python 3.9
from __future__ import annotations

import numpy as np
import pandas as pd
from pandas._libs import lib
from pandas._typing import AnyAll, Axis, IndexLabel
from pandas.api.extensions import no_default

from nested_pandas.series import packer
from nested_pandas.series.dtype import NestedDtype
Expand Down Expand Up @@ -154,3 +158,155 @@ def query(self, expr) -> Self: # type: ignore[name-defined] # noqa: F821
# TODO: does not work with queries that empty the dataframe
result[expr] = result[expr].nest.query_flat(exprs_to_use[expr])
return result

def _resolve_dropna_target(self, on_nested, subset):
    """Resolve which layer (base or one nested column) a ``dropna`` call targets.

    Parameters
    ----------
    on_nested : str or bool
        Label of the nested column to operate on, or a falsy value when no
        nested layer was requested explicitly.
    subset : column label, sequence of labels, or None
        Columns to consider. A string label of the form ``"nested.col"`` is
        attributed to the nested column named by the text before the first
        ``"."``; any other label is looked up as is.

    Returns
    -------
    tuple
        ``(target, subset)`` where ``target`` is ``"base"`` or the label of a
        nested column, and ``subset`` is the input with a scalar label
        wrapped in a list (list-like inputs and ``None`` are returned
        unchanged).

    Raises
    ------
    ValueError
        If a subset label matches neither a base nor a nested column, if the
        subset spans more than one layer, if ``on_nested`` is not a nested
        column, or if ``on_nested`` and ``subset`` point to different layers.
    """

    nested_cols = self.nested_columns
    columns = self.columns

    # first check the subset kwarg input
    subset_target = None
    if subset is not None:
        # Same convention as pandas: anything that is not list-like (str, int,
        # ...) is a single column label. Testing `is not None` instead of
        # truthiness keeps labels such as 0 and avoids the ambiguous-truth
        # error raised by pd.Index / ndarray inputs.
        if not pd.api.types.is_list_like(subset):
            subset = [subset]

        layers = []
        for col in subset:
            # Only string labels can carry a "layer.column" prefix; other
            # hashable labels (e.g. integer column names) are used verbatim.
            col = col.split(".")[0] if isinstance(col, str) else col
            if col in nested_cols:
                layers.append(col)
            elif col in columns:
                layers.append("base")
            else:
                raise ValueError(f"Column name {col} not found in any base or nested columns")

        # Check for 1 target (an empty list-like subset resolves to no target)
        if layers:
            unique_layers = np.unique(layers)
            if len(unique_layers) > 1:  # prohibit multi-target operations
                raise ValueError(
                    f"Targeted multiple nested structures ({unique_layers}), write one command per target dataframe"  # noqa
                )
            subset_target = str(unique_layers[0])

    # Next check the on_nested kwarg input
    if on_nested and on_nested not in nested_cols:
        raise ValueError("Provided nested layer not found in nested dataframes")

    # Both inputs given: they must agree on the layer
    if on_nested and subset_target and on_nested != subset_target:
        raise ValueError(
            f"Provided on_nested={on_nested}, but subset columns are from {subset_target}. Make sure these are aligned or just use subset."  # noqa
        )

    # Resolve target layer
    if subset_target:
        target = subset_target
    elif on_nested:
        target = str(on_nested)
    else:
        target = "base"
    return target, subset

def dropna(
    self,
    *,
    axis: Axis = 0,
    how: AnyAll | lib.NoDefault = no_default,
    thresh: int | lib.NoDefault = no_default,
    on_nested: str | bool = False,
    subset: IndexLabel | None = None,
    inplace: bool = False,
    ignore_index: bool = False,
) -> NestedFrame | None:
    """
    Remove missing values for one layer of the NestedFrame.

    Parameters
    ----------
    axis : {0 or 'index', 1 or 'columns'}, default 0
        Determine if rows or columns which contain missing values are
        removed.

        * 0, or 'index' : Drop rows which contain missing values.
        * 1, or 'columns' : Drop columns which contain missing value.

        Only a single axis is allowed.
    how : {'any', 'all'}, default 'any'
        Determine if row or column is removed from DataFrame, when we have
        at least one NA or all NA.

        * 'any' : If any NA values are present, drop that row or column.
        * 'all' : If all values are NA, drop that row or column.
    thresh : int, optional
        Require that many non-NA values. Cannot be combined with how.
    on_nested : str or bool, optional
        If not False, applies the call to the nested dataframe in the
        column with label equal to the provided string. If specified,
        the nested dataframe should align with any columns given in
        `subset`.
    subset : column label or sequence of labels, optional
        Labels along other axis to consider, e.g. if you are dropping rows
        these would be a list of columns to include.
        Access nested columns using `nested_df.nested_col` (where
        `nested_df` refers to a particular nested dataframe and
        `nested_col` is a column of that nested dataframe).
    inplace : bool, default False
        Whether to modify the DataFrame rather than creating a new one.
    ignore_index : bool, default ``False``
        If ``True``, the resulting axis will be labeled 0, 1, …, n - 1.
        Only supported when the base layer is targeted.

        .. versionadded:: 2.0.0

    Returns
    -------
    DataFrame or None
        DataFrame with NA entries dropped from it. With ``inplace=True``
        the base layer returns None, while a nested target returns the
        modified frame itself.

    Raises
    ------
    ValueError
        If `on_nested` and `subset` do not resolve to a single existing
        layer, or if ``ignore_index=True`` is combined with a nested target.

    Notes
    -----
    Operations that target a particular nested structure return a dataframe
    with rows of that particular nested structure affected.

    Values for `on_nested` and `subset` should be consistent in pointing
    to a single layer, multi-layer operations are not supported at this
    time.
    """

    # determine target dataframe
    target, subset = self._resolve_dropna_target(on_nested, subset)

    if target == "base":
        return super().dropna(
            axis=axis, how=how, thresh=thresh, subset=subset, inplace=inplace, ignore_index=ignore_index
        )

    # The index of the flattened nested frame is what ties each nested row to
    # its base row when re-packing; relabelling it would silently scramble the
    # nesting, so refuse instead of producing a corrupted frame.
    if ignore_index:
        raise ValueError("ignore_index is not supported when dropping from a nested layer")

    if subset is not None:
        # Strip only the leading "<layer>." prefix so nested column names
        # that themselves contain a "." are kept intact.
        subset = [col.split(".", 1)[-1] for col in subset]

    flat = self[target].nest.to_flat().dropna(axis=axis, how=how, thresh=thresh, subset=subset)
    packed = packer.pack_flat(flat)

    if inplace:
        self[target] = packed
        # NOTE: unlike the base-layer path this returns the frame itself;
        # kept as is so existing callers are unaffected.
        return self

    new_df = self.copy()
    new_df[target] = packed
    return new_df
105 changes: 105 additions & 0 deletions tests/nested_pandas/nestedframe/test_nestedframe.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import numpy as np
import pandas as pd
import pytest
from nested_pandas import NestedFrame
Expand Down Expand Up @@ -101,3 +102,107 @@ def test_query():

nest_queried = base.query("(nested.c > 1) and (nested.d>2)")
assert len(nest_queried.nested.nest.to_flat()) == 4


def test_dropna():
    """Test that dropna works on the base layer and on a nested layer"""

    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0
    base = NestedFrame(data={"a": [1, 2, 3], "b": [2, np.nan, 6]}, index=[0, 1, 2])

    nested = pd.DataFrame(
        data={"c": [0, 2, 4, 1, np.nan, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
        index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
    )

    base = base.add_nested(nested, "nested")

    # Test basic functionality: dropping base row 1 also removes its 3 nested rows
    dn_base = base.dropna(subset=["b"])
    assert len(dn_base) == 2
    assert len(dn_base["nested"].nest.to_flat()) == 6

    # Test on_nested kwarg: only the single nested row with a NaN is removed
    dn_on_nested = base.dropna(on_nested="nested")
    assert len(dn_on_nested) == 3
    assert len(dn_on_nested["nested"].nest.to_flat()) == 8

    # Test hierarchical column subset
    dn_hierarchical = base.dropna(subset="nested.c")
    assert len(dn_hierarchical) == 3
    assert len(dn_hierarchical["nested"].nest.to_flat()) == 8

    # Test hierarchical column subset and on_nested
    dn_hierarchical = base.dropna(on_nested="nested", subset="nested.c")
    assert len(dn_hierarchical) == 3
    assert len(dn_hierarchical["nested"].nest.to_flat()) == 8


def test_dropna_inplace_base():
    """Test in-place behavior of dropna when the base layer is targeted"""

    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0
    base = NestedFrame(data={"a": [1, 2, 3], "b": [np.nan, 4, 6]}, index=[0, 1, 2])

    nested = pd.DataFrame(
        data={"c": [0, 2, 4, 1, np.nan, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
        index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
    )

    base = base.add_nested(nested, "nested")

    # Test inplace=False with base layer: the original must be left untouched
    dn_base = base.dropna(subset=["b"], inplace=False)
    assert not dn_base.equals(base)

    # Test inplace=True with base layer: the original now matches the copy
    base.dropna(subset=["b"], inplace=True)
    assert dn_base.equals(base)


def test_dropna_inplace_nested():
    """Test in-place behavior of dropna when a nested layer is targeted"""

    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0
    base = NestedFrame(data={"a": [1, 2, 3], "b": [np.nan, 4, 6]}, index=[0, 1, 2])

    nested = pd.DataFrame(
        data={"c": [0, 2, 4, 1, np.nan, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
        index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
    )

    base = base.add_nested(nested, "nested")

    # Test inplace=False with nested layer: the original must be left untouched
    dn_base = base.dropna(on_nested="nested", inplace=False)
    assert not dn_base.nested.nest.to_flat().equals(base.nested.nest.to_flat())

    # Test inplace=True with nested layer: the original now matches the copy
    base.dropna(on_nested="nested", inplace=True)
    assert dn_base.equals(base)


def test_dropna_errors():
    """Test that the various dropna exceptions trigger"""

    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0
    base = NestedFrame(data={"a": [1, 2, 3], "b": [2, np.nan, 6]}, index=[0, 1, 2])

    nested = pd.DataFrame(
        data={"c": [0, 2, 4, 1, np.nan, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
        index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
    )

    base = base.add_nested(nested, "nested")

    # Test multi-target: subset mixes a base column and a nested column
    with pytest.raises(ValueError):
        base.dropna(subset=["b", "nested.c"])

    # Test no-target: the prefix names neither a base nor a nested column
    with pytest.raises(ValueError):
        base.dropna(subset=["not_nested.c"])

    # Test bad on-nested value
    with pytest.raises(ValueError):
        base.dropna(on_nested="not_nested")

    # Test on-nested + subset disagreement
    with pytest.raises(ValueError):
        base.dropna(on_nested="nested", subset=["b"])

0 comments on commit 28f018d

Please sign in to comment.