Skip to content

Commit

Permalink
Use the Pandas expr tree for preflighting.
Browse files Browse the repository at this point in the history
Requires `extract_nest_names` to be a method on `NestedFrame` so that
the evaluation context is available at parsing time, since the
Pandas Expr parsing does some eager evaluation.

Resolves #174.
gitosaurus committed Nov 14, 2024
1 parent 402ab66 commit a86a532
Showing 4 changed files with 151 additions and 76 deletions.
99 changes: 96 additions & 3 deletions src/nested_pandas/nestedframe/core.py
Original file line number Diff line number Diff line change
@@ -10,9 +10,11 @@
from pandas._libs import lib
from pandas._typing import Any, AnyAll, Axis, IndexLabel
from pandas.api.extensions import no_default
from pandas.core.computation import ops
from pandas.core.computation.eval import Expr, ensure_scope
from pandas.core.computation.expr import PARSERS, PandasExprVisitor
from pandas.core.computation.parsing import clean_column_name

from nested_pandas.nestedframe.utils import extract_nest_names
from nested_pandas.series.dtype import NestedDtype
from nested_pandas.series.packer import pack, pack_lists, pack_sorted_df_into_struct

@@ -79,6 +81,22 @@ class _NestResolver(dict):
def __init__(self, outer: NestedFrame):
    """
    Bind this resolver to ``outer`` and eagerly register a field resolver
    for each nested column the frame has at construction time.
    """
    self._outer = outer
    super().__init__()
    # Eager registration: one entry per nested column that exists right now.
    for nested_name in outer.nested_columns:
        self._initialize_field_resolver(nested_name, outer)

def _initialize_field_resolver(self, column: str, outer: NestedFrame):
    """
    Register a field resolver for the nested column ``column``.

    The resolver is always stored under the column's own name. When
    ``clean_column_name`` produces a different string (as it does for names
    containing spaces or that are otherwise not identifier-like), a second
    resolver for the same column is stored under that cleaned name too.
    """
    # Go through dict's own __setitem__ rather than this class's override.
    store = super().__setitem__
    store(column, _NestedFieldResolver(column, outer))
    alias = clean_column_name(column)
    if alias == column:
        return
    # The cleaned spelling is registered as well, so that references to it
    # coming from the Pandas evaluator are captured.
    store(alias, _NestedFieldResolver(column, outer))

def __contains__(self, item):
top_nest = item if "." not in item else item.split(".")[0].strip()
@@ -89,7 +107,7 @@ def __getitem__(self, item):
if not super().__contains__(top_nest):
if top_nest not in self._outer.nested_columns:
raise KeyError(f"Unknown nest {top_nest}")
super().__setitem__(top_nest, _NestedFieldResolver(top_nest, self._outer))
self._initialize_field_resolver(top_nest, self._outer)
return super().__getitem__(top_nest)

def __setitem__(self, item, _):
@@ -133,6 +151,48 @@ def __getattr__(self, item_name: str):
raise AttributeError(f"No attribute {item_name}")


def _subexprs_by_nest(parents: list, node) -> dict[str, list]:
    """
    Given an expression which contains references to both base and nested
    columns, return a dictionary of the sub-expressions that should be
    evaluated independently, keyed by nesting context.

    The key of the dictionary is the name of the nested column, and will
    be a blank string in the case of base columns. The value is a flat list
    of the nodes that lead to sub-expressions that can be evaluated
    successfully.

    While this is not in use today for automatically splitting expressions,
    it can be used to detect whether an expression is suitably structured
    for evaluation: the returned dictionary should have a single key.

    Parameters
    ----------
    parents : list
        Value reported for a bare (non-constant) term. It is handed down
        unchanged through the recursion and is never mutated here.
    node
        A node of a parsed Pandas expression tree. Anything that is neither
        a term nor an operator contributes nothing.

    Returns
    -------
    dict[str, list]
        Mapping of nest name (``""`` for the base frame) to a list of nodes.
        Empty when the sub-tree references no columns (e.g. only constants).
    """
    if isinstance(node, ops.Term) and not isinstance(node, ops.Constant):
        if isinstance(node.value, _SeriesFromNest):
            return {node.value.nest_name: parents}
        return {getattr(node, "upper_name", ""): parents}
    if not isinstance(node, ops.Op):
        return {}
    # Walk *every* operand rather than only ``lhs``/``rhs``: unary operators
    # and math-function calls keep their children solely in ``operands``, and
    # skipping them would hide the columns they reference from the preflight.
    result: dict[str, list] = {}
    for operand in node.operands:
        for nest_name, nodes in _subexprs_by_nest(parents, operand).items():
            # ``extend`` keeps each value a flat list of nodes; it also copies
            # the elements, so ``parents`` itself is never aliased or mutated.
            result.setdefault(nest_name, []).extend(nodes)
    # After a complete traversal across operands, check for any necessary splits.
    # If it's homogeneous, move the split-node up the tree.
    if len(result) == 1:
        # Let the record of each parent node drift up the tree,
        # and merge the subtrees into a single node, since by definition,
        # this node is homogeneous over all of its children, and can
        # be evaluated in a single step.
        result = {nest_name: [node] for nest_name in result}
    # If the result is either empty or has more than one key, leave the result
    # alone. Each key represents a different nest (with a blank string for the base),
    # and the value is the highest point in the expression tree where the expression
    # was still within a single nest.
    return result


class NestedFrame(pd.DataFrame):
"""A Pandas Dataframe extension with support for nested structure.
@@ -457,6 +517,39 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None:
kwargs["parser"] = "nested-pandas"
return super().eval(expr, **kwargs)

def extract_nest_names(
    self,
    expr: str,
    local_dict=None,
    global_dict=None,
    resolvers=(),
    level: int = 0,
    target=None,
    **kwargs,
) -> set[str]:
    """
    Given a string expression, parse it and visit the resulting expression tree,
    surfacing the nesting types. The purpose is to identify expressions that attempt
    to mix base and nested columns, or columns from two different nests.

    Parameters
    ----------
    expr : str
        The expression to inspect; it is parsed with the "nested-pandas" parser.
    local_dict, global_dict, target
        Forwarded unchanged to ``ensure_scope`` when building the evaluation scope.
    resolvers : iterable, optional
        Extra name resolvers. They are placed ahead of the nest, column and
        index resolvers derived from this frame. Any iterable (tuple or
        list) is accepted; ``None`` is treated as empty.
    level : int, default 0
        Forwarded to ``ensure_scope`` as ``level + 1``.
    **kwargs
        Accepted and ignored, so that the keyword arguments given to
        ``query()`` can be passed straight through.

    Returns
    -------
    set[str]
        The names of the nests referenced by ``expr``; the empty string
        stands for base (non-nested) columns.
    """
    index_resolvers = self._get_index_resolvers()
    column_resolvers = self._get_cleaned_column_resolvers()
    # Normalise to a tuple first: callers may hand in a list (or None), and
    # concatenating either of those with a tuple raises TypeError.
    resolvers = tuple(resolvers or ()) + (_NestResolver(self), column_resolvers, index_resolvers)
    # Parser needs to be the "nested-pandas" parser.
    # We also need the same variable context that eval() will have, so that
    # backtick-quoted names are substituted as expected.
    # NOTE: ensure_scope is called directly from this method (not via a
    # helper) so the ``level`` offset keeps its original meaning.
    env = ensure_scope(
        level + 1,
        global_dict=global_dict,
        local_dict=local_dict,
        resolvers=resolvers,
        target=target,
    )
    parsed_expr = Expr(expr, parser="nested-pandas", env=env)
    separable = _subexprs_by_nest([], parsed_expr.terms)
    return set(separable)

def query(self, expr: str, *, inplace: bool = False, **kwargs) -> NestedFrame | None:
"""
Query the columns of a NestedFrame with a boolean expression. Specified
@@ -514,7 +607,7 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> NestedFrame |
# At present, the query expression must be either entirely within a
# single nest, or have nothing but base columns. Mixed structures are not
# supported, so preflight the expression.
nest_names = extract_nest_names(expr)
nest_names = self.extract_nest_names(expr, **kwargs)
if len(nest_names) > 1:
raise ValueError("Queries cannot target multiple structs/layers, write a separate query for each")
result = self.eval(expr, **kwargs)
61 changes: 0 additions & 61 deletions src/nested_pandas/nestedframe/utils.py

This file was deleted.

19 changes: 19 additions & 0 deletions tests/nested_pandas/nestedframe/test_nestedframe.py
Original file line number Diff line number Diff line change
@@ -594,6 +594,25 @@ def test_query():
assert base["nested.d"].shape == (2,)


def test_query_on_non_identifier_columns():
    """
    query() must cope with column names that are not valid Python
    identifiers (here: names containing a space), both for a base column
    and for a nested column, by way of backtick quoting.
    """
    # Taken from GH#174
    frame = NestedFrame(data={"dog": [1, 2, 3], "good dog": [2, 4, 6]}, index=[0, 1, 2])
    layer = pd.DataFrame(
        data={"a": [0, 2, 4, 1, 4, 3, 1, 4, 1], "b": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
        index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
    )
    frame = frame.add_nested(layer, "bad dog")

    # Backtick-quoted base column.
    filtered_on_base = frame.query("`good dog` > 3")
    assert frame.shape == (3, 3)
    assert filtered_on_base.shape == (2, 3)

    # Backtick-quoted nested column followed by a field reference.
    filtered_on_nest = frame.query("`bad dog`.a > 2")
    assert filtered_on_nest["bad dog"].nest["a"].size == 4


def test_dropna():
"""Test that dropna works on all layers"""

48 changes: 36 additions & 12 deletions tests/nested_pandas/utils/test_utils.py
Original file line number Diff line number Diff line change
@@ -2,7 +2,6 @@
import pandas as pd
import pytest
from nested_pandas import NestedFrame
from nested_pandas.nestedframe.utils import extract_nest_names
from nested_pandas.utils import count_nested


@@ -52,16 +51,41 @@ def test_check_expr_nesting():
used to ensure that an expression-based query does not try to combine base and nested
sub-expressions.
"""
assert extract_nest_names("a > 2 & nested.c > 1") == {"", "nested"}
assert extract_nest_names("(nested.c > 1) and (nested.d>2)") == {"nested"}
assert extract_nest_names("-1.52e-5 < abc < 35.2e2") == {""}
assert extract_nest_names("(n.a > 1) and ((b + c) > (d - 1e-8)) or n.q > c") == {"n", ""}
base = NestedFrame(data={"a": [1, 2, 3], "b": [2, np.nan, 6]}, index=[0, 1, 2])
nested = pd.DataFrame(
data={
"c": [0, 2, 4, 1, np.nan, 3, 1, 4, 1],
"d": [5, 4, 7, 5, 3, 1, 9, 3, 4],
"label": ["b", "a", "b", "b", "a", "a", "b", "a", "b"],
},
index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
)
b1 = base.add_nested(nested, "nested")
assert b1.extract_nest_names("a > 2 & nested.c > 1") == {"", "nested"}
assert b1.extract_nest_names("(nested.c > 1) and (nested.d>2)") == {"nested"}
assert b1.extract_nest_names("-1.52e-5 < b < 35.2e2") == {""}

b2 = base.add_nested(nested.copy(), "n")
assert b2.extract_nest_names("(n.c > 1) and ((b + a) > (b - 1e-8)) or n.d > a") == {"n", ""}

abc = pd.DataFrame(
data={
"c": [3, 1, 4, 1, 5, 9, 2, 6, 5],
"d": [1, 4, 1, 2, 1, 3, 5, 6, 2],
"g": ["a", "b", "c", "d", "e", "f", "g", "h", "i"],
},
index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
)
b3 = base.add_nested(abc, "abc").add_nested(abc, "c")
assert b3.extract_nest_names("abc.c > 2 & c.d < 5") == {"abc", "c"}

assert b3.extract_nest_names("(abc.d > 3) & (abc.c == [2, 5])") == {"abc"}
assert b3.extract_nest_names("(abc.d > 3)&(abc.g == 'f')") == {"abc"}
assert b3.extract_nest_names("(abc.d > 3) & (abc.g == 'f')") == {"abc"}

assert extract_nest_names("a.b > 2 & c.d < 5") == {"a", "c"}
assert b1.extract_nest_names("a>3") == {""}
assert b1.extract_nest_names("a > 3") == {""}

assert extract_nest_names("a>3") == {""}
assert extract_nest_names("a > 3") == {""}
assert extract_nest_names("test.a>5&b==2") == {"test", ""}
assert extract_nest_names("test.a > 5 & b == 2") == {"test", ""}
assert extract_nest_names("(a.b > 3)&(a.c == 'f')") == {"a"}
assert extract_nest_names("(a.b > 3) & (a.c == 'f')") == {"a"}
b4 = base.add_nested(nested, "test")
assert b4.extract_nest_names("test.c>5&b==2") == {"test", ""}
assert b4.extract_nest_names("test.c > 5 & b == 2") == {"test", ""}

0 comments on commit a86a532

Please sign in to comment.