Skip to content

Commit

Permalink
feat(python): Allow empty select/with_columns/groupby (#9205)
Browse files Browse the repository at this point in the history
  • Loading branch information
stinodego authored Jun 4, 2023
1 parent ac728ce commit f800164
Show file tree
Hide file tree
Showing 7 changed files with 164 additions and 155 deletions.
6 changes: 0 additions & 6 deletions py-polars/polars/lazyframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2031,9 +2031,6 @@ def select(
└───────────┘
"""
if exprs is None and not named_exprs:
raise ValueError("Expected at least one of 'exprs' or '**named_exprs'")

structify = bool(int(os.environ.get("POLARS_AUTO_STRUCTIFY", 0)))

exprs = parse_as_list_of_expressions(
Expand Down Expand Up @@ -3088,9 +3085,6 @@ def with_columns(
└─────┴──────┴─────────────┘
"""
if exprs is None and not named_exprs:
raise ValueError("Expected at least one of 'exprs' or '**named_exprs'")

structify = bool(int(os.environ.get("POLARS_AUTO_STRUCTIFY", 0)))

exprs = parse_as_list_of_expressions(
Expand Down
2 changes: 0 additions & 2 deletions py-polars/polars/lazyframe/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,8 +120,6 @@ def agg(
raise ValueError(
f"'aggs' argument should be one or multiple expressions, got: '{aggs}'."
)
if aggs is None and not named_aggs:
raise ValueError("Expected at least one of 'aggs' or '**named_aggs'")

exprs = parse_as_list_of_expressions(aggs, *more_aggs, **named_aggs)

Expand Down
7 changes: 7 additions & 0 deletions py-polars/tests/unit/operations/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,13 @@ def test_groupby_args() -> None:
assert df.groupby("a").agg(q="b", r="c").columns == ["a", "q", "r"]


def test_groupby_empty() -> None:
    """Check that `agg()` with no expressions returns only the group keys."""
    frame = pl.DataFrame({"a": [1, 1, 2]})
    grouped = frame.groupby("a")
    # Group order is not guaranteed, so row order is ignored in the comparison.
    assert_frame_equal(
        grouped.agg(),
        pl.DataFrame({"a": [1, 2]}),
        check_row_order=False,
    )


def test_groupby_iteration() -> None:
df = pl.DataFrame(
{
Expand Down
6 changes: 6 additions & 0 deletions py-polars/tests/unit/operations/test_select.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,12 @@ def test_select_args_kwargs() -> None:
assert_frame_equal(result, expected)


def test_select_empty() -> None:
    """Check that `pl.select()` without arguments yields an empty DataFrame."""
    assert_frame_equal(pl.select(), pl.DataFrame())


def test_select_empty_list() -> None:
result = pl.select([])
expected = pl.DataFrame()
Expand Down
151 changes: 151 additions & 0 deletions py-polars/tests/unit/operations/test_with_columns.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
import polars as pl
from polars.testing import assert_frame_equal


def test_with_columns() -> None:
    """Exercise `with_columns` with list, positional, keyword and mixed inputs.

    Each calling convention must produce the same frame. A final section
    checks that multi-output expressions are packed into struct columns when
    auto-structify is enabled.
    """
    import datetime

    timestamp = datetime.datetime(2001, 1, 1, 0, 0)

    df = pl.DataFrame(
        {
            "a": [1, 2, 3, 4],
            "b": [0.5, 4, 10, 13],
            "c": [True, True, False, True],
        }
    )
    srs_named = pl.Series("f", [3, 2, 1, 0])
    srs_unnamed = pl.Series(values=[3, 2, 1, 0])

    expected = pl.DataFrame(
        {
            "a": [1, 2, 3, 4],
            "b": [0.5, 4, 10, 13],
            "c": [True, True, False, True],
            "d": [0.5, 8.0, 30.0, 52.0],
            "e": [False, False, True, False],
            "f": [3, 2, 1, 0],
            "g": True,
            "h": pl.Series(values=[1, 1, 1, 1], dtype=pl.Int32),
            "i": 3.2,
            "j": [1, 2, 3, 4],
            "k": pl.Series(values=[None, None, None, None], dtype=pl.Null),
            "l": timestamp,
        }
    )

    product = pl.col("a") * pl.col("b")
    negated_c = ~pl.col("c").alias("e")

    # The same expressions are passed first as one list, then unpacked.
    aliased_exprs = [
        product.alias("d"),
        negated_c,
        srs_named,
        pl.lit(True).alias("g"),
        pl.lit(1).alias("h"),
        pl.lit(3.2).alias("i"),
        pl.col("a").alias("j"),
        pl.lit(None).alias("k"),
        pl.lit(timestamp).alias("l"),
    ]

    # as exprs list
    assert_frame_equal(df.with_columns(aliased_exprs), expected)

    # as positional arguments
    assert_frame_equal(df.with_columns(*aliased_exprs), expected)

    # Keyword inputs shared by the "keyword" and "mixed" variants below.
    named_tail = {
        "f": srs_unnamed,
        "g": True,
        "h": 1,
        "i": 3.2,
        "j": "a",  # Note: string interpreted as column name, resolves to `pl.col("a")`
        "k": None,
        "l": timestamp,
    }

    # as keyword arguments
    from_kwargs = df.with_columns(d=product, e=~pl.col("c"), **named_tail)
    assert_frame_equal(from_kwargs, expected)

    # mixed
    from_mixed = df.with_columns([product.alias("d")], negated_c, **named_tail)
    assert_frame_equal(from_mixed, expected)

    # automatically upconvert multi-output expressions to struct
    with pl.Config() as cfg:
        cfg.set_auto_structify(True)

        # Built inside the config block so the setting is active when
        # `with_columns` is called.
        ldf = (
            pl.DataFrame({"x1": [1, 2, 6], "x2": [1, 2, 3]})
            .lazy()
            .with_columns(
                pl.col(["x1", "x2"]).pct_change().alias("pct_change"),
                maxes=pl.all().max().suffix("_max"),
                xcols=pl.col("^x.*$"),
            )
        )
        # ┌─────┬─────┬─────────────┬───────────┬───────────┐
        # │ x1  ┆ x2  ┆ pct_change  ┆ maxes     ┆ xcols     │
        # │ --- ┆ --- ┆ ---         ┆ ---       ┆ ---       │
        # │ i64 ┆ i64 ┆ struct[2]   ┆ struct[2] ┆ struct[2] │
        # ╞═════╪═════╪═════════════╪═══════════╪═══════════╡
        # │ 1   ┆ 1   ┆ {null,null} ┆ {6,3}     ┆ {1,1}     │
        # │ 2   ┆ 2   ┆ {1.0,1.0}   ┆ {6,3}     ┆ {2,2}     │
        # │ 6   ┆ 3   ┆ {2.0,0.5}   ┆ {6,3}     ┆ {6,3}     │
        # └─────┴─────┴─────────────┴───────────┴───────────┘
        column_maxes = {"x1_max": 6, "x2_max": 3}
        source_rows = [
            (1, 1, None, None),
            (2, 2, 1.0, 1.0),
            (6, 3, 2.0, 0.5),
        ]
        expected_rows = [
            {
                "x1": x1,
                "x2": x2,
                "pct_change": {"x1": pct1, "x2": pct2},
                "maxes": column_maxes,
                "xcols": {"x1": x1, "x2": x2},
            }
            for x1, x2, pct1, pct2 in source_rows
        ]
        assert ldf.collect().to_dicts() == expected_rows


def test_with_columns_empty() -> None:
    """Check that `with_columns()` with no expressions leaves the frame as is."""
    original = pl.DataFrame({"a": [1, 2]})
    assert_frame_equal(original.with_columns(), original)


def test_with_columns_single_series() -> None:
    """Check that a lone Series given to `LazyFrame.with_columns` is appended."""
    lazy = pl.LazyFrame({"a": [1, 2]})
    collected = lazy.with_columns(pl.Series("b", [3, 4])).collect()
    assert_frame_equal(collected, pl.DataFrame({"a": [1, 2], "b": [3, 4]}))
139 changes: 0 additions & 139 deletions py-polars/tests/unit/test_df.py
Original file line number Diff line number Diff line change
Expand Up @@ -3111,145 +3111,6 @@ def test_unique_on_sorted() -> None:
).to_dict(False) == {"a": [1, 3], "b": [2, 3]}


def test_with_columns() -> None:
    """Exercise `with_columns` with list, positional, keyword and mixed inputs.

    Each calling convention must produce the same frame. A later section
    checks that multi-output expressions are packed into struct columns when
    auto-structify is enabled. The last check expects a `ValueError` when
    `with_columns` is called with no expressions at all.
    """
    import datetime

    timestamp = datetime.datetime(2001, 1, 1, 0, 0)

    df = pl.DataFrame(
        {
            "a": [1, 2, 3, 4],
            "b": [0.5, 4, 10, 13],
            "c": [True, True, False, True],
        }
    )
    srs_named = pl.Series("f", [3, 2, 1, 0])
    srs_unnamed = pl.Series(values=[3, 2, 1, 0])

    expected = pl.DataFrame(
        {
            "a": [1, 2, 3, 4],
            "b": [0.5, 4, 10, 13],
            "c": [True, True, False, True],
            "d": [0.5, 8.0, 30.0, 52.0],
            "e": [False, False, True, False],
            "f": [3, 2, 1, 0],
            "g": True,
            "h": pl.Series(values=[1, 1, 1, 1], dtype=pl.Int32),
            "i": 3.2,
            "j": [1, 2, 3, 4],
            "k": pl.Series(values=[None, None, None, None], dtype=pl.Null),
            "l": timestamp,
        }
    )

    product = pl.col("a") * pl.col("b")
    negated_c = ~pl.col("c").alias("e")

    # The same expressions are passed first as one list, then unpacked.
    aliased_exprs = [
        product.alias("d"),
        negated_c,
        srs_named,
        pl.lit(True).alias("g"),
        pl.lit(1).alias("h"),
        pl.lit(3.2).alias("i"),
        pl.col("a").alias("j"),
        pl.lit(None).alias("k"),
        pl.lit(timestamp).alias("l"),
    ]

    # as exprs list
    assert_frame_equal(df.with_columns(aliased_exprs), expected)

    # as positional arguments
    assert_frame_equal(df.with_columns(*aliased_exprs), expected)

    # Keyword inputs shared by the "keyword" and "mixed" variants below.
    named_tail = {
        "f": srs_unnamed,
        "g": True,
        "h": 1,
        "i": 3.2,
        "j": "a",  # Note: string interpreted as column name, resolves to `pl.col("a")`
        "k": None,
        "l": timestamp,
    }

    # as keyword arguments
    from_kwargs = df.with_columns(d=product, e=~pl.col("c"), **named_tail)
    assert_frame_equal(from_kwargs, expected)

    # mixed
    from_mixed = df.with_columns([product.alias("d")], negated_c, **named_tail)
    assert_frame_equal(from_mixed, expected)

    # automatically upconvert multi-output expressions to struct
    with pl.Config() as cfg:
        cfg.set_auto_structify(True)

        # Built inside the config block so the setting is active when
        # `with_columns` is called.
        ldf = (
            pl.DataFrame({"x1": [1, 2, 6], "x2": [1, 2, 3]})
            .lazy()
            .with_columns(
                pl.col(["x1", "x2"]).pct_change().alias("pct_change"),
                maxes=pl.all().max().suffix("_max"),
                xcols=pl.col("^x.*$"),
            )
        )
        # ┌─────┬─────┬─────────────┬───────────┬───────────┐
        # │ x1  ┆ x2  ┆ pct_change  ┆ maxes     ┆ xcols     │
        # │ --- ┆ --- ┆ ---         ┆ ---       ┆ ---       │
        # │ i64 ┆ i64 ┆ struct[2]   ┆ struct[2] ┆ struct[2] │
        # ╞═════╪═════╪═════════════╪═══════════╪═══════════╡
        # │ 1   ┆ 1   ┆ {null,null} ┆ {6,3}     ┆ {1,1}     │
        # │ 2   ┆ 2   ┆ {1.0,1.0}   ┆ {6,3}     ┆ {2,2}     │
        # │ 6   ┆ 3   ┆ {2.0,0.5}   ┆ {6,3}     ┆ {6,3}     │
        # └─────┴─────┴─────────────┴───────────┴───────────┘
        column_maxes = {"x1_max": 6, "x2_max": 3}
        source_rows = [
            (1, 1, None, None),
            (2, 2, 1.0, 1.0),
            (6, 3, 2.0, 0.5),
        ]
        expected_rows = [
            {
                "x1": x1,
                "x2": x2,
                "pct_change": {"x1": pct1, "x2": pct2},
                "maxes": column_maxes,
                "xcols": {"x1": x1, "x2": x2},
            }
            for x1, x2, pct1, pct2 in source_rows
        ]
        assert ldf.collect().to_dicts() == expected_rows

    # require at least one of exprs / **named_exprs
    with pytest.raises(ValueError):
        _ = ldf.with_columns()


def test_len_compute(df: pl.DataFrame) -> None:
df = df.with_columns(pl.struct(["list_bool", "cat"]).alias("struct"))
filtered = df.filter(pl.col("bools"))
Expand Down
8 changes: 0 additions & 8 deletions py-polars/tests/unit/test_lazy.py
Original file line number Diff line number Diff line change
Expand Up @@ -892,14 +892,6 @@ def test_with_column_renamed(fruits_cars: pl.DataFrame) -> None:
assert res.columns[0] == "C"


def test_with_columns_single_series() -> None:
    """Check that a lone Series given to `LazyFrame.with_columns` is appended."""
    lazy = pl.LazyFrame({"a": [1, 2]})
    collected = lazy.with_columns(pl.Series("b", [3, 4])).collect()
    assert_frame_equal(collected, pl.DataFrame({"a": [1, 2], "b": [3, 4]}))


def test_reverse() -> None:
out = pl.LazyFrame({"a": [1, 2], "b": [3, 4]}).reverse()
expected = pl.DataFrame({"a": [2, 1], "b": [4, 3]})
Expand Down

0 comments on commit f800164

Please sign in to comment.