Skip to content

Commit

Permalink
test: check that empty structs are disallowed
Browse files Browse the repository at this point in the history
See ibis-project#9310 for a discussion of whether
empty structs should be allowed or disallowed.
We settled on disallowing them for now,
since pyarrow is like the only backend to support them,
and it is definitely easier to change our mind and allow them later rather than
disallow them later.

Perhaps if you stay in ibis-land, emptry structs are useful ie for doing type manipulations, or maybe you
only use them for intermediate calculations?
But until we see this concrete use case, don't worry about it.

I had to adjust the dask testcase generation because this change revealed a bug in our existing dask tests. `ibis.backends.dask.Backend.get_schema()` uses a method of `PandasData.infer_table(df.head(1))`, so we were infering the type of the `map_of_*` testing column as
`struct<>` since the first element in that test array of values was `{}`.
So the simple solution was to just put the "representative" value first. This doesn't really
solve the root problem in how dask does type inference, we probably want to improve this.
I'd guess we should infer the column type from a larger sample of values. 1 seems really absurdly small. I tried doing this but it was trickier than I would have thought,
with a head(10) I was only getting two values back, even thought there should be 3??
IDK that should get solved in a different PR.
  • Loading branch information
NickCrews committed Jun 29, 2024
1 parent b150635 commit cb79f00
Show file tree
Hide file tree
Showing 6 changed files with 37 additions and 25 deletions.
6 changes: 3 additions & 3 deletions ibis/backends/dask/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,9 +151,9 @@ def pandas_df():
"array_of_float64": [[1.0, 2.0], [3.0], []],
"array_of_int64": [[1, 2], [], [3]],
"array_of_strings": [["a", "b"], [], ["c"]],
"map_of_strings_integers": [{"a": 1, "b": 2}, None, {}],
"map_of_integers_strings": [{}, None, {1: "a", 2: "b"}],
"map_of_complex_values": [None, {"a": [1, 2, 3], "b": []}, {}],
"map_of_strings_integers": [{"a": 1, "b": 2}, {}, None],
"map_of_integers_strings": [{1: "a", 2: "b"}, {}, None],
"map_of_complex_values": [{"a": [1, 2, 3], "b": []}, {}, None],
}
)

Expand Down
33 changes: 15 additions & 18 deletions ibis/backends/dask/tests/test_maps.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from __future__ import annotations

import numpy as np
import pandas as pd
import pytest

Expand All @@ -13,15 +12,17 @@
def test_map_length_expr(t):
expr = t.map_of_integers_strings.length()
result = expr.execute()
expected = pd.Series([0, None, 2], name="MapLength(map_of_integers_strings)")
expected = pd.Series(
[2, 0, None], dtype="object", name="MapLength(map_of_integers_strings)"
)
tm.assert_series_equal(result, expected, check_index=False)


def test_map_value_for_key_expr(t):
expr = t.map_of_integers_strings[1]
result = expr.execute()
expected = pd.Series(
[None, None, "a"], name="MapGet(map_of_integers_strings, 1, None)"
["a", None, None], name="MapGet(map_of_integers_strings, 1, None)"
)
tm.assert_series_equal(result, expected, check_index=False)

Expand All @@ -30,7 +31,7 @@ def test_map_value_or_default_for_key_expr(t):
expr = t.map_of_complex_values.get("a")
result = expr.execute()
expected = pd.Series(
[None, [1, 2, 3], None],
[[1, 2, 3], None, None],
dtype="object",
name=expr.get_name(),
)
Expand All @@ -45,7 +46,7 @@ def test_map_keys_expr(t):
expr = t.map_of_strings_integers.keys()
result = expr.execute().map(safe_sorter)
expected = pd.Series(
[["a", "b"], None, []],
[["a", "b"], [], None],
dtype="object",
name="MapKeys(map_of_strings_integers)",
).map(safe_sorter)
Expand All @@ -54,27 +55,23 @@ def test_map_keys_expr(t):

def test_map_values_expr(t):
expr = t.map_of_complex_values.values()
result = expr.execute()
expected = pd.Series(
[
None,
np.array([[1, 2, 3], []], dtype="object"),
np.array([], dtype="object"),
],
dtype="object",
name="MapValues(map_of_complex_values)",
)
tm.assert_series_equal(result, expected, check_index=False)
result = list(expr.execute())
expected = [
[[1, 2, 3], []],
[],
None,
]
assert result == expected


def test_map_concat_expr(t):
expr = t.map_of_complex_values + {"b": [4, 5, 6], "c": [], "a": []}
result = expr.execute()
expected = pd.Series(
[
None,
{"a": [], "b": [4, 5, 6], "c": []},
{"b": [4, 5, 6], "c": [], "a": []},
{"a": [], "b": [4, 5, 6], "c": []},
None,
],
dtype="object",
name=expr.get_name(),
Expand Down
12 changes: 11 additions & 1 deletion ibis/expr/datatypes/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -831,13 +831,23 @@ def _pretty_piece(self) -> str:

@public
class Struct(Parametric, MapSet):
"""Structured values."""
"""A nested type with ordered fields of any type, analogous to a dataclass in Python.
eg a Struct might have a field a of type int64 and a field b of type string.
"""

fields: FrozenOrderedDict[str, DataType]

scalar = "StructScalar"
column = "StructColumn"

def __init__(self, fields, nullable=True):
# We could do this validation in a type annotation, but I thought this
# was common enough to warrant a nicer error message.
if not fields:
raise TypeError("Structs require at least one field")
super().__init__(fields=fields, nullable=nullable)

@classmethod
def from_tuples(
cls, pairs: Iterable[tuple[str, str | DataType]], nullable: bool = True
Expand Down
2 changes: 0 additions & 2 deletions ibis/expr/datatypes/value.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,6 @@ def infer(value: Any) -> dt.DataType:
@infer.register(collections.OrderedDict)
def infer_struct(value: Mapping[str, Any]) -> dt.Struct:
"""Infer the [`Struct`](./datatypes.qmd#ibis.expr.datatypes.Struct) type of `value`."""
if not value:
raise TypeError("Empty struct type not supported")
fields = {name: infer(val) for name, val in value.items()}
return dt.Struct(fields)

Expand Down
7 changes: 7 additions & 0 deletions ibis/tests/expr/test_struct.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,13 @@ def s():
return ibis.table(dict(a="struct<f: float, g: string>"), name="s")


@pytest.mark.parametrize("val", [{}, []])
@pytest.mark.parametrize("typ", [None, "struct<>"])
def test_struct_factory_empty(val, typ):
with pytest.raises(TypeError):
ibis.struct(val, type=typ)


def test_struct_operations():
value = OrderedDict(
[
Expand Down
2 changes: 1 addition & 1 deletion ibis/tests/strategies.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ def struct_dtypes(
draw,
types=_item_strategy,
names=_any_text,
num_fields=st.integers(min_value=0, max_value=20), # noqa: B008
num_fields=st.integers(min_value=1, max_value=20), # noqa: B008
nullable=_nullable,
):
num_fields = draw(num_fields)
Expand Down

0 comments on commit cb79f00

Please sign in to comment.