From cb79f0019fcfd98bdd68fabf41c0e953860f73db Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Sat, 29 Jun 2024 13:28:07 -0800 Subject: [PATCH] test: check that empty structs are disallowed See https://github.com/ibis-project/ibis/pull/9310 for a discussion of whether empty structs should be allowed or disallowed. We settled on disallowing them for now, since pyarrow is like the only backend to support them, and it is definitely easier to change our mind and allow them later rather than disallow them later. Perhaps if you stay in ibis-land, emptry structs are useful ie for doing type manipulations, or maybe you only use them for intermediate calculations? But until we see this concrete use case, don't worry about it. I had to adjust the dask testcase generation because this change revealed a bug in our existing dask tests. `ibis.backends.dask.Backend.get_schema()` uses a method of `PandasData.infer_table(df.head(1))`, so we were infering the type of the `map_of_*` testing column as `struct<>` since the first element in that test array of values was `{}`. So the simple solution was to just put the "representative" value first. This doesn't really solve the root problem in how dask does type inference, we probably want to improve this. I'd guess we should infer the column type from a larger sample of values. 1 seems really absurdly small. I tried doing this but it was trickier than I would have thought, with a head(10) I was only getting two values back, even thought there should be 3?? IDK that should get solved in a different PR. --- ibis/backends/dask/tests/conftest.py | 6 ++--- ibis/backends/dask/tests/test_maps.py | 33 ++++++++++++--------------- ibis/expr/datatypes/core.py | 12 +++++++++- ibis/expr/datatypes/value.py | 2 -- ibis/tests/expr/test_struct.py | 7 ++++++ ibis/tests/strategies.py | 2 +- 6 files changed, 37 insertions(+), 25 deletions(-) diff --git a/ibis/backends/dask/tests/conftest.py b/ibis/backends/dask/tests/conftest.py index d9cbfd2689f3b..9cdb1270e744f 100644 --- a/ibis/backends/dask/tests/conftest.py +++ b/ibis/backends/dask/tests/conftest.py @@ -151,9 +151,9 @@ def pandas_df(): "array_of_float64": [[1.0, 2.0], [3.0], []], "array_of_int64": [[1, 2], [], [3]], "array_of_strings": [["a", "b"], [], ["c"]], - "map_of_strings_integers": [{"a": 1, "b": 2}, None, {}], - "map_of_integers_strings": [{}, None, {1: "a", 2: "b"}], - "map_of_complex_values": [None, {"a": [1, 2, 3], "b": []}, {}], + "map_of_strings_integers": [{"a": 1, "b": 2}, {}, None], + "map_of_integers_strings": [{1: "a", 2: "b"}, {}, None], + "map_of_complex_values": [{"a": [1, 2, 3], "b": []}, {}, None], } ) diff --git a/ibis/backends/dask/tests/test_maps.py b/ibis/backends/dask/tests/test_maps.py index b7445434211dd..b5c49620b8969 100644 --- a/ibis/backends/dask/tests/test_maps.py +++ b/ibis/backends/dask/tests/test_maps.py @@ -1,6 +1,5 @@ from __future__ import annotations -import numpy as np import pandas as pd import pytest @@ -13,7 +12,9 @@ def test_map_length_expr(t): expr = t.map_of_integers_strings.length() result = expr.execute() - expected = pd.Series([0, None, 2], name="MapLength(map_of_integers_strings)") + expected = pd.Series( + [2, 0, None], dtype="object", name="MapLength(map_of_integers_strings)" + ) tm.assert_series_equal(result, expected, check_index=False) @@ -21,7 +22,7 @@ def test_map_value_for_key_expr(t): expr = t.map_of_integers_strings[1] result = expr.execute() expected = pd.Series( - [None, None, "a"], name="MapGet(map_of_integers_strings, 1, None)" + ["a", None, None], name="MapGet(map_of_integers_strings, 1, None)" ) tm.assert_series_equal(result, expected, check_index=False) @@ -30,7 +31,7 @@ def test_map_value_or_default_for_key_expr(t): expr = t.map_of_complex_values.get("a") result = expr.execute() expected = pd.Series( - [None, [1, 2, 3], None], + [[1, 2, 3], None, None], dtype="object", name=expr.get_name(), ) @@ -45,7 +46,7 @@ def test_map_keys_expr(t): expr = t.map_of_strings_integers.keys() result = expr.execute().map(safe_sorter) expected = pd.Series( - [["a", "b"], None, []], + [["a", "b"], [], None], dtype="object", name="MapKeys(map_of_strings_integers)", ).map(safe_sorter) @@ -54,17 +55,13 @@ def test_map_keys_expr(t): def test_map_values_expr(t): expr = t.map_of_complex_values.values() - result = expr.execute() - expected = pd.Series( - [ - None, - np.array([[1, 2, 3], []], dtype="object"), - np.array([], dtype="object"), - ], - dtype="object", - name="MapValues(map_of_complex_values)", - ) - tm.assert_series_equal(result, expected, check_index=False) + result = list(expr.execute()) + expected = [ + [[1, 2, 3], []], + [], + None, + ] + assert result == expected def test_map_concat_expr(t): @@ -72,9 +69,9 @@ def test_map_concat_expr(t): result = expr.execute() expected = pd.Series( [ - None, {"a": [], "b": [4, 5, 6], "c": []}, - {"b": [4, 5, 6], "c": [], "a": []}, + {"a": [], "b": [4, 5, 6], "c": []}, + None, ], dtype="object", name=expr.get_name(), diff --git a/ibis/expr/datatypes/core.py b/ibis/expr/datatypes/core.py index 2a2cfa4eae789..decb29500e34b 100644 --- a/ibis/expr/datatypes/core.py +++ b/ibis/expr/datatypes/core.py @@ -831,13 +831,23 @@ def _pretty_piece(self) -> str: @public class Struct(Parametric, MapSet): - """Structured values.""" + """A nested type with ordered fields of any type, analogous to a dataclass in Python. + + eg a Struct might have a field a of type int64 and a field b of type string. + """ fields: FrozenOrderedDict[str, DataType] scalar = "StructScalar" column = "StructColumn" + def __init__(self, fields, nullable=True): + # We could do this validation in a type annotation, but I thought this + # was common enough to warrant a nicer error message. + if not fields: + raise TypeError("Structs require at least one field") + super().__init__(fields=fields, nullable=nullable) + @classmethod def from_tuples( cls, pairs: Iterable[tuple[str, str | DataType]], nullable: bool = True diff --git a/ibis/expr/datatypes/value.py b/ibis/expr/datatypes/value.py index ad40f9047ecf3..dceccf048d6ab 100644 --- a/ibis/expr/datatypes/value.py +++ b/ibis/expr/datatypes/value.py @@ -42,8 +42,6 @@ def infer(value: Any) -> dt.DataType: @infer.register(collections.OrderedDict) def infer_struct(value: Mapping[str, Any]) -> dt.Struct: """Infer the [`Struct`](./datatypes.qmd#ibis.expr.datatypes.Struct) type of `value`.""" - if not value: - raise TypeError("Empty struct type not supported") fields = {name: infer(val) for name, val in value.items()} return dt.Struct(fields) diff --git a/ibis/tests/expr/test_struct.py b/ibis/tests/expr/test_struct.py index de297a4e4a586..6c87da2102958 100644 --- a/ibis/tests/expr/test_struct.py +++ b/ibis/tests/expr/test_struct.py @@ -22,6 +22,13 @@ def s(): return ibis.table(dict(a="struct"), name="s") +@pytest.mark.parametrize("val", [{}, []]) +@pytest.mark.parametrize("typ", [None, "struct<>"]) +def test_struct_factory_empty(val, typ): + with pytest.raises(TypeError): + ibis.struct(val, type=typ) + + def test_struct_operations(): value = OrderedDict( [ diff --git a/ibis/tests/strategies.py b/ibis/tests/strategies.py index f417b3719a124..ba4786289d14a 100644 --- a/ibis/tests/strategies.py +++ b/ibis/tests/strategies.py @@ -168,7 +168,7 @@ def struct_dtypes( draw, types=_item_strategy, names=_any_text, - num_fields=st.integers(min_value=0, max_value=20), # noqa: B008 + num_fields=st.integers(min_value=1, max_value=20), # noqa: B008 nullable=_nullable, ): num_fields = draw(num_fields)