test: check that empty structs are disallowed

See ibis-project#9310 for a discussion of whether empty structs should be allowed or disallowed. We settled on disallowing them for now, since pyarrow is like the only backend to support them, and it is definitely easier to change our mind and allow them later rather than disallow them later. Perhaps if you stay in ibis-land, emptry structs are useful ie for doing type manipulations, or maybe you only use them for intermediate calculations? But until we see this concrete use case, don't worry about it. I had to adjust the dask testcase generation because this change revealed a bug in our existing dask tests. `ibis.backends.dask.Backend.get_schema()` uses a method of `PandasData.infer_table(df.head(1))`, so we were infering the type of the `map_of_*` testing column as `struct<>` since the first element in that test array of values was `{}`. So the simple solution was to just put the "representative" value first. This doesn't really solve the root problem in how dask does type inference, we probably want to improve this. I'd guess we should infer the column type from a larger sample of values. 1 seems really absurdly small. I tried doing this but it was trickier than I would have thought, with a head(10) I was only getting two values back, even thought there should be 3?? IDK that should get solved in a different PR.
NickCrews · Jun 29, 2024 · cb79f00 · cb79f00
1 parent b150635
commit cb79f00
Show file tree

Hide file tree

Showing 6 changed files with 37 additions and 25 deletions.
diff --git a/ibis/backends/dask/tests/conftest.py b/ibis/backends/dask/tests/conftest.py
@@ -151,9 +151,9 @@ def pandas_df():
             "array_of_float64": [[1.0, 2.0], [3.0], []],
             "array_of_int64": [[1, 2], [], [3]],
             "array_of_strings": [["a", "b"], [], ["c"]],
-            "map_of_strings_integers": [{"a": 1, "b": 2}, None, {}],
-            "map_of_integers_strings": [{}, None, {1: "a", 2: "b"}],
-            "map_of_complex_values": [None, {"a": [1, 2, 3], "b": []}, {}],
+            "map_of_strings_integers": [{"a": 1, "b": 2}, {}, None],
+            "map_of_integers_strings": [{1: "a", 2: "b"}, {}, None],
+            "map_of_complex_values": [{"a": [1, 2, 3], "b": []}, {}, None],
         }
     )
 

diff --git a/ibis/backends/dask/tests/test_maps.py b/ibis/backends/dask/tests/test_maps.py
@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-import numpy as np
 import pandas as pd
 import pytest
 
@@ -13,15 +12,17 @@
 def test_map_length_expr(t):
     expr = t.map_of_integers_strings.length()
     result = expr.execute()
-    expected = pd.Series([0, None, 2], name="MapLength(map_of_integers_strings)")
+    expected = pd.Series(
+        [2, 0, None], dtype="object", name="MapLength(map_of_integers_strings)"
+    )
     tm.assert_series_equal(result, expected, check_index=False)
 
 
 def test_map_value_for_key_expr(t):
     expr = t.map_of_integers_strings[1]
     result = expr.execute()
     expected = pd.Series(
-        [None, None, "a"], name="MapGet(map_of_integers_strings, 1, None)"
+        ["a", None, None], name="MapGet(map_of_integers_strings, 1, None)"
     )
     tm.assert_series_equal(result, expected, check_index=False)
 
@@ -30,7 +31,7 @@ def test_map_value_or_default_for_key_expr(t):
     expr = t.map_of_complex_values.get("a")
     result = expr.execute()
     expected = pd.Series(
-        [None, [1, 2, 3], None],
+        [[1, 2, 3], None, None],
         dtype="object",
         name=expr.get_name(),
     )
@@ -45,7 +46,7 @@ def test_map_keys_expr(t):
     expr = t.map_of_strings_integers.keys()
     result = expr.execute().map(safe_sorter)
     expected = pd.Series(
-        [["a", "b"], None, []],
+        [["a", "b"], [], None],
         dtype="object",
         name="MapKeys(map_of_strings_integers)",
     ).map(safe_sorter)
@@ -54,27 +55,23 @@ def test_map_keys_expr(t):
 
 def test_map_values_expr(t):
     expr = t.map_of_complex_values.values()
-    result = expr.execute()
-    expected = pd.Series(
-        [
-            None,
-            np.array([[1, 2, 3], []], dtype="object"),
-            np.array([], dtype="object"),
-        ],
-        dtype="object",
-        name="MapValues(map_of_complex_values)",
-    )
-    tm.assert_series_equal(result, expected, check_index=False)
+    result = list(expr.execute())
+    expected = [
+        [[1, 2, 3], []],
+        [],
+        None,
+    ]
+    assert result == expected
 
 
 def test_map_concat_expr(t):
     expr = t.map_of_complex_values + {"b": [4, 5, 6], "c": [], "a": []}
     result = expr.execute()
     expected = pd.Series(
         [
-            None,
             {"a": [], "b": [4, 5, 6], "c": []},
-            {"b": [4, 5, 6], "c": [], "a": []},
+            {"a": [], "b": [4, 5, 6], "c": []},
+            None,
         ],
         dtype="object",
         name=expr.get_name(),

diff --git a/ibis/expr/datatypes/core.py b/ibis/expr/datatypes/core.py
@@ -831,13 +831,23 @@ def _pretty_piece(self) -> str:
 
 @public
 class Struct(Parametric, MapSet):
-    """Structured values."""
+    """A nested type with ordered fields of any type, analogous to a dataclass in Python.
+
+    eg a Struct might have a field a of type int64 and a field b of type string.
+    """
 
     fields: FrozenOrderedDict[str, DataType]
 
     scalar = "StructScalar"
     column = "StructColumn"
 
+    def __init__(self, fields, nullable=True):
+        # We could do this validation in a type annotation, but I thought this
+        # was common enough to warrant a nicer error message.
+        if not fields:
+            raise TypeError("Structs require at least one field")
+        super().__init__(fields=fields, nullable=nullable)
+
     @classmethod
     def from_tuples(
         cls, pairs: Iterable[tuple[str, str | DataType]], nullable: bool = True

diff --git a/ibis/expr/datatypes/value.py b/ibis/expr/datatypes/value.py
@@ -42,8 +42,6 @@ def infer(value: Any) -> dt.DataType:
 @infer.register(collections.OrderedDict)
 def infer_struct(value: Mapping[str, Any]) -> dt.Struct:
     """Infer the [`Struct`](./datatypes.qmd#ibis.expr.datatypes.Struct) type of `value`."""
-    if not value:
-        raise TypeError("Empty struct type not supported")
     fields = {name: infer(val) for name, val in value.items()}
     return dt.Struct(fields)
 

diff --git a/ibis/tests/expr/test_struct.py b/ibis/tests/expr/test_struct.py
@@ -22,6 +22,13 @@ def s():
     return ibis.table(dict(a="struct<f: float, g: string>"), name="s")
 
 
+@pytest.mark.parametrize("val", [{}, []])
+@pytest.mark.parametrize("typ", [None, "struct<>"])
+def test_struct_factory_empty(val, typ):
+    with pytest.raises(TypeError):
+        ibis.struct(val, type=typ)
+
+
 def test_struct_operations():
     value = OrderedDict(
         [

diff --git a/ibis/tests/strategies.py b/ibis/tests/strategies.py
@@ -168,7 +168,7 @@ def struct_dtypes(
     draw,
     types=_item_strategy,
     names=_any_text,
-    num_fields=st.integers(min_value=0, max_value=20),  # noqa: B008
+    num_fields=st.integers(min_value=1, max_value=20),  # noqa: B008
     nullable=_nullable,
 ):
     num_fields = draw(num_fields)