Skip to content

Commit

Permalink
fix(duckdb): workaround remaining null map issues (#8985)
Browse files Browse the repository at this point in the history
Closes #8632.

Note that of the backends that support nullable maps, DuckDB is the only
one whose `NULL` handling semantics differ.

Other backends that support nullable maps and are implemented in Ibis
and tested in CI:

- Postgres (only maps of string -> string via the postgres `hstore`
extension)
- PySpark
- Snowflake
- Trino

BREAKING CHANGE: Calling the `get` or `contains` method on `NULL` map
values now returns `NULL`. Use `coalesce(map.get(...), default)` or
`coalesce(map.contains(), False)` to get the previous behavior.
  • Loading branch information
cpcloud authored Apr 17, 2024
1 parent 0b76129 commit b6c71d7
Show file tree
Hide file tree
Showing 3 changed files with 206 additions and 12 deletions.
41 changes: 32 additions & 9 deletions ibis/backends/duckdb/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,7 @@ class DuckDBCompiler(SQLGlotCompiler):
ops.Hash: "hash",
ops.IntegerRange: "range",
ops.TimestampRange: "range",
ops.MapKeys: "map_keys",
ops.MapLength: "cardinality",
ops.MapMerge: "map_concat",
ops.MapValues: "map_values",
ops.Mode: "mode",
ops.TimeFromHMS: "make_time",
ops.TypeOf: "typeof",
Expand Down Expand Up @@ -201,17 +198,43 @@ def visit_ArrayZip(self, op, *, arg):

def visit_Map(self, op, *, keys, values):
# workaround for https://github.com/ibis-project/ibis/issues/8632
regular = self.f.map(keys, values)
either_null = sg.or_(keys.is_(NULL), values.is_(NULL))
return self.if_(either_null, NULL, regular)
return self.if_(
sg.or_(keys.is_(NULL), values.is_(NULL)), NULL, self.f.map(keys, values)
)

def visit_MapGet(self, op, *, arg, key, default):
return self.f.ifnull(
self.f.list_extract(self.f.element_at(arg, key), 1), default
return self.if_(
arg.is_(NULL),
NULL,
self.f.ifnull(
self.f.list_extract(
self.if_(key.is_(NULL), NULL, self.f.element_at(arg, key)), 1
),
default,
),
)

def visit_MapContains(self, op, *, arg, key):
return self.f.len(self.f.element_at(arg, key)).neq(0)
return self.if_(
arg.is_(NULL),
NULL,
self.f.len(self.if_(key.is_(NULL), NULL, self.f.element_at(arg, key))).neq(
0
),
)

def visit_MapKeys(self, op, *, arg):
return self.if_(arg.is_(NULL), NULL, self.f.map_keys(arg))

def visit_MapValues(self, op, *, arg):
return self.if_(arg.is_(NULL), NULL, self.f.map_values(arg))

def visit_MapMerge(self, op, *, left, right):
return self.if_(
sg.or_(left.is_(NULL), right.is_(NULL)),
NULL,
self.f.map_concat(left, right),
)

def visit_ToJSONMap(self, op, *, arg):
return self.if_(
Expand Down
171 changes: 171 additions & 0 deletions ibis/backends/tests/test_map.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,177 @@ def test_map_nulls(con, k, v):
assert con.execute(m) is None


@pytest.mark.notyet("clickhouse", reason="nested types can't be NULL")
@pytest.mark.broken(["pandas", "dask"], reason="TypeError: iteration over a 0-d array")
@pytest.mark.notimpl(
["risingwave"],
raises=PsycoPg2InternalError,
reason="function hstore(character varying[], character varying[]) does not exist",
)
@pytest.mark.parametrize(
("k", "v"),
[
param(None, ["c", "d"], id="null_keys"),
param(None, None, id="null_both"),
],
)
def test_map_keys_nulls(con, k, v):
k = ibis.literal(k, type="array<string>")
v = ibis.literal(v, type="array<string>")
m = ibis.map(k, v)
assert con.execute(m.keys()) is None


@pytest.mark.notyet("clickhouse", reason="nested types can't be NULL")
@pytest.mark.notimpl(
["risingwave"],
raises=PsycoPg2InternalError,
reason="function hstore(character varying[], character varying[]) does not exist",
)
@pytest.mark.parametrize(
"map",
[
param(
ibis.map(
ibis.literal(["a", "b"]), ibis.literal(None, type="array<string>")
),
marks=[
pytest.mark.broken(
["pandas", "dask"], reason="TypeError: iteration over a 0-d array"
)
],
id="null_values",
),
param(
ibis.map(
ibis.literal(None, type="array<string>"),
ibis.literal(None, type="array<string>"),
),
marks=[
pytest.mark.broken(
["pandas", "dask"], reason="TypeError: iteration over a 0-d array"
)
],
id="null_both",
),
param(ibis.literal(None, type="map<string, string>"), id="null_map"),
],
)
def test_map_values_nulls(con, map):
assert con.execute(map.values()) is None


@pytest.mark.notimpl(
["risingwave"],
raises=PsycoPg2InternalError,
reason="function hstore(character varying[], character varying[]) does not exist",
)
@pytest.mark.parametrize(
("map", "key"),
[
param(
ibis.map(
ibis.literal(["a", "b"]), ibis.literal(["c", "d"], type="array<string>")
),
ibis.literal(None, type="string"),
marks=[
pytest.mark.broken(
["pandas", "dask"],
reason="result is False instead of None",
strict=False, # passes for contains, but not for get
),
pytest.mark.notimpl(
"flink",
raises=AssertionError,
reason="not yet implemented",
strict=False,
),
],
id="non_null_map_null_key",
),
param(
ibis.map(
ibis.literal(None, type="array<string>"),
ibis.literal(None, type="array<string>"),
),
"a",
marks=[
pytest.mark.notyet("clickhouse", reason="nested types can't be NULL"),
pytest.mark.broken(
["pandas", "dask"], reason="TypeError: iteration over a 0-d array"
),
],
id="null_both_non_null_key",
),
param(
ibis.map(
ibis.literal(None, type="array<string>"),
ibis.literal(None, type="array<string>"),
),
ibis.literal(None, type="string"),
marks=[
pytest.mark.notyet("clickhouse", reason="nested types can't be NULL"),
pytest.mark.broken(
["pandas", "dask"], reason="TypeError: iteration over a 0-d array"
),
],
id="null_both_null_key",
),
param(
ibis.literal(None, type="map<string, string>"),
"a",
marks=[
pytest.mark.notyet("clickhouse", reason="nested types can't be NULL")
],
id="null_map_non_null_key",
),
param(
ibis.literal(None, type="map<string, string>"),
ibis.literal(None, type="string"),
marks=[
pytest.mark.notyet("clickhouse", reason="nested types can't be NULL")
],
id="null_map_null_key",
),
],
)
@pytest.mark.parametrize("method", ["get", "contains"])
def test_map_get_contains_nulls(con, map, key, method):
expr = getattr(map, method)
assert con.execute(expr(key)) is None


@pytest.mark.notyet("clickhouse", reason="nested types can't be NULL")
@pytest.mark.notimpl(
["risingwave"],
raises=PsycoPg2InternalError,
reason="function hstore(character varying[], character varying[]) does not exist",
)
@pytest.mark.parametrize(
("m1", "m2"),
[
param(
ibis.literal(None, type="map<string, string>"),
ibis.literal({"a": "b"}, type="map<string, string>"),
id="null_and_non_null",
),
param(
ibis.literal({"a": "b"}, type="map<string, string>"),
ibis.literal(None, type="map<string, string>"),
id="non_null_and_null",
),
param(
ibis.literal(None, type="map<string, string>"),
ibis.literal(None, type="map<string, string>"),
id="null_and_null",
),
],
)
def test_map_merge_nulls(con, m1, m2):
concatted = m1 + m2
assert con.execute(concatted) is None


@pytest.mark.notimpl(["pandas", "dask"])
def test_map_table(backend):
table = backend.map
Expand Down
6 changes: 3 additions & 3 deletions ibis/expr/types/maps.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ class MapValue(Value):
├───────────────────┤
│ 2 │
│ 0 │
0
NULL
└───────────────────┘
"""

Expand Down Expand Up @@ -149,7 +149,7 @@ def get(self, key: ir.Value, default: ir.Value | None = None) -> ir.Value:
├───────────────────┤
│ 2 │
│ 0 │
0
NULL
└───────────────────┘
"""

Expand Down Expand Up @@ -304,7 +304,7 @@ def contains(
├─────────────────────┤
│ True │
│ False │
False
NULL
└─────────────────────┘
"""
return ops.MapContains(self, key).to_expr()
Expand Down

0 comments on commit b6c71d7

Please sign in to comment.