Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(duckdb): support version 1.1.0 #10037

Merged
merged 4 commits into from
Sep 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions ibis/backends/duckdb/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import pyarrow_hotfix # noqa: F401
import sqlglot as sg
import sqlglot.expressions as sge
from packaging.version import parse as vparse

import ibis
import ibis.backends.sql.compilers as sc
Expand Down Expand Up @@ -461,6 +462,11 @@ def _post_connect(self, extensions: Sequence[str] | None = None) -> None:
# Default timezone, can't be set with `config`
self.settings["timezone"] = "UTC"

# setting this to false disables magic variables-as-tables discovery,
# hopefully eliminating large classes of bugs
if vparse(self.version) > vparse("1"):
self.settings["python_enable_replacements"] = False

self._record_batch_readers_consumed = {}

def _load_extensions(
Expand Down
3 changes: 2 additions & 1 deletion ibis/backends/duckdb/tests/test_udf.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,8 @@ def favg(x: float, where: bool = True) -> float: ...
def test_builtin_agg(con, func):
import ibis

raw_data = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
start, stop = 1, 11
raw_data = list(map(float, range(start, stop)))
data = ibis.memtable({"a": raw_data})
expr = func(data.a)

Expand Down
14 changes: 12 additions & 2 deletions ibis/backends/sql/compilers/duckdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,6 @@ class DuckDBCompiler(SQLGlotCompiler):

SIMPLE_OPS = {
ops.Arbitrary: "any_value",
ops.ArrayPosition: "list_indexof",
ops.ArrayMin: "list_min",
ops.ArrayMax: "list_max",
ops.ArrayAny: "list_bool_or",
Expand Down Expand Up @@ -150,6 +149,13 @@ def visit_ArrayDistinct(self, op, *, arg):
),
)

# Compile ArrayPosition: the result is NULL when either the array (`arg`) or
# the searched value (`other`) is NULL; otherwise it is the `list_indexof`
# result with a NULL result replaced by 0.
def visit_ArrayPosition(self, op, *, arg, other):
return self.if_(
arg.is_(NULL) | other.is_(NULL),
NULL,
# NOTE(review): presumably list_indexof yields NULL for "not found" on
# newer DuckDB releases, hence the coalesce to 0 — confirm against DuckDB docs.
self.f.coalesce(self.f.list_indexof(arg, other), 0),
)

def visit_ArrayCollect(self, op, *, arg, where, order_by, include_null):
if not include_null:
cond = arg.is_(sg.not_(NULL, copy=False))
Expand Down Expand Up @@ -352,7 +358,11 @@ def visit_IntervalFromInteger(self, op, *, arg, unit):
return self.f[f"to_{unit.plural}"](arg)

def visit_FindInSet(self, op, *, needle, values):
return self.f.list_indexof(self.f.array(*values), needle)
return self.if_(
needle.is_(NULL),
NULL,
self.f.coalesce(self.f.list_indexof(self.f.array(*values), needle), 0),
)

def visit_CountDistinctStar(self, op, *, where, arg):
# use a tuple because duckdb doesn't accept COUNT(DISTINCT a, b, c, ...)
Expand Down
6 changes: 6 additions & 0 deletions ibis/backends/tests/test_aggregation.py
Original file line number Diff line number Diff line change
Expand Up @@ -466,6 +466,12 @@ def mean_and_std(v):
lambda t, where: t.string_col.approx_nunique(where=where),
lambda t, where: t.string_col[where].nunique(),
id="approx_nunique",
marks=pytest.mark.xfail_version(
duckdb=["duckdb>=1.1"],
raises=AssertionError,
reason="not exact, even at this tiny scale",
strict=False,
),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't need to handle it in this PR, but we should probably split any approx function tests out to test approximate results rather than just xfailing.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agree.

),
param(
lambda t, where: t.bigint_col.bit_and(where=where),
Expand Down
8 changes: 5 additions & 3 deletions ibis/backends/tests/test_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,15 +296,17 @@ def test_roundtrip_partitioned_parquet(tmp_path, con, backend, awards_players):

# Reingest and compare schema
reingest = con.read_parquet(outparquet / "*" / "*")
reingest = reingest.cast({"yearID": "int64"})

# avoid type comparison to appease duckdb: as of 0.8.0 it returns large_string
assert reingest.schema().names == awards_players.schema().names
assert reingest.schema().keys() == awards_players.schema().keys()

reingest = reingest.order_by(["yearID", "playerID", "awardID", "lgID"])
awards_players = awards_players.order_by(["yearID", "playerID", "awardID", "lgID"])

backend.assert_frame_equal(reingest.to_pandas(), awards_players.to_pandas())
# reorder columns to match the partitioning
backend.assert_frame_equal(
reingest.to_pandas(), awards_players[reingest.columns].to_pandas()
)


@pytest.mark.parametrize("ftype", ["csv", "parquet"])
Expand Down
5 changes: 2 additions & 3 deletions ibis/backends/tests/test_numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -1290,14 +1290,13 @@ def test_floating_mod(backend, alltypes, df):
)
@pytest.mark.notyet(["mysql", "pyspark"], raises=AssertionError)
@pytest.mark.notyet(
["duckdb", "sqlite"],
raises=AssertionError,
reason="returns NULL when dividing by zero",
["sqlite"], raises=AssertionError, reason="returns NULL when dividing by zero"
)
@pytest.mark.notyet(["mssql"], raises=PyODBCDataError)
@pytest.mark.notyet(["snowflake"], raises=SnowflakeProgrammingError)
@pytest.mark.notyet(["postgres"], raises=PsycoPg2DivisionByZero)
@pytest.mark.notimpl(["exasol"], raises=ExaQueryError)
@pytest.mark.xfail_version(duckdb=["duckdb<1.1"])
def test_divide_by_zero(backend, alltypes, df, column, denominator):
expr = alltypes[column] / denominator
result = expr.name("tmp").execute()
Expand Down
7 changes: 6 additions & 1 deletion ibis/backends/tests/test_temporal.py
Original file line number Diff line number Diff line change
Expand Up @@ -1900,10 +1900,15 @@ def test_large_timestamp(con):
id="ns",
marks=[
pytest.mark.notyet(
["duckdb", "impala", "pyspark", "trino"],
["impala", "pyspark", "trino"],
reason="drivers appear to truncate nanos",
raises=AssertionError,
),
pytest.mark.xfail_version(
duckdb=["duckdb<1.1"],
reason="not implemented until 1.1",
raises=AssertionError,
),
pytest.mark.notimpl(
["druid"],
reason="ibis normalization truncates nanos",
Expand Down
4 changes: 2 additions & 2 deletions ibis/expr/types/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1630,11 +1630,11 @@ def approx_nunique(self, where: ir.BooleanValue | None = None) -> ir.IntegerScal
>>> t = ibis.examples.penguins.fetch()
>>> t.body_mass_g.approx_nunique()
┌────┐
│ 94 │
│ 92 │
└────┘
>>> t.body_mass_g.approx_nunique(where=t.species == "Adelie")
┌────┐
│ 55 │
│ 61 │
└────┘
"""
return ops.ApproxCountDistinct(
Expand Down
10 changes: 5 additions & 5 deletions ibis/expr/types/relations.py
Original file line number Diff line number Diff line change
Expand Up @@ -3277,11 +3277,11 @@ def join(
┡━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩
│ int64 │ string │ int64 │ string │
├─────────┼───────────────────┼───────────────┼───────────────────┤
│ 1732 │ funny │ 60756 │ funny │
│ 1732 │ Highly quotable │ 60756 │ Highly quotable │
│ 1732 │ drugs │ 106782 │ drugs │
│ 5989 │ Leonardo DiCaprio │ 106782 │ Leonardo DiCaprio │
│ 139385 │ tom hardy │ 89774 │ Tom Hardy │
│ 60756 │ funny │ 1732 │ funny │
│ 60756 │ Highly quotable │ 1732 │ Highly quotable │
│ 89774 │ Tom Hardy │ 139385 │ tom hardy │
│ 106782 │ drugs │ 1732 │ drugs │
│ 106782 │ Leonardo DiCaprio │ 5989 │ Leonardo DiCaprio │
└─────────┴───────────────────┴───────────────┴───────────────────┘
"""
from ibis.expr.types.joins import Join
Expand Down
2 changes: 2 additions & 0 deletions ibis/formats/pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -383,6 +383,8 @@
return value
elif isinstance(value, UUID):
return value
elif isinstance(value, bytes):
return UUID(bytes=value)

Check warning on line 387 in ibis/formats/pandas.py

View check run for this annotation

Codecov / codecov/patch

ibis/formats/pandas.py#L387

Added line #L387 was not covered by tests
return UUID(value)

return convert
Expand Down
Loading
Loading