Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor(duckdb): use .sql instead of .execute in performance-sensitive locations #8669

Merged
merged 2 commits into from
Mar 17, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 54 additions & 40 deletions ibis/backends/duckdb/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1183,6 +1183,27 @@

super()._run_pre_execute_hooks(expr)

def _to_duckdb_relation(

Check warning on line 1186 in ibis/backends/duckdb/__init__.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/duckdb/__init__.py#L1186

Added line #L1186 was not covered by tests
self,
expr: ir.Expr,
*,
params: Mapping[ir.Scalar, Any] | None = None,
limit: int | str | None = None,
):
"""Preprocess the expr, and return a ``duckdb.DuckDBPyRelation`` object.

When retrieving in-memory results, it's faster to use `duckdb_con.sql`
than `duckdb_con.execute`, as the query planner can take advantage of
knowing the output type. Since the relation objects aren't compatible
with the dbapi, we choose to only use them in select internal methods
where performance might matter, and use the standard
`duckdb_con.execute` everywhere else.
"""
self._run_pre_execute_hooks(expr)
table_expr = expr.as_table()
sql = self.compile(table_expr, limit=limit, params=params)
return self.con.sql(sql)

Check warning on line 1205 in ibis/backends/duckdb/__init__.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/duckdb/__init__.py#L1202-L1205

Added lines #L1202 - L1205 were not covered by tests

def to_pyarrow_batches(
self,
expr: ir.Expr,
Expand Down Expand Up @@ -1220,12 +1241,7 @@
def batch_producer(cur):
yield from cur.fetch_record_batch(rows_per_batch=chunk_size)

# TODO: check that this is still handled correctly
# batch_producer keeps the `self.con` member alive long enough to
# exhaust the record batch reader, even if the backend or connection
# have gone out of scope in the caller
result = self.raw_sql(sql)

return pa.RecordBatchReader.from_batches(
expr.as_table().schema().to_pyarrow(), batch_producer(result)
)
Expand All @@ -1238,14 +1254,40 @@
limit: int | str | None = None,
**_: Any,
) -> pa.Table:
self._run_pre_execute_hooks(expr)
table = expr.as_table()
sql = self.compile(table, limit=limit, params=params)
table = self._to_duckdb_relation(expr, params=params, limit=limit).arrow()
return expr.__pyarrow_result__(table)

Check warning on line 1258 in ibis/backends/duckdb/__init__.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/duckdb/__init__.py#L1257-L1258

Added lines #L1257 - L1258 were not covered by tests

with self._safe_raw_sql(sql) as cur:
table = cur.fetch_arrow_table()
def execute(

Check warning on line 1260 in ibis/backends/duckdb/__init__.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/duckdb/__init__.py#L1260

Added line #L1260 was not covered by tests
self,
expr: ir.Expr,
params: Mapping | None = None,
limit: str | None = "default",
**_: Any,
) -> Any:
"""Execute an expression."""
import pandas as pd
import pyarrow.types as pat

Check warning on line 1269 in ibis/backends/duckdb/__init__.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/duckdb/__init__.py#L1268-L1269

Added lines #L1268 - L1269 were not covered by tests

return expr.__pyarrow_result__(table)
table = self._to_duckdb_relation(expr, params=params, limit=limit).arrow()

Check warning on line 1271 in ibis/backends/duckdb/__init__.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/duckdb/__init__.py#L1271

Added line #L1271 was not covered by tests

df = pd.DataFrame(
{
name: (
col.to_pylist()
if (
pat.is_nested(col.type)
or
# pyarrow / duckdb type null literals columns as int32?
# but calling `to_pylist()` will render it as None
col.null_count
)
else col.to_pandas(timestamp_as_object=True)
)
for name, col in zip(table.column_names, table.columns)
}
)
df = DuckDBPandasData.convert_table(df, expr.as_table().schema())
return expr.__pandas_result__(df)

Check warning on line 1290 in ibis/backends/duckdb/__init__.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/duckdb/__init__.py#L1289-L1290

Added lines #L1289 - L1290 were not covered by tests

@util.experimental
def to_torch(
Expand Down Expand Up @@ -1275,9 +1317,7 @@
A dictionary of torch tensors, keyed by column name.

"""
compiled = self.compile(expr, limit=limit, params=params, **kwargs)
with self._safe_raw_sql(compiled) as cur:
return cur.torch()
return self._to_duckdb_relation(expr, params=params, limit=limit).torch()

Check warning on line 1320 in ibis/backends/duckdb/__init__.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/duckdb/__init__.py#L1320

Added line #L1320 was not covered by tests

@util.experimental
def to_parquet(
Expand Down Expand Up @@ -1377,32 +1417,6 @@
with self._safe_raw_sql(copy_cmd):
pass

def _fetch_from_cursor(
    self, cursor: duckdb.DuckDBPyConnection, schema: sch.Schema
) -> pd.DataFrame:
    """Materialize the cursor's result set as a pandas DataFrame.

    Fetches the results as a pyarrow table, converts each column to a
    pandas-compatible representation, then runs the project's
    ``DuckDBPandasData.convert_table`` pass to coerce the frame to
    *schema*.
    """
    import pandas as pd
    import pyarrow.types as pat

    def _column_values(col):
        # Nested arrow types don't round-trip cleanly through
        # `to_pandas`, so fall back to plain Python objects.
        # pyarrow / duckdb type null literals columns as int32?
        # but calling `to_pylist()` will render it as None
        if pat.is_nested(col.type) or col.null_count:
            return col.to_pylist()
        return col.to_pandas(timestamp_as_object=True)

    arrow_table = cursor.fetch_arrow_table()
    frame = pd.DataFrame(
        {
            name: _column_values(column)
            for name, column in zip(
                arrow_table.column_names, arrow_table.columns
            )
        }
    )
    return DuckDBPandasData.convert_table(frame, schema)

def _get_schema_using_query(self, query: str) -> sch.Schema:
with self._safe_raw_sql(f"DESCRIBE {query}") as cur:
rows = cur.fetch_arrow_table()
Expand Down
Loading