Skip to content

Commit

Permalink
refactor(duckdb): use pyarrow for all memtable registration
Browse files Browse the repository at this point in the history
  • Loading branch information
gforsyth authored and cpcloud committed Nov 27, 2023
1 parent 12058f2 commit d6a2f09
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 24 deletions.
26 changes: 2 additions & 24 deletions ibis/backends/duckdb/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@
from ibis.backends.base.sqlglot import C, F
from ibis.backends.duckdb.compiler import DuckDBSQLCompiler
from ibis.backends.duckdb.datatypes import DuckDBType
from ibis.expr.operations.relations import PandasDataFrameProxy
from ibis.expr.operations.udf import InputType
from ibis.formats.pandas import PandasData

Expand Down Expand Up @@ -1171,10 +1170,6 @@ def _metadata(self, query: str) -> Iterator[tuple[str, dt.DataType]]:
yield name, ibis_type

def _register_in_memory_table(self, op: ops.InMemoryTable) -> None:
# in theory we could use pandas dataframes, but when using dataframes
# with pyarrow datatypes later reads of this data segfault
import pandas as pd

schema = op.schema
if null_columns := [col for col, dtype in schema.items() if dtype.is_null()]:
raise exc.IbisTypeError(
Expand All @@ -1184,32 +1179,15 @@ def _register_in_memory_table(self, op: ops.InMemoryTable) -> None:

# only register if we haven't already done so
if (name := op.name) not in self.list_tables():
if isinstance(data := op.data, PandasDataFrameProxy):
table = data.to_frame()

# convert to object string dtypes because duckdb is either
# 1. extremely slow to register DataFrames with not-pyarrow
# string dtypes
# 2. broken for string[pyarrow] dtypes (segfault)
if conversions := {
colname: "str"
for colname, col in table.items()
if isinstance(col.dtype, pd.StringDtype)
}:
table = table.astype(conversions)
else:
table = data.to_pyarrow(schema)
table = op.data.to_pyarrow(schema)

# register creates a transaction, and we can't nest transactions so
# we create a function to encapsulate the whole shebang
def _register(name, table):
with self.begin() as con:
con.connection.register(name, table)

try:
_register(name, table)
except duckdb.NotImplementedException:
_register(name, data.to_pyarrow(schema))
_register(name, table)

def _get_temp_view_definition(
self, name: str, definition: sa.sql.compiler.Compiled
Expand Down
8 changes: 8 additions & 0 deletions ibis/backends/duckdb/tests/test_register.py
Original file line number Diff line number Diff line change
Expand Up @@ -367,3 +367,11 @@ def test_register_filesystem_gcs(con):
)

assert band_members.count().to_pyarrow()


def test_memtable_null_column_parquet_dtype_roundtrip(con, tmp_path):
before = ibis.memtable({"a": [None, None, None]}, schema={"a": "string"})
before.to_parquet(tmp_path / "tmp.parquet")
after = ibis.read_parquet(tmp_path / "tmp.parquet")

assert before.a.type() == after.a.type()

0 comments on commit d6a2f09

Please sign in to comment.