From cd9ee1b0468bdcfd14b357905ad4467b51c25367 Mon Sep 17 00:00:00 2001 From: Gil Forsyth Date: Mon, 23 Sep 2024 11:54:46 -0400 Subject: [PATCH] refactor(joins): require explicit abstract table as RHS of joins (#9661) ## Description of changes We have (had) limited support for passing in in-memory objects as the RHS of a join, where we would create a memtable for the user and then use that. For backends where memtable creation is expensive, or for queries where there may be multiple calls to the same in-memory data, it is better to be explicit and first register the in-memory data with the backend using either `memtable` or `create_table`. BREAKING CHANGE: Passing a `pyarrow.Table` or a `pandas.DataFrame` as the right-hand-side of a join is no longer supported. To join against in-memory data, you can pass the in-memory object to `ibis.memtable` or `con.create_table` and use the resulting table object instead. ## Issues closed * Resolves #9571 --- ibis/backends/tests/test_join.py | 7 +++++-- ibis/expr/types/joins.py | 29 +++++------------------------ ibis/tests/expr/test_table.py | 2 +- 3 files changed, 11 insertions(+), 27 deletions(-) diff --git a/ibis/backends/tests/test_join.py b/ibis/backends/tests/test_join.py index 88fdd5489384..b81b1878a882 100644 --- a/ibis/backends/tests/test_join.py +++ b/ibis/backends/tests/test_join.py @@ -188,7 +188,8 @@ def test_join_with_pandas(batting, awards_players): batting_filt = batting.filter(lambda t: t.yearID < 1900) awards_players_filt = awards_players.filter(lambda t: t.yearID < 1900).execute() assert isinstance(awards_players_filt, pd.DataFrame) - expr = batting_filt.join(awards_players_filt, "yearID") + t = ibis.memtable(awards_players_filt) + expr = batting_filt.join(t, "yearID") df = expr.execute() assert df.yearID.nunique() == 7 @@ -206,7 +207,9 @@ def test_join_with_pandas_non_null_typed_columns(batting, awards_players): assert sch.infer(awards_players_filt) == sch.Schema(dict(yearID="int")) assert isinstance(awards_players_filt, pd.DataFrame) - expr = batting_filt.join(awards_players_filt, "yearID") + + t = ibis.memtable(awards_players_filt) + expr = batting_filt.join(t, "yearID") df = expr.execute() assert df.yearID.nunique() == 7 diff --git a/ibis/expr/types/joins.py b/ibis/expr/types/joins.py index 2987848802af..c0b9a7a8b817 100644 --- a/ibis/expr/types/joins.py +++ b/ibis/expr/types/joins.py @@ -5,7 +5,6 @@ from public import public -import ibis import ibis.expr.operations as ops from ibis import util from ibis.common.deferred import Deferred @@ -13,6 +12,7 @@ from ibis.common.exceptions import ( ExpressionError, IbisInputError, + IbisTypeError, InputTypeError, IntegrityError, ) @@ -31,28 +31,6 @@ from ibis.expr.operations.relations import JoinKind -def coerce_to_table(data): - try: - import pandas as pd - except ImportError: - pass - else: - if isinstance(data, pd.DataFrame): - return ibis.memtable(data) - - try: - import pyarrow as pa - except ImportError: - pass - else: - if isinstance(data, pa.Table): - return ibis.memtable(data) - - if not isinstance(data, Table): - raise TypeError(f"right operand must be a Table, got {type(data).__name__}") - return data - - def disambiguate_fields( how, predicates, @@ -254,7 +232,10 @@ def join( lname: str = "", rname: str = "{name}_right", ): - right = coerce_to_table(right) + if not isinstance(right, Table): + raise IbisTypeError( + f"Right side of join must be an Ibis table, got {type(right)}." + ) if how == "left_semi": how = "semi" diff --git a/ibis/tests/expr/test_table.py b/ibis/tests/expr/test_table.py index 2b117cbb93ab..b0ebe5b76355 100644 --- a/ibis/tests/expr/test_table.py +++ b/ibis/tests/expr/test_table.py @@ -1315,7 +1315,7 @@ def test_join_invalid_expr_type(con): invalid_right = left.foo_id join_key = ["bar_id"] - with pytest.raises(TypeError): + with pytest.raises(com.IbisTypeError): left.inner_join(invalid_right, join_key)