Skip to content

Commit

Permalink
feat(ir): more flexible dereferencing support for join right hand side (
Browse files Browse the repository at this point in the history
ibis-project#8992)

Enables using fields from parent tables of the join's right-hand side
instead of requiring fields from the exact same table:

```py
t1 = ibis.table(name="t1", schema={"a": "int64", "b": "string"})
t2 = ibis.table(name="t2", schema={"c": "int64", "d": "string"})

t3 = t2.mutate(e=t2.c + 1)
joined = t1.join(t3, [t1.a == t2.c])  # here we use t2.c instead of t3.c
```

Ambiguous cases are detected and raise an error, for example:

```py
t.join(t, [t.a == t.a])
```

depends on:
- ibis-project#9043 
- ibis-project#9041 

fixes ibis-project#8581
  • Loading branch information
kszucs authored Apr 25, 2024
1 parent e04c3e5 commit d7a31aa
Show file tree
Hide file tree
Showing 42 changed files with 563 additions and 424 deletions.
1 change: 1 addition & 0 deletions docs/_code/setup_penguins.qmd
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
```{python}
import ibis # <1>
import ibis.selectors as s # <1>
from ibis import _
ibis.options.interactive = True # <2>
Expand Down
10 changes: 5 additions & 5 deletions docs/how-to/analytics/basics.qmd
Original file line number Diff line number Diff line change
Expand Up @@ -58,20 +58,20 @@ t.mutate(bill_length_cm=t["bill_length_mm"] / 10).relocate(
Use the `.join()` method to join data:

```{python}
t.join(t, t["species"] == t["species"], how="left_semi")
t.join(t, ["species"], how="left_semi")
```

## Combining it all together

We can use [the underscore to chain expressions together](./chain_expressions.qmd).

```{python}
t.join(t, t["species"] == t["species"], how="left_semi").filter(
ibis._["species"] != "Adelie"
t.join(t, ["species"], how="left_semi").filter(
_.species != "Adelie"
).group_by(["species", "island"]).aggregate(
avg_bill_length=ibis._["bill_length_mm"].mean()
avg_bill_length=_.bill_length_mm.mean()
).order_by(
ibis._["avg_bill_length"].desc()
_.avg_bill_length.desc()
)
```

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,5 @@ SELECT
`t3`.`val`,
`t3`.`XYZ`
FROM `t1` AS `t3`
INNER JOIN `t1` AS `t5`
INNER JOIN `t1` AS `t4`
ON TRUE
3 changes: 2 additions & 1 deletion ibis/backends/bigquery/tests/unit/test_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,8 @@ class MockBackend(ibis.backends.bigquery.Backend):
table = ops.SQLQueryResult("select * from t", schema, ibis_client).to_expr()
for _ in range(num_joins): # noqa: F402
table = table.mutate(dummy=ibis.literal(""))
table = table.left_join(table, ["dummy"])[[table]]
table_ = table.view()
table = table.left_join(table_, ["dummy"])[[table_]]

start = time.time()
table.compile()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,5 @@ SELECT
"t1"."year",
"t1"."month"
FROM "functional_alltypes" AS "t1"
INNER JOIN "functional_alltypes" AS "t3"
ON "t1"."id" = "t3"."id"
INNER JOIN "functional_alltypes" AS "t2"
ON "t1"."id" = "t2"."id"
Original file line number Diff line number Diff line change
@@ -1 +1 @@
WITH `t9` AS (SELECT EXTRACT(year FROM `t8`.`odate`) AS `year`, COUNT(*) AS `CountStar()` FROM (SELECT `t6`.`c_custkey`, `t6`.`c_name`, `t6`.`c_address`, `t6`.`c_nationkey`, `t6`.`c_phone`, `t6`.`c_acctbal`, `t6`.`c_mktsegment`, `t6`.`c_comment`, `t4`.`r_name` AS `region`, `t7`.`o_totalprice`, CAST(`t7`.`o_orderdate` AS TIMESTAMP) AS `odate` FROM `tpch_region` AS `t4` INNER JOIN `tpch_nation` AS `t5` ON `t4`.`r_regionkey` = `t5`.`n_regionkey` INNER JOIN `tpch_customer` AS `t6` ON `t6`.`c_nationkey` = `t5`.`n_nationkey` INNER JOIN `tpch_orders` AS `t7` ON `t7`.`o_custkey` = `t6`.`c_custkey`) AS `t8` GROUP BY 1) SELECT `t11`.`year`, `t11`.`CountStar()` AS `pre_count`, `t13`.`CountStar()` AS `post_count` FROM `t9` AS `t11` INNER JOIN `t9` AS `t13` ON `t11`.`year` = `t13`.`year`
WITH `t9` AS (SELECT EXTRACT(year FROM `t8`.`odate`) AS `year`, COUNT(*) AS `CountStar()` FROM (SELECT `t6`.`c_custkey`, `t6`.`c_name`, `t6`.`c_address`, `t6`.`c_nationkey`, `t6`.`c_phone`, `t6`.`c_acctbal`, `t6`.`c_mktsegment`, `t6`.`c_comment`, `t4`.`r_name` AS `region`, `t7`.`o_totalprice`, CAST(`t7`.`o_orderdate` AS TIMESTAMP) AS `odate` FROM `tpch_region` AS `t4` INNER JOIN `tpch_nation` AS `t5` ON `t4`.`r_regionkey` = `t5`.`n_regionkey` INNER JOIN `tpch_customer` AS `t6` ON `t6`.`c_nationkey` = `t5`.`n_nationkey` INNER JOIN `tpch_orders` AS `t7` ON `t7`.`o_custkey` = `t6`.`c_custkey`) AS `t8` GROUP BY 1) SELECT `t11`.`year`, `t11`.`CountStar()` AS `pre_count`, `t12`.`CountStar()` AS `post_count` FROM `t9` AS `t11` INNER JOIN `t9` AS `t12` ON `t11`.`year` = `t12`.`year`
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,12 @@ SELECT
`t1`.`a`,
`t1`.`b`
FROM `t` AS `t1`
INNER JOIN `t` AS `t3`
ON `t1`.`a` = `t3`.`a`
INNER JOIN `t` AS `t2`
ON `t1`.`a` = `t2`.`a`
AND (
(
`t1`.`a` <> `t3`.`b`
`t1`.`a` <> `t2`.`b`
) OR (
`t1`.`b` <> `t3`.`a`
`t1`.`b` <> `t2`.`a`
)
)
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@ SELECT
`t1`.`a`,
`t1`.`b`
FROM `t` AS `t1`
INNER JOIN `t` AS `t3`
ON `t1`.`a` = `t3`.`a`
INNER JOIN `t` AS `t2`
ON `t1`.`a` = `t2`.`a`
AND (
(
`t1`.`a` <> `t3`.`b` OR `t1`.`b` <> `t3`.`a`
`t1`.`a` <> `t2`.`b` OR `t1`.`b` <> `t2`.`a`
)
AND NOT (
`t1`.`a` <> `t3`.`b` AND `t1`.`b` <> `t3`.`a`
`t1`.`a` <> `t2`.`b` AND `t1`.`b` <> `t2`.`a`
)
)
Original file line number Diff line number Diff line change
Expand Up @@ -19,5 +19,5 @@ SELECT
`t3`.`year`,
`t3`.`month`
FROM `t1` AS `t3`
INNER JOIN `t1` AS `t5`
INNER JOIN `t1` AS `t4`
ON TRUE
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ WITH `t1` AS (
1
)
SELECT
`t5`.`uuid`,
`t3`.`uuid`,
`t3`.`CountStar(t)`
FROM (
SELECT
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ WITH `t1` AS (
1
)
SELECT
`t7`.`uuid`,
`t4`.`uuid`,
`t4`.`CountStar(t)`,
`t5`.`last_visit`
FROM (
Expand All @@ -28,4 +28,4 @@ LEFT OUTER JOIN (
GROUP BY
1
) AS `t5`
ON `t7`.`uuid` = `t5`.`uuid`
ON `t4`.`uuid` = `t5`.`uuid`
2 changes: 1 addition & 1 deletion ibis/backends/pandas/executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -540,7 +540,7 @@ def visit(cls, op: ops.DummyTable, values):
return df

@classmethod
def visit(cls, op: ops.SelfReference | ops.JoinTable, parent, **kwargs):
def visit(cls, op: ops.Reference, parent, **kwargs):
return parent

@classmethod
Expand Down
9 changes: 2 additions & 7 deletions ibis/backends/polars/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -1248,13 +1248,8 @@ def execute_view(op, *, ctx: pl.SQLContext, **kw):
return child


@translate.register(ops.SelfReference)
def execute_self_reference(op, **kw):
return translate(op.parent, **kw)


@translate.register(ops.JoinTable)
def execute_join_table(op, **kw):
@translate.register(ops.Reference)
def execute_reference(op, **kw):
return translate(op.parent, **kw)


Expand Down
5 changes: 2 additions & 3 deletions ibis/backends/sql/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -1158,6 +1158,8 @@ def visit_DatabaseTable(
def visit_SelfReference(self, op, *, parent, identifier):
return parent

visit_JoinReference = visit_SelfReference

def visit_JoinChain(self, op, *, first, rest, values):
result = sg.select(*self._cleanup_names(values), copy=False).from_(
first, copy=False
Expand Down Expand Up @@ -1388,9 +1390,6 @@ def visit_SQLStringView(self, op, *, query: str, child, schema):
def visit_SQLQueryResult(self, op, *, query, schema, source):
return sg.parse_one(query, dialect=self.dialect).subquery(copy=False)

def visit_JoinTable(self, op, *, parent, index):
return parent

def visit_RegexExtract(self, op, *, arg, pattern, index):
return self.f.regexp_extract(arg, pattern, index, dialect=self.dialect)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,5 +19,5 @@ SELECT
"t3"."year",
"t3"."month"
FROM "t1" AS "t3"
INNER JOIN "t1" AS "t5"
INNER JOIN "t1" AS "t4"
ON TRUE
Original file line number Diff line number Diff line change
Expand Up @@ -15,20 +15,20 @@ FROM (
"t1"."timestamp_col",
"t1"."year",
"t1"."month",
"t3"."id" AS "id_right",
"t3"."bool_col" AS "bool_col_right",
"t3"."tinyint_col" AS "tinyint_col_right",
"t3"."smallint_col" AS "smallint_col_right",
"t3"."int_col" AS "int_col_right",
"t3"."bigint_col" AS "bigint_col_right",
"t3"."float_col" AS "float_col_right",
"t3"."double_col" AS "double_col_right",
"t3"."date_string_col" AS "date_string_col_right",
"t3"."string_col" AS "string_col_right",
"t3"."timestamp_col" AS "timestamp_col_right",
"t3"."year" AS "year_right",
"t3"."month" AS "month_right"
"t2"."id" AS "id_right",
"t2"."bool_col" AS "bool_col_right",
"t2"."tinyint_col" AS "tinyint_col_right",
"t2"."smallint_col" AS "smallint_col_right",
"t2"."int_col" AS "int_col_right",
"t2"."bigint_col" AS "bigint_col_right",
"t2"."float_col" AS "float_col_right",
"t2"."double_col" AS "double_col_right",
"t2"."date_string_col" AS "date_string_col_right",
"t2"."string_col" AS "string_col_right",
"t2"."timestamp_col" AS "timestamp_col_right",
"t2"."year" AS "year_right",
"t2"."month" AS "month_right"
FROM "functional_alltypes" AS "t1"
INNER JOIN "functional_alltypes" AS "t3"
ON "t1"."tinyint_col" < EXTRACT(minute FROM "t3"."timestamp_col")
) AS "t4"
INNER JOIN "functional_alltypes" AS "t2"
ON "t1"."tinyint_col" < EXTRACT(minute FROM "t2"."timestamp_col")
) AS "t3"
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
SELECT
"t2"."r_name",
"t6"."n_name"
"t5"."n_name"
FROM "tpch_region" AS "t2"
INNER JOIN "tpch_nation" AS "t3"
ON "t2"."r_regionkey" = "t3"."n_regionkey"
Expand All @@ -16,5 +16,5 @@ INNER JOIN (
FROM "tpch_region" AS "t2"
INNER JOIN "tpch_nation" AS "t3"
ON "t2"."r_regionkey" = "t3"."n_regionkey"
) AS "t6"
ON "t2"."r_regionkey" = "t6"."r_regionkey"
) AS "t5"
ON "t2"."r_regionkey" = "t5"."r_regionkey"
Original file line number Diff line number Diff line change
Expand Up @@ -7,25 +7,25 @@ WITH "t1" AS (
GROUP BY
1,
2
), "t6" AS (
), "t5" AS (
SELECT
"t3"."a",
"t3"."g",
"t3"."metric"
FROM "t1" AS "t3"
INNER JOIN "t1" AS "t5"
ON "t3"."g" = "t5"."g"
INNER JOIN "t1" AS "t4"
ON "t3"."g" = "t4"."g"
)
SELECT
"t9"."a",
"t9"."g",
"t9"."metric"
"t8"."a",
"t8"."g",
"t8"."metric"
FROM (
SELECT
*
FROM "t6" AS "t7"
FROM "t5" AS "t6"
UNION ALL
SELECT
*
FROM "t6" AS "t8"
) AS "t9"
FROM "t5" AS "t7"
) AS "t8"
Original file line number Diff line number Diff line change
Expand Up @@ -11,21 +11,21 @@ WITH "t1" AS (
3
)
SELECT
"t6"."g",
MAX("t6"."total" - "t6"."total_right") AS "metric"
"t5"."g",
MAX("t5"."total" - "t5"."total_right") AS "metric"
FROM (
SELECT
"t3"."g",
"t3"."a",
"t3"."b",
"t3"."total",
"t5"."g" AS "g_right",
"t5"."a" AS "a_right",
"t5"."b" AS "b_right",
"t5"."total" AS "total_right"
"t4"."g" AS "g_right",
"t4"."a" AS "a_right",
"t4"."b" AS "b_right",
"t4"."total" AS "total_right"
FROM "t1" AS "t3"
INNER JOIN "t1" AS "t5"
ON "t3"."a" = "t5"."b"
) AS "t6"
INNER JOIN "t1" AS "t4"
ON "t3"."a" = "t4"."b"
) AS "t5"
GROUP BY
1
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,9 @@ WITH "t9" AS (
SELECT
"t11"."region",
"t11"."year",
"t11"."total" - "t13"."total" AS "yoy_change"
"t11"."total" - "t12"."total" AS "yoy_change"
FROM "t9" AS "t11"
INNER JOIN "t9" AS "t13"
INNER JOIN "t9" AS "t12"
ON "t11"."year" = (
"t13"."year" - CAST(1 AS TINYINT)
"t12"."year" - CAST(1 AS TINYINT)
)
Original file line number Diff line number Diff line change
Expand Up @@ -16,5 +16,5 @@ INNER JOIN (
FROM "alltypes" AS "t1"
GROUP BY
1
) AS "t6"
ON "t3"."g" = "t6"."g"
) AS "t5"
ON "t3"."g" = "t5"."g"
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,5 @@ SELECT
"t1"."foo_id",
"t1"."bar_id"
FROM "star1" AS "t1"
INNER JOIN "star1" AS "t3"
ON "t1"."foo_id" = "t3"."bar_id"
INNER JOIN "star1" AS "t2"
ON "t1"."foo_id" = "t2"."bar_id"
Loading

0 comments on commit d7a31aa

Please sign in to comment.