Skip to content

Commit

Permalink
test(datafusion): run tpch correctness tests (#9296)
Browse files Browse the repository at this point in the history
  • Loading branch information
cpcloud authored Jun 5, 2024
1 parent 9eb1ed1 commit 6136ce0
Show file tree
Hide file tree
Showing 25 changed files with 1,515 additions and 1 deletion.
11 changes: 11 additions & 0 deletions ibis/backends/datafusion/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,11 @@ class TestConf(BackendTest):
supports_structs = False
supports_json = False
supports_arrays = True
supports_tpch = True
stateful = False
deps = ("datafusion",)
# TPC-H query 1 seems to need a slightly looser absolute tolerance for float comparisons
tpch_absolute_tolerance = 0.11

def _load_data(self, **_: Any) -> None:
con = self.connection
Expand All @@ -35,6 +38,14 @@ def _load_data(self, **_: Any) -> None:
# Return a DataFusion connection by forwarding **kw to ibis.datafusion.connect.
# tmpdir and worker_id are accepted as keyword-only arguments but are not used
# in this body.
def connect(*, tmpdir, worker_id, **kw):
return ibis.datafusion.connect(**kw)

def load_tpch(self) -> None:
    """Register each TPC-H parquet file as a table on the connection.

    Every ``*.parquet`` file found in ``<data_dir>/tpch/sf=0.17/parquet`` is
    passed to the connection's ``read_parquet``; the table name is the file
    name with its final suffix removed.
    """
    backend_con = self.connection
    parquet_dir = self.data_dir.joinpath("tpch", "sf=0.17", "parquet")
    for parquet_file in parquet_dir.glob("*.parquet"):
        # ``stem`` is the file name without its last suffix, e.g.
        # "lineitem.parquet" -> "lineitem".
        backend_con.read_parquet(parquet_file, table_name=parquet_file.stem)


@pytest.fixture(scope="session")
def con(data_dir, tmp_path_factory, worker_id):
Expand Down
2 changes: 2 additions & 0 deletions ibis/backends/tests/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ class BackendTest(abc.ABC):
"Name of round method to use for rounding test comparisons."
driver_supports_multiple_statements: bool = False
"Whether the driver supports executing multiple statements in a single call."
tpch_absolute_tolerance: float | None = None
"Absolute tolerance for floating point comparisons with pytest.approx in TPC-H correctness tests."

@property
@abc.abstractmethod
Expand Down
6 changes: 5 additions & 1 deletion ibis/backends/tests/tpch/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,11 @@ def wrapper(*args, backend, snapshot, **kwargs):
left = result.loc[:, column]
right = expected.loc[:, column]
assert (
pytest.approx(left.values.tolist(), nan_ok=True)
pytest.approx(
left.values.tolist(),
nan_ok=True,
abs=backend.tpch_absolute_tolerance,
)
== right.values.tolist()
)

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
-- Summarises "lineitem" rows whose l_shipdate is on or before 1998-09-02,
-- grouped by l_returnflag and l_linestatus. Per group it returns the sums of
-- quantity, extended price, discounted price and discounted price with tax,
-- the averages of quantity, extended price and discount, and a row count.
-- Rows are ordered by l_returnflag, then l_linestatus.
-- NOTE(review): the shape looks like TPC-H query 1; confirm against the test
-- that owns this snapshot.
SELECT
*
FROM (
SELECT
"t1"."l_returnflag",
"t1"."l_linestatus",
SUM("t1"."l_quantity") AS "sum_qty",
SUM("t1"."l_extendedprice") AS "sum_base_price",
SUM("t1"."l_extendedprice" * (
1 - "t1"."l_discount"
)) AS "sum_disc_price",
SUM(
(
"t1"."l_extendedprice" * (
1 - "t1"."l_discount"
)
) * (
"t1"."l_tax" + 1
)
) AS "sum_charge",
AVG("t1"."l_quantity") AS "avg_qty",
AVG("t1"."l_extendedprice") AS "avg_price",
AVG("t1"."l_discount") AS "avg_disc",
COUNT(*) AS "count_order"
FROM (
SELECT
"t1"."l_orderkey",
"t1"."l_partkey",
"t1"."l_suppkey",
"t1"."l_linenumber",
"t1"."l_quantity",
"t1"."l_extendedprice",
"t1"."l_discount",
"t1"."l_tax",
"t1"."l_shipdate",
"t1"."l_commitdate",
"t1"."l_receiptdate",
"t1"."l_shipinstruct",
"t1"."l_shipmode",
"t1"."l_comment",
"t1"."l_returnflag",
"t1"."l_linestatus"
FROM (
SELECT
*
FROM "lineitem" AS "t0"
WHERE
"t0"."l_shipdate" <= DATE_TRUNC('DAY', '1998-09-02')
) AS "t1"
) AS t1
GROUP BY
"t1"."l_returnflag",
"t1"."l_linestatus"
) AS "t2"
ORDER BY
"t2"."l_returnflag" ASC,
"t2"."l_linestatus" ASC
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
-- Joins part, partsupp, supplier, nation and region, then keeps rows where
-- p_size = 15, p_type ends in 'BRASS', r_name = 'EUROPE', and ps_supplycost
-- equals the minimum ps_supplycost for the same part among 'EUROPE' suppliers
-- (the correlated subquery matches on p_partkey = ps_partkey).
-- Returns at most 100 rows ordered by s_acctbal descending (nulls last), then
-- n_name, s_name and p_partkey ascending.
-- NOTE(review): the shape looks like TPC-H query 2; confirm against the test
-- that owns this snapshot.
SELECT
"t14"."s_acctbal",
"t14"."s_name",
"t14"."n_name",
"t14"."p_partkey",
"t14"."p_mfgr",
"t14"."s_address",
"t14"."s_phone",
"t14"."s_comment"
FROM (
SELECT
"t5"."p_partkey",
"t5"."p_name",
"t5"."p_mfgr",
"t5"."p_brand",
"t5"."p_type",
"t5"."p_size",
"t5"."p_container",
"t5"."p_retailprice",
"t5"."p_comment",
"t6"."ps_partkey",
"t6"."ps_suppkey",
"t6"."ps_availqty",
"t6"."ps_supplycost",
"t6"."ps_comment",
"t8"."s_suppkey",
"t8"."s_name",
"t8"."s_address",
"t8"."s_nationkey",
"t8"."s_phone",
"t8"."s_acctbal",
"t8"."s_comment",
"t10"."n_nationkey",
"t10"."n_name",
"t10"."n_regionkey",
"t10"."n_comment",
"t12"."r_regionkey",
"t12"."r_name",
"t12"."r_comment"
FROM "part" AS "t5"
INNER JOIN "partsupp" AS "t6"
ON "t5"."p_partkey" = "t6"."ps_partkey"
INNER JOIN "supplier" AS "t8"
ON "t8"."s_suppkey" = "t6"."ps_suppkey"
INNER JOIN "nation" AS "t10"
ON "t8"."s_nationkey" = "t10"."n_nationkey"
INNER JOIN "region" AS "t12"
ON "t10"."n_regionkey" = "t12"."r_regionkey"
) AS "t14"
WHERE
"t14"."p_size" = 15
AND "t14"."p_type" LIKE '%BRASS'
AND "t14"."r_name" = 'EUROPE'
AND "t14"."ps_supplycost" = (
SELECT
MIN("t16"."ps_supplycost") AS "Min(ps_supplycost)"
FROM (
SELECT
*
FROM (
SELECT
"t7"."ps_partkey",
"t7"."ps_suppkey",
"t7"."ps_availqty",
"t7"."ps_supplycost",
"t7"."ps_comment",
"t9"."s_suppkey",
"t9"."s_name",
"t9"."s_address",
"t9"."s_nationkey",
"t9"."s_phone",
"t9"."s_acctbal",
"t9"."s_comment",
"t11"."n_nationkey",
"t11"."n_name",
"t11"."n_regionkey",
"t11"."n_comment",
"t13"."r_regionkey",
"t13"."r_name",
"t13"."r_comment"
FROM "partsupp" AS "t7"
INNER JOIN "supplier" AS "t9"
ON "t9"."s_suppkey" = "t7"."ps_suppkey"
INNER JOIN "nation" AS "t11"
ON "t9"."s_nationkey" = "t11"."n_nationkey"
INNER JOIN "region" AS "t13"
ON "t11"."n_regionkey" = "t13"."r_regionkey"
) AS "t15"
WHERE
"t15"."r_name" = 'EUROPE' AND "t14"."p_partkey" = "t15"."ps_partkey"
) AS "t16"
)
ORDER BY
"t14"."s_acctbal" DESC NULLS LAST,
"t14"."n_name" ASC,
"t14"."s_name" ASC,
"t14"."p_partkey" ASC
LIMIT 100
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
-- Joins customer, orders and lineitem, keeps rows with
-- c_mktsegment = 'BUILDING', o_orderdate before 1995-03-15 and l_shipdate
-- after 1995-03-15, then sums l_extendedprice * (1 - l_discount) as "revenue"
-- per (l_orderkey, o_orderdate, o_shippriority).
-- Returns the first 10 rows ordered by revenue descending (nulls last), then
-- o_orderdate ascending.
-- NOTE(review): the shape looks like TPC-H query 3; confirm against the test
-- that owns this snapshot.
SELECT
"t8"."l_orderkey",
"t8"."revenue",
"t8"."o_orderdate",
"t8"."o_shippriority"
FROM (
SELECT
"t7"."l_orderkey",
"t7"."o_orderdate",
"t7"."o_shippriority",
SUM("t7"."l_extendedprice" * (
1 - "t7"."l_discount"
)) AS "revenue"
FROM (
SELECT
"t7"."c_custkey",
"t7"."c_name",
"t7"."c_address",
"t7"."c_nationkey",
"t7"."c_phone",
"t7"."c_acctbal",
"t7"."c_mktsegment",
"t7"."c_comment",
"t7"."o_orderkey",
"t7"."o_custkey",
"t7"."o_orderstatus",
"t7"."o_totalprice",
"t7"."o_orderpriority",
"t7"."o_clerk",
"t7"."o_comment",
"t7"."l_partkey",
"t7"."l_suppkey",
"t7"."l_linenumber",
"t7"."l_quantity",
"t7"."l_extendedprice",
"t7"."l_discount",
"t7"."l_tax",
"t7"."l_returnflag",
"t7"."l_linestatus",
"t7"."l_shipdate",
"t7"."l_commitdate",
"t7"."l_receiptdate",
"t7"."l_shipinstruct",
"t7"."l_shipmode",
"t7"."l_comment",
"t7"."l_orderkey",
"t7"."o_orderdate",
"t7"."o_shippriority"
FROM (
SELECT
*
FROM (
SELECT
"t3"."c_custkey",
"t3"."c_name",
"t3"."c_address",
"t3"."c_nationkey",
"t3"."c_phone",
"t3"."c_acctbal",
"t3"."c_mktsegment",
"t3"."c_comment",
"t4"."o_orderkey",
"t4"."o_custkey",
"t4"."o_orderstatus",
"t4"."o_totalprice",
"t4"."o_orderdate",
"t4"."o_orderpriority",
"t4"."o_clerk",
"t4"."o_shippriority",
"t4"."o_comment",
"t5"."l_orderkey",
"t5"."l_partkey",
"t5"."l_suppkey",
"t5"."l_linenumber",
"t5"."l_quantity",
"t5"."l_extendedprice",
"t5"."l_discount",
"t5"."l_tax",
"t5"."l_returnflag",
"t5"."l_linestatus",
"t5"."l_shipdate",
"t5"."l_commitdate",
"t5"."l_receiptdate",
"t5"."l_shipinstruct",
"t5"."l_shipmode",
"t5"."l_comment"
FROM "customer" AS "t3"
INNER JOIN "orders" AS "t4"
ON "t3"."c_custkey" = "t4"."o_custkey"
INNER JOIN "lineitem" AS "t5"
ON "t5"."l_orderkey" = "t4"."o_orderkey"
) AS "t6"
WHERE
"t6"."c_mktsegment" = 'BUILDING'
AND "t6"."o_orderdate" < DATE_TRUNC('DAY', '1995-03-15')
AND "t6"."l_shipdate" > DATE_TRUNC('DAY', '1995-03-15')
) AS "t7"
) AS t7
GROUP BY
"t7"."l_orderkey",
"t7"."o_orderdate",
"t7"."o_shippriority"
) AS "t8"
ORDER BY
"t8"."revenue" DESC NULLS LAST,
"t8"."o_orderdate" ASC
LIMIT 10
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
-- Counts orders per o_orderpriority for orders with o_orderdate on or after
-- 1993-07-01 and before 1993-10-01 that have at least one "lineitem" row with
-- l_commitdate earlier than l_receiptdate (the EXISTS subquery).
-- Rows are ordered by o_orderpriority ascending.
-- NOTE(review): the shape looks like TPC-H query 4; confirm against the test
-- that owns this snapshot.
SELECT
*
FROM (
SELECT
"t3"."o_orderpriority",
COUNT(*) AS "order_count"
FROM (
SELECT
"t3"."o_orderkey",
"t3"."o_custkey",
"t3"."o_orderstatus",
"t3"."o_totalprice",
"t3"."o_orderdate",
"t3"."o_clerk",
"t3"."o_shippriority",
"t3"."o_comment",
"t3"."o_orderpriority"
FROM (
SELECT
*
FROM "orders" AS "t0"
WHERE
EXISTS(
SELECT
1
FROM "lineitem" AS "t1"
WHERE
(
"t1"."l_orderkey" = "t0"."o_orderkey"
)
AND (
"t1"."l_commitdate" < "t1"."l_receiptdate"
)
)
AND "t0"."o_orderdate" >= DATE_TRUNC('DAY', '1993-07-01')
AND "t0"."o_orderdate" < DATE_TRUNC('DAY', '1993-10-01')
) AS "t3"
) AS t3
GROUP BY
"t3"."o_orderpriority"
) AS "t4"
ORDER BY
"t4"."o_orderpriority" ASC
Loading

0 comments on commit 6136ce0

Please sign in to comment.