Skip to content

Commit

Permalink
chore: add tpch q14-18 (#928)
Browse files Browse the repository at this point in the history
  • Loading branch information
Genesis929 authored Aug 29, 2024
1 parent 2f6cd9f commit 0413964
Show file tree
Hide file tree
Showing 10 changed files with 342 additions and 0 deletions.
25 changes: 25 additions & 0 deletions tests/benchmark/tpch/q14.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pathlib

import benchmark.utils as utils
import bigframes_vendored.tpch.queries.q14 as vendored_tpch_q14

if __name__ == "__main__":
    # The TPC-H configuration helper yields the dataset to query, the
    # session to run against, and a suffix that is forwarded to the
    # execution-time helper below.
    tpch_config = utils.get_tpch_configuration()
    dataset_id, session, suffix = tpch_config

    # Absolute location of this benchmark script, passed along with the query.
    script_path = pathlib.Path(__file__).absolute()

    # Hand the vendored q14 query function to the benchmark helper
    # (presumably it runs and times the query; see benchmark.utils).
    utils.get_execution_time(
        vendored_tpch_q14.q,
        script_path,
        suffix,
        dataset_id,
        session,
    )
25 changes: 25 additions & 0 deletions tests/benchmark/tpch/q15.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pathlib

import benchmark.utils as utils
import bigframes_vendored.tpch.queries.q15 as vendored_tpch_q15

if __name__ == "__main__":
    # The TPC-H configuration helper yields the dataset to query, the
    # session to run against, and a suffix that is forwarded to the
    # execution-time helper below.
    tpch_config = utils.get_tpch_configuration()
    dataset_id, session, suffix = tpch_config

    # Absolute location of this benchmark script, passed along with the query.
    script_path = pathlib.Path(__file__).absolute()

    # Hand the vendored q15 query function to the benchmark helper
    # (presumably it runs and times the query; see benchmark.utils).
    utils.get_execution_time(
        vendored_tpch_q15.q,
        script_path,
        suffix,
        dataset_id,
        session,
    )
25 changes: 25 additions & 0 deletions tests/benchmark/tpch/q16.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pathlib

import benchmark.utils as utils
import bigframes_vendored.tpch.queries.q16 as vendored_tpch_q16

if __name__ == "__main__":
    # The TPC-H configuration helper yields the dataset to query, the
    # session to run against, and a suffix that is forwarded to the
    # execution-time helper below.
    tpch_config = utils.get_tpch_configuration()
    dataset_id, session, suffix = tpch_config

    # Absolute location of this benchmark script, passed along with the query.
    script_path = pathlib.Path(__file__).absolute()

    # Hand the vendored q16 query function to the benchmark helper
    # (presumably it runs and times the query; see benchmark.utils).
    utils.get_execution_time(
        vendored_tpch_q16.q,
        script_path,
        suffix,
        dataset_id,
        session,
    )
25 changes: 25 additions & 0 deletions tests/benchmark/tpch/q17.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pathlib

import benchmark.utils as utils
import bigframes_vendored.tpch.queries.q17 as vendored_tpch_q17

if __name__ == "__main__":
    # The TPC-H configuration helper yields the dataset to query, the
    # session to run against, and a suffix that is forwarded to the
    # execution-time helper below.
    tpch_config = utils.get_tpch_configuration()
    dataset_id, session, suffix = tpch_config

    # Absolute location of this benchmark script, passed along with the query.
    script_path = pathlib.Path(__file__).absolute()

    # Hand the vendored q17 query function to the benchmark helper
    # (presumably it runs and times the query; see benchmark.utils).
    utils.get_execution_time(
        vendored_tpch_q17.q,
        script_path,
        suffix,
        dataset_id,
        session,
    )
25 changes: 25 additions & 0 deletions tests/benchmark/tpch/q18.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pathlib

import benchmark.utils as utils
import bigframes_vendored.tpch.queries.q18 as vendored_tpch_q18

if __name__ == "__main__":
    # The TPC-H configuration helper yields the dataset to query, the
    # session to run against, and a suffix that is forwarded to the
    # execution-time helper below.
    tpch_config = utils.get_tpch_configuration()
    dataset_id, session, suffix = tpch_config

    # Absolute location of this benchmark script, passed along with the query.
    script_path = pathlib.Path(__file__).absolute()

    # Hand the vendored q18 query function to the benchmark helper
    # (presumably it runs and times the query; see benchmark.utils).
    utils.get_execution_time(
        vendored_tpch_q18.q,
        script_path,
        suffix,
        dataset_id,
        session,
    )
34 changes: 34 additions & 0 deletions third_party/bigframes_vendored/tpch/queries/q14.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Contains code from https://github.com/pola-rs/tpch/blob/main/queries/polars/q14.py

from datetime import date

import bigframes


def q(dataset_id: str, session: bigframes.Session):
    """Run the bigframes port of TPC-H query 14.

    Joins LINEITEM to PART on the part key, keeps line items whose ship
    date falls in [1995-09-01, 1995-10-01), and computes the percentage of
    discounted revenue contributed by rows whose ``P_TYPE`` contains
    "PROMO".

    Args:
        dataset_id: Dataset name interpolated into the
            ``bigframes-dev-perf.<dataset_id>.<TABLE>`` table paths.
        session: Session used to read the source tables.

    Returns:
        None. The rounded percentage is computed and then discarded.
    """
    null_index = bigframes.enums.DefaultIndexKind.NULL
    lineitem = session.read_gbq(
        f"bigframes-dev-perf.{dataset_id}.LINEITEM", index_col=null_index
    )
    part = session.read_gbq(
        f"bigframes-dev-perf.{dataset_id}.PART", index_col=null_index
    )

    # Half-open ship-date window: start inclusive, end exclusive.
    window_start = date(1995, 9, 1)
    window_end = date(1995, 10, 1)

    joined = lineitem.merge(part, left_on="L_PARTKEY", right_on="P_PARTKEY")

    ship_date = joined["L_SHIPDATE"]
    in_window = (ship_date >= window_start) & (ship_date < window_end)
    shipped = joined[in_window]

    # Discounted price per line item; the promo flag (cast to an integer)
    # zeroes out the rows whose part type does not contain "PROMO".
    net_price = shipped["L_EXTENDEDPRICE"] * (1 - shipped["L_DISCOUNT"])
    promo_flag = shipped["P_TYPE"].str.contains("PROMO").astype("Int64")
    shipped["CONDI_REVENUE"] = net_price * promo_flag

    total_revenue = net_price.sum()
    promo_revenue = shipped["CONDI_REVENUE"].sum()

    promo_revenue_percent = 100.00 * promo_revenue / total_revenue

    # The value is intentionally unused; only the computation matters here.
    _ = round(promo_revenue_percent, 2)
48 changes: 48 additions & 0 deletions third_party/bigframes_vendored/tpch/queries/q15.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Contains code from https://github.com/pola-rs/tpch/blob/main/queries/polars/q15.py

from datetime import date

import bigframes
import bigframes.pandas as bpd


def q(dataset_id: str, session: bigframes.Session):
    """Run the bigframes port of TPC-H query 15.

    Sums discounted revenue per supplier for line items shipped in
    [1996-01-01, 1996-04-01), joins the totals to SUPPLIER, keeps the
    supplier row(s) whose total equals the maximum, and writes the
    result (ordered by ``S_SUPPKEY``) with ``to_gbq()``.

    Args:
        dataset_id: Dataset name interpolated into the
            ``bigframes-dev-perf.<dataset_id>.<TABLE>`` table paths.
        session: Session used to read the source tables.

    Returns:
        None. The final frame is materialized via ``to_gbq()``.
    """
    null_index = bigframes.enums.DefaultIndexKind.NULL
    lineitem = session.read_gbq(
        f"bigframes-dev-perf.{dataset_id}.LINEITEM", index_col=null_index
    )
    supplier = session.read_gbq(
        f"bigframes-dev-perf.{dataset_id}.SUPPLIER", index_col=null_index
    )

    # Half-open ship-date window: start inclusive, end exclusive.
    period_start = date(1996, 1, 1)
    period_end = date(1996, 4, 1)

    ship_date = lineitem["L_SHIPDATE"]
    in_period = (ship_date >= period_start) & (ship_date < period_end)
    shipped = lineitem[in_period]
    shipped["REVENUE"] = shipped["L_EXTENDEDPRICE"] * (1 - shipped["L_DISCOUNT"])

    # One row per supplier key with its summed revenue.
    revenue_by_supplier = shipped.groupby("L_SUPPKEY", as_index=False).agg(
        TOTAL_REVENUE=bpd.NamedAgg(column="REVENUE", aggfunc="sum")
    )
    revenue_by_supplier = revenue_by_supplier.rename(
        columns={"L_SUPPKEY": "SUPPLIER_NO"}
    )

    supplier_revenue = bpd.merge(
        supplier,
        revenue_by_supplier,
        left_on="S_SUPPKEY",
        right_on="SUPPLIER_NO",
    )

    # Keep only the supplier(s) tied for the highest total revenue.
    top_revenue = supplier_revenue["TOTAL_REVENUE"].max()
    top_suppliers = supplier_revenue[supplier_revenue["TOTAL_REVENUE"] == top_revenue]

    top_suppliers["TOTAL_REVENUE"] = top_suppliers["TOTAL_REVENUE"].round(2)

    output_columns = ["S_SUPPKEY", "S_NAME", "S_ADDRESS", "S_PHONE", "TOTAL_REVENUE"]
    q_final = top_suppliers[output_columns].sort_values("S_SUPPKEY")
    q_final.to_gbq()
44 changes: 44 additions & 0 deletions third_party/bigframes_vendored/tpch/queries/q16.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Contains code from https://github.com/pola-rs/tpch/blob/main/queries/polars/q16.py

import bigframes
import bigframes.pandas as bpd


def q(dataset_id: str, session: bigframes.Session):
    """Run the bigframes port of TPC-H query 16.

    Joins PART to PARTSUPP, drops brand "Brand#45", types containing
    "MEDIUM POLISHED" and sizes outside a fixed list, excludes suppliers
    whose comment matches ``Customer.*Complaints``, then counts distinct
    suppliers per (brand, type, size) and writes the sorted result with
    ``to_gbq()``.

    Args:
        dataset_id: Dataset name interpolated into the
            ``bigframes-dev-perf.<dataset_id>.<TABLE>`` table paths.
        session: Session used to read the source tables.

    Returns:
        None. The final frame is materialized via ``to_gbq()``.
    """
    null_index = bigframes.enums.DefaultIndexKind.NULL
    part = session.read_gbq(
        f"bigframes-dev-perf.{dataset_id}.PART", index_col=null_index
    )
    partsupp = session.read_gbq(
        f"bigframes-dev-perf.{dataset_id}.PARTSUPP", index_col=null_index
    )
    supplier = session.read_gbq(
        f"bigframes-dev-perf.{dataset_id}.SUPPLIER", index_col=null_index
    )

    excluded_brand = "Brand#45"
    wanted_sizes = [49, 14, 23, 45, 19, 3, 36, 9]

    # Keys of suppliers whose comment matches the complaints pattern.
    has_complaint = supplier["S_COMMENT"].str.contains(
        "Customer.*Complaints", regex=True
    )
    complaint_supplier_keys = supplier[has_complaint]["S_SUPPKEY"]

    # Apply the part-level filters one after another on the joined frame.
    candidates = part.merge(partsupp, left_on="P_PARTKEY", right_on="PS_PARTKEY")
    candidates = candidates[candidates["P_BRAND"] != excluded_brand]
    candidates = candidates[~candidates["P_TYPE"].str.contains("MEDIUM POLISHED")]
    candidates = candidates[candidates["P_SIZE"].isin(wanted_sizes)]

    # Drop rows supplied by any of the flagged suppliers.
    from_flagged_supplier = candidates["PS_SUPPKEY"].isin(complaint_supplier_keys)
    eligible = candidates[~from_flagged_supplier]

    supplier_counts = eligible.groupby(
        ["P_BRAND", "P_TYPE", "P_SIZE"], as_index=False
    ).agg(SUPPLIER_CNT=bpd.NamedAgg(column="PS_SUPPKEY", aggfunc="nunique"))

    # Largest supplier count first, then brand/type/size ascending.
    q_final = supplier_counts.sort_values(
        by=["SUPPLIER_CNT", "P_BRAND", "P_TYPE", "P_SIZE"],
        ascending=[False, True, True, True],
    )

    q_final.to_gbq()
40 changes: 40 additions & 0 deletions third_party/bigframes_vendored/tpch/queries/q17.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Contains code from https://github.com/pola-rs/tpch/blob/main/queries/polars/q17.py

import bigframes
import bigframes.pandas as bpd


def q(dataset_id: str, session: bigframes.Session):
    """Run the bigframes port of TPC-H query 17.

    Selects parts of brand "Brand#23" in "MED BOX" containers, left-joins
    their line items, and keeps the line items whose quantity is below
    0.2 times the mean quantity for that part. The summed extended price
    of those rows, divided by 7.0 and rounded to two places, is written
    as a single ``AVG_YEARLY`` value with ``to_gbq()``.

    Args:
        dataset_id: Dataset name interpolated into the
            ``bigframes-dev-perf.<dataset_id>.<TABLE>`` table paths.
        session: Session used to read the source tables.

    Returns:
        None. The final frame is materialized via ``to_gbq()``.
    """
    null_index = bigframes.enums.DefaultIndexKind.NULL
    lineitem = session.read_gbq(
        f"bigframes-dev-perf.{dataset_id}.LINEITEM", index_col=null_index
    )
    part = session.read_gbq(
        f"bigframes-dev-perf.{dataset_id}.PART", index_col=null_index
    )

    brand = "Brand#23"
    container = "MED BOX"

    is_target_part = (part["P_BRAND"] == brand) & (part["P_CONTAINER"] == container)
    target_parts = part[is_target_part]

    part_lines = bpd.merge(
        target_parts,
        lineitem,
        how="left",
        left_on="P_PARTKEY",
        right_on="L_PARTKEY",
    )

    # Per-part quantity threshold: 0.2 times the mean line-item quantity.
    thresholds = part_lines.groupby("P_PARTKEY", as_index=False).agg(
        AVG_QUANTITY=bpd.NamedAgg(column="L_QUANTITY", aggfunc="mean")
    )
    thresholds = thresholds.rename(columns={"P_PARTKEY": "KEY"})
    thresholds["AVG_QUANTITY"] = thresholds["AVG_QUANTITY"] * 0.2

    with_threshold = bpd.merge(
        thresholds, part_lines, left_on="KEY", right_on="P_PARTKEY"
    )
    small_orders = with_threshold[
        with_threshold["L_QUANTITY"] < with_threshold["AVG_QUANTITY"]
    ]

    avg_yearly = (small_orders["L_EXTENDEDPRICE"].sum() / 7.0).round(2)
    q_final = bpd.DataFrame({"AVG_YEARLY": [avg_yearly]})

    q_final.to_gbq()
51 changes: 51 additions & 0 deletions third_party/bigframes_vendored/tpch/queries/q18.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Contains code from https://github.com/pola-rs/tpch/blob/main/queries/polars/q18.py

import typing

import bigframes
import bigframes.pandas as bpd


def q(dataset_id: str, session: bigframes.Session):
    """Run the bigframes port of TPC-H query 18.

    Finds orders whose summed line-item quantity exceeds 300, joins them
    back to LINEITEM and CUSTOMER, sums the quantity per
    (customer, order) group, and writes the first 100 rows, ordered by
    total price descending and then order date ascending, with
    ``to_gbq()``.

    Args:
        dataset_id: Dataset name interpolated into the
            ``bigframes-dev-perf.<dataset_id>.<TABLE>`` table paths.
        session: Session used to read the source tables.

    Returns:
        None. The final frame is materialized via ``to_gbq()``.
    """
    null_index = bigframes.enums.DefaultIndexKind.NULL
    customer = session.read_gbq(
        f"bigframes-dev-perf.{dataset_id}.CUSTOMER", index_col=null_index
    )
    lineitem = session.read_gbq(
        f"bigframes-dev-perf.{dataset_id}.LINEITEM", index_col=null_index
    )
    orders = session.read_gbq(
        f"bigframes-dev-perf.{dataset_id}.ORDERS", index_col=null_index
    )

    quantity_threshold = 300

    # Total quantity per order, keeping only orders above the threshold.
    order_quantities = lineitem.groupby("L_ORDERKEY", as_index=False).agg(
        SUM_QUANTITY=bpd.NamedAgg(column="L_QUANTITY", aggfunc="sum")
    )
    large_order_keys = order_quantities[
        order_quantities["SUM_QUANTITY"] > quantity_threshold
    ]

    large_orders = orders.merge(
        large_order_keys, left_on="O_ORDERKEY", right_on="L_ORDERKEY", how="inner"
    )

    # Re-attach the individual line items, then the owning customer.
    detail = large_orders.merge(
        lineitem, left_on="O_ORDERKEY", right_on="L_ORDERKEY"
    )
    detail = detail.merge(customer, left_on="O_CUSTKEY", right_on="C_CUSTKEY")

    group_keys = ["C_NAME", "C_CUSTKEY", "O_ORDERKEY", "O_ORDERDATE", "O_TOTALPRICE"]
    summary = detail.groupby(group_keys, as_index=False).agg(
        COL6=bpd.NamedAgg(column="L_QUANTITY", aggfunc="sum")
    )

    # The output column is named "O_ORDERDAT" (without the trailing "E").
    summary = summary.rename(columns={"O_ORDERDATE": "O_ORDERDAT"})

    ordered = typing.cast(bpd.DataFrame, summary).sort_values(
        ["O_TOTALPRICE", "O_ORDERDAT"], ascending=[False, True]
    )

    q_final = ordered.head(100)
    q_final.to_gbq()

0 comments on commit 0413964

Please sign in to comment.