diff --git a/tests/benchmark/tpch/config.jsonl b/tests/benchmark/tpch/config.jsonl index 151100594a..1c8b4dd19a 100644 --- a/tests/benchmark/tpch/config.jsonl +++ b/tests/benchmark/tpch/config.jsonl @@ -6,5 +6,3 @@ {"benchmark_suffix": "100g_unordered", "dataset_id": "tpch_0100g", "ordered": false} {"benchmark_suffix": "1t_ordered", "dataset_id": "tpch_0001t", "ordered": true} {"benchmark_suffix": "1t_unordered", "dataset_id": "tpch_0001t", "ordered": false} -{"benchmark_suffix": "10t_ordered", "dataset_id": "tpch_0010t", "ordered": true} -{"benchmark_suffix": "10t_unordered", "dataset_id": "tpch_0010t", "ordered": false} diff --git a/tests/benchmark/tpch/q4.py b/tests/benchmark/tpch/q4.py new file mode 100644 index 0000000000..aa67cc77a0 --- /dev/null +++ b/tests/benchmark/tpch/q4.py @@ -0,0 +1,25 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import pathlib + +import benchmark.utils as utils +import bigframes_vendored.tpch.queries.q4 as vendored_tpch_q4 + +if __name__ == "__main__": + dataset_id, session, suffix = utils.get_tpch_configuration() + current_path = pathlib.Path(__file__).absolute() + + utils.get_execution_time( + vendored_tpch_q4.q, current_path, suffix, dataset_id, session + ) diff --git a/tests/benchmark/tpch/q5.py b/tests/benchmark/tpch/q5.py new file mode 100644 index 0000000000..e4b3cb0f51 --- /dev/null +++ b/tests/benchmark/tpch/q5.py @@ -0,0 +1,25 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pathlib + +import benchmark.utils as utils +import bigframes_vendored.tpch.queries.q5 as vendored_tpch_q5 + +if __name__ == "__main__": + dataset_id, session, suffix = utils.get_tpch_configuration() + current_path = pathlib.Path(__file__).absolute() + + utils.get_execution_time( + vendored_tpch_q5.q, current_path, suffix, dataset_id, session + ) diff --git a/tests/benchmark/tpch/q6.py b/tests/benchmark/tpch/q6.py new file mode 100644 index 0000000000..a193333045 --- /dev/null +++ b/tests/benchmark/tpch/q6.py @@ -0,0 +1,25 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pathlib + +import benchmark.utils as utils +import bigframes_vendored.tpch.queries.q6 as vendored_tpch_q6 + +if __name__ == "__main__": + dataset_id, session, suffix = utils.get_tpch_configuration() + current_path = pathlib.Path(__file__).absolute() + + utils.get_execution_time( + vendored_tpch_q6.q, current_path, suffix, dataset_id, session + ) diff --git a/tests/benchmark/tpch/q7.py b/tests/benchmark/tpch/q7.py new file mode 100644 index 0000000000..8a17eb91ea --- /dev/null +++ b/tests/benchmark/tpch/q7.py @@ -0,0 +1,25 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import pathlib + +import benchmark.utils as utils +import bigframes_vendored.tpch.queries.q7 as vendored_tpch_q7 + +if __name__ == "__main__": + dataset_id, session, suffix = utils.get_tpch_configuration() + current_path = pathlib.Path(__file__).absolute() + + utils.get_execution_time( + vendored_tpch_q7.q, current_path, suffix, dataset_id, session + ) diff --git a/tests/benchmark/tpch/q8.py b/tests/benchmark/tpch/q8.py new file mode 100644 index 0000000000..b5e7d7aa37 --- /dev/null +++ b/tests/benchmark/tpch/q8.py @@ -0,0 +1,25 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import pathlib + +import benchmark.utils as utils +import bigframes_vendored.tpch.queries.q8 as vendored_tpch_q8 + +if __name__ == "__main__": + dataset_id, session, suffix = utils.get_tpch_configuration() + current_path = pathlib.Path(__file__).absolute() + + utils.get_execution_time( + vendored_tpch_q8.q, current_path, suffix, dataset_id, session + ) diff --git a/third_party/bigframes_vendored/tpch/queries/q4.py b/third_party/bigframes_vendored/tpch/queries/q4.py new file mode 100644 index 0000000000..9cc6f73c98 --- /dev/null +++ b/third_party/bigframes_vendored/tpch/queries/q4.py @@ -0,0 +1,38 @@ +# Contains code from https://github.com/pola-rs/tpch/blob/main/queries/pandas/q4.py + + +from datetime import date +import typing + +import bigframes +import bigframes.pandas as bpd + + +def q(dataset_id: str, session: bigframes.Session): + lineitem = session.read_gbq( + f"bigframes-dev-perf.{dataset_id}.LINEITEM", + index_col=bigframes.enums.DefaultIndexKind.NULL, + ) + orders = session.read_gbq( + f"bigframes-dev-perf.{dataset_id}.ORDERS", + index_col=bigframes.enums.DefaultIndexKind.NULL, + ) + + var1 = date(1993, 7, 1) + var2 = date(1993, 10, 1) + + jn = lineitem.merge(orders, left_on="L_ORDERKEY", right_on="O_ORDERKEY") + + jn = jn[(jn["O_ORDERDATE"] >= var1) & (jn["O_ORDERDATE"] < var2)] + jn = jn[jn["L_COMMITDATE"] < jn["L_RECEIPTDATE"]] + + if not session._strictly_ordered: + jn = jn.sort_values(by=["O_ORDERPRIORITY", "L_ORDERKEY"]) + + jn = jn.drop_duplicates(subset=["O_ORDERPRIORITY", "L_ORDERKEY"]) + + gb = jn.groupby("O_ORDERPRIORITY", as_index=False) + agg = gb.agg(ORDER_COUNT=bpd.NamedAgg(column="L_ORDERKEY", aggfunc="count")) + + result_df = typing.cast(bpd.DataFrame, agg).sort_values(["O_ORDERPRIORITY"]) + result_df.to_gbq() diff --git a/third_party/bigframes_vendored/tpch/queries/q5.py b/third_party/bigframes_vendored/tpch/queries/q5.py new file mode 100644 index 0000000000..20bd8d9c44 --- /dev/null +++ 
b/third_party/bigframes_vendored/tpch/queries/q5.py @@ -0,0 +1,55 @@ +# Contains code from https://github.com/pola-rs/tpch/blob/main/queries/pandas/q5.py + +from datetime import date + +import bigframes + + +def q(dataset_id: str, session: bigframes.Session): + region = session.read_gbq( + f"bigframes-dev-perf.{dataset_id}.REGION", + index_col=bigframes.enums.DefaultIndexKind.NULL, + ) + nation = session.read_gbq( + f"bigframes-dev-perf.{dataset_id}.NATION", + index_col=bigframes.enums.DefaultIndexKind.NULL, + ) + customer = session.read_gbq( + f"bigframes-dev-perf.{dataset_id}.CUSTOMER", + index_col=bigframes.enums.DefaultIndexKind.NULL, + ) + lineitem = session.read_gbq( + f"bigframes-dev-perf.{dataset_id}.LINEITEM", + index_col=bigframes.enums.DefaultIndexKind.NULL, + ) + orders = session.read_gbq( + f"bigframes-dev-perf.{dataset_id}.ORDERS", + index_col=bigframes.enums.DefaultIndexKind.NULL, + ) + supplier = session.read_gbq( + f"bigframes-dev-perf.{dataset_id}.SUPPLIER", + index_col=bigframes.enums.DefaultIndexKind.NULL, + ) + + var1 = "ASIA" + var2 = date(1994, 1, 1) + var3 = date(1995, 1, 1) + + jn1 = region.merge(nation, left_on="R_REGIONKEY", right_on="N_REGIONKEY") + jn2 = jn1.merge(customer, left_on="N_NATIONKEY", right_on="C_NATIONKEY") + jn3 = jn2.merge(orders, left_on="C_CUSTKEY", right_on="O_CUSTKEY") + jn4 = jn3.merge(lineitem, left_on="O_ORDERKEY", right_on="L_ORDERKEY") + jn5 = jn4.merge( + supplier, + left_on=["L_SUPPKEY", "N_NATIONKEY"], + right_on=["S_SUPPKEY", "S_NATIONKEY"], + ) + + jn5 = jn5[jn5["R_NAME"] == var1] + jn5 = jn5[(jn5["O_ORDERDATE"] >= var2) & (jn5["O_ORDERDATE"] < var3)] + jn5["REVENUE"] = jn5["L_EXTENDEDPRICE"] * (1.0 - jn5["L_DISCOUNT"]) + + gb = jn5.groupby("N_NAME", as_index=False)["REVENUE"].sum() + result_df = gb.sort_values("REVENUE", ascending=False) + + result_df.to_gbq() diff --git a/third_party/bigframes_vendored/tpch/queries/q6.py b/third_party/bigframes_vendored/tpch/queries/q6.py new file mode 100644 index 
0000000000..13341c4f4e --- /dev/null +++ b/third_party/bigframes_vendored/tpch/queries/q6.py @@ -0,0 +1,30 @@ +# Contains code from https://github.com/pola-rs/tpch/blob/main/queries/pandas/q6.py + +from datetime import date + +import bigframes + + +def q(dataset_id: str, session: bigframes.Session): + lineitem = session.read_gbq( + f"bigframes-dev-perf.{dataset_id}.LINEITEM", + index_col=bigframes.enums.DefaultIndexKind.NULL, + ) + + var1 = date(1994, 1, 1) + var2 = date(1995, 1, 1) + var3 = 0.05 + var4 = 0.07 + var5 = 24 + + filt = lineitem[(lineitem["L_SHIPDATE"] >= var1) & (lineitem["L_SHIPDATE"] < var2)] + filt = filt[(filt["L_DISCOUNT"] >= var3) & (filt["L_DISCOUNT"] <= var4)] + filt = filt[filt["L_QUANTITY"] < var5] + result_df = ( + (filt["L_EXTENDEDPRICE"] * filt["L_DISCOUNT"]) + .agg(["sum"]) + .rename("REVENUE") + .to_frame() + ) + + result_df.to_gbq() diff --git a/third_party/bigframes_vendored/tpch/queries/q7.py b/third_party/bigframes_vendored/tpch/queries/q7.py new file mode 100644 index 0000000000..4ea5e6b238 --- /dev/null +++ b/third_party/bigframes_vendored/tpch/queries/q7.py @@ -0,0 +1,79 @@ +# Contains code from https://github.com/pola-rs/tpch/blob/main/queries/pandas/q7.py + +from datetime import date +import typing + +import bigframes +import bigframes.dataframe +import bigframes.pandas as bpd + + +def q(dataset_id: str, session: bigframes.Session): + nation = session.read_gbq( + f"bigframes-dev-perf.{dataset_id}.NATION", + index_col=bigframes.enums.DefaultIndexKind.NULL, + ) + customer = session.read_gbq( + f"bigframes-dev-perf.{dataset_id}.CUSTOMER", + index_col=bigframes.enums.DefaultIndexKind.NULL, + ) + lineitem = session.read_gbq( + f"bigframes-dev-perf.{dataset_id}.LINEITEM", + index_col=bigframes.enums.DefaultIndexKind.NULL, + ) + orders = session.read_gbq( + f"bigframes-dev-perf.{dataset_id}.ORDERS", + index_col=bigframes.enums.DefaultIndexKind.NULL, + ) + supplier = session.read_gbq( + f"bigframes-dev-perf.{dataset_id}.SUPPLIER", + 
index_col=bigframes.enums.DefaultIndexKind.NULL, + ) + + var1 = "FRANCE" + var2 = "GERMANY" + var3 = date(1995, 1, 1) + var4 = date(1996, 12, 31) + + n1 = nation[(nation["N_NAME"] == var1)] + n2 = nation[(nation["N_NAME"] == var2)] + + jn1 = customer.merge(n1, left_on="C_NATIONKEY", right_on="N_NATIONKEY") + jn2 = jn1.merge(orders, left_on="C_CUSTKEY", right_on="O_CUSTKEY") + jn2 = jn2.rename(columns={"N_NAME": "CUST_NATION"}) + jn3 = jn2.merge(lineitem, left_on="O_ORDERKEY", right_on="L_ORDERKEY") + jn4 = jn3.merge(supplier, left_on="L_SUPPKEY", right_on="S_SUPPKEY") + jn5 = jn4.merge(n2, left_on="S_NATIONKEY", right_on="N_NATIONKEY") + df1 = jn5.rename(columns={"N_NAME": "SUPP_NATION"}) + + jn1 = customer.merge(n2, left_on="C_NATIONKEY", right_on="N_NATIONKEY") + jn2 = jn1.merge(orders, left_on="C_CUSTKEY", right_on="O_CUSTKEY") + jn2 = jn2.rename(columns={"N_NAME": "CUST_NATION"}) + jn3 = jn2.merge(lineitem, left_on="O_ORDERKEY", right_on="L_ORDERKEY") + jn4 = jn3.merge(supplier, left_on="L_SUPPKEY", right_on="S_SUPPKEY") + jn5 = jn4.merge(n1, left_on="S_NATIONKEY", right_on="N_NATIONKEY") + df2 = jn5.rename(columns={"N_NAME": "SUPP_NATION"}) + + total = bpd.concat([df1, df2]) + + # TODO(huanc): TEMPORARY CODE to force a fresh start. Currently, + # combining everything into a single query seems to trigger a bug + # causing incorrect results. This workaround involves writing to and + # then reading from BigQuery. Remove this once b/355714291 is + # resolved. 
+ dest = total.to_gbq() + total = bpd.read_gbq(dest) + + total = total[(total["L_SHIPDATE"] >= var3) & (total["L_SHIPDATE"] <= var4)] + total["VOLUME"] = total["L_EXTENDEDPRICE"] * (1.0 - total["L_DISCOUNT"]) + total["L_YEAR"] = total["L_SHIPDATE"].dt.year + + gb = typing.cast(bpd.DataFrame, total).groupby( + ["SUPP_NATION", "CUST_NATION", "L_YEAR"], as_index=False + ) + agg = gb.agg(REVENUE=bpd.NamedAgg(column="VOLUME", aggfunc="sum")) + + result_df = typing.cast(bpd.DataFrame, agg).sort_values( + ["SUPP_NATION", "CUST_NATION", "L_YEAR"] + ) + result_df.to_gbq() diff --git a/third_party/bigframes_vendored/tpch/queries/q8.py b/third_party/bigframes_vendored/tpch/queries/q8.py new file mode 100644 index 0000000000..4520fce14a --- /dev/null +++ b/third_party/bigframes_vendored/tpch/queries/q8.py @@ -0,0 +1,78 @@ +# Contains code from https://github.com/pola-rs/tpch/blob/main/queries/pandas/q8.py + +from datetime import date + +import bigframes + + +def q(dataset_id: str, session: bigframes.Session): + customer = session.read_gbq( + f"bigframes-dev-perf.{dataset_id}.CUSTOMER", + index_col=bigframes.enums.DefaultIndexKind.NULL, + ) + lineitem = session.read_gbq( + f"bigframes-dev-perf.{dataset_id}.LINEITEM", + index_col=bigframes.enums.DefaultIndexKind.NULL, + ) + nation = session.read_gbq( + f"bigframes-dev-perf.{dataset_id}.NATION", + index_col=bigframes.enums.DefaultIndexKind.NULL, + ) + orders = session.read_gbq( + f"bigframes-dev-perf.{dataset_id}.ORDERS", + index_col=bigframes.enums.DefaultIndexKind.NULL, + ) + part = session.read_gbq( + f"bigframes-dev-perf.{dataset_id}.PART", + index_col=bigframes.enums.DefaultIndexKind.NULL, + ) + region = session.read_gbq( + f"bigframes-dev-perf.{dataset_id}.REGION", + index_col=bigframes.enums.DefaultIndexKind.NULL, + ) + supplier = session.read_gbq( + f"bigframes-dev-perf.{dataset_id}.SUPPLIER", + index_col=bigframes.enums.DefaultIndexKind.NULL, + ) + + var1 = "BRAZIL" + var2 = "AMERICA" + var3 = "ECONOMY ANODIZED STEEL" + 
var4 = date(1995, 1, 1) + var5 = date(1996, 12, 31) + + n1 = nation[["N_NATIONKEY", "N_REGIONKEY"]] + n2 = nation[["N_NATIONKEY", "N_NAME"]] + + jn1 = part.merge(lineitem, left_on="P_PARTKEY", right_on="L_PARTKEY") + jn2 = jn1.merge(supplier, left_on="L_SUPPKEY", right_on="S_SUPPKEY") + jn3 = jn2.merge(orders, left_on="L_ORDERKEY", right_on="O_ORDERKEY") + jn4 = jn3.merge(customer, left_on="O_CUSTKEY", right_on="C_CUSTKEY") + jn5 = jn4.merge(n1, left_on="C_NATIONKEY", right_on="N_NATIONKEY") + jn6 = jn5.merge(region, left_on="N_REGIONKEY", right_on="R_REGIONKEY") + + jn6 = jn6[(jn6["R_NAME"] == var2)] + + jn7 = jn6.merge(n2, left_on="S_NATIONKEY", right_on="N_NATIONKEY") + + jn7 = jn7[(jn7["O_ORDERDATE"] >= var4) & (jn7["O_ORDERDATE"] <= var5)] + jn7 = jn7[jn7["P_TYPE"] == var3] + + jn7["O_YEAR"] = jn7["O_ORDERDATE"].dt.year + jn7["VOLUME"] = jn7["L_EXTENDEDPRICE"] * (1.0 - jn7["L_DISCOUNT"]) + jn7 = jn7.rename(columns={"N_NAME": "NATION"}) + + denominator = jn7.groupby("O_YEAR")["VOLUME"].sum().rename("DENOMINATOR") + numerator = ( + jn7[jn7["NATION"] == var1] + .groupby(jn7["O_YEAR"])["VOLUME"] + .sum() + .rename("NUMERATOR") + ) + jn8 = denominator.to_frame().join(numerator.to_frame(), how="left") + + # ValueError: Caching with offsets only supported in strictly ordered mode. + jn8["MKT_SHARE"] = (jn8["NUMERATOR"] / jn8["DENOMINATOR"]).round(2) + + result_df = jn8["MKT_SHARE"].sort_index().rename("MKT_SHARE").reset_index() + result_df.to_gbq()