From 69fe5d7392f03c6ef14d42e8cd6e26be62d409cd Mon Sep 17 00:00:00 2001 From: Salem Boyland Date: Wed, 6 Dec 2023 12:12:49 -0600 Subject: [PATCH 01/28] k-means code sample --- samples/snippets/create_k-means_model | 84 +++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 samples/snippets/create_k-means_model diff --git a/samples/snippets/create_k-means_model b/samples/snippets/create_k-means_model new file mode 100644 index 0000000000..fb129f19c0 --- /dev/null +++ b/samples/snippets/create_k-means_model @@ -0,0 +1,84 @@ +def test_kmeans_sample(): + import bigframes.pandas as bpd + import bigframes + from bigframes import dataframe + import datetime + + + bigframes.options.bigquery.project= "username-testing" + + + # read_gbq: Loads a DataFrame from BigQuery + + + h = bpd.read_gbq("bigquery-public-data.london_bicycles.cycle_hire") + s= bpd.read_gbq( + ''' + SELECT + id, + ST_DISTANCE( + ST_GEOGPOINT(s.longitude, s.latitude), + ST_GEOGPOINT(-0.1, 51.5) + ) / 1000 AS distance_from_city_center + FROM + `bigquery-public-data.london_bicycles.cycle_stations` s + ''' + ) + # transform the data + + + h= h.rename(columns={"start_station_name": "station_name","start_station_id": "station_id"} ) + + + h= h[["start_date", "station_name", "station_id", "duration"]] + + + start_date = datetime.datetime.now() + + + sample_time = datetime.datetime(2015, 1, 1, 0, 0, 0, tzinfo= datetime.timezone.utc) + + + sample_time2 = datetime.datetime(2016, 1, 1, 0, 0, 0, tzinfo= datetime.timezone.utc) + + + h= h.loc[(h["start_date"] >= sample_time) & (h["start_date"] <= sample_time2)] + + + isweekday = h.start_date.dt.dayofweek.map({0: "weekday", 1: "weekday", 2: "weekday", 3: "weekday", + 4:"weekday",5:"weekend", 6:"weekend"}) + + + # create the dataframe variable + + + df= bpd.DataFrame() + + + merged_df = h.merge( + right= s, + how="inner", + left_on= "station_id", + right_on= "id", + ) + + + stationstats = merged_df.groupby("station_name").agg({"duration":[ 
"mean","count"] , "distance_from_city_center": "max"}) + + + stationstats.columns=["duration","num_trips","distance_from_city_center"] + + + stationstats.sort_values(by="distance_from_city_center", ascending=True) + + + from bigframes.ml.cluster import KMeans + + + cluster_model = KMeans(n_clusters=4) + + + cluster_model.fit(stationstats) + + + predict = cluster_model.predict(stationstats) \ No newline at end of file From 5fb1d4f94d798a97deee8e8de46b7640a0e48c3b Mon Sep 17 00:00:00 2001 From: Salem Boyland Date: Mon, 11 Dec 2023 12:28:11 -0600 Subject: [PATCH 02/28] formatting --- samples/snippets/create_k-means_model | 87 +++++++++++++-------------- 1 file changed, 43 insertions(+), 44 deletions(-) diff --git a/samples/snippets/create_k-means_model b/samples/snippets/create_k-means_model index fb129f19c0..c1eb05f27a 100644 --- a/samples/snippets/create_k-means_model +++ b/samples/snippets/create_k-means_model @@ -1,84 +1,83 @@ def test_kmeans_sample(): - import bigframes.pandas as bpd - import bigframes - from bigframes import dataframe - import datetime + import bigframes.pandas as bpd + import bigframes + from bigframes import dataframe + import datetime - bigframes.options.bigquery.project= "username-testing" + bigframes.options.bigquery.project= "username-testing" - # read_gbq: Loads a DataFrame from BigQuery + # read_gbq: Loads a DataFrame from BigQuery + h = bpd.read_gbq("bigquery-public-data.london_bicycles.cycle_hire") + s= bpd.read_gbq( + ''' + SELECT + id, + ST_DISTANCE( + ST_GEOGPOINT(s.longitude, s.latitude), + ST_GEOGPOINT(-0.1, 51.5) + ) / 1000 AS distance_from_city_center + FROM + `bigquery-public-data.london_bicycles.cycle_stations` s + ''' + ) + # transform the data - h = bpd.read_gbq("bigquery-public-data.london_bicycles.cycle_hire") - s= bpd.read_gbq( - ''' - SELECT - id, - ST_DISTANCE( - ST_GEOGPOINT(s.longitude, s.latitude), - ST_GEOGPOINT(-0.1, 51.5) - ) / 1000 AS distance_from_city_center - FROM - 
`bigquery-public-data.london_bicycles.cycle_stations` s - ''' - ) - # transform the data + h= h.rename(columns={"start_station_name": "station_name","start_station_id": "station_id"} ) - h= h.rename(columns={"start_station_name": "station_name","start_station_id": "station_id"} ) + h= h[["start_date", "station_name", "station_id", "duration"]] - h= h[["start_date", "station_name", "station_id", "duration"]] + start_date = datetime.datetime.now() - start_date = datetime.datetime.now() + sample_time = datetime.datetime(2015, 1, 1, 0, 0, 0, tzinfo= datetime.timezone.utc) - sample_time = datetime.datetime(2015, 1, 1, 0, 0, 0, tzinfo= datetime.timezone.utc) + sample_time2 = datetime.datetime(2016, 1, 1, 0, 0, 0, tzinfo= datetime.timezone.utc) - sample_time2 = datetime.datetime(2016, 1, 1, 0, 0, 0, tzinfo= datetime.timezone.utc) + h= h.loc[(h["start_date"] >= sample_time) & (h["start_date"] <= sample_time2)] - h= h.loc[(h["start_date"] >= sample_time) & (h["start_date"] <= sample_time2)] + isweekday = h.start_date.dt.dayofweek.map({0: "weekday", 1: "weekday", 2: "weekday", 3: "weekday", + 4:"weekday",5:"weekend", 6:"weekend"}) - isweekday = h.start_date.dt.dayofweek.map({0: "weekday", 1: "weekday", 2: "weekday", 3: "weekday", - 4:"weekday",5:"weekend", 6:"weekend"}) + # create the dataframe variable - # create the dataframe variable + df= bpd.DataFrame() - df= bpd.DataFrame() + merged_df = h.merge( + right= s, + how="inner", + left_on= "station_id", + right_on= "id", + ) - merged_df = h.merge( - right= s, - how="inner", - left_on= "station_id", - right_on= "id", - ) + stationstats = merged_df.groupby("station_name").agg({"duration":[ "mean","count"] , "distance_from_city_center": "max"}) - stationstats = merged_df.groupby("station_name").agg({"duration":[ "mean","count"] , "distance_from_city_center": "max"}) + stationstats.columns=["duration","num_trips","distance_from_city_center"] - stationstats.columns=["duration","num_trips","distance_from_city_center"] + 
stationstats.sort_values(by="distance_from_city_center", ascending=True) - stationstats.sort_values(by="distance_from_city_center", ascending=True) + from bigframes.ml.cluster import KMeans - from bigframes.ml.cluster import KMeans + cluster_model = KMeans(n_clusters=4) - cluster_model = KMeans(n_clusters=4) + cluster_model.fit(stationstats) - cluster_model.fit(stationstats) - - predict = cluster_model.predict(stationstats) \ No newline at end of file + predict = cluster_model.predict(stationstats) \ No newline at end of file From 523255f7c823f6e4e5663025ea0ec27559e0fdff Mon Sep 17 00:00:00 2001 From: Salem Boyland Date: Mon, 11 Dec 2023 12:57:34 -0600 Subject: [PATCH 03/28] added test --- samples/snippets/create_k-means_model | 38 +++++---------------------- 1 file changed, 7 insertions(+), 31 deletions(-) diff --git a/samples/snippets/create_k-means_model b/samples/snippets/create_k-means_model index c1eb05f27a..932ae43d44 100644 --- a/samples/snippets/create_k-means_model +++ b/samples/snippets/create_k-means_model @@ -4,12 +4,11 @@ def test_kmeans_sample(): from bigframes import dataframe import datetime - - bigframes.options.bigquery.project= "username-testing" + #NOTE: ask about line below and whether it is needed outside of colab notebooks + #bigframes.options.bigquery.project= "username-testing" # read_gbq: Loads a DataFrame from BigQuery - h = bpd.read_gbq("bigquery-public-data.london_bicycles.cycle_hire") s= bpd.read_gbq( ''' @@ -24,60 +23,37 @@ def test_kmeans_sample(): ''' ) # transform the data - - h= h.rename(columns={"start_station_name": "station_name","start_station_id": "station_id"} ) - - h= h[["start_date", "station_name", "station_id", "duration"]] - start_date = datetime.datetime.now() - - sample_time = datetime.datetime(2015, 1, 1, 0, 0, 0, tzinfo= datetime.timezone.utc) - - sample_time2 = datetime.datetime(2016, 1, 1, 0, 0, 0, tzinfo= datetime.timezone.utc) - h= h.loc[(h["start_date"] >= sample_time) & (h["start_date"] <= sample_time2)] - 
isweekday = h.start_date.dt.dayofweek.map({0: "weekday", 1: "weekday", 2: "weekday", 3: "weekday", 4:"weekday",5:"weekend", 6:"weekend"}) - - # create the dataframe variable - - + # create the new dataframe variable df= bpd.DataFrame() - merged_df = h.merge( right= s, how="inner", left_on= "station_id", right_on= "id", ) - - stationstats = merged_df.groupby("station_name").agg({"duration":[ "mean","count"] , "distance_from_city_center": "max"}) - stationstats.columns=["duration","num_trips","distance_from_city_center"] - stationstats.sort_values(by="distance_from_city_center", ascending=True) - from bigframes.ml.cluster import KMeans - - cluster_model = KMeans(n_clusters=4) + cluster_model= cluster_model.fit(stationstats) - - cluster_model.fit(stationstats) - - - predict = cluster_model.predict(stationstats) \ No newline at end of file + def predict_kennington_stations(stationstats): + station_filter = "REGEXP_CONTAINS(station_name, 'Kennington')" + predict = cluster_model.predict(station_filter) From 3bb267aecd663a072bf243f8ec5391926ebc050f Mon Sep 17 00:00:00 2001 From: Salem Boyland Date: Mon, 11 Dec 2023 14:44:19 -0600 Subject: [PATCH 04/28] docs: add code sampke for creating kmeans model --- ...eans_model => create_kmeans_model_test.py} | 33 ++++++++++--------- 1 file changed, 17 insertions(+), 16 deletions(-) rename samples/snippets/{create_k-means_model => create_kmeans_model_test.py} (68%) diff --git a/samples/snippets/create_k-means_model b/samples/snippets/create_kmeans_model_test.py similarity index 68% rename from samples/snippets/create_k-means_model rename to samples/snippets/create_kmeans_model_test.py index 932ae43d44..789b1109b3 100644 --- a/samples/snippets/create_k-means_model +++ b/samples/snippets/create_kmeans_model_test.py @@ -7,7 +7,6 @@ def test_kmeans_sample(): #NOTE: ask about line below and whether it is needed outside of colab notebooks #bigframes.options.bigquery.project= "username-testing" - # read_gbq: Loads a DataFrame from BigQuery 
h = bpd.read_gbq("bigquery-public-data.london_bicycles.cycle_hire") s= bpd.read_gbq( @@ -26,6 +25,7 @@ def test_kmeans_sample(): h= h.rename(columns={"start_station_name": "station_name","start_station_id": "station_id"} ) h= h[["start_date", "station_name", "station_id", "duration"]] + # NOTE: line below is not accessed, is it needed outside of colab notebook? start_date = datetime.datetime.now() sample_time = datetime.datetime(2015, 1, 1, 0, 0, 0, tzinfo= datetime.timezone.utc) sample_time2 = datetime.datetime(2016, 1, 1, 0, 0, 0, tzinfo= datetime.timezone.utc) @@ -35,25 +35,26 @@ def test_kmeans_sample(): isweekday = h.start_date.dt.dayofweek.map({0: "weekday", 1: "weekday", 2: "weekday", 3: "weekday", 4:"weekday",5:"weekend", 6:"weekend"}) - # create the new dataframe variable - df= bpd.DataFrame() - + # create the new dataframe variable, stationstats merged_df = h.merge( - right= s, - how="inner", - left_on= "station_id", - right_on= "id", - ) - stationstats = merged_df.groupby("station_name").agg({"duration":[ "mean","count"] , "distance_from_city_center": "max"}) - - stationstats.columns=["duration","num_trips","distance_from_city_center"] + right= s, + how="inner", + left_on= "station_id", + right_on= "id", + ) - stationstats.sort_values(by="distance_from_city_center", ascending=True) + stationstats = merged_df.groupby("station_name").agg({"duration":[ "mean","count"] , "distance_from_city_center": "max"}) + def station_filter(): + stationstats.columns = ["duration","num_trips","distance_from_city_center"] + stationstats.sort_values(by = "distance_from_city_center", ascending = True) + filter = '''REGEXP_CONTAINS(station_name, 'Kennington')''' + # import the KMeans model to cluster the data from bigframes.ml.cluster import KMeans cluster_model = KMeans(n_clusters=4) cluster_model= cluster_model.fit(stationstats) - def predict_kennington_stations(stationstats): - station_filter = "REGEXP_CONTAINS(station_name, 'Kennington')" - predict = 
cluster_model.predict(station_filter) + #the following function predicts the cluster of every station that has the string "Kennington" in its name. + def predict_kennington_stations(): + cluster_model.predict(stationstats) + From 73d2a4681212c1881366c298f44f228bbf208932 Mon Sep 17 00:00:00 2001 From: Owl Bot Date: Mon, 11 Dec 2023 20:48:09 +0000 Subject: [PATCH 05/28] =?UTF-8?q?=F0=9F=A6=89=20Updates=20from=20OwlBot=20?= =?UTF-8?q?post-processor?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --- samples/snippets/create_kmeans_model_test.py | 68 ++++++++++++-------- 1 file changed, 41 insertions(+), 27 deletions(-) diff --git a/samples/snippets/create_kmeans_model_test.py b/samples/snippets/create_kmeans_model_test.py index 789b1109b3..2c6e5712b5 100644 --- a/samples/snippets/create_kmeans_model_test.py +++ b/samples/snippets/create_kmeans_model_test.py @@ -1,16 +1,16 @@ def test_kmeans_sample(): - import bigframes.pandas as bpd - import bigframes - from bigframes import dataframe import datetime - #NOTE: ask about line below and whether it is needed outside of colab notebooks - #bigframes.options.bigquery.project= "username-testing" + import bigframes + from bigframes import dataframe + import bigframes.pandas as bpd + # NOTE: ask about line below and whether it is needed outside of colab notebooks + # bigframes.options.bigquery.project= "username-testing" # read_gbq: Loads a DataFrame from BigQuery h = bpd.read_gbq("bigquery-public-data.london_bicycles.cycle_hire") - s= bpd.read_gbq( - ''' + s = bpd.read_gbq( + """ SELECT id, ST_DISTANCE( @@ -19,42 +19,56 @@ def test_kmeans_sample(): ) / 1000 AS distance_from_city_center FROM `bigquery-public-data.london_bicycles.cycle_stations` s - ''' + """ ) # transform the data - h= h.rename(columns={"start_station_name": "station_name","start_station_id": "station_id"} ) - h= 
h[["start_date", "station_name", "station_id", "duration"]] + h = h.rename( + columns={"start_station_name": "station_name", "start_station_id": "station_id"} + ) + h = h[["start_date", "station_name", "station_id", "duration"]] # NOTE: line below is not accessed, is it needed outside of colab notebook? start_date = datetime.datetime.now() - sample_time = datetime.datetime(2015, 1, 1, 0, 0, 0, tzinfo= datetime.timezone.utc) - sample_time2 = datetime.datetime(2016, 1, 1, 0, 0, 0, tzinfo= datetime.timezone.utc) + sample_time = datetime.datetime(2015, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc) + sample_time2 = datetime.datetime(2016, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc) - h= h.loc[(h["start_date"] >= sample_time) & (h["start_date"] <= sample_time2)] + h = h.loc[(h["start_date"] >= sample_time) & (h["start_date"] <= sample_time2)] - isweekday = h.start_date.dt.dayofweek.map({0: "weekday", 1: "weekday", 2: "weekday", 3: "weekday", - 4:"weekday",5:"weekend", 6:"weekend"}) + isweekday = h.start_date.dt.dayofweek.map( + { + 0: "weekday", + 1: "weekday", + 2: "weekday", + 3: "weekday", + 4: "weekday", + 5: "weekend", + 6: "weekend", + } + ) # create the new dataframe variable, stationstats merged_df = h.merge( - right= s, - how="inner", - left_on= "station_id", - right_on= "id", - ) + right=s, + how="inner", + left_on="station_id", + right_on="id", + ) + + stationstats = merged_df.groupby("station_name").agg( + {"duration": ["mean", "count"], "distance_from_city_center": "max"} + ) - stationstats = merged_df.groupby("station_name").agg({"duration":[ "mean","count"] , "distance_from_city_center": "max"}) def station_filter(): - stationstats.columns = ["duration","num_trips","distance_from_city_center"] - stationstats.sort_values(by = "distance_from_city_center", ascending = True) - filter = '''REGEXP_CONTAINS(station_name, 'Kennington')''' + stationstats.columns = ["duration", "num_trips", "distance_from_city_center"] + 
stationstats.sort_values(by="distance_from_city_center", ascending=True) + filter = """REGEXP_CONTAINS(station_name, 'Kennington')""" # import the KMeans model to cluster the data from bigframes.ml.cluster import KMeans + cluster_model = KMeans(n_clusters=4) - cluster_model= cluster_model.fit(stationstats) + cluster_model = cluster_model.fit(stationstats) - #the following function predicts the cluster of every station that has the string "Kennington" in its name. + # the following function predicts the cluster of every station that has the string "Kennington" in its name. def predict_kennington_stations(): cluster_model.predict(stationstats) - From b3c0578f1059085430b35c87f105567f0d14ed1c Mon Sep 17 00:00:00 2001 From: Salem Boyland Date: Tue, 12 Dec 2023 14:24:06 -0600 Subject: [PATCH 06/28] license header + region tags added --- samples/snippets/create_kmeans_model_test.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/samples/snippets/create_kmeans_model_test.py b/samples/snippets/create_kmeans_model_test.py index 789b1109b3..7eb0222a8e 100644 --- a/samples/snippets/create_kmeans_model_test.py +++ b/samples/snippets/create_kmeans_model_test.py @@ -1,4 +1,20 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ def test_kmeans_sample(): +# [START bigquery_dataframes_bqml_kmeans] + import bigframes.pandas as bpd import bigframes from bigframes import dataframe @@ -58,3 +74,4 @@ def station_filter(): def predict_kennington_stations(): cluster_model.predict(stationstats) +# [END bigquery_dataframes_bqml_kmeans] \ No newline at end of file From db9f43986ef14aabe9fc9e83d6e7d1ffdfc50e91 Mon Sep 17 00:00:00 2001 From: Salem Jorden <115185670+SalemJorden@users.noreply.github.com> Date: Tue, 12 Dec 2023 14:55:01 -0600 Subject: [PATCH 07/28] Update samples/snippets/create_kmeans_model_test.py Co-authored-by: Tim Swast --- samples/snippets/create_kmeans_model_test.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/samples/snippets/create_kmeans_model_test.py b/samples/snippets/create_kmeans_model_test.py index 83823e1936..702e83035a 100644 --- a/samples/snippets/create_kmeans_model_test.py +++ b/samples/snippets/create_kmeans_model_test.py @@ -30,7 +30,12 @@ def test_kmeans_sample(): # NOTE: ask about line below and whether it is needed outside of colab notebooks # bigframes.options.bigquery.project= "username-testing" # read_gbq: Loads a DataFrame from BigQuery - h = bpd.read_gbq("bigquery-public-data.london_bicycles.cycle_hire") + h = bpd.read_gbq( + "bigquery-public-data.london_bicycles.cycle_hire", + columns=[ + "start_station_name", "start_station_id", "start_date", "duration", + ], + ) s = bpd.read_gbq( """ SELECT From e7bd5ef34bcdbb58a76fb13555e4daae940a6713 Mon Sep 17 00:00:00 2001 From: Salem Boyland Date: Wed, 13 Dec 2023 12:26:12 -0600 Subject: [PATCH 08/28] code corrections resolved --- samples/snippets/create_kmeans_model_test.py | 66 +++++++++----------- 1 file changed, 28 insertions(+), 38 deletions(-) diff --git a/samples/snippets/create_kmeans_model_test.py b/samples/snippets/create_kmeans_model_test.py index 83823e1936..8c133ef755 100644 --- a/samples/snippets/create_kmeans_model_test.py +++ b/samples/snippets/create_kmeans_model_test.py 
@@ -13,24 +13,17 @@ # limitations under the License. def test_kmeans_sample(): -<<<<<<< HEAD # [START bigquery_dataframes_bqml_kmeans] - import bigframes.pandas as bpd import bigframes from bigframes import dataframe -======= ->>>>>>> 73d2a4681212c1881366c298f44f228bbf208932 - import datetime - - import bigframes - from bigframes import dataframe import bigframes.pandas as bpd + import datetime - # NOTE: ask about line below and whether it is needed outside of colab notebooks - # bigframes.options.bigquery.project= "username-testing" - # read_gbq: Loads a DataFrame from BigQuery - h = bpd.read_gbq("bigquery-public-data.london_bicycles.cycle_hire") + #Load data from BigQuery + h = bpd.read_gbq("bigquery-public-data.london_bicycles.cycle_hire", h.rename( + columns = {"start_station_name": "station_name", "start_station_id": "station_id"} + )) s = bpd.read_gbq( """ SELECT @@ -41,22 +34,15 @@ def test_kmeans_sample(): ) / 1000 AS distance_from_city_center FROM `bigquery-public-data.london_bicycles.cycle_stations` s - """ - ) - # transform the data - h = h.rename( - columns={"start_station_name": "station_name", "start_station_id": "station_id"} - ) - h = h[["start_date", "station_name", "station_id", "duration"]] + """ ) - # NOTE: line below is not accessed, is it needed outside of colab notebook? 
- start_date = datetime.datetime.now() + # transform data into queryable format sample_time = datetime.datetime(2015, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc) sample_time2 = datetime.datetime(2016, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc) h = h.loc[(h["start_date"] >= sample_time) & (h["start_date"] <= sample_time2)] - isweekday = h.start_date.dt.dayofweek.map( + h.start_date.dt.dayofweek.map( { 0: "weekday", 1: "weekday", @@ -68,34 +54,38 @@ def test_kmeans_sample(): } ) - # create the new dataframe variable, stationstats + #merge dataframes h and s merged_df = h.merge( right=s, how="inner", left_on="station_id", right_on="id", ) - + # Create new dataframe variable from merge: 'stationstats' stationstats = merged_df.groupby("station_name").agg( {"duration": ["mean", "count"], "distance_from_city_center": "max"} ) + # [END bigquery_dataframes_bqml_kmeans] + - def station_filter(): - stationstats.columns = ["duration", "num_trips", "distance_from_city_center"] - stationstats.sort_values(by="distance_from_city_center", ascending=True) - filter = """REGEXP_CONTAINS(station_name, 'Kennington')""" + # [START bigquery_dataframes_bqml_kmeans_fit] - # import the KMeans model to cluster the data + # import the KMeans model from bigframes.ml to cluster the data from bigframes.ml.cluster import KMeans cluster_model = KMeans(n_clusters=4) - cluster_model = cluster_model.fit(stationstats) + cluster_model = cluster_model.fit(stationstats).to_gbq(cluster_model) + + # [END bigquery_dataframes_bqml_kmeans_fit] + + # [START bigquery_dataframes_bqml_kmeans_predict] + + # Use 'contains' function to find all entries with string "Kennington". + stationstats = stationstats.str.contains("Kennington") + + #Predict using the model + result = cluster_model.predict(stationstats) + + # [END bigquery_dataframes_bqml_kmeans_predict] - # the following function predicts the cluster of every station that has the string "Kennington" in its name. 
- def predict_kennington_stations(): - cluster_model.predict(stationstats) -<<<<<<< HEAD - -# [END bigquery_dataframes_bqml_kmeans] -======= ->>>>>>> 73d2a4681212c1881366c298f44f228bbf208932 + assert result is not None From 809ed05547914024b22df2f5b9319b31ebe01e29 Mon Sep 17 00:00:00 2001 From: Salem Boyland Date: Mon, 18 Dec 2023 14:32:32 -0600 Subject: [PATCH 09/28] code corrections commit 1 --- samples/snippets/create_kmeans_model_test.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/samples/snippets/create_kmeans_model_test.py b/samples/snippets/create_kmeans_model_test.py index 8c133ef755..780051fca6 100644 --- a/samples/snippets/create_kmeans_model_test.py +++ b/samples/snippets/create_kmeans_model_test.py @@ -13,14 +13,14 @@ # limitations under the License. def test_kmeans_sample(): -# [START bigquery_dataframes_bqml_kmeans] - import bigframes.pandas as bpd + # [START bigquery_dataframes_bqml_kmeans] + import datetime + import bigframes - from bigframes import dataframe import bigframes.pandas as bpd - import datetime - #Load data from BigQuery + # Load cycle hires data from BigQuery into a dataframe variable using read_gbq function in order to + # extract the relevant information needed to train the model later on in tutorial. 
h = bpd.read_gbq("bigquery-public-data.london_bicycles.cycle_hire", h.rename( columns = {"start_station_name": "station_name", "start_station_id": "station_id"} )) From 5dba2b906bdc4bb389eeb62285dce7fe22b94ee8 Mon Sep 17 00:00:00 2001 From: Salem Boyland Date: Mon, 18 Dec 2023 17:15:46 -0600 Subject: [PATCH 10/28] descriptions of geospatial analysis functions --- samples/snippets/create_kmeans_model_test.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/samples/snippets/create_kmeans_model_test.py b/samples/snippets/create_kmeans_model_test.py index 780051fca6..68bcb6d129 100644 --- a/samples/snippets/create_kmeans_model_test.py +++ b/samples/snippets/create_kmeans_model_test.py @@ -25,6 +25,9 @@ def test_kmeans_sample(): columns = {"start_station_name": "station_name", "start_station_id": "station_id"} )) s = bpd.read_gbq( + # Here we use a SQL query so that we can use the geospatial analytics functions, ST_GEOPOINT + # and ST_DISTANCE, which are supported in GoogleSQL for BigQuery. These functions allow us to analyze the + # geographical data and determine spatial relationships between the geographical features. """ SELECT id, From 2207941b16763e69cd5f46ba77abbb4fe9d521de Mon Sep 17 00:00:00 2001 From: Salem Boyland Date: Mon, 18 Dec 2023 17:29:01 -0600 Subject: [PATCH 11/28] explantions revised for clarity --- samples/snippets/create_kmeans_model_test.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/samples/snippets/create_kmeans_model_test.py b/samples/snippets/create_kmeans_model_test.py index 68bcb6d129..5c1a275379 100644 --- a/samples/snippets/create_kmeans_model_test.py +++ b/samples/snippets/create_kmeans_model_test.py @@ -39,12 +39,15 @@ def test_kmeans_sample(): `bigquery-public-data.london_bicycles.cycle_stations` s """ ) - # transform data into queryable format + # Here we transform the datetime data into the UTC timezone for standardization because BigQuery priortizes + # UTC as the internal format for global analysis. 
sample_time = datetime.datetime(2015, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc) sample_time2 = datetime.datetime(2016, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc) h = h.loc[(h["start_date"] >= sample_time) & (h["start_date"] <= sample_time2)] - + + # In this section, we use a mapping function to transform the start_date column by replacing each day-of-the-week + # number with the corresponding label ("weekday" or "weekend"). h.start_date.dt.dayofweek.map( { 0: "weekday", From 5e00a3c78e4ae91516cdaa05d9930e6e8f7530c5 Mon Sep 17 00:00:00 2001 From: Owl Bot Date: Mon, 18 Dec 2023 23:31:27 +0000 Subject: [PATCH 12/28] =?UTF-8?q?=F0=9F=A6=89=20Updates=20from=20OwlBot=20?= =?UTF-8?q?post-processor?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --- samples/snippets/create_kmeans_model_test.py | 35 ++++++++++++-------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/samples/snippets/create_kmeans_model_test.py b/samples/snippets/create_kmeans_model_test.py index 5c1a275379..3c451e4939 100644 --- a/samples/snippets/create_kmeans_model_test.py +++ b/samples/snippets/create_kmeans_model_test.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. + def test_kmeans_sample(): # [START bigquery_dataframes_bqml_kmeans] import datetime @@ -21,13 +22,19 @@ def test_kmeans_sample(): # Load cycle hires data from BigQuery into a dataframe variable using read_gbq function in order to # extract the relevant information needed to train the model later on in tutorial. 
- h = bpd.read_gbq("bigquery-public-data.london_bicycles.cycle_hire", h.rename( - columns = {"start_station_name": "station_name", "start_station_id": "station_id"} - )) + h = bpd.read_gbq( + "bigquery-public-data.london_bicycles.cycle_hire", + h.rename( + columns={ + "start_station_name": "station_name", + "start_station_id": "station_id", + } + ), + ) s = bpd.read_gbq( - # Here we use a SQL query so that we can use the geospatial analytics functions, ST_GEOPOINT - # and ST_DISTANCE, which are supported in GoogleSQL for BigQuery. These functions allow us to analyze the - # geographical data and determine spatial relationships between the geographical features. + # Here we use a SQL query so that we can use the geospatial analytics functions, ST_GEOPOINT + # and ST_DISTANCE, which are supported in GoogleSQL for BigQuery. These functions allow us to analyze the + # geographical data and determine spatial relationships between the geographical features. """ SELECT id, @@ -37,7 +44,8 @@ def test_kmeans_sample(): ) / 1000 AS distance_from_city_center FROM `bigquery-public-data.london_bicycles.cycle_stations` s - """ ) + """ + ) # Here we transform the datetime data into the UTC timezone for standardization because BigQuery priortizes # UTC as the internal format for global analysis. @@ -45,7 +53,7 @@ def test_kmeans_sample(): sample_time2 = datetime.datetime(2016, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc) h = h.loc[(h["start_date"] >= sample_time) & (h["start_date"] <= sample_time2)] - + # In this section, we use a mapping function to transform the start_date column by replacing each day-of-the-week # number with the corresponding label ("weekday" or "weekend"). 
h.start_date.dt.dayofweek.map( @@ -60,19 +68,18 @@ def test_kmeans_sample(): } ) - #merge dataframes h and s + # merge dataframes h and s merged_df = h.merge( right=s, how="inner", left_on="station_id", right_on="id", ) - # Create new dataframe variable from merge: 'stationstats' + # Create new dataframe variable from merge: 'stationstats' stationstats = merged_df.groupby("station_name").agg( {"duration": ["mean", "count"], "distance_from_city_center": "max"} ) # [END bigquery_dataframes_bqml_kmeans] - # [START bigquery_dataframes_bqml_kmeans_fit] @@ -83,13 +90,13 @@ def test_kmeans_sample(): cluster_model = cluster_model.fit(stationstats).to_gbq(cluster_model) # [END bigquery_dataframes_bqml_kmeans_fit] - + # [START bigquery_dataframes_bqml_kmeans_predict] - # Use 'contains' function to find all entries with string "Kennington". + # Use 'contains' function to find all entries with string "Kennington". stationstats = stationstats.str.contains("Kennington") - #Predict using the model + # Predict using the model result = cluster_model.predict(stationstats) # [END bigquery_dataframes_bqml_kmeans_predict] From 7c6422731a76bae2fa4fad51421d833f2e3fa422 Mon Sep 17 00:00:00 2001 From: Salem Jorden <115185670+SalemJorden@users.noreply.github.com> Date: Tue, 19 Dec 2023 12:44:57 -0600 Subject: [PATCH 13/28] Update samples/snippets/create_kmeans_model_test.py Co-authored-by: Tim Swast --- samples/snippets/create_kmeans_model_test.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/samples/snippets/create_kmeans_model_test.py b/samples/snippets/create_kmeans_model_test.py index 3c451e4939..516a9de5b6 100644 --- a/samples/snippets/create_kmeans_model_test.py +++ b/samples/snippets/create_kmeans_model_test.py @@ -20,6 +20,9 @@ def test_kmeans_sample(): import bigframes import bigframes.pandas as bpd + # You must compute in the EU multi-region to query the London bicycles dataset. 
+ bigframes.options.bigquery.location = "EU" + # Load cycle hires data from BigQuery into a dataframe variable using read_gbq function in order to # extract the relevant information needed to train the model later on in tutorial. h = bpd.read_gbq( From 11678e06e700dff3d861e25e4a39bc977f26518c Mon Sep 17 00:00:00 2001 From: Salem Boyland Date: Tue, 19 Dec 2023 13:02:48 -0600 Subject: [PATCH 14/28] code corrections --- samples/snippets/create_kmeans_model_test.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/samples/snippets/create_kmeans_model_test.py b/samples/snippets/create_kmeans_model_test.py index 516a9de5b6..db99ec339e 100644 --- a/samples/snippets/create_kmeans_model_test.py +++ b/samples/snippets/create_kmeans_model_test.py @@ -23,11 +23,18 @@ def test_kmeans_sample(): # You must compute in the EU multi-region to query the London bicycles dataset. bigframes.options.bigquery.location = "EU" - # Load cycle hires data from BigQuery into a dataframe variable using read_gbq function in order to - # extract the relevant information needed to train the model later on in tutorial. + # Extract the information you'll need to train the k-means model later in this tutorial. Use the + # read_gbq function to represent cycle hires data as a DataFrame. 
h = bpd.read_gbq( "bigquery-public-data.london_bicycles.cycle_hire", - h.rename( + col_order =[ + "start_station_name", + "start_station_id", + "start_date", + "duration" + ], + ) + h.rename( columns={ "start_station_name": "station_name", "start_station_id": "station_id", From f95cd9fdddb3d63f97cdb0fb4a99f733a87b9c39 Mon Sep 17 00:00:00 2001 From: Salem Boyland Date: Tue, 19 Dec 2023 15:28:45 -0600 Subject: [PATCH 15/28] code revision --- samples/snippets/create_kmeans_model_test.py | 40 ++++++++++---------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/samples/snippets/create_kmeans_model_test.py b/samples/snippets/create_kmeans_model_test.py index db99ec339e..4bbafac1c0 100644 --- a/samples/snippets/create_kmeans_model_test.py +++ b/samples/snippets/create_kmeans_model_test.py @@ -33,18 +33,16 @@ def test_kmeans_sample(): "start_date", "duration" ], - ) - h.rename( + ).rename( columns={ "start_station_name": "station_name", "start_station_id": "station_id", } - ), - ) + ) + s = bpd.read_gbq( - # Here we use a SQL query so that we can use the geospatial analytics functions, ST_GEOPOINT - # and ST_DISTANCE, which are supported in GoogleSQL for BigQuery. These functions allow us to analyze the - # geographical data and determine spatial relationships between the geographical features. + # Use ST_GEOPOINT and ST_DISTANCE to analyze geographical data. + # These functions determine spatial relationships between the geographical features. """ SELECT id, @@ -57,16 +55,17 @@ def test_kmeans_sample(): """ ) - # Here we transform the datetime data into the UTC timezone for standardization because BigQuery priortizes - # UTC as the internal format for global analysis. + # Define Python datetime objects in the UTC timezone for range comparison, because BigQuery stores + # timestamp data in the UTC timezone. 
sample_time = datetime.datetime(2015, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc) sample_time2 = datetime.datetime(2016, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc) h = h.loc[(h["start_date"] >= sample_time) & (h["start_date"] <= sample_time2)] - # In this section, we use a mapping function to transform the start_date column by replacing each day-of-the-week - # number with the corresponding label ("weekday" or "weekend"). - h.start_date.dt.dayofweek.map( + # Replace each day-of-the-week number with the corresponding "weekday" or "weekend" label by using the + # Series.map method. + h = h.assign( + isweekday = h.start_date.dt.dayofweek.map( { 0: "weekday", 1: "weekday", @@ -76,26 +75,30 @@ def test_kmeans_sample(): 5: "weekend", 6: "weekend", } - ) + )) - # merge dataframes h and s + # Supplement each trip in "h" with the station distance information from "s" by + # merging the two DataFrames by station ID. merged_df = h.merge( right=s, how="inner", left_on="station_id", right_on="id", ) - # Create new dataframe variable from merge: 'stationstats' - stationstats = merged_df.groupby("station_name").agg( + + # Engineer features to cluster the stations. For each station, find the average trip duration, number of + # trips, and distance from city center. + stationstats = merged_df.groupby("station_name", "isweekday").agg( {"duration": ["mean", "count"], "distance_from_city_center": "max"} ) # [END bigquery_dataframes_bqml_kmeans] # [START bigquery_dataframes_bqml_kmeans_fit] - # import the KMeans model from bigframes.ml to cluster the data from bigframes.ml.cluster import KMeans + # To determine an optimal number of clusters, you would run the CREATE MODEL query for different values of + # num_clusters, find the error measure, and pick the point at which the error measure is at its minimum value. 
cluster_model = KMeans(n_clusters=4) cluster_model = cluster_model.fit(stationstats).to_gbq(cluster_model) @@ -103,10 +106,9 @@ def test_kmeans_sample(): # [START bigquery_dataframes_bqml_kmeans_predict] - # Use 'contains' function to find all entries with string "Kennington". + # Use 'contains' function to predict which clusters contain the stations with string "Kennington". stationstats = stationstats.str.contains("Kennington") - # Predict using the model result = cluster_model.predict(stationstats) # [END bigquery_dataframes_bqml_kmeans_predict] From 0df2dec28e17584bc285f305caf68c7988acfb2c Mon Sep 17 00:00:00 2001 From: Salem Boyland Date: Wed, 20 Dec 2023 14:46:30 -0600 Subject: [PATCH 16/28] code changes --- samples/snippets/create_kmeans_model_test.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/samples/snippets/create_kmeans_model_test.py b/samples/snippets/create_kmeans_model_test.py index 4bbafac1c0..8cf928eedf 100644 --- a/samples/snippets/create_kmeans_model_test.py +++ b/samples/snippets/create_kmeans_model_test.py @@ -88,9 +88,13 @@ def test_kmeans_sample(): # Engineer features to cluster the stations. For each station, find the average trip duration, number of # trips, and distance from city center. 
- stationstats = merged_df.groupby("station_name", "isweekday").agg( + stationstats = merged_df.groupby(["station_name", "isweekday"]).agg( {"duration": ["mean", "count"], "distance_from_city_center": "max"} ) + stationstats.columns=["duration","num_trips","distance_from_city_center"] + stationstats.sort_values(by="distance_from_city_center", ascending=True) + #Expected output looks as follows + # [END bigquery_dataframes_bqml_kmeans] # [START bigquery_dataframes_bqml_kmeans_fit] @@ -100,7 +104,7 @@ def test_kmeans_sample(): # To determine an optimal number of clusters, you would run the CREATE MODEL query for different values of # num_clusters, find the error measure, and pick the point at which the error measure is at its minimum value. cluster_model = KMeans(n_clusters=4) - cluster_model = cluster_model.fit(stationstats).to_gbq(cluster_model) + cluster_model.fit(stationstats) # [END bigquery_dataframes_bqml_kmeans_fit] From 06a24905aae918e2a8a33a618547bce80586651d Mon Sep 17 00:00:00 2001 From: Salem Boyland Date: Thu, 21 Dec 2023 15:27:27 -0600 Subject: [PATCH 17/28] revisions --- samples/snippets/create_kmeans_model_test.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/samples/snippets/create_kmeans_model_test.py b/samples/snippets/create_kmeans_model_test.py index 8cf928eedf..a3afbf3644 100644 --- a/samples/snippets/create_kmeans_model_test.py +++ b/samples/snippets/create_kmeans_model_test.py @@ -20,6 +20,7 @@ def test_kmeans_sample(): import bigframes import bigframes.pandas as bpd + bigframes.options.bigquery.project = "salemb-testing" # You must compute in the EU multi-region to query the London bicycles dataset. bigframes.options.bigquery.location = "EU" @@ -89,7 +90,7 @@ def test_kmeans_sample(): # Engineer features to cluster the stations. For each station, find the average trip duration, number of # trips, and distance from city center. 
stationstats = merged_df.groupby(["station_name", "isweekday"]).agg( - {"duration": ["mean", "count"], "distance_from_city_center": "max"} + {"duration": ["mean", "count"], "distance_from_city_center": "max"} ) stationstats.columns=["duration","num_trips","distance_from_city_center"] stationstats.sort_values(by="distance_from_city_center", ascending=True) @@ -111,9 +112,10 @@ def test_kmeans_sample(): # [START bigquery_dataframes_bqml_kmeans_predict] # Use 'contains' function to predict which clusters contain the stations with string "Kennington". - stationstats = stationstats.str.contains("Kennington") + stationstats = stationstats.contains("Kennington") result = cluster_model.predict(stationstats) + #Expected output results: # [END bigquery_dataframes_bqml_kmeans_predict] From 1a9f7d9ff8fef3951c9a93f6d49dbff577cd4c98 Mon Sep 17 00:00:00 2001 From: Salem Boyland Date: Tue, 26 Dec 2023 12:47:30 -0600 Subject: [PATCH 18/28] expected output previews --- samples/snippets/create_kmeans_model_test.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/samples/snippets/create_kmeans_model_test.py b/samples/snippets/create_kmeans_model_test.py index a3afbf3644..65f2408ee3 100644 --- a/samples/snippets/create_kmeans_model_test.py +++ b/samples/snippets/create_kmeans_model_test.py @@ -94,7 +94,14 @@ def test_kmeans_sample(): ) stationstats.columns=["duration","num_trips","distance_from_city_center"] stationstats.sort_values(by="distance_from_city_center", ascending=True) - #Expected output looks as follows + +#Expected output results: >>> stationstats.head(3) +# duration num_trips distance_from_city_center +# station_name isweekday +# Abbey Orchard Street, Westminster weekday 1139.686075 14908 2.231931 +# weekend 1538.533802 2278 2.231931 +# Abbotsbury Road, Holland Park weekday 1110.262258 2631 7.338276 +# 3 rows × 3 columns # [END bigquery_dataframes_bqml_kmeans] @@ -115,7 +122,12 @@ def test_kmeans_sample(): stationstats = 
stationstats.contains("Kennington") result = cluster_model.predict(stationstats) - #Expected output results: + #Expected output results: >>>results.head(2) + # CENTROID_ID NEAREST_CENTROIDS_DISTANCE duration num_trips distance_from_city_center + # station_name isweekday + # Abbey Orchard Street, Westminster weekday 2 [{'CENTROID_ID': 2, 'DISTANCE': 0.695970380477... 1139.686075 14908 2.231931 + # weekend 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.467343170961... 1538.533802 2278 2.231931 + # 2 rows × 5 columns # [END bigquery_dataframes_bqml_kmeans_predict] From 464cf1c285912c0da654997a0dc4af8b859fedc2 Mon Sep 17 00:00:00 2001 From: Salem Boyland Date: Tue, 26 Dec 2023 12:57:08 -0600 Subject: [PATCH 19/28] revisions --- samples/snippets/create_kmeans_model_test.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/samples/snippets/create_kmeans_model_test.py b/samples/snippets/create_kmeans_model_test.py index 65f2408ee3..915f519a2d 100644 --- a/samples/snippets/create_kmeans_model_test.py +++ b/samples/snippets/create_kmeans_model_test.py @@ -97,7 +97,8 @@ def test_kmeans_sample(): #Expected output results: >>> stationstats.head(3) # duration num_trips distance_from_city_center -# station_name isweekday +# station_name isweekday +# # Abbey Orchard Street, Westminster weekday 1139.686075 14908 2.231931 # weekend 1538.533802 2278 2.231931 # Abbotsbury Road, Holland Park weekday 1110.262258 2631 7.338276 @@ -119,14 +120,15 @@ def test_kmeans_sample(): # [START bigquery_dataframes_bqml_kmeans_predict] # Use 'contains' function to predict which clusters contain the stations with string "Kennington". 
- stationstats = stationstats.contains("Kennington") + stationstats = stationstats.str.contains("Kennington") result = cluster_model.predict(stationstats) #Expected output results: >>>results.head(2) - # CENTROID_ID NEAREST_CENTROIDS_DISTANCE duration num_trips distance_from_city_center - # station_name isweekday - # Abbey Orchard Street, Westminster weekday 2 [{'CENTROID_ID': 2, 'DISTANCE': 0.695970380477... 1139.686075 14908 2.231931 - # weekend 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.467343170961... 1538.533802 2278 2.231931 + # CENTROID_ID NEAREST_CENTROIDS_DISTANCE duration num_trips distance_from_city_center + # station_name isweekday + # + # Abbey Orchard Street, Westminster weekday 2 [{'CENTROID_ID': 2, 'DISTANCE': 0.695970380477... 1139.686075 14908 2.231931 + # weekend 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.467343170961... 1538.533802 2278 2.231931 # 2 rows × 5 columns # [END bigquery_dataframes_bqml_kmeans_predict] From 019e2432c4879a7f6837d2cf9acbfdf4c6c8954c Mon Sep 17 00:00:00 2001 From: Salem Boyland Date: Tue, 26 Dec 2023 13:54:20 -0600 Subject: [PATCH 20/28] tests passing, expected output characters >80 --- samples/snippets/create_kmeans_model_test.py | 92 ++++++++++---------- 1 file changed, 45 insertions(+), 47 deletions(-) diff --git a/samples/snippets/create_kmeans_model_test.py b/samples/snippets/create_kmeans_model_test.py index 915f519a2d..9e2c7c4da4 100644 --- a/samples/snippets/create_kmeans_model_test.py +++ b/samples/snippets/create_kmeans_model_test.py @@ -28,19 +28,14 @@ def test_kmeans_sample(): # read_gbq function to represent cycle hires data as a DataFrame. 
h = bpd.read_gbq( "bigquery-public-data.london_bicycles.cycle_hire", - col_order =[ - "start_station_name", - "start_station_id", - "start_date", - "duration" - ], + col_order=["start_station_name", "start_station_id", "start_date", "duration"], ).rename( - columns={ - "start_station_name": "station_name", - "start_station_id": "station_id", - } - ) - + columns={ + "start_station_name": "station_name", + "start_station_id": "station_id", + } + ) + s = bpd.read_gbq( # Use ST_GEOPOINT and ST_DISTANCE to analyze geographical data. # These functions determine spatial relationships between the geographical features. @@ -56,29 +51,30 @@ def test_kmeans_sample(): """ ) - # Define Python datetime objects in the UTC timezone for range comparison, because BigQuery stores + # Define Python datetime objects in the UTC timezone for range comparison, because BigQuery stores # timestamp data in the UTC timezone. sample_time = datetime.datetime(2015, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc) sample_time2 = datetime.datetime(2016, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc) h = h.loc[(h["start_date"] >= sample_time) & (h["start_date"] <= sample_time2)] - # Replace each day-of-the-week number with the corresponding "weekday" or "weekend" label by using the + # Replace each day-of-the-week number with the corresponding "weekday" or "weekend" label by using the # Series.map method. h = h.assign( - isweekday = h.start_date.dt.dayofweek.map( - { - 0: "weekday", - 1: "weekday", - 2: "weekday", - 3: "weekday", - 4: "weekday", - 5: "weekend", - 6: "weekend", - } - )) + isweekday=h.start_date.dt.dayofweek.map( + { + 0: "weekday", + 1: "weekday", + 2: "weekday", + 3: "weekday", + 4: "weekday", + 5: "weekend", + 6: "weekend", + } + ) + ) - # Supplement each trip in "h" with the station distance information from "s" by + # Supplement each trip in "h" with the station distance information from "s" by # merging the two DataFrames by station ID. 
merged_df = h.merge( right=s, @@ -87,22 +83,22 @@ def test_kmeans_sample(): right_on="id", ) - # Engineer features to cluster the stations. For each station, find the average trip duration, number of + # Engineer features to cluster the stations. For each station, find the average trip duration, number of # trips, and distance from city center. stationstats = merged_df.groupby(["station_name", "isweekday"]).agg( - {"duration": ["mean", "count"], "distance_from_city_center": "max"} + {"duration": ["mean", "count"], "distance_from_city_center": "max"} ) - stationstats.columns=["duration","num_trips","distance_from_city_center"] - stationstats.sort_values(by="distance_from_city_center", ascending=True) - -#Expected output results: >>> stationstats.head(3) -# duration num_trips distance_from_city_center -# station_name isweekday -# -# Abbey Orchard Street, Westminster weekday 1139.686075 14908 2.231931 -# weekend 1538.533802 2278 2.231931 -# Abbotsbury Road, Holland Park weekday 1110.262258 2631 7.338276 -# 3 rows × 3 columns + stationstats.columns = ["duration", "num_trips", "distance_from_city_center"] + stationstats = stationstats.sort_values( + by="distance_from_city_center", ascending=True + ).reset_index() + + # Expected output results: >>> stationstats.head(3) + # station_name isweekday duration num_trips distance_from_city_center + # Borough Road... weekday 1110 5749 0.12624 + # Borough Road... weekend 2125 1774 0.12624 + # Webber Street... weekday 795 6517 0.164021 + # 3 rows × 5 columns # [END bigquery_dataframes_bqml_kmeans] @@ -120,16 +116,18 @@ def test_kmeans_sample(): # [START bigquery_dataframes_bqml_kmeans_predict] # Use 'contains' function to predict which clusters contain the stations with string "Kennington". 
- stationstats = stationstats.str.contains("Kennington") + stationstats = stationstats.loc[ + stationstats["station_name"].str.contains("Kennington") + ] result = cluster_model.predict(stationstats) - #Expected output results: >>>results.head(2) - # CENTROID_ID NEAREST_CENTROIDS_DISTANCE duration num_trips distance_from_city_center - # station_name isweekday - # - # Abbey Orchard Street, Westminster weekday 2 [{'CENTROID_ID': 2, 'DISTANCE': 0.695970380477... 1139.686075 14908 2.231931 - # weekend 1 [{'CENTROID_ID': 1, 'DISTANCE': 0.467343170961... 1538.533802 2278 2.231931 - # 2 rows × 5 columns + + # Expected output results: >>>results.head(3) + # CENTROID_ID NEAREST_CENTROIDS... station_name isweekday duration num_trips distance... + # 1 [{'CENTROID_ID': 1, 'DISTANCE': 2 Borough... weekday 1110 5749 0.13 + # 2 [{'CENTROID_ID': 2, 'DISTANCE': 2 Borough... weekend 2125 1774 0.13 + # 1 [{'CENTROID_ID': 1, 'DISTANCE': 2 Webber... weekday 795 6517 0.16 + # 3 rows × 7 columns # [END bigquery_dataframes_bqml_kmeans_predict] From 72174f90fa795710fd803cb9453460fab1fcba13 Mon Sep 17 00:00:00 2001 From: Owl Bot Date: Tue, 26 Dec 2023 19:56:36 +0000 Subject: [PATCH 21/28] =?UTF-8?q?=F0=9F=A6=89=20Updates=20from=20OwlBot=20?= =?UTF-8?q?post-processor?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --- samples/snippets/create_kmeans_model_test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/samples/snippets/create_kmeans_model_test.py b/samples/snippets/create_kmeans_model_test.py index 9e2c7c4da4..854f642e41 100644 --- a/samples/snippets/create_kmeans_model_test.py +++ b/samples/snippets/create_kmeans_model_test.py @@ -124,9 +124,9 @@ def test_kmeans_sample(): # Expected output results: >>>results.head(3) # CENTROID_ID NEAREST_CENTROIDS... station_name isweekday duration num_trips distance... 
- # 1 [{'CENTROID_ID': 1, 'DISTANCE': 2 Borough... weekday 1110 5749 0.13 - # 2 [{'CENTROID_ID': 2, 'DISTANCE': 2 Borough... weekend 2125 1774 0.13 - # 1 [{'CENTROID_ID': 1, 'DISTANCE': 2 Webber... weekday 795 6517 0.16 + # 1 [{'CENTROID_ID': 1, 'DISTANCE': 2 Borough... weekday 1110 5749 0.13 + # 2 [{'CENTROID_ID': 2, 'DISTANCE': 2 Borough... weekend 2125 1774 0.13 + # 1 [{'CENTROID_ID': 1, 'DISTANCE': 2 Webber... weekday 795 6517 0.16 # 3 rows × 7 columns # [END bigquery_dataframes_bqml_kmeans_predict] From ac348bf397d6a4e540c3ae57b1a01fc714d69509 Mon Sep 17 00:00:00 2001 From: Salem Boyland Date: Wed, 27 Dec 2023 12:07:16 -0600 Subject: [PATCH 22/28] column wrapping --- samples/snippets/create_kmeans_model_test.py | 52 +++++++++++--------- 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/samples/snippets/create_kmeans_model_test.py b/samples/snippets/create_kmeans_model_test.py index 9e2c7c4da4..47afdec02e 100644 --- a/samples/snippets/create_kmeans_model_test.py +++ b/samples/snippets/create_kmeans_model_test.py @@ -13,19 +13,21 @@ # limitations under the License. -def test_kmeans_sample(): +def test_kmeans_sample(project_id: str): + your_gcp_project_id = project_id # [START bigquery_dataframes_bqml_kmeans] import datetime import bigframes import bigframes.pandas as bpd - bigframes.options.bigquery.project = "salemb-testing" - # You must compute in the EU multi-region to query the London bicycles dataset. + bigframes.options.bigquery.project = your_gcp_project_id + # Compute in the EU multi-region to query the London bicycles dataset. bigframes.options.bigquery.location = "EU" - # Extract the information you'll need to train the k-means model later in this tutorial. Use the - # read_gbq function to represent cycle hires data as a DataFrame. + # Extract the information you'll need to train the k-means model in this + # tutorial. Use the read_gbq function to represent cycle hires + # data as a DataFrame. 
h = bpd.read_gbq( "bigquery-public-data.london_bicycles.cycle_hire", col_order=["start_station_name", "start_station_id", "start_date", "duration"], @@ -37,8 +39,9 @@ def test_kmeans_sample(): ) s = bpd.read_gbq( - # Use ST_GEOPOINT and ST_DISTANCE to analyze geographical data. - # These functions determine spatial relationships between the geographical features. + # Use ST_GEOPOINT and ST_DISTANCE to analyze geographical + # data.These functions determine spatial relationships between + # geographical features. """ SELECT id, @@ -51,15 +54,15 @@ def test_kmeans_sample(): """ ) - # Define Python datetime objects in the UTC timezone for range comparison, because BigQuery stores - # timestamp data in the UTC timezone. + # Define Python datetime objects in the UTC timezone for range comparison, + # because BigQuery stores timestamp data in the UTC timezone. sample_time = datetime.datetime(2015, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc) sample_time2 = datetime.datetime(2016, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc) h = h.loc[(h["start_date"] >= sample_time) & (h["start_date"] <= sample_time2)] - # Replace each day-of-the-week number with the corresponding "weekday" or "weekend" label by using the - # Series.map method. + # Replace each day-of-the-week number with the corresponding "weekday" or + # "weekend" label by using the Series.map method. h = h.assign( isweekday=h.start_date.dt.dayofweek.map( { @@ -74,8 +77,8 @@ def test_kmeans_sample(): ) ) - # Supplement each trip in "h" with the station distance information from "s" by - # merging the two DataFrames by station ID. + # Supplement each trip in "h" with the station distance information from + # "s" by merging the two DataFrames by station ID. merged_df = h.merge( right=s, how="inner", @@ -83,8 +86,8 @@ def test_kmeans_sample(): right_on="id", ) - # Engineer features to cluster the stations. For each station, find the average trip duration, number of - # trips, and distance from city center. 
+ # Engineer features to cluster the stations. For each station, find the + # average trip duration, number of trips, and distance from city center. stationstats = merged_df.groupby(["station_name", "isweekday"]).agg( {"duration": ["mean", "count"], "distance_from_city_center": "max"} ) @@ -94,7 +97,7 @@ def test_kmeans_sample(): ).reset_index() # Expected output results: >>> stationstats.head(3) - # station_name isweekday duration num_trips distance_from_city_center + # station_name isweekday duration num_trips distance_from_city_center # Borough Road... weekday 1110 5749 0.12624 # Borough Road... weekend 2125 1774 0.12624 # Webber Street... weekday 795 6517 0.164021 @@ -106,8 +109,10 @@ def test_kmeans_sample(): from bigframes.ml.cluster import KMeans - # To determine an optimal number of clusters, you would run the CREATE MODEL query for different values of - # num_clusters, find the error measure, and pick the point at which the error measure is at its minimum value. + # To determine an optimal number of clusters, construct and fit several + # K-Means objects with different values of num_clusters, find the error + # measure, and pick the point at which the error measure is at its minimum + # value. cluster_model = KMeans(n_clusters=4) cluster_model.fit(stationstats) @@ -115,7 +120,8 @@ def test_kmeans_sample(): # [START bigquery_dataframes_bqml_kmeans_predict] - # Use 'contains' function to predict which clusters contain the stations with string "Kennington". + # Use 'contains' function to predict which clusters contain the stations + # with string "Kennington". stationstats = stationstats.loc[ stationstats["station_name"].str.contains("Kennington") ] @@ -123,10 +129,10 @@ def test_kmeans_sample(): result = cluster_model.predict(stationstats) # Expected output results: >>>results.head(3) - # CENTROID_ID NEAREST_CENTROIDS... station_name isweekday duration num_trips distance... - # 1 [{'CENTROID_ID': 1, 'DISTANCE': 2 Borough... 
weekday 1110 5749 0.13 - # 2 [{'CENTROID_ID': 2, 'DISTANCE': 2 Borough... weekend 2125 1774 0.13 - # 1 [{'CENTROID_ID': 1, 'DISTANCE': 2 Webber... weekday 795 6517 0.16 + # CENTROID... NEAREST... station_name isweekday duration num_trips dist... + # 1 [{'CENTROID_ID'... Borough... weekday 1110 5749 0.13 + # 2 [{'CENTROID_ID'... Borough... weekend 2125 1774 0.13 + # 1 [{'CENTROID_ID'... Webber... weekday 795 6517 0.16 # 3 rows × 7 columns # [END bigquery_dataframes_bqml_kmeans_predict] From 1572ddd2b68452ff2879a5387f3091cd2f7ee466 Mon Sep 17 00:00:00 2001 From: Salem Jorden <115185670+SalemJorden@users.noreply.github.com> Date: Mon, 5 Feb 2024 14:22:06 -0600 Subject: [PATCH 23/28] reset session before running code smaples --- samples/snippets/conftest.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/samples/snippets/conftest.py b/samples/snippets/conftest.py index 1ce54b3c0c..968dac011b 100644 --- a/samples/snippets/conftest.py +++ b/samples/snippets/conftest.py @@ -18,6 +18,8 @@ import pytest import test_utils.prefixer +import bigframes.pandas as bpd + prefixer = test_utils.prefixer.Prefixer( "python-bigquery-dataframes", "samples/snippets" ) @@ -43,6 +45,11 @@ def project_id(bigquery_client: bigquery.Client) -> str: return bigquery_client.project +@pytest.fixture(autouse=True) +def reset_session(): + bpd.reset_session() + + @pytest.fixture(scope="session") def dataset_id(bigquery_client: bigquery.Client, project_id: str) -> Iterator[str]: dataset_id = prefixer.create_prefix() From 3d77ddd06d0baf45ff9d69aec012d3964a244193 Mon Sep 17 00:00:00 2001 From: Salem Jorden <115185670+SalemJorden@users.noreply.github.com> Date: Tue, 6 Feb 2024 14:09:10 -0600 Subject: [PATCH 24/28] Update samples/snippets/create_kmeans_model_test.py Co-authored-by: Tim Swast --- samples/snippets/create_kmeans_model_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/snippets/create_kmeans_model_test.py b/samples/snippets/create_kmeans_model_test.py 
index 47afdec02e..cfc14f56ae 100644 --- a/samples/snippets/create_kmeans_model_test.py +++ b/samples/snippets/create_kmeans_model_test.py @@ -40,7 +40,7 @@ def test_kmeans_sample(project_id: str): s = bpd.read_gbq( # Use ST_GEOPOINT and ST_DISTANCE to analyze geographical - # data.These functions determine spatial relationships between + # data. These functions determine spatial relationships between # geographical features. """ SELECT From 505b79015d6557326dcc5651ea46127c26b847ae Mon Sep 17 00:00:00 2001 From: Salem Jorden <115185670+SalemJorden@users.noreply.github.com> Date: Wed, 7 Feb 2024 12:47:18 -0600 Subject: [PATCH 25/28] predict function added to tutorial --- samples/snippets/create_kmeans_model_test.py | 29 ++++++++++++++------ 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/samples/snippets/create_kmeans_model_test.py b/samples/snippets/create_kmeans_model_test.py index cfc14f56ae..49fe8d0559 100644 --- a/samples/snippets/create_kmeans_model_test.py +++ b/samples/snippets/create_kmeans_model_test.py @@ -61,7 +61,7 @@ def test_kmeans_sample(project_id: str): h = h.loc[(h["start_date"] >= sample_time) & (h["start_date"] <= sample_time2)] - # Replace each day-of-the-week number with the corresponding "weekday" or + # Replace each day-of-the-week number with the corresponding "weekday" or # "weekend" label by using the Series.map method. h = h.assign( isweekday=h.start_date.dt.dayofweek.map( @@ -86,7 +86,7 @@ def test_kmeans_sample(project_id: str): right_on="id", ) - # Engineer features to cluster the stations. For each station, find the + # Engineer features to cluster the stations. For each station, find the # average trip duration, number of trips, and distance from city center. 
stationstats = merged_df.groupby(["station_name", "isweekday"]).agg( {"duration": ["mean", "count"], "distance_from_city_center": "max"} @@ -109,18 +109,29 @@ def test_kmeans_sample(project_id: str): from bigframes.ml.cluster import KMeans - # To determine an optimal number of clusters, construct and fit several + # To determine an optimal number of clusters, construct and fit several # K-Means objects with different values of num_clusters, find the error - # measure, and pick the point at which the error measure is at its minimum + # measure, and pick the point at which the error measure is at its minimum # value. cluster_model = KMeans(n_clusters=4) cluster_model.fit(stationstats) - + cluster_model.to_gbq( + your_gcp_project_id, # For example: "bqml_tutorial.sample_model" + replace=True, + ) # [END bigquery_dataframes_bqml_kmeans_fit] # [START bigquery_dataframes_bqml_kmeans_predict] - # Use 'contains' function to predict which clusters contain the stations + # Select model you'll use for training. `read_gbq_model` loads model data + # from BigQuery, but you could also use the `cluster_model` object from + # previous steps. + cluster_model = bpd.read_gbq_model( + your_gcp_project_id, + # For example: "bqml_tutorial.london_station_clusters", + ) + + # Use 'contains' function to predict which clusters contain the stations # with string "Kennington". stationstats = stationstats.loc[ stationstats["station_name"].str.contains("Kennington") @@ -130,9 +141,9 @@ def test_kmeans_sample(project_id: str): # Expected output results: >>>results.head(3) # CENTROID... NEAREST... station_name isweekday duration num_trips dist... - # 1 [{'CENTROID_ID'... Borough... weekday 1110 5749 0.13 - # 2 [{'CENTROID_ID'... Borough... weekend 2125 1774 0.13 - # 1 [{'CENTROID_ID'... Webber... weekday 795 6517 0.16 + # 1 [{'CENTROID_ID'... Borough... weekday 1110 5749 0.13 + # 2 [{'CENTROID_ID'... Borough... weekend 2125 1774 0.13 + # 1 [{'CENTROID_ID'... Webber... 
weekday 795 6517 0.16 # 3 rows × 7 columns # [END bigquery_dataframes_bqml_kmeans_predict] From 4505c5caa4e69bc459326afcf48f7951d10f23be Mon Sep 17 00:00:00 2001 From: Salem Jorden <115185670+SalemJorden@users.noreply.github.com> Date: Fri, 9 Feb 2024 15:22:42 -0600 Subject: [PATCH 26/28] replaced project_id with model_id --- samples/snippets/conftest.py | 5 +++++ samples/snippets/create_kmeans_model_test.py | 6 +++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/samples/snippets/conftest.py b/samples/snippets/conftest.py index 968dac011b..5e43f079fa 100644 --- a/samples/snippets/conftest.py +++ b/samples/snippets/conftest.py @@ -47,6 +47,11 @@ def project_id(bigquery_client: bigquery.Client) -> str: @pytest.fixture(autouse=True) def reset_session(): + """An autouse fixture ensuring each sample runs in a fresh session. + + This allows us to have samples that query data in different locations. + + """ bpd.reset_session() diff --git a/samples/snippets/create_kmeans_model_test.py b/samples/snippets/create_kmeans_model_test.py index 49fe8d0559..aaaa4e3e6f 100644 --- a/samples/snippets/create_kmeans_model_test.py +++ b/samples/snippets/create_kmeans_model_test.py @@ -1,4 +1,4 @@ -# Copyright 2023 Google LLC +# Copyright 2024 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -116,7 +116,7 @@ def test_kmeans_sample(project_id: str): cluster_model = KMeans(n_clusters=4) cluster_model.fit(stationstats) cluster_model.to_gbq( - your_gcp_project_id, # For example: "bqml_tutorial.sample_model" + "bqml_tutrial.sample_model", # For example: "bqml_tutorial.sample_model" replace=True, ) # [END bigquery_dataframes_bqml_kmeans_fit] @@ -127,7 +127,7 @@ def test_kmeans_sample(project_id: str): # from BigQuery, but you could also use the `cluster_model` object from # previous steps. 
cluster_model = bpd.read_gbq_model( - your_gcp_project_id, + "bqml_tutorial.sample_model", # For example: "bqml_tutorial.london_station_clusters", ) From 3ab8220956d7acf4dbcf7f280815e4e462a6ae2f Mon Sep 17 00:00:00 2001 From: Salem Jorden <115185670+SalemJorden@users.noreply.github.com> Date: Fri, 23 Feb 2024 14:22:11 -0600 Subject: [PATCH 27/28] reformatting --- samples/snippets/conftest.py | 1 - samples/snippets/create_kmeans_model_test.py | 19 ++++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/samples/snippets/conftest.py b/samples/snippets/conftest.py index 5e43f079fa..c8180565e1 100644 --- a/samples/snippets/conftest.py +++ b/samples/snippets/conftest.py @@ -50,7 +50,6 @@ def reset_session(): """An autouse fixture ensuring each sample runs in a fresh session. This allows us to have samples that query data in different locations. - """ bpd.reset_session() diff --git a/samples/snippets/create_kmeans_model_test.py b/samples/snippets/create_kmeans_model_test.py index aaaa4e3e6f..4afbc42971 100644 --- a/samples/snippets/create_kmeans_model_test.py +++ b/samples/snippets/create_kmeans_model_test.py @@ -13,8 +13,9 @@ # limitations under the License. -def test_kmeans_sample(project_id: str): +def test_kmeans_sample(project_id: str, random_model_id: str): your_gcp_project_id = project_id + your_model_id = random_model_id # [START bigquery_dataframes_bqml_kmeans] import datetime @@ -116,30 +117,30 @@ def test_kmeans_sample(project_id: str): cluster_model = KMeans(n_clusters=4) cluster_model.fit(stationstats) cluster_model.to_gbq( - "bqml_tutrial.sample_model", # For example: "bqml_tutorial.sample_model" + your_model_id, # For example: "bqml_tutorial.london_station_clusters" replace=True, ) # [END bigquery_dataframes_bqml_kmeans_fit] # [START bigquery_dataframes_bqml_kmeans_predict] - # Select model you'll use for training. 
`read_gbq_model` loads model data - # from BigQuery, but you could also use the `cluster_model` object from - # previous steps. + # Select model you'll use for predictions. `read_gbq_model` loads model + # data from BigQuery, but you could also use the `cluster_model` object + # from previous steps. cluster_model = bpd.read_gbq_model( - "bqml_tutorial.sample_model", + your_model_id, # For example: "bqml_tutorial.london_station_clusters", ) - # Use 'contains' function to predict which clusters contain the stations - # with string "Kennington". + # Use 'contains' function to filter by stations containing the string + # "Kennington". stationstats = stationstats.loc[ stationstats["station_name"].str.contains("Kennington") ] result = cluster_model.predict(stationstats) - # Expected output results: >>>results.head(3) + # Expected output results: >>>result.peek(3) # CENTROID... NEAREST... station_name isweekday duration num_trips dist... # 1 [{'CENTROID_ID'... Borough... weekday 1110 5749 0.13 # 2 [{'CENTROID_ID'... Borough... weekend 2125 1774 0.13 From ae9a36284108513d08c25cfd9e350400b3cb349b Mon Sep 17 00:00:00 2001 From: Salem Jorden <115185670+SalemJorden@users.noreply.github.com> Date: Mon, 26 Feb 2024 14:27:28 -0600 Subject: [PATCH 28/28] reformat --- samples/snippets/conftest.py | 26 ++++++++++++++++++++ samples/snippets/create_kmeans_model_test.py | 4 +-- 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/samples/snippets/conftest.py b/samples/snippets/conftest.py index c8180565e1..d34837b3e2 100644 --- a/samples/snippets/conftest.py +++ b/samples/snippets/conftest.py @@ -52,6 +52,7 @@ def reset_session(): This allows us to have samples that query data in different locations.
""" bpd.reset_session() + bpd.options.bigquery.location = None @pytest.fixture(scope="session") @@ -64,6 +65,17 @@ def dataset_id(bigquery_client: bigquery.Client, project_id: str) -> Iterator[st bigquery_client.delete_dataset(dataset, delete_contents=True, not_found_ok=True) +@pytest.fixture(scope="session") +def dataset_id_eu(bigquery_client: bigquery.Client, project_id: str) -> Iterator[str]: + dataset_id = prefixer.create_prefix() + full_dataset_id = f"{project_id}.{dataset_id}" + dataset = bigquery.Dataset(full_dataset_id) + dataset.location = "EU" + bigquery_client.create_dataset(dataset) + yield dataset_id + bigquery_client.delete_dataset(dataset, delete_contents=True, not_found_ok=True) + + @pytest.fixture def random_model_id( bigquery_client: bigquery.Client, project_id: str, dataset_id: str @@ -75,3 +87,17 @@ def random_model_id( full_model_id = f"{project_id}.{dataset_id}.{random_model_id}" yield full_model_id bigquery_client.delete_model(full_model_id, not_found_ok=True) + + +@pytest.fixture +def random_model_id_eu( + bigquery_client: bigquery.Client, project_id: str, dataset_id_eu: str +) -> Iterator[str]: + """ + Create a new table ID each time, so random_model_id_eu can be used + as a target for load jobs. + """ + random_model_id_eu = prefixer.create_prefix() + full_model_id = f"{project_id}.{dataset_id_eu}.{random_model_id_eu}" + yield full_model_id + bigquery_client.delete_model(full_model_id, not_found_ok=True) diff --git a/samples/snippets/create_kmeans_model_test.py b/samples/snippets/create_kmeans_model_test.py index 4afbc42971..2429060d09 100644 --- a/samples/snippets/create_kmeans_model_test.py +++ b/samples/snippets/create_kmeans_model_test.py @@ -13,9 +13,9 @@ # limitations under the License. 
-def test_kmeans_sample(project_id: str, random_model_id: str): +def test_kmeans_sample(project_id: str, random_model_id_eu: str): your_gcp_project_id = project_id - your_model_id = random_model_id + your_model_id = random_model_id_eu # [START bigquery_dataframes_bqml_kmeans] import datetime