From 69fe5d7392f03c6ef14d42e8cd6e26be62d409cd Mon Sep 17 00:00:00 2001
From: Salem Boyland <salemb@google.com>
Date: Wed, 6 Dec 2023 12:12:49 -0600
Subject: [PATCH 01/28] k-means code sample

---
 samples/snippets/create_k-means_model | 84 +++++++++++++++++++++++++++
 1 file changed, 84 insertions(+)
 create mode 100644 samples/snippets/create_k-means_model

diff --git a/samples/snippets/create_k-means_model b/samples/snippets/create_k-means_model
new file mode 100644
index 0000000000..fb129f19c0
--- /dev/null
+++ b/samples/snippets/create_k-means_model
@@ -0,0 +1,84 @@
+def test_kmeans_sample():
+   import bigframes.pandas as bpd
+   import bigframes
+   from bigframes import dataframe
+   import datetime
+
+
+   bigframes.options.bigquery.project= "username-testing"
+
+
+   # read_gbq: Loads a DataFrame from BigQuery
+
+
+   h = bpd.read_gbq("bigquery-public-data.london_bicycles.cycle_hire")
+   s= bpd.read_gbq(
+       '''
+       SELECT
+       id,
+       ST_DISTANCE(
+           ST_GEOGPOINT(s.longitude, s.latitude),
+           ST_GEOGPOINT(-0.1, 51.5)
+       ) / 1000 AS distance_from_city_center
+       FROM
+       `bigquery-public-data.london_bicycles.cycle_stations` s
+       '''
+   )
+   # transform the data
+
+
+   h= h.rename(columns={"start_station_name": "station_name","start_station_id": "station_id"} )
+
+
+   h= h[["start_date", "station_name", "station_id", "duration"]]
+
+
+   start_date = datetime.datetime.now()
+
+
+   sample_time = datetime.datetime(2015, 1, 1, 0, 0, 0, tzinfo= datetime.timezone.utc)
+
+
+   sample_time2 = datetime.datetime(2016, 1, 1, 0, 0, 0, tzinfo= datetime.timezone.utc)
+
+
+   h= h.loc[(h["start_date"] >= sample_time) & (h["start_date"] <= sample_time2)]
+
+
+   isweekday = h.start_date.dt.dayofweek.map({0: "weekday", 1: "weekday", 2: "weekday", 3: "weekday",
+                                   4:"weekday",5:"weekend", 6:"weekend"})
+
+
+   # create the dataframe variable
+
+
+   df= bpd.DataFrame()
+
+
+   merged_df = h.merge(
+       right= s,
+       how="inner",
+       left_on= "station_id",
+       right_on= "id",
+   )
+
+
+  stationstats = merged_df.groupby("station_name").agg({"duration":[ "mean","count"] , "distance_from_city_center": "max"})
+
+
+   stationstats.columns=["duration","num_trips","distance_from_city_center"]
+
+
+   stationstats.sort_values(by="distance_from_city_center", ascending=True)
+
+
+   from bigframes.ml.cluster import KMeans
+
+
+   cluster_model = KMeans(n_clusters=4)
+
+
+   cluster_model.fit(stationstats)
+
+
+   predict = cluster_model.predict(stationstats)
\ No newline at end of file

From 5fb1d4f94d798a97deee8e8de46b7640a0e48c3b Mon Sep 17 00:00:00 2001
From: Salem Boyland <salemb@google.com>
Date: Mon, 11 Dec 2023 12:28:11 -0600
Subject: [PATCH 02/28] formatting

---
 samples/snippets/create_k-means_model | 87 +++++++++++++--------------
 1 file changed, 43 insertions(+), 44 deletions(-)

diff --git a/samples/snippets/create_k-means_model b/samples/snippets/create_k-means_model
index fb129f19c0..c1eb05f27a 100644
--- a/samples/snippets/create_k-means_model
+++ b/samples/snippets/create_k-means_model
@@ -1,84 +1,83 @@
 def test_kmeans_sample():
-   import bigframes.pandas as bpd
-   import bigframes
-   from bigframes import dataframe
-   import datetime
+    import bigframes.pandas as bpd
+    import bigframes
+    from bigframes import dataframe
+    import datetime
 
 
-   bigframes.options.bigquery.project= "username-testing"
+    bigframes.options.bigquery.project= "username-testing"
 
 
-   # read_gbq: Loads a DataFrame from BigQuery
+    # read_gbq: Loads a DataFrame from BigQuery
 
+    h = bpd.read_gbq("bigquery-public-data.london_bicycles.cycle_hire")
+    s= bpd.read_gbq(
+        '''
+        SELECT
+        id,
+        ST_DISTANCE(
+            ST_GEOGPOINT(s.longitude, s.latitude),
+            ST_GEOGPOINT(-0.1, 51.5)
+        ) / 1000 AS distance_from_city_center
+        FROM
+        `bigquery-public-data.london_bicycles.cycle_stations` s
+        '''
+    )
+    # transform the data
 
-   h = bpd.read_gbq("bigquery-public-data.london_bicycles.cycle_hire")
-   s= bpd.read_gbq(
-       '''
-       SELECT
-       id,
-       ST_DISTANCE(
-           ST_GEOGPOINT(s.longitude, s.latitude),
-           ST_GEOGPOINT(-0.1, 51.5)
-       ) / 1000 AS distance_from_city_center
-       FROM
-       `bigquery-public-data.london_bicycles.cycle_stations` s
-       '''
-   )
-   # transform the data
 
+    h= h.rename(columns={"start_station_name": "station_name","start_station_id": "station_id"} )
 
-   h= h.rename(columns={"start_station_name": "station_name","start_station_id": "station_id"} )
 
+    h= h[["start_date", "station_name", "station_id", "duration"]]
 
-   h= h[["start_date", "station_name", "station_id", "duration"]]
 
+    start_date = datetime.datetime.now()
 
-   start_date = datetime.datetime.now()
 
+    sample_time = datetime.datetime(2015, 1, 1, 0, 0, 0, tzinfo= datetime.timezone.utc)
 
-   sample_time = datetime.datetime(2015, 1, 1, 0, 0, 0, tzinfo= datetime.timezone.utc)
 
+    sample_time2 = datetime.datetime(2016, 1, 1, 0, 0, 0, tzinfo= datetime.timezone.utc)
 
-   sample_time2 = datetime.datetime(2016, 1, 1, 0, 0, 0, tzinfo= datetime.timezone.utc)
 
+    h= h.loc[(h["start_date"] >= sample_time) & (h["start_date"] <= sample_time2)]
 
-   h= h.loc[(h["start_date"] >= sample_time) & (h["start_date"] <= sample_time2)]
 
+    isweekday = h.start_date.dt.dayofweek.map({0: "weekday", 1: "weekday", 2: "weekday", 3: "weekday",
+                                    4:"weekday",5:"weekend", 6:"weekend"})
 
-   isweekday = h.start_date.dt.dayofweek.map({0: "weekday", 1: "weekday", 2: "weekday", 3: "weekday",
-                                   4:"weekday",5:"weekend", 6:"weekend"})
 
+    # create the dataframe variable
 
-   # create the dataframe variable
 
+    df= bpd.DataFrame()
 
-   df= bpd.DataFrame()
 
+    merged_df = h.merge(
+        right= s,
+        how="inner",
+        left_on= "station_id",
+        right_on= "id",
+    )
 
-   merged_df = h.merge(
-       right= s,
-       how="inner",
-       left_on= "station_id",
-       right_on= "id",
-   )
 
+    stationstats = merged_df.groupby("station_name").agg({"duration":[ "mean","count"] , "distance_from_city_center": "max"})
 
-  stationstats = merged_df.groupby("station_name").agg({"duration":[ "mean","count"] , "distance_from_city_center": "max"})
 
+    stationstats.columns=["duration","num_trips","distance_from_city_center"]
 
-   stationstats.columns=["duration","num_trips","distance_from_city_center"]
 
+    stationstats.sort_values(by="distance_from_city_center", ascending=True)
 
-   stationstats.sort_values(by="distance_from_city_center", ascending=True)
 
+    from bigframes.ml.cluster import KMeans
 
-   from bigframes.ml.cluster import KMeans
 
+    cluster_model = KMeans(n_clusters=4)
 
-   cluster_model = KMeans(n_clusters=4)
 
+    cluster_model.fit(stationstats)
 
-   cluster_model.fit(stationstats)
 
-
-   predict = cluster_model.predict(stationstats)
\ No newline at end of file
+    predict = cluster_model.predict(stationstats)
\ No newline at end of file

From 523255f7c823f6e4e5663025ea0ec27559e0fdff Mon Sep 17 00:00:00 2001
From: Salem Boyland <salemb@google.com>
Date: Mon, 11 Dec 2023 12:57:34 -0600
Subject: [PATCH 03/28] added test

---
 samples/snippets/create_k-means_model | 38 +++++----------------------
 1 file changed, 7 insertions(+), 31 deletions(-)

diff --git a/samples/snippets/create_k-means_model b/samples/snippets/create_k-means_model
index c1eb05f27a..932ae43d44 100644
--- a/samples/snippets/create_k-means_model
+++ b/samples/snippets/create_k-means_model
@@ -4,12 +4,11 @@ def test_kmeans_sample():
     from bigframes import dataframe
     import datetime
 
-
-    bigframes.options.bigquery.project= "username-testing"
+    #NOTE:  ask about line below and whether it is needed outside of colab notebooks
+    #bigframes.options.bigquery.project= "username-testing"
 
 
     # read_gbq: Loads a DataFrame from BigQuery
-
     h = bpd.read_gbq("bigquery-public-data.london_bicycles.cycle_hire")
     s= bpd.read_gbq(
         '''
@@ -24,60 +23,37 @@ def test_kmeans_sample():
         '''
     )
     # transform the data
-
-
     h= h.rename(columns={"start_station_name": "station_name","start_station_id": "station_id"} )
-
-
     h= h[["start_date", "station_name", "station_id", "duration"]]
 
-
     start_date = datetime.datetime.now()
-
-
     sample_time = datetime.datetime(2015, 1, 1, 0, 0, 0, tzinfo= datetime.timezone.utc)
-
-
     sample_time2 = datetime.datetime(2016, 1, 1, 0, 0, 0, tzinfo= datetime.timezone.utc)
 
-
     h= h.loc[(h["start_date"] >= sample_time) & (h["start_date"] <= sample_time2)]
 
-
     isweekday = h.start_date.dt.dayofweek.map({0: "weekday", 1: "weekday", 2: "weekday", 3: "weekday",
                                     4:"weekday",5:"weekend", 6:"weekend"})
 
-
-    # create the dataframe variable
-
-
+    # create the new dataframe variable
     df= bpd.DataFrame()
 
-
     merged_df = h.merge(
         right= s,
         how="inner",
         left_on= "station_id",
         right_on= "id",
     )
-
-
     stationstats = merged_df.groupby("station_name").agg({"duration":[ "mean","count"] , "distance_from_city_center": "max"})
 
-
     stationstats.columns=["duration","num_trips","distance_from_city_center"]
 
-
     stationstats.sort_values(by="distance_from_city_center", ascending=True)
 
-
     from bigframes.ml.cluster import KMeans
-
-
     cluster_model = KMeans(n_clusters=4)
+    cluster_model= cluster_model.fit(stationstats)
 
-
-    cluster_model.fit(stationstats)
-
-
-    predict = cluster_model.predict(stationstats)
\ No newline at end of file
+    def predict_kennington_stations(stationstats):
+        station_filter = "REGEXP_CONTAINS(station_name, 'Kennington')"
+        predict = cluster_model.predict(station_filter)

From 3bb267aecd663a072bf243f8ec5391926ebc050f Mon Sep 17 00:00:00 2001
From: Salem Boyland <salemb@google.com>
Date: Mon, 11 Dec 2023 14:44:19 -0600
Subject: [PATCH 04/28] docs: add code sample for creating kmeans model

---
 ...eans_model => create_kmeans_model_test.py} | 33 ++++++++++---------
 1 file changed, 17 insertions(+), 16 deletions(-)
 rename samples/snippets/{create_k-means_model => create_kmeans_model_test.py} (68%)

diff --git a/samples/snippets/create_k-means_model b/samples/snippets/create_kmeans_model_test.py
similarity index 68%
rename from samples/snippets/create_k-means_model
rename to samples/snippets/create_kmeans_model_test.py
index 932ae43d44..789b1109b3 100644
--- a/samples/snippets/create_k-means_model
+++ b/samples/snippets/create_kmeans_model_test.py
@@ -7,7 +7,6 @@ def test_kmeans_sample():
     #NOTE:  ask about line below and whether it is needed outside of colab notebooks
     #bigframes.options.bigquery.project= "username-testing"
 
-
     # read_gbq: Loads a DataFrame from BigQuery
     h = bpd.read_gbq("bigquery-public-data.london_bicycles.cycle_hire")
     s= bpd.read_gbq(
@@ -26,6 +25,7 @@ def test_kmeans_sample():
     h= h.rename(columns={"start_station_name": "station_name","start_station_id": "station_id"} )
     h= h[["start_date", "station_name", "station_id", "duration"]]
 
+    # NOTE: line below is not accessed, is it needed outside of colab notebook?
     start_date = datetime.datetime.now()
     sample_time = datetime.datetime(2015, 1, 1, 0, 0, 0, tzinfo= datetime.timezone.utc)
     sample_time2 = datetime.datetime(2016, 1, 1, 0, 0, 0, tzinfo= datetime.timezone.utc)
@@ -35,25 +35,26 @@ def test_kmeans_sample():
     isweekday = h.start_date.dt.dayofweek.map({0: "weekday", 1: "weekday", 2: "weekday", 3: "weekday",
                                     4:"weekday",5:"weekend", 6:"weekend"})
 
-    # create the new dataframe variable
-    df= bpd.DataFrame()
-
+    # create the new dataframe variable, stationstats
     merged_df = h.merge(
-        right= s,
-        how="inner",
-        left_on= "station_id",
-        right_on= "id",
-    )
-    stationstats = merged_df.groupby("station_name").agg({"duration":[ "mean","count"] , "distance_from_city_center": "max"})
-
-    stationstats.columns=["duration","num_trips","distance_from_city_center"]
+            right= s,
+            how="inner",
+            left_on= "station_id",
+            right_on= "id",
+            )
 
-    stationstats.sort_values(by="distance_from_city_center", ascending=True)
+    stationstats = merged_df.groupby("station_name").agg({"duration":[ "mean","count"] , "distance_from_city_center": "max"})
+    def station_filter():
+        stationstats.columns = ["duration","num_trips","distance_from_city_center"]
+        stationstats.sort_values(by = "distance_from_city_center", ascending = True)
+        filter = '''REGEXP_CONTAINS(station_name, 'Kennington')'''
 
+    # import the KMeans model to cluster the data
     from bigframes.ml.cluster import KMeans
     cluster_model = KMeans(n_clusters=4)
     cluster_model= cluster_model.fit(stationstats)
 
-    def predict_kennington_stations(stationstats):
-        station_filter = "REGEXP_CONTAINS(station_name, 'Kennington')"
-        predict = cluster_model.predict(station_filter)
+    #the following function predicts the cluster of every station that has the string "Kennington" in its name.
+    def predict_kennington_stations():
+        cluster_model.predict(stationstats)
+        

From 73d2a4681212c1881366c298f44f228bbf208932 Mon Sep 17 00:00:00 2001
From: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
Date: Mon, 11 Dec 2023 20:48:09 +0000
Subject: [PATCH 05/28] =?UTF-8?q?=F0=9F=A6=89=20Updates=20from=20OwlBot=20?=
 =?UTF-8?q?post-processor?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md
---
 samples/snippets/create_kmeans_model_test.py | 68 ++++++++++++--------
 1 file changed, 41 insertions(+), 27 deletions(-)

diff --git a/samples/snippets/create_kmeans_model_test.py b/samples/snippets/create_kmeans_model_test.py
index 789b1109b3..2c6e5712b5 100644
--- a/samples/snippets/create_kmeans_model_test.py
+++ b/samples/snippets/create_kmeans_model_test.py
@@ -1,16 +1,16 @@
 def test_kmeans_sample():
-    import bigframes.pandas as bpd
-    import bigframes
-    from bigframes import dataframe
     import datetime
 
-    #NOTE:  ask about line below and whether it is needed outside of colab notebooks
-    #bigframes.options.bigquery.project= "username-testing"
+    import bigframes
+    from bigframes import dataframe
+    import bigframes.pandas as bpd
 
+    # NOTE:  ask about line below and whether it is needed outside of colab notebooks
+    # bigframes.options.bigquery.project= "username-testing"
     # read_gbq: Loads a DataFrame from BigQuery
     h = bpd.read_gbq("bigquery-public-data.london_bicycles.cycle_hire")
-    s= bpd.read_gbq(
-        '''
+    s = bpd.read_gbq(
+        """
         SELECT
         id,
         ST_DISTANCE(
@@ -19,42 +19,56 @@ def test_kmeans_sample():
         ) / 1000 AS distance_from_city_center
         FROM
         `bigquery-public-data.london_bicycles.cycle_stations` s
-        '''
+        """
     )
     # transform the data
-    h= h.rename(columns={"start_station_name": "station_name","start_station_id": "station_id"} )
-    h= h[["start_date", "station_name", "station_id", "duration"]]
+    h = h.rename(
+        columns={"start_station_name": "station_name", "start_station_id": "station_id"}
+    )
+    h = h[["start_date", "station_name", "station_id", "duration"]]
 
     # NOTE: line below is not accessed, is it needed outside of colab notebook?
     start_date = datetime.datetime.now()
-    sample_time = datetime.datetime(2015, 1, 1, 0, 0, 0, tzinfo= datetime.timezone.utc)
-    sample_time2 = datetime.datetime(2016, 1, 1, 0, 0, 0, tzinfo= datetime.timezone.utc)
+    sample_time = datetime.datetime(2015, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc)
+    sample_time2 = datetime.datetime(2016, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc)
 
-    h= h.loc[(h["start_date"] >= sample_time) & (h["start_date"] <= sample_time2)]
+    h = h.loc[(h["start_date"] >= sample_time) & (h["start_date"] <= sample_time2)]
 
-    isweekday = h.start_date.dt.dayofweek.map({0: "weekday", 1: "weekday", 2: "weekday", 3: "weekday",
-                                    4:"weekday",5:"weekend", 6:"weekend"})
+    isweekday = h.start_date.dt.dayofweek.map(
+        {
+            0: "weekday",
+            1: "weekday",
+            2: "weekday",
+            3: "weekday",
+            4: "weekday",
+            5: "weekend",
+            6: "weekend",
+        }
+    )
 
     # create the new dataframe variable, stationstats
     merged_df = h.merge(
-            right= s,
-            how="inner",
-            left_on= "station_id",
-            right_on= "id",
-            )
+        right=s,
+        how="inner",
+        left_on="station_id",
+        right_on="id",
+    )
+
+    stationstats = merged_df.groupby("station_name").agg(
+        {"duration": ["mean", "count"], "distance_from_city_center": "max"}
+    )
 
-    stationstats = merged_df.groupby("station_name").agg({"duration":[ "mean","count"] , "distance_from_city_center": "max"})
     def station_filter():
-        stationstats.columns = ["duration","num_trips","distance_from_city_center"]
-        stationstats.sort_values(by = "distance_from_city_center", ascending = True)
-        filter = '''REGEXP_CONTAINS(station_name, 'Kennington')'''
+        stationstats.columns = ["duration", "num_trips", "distance_from_city_center"]
+        stationstats.sort_values(by="distance_from_city_center", ascending=True)
+        filter = """REGEXP_CONTAINS(station_name, 'Kennington')"""
 
     # import the KMeans model to cluster the data
     from bigframes.ml.cluster import KMeans
+
     cluster_model = KMeans(n_clusters=4)
-    cluster_model= cluster_model.fit(stationstats)
+    cluster_model = cluster_model.fit(stationstats)
 
-    #the following function predicts the cluster of every station that has the string "Kennington" in its name.
+    # the following function predicts the cluster of every station that has the string "Kennington" in its name.
     def predict_kennington_stations():
         cluster_model.predict(stationstats)
-        

From b3c0578f1059085430b35c87f105567f0d14ed1c Mon Sep 17 00:00:00 2001
From: Salem Boyland <salemb@google.com>
Date: Tue, 12 Dec 2023 14:24:06 -0600
Subject: [PATCH 06/28] license header + region tags added

---
 samples/snippets/create_kmeans_model_test.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/samples/snippets/create_kmeans_model_test.py b/samples/snippets/create_kmeans_model_test.py
index 789b1109b3..7eb0222a8e 100644
--- a/samples/snippets/create_kmeans_model_test.py
+++ b/samples/snippets/create_kmeans_model_test.py
@@ -1,4 +1,20 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 def test_kmeans_sample():
+# [START bigquery_dataframes_bqml_kmeans]
+
     import bigframes.pandas as bpd
     import bigframes
     from bigframes import dataframe
@@ -58,3 +74,4 @@ def station_filter():
     def predict_kennington_stations():
         cluster_model.predict(stationstats)
         
+# [END bigquery_dataframes_bqml_kmeans]
\ No newline at end of file

From db9f43986ef14aabe9fc9e83d6e7d1ffdfc50e91 Mon Sep 17 00:00:00 2001
From: Salem Jorden <115185670+SalemJorden@users.noreply.github.com>
Date: Tue, 12 Dec 2023 14:55:01 -0600
Subject: [PATCH 07/28] Update samples/snippets/create_kmeans_model_test.py

Co-authored-by: Tim Swast <swast@google.com>
---
 samples/snippets/create_kmeans_model_test.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/samples/snippets/create_kmeans_model_test.py b/samples/snippets/create_kmeans_model_test.py
index 83823e1936..702e83035a 100644
--- a/samples/snippets/create_kmeans_model_test.py
+++ b/samples/snippets/create_kmeans_model_test.py
@@ -30,7 +30,12 @@ def test_kmeans_sample():
     # NOTE:  ask about line below and whether it is needed outside of colab notebooks
     # bigframes.options.bigquery.project= "username-testing"
     # read_gbq: Loads a DataFrame from BigQuery
-    h = bpd.read_gbq("bigquery-public-data.london_bicycles.cycle_hire")
+    h = bpd.read_gbq(
+        "bigquery-public-data.london_bicycles.cycle_hire",
+        columns=[
+            "start_station_name", "start_station_id", "start_date", "duration",
+        ],
+    )
     s = bpd.read_gbq(
         """
         SELECT

From e7bd5ef34bcdbb58a76fb13555e4daae940a6713 Mon Sep 17 00:00:00 2001
From: Salem Boyland <salemb@google.com>
Date: Wed, 13 Dec 2023 12:26:12 -0600
Subject: [PATCH 08/28] code corrections resolved

---
 samples/snippets/create_kmeans_model_test.py | 66 +++++++++-----------
 1 file changed, 28 insertions(+), 38 deletions(-)

diff --git a/samples/snippets/create_kmeans_model_test.py b/samples/snippets/create_kmeans_model_test.py
index 83823e1936..8c133ef755 100644
--- a/samples/snippets/create_kmeans_model_test.py
+++ b/samples/snippets/create_kmeans_model_test.py
@@ -13,24 +13,17 @@
 # limitations under the License.
 
 def test_kmeans_sample():
-<<<<<<< HEAD
 # [START bigquery_dataframes_bqml_kmeans]
-
     import bigframes.pandas as bpd
     import bigframes
     from bigframes import dataframe
-=======
->>>>>>> 73d2a4681212c1881366c298f44f228bbf208932
-    import datetime
-
-    import bigframes
-    from bigframes import dataframe
     import bigframes.pandas as bpd
+    import datetime
 
-    # NOTE:  ask about line below and whether it is needed outside of colab notebooks
-    # bigframes.options.bigquery.project= "username-testing"
-    # read_gbq: Loads a DataFrame from BigQuery
-    h = bpd.read_gbq("bigquery-public-data.london_bicycles.cycle_hire")
+    #Load data from BigQuery
+    h = bpd.read_gbq("bigquery-public-data.london_bicycles.cycle_hire",  h.rename(
+        columns = {"start_station_name": "station_name", "start_station_id": "station_id"}
+    ))
     s = bpd.read_gbq(
         """
         SELECT
@@ -41,22 +34,15 @@ def test_kmeans_sample():
         ) / 1000 AS distance_from_city_center
         FROM
         `bigquery-public-data.london_bicycles.cycle_stations` s
-        """
-    )
-    # transform the data
-    h = h.rename(
-        columns={"start_station_name": "station_name", "start_station_id": "station_id"}
-    )
-    h = h[["start_date", "station_name", "station_id", "duration"]]
+        """ )
 
-    # NOTE: line below is not accessed, is it needed outside of colab notebook?
-    start_date = datetime.datetime.now()
+    # transform data into queryable format
     sample_time = datetime.datetime(2015, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc)
     sample_time2 = datetime.datetime(2016, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc)
 
     h = h.loc[(h["start_date"] >= sample_time) & (h["start_date"] <= sample_time2)]
 
-    isweekday = h.start_date.dt.dayofweek.map(
+    h.start_date.dt.dayofweek.map(
         {
             0: "weekday",
             1: "weekday",
@@ -68,34 +54,38 @@ def test_kmeans_sample():
         }
     )
 
-    # create the new dataframe variable, stationstats
+    #merge dataframes h and s
     merged_df = h.merge(
         right=s,
         how="inner",
         left_on="station_id",
         right_on="id",
     )
-
+    # Create new dataframe variable from merge: 'stationstats' 
     stationstats = merged_df.groupby("station_name").agg(
         {"duration": ["mean", "count"], "distance_from_city_center": "max"}
     )
+    # [END bigquery_dataframes_bqml_kmeans]
+    
 
-    def station_filter():
-        stationstats.columns = ["duration", "num_trips", "distance_from_city_center"]
-        stationstats.sort_values(by="distance_from_city_center", ascending=True)
-        filter = """REGEXP_CONTAINS(station_name, 'Kennington')"""
+    # [START bigquery_dataframes_bqml_kmeans_fit]
 
-    # import the KMeans model to cluster the data
+    # import the KMeans model from bigframes.ml to cluster the data
     from bigframes.ml.cluster import KMeans
 
     cluster_model = KMeans(n_clusters=4)
-    cluster_model = cluster_model.fit(stationstats)
+    cluster_model = cluster_model.fit(stationstats).to_gbq(cluster_model)
+
+    # [END bigquery_dataframes_bqml_kmeans_fit]
+    
+    # [START bigquery_dataframes_bqml_kmeans_predict]
+
+    # Use 'contains' function to find all entries with string "Kennington". 
+    stationstats = stationstats.str.contains("Kennington")
+
+    #Predict using the model
+    result = cluster_model.predict(stationstats)
+
+    # [END bigquery_dataframes_bqml_kmeans_predict]
 
-    # the following function predicts the cluster of every station that has the string "Kennington" in its name.
-    def predict_kennington_stations():
-        cluster_model.predict(stationstats)
-<<<<<<< HEAD
-        
-# [END bigquery_dataframes_bqml_kmeans]
-=======
->>>>>>> 73d2a4681212c1881366c298f44f228bbf208932
+    assert result is not None

From 809ed05547914024b22df2f5b9319b31ebe01e29 Mon Sep 17 00:00:00 2001
From: Salem Boyland <salemb@google.com>
Date: Mon, 18 Dec 2023 14:32:32 -0600
Subject: [PATCH 09/28] code corrections commit 1

---
 samples/snippets/create_kmeans_model_test.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/samples/snippets/create_kmeans_model_test.py b/samples/snippets/create_kmeans_model_test.py
index 8c133ef755..780051fca6 100644
--- a/samples/snippets/create_kmeans_model_test.py
+++ b/samples/snippets/create_kmeans_model_test.py
@@ -13,14 +13,14 @@
 # limitations under the License.
 
 def test_kmeans_sample():
-# [START bigquery_dataframes_bqml_kmeans]
-    import bigframes.pandas as bpd
+    # [START bigquery_dataframes_bqml_kmeans]
+    import datetime
+
     import bigframes
-    from bigframes import dataframe
     import bigframes.pandas as bpd
-    import datetime
 
-    #Load data from BigQuery
+    # Load cycle hires data from BigQuery into a dataframe variable using read_gbq function in order to
+    # extract the relevant information needed to train the model later on in tutorial.
     h = bpd.read_gbq("bigquery-public-data.london_bicycles.cycle_hire",  h.rename(
         columns = {"start_station_name": "station_name", "start_station_id": "station_id"}
     ))

From 5dba2b906bdc4bb389eeb62285dce7fe22b94ee8 Mon Sep 17 00:00:00 2001
From: Salem Boyland <salemb@google.com>
Date: Mon, 18 Dec 2023 17:15:46 -0600
Subject: [PATCH 10/28] descriptions of geospatial analysis functions

---
 samples/snippets/create_kmeans_model_test.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/samples/snippets/create_kmeans_model_test.py b/samples/snippets/create_kmeans_model_test.py
index 780051fca6..68bcb6d129 100644
--- a/samples/snippets/create_kmeans_model_test.py
+++ b/samples/snippets/create_kmeans_model_test.py
@@ -25,6 +25,9 @@ def test_kmeans_sample():
         columns = {"start_station_name": "station_name", "start_station_id": "station_id"}
     ))
     s = bpd.read_gbq(
+    # Here we use a SQL query so that we can use the geospatial analytics functions, ST_GEOPOINT
+    # and ST_DISTANCE, which are supported in GoogleSQL for BigQuery. These functions allow us to analyze the 
+    # geographical data and determine spatial relationships between the geographical features.
         """
         SELECT
         id,

From 2207941b16763e69cd5f46ba77abbb4fe9d521de Mon Sep 17 00:00:00 2001
From: Salem Boyland <salemb@google.com>
Date: Mon, 18 Dec 2023 17:29:01 -0600
Subject: [PATCH 11/28] explanations revised for clarity

---
 samples/snippets/create_kmeans_model_test.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/samples/snippets/create_kmeans_model_test.py b/samples/snippets/create_kmeans_model_test.py
index 68bcb6d129..5c1a275379 100644
--- a/samples/snippets/create_kmeans_model_test.py
+++ b/samples/snippets/create_kmeans_model_test.py
@@ -39,12 +39,15 @@ def test_kmeans_sample():
         `bigquery-public-data.london_bicycles.cycle_stations` s
         """ )
 
-    # transform data into queryable format
+    # Here we transform the datetime data into the UTC timezone for standardization because BigQuery priortizes
+    # UTC as the internal format for global analysis.
     sample_time = datetime.datetime(2015, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc)
     sample_time2 = datetime.datetime(2016, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc)
 
     h = h.loc[(h["start_date"] >= sample_time) & (h["start_date"] <= sample_time2)]
-
+    
+    # In this section, we use a mapping function to transform the start_date column by replacing each day-of-the-week
+    # number with the corresponding label ("weekday" or "weekend").
     h.start_date.dt.dayofweek.map(
         {
             0: "weekday",

From 5e00a3c78e4ae91516cdaa05d9930e6e8f7530c5 Mon Sep 17 00:00:00 2001
From: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
Date: Mon, 18 Dec 2023 23:31:27 +0000
Subject: [PATCH 12/28] =?UTF-8?q?=F0=9F=A6=89=20Updates=20from=20OwlBot=20?=
 =?UTF-8?q?post-processor?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md
---
 samples/snippets/create_kmeans_model_test.py | 35 ++++++++++++--------
 1 file changed, 21 insertions(+), 14 deletions(-)

diff --git a/samples/snippets/create_kmeans_model_test.py b/samples/snippets/create_kmeans_model_test.py
index 5c1a275379..3c451e4939 100644
--- a/samples/snippets/create_kmeans_model_test.py
+++ b/samples/snippets/create_kmeans_model_test.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+
 def test_kmeans_sample():
     # [START bigquery_dataframes_bqml_kmeans]
     import datetime
@@ -21,13 +22,19 @@ def test_kmeans_sample():
 
     # Load cycle hires data from BigQuery into a dataframe variable using read_gbq function in order to
     # extract the relevant information needed to train the model later on in tutorial.
-    h = bpd.read_gbq("bigquery-public-data.london_bicycles.cycle_hire",  h.rename(
-        columns = {"start_station_name": "station_name", "start_station_id": "station_id"}
-    ))
+    h = bpd.read_gbq(
+        "bigquery-public-data.london_bicycles.cycle_hire",
+        h.rename(
+            columns={
+                "start_station_name": "station_name",
+                "start_station_id": "station_id",
+            }
+        ),
+    )
     s = bpd.read_gbq(
-    # Here we use a SQL query so that we can use the geospatial analytics functions, ST_GEOPOINT
-    # and ST_DISTANCE, which are supported in GoogleSQL for BigQuery. These functions allow us to analyze the 
-    # geographical data and determine spatial relationships between the geographical features.
+        # Here we use a SQL query so that we can use the geospatial analytics functions, ST_GEOPOINT
+        # and ST_DISTANCE, which are supported in GoogleSQL for BigQuery. These functions allow us to analyze the
+        # geographical data and determine spatial relationships between the geographical features.
         """
         SELECT
         id,
@@ -37,7 +44,8 @@ def test_kmeans_sample():
         ) / 1000 AS distance_from_city_center
         FROM
         `bigquery-public-data.london_bicycles.cycle_stations` s
-        """ )
+        """
+    )
 
     # Here we transform the datetime data into the UTC timezone for standardization because BigQuery priortizes
     # UTC as the internal format for global analysis.
@@ -45,7 +53,7 @@ def test_kmeans_sample():
     sample_time2 = datetime.datetime(2016, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc)
 
     h = h.loc[(h["start_date"] >= sample_time) & (h["start_date"] <= sample_time2)]
-    
+
     # In this section, we use a mapping function to transform the start_date column by replacing each day-of-the-week
     # number with the corresponding label ("weekday" or "weekend").
     h.start_date.dt.dayofweek.map(
@@ -60,19 +68,18 @@ def test_kmeans_sample():
         }
     )
 
-    #merge dataframes h and s
+    # merge dataframes h and s
     merged_df = h.merge(
         right=s,
         how="inner",
         left_on="station_id",
         right_on="id",
     )
-    # Create new dataframe variable from merge: 'stationstats' 
+    # Create new dataframe variable from merge: 'stationstats'
     stationstats = merged_df.groupby("station_name").agg(
         {"duration": ["mean", "count"], "distance_from_city_center": "max"}
     )
     # [END bigquery_dataframes_bqml_kmeans]
-    
 
     # [START bigquery_dataframes_bqml_kmeans_fit]
 
@@ -83,13 +90,13 @@ def test_kmeans_sample():
     cluster_model = cluster_model.fit(stationstats).to_gbq(cluster_model)
 
     # [END bigquery_dataframes_bqml_kmeans_fit]
-    
+
     # [START bigquery_dataframes_bqml_kmeans_predict]
 
-    # Use 'contains' function to find all entries with string "Kennington". 
+    # Use 'contains' function to find all entries with string "Kennington".
     stationstats = stationstats.str.contains("Kennington")
 
-    #Predict using the model
+    # Predict using the model
     result = cluster_model.predict(stationstats)
 
     # [END bigquery_dataframes_bqml_kmeans_predict]

From 7c6422731a76bae2fa4fad51421d833f2e3fa422 Mon Sep 17 00:00:00 2001
From: Salem Jorden <115185670+SalemJorden@users.noreply.github.com>
Date: Tue, 19 Dec 2023 12:44:57 -0600
Subject: [PATCH 13/28] Update samples/snippets/create_kmeans_model_test.py

Co-authored-by: Tim Swast <swast@google.com>
---
 samples/snippets/create_kmeans_model_test.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/samples/snippets/create_kmeans_model_test.py b/samples/snippets/create_kmeans_model_test.py
index 3c451e4939..516a9de5b6 100644
--- a/samples/snippets/create_kmeans_model_test.py
+++ b/samples/snippets/create_kmeans_model_test.py
@@ -20,6 +20,9 @@ def test_kmeans_sample():
     import bigframes
     import bigframes.pandas as bpd
 
+    # You must compute in the EU multi-region to query the London bicycles dataset.
+    bigframes.options.bigquery.location = "EU"
+
     # Load cycle hires data from BigQuery into a dataframe variable using read_gbq function in order to
     # extract the relevant information needed to train the model later on in tutorial.
     h = bpd.read_gbq(

From 11678e06e700dff3d861e25e4a39bc977f26518c Mon Sep 17 00:00:00 2001
From: Salem Boyland <salemb@google.com>
Date: Tue, 19 Dec 2023 13:02:48 -0600
Subject: [PATCH 14/28] code corrections

---
 samples/snippets/create_kmeans_model_test.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/samples/snippets/create_kmeans_model_test.py b/samples/snippets/create_kmeans_model_test.py
index 516a9de5b6..db99ec339e 100644
--- a/samples/snippets/create_kmeans_model_test.py
+++ b/samples/snippets/create_kmeans_model_test.py
@@ -23,11 +23,18 @@ def test_kmeans_sample():
     # You must compute in the EU multi-region to query the London bicycles dataset.
     bigframes.options.bigquery.location = "EU"
 
-    # Load cycle hires data from BigQuery into a dataframe variable using read_gbq function in order to
-    # extract the relevant information needed to train the model later on in tutorial.
+    # Extract the information you'll need to train the k-means model later in this tutorial. Use the
+    # read_gbq function to represent cycle hires data as a DataFrame.
     h = bpd.read_gbq(
         "bigquery-public-data.london_bicycles.cycle_hire",
-        h.rename(
+        col_order =[  
+            "start_station_name",  
+            "start_station_id", 
+            "start_date",
+            "duration"
+        ],
+    )
+    h.rename(
             columns={
                 "start_station_name": "station_name",
                 "start_station_id": "station_id",

From f95cd9fdddb3d63f97cdb0fb4a99f733a87b9c39 Mon Sep 17 00:00:00 2001
From: Salem Boyland <salemb@google.com>
Date: Tue, 19 Dec 2023 15:28:45 -0600
Subject: [PATCH 15/28] code revision

---
 samples/snippets/create_kmeans_model_test.py | 40 ++++++++++----------
 1 file changed, 21 insertions(+), 19 deletions(-)

diff --git a/samples/snippets/create_kmeans_model_test.py b/samples/snippets/create_kmeans_model_test.py
index db99ec339e..4bbafac1c0 100644
--- a/samples/snippets/create_kmeans_model_test.py
+++ b/samples/snippets/create_kmeans_model_test.py
@@ -33,18 +33,16 @@ def test_kmeans_sample():
             "start_date",
             "duration"
         ],
-    )
-    h.rename(
+    ).rename(
             columns={
                 "start_station_name": "station_name",
                 "start_station_id": "station_id",
             }
-        ),
-    )
+        )
+    
     s = bpd.read_gbq(
-        # Here we use a SQL query so that we can use the geospatial analytics functions, ST_GEOPOINT
-        # and ST_DISTANCE, which are supported in GoogleSQL for BigQuery. These functions allow us to analyze the
-        # geographical data and determine spatial relationships between the geographical features.
+        # Use ST_GEOPOINT and ST_DISTANCE to analyze geographical data.
+        # These functions determine spatial relationships between the geographical features.
         """
         SELECT
         id,
@@ -57,16 +55,17 @@ def test_kmeans_sample():
         """
     )
 
-    # Here we transform the datetime data into the UTC timezone for standardization because BigQuery priortizes
-    # UTC as the internal format for global analysis.
+    # Define Python datetime objects in the UTC timezone for range comparison, because BigQuery stores 
+    # timestamp data in the UTC timezone.
     sample_time = datetime.datetime(2015, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc)
     sample_time2 = datetime.datetime(2016, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc)
 
     h = h.loc[(h["start_date"] >= sample_time) & (h["start_date"] <= sample_time2)]
 
-    # In this section, we use a mapping function to transform the start_date column by replacing each day-of-the-week
-    # number with the corresponding label ("weekday" or "weekend").
-    h.start_date.dt.dayofweek.map(
+    # Replace each day-of-the-week number with the corresponding "weekday" or "weekend" label by using the 
+    # Series.map method.
+    h = h.assign(
+        isweekday = h.start_date.dt.dayofweek.map(
         {
             0: "weekday",
             1: "weekday",
@@ -76,26 +75,30 @@ def test_kmeans_sample():
             5: "weekend",
             6: "weekend",
         }
-    )
+    ))
 
-    # merge dataframes h and s
+    # Supplement each trip in "h" with the station distance information from "s" by 
+    # merging the two DataFrames by station ID.
     merged_df = h.merge(
         right=s,
         how="inner",
         left_on="station_id",
         right_on="id",
     )
-    # Create new dataframe variable from merge: 'stationstats'
-    stationstats = merged_df.groupby("station_name").agg(
+
+    # Engineer features to cluster the stations. For each station, find the average trip duration, number of 
+    # trips, and distance from city center.
+    stationstats = merged_df.groupby("station_name", "isweekday").agg(
         {"duration": ["mean", "count"], "distance_from_city_center": "max"}
     )
     # [END bigquery_dataframes_bqml_kmeans]
 
     # [START bigquery_dataframes_bqml_kmeans_fit]
 
-    # import the KMeans model from bigframes.ml to cluster the data
     from bigframes.ml.cluster import KMeans
 
+    # To determine an optimal number of clusters, you would run the CREATE MODEL query for different values of
+    # num_clusters, find the error measure, and pick the point at which the error measure is at its minimum value.
     cluster_model = KMeans(n_clusters=4)
     cluster_model = cluster_model.fit(stationstats).to_gbq(cluster_model)
 
@@ -103,10 +106,9 @@ def test_kmeans_sample():
 
     # [START bigquery_dataframes_bqml_kmeans_predict]
 
-    # Use 'contains' function to find all entries with string "Kennington".
+    # Use 'contains' function to predict which clusters contain the stations with string "Kennington".
     stationstats = stationstats.str.contains("Kennington")
 
-    # Predict using the model
     result = cluster_model.predict(stationstats)
 
     # [END bigquery_dataframes_bqml_kmeans_predict]

From 0df2dec28e17584bc285f305caf68c7988acfb2c Mon Sep 17 00:00:00 2001
From: Salem Boyland <salemb@google.com>
Date: Wed, 20 Dec 2023 14:46:30 -0600
Subject: [PATCH 16/28] code changes

---
 samples/snippets/create_kmeans_model_test.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/samples/snippets/create_kmeans_model_test.py b/samples/snippets/create_kmeans_model_test.py
index 4bbafac1c0..8cf928eedf 100644
--- a/samples/snippets/create_kmeans_model_test.py
+++ b/samples/snippets/create_kmeans_model_test.py
@@ -88,9 +88,13 @@ def test_kmeans_sample():
 
     # Engineer features to cluster the stations. For each station, find the average trip duration, number of 
     # trips, and distance from city center.
-    stationstats = merged_df.groupby("station_name", "isweekday").agg(
+    stationstats = merged_df.groupby(["station_name", "isweekday"]).agg(
         {"duration": ["mean", "count"], "distance_from_city_center": "max"}
     )
+    stationstats.columns=["duration","num_trips","distance_from_city_center"]
+    stationstats.sort_values(by="distance_from_city_center", ascending=True)
+    #Expected output looks as follows
+
     # [END bigquery_dataframes_bqml_kmeans]
 
     # [START bigquery_dataframes_bqml_kmeans_fit]
@@ -100,7 +104,7 @@ def test_kmeans_sample():
     # To determine an optimal number of clusters, you would run the CREATE MODEL query for different values of
     # num_clusters, find the error measure, and pick the point at which the error measure is at its minimum value.
     cluster_model = KMeans(n_clusters=4)
-    cluster_model = cluster_model.fit(stationstats).to_gbq(cluster_model)
+    cluster_model.fit(stationstats)
 
     # [END bigquery_dataframes_bqml_kmeans_fit]
 

From 06a24905aae918e2a8a33a618547bce80586651d Mon Sep 17 00:00:00 2001
From: Salem Boyland <salemb@google.com>
Date: Thu, 21 Dec 2023 15:27:27 -0600
Subject: [PATCH 17/28] revisions

---
 samples/snippets/create_kmeans_model_test.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/samples/snippets/create_kmeans_model_test.py b/samples/snippets/create_kmeans_model_test.py
index 8cf928eedf..a3afbf3644 100644
--- a/samples/snippets/create_kmeans_model_test.py
+++ b/samples/snippets/create_kmeans_model_test.py
@@ -20,6 +20,7 @@ def test_kmeans_sample():
     import bigframes
     import bigframes.pandas as bpd
 
+    bigframes.options.bigquery.project = "salemb-testing"
     # You must compute in the EU multi-region to query the London bicycles dataset.
     bigframes.options.bigquery.location = "EU"
 
@@ -89,7 +90,7 @@ def test_kmeans_sample():
     # Engineer features to cluster the stations. For each station, find the average trip duration, number of 
     # trips, and distance from city center.
     stationstats = merged_df.groupby(["station_name", "isweekday"]).agg(
-        {"duration": ["mean", "count"], "distance_from_city_center": "max"}
+    {"duration": ["mean", "count"], "distance_from_city_center": "max"}
     )
     stationstats.columns=["duration","num_trips","distance_from_city_center"]
     stationstats.sort_values(by="distance_from_city_center", ascending=True)
@@ -111,9 +112,10 @@ def test_kmeans_sample():
     # [START bigquery_dataframes_bqml_kmeans_predict]
 
     # Use 'contains' function to predict which clusters contain the stations with string "Kennington".
-    stationstats = stationstats.str.contains("Kennington")
+    stationstats = stationstats.contains("Kennington")
 
     result = cluster_model.predict(stationstats)
+    #Expected output results:
 
     # [END bigquery_dataframes_bqml_kmeans_predict]
 

From 1a9f7d9ff8fef3951c9a93f6d49dbff577cd4c98 Mon Sep 17 00:00:00 2001
From: Salem Boyland <salemb@google.com>
Date: Tue, 26 Dec 2023 12:47:30 -0600
Subject: [PATCH 18/28] expected output previews

---
 samples/snippets/create_kmeans_model_test.py | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/samples/snippets/create_kmeans_model_test.py b/samples/snippets/create_kmeans_model_test.py
index a3afbf3644..65f2408ee3 100644
--- a/samples/snippets/create_kmeans_model_test.py
+++ b/samples/snippets/create_kmeans_model_test.py
@@ -94,7 +94,14 @@ def test_kmeans_sample():
     )
     stationstats.columns=["duration","num_trips","distance_from_city_center"]
     stationstats.sort_values(by="distance_from_city_center", ascending=True)
-    #Expected output looks as follows
+
+#Expected output results: >>> stationstats.head(3)
+#                                                          duration	        num_trips	    distance_from_city_center
+#       station_name	                    isweekday			
+#       Abbey Orchard Street, Westminster	weekday	    1139.686075	        14908	                2.231931
+#                                           weekend	    1538.533802	        2278	                2.231931
+#       Abbotsbury Road, Holland Park	    weekday	    1110.262258	2631	7.338276
+# 3 rows × 3 columns
 
     # [END bigquery_dataframes_bqml_kmeans]
 
@@ -115,7 +122,12 @@ def test_kmeans_sample():
     stationstats = stationstats.contains("Kennington")
 
     result = cluster_model.predict(stationstats)
-    #Expected output results:
+    #Expected output results:   >>>results.head(2)
+    #                                                    CENTROID_ID     NEAREST_CENTROIDS_DISTANCE	          duration	num_trips	distance_from_city_center
+    #                   station_name	    isweekday					
+    #   Abbey Orchard Street, Westminster	weekday        2	        [{'CENTROID_ID': 2, 'DISTANCE': 0.695970380477...	1139.686075	14908	2.231931
+    #                                       weekend	       1	        [{'CENTROID_ID': 1, 'DISTANCE': 0.467343170961...	1538.533802	2278	2.231931
+    # 2 rows × 5 columns
 
     # [END bigquery_dataframes_bqml_kmeans_predict]
 

From 464cf1c285912c0da654997a0dc4af8b859fedc2 Mon Sep 17 00:00:00 2001
From: Salem Boyland <salemb@google.com>
Date: Tue, 26 Dec 2023 12:57:08 -0600
Subject: [PATCH 19/28] revisions

---
 samples/snippets/create_kmeans_model_test.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/samples/snippets/create_kmeans_model_test.py b/samples/snippets/create_kmeans_model_test.py
index 65f2408ee3..915f519a2d 100644
--- a/samples/snippets/create_kmeans_model_test.py
+++ b/samples/snippets/create_kmeans_model_test.py
@@ -97,7 +97,8 @@ def test_kmeans_sample():
 
 #Expected output results: >>> stationstats.head(3)
 #                                                          duration	        num_trips	    distance_from_city_center
-#       station_name	                    isweekday			
+#       station_name	                    isweekday
+#			
 #       Abbey Orchard Street, Westminster	weekday	    1139.686075	        14908	                2.231931
 #                                           weekend	    1538.533802	        2278	                2.231931
 #       Abbotsbury Road, Holland Park	    weekday	    1110.262258	2631	7.338276
@@ -119,14 +120,15 @@ def test_kmeans_sample():
     # [START bigquery_dataframes_bqml_kmeans_predict]
 
     # Use 'contains' function to predict which clusters contain the stations with string "Kennington".
-    stationstats = stationstats.contains("Kennington")
+    stationstats = stationstats.str.contains("Kennington")
 
     result = cluster_model.predict(stationstats)
     #Expected output results:   >>>results.head(2)
-    #                                                    CENTROID_ID     NEAREST_CENTROIDS_DISTANCE	          duration	num_trips	distance_from_city_center
-    #                   station_name	    isweekday					
-    #   Abbey Orchard Street, Westminster	weekday        2	        [{'CENTROID_ID': 2, 'DISTANCE': 0.695970380477...	1139.686075	14908	2.231931
-    #                                       weekend	       1	        [{'CENTROID_ID': 1, 'DISTANCE': 0.467343170961...	1538.533802	2278	2.231931
+    #                                                    CENTROID_ID     NEAREST_CENTROIDS_DISTANCE	                             duration	                num_trips	    distance_from_city_center
+    #                   station_name	    isweekday
+    #					
+    #   Abbey Orchard Street, Westminster	weekday            2	      [{'CENTROID_ID': 2, 'DISTANCE': 0.695970380477...	    1139.686075	                  14908	            2.231931
+    #                                       weekend	           1	      [{'CENTROID_ID': 1, 'DISTANCE': 0.467343170961...	    1538.533802	                  2278	            2.231931
     # 2 rows × 5 columns
 
     # [END bigquery_dataframes_bqml_kmeans_predict]

From 019e2432c4879a7f6837d2cf9acbfdf4c6c8954c Mon Sep 17 00:00:00 2001
From: Salem Boyland <salemb@google.com>
Date: Tue, 26 Dec 2023 13:54:20 -0600
Subject: [PATCH 20/28] tests passing, expected output characters >80

---
 samples/snippets/create_kmeans_model_test.py | 92 ++++++++++----------
 1 file changed, 45 insertions(+), 47 deletions(-)

diff --git a/samples/snippets/create_kmeans_model_test.py b/samples/snippets/create_kmeans_model_test.py
index 915f519a2d..9e2c7c4da4 100644
--- a/samples/snippets/create_kmeans_model_test.py
+++ b/samples/snippets/create_kmeans_model_test.py
@@ -28,19 +28,14 @@ def test_kmeans_sample():
     # read_gbq function to represent cycle hires data as a DataFrame.
     h = bpd.read_gbq(
         "bigquery-public-data.london_bicycles.cycle_hire",
-        col_order =[  
-            "start_station_name",  
-            "start_station_id", 
-            "start_date",
-            "duration"
-        ],
+        col_order=["start_station_name", "start_station_id", "start_date", "duration"],
     ).rename(
-            columns={
-                "start_station_name": "station_name",
-                "start_station_id": "station_id",
-            }
-        )
-    
+        columns={
+            "start_station_name": "station_name",
+            "start_station_id": "station_id",
+        }
+    )
+
     s = bpd.read_gbq(
         # Use ST_GEOPOINT and ST_DISTANCE to analyze geographical data.
         # These functions determine spatial relationships between the geographical features.
@@ -56,29 +51,30 @@ def test_kmeans_sample():
         """
     )
 
-    # Define Python datetime objects in the UTC timezone for range comparison, because BigQuery stores 
+    # Define Python datetime objects in the UTC timezone for range comparison, because BigQuery stores
     # timestamp data in the UTC timezone.
     sample_time = datetime.datetime(2015, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc)
     sample_time2 = datetime.datetime(2016, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc)
 
     h = h.loc[(h["start_date"] >= sample_time) & (h["start_date"] <= sample_time2)]
 
-    # Replace each day-of-the-week number with the corresponding "weekday" or "weekend" label by using the 
+    # Replace each day-of-the-week number with the corresponding "weekday" or "weekend" label by using the
     # Series.map method.
     h = h.assign(
-        isweekday = h.start_date.dt.dayofweek.map(
-        {
-            0: "weekday",
-            1: "weekday",
-            2: "weekday",
-            3: "weekday",
-            4: "weekday",
-            5: "weekend",
-            6: "weekend",
-        }
-    ))
+        isweekday=h.start_date.dt.dayofweek.map(
+            {
+                0: "weekday",
+                1: "weekday",
+                2: "weekday",
+                3: "weekday",
+                4: "weekday",
+                5: "weekend",
+                6: "weekend",
+            }
+        )
+    )
 
-    # Supplement each trip in "h" with the station distance information from "s" by 
+    # Supplement each trip in "h" with the station distance information from "s" by
     # merging the two DataFrames by station ID.
     merged_df = h.merge(
         right=s,
@@ -87,22 +83,22 @@ def test_kmeans_sample():
         right_on="id",
     )
 
-    # Engineer features to cluster the stations. For each station, find the average trip duration, number of 
+    # Engineer features to cluster the stations. For each station, find the average trip duration, number of
     # trips, and distance from city center.
     stationstats = merged_df.groupby(["station_name", "isweekday"]).agg(
-    {"duration": ["mean", "count"], "distance_from_city_center": "max"}
+        {"duration": ["mean", "count"], "distance_from_city_center": "max"}
     )
-    stationstats.columns=["duration","num_trips","distance_from_city_center"]
-    stationstats.sort_values(by="distance_from_city_center", ascending=True)
-
-#Expected output results: >>> stationstats.head(3)
-#                                                          duration	        num_trips	    distance_from_city_center
-#       station_name	                    isweekday
-#			
-#       Abbey Orchard Street, Westminster	weekday	    1139.686075	        14908	                2.231931
-#                                           weekend	    1538.533802	        2278	                2.231931
-#       Abbotsbury Road, Holland Park	    weekday	    1110.262258	2631	7.338276
-# 3 rows × 3 columns
+    stationstats.columns = ["duration", "num_trips", "distance_from_city_center"]
+    stationstats = stationstats.sort_values(
+        by="distance_from_city_center", ascending=True
+    ).reset_index()
+
+    # Expected output results: >>> stationstats.head(3)
+    # station_name	   isweekday	duration	num_trips	distance_from_city_center
+    # Borough Road...	weekday	    1110	    5749	    0.12624
+    # Borough Road...	weekend	    2125	    1774	    0.12624
+    # Webber Street...	weekday	    795	        6517	    0.164021
+    #   3 rows × 5 columns
 
     # [END bigquery_dataframes_bqml_kmeans]
 
@@ -120,16 +116,18 @@ def test_kmeans_sample():
     # [START bigquery_dataframes_bqml_kmeans_predict]
 
     # Use 'contains' function to predict which clusters contain the stations with string "Kennington".
-    stationstats = stationstats.str.contains("Kennington")
+    stationstats = stationstats.loc[
+        stationstats["station_name"].str.contains("Kennington")
+    ]
 
     result = cluster_model.predict(stationstats)
-    #Expected output results:   >>>results.head(2)
-    #                                                    CENTROID_ID     NEAREST_CENTROIDS_DISTANCE	                             duration	                num_trips	    distance_from_city_center
-    #                   station_name	    isweekday
-    #					
-    #   Abbey Orchard Street, Westminster	weekday            2	      [{'CENTROID_ID': 2, 'DISTANCE': 0.695970380477...	    1139.686075	                  14908	            2.231931
-    #                                       weekend	           1	      [{'CENTROID_ID': 1, 'DISTANCE': 0.467343170961...	    1538.533802	                  2278	            2.231931
-    # 2 rows × 5 columns
+
+    # Expected output results:   >>>results.head(3)
+    # CENTROID_ID	NEAREST_CENTROIDS...	    station_name    isweekday	duration	num_trips	distance...
+    #	1	   [{'CENTROID_ID': 1, 'DISTANCE': 2	Borough...	    weekday	    1110	    5749	    0.13
+    #	2	   [{'CENTROID_ID': 2, 'DISTANCE': 2	Borough...	    weekend	    2125        1774	    0.13
+    #	1	   [{'CENTROID_ID': 1, 'DISTANCE': 2	Webber...	    weekday	    795	        6517	    0.16
+    #   3 rows × 7 columns
 
     # [END bigquery_dataframes_bqml_kmeans_predict]
 

From 72174f90fa795710fd803cb9453460fab1fcba13 Mon Sep 17 00:00:00 2001
From: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
Date: Tue, 26 Dec 2023 19:56:36 +0000
Subject: [PATCH 21/28] =?UTF-8?q?=F0=9F=A6=89=20Updates=20from=20OwlBot=20?=
 =?UTF-8?q?post-processor?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md
---
 samples/snippets/create_kmeans_model_test.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/samples/snippets/create_kmeans_model_test.py b/samples/snippets/create_kmeans_model_test.py
index 9e2c7c4da4..854f642e41 100644
--- a/samples/snippets/create_kmeans_model_test.py
+++ b/samples/snippets/create_kmeans_model_test.py
@@ -124,9 +124,9 @@ def test_kmeans_sample():
 
     # Expected output results:   >>>results.head(3)
     # CENTROID_ID	NEAREST_CENTROIDS...	    station_name    isweekday	duration	num_trips	distance...
-    #	1	   [{'CENTROID_ID': 1, 'DISTANCE': 2	Borough...	    weekday	    1110	    5749	    0.13
-    #	2	   [{'CENTROID_ID': 2, 'DISTANCE': 2	Borough...	    weekend	    2125        1774	    0.13
-    #	1	   [{'CENTROID_ID': 1, 'DISTANCE': 2	Webber...	    weekday	    795	        6517	    0.16
+    # 	1	   [{'CENTROID_ID': 1, 'DISTANCE': 2	Borough...	    weekday	    1110	    5749	    0.13
+    # 	2	   [{'CENTROID_ID': 2, 'DISTANCE': 2	Borough...	    weekend	    2125        1774	    0.13
+    # 	1	   [{'CENTROID_ID': 1, 'DISTANCE': 2	Webber...	    weekday	    795	        6517	    0.16
     #   3 rows × 7 columns
 
     # [END bigquery_dataframes_bqml_kmeans_predict]

From ac348bf397d6a4e540c3ae57b1a01fc714d69509 Mon Sep 17 00:00:00 2001
From: Salem Boyland <salemb@google.com>
Date: Wed, 27 Dec 2023 12:07:16 -0600
Subject: [PATCH 22/28] column wrapping

---
 samples/snippets/create_kmeans_model_test.py | 52 +++++++++++---------
 1 file changed, 29 insertions(+), 23 deletions(-)

diff --git a/samples/snippets/create_kmeans_model_test.py b/samples/snippets/create_kmeans_model_test.py
index 9e2c7c4da4..47afdec02e 100644
--- a/samples/snippets/create_kmeans_model_test.py
+++ b/samples/snippets/create_kmeans_model_test.py
@@ -13,19 +13,21 @@
 # limitations under the License.
 
 
-def test_kmeans_sample():
+def test_kmeans_sample(project_id: str):
+    your_gcp_project_id = project_id
     # [START bigquery_dataframes_bqml_kmeans]
     import datetime
 
     import bigframes
     import bigframes.pandas as bpd
 
-    bigframes.options.bigquery.project = "salemb-testing"
-    # You must compute in the EU multi-region to query the London bicycles dataset.
+    bigframes.options.bigquery.project = your_gcp_project_id
+    # Compute in the EU multi-region to query the London bicycles dataset.
     bigframes.options.bigquery.location = "EU"
 
-    # Extract the information you'll need to train the k-means model later in this tutorial. Use the
-    # read_gbq function to represent cycle hires data as a DataFrame.
+    # Extract the information you'll need to train the k-means model in this
+    # tutorial. Use the read_gbq function to represent cycle hires
+    # data as a DataFrame.
     h = bpd.read_gbq(
         "bigquery-public-data.london_bicycles.cycle_hire",
         col_order=["start_station_name", "start_station_id", "start_date", "duration"],
@@ -37,8 +39,9 @@ def test_kmeans_sample():
     )
 
     s = bpd.read_gbq(
-        # Use ST_GEOPOINT and ST_DISTANCE to analyze geographical data.
-        # These functions determine spatial relationships between the geographical features.
+        # Use ST_GEOPOINT and ST_DISTANCE to analyze geographical
+        # data.These functions determine spatial relationships between
+        # geographical features.
         """
         SELECT
         id,
@@ -51,15 +54,15 @@ def test_kmeans_sample():
         """
     )
 
-    # Define Python datetime objects in the UTC timezone for range comparison, because BigQuery stores
-    # timestamp data in the UTC timezone.
+    # Define Python datetime objects in the UTC timezone for range comparison,
+    # because BigQuery stores timestamp data in the UTC timezone.
     sample_time = datetime.datetime(2015, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc)
     sample_time2 = datetime.datetime(2016, 1, 1, 0, 0, 0, tzinfo=datetime.timezone.utc)
 
     h = h.loc[(h["start_date"] >= sample_time) & (h["start_date"] <= sample_time2)]
 
-    # Replace each day-of-the-week number with the corresponding "weekday" or "weekend" label by using the
-    # Series.map method.
+    # Replace each day-of-the-week number with the corresponding "weekday" or 
+    # "weekend" label by using the Series.map method.
     h = h.assign(
         isweekday=h.start_date.dt.dayofweek.map(
             {
@@ -74,8 +77,8 @@ def test_kmeans_sample():
         )
     )
 
-    # Supplement each trip in "h" with the station distance information from "s" by
-    # merging the two DataFrames by station ID.
+    # Supplement each trip in "h" with the station distance information from
+    # "s" by merging the two DataFrames by station ID.
     merged_df = h.merge(
         right=s,
         how="inner",
@@ -83,8 +86,8 @@ def test_kmeans_sample():
         right_on="id",
     )
 
-    # Engineer features to cluster the stations. For each station, find the average trip duration, number of
-    # trips, and distance from city center.
+    # Engineer features to cluster the stations. For each station, find the 
+    # average trip duration, number of trips, and distance from city center.
     stationstats = merged_df.groupby(["station_name", "isweekday"]).agg(
         {"duration": ["mean", "count"], "distance_from_city_center": "max"}
     )
@@ -94,7 +97,7 @@ def test_kmeans_sample():
     ).reset_index()
 
     # Expected output results: >>> stationstats.head(3)
-    # station_name	   isweekday	duration	num_trips	distance_from_city_center
+    # station_name	isweekday duration  num_trips	distance_from_city_center
     # Borough Road...	weekday	    1110	    5749	    0.12624
     # Borough Road...	weekend	    2125	    1774	    0.12624
     # Webber Street...	weekday	    795	        6517	    0.164021
@@ -106,8 +109,10 @@ def test_kmeans_sample():
 
     from bigframes.ml.cluster import KMeans
 
-    # To determine an optimal number of clusters, you would run the CREATE MODEL query for different values of
-    # num_clusters, find the error measure, and pick the point at which the error measure is at its minimum value.
+    # To determine an optimal number of clusters, construct and fit several 
+    # K-Means objects with different values of num_clusters, find the error
+    # measure, and pick the point at which the error measure is at its minimum 
+    # value.
     cluster_model = KMeans(n_clusters=4)
     cluster_model.fit(stationstats)
 
@@ -115,7 +120,8 @@ def test_kmeans_sample():
 
     # [START bigquery_dataframes_bqml_kmeans_predict]
 
-    # Use 'contains' function to predict which clusters contain the stations with string "Kennington".
+    # Use 'contains' function to predict which clusters contain the stations 
+    # with string "Kennington".
     stationstats = stationstats.loc[
         stationstats["station_name"].str.contains("Kennington")
     ]
@@ -123,10 +129,10 @@ def test_kmeans_sample():
     result = cluster_model.predict(stationstats)
 
     # Expected output results:   >>>results.head(3)
-    # CENTROID_ID	NEAREST_CENTROIDS...	    station_name    isweekday	duration	num_trips	distance...
-    #	1	   [{'CENTROID_ID': 1, 'DISTANCE': 2	Borough...	    weekday	    1110	    5749	    0.13
-    #	2	   [{'CENTROID_ID': 2, 'DISTANCE': 2	Borough...	    weekend	    2125        1774	    0.13
-    #	1	   [{'CENTROID_ID': 1, 'DISTANCE': 2	Webber...	    weekday	    795	        6517	    0.16
+    # CENTROID...	NEAREST...	station_name  isweekday	 duration num_trips dist...
+    #	1	[{'CENTROID_ID'...	Borough...	  weekday	  1110	    5749	0.13
+    #	2	[{'CENTROID_ID'...	Borough...	  weekend	  2125      1774	0.13
+    #	1	[{'CENTROID_ID'...	Webber...	  weekday	  795	    6517	0.16
     #   3 rows × 7 columns
 
     # [END bigquery_dataframes_bqml_kmeans_predict]

From 1572ddd2b68452ff2879a5387f3091cd2f7ee466 Mon Sep 17 00:00:00 2001
From: Salem Jorden <115185670+SalemJorden@users.noreply.github.com>
Date: Mon, 5 Feb 2024 14:22:06 -0600
Subject: [PATCH 23/28] reset session before running code samples

---
 samples/snippets/conftest.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/samples/snippets/conftest.py b/samples/snippets/conftest.py
index 1ce54b3c0c..968dac011b 100644
--- a/samples/snippets/conftest.py
+++ b/samples/snippets/conftest.py
@@ -18,6 +18,8 @@
 import pytest
 import test_utils.prefixer
 
+import bigframes.pandas as bpd
+
 prefixer = test_utils.prefixer.Prefixer(
     "python-bigquery-dataframes", "samples/snippets"
 )
@@ -43,6 +45,11 @@ def project_id(bigquery_client: bigquery.Client) -> str:
     return bigquery_client.project
 
 
+@pytest.fixture(autouse=True)
+def reset_session():
+    bpd.reset_session()
+
+
 @pytest.fixture(scope="session")
 def dataset_id(bigquery_client: bigquery.Client, project_id: str) -> Iterator[str]:
     dataset_id = prefixer.create_prefix()

From 3d77ddd06d0baf45ff9d69aec012d3964a244193 Mon Sep 17 00:00:00 2001
From: Salem Jorden <115185670+SalemJorden@users.noreply.github.com>
Date: Tue, 6 Feb 2024 14:09:10 -0600
Subject: [PATCH 24/28] Update samples/snippets/create_kmeans_model_test.py

Co-authored-by: Tim Swast <swast@google.com>
---
 samples/snippets/create_kmeans_model_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/samples/snippets/create_kmeans_model_test.py b/samples/snippets/create_kmeans_model_test.py
index 47afdec02e..cfc14f56ae 100644
--- a/samples/snippets/create_kmeans_model_test.py
+++ b/samples/snippets/create_kmeans_model_test.py
@@ -40,7 +40,7 @@ def test_kmeans_sample(project_id: str):
 
     s = bpd.read_gbq(
         # Use ST_GEOPOINT and ST_DISTANCE to analyze geographical
-        # data.These functions determine spatial relationships between
+        # data. These functions determine spatial relationships between
         # geographical features.
         """
         SELECT

From 505b79015d6557326dcc5651ea46127c26b847ae Mon Sep 17 00:00:00 2001
From: Salem Jorden <115185670+SalemJorden@users.noreply.github.com>
Date: Wed, 7 Feb 2024 12:47:18 -0600
Subject: [PATCH 25/28] predict function added to tutorial

---
 samples/snippets/create_kmeans_model_test.py | 29 ++++++++++++++------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/samples/snippets/create_kmeans_model_test.py b/samples/snippets/create_kmeans_model_test.py
index cfc14f56ae..49fe8d0559 100644
--- a/samples/snippets/create_kmeans_model_test.py
+++ b/samples/snippets/create_kmeans_model_test.py
@@ -61,7 +61,7 @@ def test_kmeans_sample(project_id: str):
 
     h = h.loc[(h["start_date"] >= sample_time) & (h["start_date"] <= sample_time2)]
 
-    # Replace each day-of-the-week number with the corresponding "weekday" or 
+    # Replace each day-of-the-week number with the corresponding "weekday" or
     # "weekend" label by using the Series.map method.
     h = h.assign(
         isweekday=h.start_date.dt.dayofweek.map(
@@ -86,7 +86,7 @@ def test_kmeans_sample(project_id: str):
         right_on="id",
     )
 
-    # Engineer features to cluster the stations. For each station, find the 
+    # Engineer features to cluster the stations. For each station, find the
     # average trip duration, number of trips, and distance from city center.
     stationstats = merged_df.groupby(["station_name", "isweekday"]).agg(
         {"duration": ["mean", "count"], "distance_from_city_center": "max"}
@@ -109,18 +109,29 @@ def test_kmeans_sample(project_id: str):
 
     from bigframes.ml.cluster import KMeans
 
-    # To determine an optimal number of clusters, construct and fit several 
+    # To determine an optimal number of clusters, construct and fit several
     # K-Means objects with different values of num_clusters, find the error
-    # measure, and pick the point at which the error measure is at its minimum 
+    # measure, and pick the point at which the error measure is at its minimum
     # value.
     cluster_model = KMeans(n_clusters=4)
     cluster_model.fit(stationstats)
-
+    cluster_model.to_gbq(
+        your_gcp_project_id,  # For example: "bqml_tutorial.sample_model"
+        replace=True,
+    )
     # [END bigquery_dataframes_bqml_kmeans_fit]
 
     # [START bigquery_dataframes_bqml_kmeans_predict]
 
-    # Use 'contains' function to predict which clusters contain the stations 
+    # Select model you'll use for training. `read_gbq_model` loads model data
+    # from BigQuery, but you could also use the `cluster_model` object from
+    # previous steps.
+    cluster_model = bpd.read_gbq_model(
+        your_gcp_project_id,
+        # For example: "bqml_tutorial.london_station_clusters",
+    )
+
+    # Use 'contains' function to predict which clusters contain the stations
     # with string "Kennington".
     stationstats = stationstats.loc[
         stationstats["station_name"].str.contains("Kennington")
@@ -130,9 +141,9 @@ def test_kmeans_sample(project_id: str):
 
     # Expected output results:   >>>results.head(3)
     # CENTROID...	NEAREST...	station_name  isweekday	 duration num_trips dist...
-    #	1	[{'CENTROID_ID'...	Borough...	  weekday	  1110	    5749	0.13
-    #	2	[{'CENTROID_ID'...	Borough...	  weekend	  2125      1774	0.13
-    #	1	[{'CENTROID_ID'...	Webber...	  weekday	  795	    6517	0.16
+    # 	1	[{'CENTROID_ID'...	Borough...	  weekday	  1110	    5749	0.13
+    # 	2	[{'CENTROID_ID'...	Borough...	  weekend	  2125      1774	0.13
+    # 	1	[{'CENTROID_ID'...	Webber...	  weekday	  795	    6517	0.16
     #   3 rows × 7 columns
 
     # [END bigquery_dataframes_bqml_kmeans_predict]

From 4505c5caa4e69bc459326afcf48f7951d10f23be Mon Sep 17 00:00:00 2001
From: Salem Jorden <115185670+SalemJorden@users.noreply.github.com>
Date: Fri, 9 Feb 2024 15:22:42 -0600
Subject: [PATCH 26/28] replaced project_id with model_id

---
 samples/snippets/conftest.py                 | 5 +++++
 samples/snippets/create_kmeans_model_test.py | 6 +++---
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/samples/snippets/conftest.py b/samples/snippets/conftest.py
index 968dac011b..5e43f079fa 100644
--- a/samples/snippets/conftest.py
+++ b/samples/snippets/conftest.py
@@ -47,6 +47,11 @@ def project_id(bigquery_client: bigquery.Client) -> str:
 
 @pytest.fixture(autouse=True)
 def reset_session():
+    """An autouse fixture ensuring each sample runs in a fresh session.
+
+    This allows us to have samples that query data in different locations.
+
+    """
     bpd.reset_session()
 
 
diff --git a/samples/snippets/create_kmeans_model_test.py b/samples/snippets/create_kmeans_model_test.py
index 49fe8d0559..aaaa4e3e6f 100644
--- a/samples/snippets/create_kmeans_model_test.py
+++ b/samples/snippets/create_kmeans_model_test.py
@@ -1,4 +1,4 @@
-# Copyright 2023 Google LLC
+# Copyright 2024 Google LLC
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -116,7 +116,7 @@ def test_kmeans_sample(project_id: str):
     cluster_model = KMeans(n_clusters=4)
     cluster_model.fit(stationstats)
     cluster_model.to_gbq(
-        your_gcp_project_id,  # For example: "bqml_tutorial.sample_model"
+        "bqml_tutrial.sample_model",  # For example: "bqml_tutorial.sample_model"
         replace=True,
     )
     # [END bigquery_dataframes_bqml_kmeans_fit]
@@ -127,7 +127,7 @@ def test_kmeans_sample(project_id: str):
     # from BigQuery, but you could also use the `cluster_model` object from
     # previous steps.
     cluster_model = bpd.read_gbq_model(
-        your_gcp_project_id,
+        "bqml_tutorial.sample_model",
         # For example: "bqml_tutorial.london_station_clusters",
     )
 

From 3ab8220956d7acf4dbcf7f280815e4e462a6ae2f Mon Sep 17 00:00:00 2001
From: Salem Jorden <115185670+SalemJorden@users.noreply.github.com>
Date: Fri, 23 Feb 2024 14:22:11 -0600
Subject: [PATCH 27/28] reformatting

---
 samples/snippets/conftest.py                 |  1 -
 samples/snippets/create_kmeans_model_test.py | 19 ++++++++++---------
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/samples/snippets/conftest.py b/samples/snippets/conftest.py
index 5e43f079fa..c8180565e1 100644
--- a/samples/snippets/conftest.py
+++ b/samples/snippets/conftest.py
@@ -50,7 +50,6 @@ def reset_session():
     """An autouse fixture ensuring each sample runs in a fresh session.
 
     This allows us to have samples that query data in different locations.
-
     """
     bpd.reset_session()
 
diff --git a/samples/snippets/create_kmeans_model_test.py b/samples/snippets/create_kmeans_model_test.py
index aaaa4e3e6f..4afbc42971 100644
--- a/samples/snippets/create_kmeans_model_test.py
+++ b/samples/snippets/create_kmeans_model_test.py
@@ -13,8 +13,9 @@
 # limitations under the License.
 
 
-def test_kmeans_sample(project_id: str):
+def test_kmeans_sample(project_id: str, random_model_id: str):
     your_gcp_project_id = project_id
+    your_model_id = random_model_id
     # [START bigquery_dataframes_bqml_kmeans]
     import datetime
 
@@ -116,30 +117,30 @@ def test_kmeans_sample(project_id: str):
     cluster_model = KMeans(n_clusters=4)
     cluster_model.fit(stationstats)
     cluster_model.to_gbq(
-        "bqml_tutrial.sample_model",  # For example: "bqml_tutorial.sample_model"
+        your_model_id,  # For example: "bqml_tutorial.london_station_clusters"
         replace=True,
     )
     # [END bigquery_dataframes_bqml_kmeans_fit]
 
     # [START bigquery_dataframes_bqml_kmeans_predict]
 
-    # Select model you'll use for training. `read_gbq_model` loads model data
-    # from BigQuery, but you could also use the `cluster_model` object from
-    # previous steps.
+    # Select model you'll use for predictions. `read_gbq_model` loads model
+    # data from BigQuery, but you could also use the `cluster_model` object
+    # from previous steps.
     cluster_model = bpd.read_gbq_model(
-        "bqml_tutorial.sample_model",
+        your_model_id,
         # For example: "bqml_tutorial.london_station_clusters",
     )
 
-    # Use 'contains' function to predict which clusters contain the stations
-    # with string "Kennington".
+    # Use 'contains' function to filter by stations containing the string
+    # "Kennington".
     stationstats = stationstats.loc[
         stationstats["station_name"].str.contains("Kennington")
     ]
 
     result = cluster_model.predict(stationstats)
 
-    # Expected output results:   >>>results.head(3)
+    # Expected output results:   >>> result.peek(3)
     # CENTROID...	NEAREST...	station_name  isweekday	 duration num_trips dist...
     # 	1	[{'CENTROID_ID'...	Borough...	  weekday	  1110	    5749	0.13
     # 	2	[{'CENTROID_ID'...	Borough...	  weekend	  2125      1774	0.13

From ae9a36284108513d08c25cfd9e350400b3cb349b Mon Sep 17 00:00:00 2001
From: Salem Jorden <115185670+SalemJorden@users.noreply.github.com>
Date: Mon, 26 Feb 2024 14:27:28 -0600
Subject: [PATCH 28/28] reformat

---
 samples/snippets/conftest.py                 | 26 ++++++++++++++++++++
 samples/snippets/create_kmeans_model_test.py |  4 +--
 2 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/samples/snippets/conftest.py b/samples/snippets/conftest.py
index c8180565e1..d34837b3e2 100644
--- a/samples/snippets/conftest.py
+++ b/samples/snippets/conftest.py
@@ -52,6 +52,7 @@ def reset_session():
     This allows us to have samples that query data in different locations.
     """
     bpd.reset_session()
+    bpd.options.bigquery.location = None
 
 
 @pytest.fixture(scope="session")
@@ -64,6 +65,17 @@ def dataset_id(bigquery_client: bigquery.Client, project_id: str) -> Iterator[st
     bigquery_client.delete_dataset(dataset, delete_contents=True, not_found_ok=True)
 
 
+@pytest.fixture(scope="session")
+def dataset_id_eu(bigquery_client: bigquery.Client, project_id: str) -> Iterator[str]:
+    dataset_id = prefixer.create_prefix()
+    full_dataset_id = f"{project_id}.{dataset_id}"
+    dataset = bigquery.Dataset(full_dataset_id)
+    dataset.location = "EU"
+    bigquery_client.create_dataset(dataset)
+    yield dataset_id
+    bigquery_client.delete_dataset(dataset, delete_contents=True, not_found_ok=True)
+
+
 @pytest.fixture
 def random_model_id(
     bigquery_client: bigquery.Client, project_id: str, dataset_id: str
@@ -75,3 +87,17 @@ def random_model_id(
     full_model_id = f"{project_id}.{dataset_id}.{random_model_id}"
     yield full_model_id
     bigquery_client.delete_model(full_model_id, not_found_ok=True)
+
+
+@pytest.fixture
+def random_model_id_eu(
+    bigquery_client: bigquery.Client, project_id: str, dataset_id_eu: str
+) -> Iterator[str]:
+    """
+    Create a new model ID each time, so random_model_id_eu can be used
+    as a target for models saved to the EU dataset.
+    """
+    random_model_id_eu = prefixer.create_prefix()
+    full_model_id = f"{project_id}.{dataset_id_eu}.{random_model_id_eu}"
+    yield full_model_id
+    bigquery_client.delete_model(full_model_id, not_found_ok=True)
diff --git a/samples/snippets/create_kmeans_model_test.py b/samples/snippets/create_kmeans_model_test.py
index 4afbc42971..2429060d09 100644
--- a/samples/snippets/create_kmeans_model_test.py
+++ b/samples/snippets/create_kmeans_model_test.py
@@ -13,9 +13,9 @@
 # limitations under the License.
 
 
-def test_kmeans_sample(project_id: str, random_model_id: str):
+def test_kmeans_sample(project_id: str, random_model_id_eu: str):
     your_gcp_project_id = project_id
-    your_model_id = random_model_id
+    your_model_id = random_model_id_eu
     # [START bigquery_dataframes_bqml_kmeans]
     import datetime