From 4cf9a0ef9889e932aea9fb18c71b1226a8c0bd7b Mon Sep 17 00:00:00 2001
From: Your Name <stabd@google.com>
Date: Wed, 31 Jan 2024 15:25:29 -0600
Subject: [PATCH 01/10] docs: Add a sample to demonstrate the evaluation
 results

---
 samples/snippets/bqml_getting_started_test.py | 47 +++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/samples/snippets/bqml_getting_started_test.py b/samples/snippets/bqml_getting_started_test.py
index 783f963feb..14e7a3eb45 100644
--- a/samples/snippets/bqml_getting_started_test.py
+++ b/samples/snippets/bqml_getting_started_test.py
@@ -91,3 +91,50 @@ def test_bqml_getting_started(random_model_id):
         replace=True,
     )
     # [END bigquery_dataframes_bqml_getting_started_tutorial]
+
+    # [START bigquery_dataframes_bqml_getting_started_tutorial_evaluate]
+    import bigframes.pandas as bpd
+
+    # WHAT IS READ_GBQ DOING?!
+    model = bpd.read_gbq_model(
+        your_model_id,  # For example: "bqml_tutorial.sample_model",
+    )
+
+    # The WHERE clause — _TABLE_SUFFIX BETWEEN '20170701' AND '20170801' —
+    # limits the number of tables scanned by the query. The date range scanned is
+    # July 1, 2017 to August 1, 2017. This is the data you're using to evaluate the predictive performance
+    # of the model. It was collected in the month immediately following the time
+    # period spanned by the training data.
+
+    df = bpd.read_gbq(
+        """
+        SELECT GENERATE_UUID() AS rowindex, *
+        FROM
+        `bigquery-public-data.google_analytics_sample.ga_sessions_*`
+        WHERE
+        _TABLE_SUFFIX BETWEEN '20170701' AND '20170801'
+        """,
+        index_col="rowindex",
+    )
+    transactions = df["totals"].struct.field("transactions")
+    label = transactions.notnull().map({True: 1, False: 0})
+    operatingSystem = df["device"].struct.field("operatingSystem")
+    operatingSystem = operatingSystem.fillna("")
+    isMobile = df["device"].struct.field("isMobile")
+    country = df["geoNetwork"].struct.field("country").fillna("")
+    pageviews = df["totals"].struct.field("pageviews").fillna(0)
+    features = bpd.DataFrame(
+        {
+            "os": operatingSystem,
+            "is_mobile": isMobile,
+            "country": country,
+            "pageviews": pageviews,
+        }
+    )
+
+    # Some models include a convenient .score(X, y) method for evaluation with a preset accuracy metric:
+    model.score(features, label)
+    #    precision    recall  accuracy  f1_score  log_loss   roc_auc
+    # 0   0.412621  0.079143  0.985074  0.132812  0.049764  0.974285
+    # [1 rows x 6 columns]
+    # [END bigquery_dataframes_bqml_getting_started_tutorial_evaluate]

From ffcf185b48edda796f2962716f516d96deb10d50 Mon Sep 17 00:00:00 2001
From: Your Name <stabd@google.com>
Date: Thu, 1 Feb 2024 12:11:49 -0600
Subject: [PATCH 02/10] Adding comments explaining logistic regression results

---
 samples/snippets/bqml_getting_started_test.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/samples/snippets/bqml_getting_started_test.py b/samples/snippets/bqml_getting_started_test.py
index 14e7a3eb45..7767e1c484 100644
--- a/samples/snippets/bqml_getting_started_test.py
+++ b/samples/snippets/bqml_getting_started_test.py
@@ -133,6 +133,22 @@ def test_bqml_getting_started(random_model_id):
     )
 
     # Some models include a convenient .score(X, y) method for evaluation with a preset accuracy metric:
+
+    # Because you performed a logistic regression, the results include the following columns:
+    # precision — A metric for classification models. Precision identifies the frequency with
+    # which a model was correct when predicting the positive class.
+    # recall — A metric for classification models that answers the following question:
+    # Out of all the possible positive labels, how many did the model correctly identify?
+    # accuracy — Accuracy is the fraction of predictions that a classification model got right.
+    # f1_score — A measure of the accuracy of the model. The f1 score is the harmonic average of
+    # the precision and recall. An f1 score's best value is 1. The worst value is 0.
+    # log_loss — The loss function used in a logistic regression. This is the measure of how far the
+    # model's predictions are from the correct labels.
+    # roc_auc — The area under the ROC curve. This is the probability that a classifier is more confident that
+    # a randomly chosen positive example
+    # is actually positive than that a randomly chosen negative example is positive. For more information,
+    # see Classification in the Machine Learning Crash Course.
+
     model.score(features, label)
     #    precision    recall  accuracy  f1_score  log_loss   roc_auc
     # 0   0.412621  0.079143  0.985074  0.132812  0.049764  0.974285

From 8e5ba68172cde07f9c55b55a3ef6e0942104853a Mon Sep 17 00:00:00 2001
From: Your Name <stabd@google.com>
Date: Mon, 5 Feb 2024 10:22:06 -0600
Subject: [PATCH 03/10] editing read_gbq explanation

---
 samples/snippets/bqml_getting_started_test.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/samples/snippets/bqml_getting_started_test.py b/samples/snippets/bqml_getting_started_test.py
index 7767e1c484..dfb46103bb 100644
--- a/samples/snippets/bqml_getting_started_test.py
+++ b/samples/snippets/bqml_getting_started_test.py
@@ -95,7 +95,8 @@ def test_bqml_getting_started(random_model_id):
     # [START bigquery_dataframes_bqml_getting_started_tutorial_evaluate]
     import bigframes.pandas as bpd
 
-    # WHAT IS READ_GBQ DOING?!
+    # Select the model you'll evaluate. `read_gbq_model` loads an existing
+    # model from BigQuery by its model ID.
     model = bpd.read_gbq_model(
         your_model_id,  # For example: "bqml_tutorial.sample_model",
     )

From 202bf76f05b1f5f5c94d136519874d4a2cb53920 Mon Sep 17 00:00:00 2001
From: Your Name <stabd@google.com>
Date: Fri, 23 Feb 2024 13:37:48 -0600
Subject: [PATCH 04/10] docs: add predict sample to
 samples/snippets/bqml_getting_started_test.py

---
 samples/snippets/bqml_getting_started_test.py | 40 +++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/samples/snippets/bqml_getting_started_test.py b/samples/snippets/bqml_getting_started_test.py
index dfb46103bb..1df814fc0a 100644
--- a/samples/snippets/bqml_getting_started_test.py
+++ b/samples/snippets/bqml_getting_started_test.py
@@ -155,3 +155,43 @@ def test_bqml_getting_started(random_model_id):
     # 0   0.412621  0.079143  0.985074  0.132812  0.049764  0.974285
     # [1 rows x 6 columns]
     # [END bigquery_dataframes_bqml_getting_started_tutorial_evaluate]
+
+    # [START bigquery_dataframes_bqml_getting_started_tutorial_predict]
+    df = bpd.read_gbq(
+        """
+    SELECT GENERATE_UUID() AS rowindex, *
+    FROM
+    `bigquery-public-data.google_analytics_sample.ga_sessions_*`
+    WHERE
+    _TABLE_SUFFIX BETWEEN '20170701' AND '20170801'
+    """,
+        index_col="rowindex",
+    )
+
+    operatingSystem = df["device"].struct.field("operatingSystem")
+    operatingSystem = operatingSystem.fillna("")
+    isMobile = df["device"].struct.field("isMobile")
+    country = df["geoNetwork"].struct.field("country").fillna("")
+    pageviews = df["totals"].struct.field("pageviews").fillna(0)
+    features = bpd.DataFrame(
+        {
+            "os": operatingSystem,
+            "is_mobile": isMobile,
+            "country": country,
+            "pageviews": pageviews,
+        }
+    )
+    # Use Logistic Regression predict method to, find more information here in
+    # [BigFrames](/bigframes/latest/bigframes.ml.linear_model.LogisticRegression#bigframes_ml_linear_model_LogisticRegression_predict)
+    predictions = model.predict(features)
+    countries = predictions.groupby(["country"])[["predicted_transactions"]].sum()
+    # type(countries)
+    countries.sort_values(ascending=False).head(10)
+
+    predictions = model.predict(features)
+
+    visitor_id = predictions.groupby(["country"])[["predicted_transactions"]].sum()
+
+    visitor_id.sort_values(ascending=False).head(10)
+
+    # [END bigquery_dataframes_bqml_getting_started_tutorial_predict]

From fb795261e7ac03f01e196616de806092365f05b7 Mon Sep 17 00:00:00 2001
From: Your Name <stabd@google.com>
Date: Wed, 28 Feb 2024 15:03:13 -0600
Subject: [PATCH 05/10] correcting variable names

---
 samples/snippets/bqml_getting_started_test.py | 85 +++++++++----------
 1 file changed, 38 insertions(+), 47 deletions(-)

diff --git a/samples/snippets/bqml_getting_started_test.py b/samples/snippets/bqml_getting_started_test.py
index 72a6486cb3..74dd5d4501 100644
--- a/samples/snippets/bqml_getting_started_test.py
+++ b/samples/snippets/bqml_getting_started_test.py
@@ -26,17 +26,12 @@ def test_bqml_getting_started(random_model_id):
     # https://github.com/googleapis/python-bigquery-dataframes/issues/169
     # for updates to `read_gbq` to support wildcard tables.
 
-    df = bpd.read_gbq(
-        """
-        -- Since the order of rows isn't useful for the model training,
-        -- generate a random ID to use as the index for the DataFrame.
-        SELECT GENERATE_UUID() AS rowindex, *
-        FROM
-        `bigquery-public-data.google_analytics_sample.ga_sessions_*`
-        WHERE
-        _TABLE_SUFFIX BETWEEN '20160801' AND '20170630'
-        """,
-        index_col="rowindex",
+    df = bpd.read_gbq_table(
+        "bigquery-public-data.google_analytics_sample.ga_sessions_*",
+        filters=[
+            ("_table_suffix", ">=", "20170701"),
+            ("_table_suffix", "<=", "20170801"),
+        ],
     )
 
     # Extract the total number of transactions within
@@ -56,11 +51,11 @@ def test_bqml_getting_started(random_model_id):
     label = transactions.notnull().map({True: 1, False: 0})
 
     # Extract the operating system of the visitor's device.
-    operatingSystem = df["device"].struct.field("operatingSystem")
-    operatingSystem = operatingSystem.fillna("")
+    operating_system = df["device"].struct.field("operatingSystem")
+    operating_system = operating_system.fillna("")
 
     # Extract whether the visitor's device is a mobile device.
-    isMobile = df["device"].struct.field("isMobile")
+    is_mobile = df["device"].struct.field("isMobile")
 
     # Extract the country from which the sessions originated, based on the IP address.
     country = df["geoNetwork"].struct.field("country").fillna("")
@@ -72,8 +67,8 @@ def test_bqml_getting_started(random_model_id):
     # to use as training data.
     features = bpd.DataFrame(
         {
-            "os": operatingSystem,
-            "is_mobile": isMobile,
+            "os": operating_system,
+            "isMobile": is_mobile,
             "country": country,
             "pageviews": pageviews,
         }
@@ -107,27 +102,24 @@ def test_bqml_getting_started(random_model_id):
     # of the model. It was collected in the month immediately following the time
     # period spanned by the training data.
 
-    df = bpd.read_gbq(
-        """
-        SELECT GENERATE_UUID() AS rowindex, *
-        FROM
-        `bigquery-public-data.google_analytics_sample.ga_sessions_*`
-        WHERE
-        _TABLE_SUFFIX BETWEEN '20170701' AND '20170801'
-        """,
-        index_col="rowindex",
+    df = bpd.read_gbq_table(
+        "bigquery-public-data.google_analytics_sample.ga_sessions_*",
+        filters=[
+            ("_table_suffix", ">=", "20170701"),
+            ("_table_suffix", "<=", "20170801"),
+        ],
     )
     transactions = df["totals"].struct.field("transactions")
     label = transactions.notnull().map({True: 1, False: 0})
-    operatingSystem = df["device"].struct.field("operatingSystem")
-    operatingSystem = operatingSystem.fillna("")
-    isMobile = df["device"].struct.field("isMobile")
+    operating_system = df["device"].struct.field("operatingSystem")
+    operating_system = operating_system.fillna("")
+    is_mobile = df["device"].struct.field("isMobile")
     country = df["geoNetwork"].struct.field("country").fillna("")
     pageviews = df["totals"].struct.field("pageviews").fillna(0)
     features = bpd.DataFrame(
         {
-            "os": operatingSystem,
-            "is_mobile": isMobile,
+            "os": operating_system,
+            "isMobile": is_mobile,
             "country": country,
             "pageviews": pageviews,
         }
@@ -164,26 +156,23 @@ def test_bqml_getting_started(random_model_id):
     # [END bigquery_dataframes_bqml_getting_started_tutorial_evaluate]
 
     # [START bigquery_dataframes_bqml_getting_started_tutorial_predict]
-    df = bpd.read_gbq(
-        """
-    SELECT GENERATE_UUID() AS rowindex, *
-    FROM
-    `bigquery-public-data.google_analytics_sample.ga_sessions_*`
-    WHERE
-    _TABLE_SUFFIX BETWEEN '20170701' AND '20170801'
-    """,
-        index_col="rowindex",
+    df = bpd.read_gbq_table(
+        "bigquery-public-data.google_analytics_sample.ga_sessions_*",
+        filters=[
+            ("_table_suffix", ">=", "20170701"),
+            ("_table_suffix", "<=", "20170801"),
+        ],
     )
 
-    operatingSystem = df["device"].struct.field("operatingSystem")
-    operatingSystem = operatingSystem.fillna("")
-    isMobile = df["device"].struct.field("isMobile")
+    operating_system = df["device"].struct.field("operatingSystem")
+    operating_system = operating_system.fillna("")
+    is_mobile = df["device"].struct.field("isMobile")
     country = df["geoNetwork"].struct.field("country").fillna("")
     pageviews = df["totals"].struct.field("pageviews").fillna(0)
     features = bpd.DataFrame(
         {
-            "os": operatingSystem,
-            "is_mobile": isMobile,
+            "os": operating_system,
+            "isMobile": is_mobile,
             "country": country,
             "pageviews": pageviews,
         }
@@ -192,13 +181,15 @@ def test_bqml_getting_started(random_model_id):
     # [BigFrames](/bigframes/latest/bigframes.ml.linear_model.LogisticRegression#bigframes_ml_linear_model_LogisticRegression_predict)
     predictions = model.predict(features)
     countries = predictions.groupby(["country"])[["predicted_transactions"]].sum()
-    # type(countries)
+
     countries.sort_values(ascending=False).head(10)
 
     predictions = model.predict(features)
 
-    visitor_id = predictions.groupby(["country"])[["predicted_transactions"]].sum()
+    total_predicted_purchases = predictions.groupby(["country"])[
+        ["predicted_transactions"]
+    ].sum()
 
-    visitor_id.sort_values(ascending=False).head(10)
+    total_predicted_purchases.sort_values(ascending=False).head(10)
 
     # [END bigquery_dataframes_bqml_getting_started_tutorial_predict]

From ad398ad5766af6edaed560b498b1f319e76447c9 Mon Sep 17 00:00:00 2001
From: Your Name <stabd@google.com>
Date: Mon, 4 Mar 2024 11:58:20 -0600
Subject: [PATCH 06/10] Correcting python variables

---
 samples/snippets/bqml_getting_started_test.py | 41 +++++++++++++------
 1 file changed, 28 insertions(+), 13 deletions(-)

diff --git a/samples/snippets/bqml_getting_started_test.py b/samples/snippets/bqml_getting_started_test.py
index 74dd5d4501..91c37f1340 100644
--- a/samples/snippets/bqml_getting_started_test.py
+++ b/samples/snippets/bqml_getting_started_test.py
@@ -29,8 +29,8 @@ def test_bqml_getting_started(random_model_id):
     df = bpd.read_gbq_table(
         "bigquery-public-data.google_analytics_sample.ga_sessions_*",
         filters=[
-            ("_table_suffix", ">=", "20170701"),
-            ("_table_suffix", "<=", "20170801"),
+            ("_table_suffix", ">=", "20160801"),
+            ("_table_suffix", "<=", "20170630"),
         ],
     )
 
@@ -90,17 +90,17 @@ def test_bqml_getting_started(random_model_id):
     # [START bigquery_dataframes_bqml_getting_started_tutorial_evaluate]
     import bigframes.pandas as bpd
 
-    # Select model you'll use for training. `read_gbq_model` loads model data from a
+    # Select model you'll use for evaluating. `read_gbq_model` loads model data from a
     # BigQuery, but you could also use the `model` object from the previous steps.
     model = bpd.read_gbq_model(
         your_model_id,  # For example: "bqml_tutorial.sample_model",
     )
 
-    # The WHERE clause — _TABLE_SUFFIX BETWEEN '20170701' AND '20170801' —
-    # limits the number of tables scanned by the query. The date range scanned is
-    # July 1, 2017 to August 1, 2017. This is the data you're using to evaluate the predictive performance
-    # of the model. It was collected in the month immediately following the time
-    # period spanned by the training data.
+    # The filters parameter limits the number of tables scanned by the query.
+    # The date range scanned is July 1, 2017 to August 1, 2017. This is the
+    # data you're using to evaluate the predictive performance of the model.
+    # It was collected in the month immediately following the time period
+    # spanned by the training data.
 
     df = bpd.read_gbq_table(
         "bigquery-public-data.google_analytics_sample.ga_sessions_*",
@@ -143,7 +143,7 @@ def test_bqml_getting_started(random_model_id):
     # - log_loss — The loss function used in a logistic regression. This is the measure of how far the
     # model's predictions are from the correct labels.
 
-    # - roc_auc — The area under the ROC curve. This is the probability that a classifier is more confident that
+    # - roc_auc — The area under the ROC curve. This is the probability that a classifier is morepy confident that
     # a randomly chosen positive example
     # is actually positive than that a randomly chosen negative example is positive. For more information,
     # see ['Classification']('https://developers.google.com/machine-learning/crash-course/classification/video-lecture')
@@ -155,7 +155,14 @@ def test_bqml_getting_started(random_model_id):
     # [1 rows x 6 columns]
     # [END bigquery_dataframes_bqml_getting_started_tutorial_evaluate]
 
-    # [START bigquery_dataframes_bqml_getting_started_tutorial_predict]
+    # [START bigquery_dataframes_bqml_getting_started_tutorial_predict_by_country]
+
+    # Select model you'll use for predicting. `read_gbq_model` loads model data from
+    # BigQuery, but you could also use the `model` object from the previous steps.
+    model = bpd.read_gbq_model(
+        your_model_id,  # For example: "bqml_tutorial.sample_model",
+    )
+
     df = bpd.read_gbq_table(
         "bigquery-public-data.google_analytics_sample.ga_sessions_*",
         filters=[
@@ -178,13 +185,21 @@ def test_bqml_getting_started(random_model_id):
         }
     )
     # Use Logistic Regression predict method to, find more information here in
-    # [BigFrames](/bigframes/latest/bigframes.ml.linear_model.LogisticRegression#bigframes_ml_linear_model_LogisticRegression_predict)
+    # [BigFrames](https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.linear_model.LogisticRegression#bigframes_ml_linear_model_LogisticRegression_predict)
+
+    # This code groups the DataFrame by 'country', calculates the sum of
+    # 'predicted_transactions' for each group, sorts the results by the sum in
+    # descending order, and selects the top 10 rows using the 'head' method.
+
     predictions = model.predict(features)
     countries = predictions.groupby(["country"])[["predicted_transactions"]].sum()
 
     countries.sort_values(ascending=False).head(10)
 
-    predictions = model.predict(features)
+    # [END bigquery_dataframes_bqml_getting_started_tutorial_predict_by_country]
+
+    # [START bigquery_dataframes_bqml_getting_started_tutorial_predict_by_visitor_id]
+    predictions = model.predict(features, label)
 
     total_predicted_purchases = predictions.groupby(["country"])[
         ["predicted_transactions"]
@@ -192,4 +207,4 @@ def test_bqml_getting_started(random_model_id):
 
     total_predicted_purchases.sort_values(ascending=False).head(10)
 
-    # [END bigquery_dataframes_bqml_getting_started_tutorial_predict]
+    # [END bigquery_dataframes_bqml_getting_started_tutorial_predict_by_visitor_id]

From ca17b39f497201a8dfad4bf707e3873ef3cde1db Mon Sep 17 00:00:00 2001
From: Your Name <stabd@google.com>
Date: Wed, 6 Mar 2024 13:04:05 -0600
Subject: [PATCH 07/10] feat: add predict by visitor to
 samples/snippets/bqml_getting_started_test.py

---
 samples/snippets/bqml_getting_started_test.py | 100 +++++++++++++++---
 1 file changed, 83 insertions(+), 17 deletions(-)

diff --git a/samples/snippets/bqml_getting_started_test.py b/samples/snippets/bqml_getting_started_test.py
index 74dd5d4501..3f1a1453ec 100644
--- a/samples/snippets/bqml_getting_started_test.py
+++ b/samples/snippets/bqml_getting_started_test.py
@@ -14,7 +14,7 @@
 
 
 def test_bqml_getting_started(random_model_id):
-    your_model_id = random_model_id
+    your_model_id = random_model_id  # for example: bqml_tutorial.sample_model
 
     # [START bigquery_dataframes_bqml_getting_started_tutorial]
     from bigframes.ml.linear_model import LogisticRegression
@@ -29,8 +29,8 @@ def test_bqml_getting_started(random_model_id):
     df = bpd.read_gbq_table(
         "bigquery-public-data.google_analytics_sample.ga_sessions_*",
         filters=[
-            ("_table_suffix", ">=", "20170701"),
-            ("_table_suffix", "<=", "20170801"),
+            ("_table_suffix", ">=", "20160801"),
+            ("_table_suffix", "<=", "20170630"),
         ],
     )
 
@@ -68,7 +68,7 @@ def test_bqml_getting_started(random_model_id):
     features = bpd.DataFrame(
         {
             "os": operating_system,
-            "isMobile": is_mobile,
+            "is_mobile": is_mobile,
             "country": country,
             "pageviews": pageviews,
         }
@@ -96,9 +96,7 @@ def test_bqml_getting_started(random_model_id):
         your_model_id,  # For example: "bqml_tutorial.sample_model",
     )
 
-    # The WHERE clause — _TABLE_SUFFIX BETWEEN '20170701' AND '20170801' —
-    # limits the number of tables scanned by the query. The date range scanned is
-    # July 1, 2017 to August 1, 2017. This is the data you're using to evaluate the predictive performance
+    # July 1, 2017 to August 1, 2017 is the data you're using to evaluate the predictive performance
     # of the model. It was collected in the month immediately following the time
     # period spanned by the training data.
 
@@ -109,6 +107,7 @@ def test_bqml_getting_started(random_model_id):
             ("_table_suffix", "<=", "20170801"),
         ],
     )
+
     transactions = df["totals"].struct.field("transactions")
     label = transactions.notnull().map({True: 1, False: 0})
     operating_system = df["device"].struct.field("operatingSystem")
@@ -119,7 +118,7 @@ def test_bqml_getting_started(random_model_id):
     features = bpd.DataFrame(
         {
             "os": operating_system,
-            "isMobile": is_mobile,
+            "is_mobile": is_mobile,
             "country": country,
             "pageviews": pageviews,
         }
@@ -155,7 +154,14 @@ def test_bqml_getting_started(random_model_id):
     # [1 rows x 6 columns]
     # [END bigquery_dataframes_bqml_getting_started_tutorial_evaluate]
 
-    # [START bigquery_dataframes_bqml_getting_started_tutorial_predict]
+    # [START bigquery_dataframes_bqml_getting_started_tutorial_predict_by_country]
+    import bigframes.pandas as bpd
+
+    # Select model you'll use for predicting. `read_gbq_model` loads model data from
+    # BigQuery, but you could also use the `model` object from the previous steps.
+    model = bpd.read_gbq_model(
+        your_model_id,  # For example: "bqml_tutorial.sample_model",
+    )
     df = bpd.read_gbq_table(
         "bigquery-public-data.google_analytics_sample.ga_sessions_*",
         filters=[
@@ -172,24 +178,84 @@ def test_bqml_getting_started(random_model_id):
     features = bpd.DataFrame(
         {
             "os": operating_system,
-            "isMobile": is_mobile,
+            "is_mobile": is_mobile,
             "country": country,
             "pageviews": pageviews,
         }
     )
     # Use Logistic Regression predict method to, find more information here in
-    # [BigFrames](/bigframes/latest/bigframes.ml.linear_model.LogisticRegression#bigframes_ml_linear_model_LogisticRegression_predict)
+    # [BigFrames](https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.linear_model.LogisticRegression#bigframes_ml_linear_model_LogisticRegression_predict)
+
     predictions = model.predict(features)
-    countries = predictions.groupby(["country"])[["predicted_transactions"]].sum()
+    total_predicted_purchases = predictions.groupby(["country"])[
+        ["predicted_label"]
+    ].sum()
+    total_predicted_purchases.sort_values(ascending=False).head(10)
 
-    countries.sort_values(ascending=False).head(10)
+    # country         # total_predicted_purchases
+    # United States    220
+    # Taiwan             8
+    # Canada             7
+    # India              2
+    # Japan              2
+    # Turkey             2
+    # Australia          1
+    # Brazil             1
+    # Germany            1
+    # Guyana             1
+    # Name: predicted_label, dtype: Int64
 
-    predictions = model.predict(features)
+    # [END bigquery_dataframes_bqml_getting_started_tutorial_predict_by_country]
 
-    total_predicted_purchases = predictions.groupby(["country"])[
-        ["predicted_transactions"]
+    # [START bigquery_dataframes_bqml_getting_started_tutorial_predict_by_visitor]
+
+    model = bpd.read_gbq_model(
+        your_model_id,  # For example: "bqml_tutorial.sample_model",
+    )
+    df = bpd.read_gbq_table(
+        "bigquery-public-data.google_analytics_sample.ga_sessions_*",
+        filters=[
+            ("_table_suffix", ">=", "20170701"),
+            ("_table_suffix", "<=", "20170801"),
+        ],
+    )
+
+    operating_system = df["device"].struct.field("operatingSystem")
+    operating_system = operating_system.fillna("")
+    is_mobile = df["device"].struct.field("isMobile")
+    country = df["geoNetwork"].struct.field("country").fillna("")
+    pageviews = df["totals"].struct.field("pageviews").fillna(0)
+    full_visitor_id = df["fullVisitorId"]
+
+    features = bpd.DataFrame(
+        {
+            "os": operating_system,
+            "is_mobile": is_mobile,
+            "country": country,
+            "pageviews": pageviews,
+            "fullVisitorId": full_visitor_id,
+        }
+    )
+
+    predictions = model.predict(features)
+    total_predicted_purchases = predictions.groupby(["fullVisitorId"])[
+        ["predicted_label"]
     ].sum()
 
     total_predicted_purchases.sort_values(ascending=False).head(10)
 
-    # [END bigquery_dataframes_bqml_getting_started_tutorial_predict]
+    # fullVisitorId         # total_predicted_purchases
+    # 9417857471295131045    4
+    # 0376394056092189113    2
+    # 0456807427403774085    2
+    # 057693500927581077     2
+    # 112288330928895942     2
+    # 1280993661204347450    2
+    # 2105122376016897629    2
+    # 2158257269735455737    2
+    # 2969418676126258798    2
+    # 489038402765684003     2
+    # Name: predicted_label, dtype: Int64
+
+
+# [END bigquery_dataframes_bqml_getting_started_tutorial_predict_by_visitor]

From 9df8bddadbf8fda97b5db30ac9eb072defe5e0c7 Mon Sep 17 00:00:00 2001
From: Your Name <stabd@google.com>
Date: Wed, 6 Mar 2024 14:10:48 -0600
Subject: [PATCH 08/10] docs: explain predict steps in comments and name the label series

---
 samples/snippets/bqml_getting_started_test.py | 34 ++++++++++++++++---
 1 file changed, 30 insertions(+), 4 deletions(-)

diff --git a/samples/snippets/bqml_getting_started_test.py b/samples/snippets/bqml_getting_started_test.py
index 523b399b3d..05893c7fc4 100644
--- a/samples/snippets/bqml_getting_started_test.py
+++ b/samples/snippets/bqml_getting_started_test.py
@@ -16,6 +16,7 @@
 def test_bqml_getting_started(random_model_id):
     your_model_id = random_model_id  # for example: bqml_tutorial.sample_model
 
+    your_model_id = "stabd-testing.bqml_tutorial1.sample_model"
     # [START bigquery_dataframes_bqml_getting_started_tutorial]
     from bigframes.ml.linear_model import LogisticRegression
     import bigframes.pandas as bpd
@@ -48,7 +49,7 @@ def test_bqml_getting_started(random_model_id):
     # ecommerce transactions within the Google Analytics session.
     # If the number of transactions is NULL, the value in the label
     # column is set to 0. Otherwise, it is set to 1.
-    label = transactions.notnull().map({True: 1, False: 0})
+    label = transactions.notnull().map({True: 1, False: 0}).rename("label")
 
     # Extract the operating system of the visitor's device.
     operating_system = df["device"].struct.field("operatingSystem")
@@ -110,7 +111,7 @@ def test_bqml_getting_started(random_model_id):
     )
 
     transactions = df["totals"].struct.field("transactions")
-    label = transactions.notnull().map({True: 1, False: 0})
+    label = transactions.notnull().map({True: 1, False: 0}).rename("label")
     operating_system = df["device"].struct.field("operatingSystem")
     operating_system = operating_system.fillna("")
     is_mobile = df["device"].struct.field("isMobile")
@@ -196,12 +197,19 @@ def test_bqml_getting_started(random_model_id):
     # [BigFrames](https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.linear_model.LogisticRegression#bigframes_ml_linear_model_LogisticRegression_predict)
 
     predictions = model.predict(features)
+
+    # Call groupby method to group predicted_label by country.
+    # Call sum method to get the total_predicted_label by country.
     total_predicted_purchases = predictions.groupby(["country"])[
         ["predicted_label"]
     ].sum()
+
+    # Call the sort_values method with the parameter
+    # ascending = False to get the highest values.
+    # Call head method to limit to the 10 highest values.
     total_predicted_purchases.sort_values(ascending=False).head(10)
 
-    # country         # total_predicted_purchases
+    # country
     # United States    220
     # Taiwan             8
     # Canada             7
@@ -218,9 +226,21 @@ def test_bqml_getting_started(random_model_id):
 
     # [START bigquery_dataframes_bqml_getting_started_tutorial_predict_by_visitor]
 
+    import bigframes.pandas as bpd
+
+    # Select model you'll use for predicting.
+    # `read_gbq_model` loads model data from
+    # BigQuery, but you could also use the `model`
+    # object from the previous steps.
     model = bpd.read_gbq_model(
         your_model_id,  # For example: "bqml_tutorial.sample_model",
     )
+
+    # The filters parameter limits the number of tables scanned by the query.
+    # The date range scanned is July 1, 2017 to August 1, 2017. This is the
+    # data you're using to make the prediction.
+    # It was collected in the month immediately following the time period
+    # spanned by the training data.
     df = bpd.read_gbq_table(
         "bigquery-public-data.google_analytics_sample.ga_sessions_*",
         filters=[
@@ -247,13 +267,19 @@ def test_bqml_getting_started(random_model_id):
     )
 
     predictions = model.predict(features)
+
+    # Call groupby method to group predicted_label by visitor.
+    # Call sum method to get the total_predicted_label by visitor.
     total_predicted_purchases = predictions.groupby(["fullVisitorId"])[
         ["predicted_label"]
     ].sum()
 
+    # Call the sort_values method with the parameter
+    # ascending = False to get the highest values.
+    # Call head method to limit to the 10 highest values.
     total_predicted_purchases.sort_values(ascending=False).head(10)
 
-    # fullVisitorId         # total_predicted_purchases
+    # fullVisitorId
     # 9417857471295131045    4
     # 0376394056092189113    2
     # 0456807427403774085    2

From 1a25f5f322f484b8702f0d21998af3243c2e2b31 Mon Sep 17 00:00:00 2001
From: Your Name <stabd@google.com>
Date: Wed, 6 Mar 2024 14:11:33 -0600
Subject: [PATCH 09/10] test: remove hard-coded model ID override

---
 samples/snippets/bqml_getting_started_test.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/samples/snippets/bqml_getting_started_test.py b/samples/snippets/bqml_getting_started_test.py
index 05893c7fc4..c5a581d87a 100644
--- a/samples/snippets/bqml_getting_started_test.py
+++ b/samples/snippets/bqml_getting_started_test.py
@@ -16,7 +16,6 @@
 def test_bqml_getting_started(random_model_id):
     your_model_id = random_model_id  # for example: bqml_tutorial.sample_model
 
-    your_model_id = "stabd-testing.bqml_tutorial1.sample_model"
     # [START bigquery_dataframes_bqml_getting_started_tutorial]
     from bigframes.ml.linear_model import LogisticRegression
     import bigframes.pandas as bpd

From daa3bdbe9e04ad834e87c00d2b069299bf50e0b2 Mon Sep 17 00:00:00 2001
From: Your Name <stabd@google.com>
Date: Wed, 6 Mar 2024 14:17:20 -0600
Subject: [PATCH 10/10] docs: fix "morepy" typo and reword predict comment

---
 samples/snippets/bqml_getting_started_test.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/samples/snippets/bqml_getting_started_test.py b/samples/snippets/bqml_getting_started_test.py
index c5a581d87a..d9f9135faa 100644
--- a/samples/snippets/bqml_getting_started_test.py
+++ b/samples/snippets/bqml_getting_started_test.py
@@ -143,7 +143,7 @@ def test_bqml_getting_started(random_model_id):
     # - log_loss — The loss function used in a logistic regression. This is the measure of how far the
     # model's predictions are from the correct labels.
 
-    # - roc_auc — The area under the ROC curve. This is the probability that a classifier is morepy confident that
+    # - roc_auc — The area under the ROC curve. This is the probability that a classifier is more confident that
     # a randomly chosen positive example
     # is actually positive than that a randomly chosen negative example is positive. For more information,
     # see ['Classification']('https://developers.google.com/machine-learning/crash-course/classification/video-lecture')
@@ -192,7 +192,9 @@ def test_bqml_getting_started(random_model_id):
             "pageviews": pageviews,
         }
     )
-    # Use Logistic Regression predict method to, find more information here in
+    # Use Logistic Regression predict method to predict results
+    # using your model.
+    # Find more information here in
     # [BigFrames](https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.linear_model.LogisticRegression#bigframes_ml_linear_model_LogisticRegression_predict)
 
     predictions = model.predict(features)