From 4cf9a0ef9889e932aea9fb18c71b1226a8c0bd7b Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 31 Jan 2024 15:25:29 -0600 Subject: [PATCH 01/10] docs: Add a sample to demonstrate the evaluation results --- samples/snippets/bqml_getting_started_test.py | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/samples/snippets/bqml_getting_started_test.py b/samples/snippets/bqml_getting_started_test.py index 783f963feb..14e7a3eb45 100644 --- a/samples/snippets/bqml_getting_started_test.py +++ b/samples/snippets/bqml_getting_started_test.py @@ -91,3 +91,50 @@ def test_bqml_getting_started(random_model_id): replace=True, ) # [END bigquery_dataframes_bqml_getting_started_tutorial] + + # [START bigquery_dataframes_bqml_getting_started_tutorial_evaluate] + import bigframes.pandas as bpd + + # WHAT IS READ_GBQ DOING?! + model = bpd.read_gbq_model( + your_model_id, # For example: "bqml_tutorial.sample_model", + ) + + # The WHERE clause — _TABLE_SUFFIX BETWEEN '20170701' AND '20170801' — + # limits the number of tables scanned by the query. The date range scanned is + # July 1, 2017 to August 1, 2017. This is the data you're using to evaluate the predictive performance + # of the model. It was collected in the month immediately following the time + # period spanned by the training data. 
+ + df = bpd.read_gbq( + """ + SELECT GENERATE_UUID() AS rowindex, * + FROM + `bigquery-public-data.google_analytics_sample.ga_sessions_*` + WHERE + _TABLE_SUFFIX BETWEEN '20170701' AND '20170801' + """, + index_col="rowindex", + ) + transactions = df["totals"].struct.field("transactions") + label = transactions.notnull().map({True: 1, False: 0}) + operatingSystem = df["device"].struct.field("operatingSystem") + operatingSystem = operatingSystem.fillna("") + isMobile = df["device"].struct.field("isMobile") + country = df["geoNetwork"].struct.field("country").fillna("") + pageviews = df["totals"].struct.field("pageviews").fillna(0) + features = bpd.DataFrame( + { + "os": operatingSystem, + "is_mobile": isMobile, + "country": country, + "pageviews": pageviews, + } + ) + + # Some models include a convenient .score(X, y) method for evaluation with a preset accuracy metric: + model.score(features, label) + # precision recall accuracy f1_score log_loss roc_auc + # 0 0.412621 0.079143 0.985074 0.132812 0.049764 0.974285 + # [1 rows x 6 columns] + # [END bigquery_dataframes_bqml_getting_started_tutorial_evaluate] From ffcf185b48edda796f2962716f516d96deb10d50 Mon Sep 17 00:00:00 2001 From: Your Name Date: Thu, 1 Feb 2024 12:11:49 -0600 Subject: [PATCH 02/10] Adding comments explaining logistic regression results --- samples/snippets/bqml_getting_started_test.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/samples/snippets/bqml_getting_started_test.py b/samples/snippets/bqml_getting_started_test.py index 14e7a3eb45..7767e1c484 100644 --- a/samples/snippets/bqml_getting_started_test.py +++ b/samples/snippets/bqml_getting_started_test.py @@ -133,6 +133,22 @@ def test_bqml_getting_started(random_model_id): ) # Some models include a convenient .score(X, y) method for evaluation with a preset accuracy metric: + + # Because you performed a logistic regression, the results include the following columns: + # precision — A metric for classification models. 
Precision identifies the frequency with + # which a model was correct when predicting the positive class. + # recall — A metric for classification models that answers the following question: + # Out of all the possible positive labels, how many did the model correctly identify? + # accuracy — Accuracy is the fraction of predictions that a classification model got right. + # f1_score — A measure of the accuracy of the model. The f1 score is the harmonic average of + # the precision and recall. An f1 score's best value is 1. The worst value is 0. + # log_loss — The loss function used in a logistic regression. This is the measure of how far the + # model's predictions are from the correct labels. + # roc_auc — The area under the ROC curve. This is the probability that a classifier is more confident that + # a randomly chosen positive example + # is actually positive than that a randomly chosen negative example is positive. For more information, + # see Classification in the Machine Learning Crash Course. + model.score(features, label) # precision recall accuracy f1_score log_loss roc_auc # 0 0.412621 0.079143 0.985074 0.132812 0.049764 0.974285 From 8e5ba68172cde07f9c55b55a3ef6e0942104853a Mon Sep 17 00:00:00 2001 From: Your Name Date: Mon, 5 Feb 2024 10:22:06 -0600 Subject: [PATCH 03/10] editing read_gbq explanation --- samples/snippets/bqml_getting_started_test.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/samples/snippets/bqml_getting_started_test.py b/samples/snippets/bqml_getting_started_test.py index 7767e1c484..dfb46103bb 100644 --- a/samples/snippets/bqml_getting_started_test.py +++ b/samples/snippets/bqml_getting_started_test.py @@ -95,7 +95,8 @@ def test_bqml_getting_started(random_model_id): # [START bigquery_dataframes_bqml_getting_started_tutorial_evaluate] import bigframes.pandas as bpd - # WHAT IS READ_GBQ DOING?! + # Select model you'll use for training. 'read_gbq' accepts either a SQL query + # or a table ID. 
model = bpd.read_gbq_model( your_model_id, # For example: "bqml_tutorial.sample_model", ) From 202bf76f05b1f5f5c94d136519874d4a2cb53920 Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 23 Feb 2024 13:37:48 -0600 Subject: [PATCH 04/10] docs: add predict sample to samples/snippets/bqml_getting_started_test.py --- samples/snippets/bqml_getting_started_test.py | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/samples/snippets/bqml_getting_started_test.py b/samples/snippets/bqml_getting_started_test.py index dfb46103bb..1df814fc0a 100644 --- a/samples/snippets/bqml_getting_started_test.py +++ b/samples/snippets/bqml_getting_started_test.py @@ -155,3 +155,43 @@ def test_bqml_getting_started(random_model_id): # 0 0.412621 0.079143 0.985074 0.132812 0.049764 0.974285 # [1 rows x 6 columns] # [END bigquery_dataframes_bqml_getting_started_tutorial_evaluate] + + # [START bigquery_dataframes_bqml_getting_started_tutorial_predict] + df = bpd.read_gbq( + """ + SELECT GENERATE_UUID() AS rowindex, * + FROM + `bigquery-public-data.google_analytics_sample.ga_sessions_*` + WHERE + _TABLE_SUFFIX BETWEEN '20170701' AND '20170801' + """, + index_col="rowindex", + ) + + operatingSystem = df["device"].struct.field("operatingSystem") + operatingSystem = operatingSystem.fillna("") + isMobile = df["device"].struct.field("isMobile") + country = df["geoNetwork"].struct.field("country").fillna("") + pageviews = df["totals"].struct.field("pageviews").fillna(0) + features = bpd.DataFrame( + { + "os": operatingSystem, + "is_mobile": isMobile, + "country": country, + "pageviews": pageviews, + } + ) + # Use Logistic Regression predict method to, find more information here in + # [BigFrames](/bigframes/latest/bigframes.ml.linear_model.LogisticRegression#bigframes_ml_linear_model_LogisticRegression_predict) + predictions = model.predict(features) + countries = predictions.groupby(["country"])[["predicted_transactions"]].sum() + # type(countries) + 
countries.sort_values(ascending=False).head(10) + + predictions = model.predict(features) + + visitor_id = predictions.groupby(["country"])[["predicted_transactions"]].sum() + + visitor_id.sort_values(ascending=False).head(10) + + # [END bigquery_dataframes_bqml_getting_started_tutorial_predict] From fb795261e7ac03f01e196616de806092365f05b7 Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 28 Feb 2024 15:03:13 -0600 Subject: [PATCH 05/10] correcting variable names --- samples/snippets/bqml_getting_started_test.py | 85 +++++++++---------- 1 file changed, 38 insertions(+), 47 deletions(-) diff --git a/samples/snippets/bqml_getting_started_test.py b/samples/snippets/bqml_getting_started_test.py index 72a6486cb3..74dd5d4501 100644 --- a/samples/snippets/bqml_getting_started_test.py +++ b/samples/snippets/bqml_getting_started_test.py @@ -26,17 +26,12 @@ def test_bqml_getting_started(random_model_id): # https://github.com/googleapis/python-bigquery-dataframes/issues/169 # for updates to `read_gbq` to support wildcard tables. - df = bpd.read_gbq( - """ - -- Since the order of rows isn't useful for the model training, - -- generate a random ID to use as the index for the DataFrame. - SELECT GENERATE_UUID() AS rowindex, * - FROM - `bigquery-public-data.google_analytics_sample.ga_sessions_*` - WHERE - _TABLE_SUFFIX BETWEEN '20160801' AND '20170630' - """, - index_col="rowindex", + df = bpd.read_gbq_table( + "bigquery-public-data.google_analytics_sample.ga_sessions_*", + filters=[ + ("_table_suffix", ">=", "20170701"), + ("_table_suffix", "<=", "20170801"), + ], ) # Extract the total number of transactions within @@ -56,11 +51,11 @@ def test_bqml_getting_started(random_model_id): label = transactions.notnull().map({True: 1, False: 0}) # Extract the operating system of the visitor's device. 
- operatingSystem = df["device"].struct.field("operatingSystem") - operatingSystem = operatingSystem.fillna("") + operating_system = df["device"].struct.field("operatingSystem") + operating_system = operating_system.fillna("") # Extract whether the visitor's device is a mobile device. - isMobile = df["device"].struct.field("isMobile") + is_mobile = df["device"].struct.field("isMobile") # Extract the country from which the sessions originated, based on the IP address. country = df["geoNetwork"].struct.field("country").fillna("") @@ -72,8 +67,8 @@ def test_bqml_getting_started(random_model_id): # to use as training data. features = bpd.DataFrame( { - "os": operatingSystem, - "is_mobile": isMobile, + "os": operating_system, + "isMobile": is_mobile, "country": country, "pageviews": pageviews, } @@ -107,27 +102,24 @@ def test_bqml_getting_started(random_model_id): # of the model. It was collected in the month immediately following the time # period spanned by the training data. - df = bpd.read_gbq( - """ - SELECT GENERATE_UUID() AS rowindex, * - FROM - `bigquery-public-data.google_analytics_sample.ga_sessions_*` - WHERE - _TABLE_SUFFIX BETWEEN '20170701' AND '20170801' - """, - index_col="rowindex", + df = bpd.read_gbq_table( + "bigquery-public-data.google_analytics_sample.ga_sessions_*", + filters=[ + ("_table_suffix", ">=", "20170701"), + ("_table_suffix", "<=", "20170801"), + ], ) transactions = df["totals"].struct.field("transactions") label = transactions.notnull().map({True: 1, False: 0}) - operatingSystem = df["device"].struct.field("operatingSystem") - operatingSystem = operatingSystem.fillna("") - isMobile = df["device"].struct.field("isMobile") + operating_system = df["device"].struct.field("operatingSystem") + operating_system = operating_system.fillna("") + is_mobile = df["device"].struct.field("isMobile") country = df["geoNetwork"].struct.field("country").fillna("") pageviews = df["totals"].struct.field("pageviews").fillna(0) features = bpd.DataFrame( { - 
"os": operatingSystem, - "is_mobile": isMobile, + "os": operating_system, + "isMobile": is_mobile, "country": country, "pageviews": pageviews, } @@ -164,26 +156,23 @@ def test_bqml_getting_started(random_model_id): # [END bigquery_dataframes_bqml_getting_started_tutorial_evaluate] # [START bigquery_dataframes_bqml_getting_started_tutorial_predict] - df = bpd.read_gbq( - """ - SELECT GENERATE_UUID() AS rowindex, * - FROM - `bigquery-public-data.google_analytics_sample.ga_sessions_*` - WHERE - _TABLE_SUFFIX BETWEEN '20170701' AND '20170801' - """, - index_col="rowindex", + df = bpd.read_gbq_table( + "bigquery-public-data.google_analytics_sample.ga_sessions_*", + filters=[ + ("_table_suffix", ">=", "20170701"), + ("_table_suffix", "<=", "20170801"), + ], ) - operatingSystem = df["device"].struct.field("operatingSystem") - operatingSystem = operatingSystem.fillna("") - isMobile = df["device"].struct.field("isMobile") + operating_system = df["device"].struct.field("operatingSystem") + operating_system = operating_system.fillna("") + is_mobile = df["device"].struct.field("isMobile") country = df["geoNetwork"].struct.field("country").fillna("") pageviews = df["totals"].struct.field("pageviews").fillna(0) features = bpd.DataFrame( { - "os": operatingSystem, - "is_mobile": isMobile, + "os": operating_system, + "isMobile": is_mobile, "country": country, "pageviews": pageviews, } @@ -192,13 +181,15 @@ def test_bqml_getting_started(random_model_id): # [BigFrames](/bigframes/latest/bigframes.ml.linear_model.LogisticRegression#bigframes_ml_linear_model_LogisticRegression_predict) predictions = model.predict(features) countries = predictions.groupby(["country"])[["predicted_transactions"]].sum() - # type(countries) + countries.sort_values(ascending=False).head(10) predictions = model.predict(features) - visitor_id = predictions.groupby(["country"])[["predicted_transactions"]].sum() + total_predicted_purchases = predictions.groupby(["country"])[ + ["predicted_transactions"] + 
].sum() - visitor_id.sort_values(ascending=False).head(10) + total_predicted_purchases.sort_values(ascending=False).head(10) # [END bigquery_dataframes_bqml_getting_started_tutorial_predict] From ad398ad5766af6edaed560b498b1f319e76447c9 Mon Sep 17 00:00:00 2001 From: Your Name Date: Mon, 4 Mar 2024 11:58:20 -0600 Subject: [PATCH 06/10] Correcting python variables --- samples/snippets/bqml_getting_started_test.py | 41 +++++++++++++------ 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/samples/snippets/bqml_getting_started_test.py b/samples/snippets/bqml_getting_started_test.py index 74dd5d4501..91c37f1340 100644 --- a/samples/snippets/bqml_getting_started_test.py +++ b/samples/snippets/bqml_getting_started_test.py @@ -29,8 +29,8 @@ def test_bqml_getting_started(random_model_id): df = bpd.read_gbq_table( "bigquery-public-data.google_analytics_sample.ga_sessions_*", filters=[ - ("_table_suffix", ">=", "20170701"), - ("_table_suffix", "<=", "20170801"), + ("_table_suffix", ">=", "20160801"), + ("_table_suffix", "<=", "20170630"), ], ) @@ -90,17 +90,17 @@ def test_bqml_getting_started(random_model_id): # [START bigquery_dataframes_bqml_getting_started_tutorial_evaluate] import bigframes.pandas as bpd - # Select model you'll use for training. `read_gbq_model` loads model data from a + # Select model you'll use for evaluating. `read_gbq_model` loads model data from a # BigQuery, but you could also use the `model` object from the previous steps. model = bpd.read_gbq_model( your_model_id, # For example: "bqml_tutorial.sample_model", ) - # The WHERE clause — _TABLE_SUFFIX BETWEEN '20170701' AND '20170801' — - # limits the number of tables scanned by the query. The date range scanned is - # July 1, 2017 to August 1, 2017. This is the data you're using to evaluate the predictive performance - # of the model. It was collected in the month immediately following the time - # period spanned by the training data. 
+ # The filters parameter limits the number of tables scanned by the query. + # The date range scanned is July 1, 2017 to August 1, 2017. This is the + # data you're using to evaluate the predictive performance of the model. + # It was collected in the month immediately following the time period + # spanned by the training data. df = bpd.read_gbq_table( "bigquery-public-data.google_analytics_sample.ga_sessions_*", @@ -143,7 +143,7 @@ def test_bqml_getting_started(random_model_id): # - log_loss — The loss function used in a logistic regression. This is the measure of how far the # model's predictions are from the correct labels. - # - roc_auc — The area under the ROC curve. This is the probability that a classifier is more confident that + # - roc_auc — The area under the ROC curve. This is the probability that a classifier is morepy confident that # a randomly chosen positive example # is actually positive than that a randomly chosen negative example is positive. For more information, # see ['Classification']('https://developers.google.com/machine-learning/crash-course/classification/video-lecture') @@ -155,7 +155,14 @@ def test_bqml_getting_started(random_model_id): # [1 rows x 6 columns] # [END bigquery_dataframes_bqml_getting_started_tutorial_evaluate] - # [START bigquery_dataframes_bqml_getting_started_tutorial_predict] + # [START bigquery_dataframes_bqml_getting_started_tutorial_predict_by_country] + + # Select model you'll use for prediciting. `read_gbq_model` loads model data from a + # BigQuery, but you could also use the `model` object from the previous steps. 
+ model = bpd.read_gbq_model( + your_model_id, # For example: "bqml_tutorial.sample_model", + ) + df = bpd.read_gbq_table( "bigquery-public-data.google_analytics_sample.ga_sessions_*", filters=[ @@ -178,13 +185,21 @@ def test_bqml_getting_started(random_model_id): } ) # Use Logistic Regression predict method to, find more information here in - # [BigFrames](/bigframes/latest/bigframes.ml.linear_model.LogisticRegression#bigframes_ml_linear_model_LogisticRegression_predict) + # [BigFrames](https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.linear_model.LogisticRegression#bigframes_ml_linear_model_LogisticRegression_predict) + + # This code groups the DataFrame by 'country', calculates the sum of + # 'predicted_purchase' for each group, sorts the results by the sum in + # descending order, and selects the top 10 rows using the 'head' method. + predictions = model.predict(features) countries = predictions.groupby(["country"])[["predicted_transactions"]].sum() countries.sort_values(ascending=False).head(10) - predictions = model.predict(features) + # [END bigquery_dataframes_bqml_getting_started_tutorial_predict_by_country] + + # [START bigquery_dataframes_bqml_getting_started_tutorial_predict_by_visitor_id] + predictions = model.predict(features, label) total_predicted_purchases = predictions.groupby(["country"])[ ["predicted_transactions"] @@ -192,4 +207,4 @@ def test_bqml_getting_started(random_model_id): total_predicted_purchases.sort_values(ascending=False).head(10) - # [END bigquery_dataframes_bqml_getting_started_tutorial_predict] + # [END bigquery_dataframes_bqml_getting_started_tutorial_predict_by_visitor_id] From ca17b39f497201a8dfad4bf707e3873ef3cde1db Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 6 Mar 2024 13:04:05 -0600 Subject: [PATCH 07/10] feat: add predict by visit to samples/snippets/bqml_getting_started_test.py --- samples/snippets/bqml_getting_started_test.py | 100 +++++++++++++++--- 1 file changed, 83 insertions(+), 
17 deletions(-) diff --git a/samples/snippets/bqml_getting_started_test.py b/samples/snippets/bqml_getting_started_test.py index 74dd5d4501..3f1a1453ec 100644 --- a/samples/snippets/bqml_getting_started_test.py +++ b/samples/snippets/bqml_getting_started_test.py @@ -14,7 +14,7 @@ def test_bqml_getting_started(random_model_id): - your_model_id = random_model_id + your_model_id = random_model_id # for example: bqml_tutorial.sample_model # [START bigquery_dataframes_bqml_getting_started_tutorial] from bigframes.ml.linear_model import LogisticRegression @@ -29,8 +29,8 @@ def test_bqml_getting_started(random_model_id): df = bpd.read_gbq_table( "bigquery-public-data.google_analytics_sample.ga_sessions_*", filters=[ - ("_table_suffix", ">=", "20170701"), - ("_table_suffix", "<=", "20170801"), + ("_table_suffix", ">=", "20160801"), + ("_table_suffix", "<=", "20170630"), ], ) @@ -68,7 +68,7 @@ def test_bqml_getting_started(random_model_id): features = bpd.DataFrame( { "os": operating_system, - "isMobile": is_mobile, + "is_mobile": is_mobile, "country": country, "pageviews": pageviews, } @@ -96,9 +96,7 @@ def test_bqml_getting_started(random_model_id): your_model_id, # For example: "bqml_tutorial.sample_model", ) - # The WHERE clause — _TABLE_SUFFIX BETWEEN '20170701' AND '20170801' — - # limits the number of tables scanned by the query. The date range scanned is - # July 1, 2017 to August 1, 2017. This is the data you're using to evaluate the predictive performance + # July 1, 2017 to August 1, 2017 is the data you're using to evaluate the predictive performance # of the model. It was collected in the month immediately following the time # period spanned by the training data. 
@@ -109,6 +107,7 @@ def test_bqml_getting_started(random_model_id): ("_table_suffix", "<=", "20170801"), ], ) + transactions = df["totals"].struct.field("transactions") label = transactions.notnull().map({True: 1, False: 0}) operating_system = df["device"].struct.field("operatingSystem") @@ -119,7 +118,7 @@ def test_bqml_getting_started(random_model_id): features = bpd.DataFrame( { "os": operating_system, - "isMobile": is_mobile, + "is_mobile": is_mobile, "country": country, "pageviews": pageviews, } @@ -155,7 +154,14 @@ def test_bqml_getting_started(random_model_id): # [1 rows x 6 columns] # [END bigquery_dataframes_bqml_getting_started_tutorial_evaluate] - # [START bigquery_dataframes_bqml_getting_started_tutorial_predict] + # [START bigquery_dataframes_bqml_getting_started_tutorial_predict_by_country] + import bigframes.pandas as bpd + + # Select model you'll use for training. `read_gbq_model` loads model data from a + # BigQuery, but you could also use the `model` object from the previous steps. 
+ model = bpd.read_gbq_model( + your_model_id, # For example: "bqml_tutorial.sample_model", + ) df = bpd.read_gbq_table( "bigquery-public-data.google_analytics_sample.ga_sessions_*", filters=[ @@ -172,24 +178,84 @@ def test_bqml_getting_started(random_model_id): features = bpd.DataFrame( { "os": operating_system, - "isMobile": is_mobile, + "is_mobile": is_mobile, "country": country, "pageviews": pageviews, } ) # Use Logistic Regression predict method to, find more information here in - # [BigFrames](/bigframes/latest/bigframes.ml.linear_model.LogisticRegression#bigframes_ml_linear_model_LogisticRegression_predict) + # [BigFrames](https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.linear_model.LogisticRegression#bigframes_ml_linear_model_LogisticRegression_predict) + predictions = model.predict(features) - countries = predictions.groupby(["country"])[["predicted_transactions"]].sum() + total_predicted_purchases = predictions.groupby(["country"])[ + ["predicted_label"] + ].sum() + total_predicted_purchases.sort_values(ascending=False).head(10) - countries.sort_values(ascending=False).head(10) + # country # total_predicted_purchases + # United States 220 + # Taiwan 8 + # Canada 7 + # India 2 + # Japan 2 + # Turkey 2 + # Australia 1 + # Brazil 1 + # Germany 1 + # Guyana 1 + # Name: predicted_label, dtype: Int64 - predictions = model.predict(features) + # [END bigquery_dataframes_bqml_getting_started_tutorial_predict_by_country] - total_predicted_purchases = predictions.groupby(["country"])[ - ["predicted_transactions"] + # [START bigquery_dataframes_bqml_getting_started_tutorial_predict_by_visitor] + + model = bpd.read_gbq_model( + your_model_id, # For example: "bqml_tutorial.sample_model", + ) + df = bpd.read_gbq_table( + "bigquery-public-data.google_analytics_sample.ga_sessions_*", + filters=[ + ("_table_suffix", ">=", "20170701"), + ("_table_suffix", "<=", "20170801"), + ], + ) + + operating_system = 
df["device"].struct.field("operatingSystem") + operating_system = operating_system.fillna("") + is_mobile = df["device"].struct.field("isMobile") + country = df["geoNetwork"].struct.field("country").fillna("") + pageviews = df["totals"].struct.field("pageviews").fillna(0) + full_visitor_id = df["fullVisitorId"] + + features = bpd.DataFrame( + { + "os": operating_system, + "is_mobile": is_mobile, + "country": country, + "pageviews": pageviews, + "fullVisitorId": full_visitor_id, + } + ) + + predictions = model.predict(features) + total_predicted_purchases = predictions.groupby(["fullVisitorId"])[ + ["predicted_label"] ].sum() total_predicted_purchases.sort_values(ascending=False).head(10) - # [END bigquery_dataframes_bqml_getting_started_tutorial_predict] + # fullVisitorId # total_predicted_purchases + # 9417857471295131045 4 + # 0376394056092189113 2 + # 0456807427403774085 2 + # 057693500927581077 2 + # 112288330928895942 2 + # 1280993661204347450 2 + # 2105122376016897629 2 + # 2158257269735455737 2 + # 2969418676126258798 2 + # 489038402765684003 2 + # Name: predicted_label, dtype: Int64 + + +# [END bigquery_dataframes_bqml_getting_started_tutorial_predict_by_visitor] From 9df8bddadbf8fda97b5db30ac9eb072defe5e0c7 Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 6 Mar 2024 14:10:48 -0600 Subject: [PATCH 08/10] file --- samples/snippets/bqml_getting_started_test.py | 34 ++++++++++++++++--- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/samples/snippets/bqml_getting_started_test.py b/samples/snippets/bqml_getting_started_test.py index 523b399b3d..05893c7fc4 100644 --- a/samples/snippets/bqml_getting_started_test.py +++ b/samples/snippets/bqml_getting_started_test.py @@ -16,6 +16,7 @@ def test_bqml_getting_started(random_model_id): your_model_id = random_model_id # for example: bqml_tutorial.sample_model + your_model_id = "stabd-testing.bqml_tutorial1.sample_model" # [START bigquery_dataframes_bqml_getting_started_tutorial] from 
bigframes.ml.linear_model import LogisticRegression import bigframes.pandas as bpd @@ -48,7 +49,7 @@ def test_bqml_getting_started(random_model_id): # ecommerce transactions within the Google Analytics session. # If the number of transactions is NULL, the value in the label # column is set to 0. Otherwise, it is set to 1. - label = transactions.notnull().map({True: 1, False: 0}) + label = transactions.notnull().map({True: 1, False: 0}).rename("label") # Extract the operating system of the visitor's device. operating_system = df["device"].struct.field("operatingSystem") @@ -110,7 +111,7 @@ def test_bqml_getting_started(random_model_id): ) transactions = df["totals"].struct.field("transactions") - label = transactions.notnull().map({True: 1, False: 0}) + label = transactions.notnull().map({True: 1, False: 0}).rename("label") operating_system = df["device"].struct.field("operatingSystem") operating_system = operating_system.fillna("") is_mobile = df["device"].struct.field("isMobile") @@ -196,12 +197,19 @@ def test_bqml_getting_started(random_model_id): # [BigFrames](https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.linear_model.LogisticRegression#bigframes_ml_linear_model_LogisticRegression_predict) predictions = model.predict(features) + + # Call groupby method to group predicted_label by country. + # Call sum method to get the total_predicted_label by country. total_predicted_purchases = predictions.groupby(["country"])[ ["predicted_label"] ].sum() + + # Call the sort_values method with the parameter + # ascending = False to get the highest values. + # Call head method to limit to the 10 highest values. 
total_predicted_purchases.sort_values(ascending=False).head(10) - # country # total_predicted_purchases + # country # United States 220 # Taiwan 8 # Canada 7 @@ -218,9 +226,21 @@ def test_bqml_getting_started(random_model_id): # [START bigquery_dataframes_bqml_getting_started_tutorial_predict_by_visitor] + import bigframes.pandas as bpd + + # Select model you'll use for predicting. + # `read_gbq_model` loads model data from + # BigQuery, but you could also use the `model` + # object from the previous steps. model = bpd.read_gbq_model( your_model_id, # For example: "bqml_tutorial.sample_model", ) + + # The filters parameter limits the number of tables scanned by the query. + # The date range scanned is July 1, 2017 to August 1, 2017. This is the + # data you're using to make the prediction. + # It was collected in the month immediately following the time period + # spanned by the training data. df = bpd.read_gbq_table( "bigquery-public-data.google_analytics_sample.ga_sessions_*", filters=[ @@ -247,13 +267,19 @@ def test_bqml_getting_started(random_model_id): ) predictions = model.predict(features) + + # Call groupby method to group predicted_label by visitor. + # Call sum method to get the total_predicted_label by visitor. total_predicted_purchases = predictions.groupby(["fullVisitorId"])[ ["predicted_label"] ].sum() + # Call the sort_values method with the parameter + # ascending = False to get the highest values. + # Call head method to limit to the 10 highest values. 
total_predicted_purchases.sort_values(ascending=False).head(10) - # fullVisitorId # total_predicted_purchases + # fullVisitorId # 9417857471295131045 4 # 0376394056092189113 2 # 0456807427403774085 2 From 1a25f5f322f484b8702f0d21998af3243c2e2b31 Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 6 Mar 2024 14:11:33 -0600 Subject: [PATCH 09/10] file --- samples/snippets/bqml_getting_started_test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/samples/snippets/bqml_getting_started_test.py b/samples/snippets/bqml_getting_started_test.py index 05893c7fc4..c5a581d87a 100644 --- a/samples/snippets/bqml_getting_started_test.py +++ b/samples/snippets/bqml_getting_started_test.py @@ -16,7 +16,6 @@ def test_bqml_getting_started(random_model_id): your_model_id = random_model_id # for example: bqml_tutorial.sample_model - your_model_id = "stabd-testing.bqml_tutorial1.sample_model" # [START bigquery_dataframes_bqml_getting_started_tutorial] from bigframes.ml.linear_model import LogisticRegression import bigframes.pandas as bpd From daa3bdbe9e04ad834e87c00d2b069299bf50e0b2 Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 6 Mar 2024 14:17:20 -0600 Subject: [PATCH 10/10] file --- samples/snippets/bqml_getting_started_test.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/samples/snippets/bqml_getting_started_test.py b/samples/snippets/bqml_getting_started_test.py index c5a581d87a..d9f9135faa 100644 --- a/samples/snippets/bqml_getting_started_test.py +++ b/samples/snippets/bqml_getting_started_test.py @@ -143,7 +143,7 @@ def test_bqml_getting_started(random_model_id): # - log_loss — The loss function used in a logistic regression. This is the measure of how far the # model's predictions are from the correct labels. - # - roc_auc — The area under the ROC curve. This is the probability that a classifier is morepy confident that + # - roc_auc — The area under the ROC curve. 
This is the probability that a classifier is more confident that # a randomly chosen positive example # is actually positive than that a randomly chosen negative example is positive. For more information, # see ['Classification']('https://developers.google.com/machine-learning/crash-course/classification/video-lecture') @@ -192,7 +192,9 @@ def test_bqml_getting_started(random_model_id): "pageviews": pageviews, } ) - # Use Logistic Regression predict method to, find more information here in + # Use Logistic Regression predict method to predict results + # using your model. + # Find more information here in # [BigFrames](https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.linear_model.LogisticRegression#bigframes_ml_linear_model_LogisticRegression_predict) predictions = model.predict(features)