Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

docs: add predict sample to samples/snippets/bqml_getting_started_test.py #388

Merged
merged 27 commits into from
Mar 8, 2024
Merged
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
4cf9a0e
docs: Add a sample to demonstrate the evaluation results
DevStephanie Jan 31, 2024
ffcf185
Adding comments explaining logistic regression results
DevStephanie Feb 1, 2024
8e5ba68
editing read_gbd explanation
DevStephanie Feb 5, 2024
202bf76
docs: add predict sample to samples/snippets/bqml_getting_started_tes…
DevStephanie Feb 23, 2024
ca3783f
Merge remote-tracking branch 'origin/main' into bqml_predict1
DevStephanie Feb 23, 2024
d3a8d8d
Merge branch 'main' into bqml_predict1
DevStephanie Feb 26, 2024
7198e7f
Merge branch 'main' into bqml_predict1
DevStephanie Feb 27, 2024
4984cfc
Merge branch 'main' into bqml_predict1
DevStephanie Feb 27, 2024
b89f30b
Merge branch 'main' of https://github.com/googleapis/python-bigquery-…
DevStephanie Feb 28, 2024
0aba4d2
Merge branch 'main' into bqml_predict1
DevStephanie Feb 28, 2024
fb79526
correcting variable names
DevStephanie Feb 28, 2024
b6d6430
Merge remote-tracking branch 'refs/remotes/origin/main' into bqml_pre…
DevStephanie Feb 28, 2024
262661c
Merge remote-tracking branch 'origin/bqml_predict1' into bqml_predict1
DevStephanie Feb 28, 2024
ad398ad
Correcting python variables
DevStephanie Mar 4, 2024
f0eaa6c
Merge branch 'main' into bqml_predict1
DevStephanie Mar 4, 2024
7f06521
Merge branch 'main' into bqml_predict2
DevStephanie Mar 4, 2024
ca17b39
feat: add predict by visit to samples/snippets/bqml_getting_started_t…
DevStephanie Mar 6, 2024
190cf9e
Merge branch 'bqml_predict2' into bqml_predict1
DevStephanie Mar 6, 2024
9df8bdd
file
DevStephanie Mar 6, 2024
1a25f5f
file
DevStephanie Mar 6, 2024
daa3bdb
file
DevStephanie Mar 6, 2024
bde7a12
Merge branch 'main' into bqml_predict1
tswast Mar 6, 2024
3613489
Merge branch 'main' into bqml_predict1
tswast Mar 6, 2024
249631c
Merge branch 'main' into bqml_predict1
tswast Mar 7, 2024
6ef78bb
Merge branch 'main' into bqml_predict1
tswast Mar 7, 2024
aa6d323
Merge branch 'main' into bqml_predict1
tswast Mar 7, 2024
defabf8
Merge branch 'main' into bqml_predict1
tswast Mar 8, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
213 changes: 171 additions & 42 deletions samples/snippets/bqml_getting_started_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@


def test_bqml_getting_started(random_model_id):
your_model_id = random_model_id
your_model_id = random_model_id # for example: bqml_tutorial.sample_model

# [START bigquery_dataframes_bqml_getting_started_tutorial]
from bigframes.ml.linear_model import LogisticRegression
Expand All @@ -26,17 +26,12 @@ def test_bqml_getting_started(random_model_id):
# https://github.com/googleapis/python-bigquery-dataframes/issues/169
# for updates to `read_gbq` to support wildcard tables.

df = bpd.read_gbq(
"""
-- Since the order of rows isn't useful for the model training,
-- generate a random ID to use as the index for the DataFrame.
SELECT GENERATE_UUID() AS rowindex, *
FROM
`bigquery-public-data.google_analytics_sample.ga_sessions_*`
WHERE
_TABLE_SUFFIX BETWEEN '20160801' AND '20170630'
""",
index_col="rowindex",
df = bpd.read_gbq_table(
"bigquery-public-data.google_analytics_sample.ga_sessions_*",
filters=[
("_table_suffix", ">=", "20160801"),
("_table_suffix", "<=", "20170630"),
],
)

# Extract the total number of transactions within
Expand All @@ -53,14 +48,14 @@ def test_bqml_getting_started(random_model_id):
# ecommerce transactions within the Google Analytics session.
# If the number of transactions is NULL, the value in the label
# column is set to 0. Otherwise, it is set to 1.
label = transactions.notnull().map({True: 1, False: 0})
label = transactions.notnull().map({True: 1, False: 0}).rename("label")

# Extract the operating system of the visitor's device.
operatingSystem = df["device"].struct.field("operatingSystem")
operatingSystem = operatingSystem.fillna("")
operating_system = df["device"].struct.field("operatingSystem")
operating_system = operating_system.fillna("")

# Extract whether the visitor's device is a mobile device.
isMobile = df["device"].struct.field("isMobile")
is_mobile = df["device"].struct.field("isMobile")

# Extract the country from which the sessions originated, based on the IP address.
country = df["geoNetwork"].struct.field("country").fillna("")
Expand All @@ -72,8 +67,8 @@ def test_bqml_getting_started(random_model_id):
# to use as training data.
features = bpd.DataFrame(
{
"os": operatingSystem,
"is_mobile": isMobile,
"os": operating_system,
"is_mobile": is_mobile,
"country": country,
"pageviews": pageviews,
}
Expand All @@ -95,39 +90,36 @@ def test_bqml_getting_started(random_model_id):
# [START bigquery_dataframes_bqml_getting_started_tutorial_evaluate]
import bigframes.pandas as bpd

# Select model you'll use for training. `read_gbq_model` loads model data from a
# Select model you'll use for evaluating. `read_gbq_model` loads model data from
# BigQuery, but you could also use the `model` object from the previous steps.
model = bpd.read_gbq_model(
your_model_id, # For example: "bqml_tutorial.sample_model",
)

# The WHERE clause — _TABLE_SUFFIX BETWEEN '20170701' AND '20170801' —
# limits the number of tables scanned by the query. The date range scanned is
# July 1, 2017 to August 1, 2017. This is the data you're using to evaluate the predictive performance
# of the model. It was collected in the month immediately following the time
# period spanned by the training data.

df = bpd.read_gbq(
"""
SELECT GENERATE_UUID() AS rowindex, *
FROM
`bigquery-public-data.google_analytics_sample.ga_sessions_*`
WHERE
_TABLE_SUFFIX BETWEEN '20170701' AND '20170801'
""",
index_col="rowindex",
# The filters parameter limits the number of tables scanned by the query.
# The date range scanned is July 1, 2017 to August 1, 2017. This is the
# data you're using to evaluate the predictive performance of the model.
# It was collected in the month immediately following the time period
# spanned by the training data.
df = bpd.read_gbq_table(
"bigquery-public-data.google_analytics_sample.ga_sessions_*",
filters=[
("_table_suffix", ">=", "20170701"),
("_table_suffix", "<=", "20170801"),
],
)

transactions = df["totals"].struct.field("transactions")
label = transactions.notnull().map({True: 1, False: 0})
operatingSystem = df["device"].struct.field("operatingSystem")
operatingSystem = operatingSystem.fillna("")
isMobile = df["device"].struct.field("isMobile")
label = transactions.notnull().map({True: 1, False: 0}).rename("label")
operating_system = df["device"].struct.field("operatingSystem")
operating_system = operating_system.fillna("")
is_mobile = df["device"].struct.field("isMobile")
country = df["geoNetwork"].struct.field("country").fillna("")
pageviews = df["totals"].struct.field("pageviews").fillna(0)
features = bpd.DataFrame(
{
"os": operatingSystem,
"is_mobile": isMobile,
"os": operating_system,
"is_mobile": is_mobile,
"country": country,
"pageviews": pageviews,
}
Expand Down Expand Up @@ -163,6 +155,143 @@ def test_bqml_getting_started(random_model_id):
# [1 rows x 6 columns]
# [END bigquery_dataframes_bqml_getting_started_tutorial_evaluate]

# [START bigquery_dataframes_bqml_getting_started_tutorial_predict]
# [START bigquery_dataframes_bqml_getting_started_tutorial_predict_by_country]
import bigframes.pandas as bpd

# Select model you'll use for predicting.
# `read_gbq_model` loads model data from
# BigQuery, but you could also use the `model`
# object from the previous steps.
model = bpd.read_gbq_model(
your_model_id, # For example: "bqml_tutorial.sample_model",
)

# The filters parameter limits the number of tables scanned by the query.
# The date range scanned is July 1, 2017 to August 1, 2017. This is the
# data you're using to make the prediction.
# It was collected in the month immediately following the time period
# spanned by the training data.
df = bpd.read_gbq_table(
"bigquery-public-data.google_analytics_sample.ga_sessions_*",
filters=[
("_table_suffix", ">=", "20170701"),
("_table_suffix", "<=", "20170801"),
],
)

operating_system = df["device"].struct.field("operatingSystem")
operating_system = operating_system.fillna("")
is_mobile = df["device"].struct.field("isMobile")
country = df["geoNetwork"].struct.field("country").fillna("")
pageviews = df["totals"].struct.field("pageviews").fillna(0)
features = bpd.DataFrame(
{
"os": operating_system,
"is_mobile": is_mobile,
"country": country,
"pageviews": pageviews,
}
)
# Use the LogisticRegression `predict` method to predict results
# using your model.
# For more information, see the reference documentation in
# [BigFrames](https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.linear_model.LogisticRegression#bigframes_ml_linear_model_LogisticRegression_predict)

predictions = model.predict(features)

# Call the groupby method to group predicted_label by country.
# Call the sum method to get the total predicted_label by country.
total_predicted_purchases = predictions.groupby(["country"])[
["predicted_label"]
].sum()

# Call the sort_values method with the parameter
# ascending = False to get the highest values.
# Call head method to limit to the 10 highest values.
total_predicted_purchases.sort_values(ascending=False).head(10)

# country
# United States 220
# Taiwan 8
# Canada 7
# India 2
# Japan 2
# Turkey 2
# Australia 1
# Brazil 1
# Germany 1
# Guyana 1
# Name: predicted_label, dtype: Int64

# [END bigquery_dataframes_bqml_getting_started_tutorial_predict_by_country]

# [START bigquery_dataframes_bqml_getting_started_tutorial_predict_by_visitor]

import bigframes.pandas as bpd

# Select model you'll use for predicting.
# `read_gbq_model` loads model data from
# BigQuery, but you could also use the `model`
# object from the previous steps.
model = bpd.read_gbq_model(
your_model_id, # For example: "bqml_tutorial.sample_model",
)

# The filters parameter limits the number of tables scanned by the query.
# The date range scanned is July 1, 2017 to August 1, 2017. This is the
# data you're using to make the prediction.
# It was collected in the month immediately following the time period
# spanned by the training data.
df = bpd.read_gbq_table(
"bigquery-public-data.google_analytics_sample.ga_sessions_*",
filters=[
("_table_suffix", ">=", "20170701"),
("_table_suffix", "<=", "20170801"),
],
)

operating_system = df["device"].struct.field("operatingSystem")
operating_system = operating_system.fillna("")
is_mobile = df["device"].struct.field("isMobile")
country = df["geoNetwork"].struct.field("country").fillna("")
pageviews = df["totals"].struct.field("pageviews").fillna(0)
full_visitor_id = df["fullVisitorId"]

features = bpd.DataFrame(
{
"os": operating_system,
"is_mobile": is_mobile,
"country": country,
"pageviews": pageviews,
"fullVisitorId": full_visitor_id,
}
)

predictions = model.predict(features)

# Call the groupby method to group predicted_label by visitor.
# Call the sum method to get the total predicted_label by visitor.
total_predicted_purchases = predictions.groupby(["fullVisitorId"])[
["predicted_label"]
].sum()

# Call the sort_values method with the parameter
# ascending = False to get the highest values.
# Call head method to limit to the 10 highest values.
total_predicted_purchases.sort_values(ascending=False).head(10)

DevStephanie marked this conversation as resolved.
Show resolved Hide resolved
# fullVisitorId
# 9417857471295131045 4
# 0376394056092189113 2
# 0456807427403774085 2
# 057693500927581077 2
# 112288330928895942 2
# 1280993661204347450 2
# 2105122376016897629 2
# 2158257269735455737 2
# 2969418676126258798 2
# 489038402765684003 2
# Name: predicted_label, dtype: Int64


# [END bigquery_dataframes_bqml_getting_started_tutorial_predict]
# [END bigquery_dataframes_bqml_getting_started_tutorial_predict_by_visitor]