-
Notifications
You must be signed in to change notification settings - Fork 43
/
Copy pathbqml_getting_started_test.py
93 lines (79 loc) · 3.77 KB
/
bqml_getting_started_test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
def test_bqml_getting_started(random_model_id):
your_model_id = random_model_id
# [START bigquery_dataframes_bqml_getting_started_tutorial]
from bigframes.ml.linear_model import LogisticRegression
import bigframes.pandas as bpd
# Start by selecting the data you'll use for training. `read_gbq` accepts
# either a SQL query or a table ID. Since this example selects from multiple
# tables via a wildcard, use SQL to define this data. Watch issue
# https://github.com/googleapis/python-bigquery-dataframes/issues/169
# for updates to `read_gbq` to support wildcard tables.
df = bpd.read_gbq(
"""
-- Since the order of rows isn't useful for the model training,
-- generate a random ID to use as the index for the DataFrame.
SELECT GENERATE_UUID() AS rowindex, *
FROM
`bigquery-public-data.google_analytics_sample.ga_sessions_*`
WHERE
_TABLE_SUFFIX BETWEEN '20160801' AND '20170630'
""",
index_col="rowindex",
)
# Extract the total number of transactions within
# the Google Analytics session.
#
# Because the totals column is a STRUCT data type, call
# Series.struct.field("transactions") to extract the transactions field.
# See the reference documentation below:
# https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.operations.structs.StructAccessor#bigframes_operations_structs_StructAccessor_field
transactions = df["totals"].struct.field("transactions")
# The "label" values represent the outcome of the model's
# prediction. In this case, the model predicts if there are any
# ecommerce transactions within the Google Analytics session.
# If the number of transactions is NULL, the value in the label
# column is set to 0. Otherwise, it is set to 1.
label = transactions.notnull().map({True: 1, False: 0})
# Extract the operating system of the visitor's device.
operatingSystem = df["device"].struct.field("operatingSystem")
operatingSystem = operatingSystem.fillna("")
# Extract whether the visitor's device is a mobile device.
isMobile = df["device"].struct.field("isMobile")
# Extract the country from which the sessions originated, based on the IP address.
country = df["geoNetwork"].struct.field("country").fillna("")
# Extract the total number of page views within the session.
pageviews = df["totals"].struct.field("pageviews").fillna(0)
# Combine all the feature columns into a single DataFrame
# to use as training data.
features = bpd.DataFrame(
{
"os": operatingSystem,
"is_mobile": isMobile,
"country": country,
"pageviews": pageviews,
}
)
# Logistic Regression model splits data into two classes, giving the
# a confidence score that the data is in one of the classes.
model = LogisticRegression()
model.fit(features, label)
# The model.fit() call above created a temporary model.
# Use the to_gbq() method to write to a permanent location.
model.to_gbq(
your_model_id, # For example: "bqml_tutorial.sample_model",
replace=True,
)
# [END bigquery_dataframes_bqml_getting_started_tutorial]