-
Notifications
You must be signed in to change notification settings - Fork 73
/
sp_demo_bigsearch.sql
174 lines (142 loc) · 8.21 KB
/
sp_demo_bigsearch.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
/*##################################################################################
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
###################################################################################*/
/*
Use Cases:
- Show BigSearch on 5.83 TB table with 5.2 billion rows
- Show the table with and without a BigSearch index
- NOTE: You will be consuming at lot of bytes scanned during this demo ($6.25 per 1 TB scanned)
- NOTE: Drop the Large tables to save on Costs! (at end of script)
Description:
- Shows searching a large table on various columns with various partitioning filters
Show:
-
References:
- https://cloud.google.com/bigquery/docs/search-intro
Clean up / Reset script:
DROP SEARCH INDEX IF EXISTS idx_all_bigsearch_log_5b_5t_json_hourly ON `${project_id}.${bigquery_taxi_dataset}.bigsearch_log_5b_5t_json_hourly_INDEXED`;
DROP TABLE IF EXISTS `${project_id}.${bigquery_taxi_dataset}.bigsearch_log_5b_5t_json_hourly_NOT_INDEXED`;
DROP TABLE IF EXISTS `${project_id}.${bigquery_taxi_dataset}.bigsearch_log_5b_5t_json_hourly_INDEXED`;
More Data (BigSearch on more Data):
- NOTE: To run some SQL on a 60TB, 50 billon row table head over to the shared project
- Open a new tab and paste the URL below.
- https://console.cloud.google.com/bigquery?project=${shared_demo_project_id}
- OPEN the stored procedure: ${shared_demo_project_id}.bigquery_features.sp_demo_bigsearch_50b_60t
*/
---------------------------------------------------------------------------------------------------------
-- NOTE: You might want to copy the tables and create the indexs BEFORE your demo.
-- It can take 15 minutes for everything to complete!
---------------------------------------------------------------------------------------------------------
CREATE TABLE `${project_id}.${bigquery_taxi_dataset}.bigsearch_log_5b_5t_json_hourly_NOT_INDEXED`
COPY `${shared_demo_project_id}.data_analytics_shared_data.bigsearch_log_5b_5t_json_hourly`;
CREATE TABLE `${project_id}.${bigquery_taxi_dataset}.bigsearch_log_5b_5t_json_hourly_INDEXED`
COPY `${shared_demo_project_id}.data_analytics_shared_data.bigsearch_log_5b_5t_json_hourly`;
---------------------------------------------------------------------------------------------------------
-- Create an index on the "Indexed" table
---------------------------------------------------------------------------------------------------------
-- LOG_ANALYZER analyzer works well for machine generated logs and has special rules around tokens commonly found in observability data, such as IP addresses or emails.
CREATE SEARCH INDEX idx_all_bigsearch_log_5b_5t_json_hourly
ON `${project_id}.${bigquery_taxi_dataset}.bigsearch_log_5b_5t_json_hourly_INDEXED` (ALL COLUMNS)
OPTIONS (analyzer = 'LOG_ANALYZER');
-- Check our index (index_status = "Active", "total storage bytes" = 117 GB)
-- See the unindexed_row_count (rows not indexed)
-- Make sure your coverage_percent = 100 (so we know we are done indexing)
SELECT * -- table_name, index_name, ddl, coverage_percentage
FROM `${project_id}.${bigquery_taxi_dataset}.INFORMATION_SCHEMA.SEARCH_INDEXES`
WHERE index_status = 'ACTIVE';
-- See the columns that are in the index
SELECT * -- table_name, index_name, ddl, coverage_percentage
FROM `${project_id}.${bigquery_taxi_dataset}.INFORMATION_SCHEMA.SEARCH_INDEX_COLUMNS`
WHERE index_name = 'idx_all_bigsearch_log_5b_5t_json_hourly';
---------------------------------------------------------------------------------------------------------
-- Compare the INDEX versus the NOT INDEX tables for performance
---------------------------------------------------------------------------------------------------------
-- NOT_INDEXED
-- Search the Text Payload field of the log table (lots of text to sort through)
-- Result: Cnt = 351,124
-- Duration: 8 sec
-- Bytes processed: 2.99 TB
SELECT COUNT(*) AS Cnt
FROM `${project_id}.${bigquery_taxi_dataset}.bigsearch_log_5b_5t_json_hourly_NOT_INDEXED`
WHERE SEARCH(textPayload, 'service281');
-- Search the Text Payload field of the log table (lots of text to sort through)
-- Result: Cnt = 351,124
-- Duration: 6 sec
-- Bytes processed: 2.99 TB
-- Index Usage Mode: FULLY_USED
SELECT COUNT(*) AS Cnt
FROM `${project_id}.${bigquery_taxi_dataset}.bigsearch_log_5b_5t_json_hourly_INDEXED`
WHERE SEARCH(textPayload, 'service281');
-- NOT_INDEXED
-- Search the field "insertId" for a value (just one column)
-- Duration: 5 sec
-- Bytes processed: 5.83 TB
-- Index Usage Mode: UNUSED
SELECT *
FROM `${project_id}.${bigquery_taxi_dataset}.bigsearch_log_5b_5t_json_hourly_NOT_INDEXED`
WHERE SEARCH(insertId, '40ms4zeh78tg6o6jx');
-- Search the field "insertId" for a value (just one column)
-- Duration: 0 sec
-- Bytes processed: 81.44 MB
-- Index Usage Mode: FULLY_USED
SELECT *
FROM `${project_id}.${bigquery_taxi_dataset}.bigsearch_log_5b_5t_json_hourly_INDEXED`
WHERE SEARCH(insertId, '40ms4zeh78tg6o6jx');
---------------------------------------------------------------------------------------------------------
-- Compare RegEx search versus BigSearch SEARCH
---------------------------------------------------------------------------------------------------------
-- Use RegEx
-- Duration: 5 sec
-- Bytes processed: 739.99 GB
-- Count: 26,504
SELECT COUNT(*) AS Cnt
FROM `${project_id}.${bigquery_taxi_dataset}.bigsearch_log_5b_5t_json_hourly_INDEXED`
WHERE REGEXP_CONTAINS(internalClusterId, r"(?i)(\b|_)gke-workload-test-gpu-pool-c358b5f5-2fll(\b|_)");
-- Use BQ Search
-- Duration: 2 sec
-- Bytes processed: 123.58 GB
-- Count: 26,504
SELECT COUNT(*) AS Cnt
FROM `${project_id}.${bigquery_taxi_dataset}.bigsearch_log_5b_5t_json_hourly_INDEXED`
WHERE SEARCH(internalClusterId, '`gke-workload-test-gpu-pool-c358b5f5-2fll`');
---------------------------------------------------------------------------------------------------------
-- Search ALL columns or on SEVERAL columns
---------------------------------------------------------------------------------------------------------
-- Search across all columns (has to search lots of data)
-- Duration: 36 sec
-- Bytes processed: 5.79 TB
-- Cnt: 2,207,223
SELECT COUNT(*) AS Cnt
FROM `${project_id}.${bigquery_taxi_dataset}.bigsearch_log_5b_5t_json_hourly_INDEXED` AS LogsIndexed
WHERE SEARCH(LogsIndexed, 'service281');
-- Search 2 columns (searches less data since we know the columns to search)
-- Duration: 4 sec
-- Bytes processed: 1.11 TB
-- Cnt: 1,856,005
SELECT COUNT(*) AS Cnt
FROM `${project_id}.${bigquery_taxi_dataset}.bigsearch_log_5b_5t_json_hourly_INDEXED` AS LogsIndexed
WHERE SEARCH((internalClusterId, labels), 'service281');
---------------------------------------------------------------------------------------------------------
-- Clean up Large Data (to save on costs)
---------------------------------------------------------------------------------------------------------
DROP SEARCH INDEX IF EXISTS idx_all_bigsearch_log_5b_5t_json_hourly ON `${project_id}.${bigquery_taxi_dataset}.bigsearch_log_5b_5t_json_hourly_INDEXED`;
DROP TABLE IF EXISTS `${project_id}.${bigquery_taxi_dataset}.bigsearch_log_5b_5t_json_hourly_NOT_INDEXED`;
DROP TABLE IF EXISTS `${project_id}.${bigquery_taxi_dataset}.bigsearch_log_5b_5t_json_hourly_INDEXED`;
---------------------------------------------------------------------------------------------------------
-- NOTE: To run some SQL on a 60TB, 50 billon row table head over to the shared project
-- Open a new tab and paste the URL below.
-- https://console.cloud.google.com/bigquery?project=${shared_demo_project_id}
-- OPEN the stored procedure: ${shared_demo_project_id}.bigquery_features.sp_demo_bigsearch_50b_60t
---------------------------------------------------------------------------------------------------------