Skip to content

Commit

Permalink
feat: add Cloud Monitoring MQL backend (#245)
Browse files Browse the repository at this point in the history
* ignore virtual environments like venv3.9.12 too

* bump google-cloud-monitoring to v2 and migrate code

* remove redundant parentheses

* fix typo

* remove unused argument in docstring

* fix mutable default arguments

* add `make uninstall` target to uninstall all pip packages

* fix wrong migration of field names

* disable Pylint warnings for fields that do exist

* add type hints

* add Cloud Monitoring MQL samples

* reformat for flake8

* aggregate (sum) time series before computing their ratio

* annotate and clean up distribution_cut()

* add private method to format MQL query + tests

* disable warnings for known unused arguments

* fix syntax error

* disable flake8 checks in tests (for trailing whitespaces for example)

* fix type hints for dict.get() operations

* make type hints and pylint warnings compatible with python < 3.9
  • Loading branch information
lvaylet authored Oct 17, 2022
1 parent f20b74f commit 159f4d5
Show file tree
Hide file tree
Showing 16 changed files with 658 additions and 73 deletions.
4 changes: 2 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,6 @@ htmlcov/
*.tfstate.*.backup
.vscode
.env
venv/
.venv/
venv*/
.venv*/
reports/
3 changes: 3 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,9 @@ develop:
install: clean
$(PIP) install -e ."[api, datadog, prometheus, elasticsearch, pubsub, cloud_monitoring, bigquery, dev]"

# Uninstall every non-editable package that `pip freeze` reports for the
# current environment (runs `clean` first).
# `xargs -r` (--no-run-if-empty) skips the uninstall step when the package list
# is empty, so re-running the target on an already-clean environment does not
# invoke `pip uninstall -y` without any package name, which pip rejects.
uninstall: clean
	$(PIP) freeze --exclude-editable | xargs -r $(PIP) uninstall -y

test: install unit lint

unit: clean
Expand Down
37 changes: 37 additions & 0 deletions samples/cloud_monitoring_mql/slo_gae_app_availability.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
apiVersion: sre.google.com/v2
kind: ServiceLevelObjective
metadata:
name: gae-app-availability
labels:
service_name: gae
feature_name: app
slo_name: availability
spec:
description: Availability of App Engine app
backend: cloud_monitoring_mql
method: good_bad_ratio
exporters:
- cloud_monitoring
service_level_indicator:
filter_good: >
fetch gae_app
| metric 'appengine.googleapis.com/http/server/response_count'
| filter resource.project_id == '${GAE_PROJECT_ID}'
| filter
metric.response_code == 429
|| metric.response_code == 200
|| metric.response_code == 201
|| metric.response_code == 202
|| metric.response_code == 203
|| metric.response_code == 204
|| metric.response_code == 205
|| metric.response_code == 206
|| metric.response_code == 207
|| metric.response_code == 208
|| metric.response_code == 226
|| metric.response_code == 304
filter_valid: >
fetch gae_app
| metric 'appengine.googleapis.com/http/server/response_count'
| filter resource.project_id == '${GAE_PROJECT_ID}'
goal: 0.95
36 changes: 36 additions & 0 deletions samples/cloud_monitoring_mql/slo_gae_app_availability_ratio.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
apiVersion: sre.google.com/v2
kind: ServiceLevelObjective
metadata:
name: gae-app-availability
labels:
service_name: gae
feature_name: app
slo_name: availability
spec:
description: Availability of App Engine app
backend: cloud_monitoring_mql
method: query_sli
exporters:
- cloud_monitoring
service_level_indicator:
query: >
fetch gae_app
| metric 'appengine.googleapis.com/http/server/response_count'
| filter resource.project_id == '${GAE_PROJECT_ID}'
| { filter
metric.response_code == 429
|| metric.response_code == 200
|| metric.response_code == 201
|| metric.response_code == 202
|| metric.response_code == 203
|| metric.response_code == 204
|| metric.response_code == 205
|| metric.response_code == 206
|| metric.response_code == 207
|| metric.response_code == 208
|| metric.response_code == 226
|| metric.response_code == 304
; ident }
| sum
| ratio
goal: 0.95
39 changes: 39 additions & 0 deletions samples/cloud_monitoring_mql/slo_gae_app_latency.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Copyright 2019 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
apiVersion: sre.google.com/v2
kind: ServiceLevelObjective
metadata:
name: gae-app-latency724ms
labels:
service_name: gae
feature_name: app
slo_name: latency724ms
spec:
description: Latency of App Engine app requests < 724ms
backend: cloud_monitoring_mql
method: distribution_cut
exporters:
- cloud_monitoring
service_level_indicator:
filter_valid: >
fetch gae_app
| metric 'appengine.googleapis.com/http/server/response_latencies'
| filter resource.project_id == '${GAE_PROJECT_ID}'
| filter
metric.response_code >= 200
&& metric.response_code < 500
good_below_threshold: true
threshold_bucket: 19
goal: 0.999
28 changes: 28 additions & 0 deletions samples/cloud_monitoring_mql/slo_lb_request_availability.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
apiVersion: sre.google.com/v2
kind: ServiceLevelObjective
metadata:
name: lb-request-availability
labels:
service_name: lb
feature_name: request
slo_name: availability
spec:
description: Availability of HTTP Load Balancer
backend: cloud_monitoring_mql
method: good_bad_ratio
exporters:
- cloud_monitoring
service_level_indicator:
filter_good: >
fetch 'https_lb_rule'
| metric 'loadbalancing.googleapis.com/https/request_count'
| filter resource.project_id == '${LB_PROJECT_ID}'
| filter
metric.response_code_class="200"
|| metric.response_code_class="300"
|| metric.response_code_class="400"
filter_valid: >
fetch 'https_lb_rule'
| metric 'loadbalancing.googleapis.com/https/request_count'
| filter resource.project_id == '${LB_PROJECT_ID}'
goal: 0.98
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
apiVersion: sre.google.com/v2
kind: ServiceLevelObjective
metadata:
name: lb-request-availability
labels:
service_name: lb
feature_name: request
slo_name: availability
spec:
description: Availability of HTTP Load Balancer
backend: cloud_monitoring_mql
method: query_sli
exporters:
- cloud_monitoring
service_level_indicator:
query: >
fetch 'https_lb_rule'
| metric 'loadbalancing.googleapis.com/https/request_count'
| filter resource.project_id == '${LB_PROJECT_ID}'
| { filter
metric.response_code_class="200"
|| metric.response_code_class="300"
|| metric.response_code_class="400"
; ident }
| sum
| ratio
goal: 0.98
25 changes: 25 additions & 0 deletions samples/cloud_monitoring_mql/slo_lb_request_latency.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
apiVersion: sre.google.com/v2
kind: ServiceLevelObjective
metadata:
name: lb-request-latency724ms
labels:
service_name: lb
feature_name: request
slo_name: latency724ms
spec:
description: Latency of HTTP Load Balancer < 724ms
backend: cloud_monitoring_mql
method: distribution_cut
exporters:
- cloud_monitoring
service_level_indicator:
filter_valid: >
fetch https_lb_rule
| metric 'loadbalancing.googleapis.com/https/total_latencies'
| filter resource.project_id == '${LB_PROJECT_ID}'
| filter metric.response_code_class = "200"
|| metric.response_code_class = "300"
|| metric.response_code_class = "400"
good_below_threshold: true
threshold_bucket: 19
goal: 0.98
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
apiVersion: sre.google.com/v2
kind: ServiceLevelObjective
metadata:
name: pubsub-subscription-throughput
labels:
service_name: pubsub
feature_name: subscription
slo_name: throughput
spec:
description: Throughput of Pub/Sub subscription
backend: cloud_monitoring_mql
method: good_bad_ratio
exporters:
- cloud_monitoring
service_level_indicator:
filter_good: >
fetch 'pubsub_subscription'
| metric 'pubsub.googleapis.com/subscription/ack_message_count'
| filter resource.project_id == '${PUBSUB_PROJECT_ID}'
filter_bad: >
fetch 'pubsub_subscription'
| metric 'pubsub.googleapis.com/subscription/num_outstanding_messages'
| filter resource.project_id == '${PUBSUB_PROJECT_ID}'
goal: 0.95
6 changes: 3 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,10 @@
'dynatrace': ['requests'],
'bigquery': ['google-api-python-client <2', 'google-cloud-bigquery <3'],
'cloud_monitoring': [
'google-api-python-client <2', 'google-cloud-monitoring ==1.1.0'
'google-api-python-client <2', 'google-cloud-monitoring <3'
],
'cloud_service_monitoring': [
'google-api-python-client <2', 'google-cloud-monitoring ==1.1.0'
'google-api-python-client <2', 'google-cloud-monitoring <3'
],
'cloud_storage': ['google-api-python-client <2', 'google-cloud-storage'],
'pubsub': ['google-api-python-client <2', 'google-cloud-pubsub <2'],
Expand Down Expand Up @@ -89,4 +89,4 @@
entry_points={
'console_scripts': ['slo-generator=slo_generator.cli:main'],
},
python_requires='>=3.4')
python_requires='>=3.6')
64 changes: 40 additions & 24 deletions slo_generator/backends/cloud_monitoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def __init__(self, project_id, client=None):
self.client = client
if client is None:
self.client = monitoring_v3.MetricServiceClient()
self.parent = self.client.project_path(project_id)
self.parent = self.client.common_project_path(project_id)

def good_bad_ratio(self, timestamp, window, slo_config):
"""Query two timeseries, one containing 'good' events, one containing
Expand Down Expand Up @@ -87,10 +87,10 @@ def good_bad_ratio(self, timestamp, window, slo_config):
LOGGER.debug(f'Good events: {good_event_count} | '
f'Bad events: {bad_event_count}')

return (good_event_count, bad_event_count)
return good_event_count, bad_event_count

def distribution_cut(self, timestamp, window, slo_config):
"""Query one timeserie of type 'exponential'.
"""Query one timeseries of type 'exponential'.
Args:
timestamp (int): UNIX timestamp.
Expand All @@ -112,7 +112,7 @@ def distribution_cut(self, timestamp, window, slo_config):
series = list(series)

if not series:
return (NO_DATA, NO_DATA) # no timeseries
return NO_DATA, NO_DATA # no timeseries

distribution_value = series[0].points[0].value.distribution_value
# bucket_options = distribution_value.bucket_options
Expand Down Expand Up @@ -149,7 +149,7 @@ def distribution_cut(self, timestamp, window, slo_config):
good_event_count = upper_events_count
bad_event_count = lower_events_count

return (good_event_count, bad_event_count)
return good_event_count, bad_event_count

def exponential_distribution_cut(self, *args, **kwargs):
"""Alias for `distribution_cut` method to allow for backwards
Expand All @@ -166,7 +166,7 @@ def query(self,
filter,
aligner='ALIGN_SUM',
reducer='REDUCE_SUM',
group_by=[]):
group_by=None):
"""Query timeseries from Cloud Monitoring.
Args:
Expand All @@ -180,15 +180,20 @@ def query(self,
Returns:
list: List of timeseries objects.
"""
if group_by is None:
group_by = []
measurement_window = CM.get_window(timestamp, window)
aggregation = CM.get_aggregation(window,
aligner=aligner,
reducer=reducer,
group_by=group_by)
timeseries = self.client.list_time_series(
self.parent, filter, measurement_window,
monitoring_v3.enums.ListTimeSeriesRequest.TimeSeriesView.FULL,
aggregation)
request = monitoring_v3.ListTimeSeriesRequest()
request.name = self.parent
request.filter = filter
request.interval = measurement_window
request.view = monitoring_v3.ListTimeSeriesRequest.TimeSeriesView.FULL
request.aggregation = aggregation
timeseries = self.client.list_time_series(request)
LOGGER.debug(pprint.pformat(timeseries))
return timeseries

Expand Down Expand Up @@ -220,20 +225,28 @@ def get_window(timestamp, window):
Returns:
:obj:`monitoring_v3.types.TimeInterval`: Measurement window object.
"""
measurement_window = monitoring_v3.types.TimeInterval()
measurement_window.end_time.seconds = int(timestamp)
measurement_window.end_time.nanos = int(
(timestamp - measurement_window.end_time.seconds) * 10**9)
measurement_window.start_time.seconds = int(timestamp - window)
measurement_window.start_time.nanos = measurement_window.end_time.nanos
end_time_seconds = int(timestamp)
end_time_nanos = int((timestamp - end_time_seconds) * 10 ** 9)
start_time_seconds = int(timestamp - window)
start_time_nanos = end_time_nanos
measurement_window = monitoring_v3.TimeInterval({
"end_time": {
"seconds": end_time_seconds,
"nanos": end_time_nanos
},
"start_time": {
"seconds": start_time_seconds,
"nanos": start_time_nanos
}
})
LOGGER.debug(pprint.pformat(measurement_window))
return measurement_window

@staticmethod
def get_aggregation(window,
aligner='ALIGN_SUM',
reducer='REDUCE_SUM',
group_by=[]):
group_by=None):
"""Helper for aggregation object.
Default aggregation is `ALIGN_SUM`.
Expand All @@ -248,13 +261,16 @@ def get_aggregation(window,
Returns:
:obj:`monitoring_v3.types.Aggregation`: Aggregation object.
"""
aggregation = monitoring_v3.types.Aggregation()
aggregation.alignment_period.seconds = window
aggregation.per_series_aligner = (getattr(
monitoring_v3.enums.Aggregation.Aligner, aligner))
aggregation.cross_series_reducer = (getattr(
monitoring_v3.enums.Aggregation.Reducer, reducer))
aggregation.group_by_fields.extend(group_by)
if group_by is None:
group_by = []
aggregation = monitoring_v3.Aggregation({
"alignment_period": {"seconds": window},
"per_series_aligner":
getattr(monitoring_v3.Aggregation.Aligner, aligner),
"cross_series_reducer":
getattr(monitoring_v3.Aggregation.Reducer, reducer),
"group_by_fields": group_by,
})
LOGGER.debug(pprint.pformat(aggregation))
return aggregation

Expand Down
Loading

0 comments on commit 159f4d5

Please sign in to comment.