From 4b3128ab2137d73e202ea808f9ffb26d34580117 Mon Sep 17 00:00:00 2001 From: Cyril TOVENA Date: Tue, 15 Jan 2019 00:18:40 -0500 Subject: [PATCH] add client-go metrics and grafana dashboards There's also some refactoring involved within the metrics package to improve flexibility. --- build/grafana/dashboard-goclient-caches.yaml | 573 ++++++++++++++++++ .../grafana/dashboard-goclient-requests.yaml | 351 +++++++++++ .../dashboard-goclient-workqueues.yaml | 547 +++++++++++++++++ pkg/metrics/controller.go | 52 +- pkg/metrics/controller_metrics.go | 131 ++++ pkg/metrics/controller_test.go | 16 +- .../{gameservers.go => gameservers_count.go} | 2 +- pkg/metrics/kubernetes_client.go | 295 +++++++++ pkg/metrics/metrics.go | 138 ----- pkg/metrics/util.go | 48 ++ pkg/metrics/util_test.go | 12 +- site/content/en/docs/Guides/metrics.md | 32 +- 12 files changed, 1987 insertions(+), 210 deletions(-) create mode 100644 build/grafana/dashboard-goclient-caches.yaml create mode 100644 build/grafana/dashboard-goclient-requests.yaml create mode 100644 build/grafana/dashboard-goclient-workqueues.yaml create mode 100644 pkg/metrics/controller_metrics.go rename pkg/metrics/{gameservers.go => gameservers_count.go} (97%) create mode 100644 pkg/metrics/kubernetes_client.go delete mode 100644 pkg/metrics/metrics.go create mode 100644 pkg/metrics/util.go diff --git a/build/grafana/dashboard-goclient-caches.yaml b/build/grafana/dashboard-goclient-caches.yaml new file mode 100644 index 0000000000..c5c9cc510d --- /dev/null +++ b/build/grafana/dashboard-goclient-caches.yaml @@ -0,0 +1,573 @@ +# Copyright 2019 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# configs map used by grafana +apiVersion: v1 +kind: ConfigMap +metadata: + name: agones-goclient-caches + namespace: metrics + labels: + grafana_dashboard: "1" +data: + dashboard-agones-goclient-caches.json: | + { + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": 7, + "links": [], + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": null, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 9, + "w": 4, + "x": 0, + "y": 0 + }, + "id": 4, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "$$hashKey": "object:5575", + "name": "value to text", + "value": 1 + }, + { + "$$hashKey": "object:5576", + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": 
[ + { + "$$hashKey": "object:5432", + "expr": "agones_k8s_client_cache_last_resource_version", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "refId": "A" + } + ], + "thresholds": "", + "title": "Last resource version", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "$$hashKey": "object:5578", + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 1, + "gridPos": { + "h": 9, + "w": 10, + "x": 4, + "y": 0 + }, + "id": 8, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "$$hashKey": "object:6208", + "expr": "sum(rate(agones_k8s_client_cache_list_items_sum[5m]) / rate(agones_k8s_client_cache_list_items_count[5m]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "list items", + "refId": "A" + }, + { + "$$hashKey": "object:6270", + "expr": "sum(rate(agones_k8s_client_cache_watch_events_sum[5m]) / rate(agones_k8s_client_cache_watch_events_count[5m]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "watch events", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Cache items rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:6309", + "format": "short", + "label": null, + "logBase": 1, + 
"max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:6310", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 1, + "gridPos": { + "h": 9, + "w": 10, + "x": 14, + "y": 0 + }, + "id": 9, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "$$hashKey": "object:6208", + "expr": "sum(agones_k8s_client_cache_list_items_sum / agones_k8s_client_cache_list_items_count)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "list items", + "refId": "A" + }, + { + "$$hashKey": "object:6270", + "expr": "sum(agones_k8s_client_cache_watch_events_sum / agones_k8s_client_cache_watch_events_count)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "watch events", + "refId": "B" + }, + { + "$$hashKey": "object:6551", + "expr": "", + "format": "time_series", + "intervalFactor": 1, + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Cache avg items per operations", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:6309", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:6310", + 
"format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 1, + "gridPos": { + "h": 9, + "w": 14, + "x": 0, + "y": 9 + }, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "$$hashKey": "object:5637", + "expr": "sum(rate(agones_k8s_client_cache_list_duration_seconds_sum[5m]) / rate(agones_k8s_client_cache_list_duration_seconds_count[5m]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "list", + "refId": "A" + }, + { + "$$hashKey": "object:5785", + "expr": "sum(rate(agones_k8s_client_cache_watch_duration_seconds_sum[5m]) / rate(agones_k8s_client_cache_watch_duration_seconds_count[5m]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "watch", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Avg List/Watch requests duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:5813", + "format": "s", + "label": null, + "logBase": 10, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:5814", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + 
"bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 1, + "gridPos": { + "h": 9, + "w": 10, + "x": 14, + "y": 9 + }, + "id": 2, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(agones_k8s_client_cache_list_total[1m]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "list", + "refId": "A" + }, + { + "expr": "sum(rate(agones_k8s_client_cache_watches_total[1m]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "watch", + "refId": "B" + }, + { + "expr": "sum(rate(agones_k8s_client_cache_short_watches_total[1m]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "short watch", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Caches operations rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": "10s", + "schemaVersion": 16, + "style": "dark", + "tags": [ + "controller", + "agones", + "client-go" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", 
+ "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Agones Controller go-client caches", + "uid": "IcQY6i_iz", + "version": 6 + } \ No newline at end of file diff --git a/build/grafana/dashboard-goclient-requests.yaml b/build/grafana/dashboard-goclient-requests.yaml new file mode 100644 index 0000000000..465017abd3 --- /dev/null +++ b/build/grafana/dashboard-goclient-requests.yaml @@ -0,0 +1,351 @@ +# Copyright 2019 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# configs map used by grafana +apiVersion: v1 +kind: ConfigMap +metadata: + name: agones-goclient-requests + namespace: metrics + labels: + grafana_dashboard: "1" +data: + dashboard-agones-goclient-requests.json: | + { + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": 5, + "links": [], + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 1, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 4, + "legend": { + "avg": false, + "current": false, + "hideEmpty": true, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(agones_k8s_client_http_request_total[5m])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{status_code}} {{verb}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Requests rate by status", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + 
"dashes": false, + "datasource": null, + "fill": 1, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 6, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(agones_k8s_client_http_request_duration_seconds_count[5m])) by (endpoint,verb)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{verb}} {{endpoint}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Requests rate by verb and path", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 1, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 9 + }, + "id": 2, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "hideEmpty": true, + "hideZero": false, + "max": true, + "min": true, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": 
"flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(agones_k8s_client_http_request_duration_seconds_sum[5m]) / rate(agones_k8s_client_http_request_duration_seconds_count[5m])) by (endpoint,verb)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{verb}} {{endpoint}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Average request duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": "10s", + "schemaVersion": 16, + "style": "dark", + "tags": [ + "agones", + "controller", + "client-go" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Agones Controller go-client requests", + "uid": "9VRr_m_iz", + "version": 4 + } \ No newline at end of file diff --git a/build/grafana/dashboard-goclient-workqueues.yaml b/build/grafana/dashboard-goclient-workqueues.yaml new file mode 100644 index 0000000000..7f6b9e1380 --- /dev/null +++ b/build/grafana/dashboard-goclient-workqueues.yaml @@ -0,0 +1,547 @@ +# Copyright 2019 Google Inc. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# configs map used by grafana +apiVersion: v1 +kind: ConfigMap +metadata: + name: agones-goclient-workqueues + namespace: metrics + labels: + grafana_dashboard: "1" +data: + dashboard-agones-goclient-workqueues.json: | + { + "annotations": { + "list": [ + { + "$$hashKey": "object:3802", + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "limit": 100, + "name": "Annotations & Alerts", + "showIn": 0, + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": 6, + "iteration": 1548342576612, + "links": [], + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 1, + "gridPos": { + "h": 9, + "w": 8, + "x": 0, + "y": 0 + }, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "$$hashKey": "object:4248", + "expr": "clamp_min(agones_k8s_client_workqueue_depth{queue_name=~\"$workqueue\"},0)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{queue_name}}", + 
"refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Workqueue depth", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 1, + "gridPos": { + "h": 9, + "w": 8, + "x": 8, + "y": 0 + }, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "$$hashKey": "object:3968", + "expr": "rate(agones_k8s_client_workqueue_items_total{queue_name=~\"$workqueue\"}[1m])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{queue_name}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Workqueue items rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + 
"alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 1, + "gridPos": { + "h": 9, + "w": 8, + "x": 16, + "y": 0 + }, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "$$hashKey": "object:4382", + "expr": "rate(agones_k8s_client_workqueue_retries_total{queue_name=~\"$workqueue\"}[1m])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{queue_name}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Workqueue retry items rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "How long an item stays in the work queue.", + "fill": 1, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 9 + }, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", 
+ "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "$$hashKey": "object:4436", + "expr": "sum(rate(agones_k8s_client_workqueue_latency_seconds_sum{queue_name=~\"$workqueue\"}[1m]) / rate(agones_k8s_client_workqueue_latency_seconds_count{queue_name=~\"$workqueue\"}[1m])) by(queue_name)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{queue_name}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Workqueue average items duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "dtdurations", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "How long processing an item from the work queue takes.", + "fill": 1, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 9 + }, + "id": 7, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "$$hashKey": "object:4490", + "expr": "sum(rate(agones_k8s_client_workqueue_work_duration_seconds_sum{queue_name=~\"$workqueue\"}[1m]) / rate(agones_k8s_client_workqueue_work_duration_seconds_count{queue_name=~\"$workqueue\"}[1m])) by(queue_name)", + "format": 
"time_series", + "intervalFactor": 1, + "legendFormat": "{{queue_name}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Workqueue average work duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "dtdurations", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": "10s", + "schemaVersion": 16, + "style": "dark", + "tags": [ + "agones", + "client-go", + "controller" + ], + "templating": { + "list": [ + { + "allValue": ".*", + "current": { + "text": "All", + "value": [ + "$__all" + ] + }, + "datasource": "Prometheus", + "hide": 0, + "includeAll": true, + "label": "workqueue", + "multi": true, + "name": "workqueue", + "options": [], + "query": "label_values(agones_k8s_client_workqueue_items_total,queue_name)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Agones Controller go-client workqueue", + "uid": "bztnCilik", + "version": 10 + } \ No newline at end of file diff --git a/pkg/metrics/controller.go b/pkg/metrics/controller.go index 1ae7aaec70..3f9c321e5b 100644 --- a/pkg/metrics/controller.go +++ b/pkg/metrics/controller.go @@ -28,7 +28,6 @@ import ( "github.com/pkg/errors" 
"github.com/sirupsen/logrus" "go.opencensus.io/stats" - "go.opencensus.io/stats/view" "go.opencensus.io/tag" "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/util/intstr" @@ -42,7 +41,11 @@ var ( MetricResyncPeriod = time.Second * 1 ) -// Controller is a metrics controller +func init() { + registerViews() +} + +// Controller is a metrics controller collecting Agones state metrics type Controller struct { logger *logrus.Entry gameServerLister listerv1alpha1.GameServerLister @@ -109,26 +112,9 @@ func NewController( UpdateFunc: c.recordGameServerStatusChanges, }, 0) - c.registerViews() return c } -// register all our views to OpenCensus -func (c *Controller) registerViews() { - for _, v := range views { - if err := view.Register(v); err != nil { - c.logger.WithError(err).Error("could not register view") - } - } -} - -// unregister views, this is only useful for tests as it trigger reporting. -func (c *Controller) unRegisterViews() { - for _, v := range views { - view.Unregister(v) - } -} - func (c *Controller) recordFleetAutoScalerChanges(old, new interface{}) { fas, ok := new.(*stablev1alpha1.FleetAutoscaler) @@ -172,9 +158,9 @@ func (c *Controller) recordFleetAutoScalerChanges(old, new interface{}) { // recording buffer policy if fas.Spec.Policy.Buffer != nil { // recording limits - c.recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "max")}, + recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "max")}, fasBufferLimitsCountStats.M(int64(fas.Spec.Policy.Buffer.MaxReplicas))) - c.recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "min")}, + recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "min")}, fasBufferLimitsCountStats.M(int64(fas.Spec.Policy.Buffer.MinReplicas))) // recording size @@ -183,13 +169,13 @@ func (c *Controller) recordFleetAutoScalerChanges(old, new interface{}) { sizeString := fas.Spec.Policy.Buffer.BufferSize.StrVal if sizeString != "" { if size, err := strconv.Atoi(sizeString[:len(sizeString)-1]); err == nil { - 
c.recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "percentage")}, + recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "percentage")}, fasBufferSizeStats.M(int64(size))) } } } else { // as count - c.recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "count")}, + recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "count")}, fasBufferSizeStats.M(int64(fas.Spec.Policy.Buffer.BufferSize.IntVal))) } } @@ -240,22 +226,16 @@ func (c *Controller) recordFleetReplicas(fleetName string, total, allocated, rea ctx, _ := tag.New(context.Background(), tag.Upsert(keyName, fleetName)) - c.recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "total")}, + recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "total")}, fleetsReplicasCountStats.M(int64(total))) - c.recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "allocated")}, + recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "allocated")}, fleetsReplicasCountStats.M(int64(allocated))) - c.recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "ready")}, + recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "ready")}, fleetsReplicasCountStats.M(int64(ready))) - c.recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "desired")}, + recordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "desired")}, fleetsReplicasCountStats.M(int64(desired))) } -func (c *Controller) recordWithTags(ctx context.Context, mutators []tag.Mutator, ms ...stats.Measurement) { - if err := stats.RecordWithTags(ctx, mutators, ms...); err != nil { - c.logger.WithError(err).Warn("error while recoding stats") - } -} - // recordGameServerStatusChanged records gameserver status changes, however since it's based // on cache events some events might collapsed and not appear, for example transition state // like creating, port allocation, could be skipped. 
@@ -278,7 +258,7 @@ func (c *Controller) recordGameServerStatusChanges(old, new interface{}) { if fleetName == "" { fleetName = "none" } - c.recordWithTags(context.Background(), []tag.Mutator{tag.Upsert(keyType, string(newGs.Status.State)), + recordWithTags(context.Background(), []tag.Mutator{tag.Upsert(keyType, string(newGs.Status.State)), tag.Upsert(keyFleetName, fleetName)}, gameServerTotalStats.M(1)) } } @@ -296,7 +276,7 @@ func (c *Controller) recordFleetAllocationChanges(old, new interface{}) { // fleet allocations are added without gameserver allocated // but then get modified on successful allocation with their gameserver if oldFa.Status.GameServer == nil && newFa.Status.GameServer != nil { - c.recordWithTags(context.Background(), []tag.Mutator{tag.Upsert(keyFleetName, newFa.Spec.FleetName)}, + recordWithTags(context.Background(), []tag.Mutator{tag.Upsert(keyFleetName, newFa.Spec.FleetName)}, fleetAllocationTotalStats.M(1)) } } @@ -339,7 +319,7 @@ func (c *Controller) collectFleetAllocationCounts() { } for fleetName, count := range c.faCount { - c.recordWithTags(context.Background(), []tag.Mutator{tag.Insert(keyFleetName, fleetName)}, + recordWithTags(context.Background(), []tag.Mutator{tag.Insert(keyFleetName, fleetName)}, fleetAllocationCountStats.M(count)) } } diff --git a/pkg/metrics/controller_metrics.go b/pkg/metrics/controller_metrics.go new file mode 100644 index 0000000000..d489348efe --- /dev/null +++ b/pkg/metrics/controller_metrics.go @@ -0,0 +1,131 @@ +// Copyright 2019 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package metrics + +import ( + "go.opencensus.io/stats" + "go.opencensus.io/stats/view" + "go.opencensus.io/tag" +) + +var ( + fleetsReplicasCountStats = stats.Int64("fleets/replicas_count", "The count of replicas per fleet", "1") + fasBufferLimitsCountStats = stats.Int64("fas/buffer_limits", "The buffer limits of autoscalers", "1") + fasBufferSizeStats = stats.Int64("fas/buffer_size", "The buffer size value of autoscalers", "1") + fasCurrentReplicasStats = stats.Int64("fas/current_replicas_count", "The current replicas count as seen by autoscalers", "1") + fasDesiredReplicasStats = stats.Int64("fas/desired_replicas_count", "The desired replicas count as seen by autoscalers", "1") + fasAbleToScaleStats = stats.Int64("fas/able_to_scale", "The fleet autoscaler can access the fleet to scale (0 indicates false, 1 indicates true)", "1") + fasLimitedStats = stats.Int64("fas/limited", "The fleet autoscaler is capped (0 indicates false, 1 indicates true)", "1") + gameServerCountStats = stats.Int64("gameservers/count", "The count of gameservers", "1") + fleetAllocationCountStats = stats.Int64("fleet_allocations/count", "The count of fleet allocations", "1") + gameServerTotalStats = stats.Int64("gameservers/total", "The total of gameservers", "1") + fleetAllocationTotalStats = stats.Int64("fleet_allocations/total", "The total of fleet allocations", "1") + + stateViews = []*view.View{ + &view.View{ + Name: "fleets_replicas_count", + Measure: fleetsReplicasCountStats, + Description: "The number of replicas per fleet", + Aggregation: view.LastValue(), +
TagKeys: []tag.Key{keyName, keyType}, + }, + &view.View{ + Name: "fleet_autoscalers_buffer_limits", + Measure: fasBufferLimitsCountStats, + Description: "The limits of buffer based fleet autoscalers", + Aggregation: view.LastValue(), + TagKeys: []tag.Key{keyName, keyType, keyFleetName}, + }, + &view.View{ + Name: "fleet_autoscalers_buffer_size", + Measure: fasBufferSizeStats, + Description: "The buffer size of fleet autoscalers", + Aggregation: view.LastValue(), + TagKeys: []tag.Key{keyName, keyType, keyFleetName}, + }, + &view.View{ + Name: "fleet_autoscalers_current_replicas_count", + Measure: fasCurrentReplicasStats, + Description: "The current replicas count as seen by autoscalers", + Aggregation: view.LastValue(), + TagKeys: []tag.Key{keyName, keyFleetName}, + }, + &view.View{ + Name: "fleet_autoscalers_desired_replicas_count", + Measure: fasDesiredReplicasStats, + Description: "The desired replicas count as seen by autoscalers", + Aggregation: view.LastValue(), + TagKeys: []tag.Key{keyName, keyFleetName}, + }, + &view.View{ + Name: "fleet_autoscalers_able_to_scale", + Measure: fasAbleToScaleStats, + Description: "The fleet autoscaler can access the fleet to scale", + Aggregation: view.LastValue(), + TagKeys: []tag.Key{keyName, keyFleetName}, + }, + &view.View{ + Name: "fleet_autoscalers_limited", + Measure: fasLimitedStats, + Description: "The fleet autoscaler is capped", + Aggregation: view.LastValue(), + TagKeys: []tag.Key{keyName, keyFleetName}, + }, + &view.View{ + Name: "gameservers_count", + Measure: gameServerCountStats, + Description: "The number of gameservers", + Aggregation: view.LastValue(), + TagKeys: []tag.Key{keyType, keyFleetName}, + }, + &view.View{ + Name: "fleet_allocations_count", + Measure: fleetAllocationCountStats, + Description: "The number of fleet allocations", + Aggregation: view.LastValue(), + TagKeys: []tag.Key{keyFleetName}, + }, + &view.View{ + Name: "fleet_allocations_total", + Measure: fleetAllocationTotalStats, + Description: 
"The total of fleet allocations", + Aggregation: view.Count(), + TagKeys: []tag.Key{keyFleetName}, + }, + &view.View{ + Name: "gameservers_total", + Measure: gameServerTotalStats, + Description: "The total of gameservers", + Aggregation: view.Count(), + TagKeys: []tag.Key{keyType, keyFleetName}, + }, + } +) + +// registerViews registers all our state views to OpenCensus, logging any registration error. +func registerViews() { + for _, v := range stateViews { + if err := view.Register(v); err != nil { + logger.WithError(err).Error("could not register view") + } + } +} + +// unRegisterViews unregisters all state views; this is only useful for tests as it triggers reporting. +func unRegisterViews() { + for _, v := range stateViews { + view.Unregister(v) + } +} diff --git a/pkg/metrics/controller_test.go b/pkg/metrics/controller_test.go index a167dc31d9..c4ade4587a 100644 --- a/pkg/metrics/controller_test.go +++ b/pkg/metrics/controller_test.go @@ -42,7 +42,7 @@ func TestControllerGameServerCount(t *testing.T) { c.sync() c.collect() - c.report() + report() gs1 = gs1.DeepCopy() gs1.Status.State = v1alpha1.GameServerStateShutdown @@ -52,7 +52,7 @@ func TestControllerGameServerCount(t *testing.T) { c.sync() c.collect() - c.report() + report() assert.Nil(t, testutil.GatherAndCompare(registry, strings.NewReader(gsCountExpected), "agones_gameservers_count")) } @@ -74,7 +74,7 @@ func TestControllerFleetAllocationCount(t *testing.T) { c.sync() c.collect() - c.report() + report() c.faWatch.Delete(fa1) c.faWatch.Add(fleetAllocation("test-fleet")) @@ -82,7 +82,7 @@ func TestControllerFleetAllocationCount(t *testing.T) { c.sync() c.collect() - c.report() + report() assert.Nil(t, testutil.GatherAndCompare(registry, strings.NewReader(faCountExpected), "agones_fleet_allocations_count")) } @@ -123,7 +123,7 @@ func TestControllerFleetAllocationTotal(t *testing.T) { c.faWatch.Modify(faUpdated) } c.sync() - c.report() + report() assert.Nil(t, testutil.GatherAndCompare(registry, strings.NewReader(faTotalExpected), "agones_fleet_allocations_total")) } @@ -153,7
+153,7 @@ func TestControllerGameServersTotal(t *testing.T) { generateGsEvents(1, v1alpha1.GameServerStateUnhealthy, "", c.gsWatch) c.sync() - c.report() + report() assert.Nil(t, testutil.GatherAndCompare(registry, strings.NewReader(gsTotalExpected), "agones_gameservers_total")) } @@ -179,7 +179,7 @@ func TestControllerFleetReplicasCount(t *testing.T) { c.fleetWatch.Delete(fd) c.sync() - c.report() + report() assert.Nil(t, testutil.GatherAndCompare(registry, strings.NewReader(fleetReplicasCountExpected), "agones_fleets_replicas_count")) } @@ -216,7 +216,7 @@ func TestControllerFleetAutoScalerState(t *testing.T) { c.fasWatch.Delete(fasDeleted) c.sync() - c.report() + report() assert.Nil(t, testutil.GatherAndCompare(registry, strings.NewReader(fasStateExpected), "agones_fleet_autoscalers_able_to_scale", "agones_fleet_autoscalers_buffer_limits", "agones_fleet_autoscalers_buffer_size", diff --git a/pkg/metrics/gameservers.go b/pkg/metrics/gameservers_count.go similarity index 97% rename from pkg/metrics/gameservers.go rename to pkg/metrics/gameservers_count.go index fb7d28dd81..7c3dff21d3 100644 --- a/pkg/metrics/gameservers.go +++ b/pkg/metrics/gameservers_count.go @@ -1,4 +1,4 @@ -// Copyright 2018 Google Inc. All Rights Reserved. +// Copyright 2019 Google Inc. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/pkg/metrics/kubernetes_client.go b/pkg/metrics/kubernetes_client.go new file mode 100644 index 0000000000..94fc2caab4 --- /dev/null +++ b/pkg/metrics/kubernetes_client.go @@ -0,0 +1,295 @@ +// Copyright 2019 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package metrics + +import ( + "context" + "net/url" + "time" + + "agones.dev/agones/pkg/util/runtime" + "go.opencensus.io/stats" + "go.opencensus.io/stats/view" + "go.opencensus.io/tag" + "k8s.io/client-go/tools/cache" + "k8s.io/client-go/tools/metrics" + "k8s.io/client-go/util/workqueue" +) + +var ( + keyQueueName = mustTagKey("queue_name") + + httpRequestTotalStats = stats.Int64("http/request_total", "The total of HTTP requests.", "1") + httpRequestLatencyStats = stats.Float64("http/latency", "The duration of HTTP requests.", "s") + + cacheListTotalStats = stats.Float64("cache/list_total", "The total number of list operations.", "1") + cacheListLatencyStats = stats.Float64("cache/list_latency", "Duration of a Kubernetes API call in seconds", "s") + cacheListItemCountStats = stats.Float64("cache/list_items_count", "Count of items in a list from the Kubernetes API.", "1") + cacheWatchesTotalStats = stats.Float64("cache/watches_total", "Total number of watch operations.", "1") + cacheShortWatchesTotalStats = stats.Float64("cache/short_watches_total", "Total number of short watch operations.", "1") + cacheWatchesLatencyStats = stats.Float64("cache/watches_latency", "Duration of watches on the Kubernetes API.", "s") + cacheItemsInWatchesCountStats = stats.Float64("cache/watch_events", "Number of items in watches on the Kubernetes API.", "1") + cacheLastResourceVersionStats = stats.Float64("cache/last_resource_version", "Last resource version from the Kubernetes API.", "1") + + workQueueDepthStats = stats.Float64("workqueue/depth", "Current depth of 
the work queue.", "1") + workQueueItemsTotalStats = stats.Float64("workqueue/items_total", "Total number of items added to the work queue.", "1") + workQueueLatencyStats = stats.Float64("workqueue/latency", "How long an item stays in the work queue.", "s") + workQueueWorkDurationStats = stats.Float64("workqueue/work_duration", "How long processing an item from the work queue takes.", "s") + workQueueRetriesTotalStats = stats.Float64("workqueue/retries_total", "Total number of items retried to the work queue.", "1") +) + +func init() { + + runtime.Must(view.Register(&view.View{ + Name: "k8s_client_http_request_total", + Measure: httpRequestTotalStats, + Description: "The total of HTTP requests to the Kubernetes API by status code", + Aggregation: view.Count(), + TagKeys: []tag.Key{keyVerb, keyStatusCode}, + })) + + runtime.Must(view.Register(&view.View{ + Name: "k8s_client_http_request_duration_seconds", + Measure: httpRequestLatencyStats, + Description: "The distribution of HTTP requests latencies to the Kubernetes API by verb and endpoint", + Aggregation: view.Distribution(0, 0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1, 2, 3), + TagKeys: []tag.Key{keyVerb, keyEndpoint}, + })) + + runtime.Must(view.Register(&view.View{ + Name: "k8s_client_cache_list_total", + Measure: cacheListTotalStats, + Description: "The total number of list operations for client-go caches", + Aggregation: view.Count(), + })) + + runtime.Must(view.Register(&view.View{ + Name: "k8s_client_cache_list_duration_seconds", + Measure: cacheListLatencyStats, + Description: "Duration of a Kubernetes list API call in seconds", + Aggregation: view.Distribution(0, 0.01, 0.025, 0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1, 2, 3), + })) + + runtime.Must(view.Register(&view.View{ + Name: "k8s_client_cache_list_items", + Measure: cacheListItemCountStats, + Description: "Count of items in a list from the Kubernetes API.", + Aggregation: view.Distribution(), + })) + + runtime.Must(view.Register(&view.View{ + Name:
"k8s_client_cache_watches_total", + Measure: cacheWatchesTotalStats, + Description: "The total number of watch operations for client-go caches", + Aggregation: view.Count(), + })) + + runtime.Must(view.Register(&view.View{ + Name: "k8s_client_cache_short_watches_total", + Measure: cacheShortWatchesTotalStats, + Description: "The total number of short watch operations for client-go caches", + Aggregation: view.Count(), + })) + + runtime.Must(view.Register(&view.View{ + Name: "k8s_client_cache_watch_duration_seconds", + Measure: cacheWatchesLatencyStats, + Description: "Duration of watches on the Kubernetes API.", + Aggregation: view.Distribution(), + })) + + runtime.Must(view.Register(&view.View{ + Name: "k8s_client_cache_watch_events", + Measure: cacheItemsInWatchesCountStats, + Description: "Number of items in watches on the Kubernetes API.", + Aggregation: view.Distribution(), + })) + + runtime.Must(view.Register(&view.View{ + Name: "k8s_client_cache_last_resource_version", + Measure: cacheLastResourceVersionStats, + Description: "Last resource version from the Kubernetes API.", + Aggregation: view.LastValue(), + })) + + runtime.Must(view.Register(&view.View{ + Name: "k8s_client_workqueue_depth", + Measure: workQueueDepthStats, + Description: "Current depth of the work queue.", + Aggregation: view.LastValue(), + TagKeys: []tag.Key{keyQueueName}, + })) + + runtime.Must(view.Register(&view.View{ + Name: "k8s_client_workqueue_items_total", + Measure: workQueueItemsTotalStats, + Description: "Total number of items added to the work queue.", + Aggregation: view.Count(), + TagKeys: []tag.Key{keyQueueName}, + })) + + runtime.Must(view.Register(&view.View{ + Name: "k8s_client_workqueue_latency_seconds", + Measure: workQueueLatencyStats, + Description: "How long an item stays in the work queue.", + Aggregation: view.Distribution(), + TagKeys: []tag.Key{keyQueueName}, + })) + + runtime.Must(view.Register(&view.View{ + Name: "k8s_client_workqueue_work_duration_seconds", + 
Measure: workQueueWorkDurationStats, + Description: "How long processing an item from the work queue takes.", + Aggregation: view.Distribution(), + TagKeys: []tag.Key{keyQueueName}, + })) + + runtime.Must(view.Register(&view.View{ + Name: "k8s_client_workqueue_retries_total", + Measure: workQueueRetriesTotalStats, + Description: "Total number of items retried to the work queue.", + Aggregation: view.Count(), + TagKeys: []tag.Key{keyQueueName}, + })) + + clientGoRequest := &clientGoMetricAdapter{} + clientGoRequest.Register() +} + +// clientGoMetricAdapter adapts client-go metrics for HTTP requests, caches and workqueues observations to OpenCensus +type clientGoMetricAdapter struct{} + +func (c *clientGoMetricAdapter) Register() { + metrics.Register(c, c) + cache.SetReflectorMetricsProvider(c) + workqueue.SetProvider(c) +} + +func (clientGoMetricAdapter) Increment(code string, method string, host string) { + recordWithTags(context.Background(), []tag.Mutator{tag.Insert(keyStatusCode, code), + tag.Insert(keyVerb, method)}, httpRequestTotalStats.M(int64(1))) +} + +func (clientGoMetricAdapter) Observe(verb string, u url.URL, latency time.Duration) { + // url is without {namespace} and {name}, so cardinality of resulting metrics is low.
+ recordWithTags(context.Background(), []tag.Mutator{tag.Insert(keyVerb, verb), + tag.Insert(keyEndpoint, u.Path)}, httpRequestLatencyStats.M(latency.Seconds())) +} + +// ocMetric adapts OpenCensus measures to cache metrics +type ocMetric struct { + *stats.Float64Measure + ctx context.Context +} + +func newOcMetric(m *stats.Float64Measure) *ocMetric { + return &ocMetric{ + Float64Measure: m, + ctx: context.Background(), + } +} + +func (m *ocMetric) withTag(key tag.Key, value string) *ocMetric { + ctx, err := tag.New(m.ctx, tag.Upsert(key, value)) + if err != nil { + panic(err) + } + m.ctx = ctx + return m +} + +func (m *ocMetric) Inc() { + stats.Record(m.ctx, m.Float64Measure.M(float64(1))) +} + +func (m *ocMetric) Dec() { + stats.Record(m.ctx, m.Float64Measure.M(float64(-1))) +} + +// observeFunc is an adapter that allows the use of functions as summary metric. +// useful for converting metrics unit before sending them to OC +type observeFunc func(float64) + +func (o observeFunc) Observe(f float64) { + o(f) +} + +func (m *ocMetric) Observe(f float64) { + stats.Record(m.ctx, m.Float64Measure.M(f)) +} + +func (m *ocMetric) Set(f float64) { + stats.Record(m.ctx, m.Float64Measure.M(f)) +} + +func (clientGoMetricAdapter) NewListsMetric(string) cache.CounterMetric { + return newOcMetric(cacheListTotalStats) +} + +func (clientGoMetricAdapter) NewListDurationMetric(string) cache.SummaryMetric { + return newOcMetric(cacheListLatencyStats) +} + +func (clientGoMetricAdapter) NewItemsInListMetric(string) cache.SummaryMetric { + return newOcMetric(cacheListItemCountStats) +} + +func (clientGoMetricAdapter) NewWatchesMetric(string) cache.CounterMetric { + return newOcMetric(cacheWatchesTotalStats) +} + +func (clientGoMetricAdapter) NewShortWatchesMetric(string) cache.CounterMetric { + return newOcMetric(cacheShortWatchesTotalStats) +} + +func (clientGoMetricAdapter) NewWatchDurationMetric(string) cache.SummaryMetric { + return newOcMetric(cacheWatchesLatencyStats) +} + +func 
(clientGoMetricAdapter) NewItemsInWatchMetric(string) cache.SummaryMetric { + return newOcMetric(cacheItemsInWatchesCountStats) +} + +func (clientGoMetricAdapter) NewLastResourceVersionMetric(string) cache.GaugeMetric { + return newOcMetric(cacheLastResourceVersionStats) +} + +func (clientGoMetricAdapter) NewDepthMetric(name string) workqueue.GaugeMetric { + return newOcMetric(workQueueDepthStats).withTag(keyQueueName, name) +} + +func (clientGoMetricAdapter) NewAddsMetric(name string) workqueue.CounterMetric { + return newOcMetric(workQueueItemsTotalStats).withTag(keyQueueName, name) +} + +func (clientGoMetricAdapter) NewLatencyMetric(name string) workqueue.SummaryMetric { + m := newOcMetric(workQueueLatencyStats).withTag(keyQueueName, name) + // Convert microseconds to seconds for consistency across metrics. + return observeFunc(func(f float64) { + m.Observe(f / 1e6) + }) +} + +func (clientGoMetricAdapter) NewWorkDurationMetric(name string) workqueue.SummaryMetric { + m := newOcMetric(workQueueWorkDurationStats).withTag(keyQueueName, name) + // Convert microseconds to seconds for consistency across metrics. + return observeFunc(func(f float64) { + m.Observe(f / 1e6) + }) +} + +func (clientGoMetricAdapter) NewRetriesMetric(name string) workqueue.CounterMetric { + return newOcMetric(workQueueRetriesTotalStats).withTag(keyQueueName, name) +} diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go deleted file mode 100644 index 0cfa1152ff..0000000000 --- a/pkg/metrics/metrics.go +++ /dev/null @@ -1,138 +0,0 @@ -// Copyright 2018 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package metrics - -import ( - "go.opencensus.io/stats" - "go.opencensus.io/stats/view" - "go.opencensus.io/tag" -) - -var ( - keyName = mustTagKey("name") - keyFleetName = mustTagKey("fleet_name") - keyType = mustTagKey("type") - - fleetsReplicasCountStats = stats.Int64("fleets/replicas_count", "The count of replicas per fleet", "1") - fleetsReplicasCountView = &view.View{ - Name: "fleets_replicas_count", - Measure: fleetsReplicasCountStats, - Description: "The number of replicas per fleet", - Aggregation: view.LastValue(), - TagKeys: []tag.Key{keyName, keyType}, - } - - fasBufferLimitsCountStats = stats.Int64("fas/buffer_limits", "The buffer limits of autoscalers", "1") - fasBufferLimitsCountView = &view.View{ - Name: "fleet_autoscalers_buffer_limits", - Measure: fasBufferLimitsCountStats, - Description: "The limits of buffer based fleet autoscalers", - Aggregation: view.LastValue(), - TagKeys: []tag.Key{keyName, keyType, keyFleetName}, - } - - fasBufferSizeStats = stats.Int64("fas/buffer_size", "The buffer size value of autoscalers", "1") - fasBufferSizeView = &view.View{ - Name: "fleet_autoscalers_buffer_size", - Measure: fasBufferSizeStats, - Description: "The buffer size of fleet autoscalers", - Aggregation: view.LastValue(), - TagKeys: []tag.Key{keyName, keyType, keyFleetName}, - } - - fasCurrentReplicasStats = stats.Int64("fas/current_replicas_count", "The current replicas cout as seen by autoscalers", "1") - fasCurrentReplicasView = &view.View{ - Name: "fleet_autoscalers_current_replicas_count", - Measure: fasCurrentReplicasStats, - 
Description: "The current replicas count as seen by autoscalers", - Aggregation: view.LastValue(), - TagKeys: []tag.Key{keyName, keyFleetName}, - } - - fasDesiredReplicasStats = stats.Int64("fas/desired_replicas_count", "The desired replicas cout as seen by autoscalers", "1") - fasDesiredReplicasView = &view.View{ - Name: "fleet_autoscalers_desired_replicas_count", - Measure: fasDesiredReplicasStats, - Description: "The desired replicas count as seen by autoscalers", - Aggregation: view.LastValue(), - TagKeys: []tag.Key{keyName, keyFleetName}, - } - - fasAbleToScaleStats = stats.Int64("fas/able_to_scale", "The fleet autoscaler can access the fleet to scale (0 indicates false, 1 indicates true)", "1") - fasAbleToScaleView = &view.View{ - Name: "fleet_autoscalers_able_to_scale", - Measure: fasAbleToScaleStats, - Description: "The fleet autoscaler can access the fleet to scale", - Aggregation: view.LastValue(), - TagKeys: []tag.Key{keyName, keyFleetName}, - } - - fasLimitedStats = stats.Int64("fas/limited", "The fleet autoscaler is capped (0 indicates false, 1 indicates true)", "1") - fasLimitedView = &view.View{ - Name: "fleet_autoscalers_limited", - Measure: fasLimitedStats, - Description: "The fleet autoscaler is capped", - Aggregation: view.LastValue(), - TagKeys: []tag.Key{keyName, keyFleetName}, - } - - gameServerCountStats = stats.Int64("gameservers/count", "The count of gameservers", "1") - gameServersCountView = &view.View{ - Name: "gameservers_count", - Measure: gameServerCountStats, - Description: "The number of gameservers", - Aggregation: view.LastValue(), - TagKeys: []tag.Key{keyType, keyFleetName}, - } - - fleetAllocationCountStats = stats.Int64("fleet_allocations/count", "The count of fleet allocations", "1") - fleetAllocationCountView = &view.View{ - Name: "fleet_allocations_count", - Measure: fleetAllocationCountStats, - Description: "The number of fleet allocations", - Aggregation: view.LastValue(), - TagKeys: []tag.Key{keyFleetName}, - } - - 
fleetAllocationTotalStats = stats.Int64("fleet_allocations/total", "The total of fleet allocations", "1") - fleetAllocationTotalView = &view.View{ - Name: "fleet_allocations_total", - Measure: fleetAllocationTotalStats, - Description: "The total of fleet allocations", - Aggregation: view.Count(), - TagKeys: []tag.Key{keyFleetName}, - } - - gameServerTotalStats = stats.Int64("gameservers/total", "The total of gameservers", "1") - gameServersTotalView = &view.View{ - Name: "gameservers_total", - Measure: gameServerTotalStats, - Description: "The total of gameservers", - Aggregation: view.Count(), - TagKeys: []tag.Key{keyType, keyFleetName}, - } - - views = []*view.View{fleetsReplicasCountView, gameServersCountView, gameServersTotalView, - fasBufferSizeView, fasBufferLimitsCountView, fasCurrentReplicasView, fasDesiredReplicasView, - fasAbleToScaleView, fasLimitedView, fleetAllocationCountView, fleetAllocationTotalView} -) - -func mustTagKey(key string) tag.Key { - t, err := tag.NewKey(key) - if err != nil { - panic(err) - } - return t -} diff --git a/pkg/metrics/util.go b/pkg/metrics/util.go new file mode 100644 index 0000000000..8a5837cb00 --- /dev/null +++ b/pkg/metrics/util.go @@ -0,0 +1,48 @@ +// Copyright 2019 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package metrics + +import ( + "context" + + "agones.dev/agones/pkg/util/runtime" + "go.opencensus.io/stats" + "go.opencensus.io/tag" +) + +var ( + logger = runtime.NewLoggerWithSource("metrics") + + keyName = mustTagKey("name") + keyFleetName = mustTagKey("fleet_name") + keyType = mustTagKey("type") + keyStatusCode = mustTagKey("status_code") + keyVerb = mustTagKey("verb") + keyEndpoint = mustTagKey("endpoint") +) + +func recordWithTags(ctx context.Context, mutators []tag.Mutator, ms ...stats.Measurement) { + if err := stats.RecordWithTags(ctx, mutators, ms...); err != nil { + logger.WithError(err).Warn("error while recording stats") + } +} + +func mustTagKey(key string) tag.Key { + t, err := tag.NewKey(key) + if err != nil { + panic(err) + } + return t +} diff --git a/pkg/metrics/util_test.go b/pkg/metrics/util_test.go index 18a9383eac..4245ec28a1 100644 --- a/pkg/metrics/util_test.go +++ b/pkg/metrics/util_test.go @@ -65,12 +65,12 @@ func (c *fakeController) close() { } } -func (c *fakeController) report() { - // hacky: unregistering views force view collections - // so to not have to wait for the reporting period to hit we can - // unregister and register again - c.unRegisterViews() - c.registerViews() +// report is hacky: unregistering views forces view collection, +// so to not have to wait for the reporting period to hit we can +// unregister and register again +func report() { + unRegisterViews() + registerViews() } func (c *fakeController) run(t *testing.T) { diff --git a/site/content/en/docs/Guides/metrics.md b/site/content/en/docs/Guides/metrics.md index c1adb77ee1..fae2599991 100644 --- a/site/content/en/docs/Guides/metrics.md +++ b/site/content/en/docs/Guides/metrics.md @@ -23,8 +23,11 @@ Table of Contents - [Installation](#installation) - [Prometheus installation](#prometheus-installation) - [Grafana installation](#grafana-installation) +<<<<<<< HEAD - [Stackdriver installation](#stackdriver-installation) - [Adding more metrics](#adding-more-metrics) +=======
+>>>>>>> add client-go metrics and grafana dashboards ## Backend integrations @@ -97,6 +100,14 @@ We provide a set of useful [Grafana](https://grafana.com/) dashboards to monitor - {{< ghlink href="/build/grafana/dashboard-controller-usage.yaml" branch="master" >}}Agones Controller Resource Usage{{< /ghlink >}} displays Agones Controller CPU and memory usage and also some Golang runtime metrics. +{{% feature publishVersion="0.8.0" %}} +- {{< ghlink href="/build/grafana/dashboard-goclient-requests.yaml" branch="master" >}}Agones Controller go-client requests{{< /ghlink >}} displays Agones Controller Kubernetes API consumption. + +- {{< ghlink href="/build/grafana/dashboard-goclient-caches.yaml" branch="master" >}}Agones Controller go-client caches{{< /ghlink >}} displays Agones Controller Kubernetes Watches/Lists operations used. + +- {{< ghlink href="/build/grafana/dashboard-goclient-workqueues.yaml" branch="master" >}}Agones Controller go-client workqueues{{< /ghlink >}} displays Agones Controller workqueue processing time and rates. +{{% /feature %}} + Dashboard screenshots : ![grafana dashboard autoscalers](../../../images/grafana-dashboard-autoscalers.png) @@ -227,24 +238,3 @@ Permissions problem example from controller logs: Failed to export to Stackdriver: rpc error: code = PermissionDenied desc = Permission monitoring.metricDescriptors.create denied (or the resource may not exist). ``` {{% /feature %}} - -## Adding more metrics - -If you want to contribute and add more metrics we recommend to use shared informers (cache) as it is currently implemented in the {{< ghlink href="pkg/metrics/controller.go" branch="master" >}}metrics controller{{< /ghlink >}}. -Using shared informers allows to keep metrics code in one place and doesn't overload the Kubernetes API. - -However there is some cases where you will have to add code inside your ressource controller (eg. 
latency metrics), you should minize metrics code in your controller by adding specific functions in the metrics packages as shown below. - -```golang -package metrics - -import "go.opencensus.io/stats" - -... - -func RecordSomeLatency(latency int64,ressourceName string) { - stats.RecordWithTags(....) -} -``` - -