From 64fb9c359449a20c7a5fe116b9635e4025121622 Mon Sep 17 00:00:00 2001 From: Douglas Camata <159076+douglascamata@users.noreply.github.com> Date: Mon, 4 Jul 2022 15:59:30 +0000 Subject: [PATCH 1/9] Start to add tenant charts to Receive Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> --- examples/dashboards/receive.json | 702 ++++++++++++++++++++++++++++- mixin/dashboards/receive.libsonnet | 90 +++- 2 files changed, 774 insertions(+), 18 deletions(-) diff --git a/examples/dashboards/receive.json b/examples/dashboards/receive.json index a103ea8d9b..649ab36dd8 100644 --- a/examples/dashboards/receive.json +++ b/examples/dashboards/receive.json @@ -314,6 +314,651 @@ "title": "WRITE - Incoming Request", "titleSize": "h6" }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job, tenant) (rate(http_requests_total{job=~\"$job\", tenant=~\"$tenant\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{code}} - {{tenant}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate of write requests (per code and tenant)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + 
"label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + "error": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 5, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job, tenant) (rate(http_requests_total{job=~\"$job\", tenant=~\"$tenant\",code=~\"5..\"}[$interval])) / sum by (job, tenant) (rate(http_requests_total{job=~\"$job\", tenant=~\"$tenant\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "error", + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Number of errors (by tenant and code)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + 
"nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job, tenant) (rate(http_request_duration_seconds_sum{job=~\"$job\", tenant=~\"$tenant\"}[$interval])) / sum by (job, tenant) (http_request_duration_seconds_count{job=~\"$job\", tenant=~\"$tenant\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{tenant}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Average request duration (by tenant)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "WRITE - Incoming Request focus per tenant", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 7, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job, tenant) 
(rate(http_request_size_bytes_sum{job=~\"$job\", tenant=~\"$tenant\"}[$interval]))/ sum by (job, tenant) (rate(http_request_size_bytes_count{job=~\"$job\", tenant=~\"$tenant\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{tenant}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Average successful HTTP request size (per tenant and code, only 2XX)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (job, tenant) (rate(http_request_size_bytes_sum{job=~\"$job\", tenant=~\"$tenant\"}[$interval]))/ sum by (job, tenant) (rate(http_request_size_bytes_count{job=~\"$job\", tenant=~\"$tenant\"}[$interval]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{tenant}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Average failed HTTP request size (per tenant and code, non 2XX)", + "tooltip": { + "shared": false, + 
"sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 9, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(http_inflight_requests{job=~\"$job\", tenant=~\"$tenant\"}) by (job, tenant, code)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{method}} - {{tenant}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Inflight requests (per tenant and method)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "HTTP requests", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { }, + 
"bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(thanos_receive_write_timeseries_bucket{job=~\"$job\", tenant=~\"$tenant\"}[$interval])) by (job, tenant) ", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{tenant}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate of series received (per tenant, only 2XX)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 11, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(thanos_receive_write_timeseries_bucket{job=~\"$job\", 
tenant=~\"$tenant\"}[$interval])) by (job, tenant, code) ", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{code}} - {{tenant}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate of series not written (per tenant and code, non 2XX)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Series & Samples", + "titleSize": "h6" + }, { "collapse": false, "height": "250px", @@ -326,7 +971,7 @@ "datasource": "$datasource", "description": "Shows rate of replications to other receive nodes.", "fill": 1, - "id": 4, + "id": 12, "legend": { "avg": false, "current": false, @@ -405,7 +1050,7 @@ "datasource": "$datasource", "description": "Shows ratio of errors compared to the total number of replications to other receive nodes.", "fill": 10, - "id": 5, + "id": 13, "legend": { "avg": false, "current": false, @@ -493,7 +1138,7 @@ "datasource": "$datasource", "description": "Shows rate of forwarded requests to other receive nodes.", "fill": 1, - "id": 6, + "id": 14, "legend": { "avg": false, "current": false, @@ -572,7 +1217,7 @@ "datasource": "$datasource", "description": "Shows ratio of errors compared to the total number of forwareded requests to other receive nodes.", "fill": 10, - "id": 7, + "id": 15, "legend": { "avg": false, "current": false, @@ -660,7 +1305,7 @@ "datasource": "$datasource", "description": "Shows rate of handled Unary gRPC requests from queriers.", "fill": 10, - "id": 8, + "id": 16, 
"legend": { "avg": false, "current": false, @@ -811,7 +1456,7 @@ "datasource": "$datasource", "description": "Shows ratio of errors compared to the total number of handled requests from queriers.", "fill": 10, - "id": 9, + "id": 17, "legend": { "avg": false, "current": false, @@ -887,7 +1532,7 @@ "datasource": "$datasource", "description": "Shows how long has it taken to handle requests from queriers, in quantiles.", "fill": 1, - "id": 10, + "id": 18, "legend": { "avg": false, "current": false, @@ -1017,7 +1662,7 @@ "datasource": "$datasource", "description": "Shows rate of handled Unary gRPC requests from queriers.", "fill": 10, - "id": 11, + "id": 19, "legend": { "avg": false, "current": false, @@ -1168,7 +1813,7 @@ "datasource": "$datasource", "description": "Shows ratio of errors compared to the total number of handled requests from queriers.", "fill": 10, - "id": 12, + "id": 20, "legend": { "avg": false, "current": false, @@ -1244,7 +1889,7 @@ "datasource": "$datasource", "description": "Shows how long has it taken to handle requests from queriers, in quantiles.", "fill": 1, - "id": 13, + "id": 21, "legend": { "avg": false, "current": false, @@ -1374,7 +2019,7 @@ "datasource": "$datasource", "description": "Shows rate of handled Streamed gRPC requests from queriers.", "fill": 10, - "id": 14, + "id": 22, "legend": { "avg": false, "current": false, @@ -1525,7 +2170,7 @@ "datasource": "$datasource", "description": "Shows ratio of errors compared to the total number of handled requests from queriers.", "fill": 10, - "id": 15, + "id": 23, "legend": { "avg": false, "current": false, @@ -1601,7 +2246,7 @@ "datasource": "$datasource", "description": "Shows how long has it taken to handle requests from queriers, in quantiles.", "fill": 1, - "id": 16, + "id": 24, "legend": { "avg": false, "current": false, @@ -1731,7 +2376,7 @@ "datasource": "$datasource", "description": "Shows the relative time of last successful upload to the object-store bucket.", "fill": 1, - "id": 
17, + "id": 25, "legend": { "avg": false, "current": false, @@ -1855,7 +2500,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 18, + "id": 26, "legend": { "avg": false, "current": false, @@ -1971,7 +2616,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 19, + "id": 27, "legend": { "avg": false, "current": false, @@ -2047,7 +2692,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 20, + "id": 28, "legend": { "avg": false, "current": false, @@ -2146,6 +2791,29 @@ "regex": "", "type": "datasource" }, + { + "allValue": null, + "current": { + "text": "all", + "value": "$__all" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": "tenant", + "multi": false, + "name": "tenant", + "options": [ ], + "query": "label_values(http_requests_total{job=~\"$job\", tenant!=\"\"}, tenant)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, { "auto": true, "auto_count": 300, diff --git a/mixin/dashboards/receive.libsonnet b/mixin/dashboards/receive.libsonnet index 8e3134fccf..402da9ba66 100644 --- a/mixin/dashboards/receive.libsonnet +++ b/mixin/dashboards/receive.libsonnet @@ -1,6 +1,7 @@ local g = import '../lib/thanos-grafana-builder/builder.libsonnet'; local utils = import '../lib/utils.libsonnet'; + { local thanos = self, receive+:: { @@ -9,15 +10,35 @@ local utils = import '../lib/utils.libsonnet'; dashboard:: { selector: std.join(', ', thanos.dashboard.selector + ['job=~"$job"']), dimensions: std.join(', ', thanos.dashboard.dimensions + ['job']), + tenantSelector: std.join(', ', thanos.dashboard.selector + ['job=~"$job"', 'tenant=~"$tenant"']), + tenantDimensions: std.join(', ', thanos.dashboard.dimensions + ['job', 'tenant']), }, }, grafanaDashboards+:: { + local grafana = import 'grafonnet/grafana.libsonnet', + local template = grafana.template, [if thanos.receive != null then 'receive.json']: 
local receiveHandlerSelector = utils.joinLabels([thanos.receive.dashboard.selector, 'handler="receive"']); local grpcUnaryWriteSelector = utils.joinLabels([thanos.receive.dashboard.selector, 'grpc_type="unary"', 'grpc_method="RemoteWrite"']); local grpcUnaryReadSelector = utils.joinLabels([thanos.receive.dashboard.selector, 'grpc_type="unary"', 'grpc_method!="RemoteWrite"']); local grpcServerStreamSelector = utils.joinLabels([thanos.receive.dashboard.selector, 'grpc_type="server_stream"']); - g.dashboard(thanos.receive.title) + g.dashboard(thanos.receive.title) { + templating+: { + list+: [ + template.new( + 'tenant', + '$datasource', + 'label_values(http_requests_total{%s}, %s)' % [std.join(', ', [thanos.receive.dashboard.selector] + ['tenant!=""']), 'tenant'], + label='tenant', + refresh=1, + sort=2, + current='all', + allValues=null, + includeAll=true + ), + ], + }, + } .addRow( g.row('WRITE - Incoming Request') .addPanel( @@ -33,6 +54,73 @@ local utils = import '../lib/utils.libsonnet'; g.latencyPanel('http_request_duration_seconds', receiveHandlerSelector, thanos.receive.dashboard.dimensions) ) ) + .addRow( + g.row('WRITE - Incoming Request (tenant focus)') + .addPanel( + g.panel('Rate of write requests (per code and tenant)') + + g.queryPanel( + 'sum by (%s) (rate(http_requests_total{%s}[$interval]))' % [thanos.receive.dashboard.tenantDimensions, thanos.receive.dashboard.tenantSelector], + '{{code}} - {{tenant}}' + ) + ) + // TODO: fix error panel label + .addPanel( + g.panel('Number of errors (by tenant and code)') + + g.httpErrPanel('http_requests_total', thanos.receive.dashboard.tenantSelector, thanos.receive.dashboard.tenantDimensions) + ) + .addPanel( + g.panel('Average request duration (by tenant)') + + g.queryPanel( + 'sum by (%s) (rate(http_request_duration_seconds_sum{%s}[$interval])) / sum by (%s) (http_request_duration_seconds_count{%s})' % [thanos.receive.dashboard.tenantDimensions, thanos.receive.dashboard.tenantSelector, 
thanos.receive.dashboard.tenantDimensions, thanos.receive.dashboard.tenantSelector], + '{{tenant}}' + ) + ) + ) + .addRow( + g.row('HTTP requests (tenant focus)') + // TODO: filter HTTP codes + .addPanel( + g.panel('Average successful HTTP request size (per tenant and code, only 2XX)') + + g.queryPanel( + 'sum by (%s) (rate(http_request_size_bytes_sum{%s}[$interval]))/ sum by (%s) (rate(http_request_size_bytes_count{%s}[$interval]))' % [thanos.receive.dashboard.tenantDimensions, thanos.receive.dashboard.tenantSelector, thanos.receive.dashboard.tenantDimensions, thanos.receive.dashboard.tenantSelector], + '{{tenant}}' + ) + ) + // TODO: filter HTTP codes + .addPanel( + g.panel('Average failed HTTP request size (per tenant and code, non 2XX)') + + g.queryPanel( + 'sum by (%s) (rate(http_request_size_bytes_sum{%s}[$interval]))/ sum by (%s) (rate(http_request_size_bytes_count{%s}[$interval]))' % [thanos.receive.dashboard.tenantDimensions, thanos.receive.dashboard.tenantSelector, thanos.receive.dashboard.tenantDimensions, thanos.receive.dashboard.tenantSelector], + '{{tenant}}' + ) + ) + .addPanel( + g.panel('Inflight requests (per tenant and method)') + + g.queryPanel( + 'sum(http_inflight_requests{%s}) by (%s)' % [thanos.receive.dashboard.tenantSelector, thanos.receive.dashboard.tenantDimensions + ', code'], + '{{method}} - {{tenant}}' + ) + ) + ) + .addRow( + g.row('Series & Samples (tenant focus)') + // TODO: filter HTTP codes + .addPanel( + g.panel('Rate of series received (per tenant, only 2XX)') + + g.queryPanel( + 'sum(rate(thanos_receive_write_timeseries_bucket{%s}[$interval])) by (%s) ' % [thanos.receive.dashboard.tenantSelector, thanos.receive.dashboard.tenantDimensions], + '{{tenant}}' + ) + ) + // TODO: filter HTTP codes + .addPanel( + g.panel('Rate of series not written (per tenant and code, non 2XX)') + + g.queryPanel( + 'sum(rate(thanos_receive_write_timeseries_bucket{%s}[$interval])) by (%s) ' % [thanos.receive.dashboard.tenantSelector, 
thanos.receive.dashboard.tenantDimensions + ', code'], + '{{code}} - {{tenant}}' + ) + ) + ) .addRow( g.row('WRITE - Replication') .addPanel( From 6e398bb11e46e501c04dd9530a2b7930ea039f20 Mon Sep 17 00:00:00 2001 From: Douglas Camata <159076+douglascamata@users.noreply.github.com> Date: Mon, 4 Jul 2022 16:35:33 +0000 Subject: [PATCH 2/9] Properly filter HTTP status codes Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> --- examples/dashboards/receive.json | 16 ++++++++-------- mixin/dashboards/receive.libsonnet | 18 ++++++++---------- 2 files changed, 16 insertions(+), 18 deletions(-) diff --git a/examples/dashboards/receive.json b/examples/dashboards/receive.json index 649ab36dd8..4f48a25d87 100644 --- a/examples/dashboards/receive.json +++ b/examples/dashboards/receive.json @@ -350,7 +350,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (job, tenant) (rate(http_requests_total{job=~\"$job\", tenant=~\"$tenant\"}[$interval]))", + "expr": "sum by (job, tenant, code) (rate(http_requests_total{job=~\"$job\", tenant=~\"$tenant\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{code}} - {{tenant}}", @@ -552,7 +552,7 @@ "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "WRITE - Incoming Request focus per tenant", + "title": "WRITE - Incoming Request (tenant focus)", "titleSize": "h6" }, { @@ -591,7 +591,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (job, tenant) (rate(http_request_size_bytes_sum{job=~\"$job\", tenant=~\"$tenant\"}[$interval]))/ sum by (job, tenant) (rate(http_request_size_bytes_count{job=~\"$job\", tenant=~\"$tenant\"}[$interval]))", + "expr": "sum by (job, tenant) (rate(http_request_size_bytes_sum{job=~\"$job\", tenant=~\"$tenant\", code=~\"2..\"}[$interval]))/ sum by (job, tenant) (rate(http_request_size_bytes_count{job=~\"$job\", tenant=~\"$tenant\", code=~\"2..\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": 
"{{tenant}}", @@ -667,7 +667,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (job, tenant) (rate(http_request_size_bytes_sum{job=~\"$job\", tenant=~\"$tenant\"}[$interval]))/ sum by (job, tenant) (rate(http_request_size_bytes_count{job=~\"$job\", tenant=~\"$tenant\"}[$interval]))", + "expr": "sum by (job, tenant) (rate(http_request_size_bytes_sum{job=~\"$job\", tenant=~\"$tenant\", code!~\"2..\"}[$interval]))/ sum by (job, tenant) (rate(http_request_size_bytes_count{job=~\"$job\", tenant=~\"$tenant\", code!~\"2..\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{tenant}}", @@ -792,7 +792,7 @@ "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "HTTP requests", + "title": "HTTP requests (tenant focus)", "titleSize": "h6" }, { @@ -831,7 +831,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(thanos_receive_write_timeseries_bucket{job=~\"$job\", tenant=~\"$tenant\"}[$interval])) by (job, tenant) ", + "expr": "sum(rate(thanos_receive_write_timeseries_bucket{job=~\"$job\", tenant=~\"$tenant\", code=~\"2..\"}[$interval])) by (job, tenant) ", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{tenant}}", @@ -907,7 +907,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(thanos_receive_write_timeseries_bucket{job=~\"$job\", tenant=~\"$tenant\"}[$interval])) by (job, tenant, code) ", + "expr": "sum(rate(thanos_receive_write_timeseries_bucket{job=~\"$job\", tenant=~\"$tenant\", code!~\"2..\"}[$interval])) by (job, tenant, code) ", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{code}} - {{tenant}}", @@ -956,7 +956,7 @@ "repeatIteration": null, "repeatRowId": null, "showTitle": true, - "title": "Series & Samples", + "title": "Series & Samples (tenant focus)", "titleSize": "h6" }, { diff --git a/mixin/dashboards/receive.libsonnet b/mixin/dashboards/receive.libsonnet index 402da9ba66..5b1cfd9afd 100644 --- a/mixin/dashboards/receive.libsonnet +++ 
b/mixin/dashboards/receive.libsonnet @@ -22,6 +22,8 @@ local utils = import '../lib/utils.libsonnet'; local grpcUnaryWriteSelector = utils.joinLabels([thanos.receive.dashboard.selector, 'grpc_type="unary"', 'grpc_method="RemoteWrite"']); local grpcUnaryReadSelector = utils.joinLabels([thanos.receive.dashboard.selector, 'grpc_type="unary"', 'grpc_method!="RemoteWrite"']); local grpcServerStreamSelector = utils.joinLabels([thanos.receive.dashboard.selector, 'grpc_type="server_stream"']); + local tenantHttpCode2XXSelector = std.join(', ', [thanos.receive.dashboard.tenantSelector, 'code=~"2.."']); + local tenantHttpCodeNot2XXSelector = std.join(', ', [thanos.receive.dashboard.tenantSelector, 'code!~"2.."']); g.dashboard(thanos.receive.title) { templating+: { list+: [ @@ -59,11 +61,11 @@ local utils = import '../lib/utils.libsonnet'; .addPanel( g.panel('Rate of write requests (per code and tenant)') + g.queryPanel( - 'sum by (%s) (rate(http_requests_total{%s}[$interval]))' % [thanos.receive.dashboard.tenantDimensions, thanos.receive.dashboard.tenantSelector], + 'sum by (%s) (rate(http_requests_total{%s}[$interval]))' % [thanos.receive.dashboard.tenantDimensions + ', code', thanos.receive.dashboard.tenantSelector], '{{code}} - {{tenant}}' ) ) - // TODO: fix error panel label + // TODO: change this to a query panel .addPanel( g.panel('Number of errors (by tenant and code)') + g.httpErrPanel('http_requests_total', thanos.receive.dashboard.tenantSelector, thanos.receive.dashboard.tenantDimensions) @@ -78,19 +80,17 @@ local utils = import '../lib/utils.libsonnet'; ) .addRow( g.row('HTTP requests (tenant focus)') - // TODO: filter HTTP codes .addPanel( g.panel('Average successful HTTP request size (per tenant and code, only 2XX)') + g.queryPanel( - 'sum by (%s) (rate(http_request_size_bytes_sum{%s}[$interval]))/ sum by (%s) (rate(http_request_size_bytes_count{%s}[$interval]))' % [thanos.receive.dashboard.tenantDimensions, thanos.receive.dashboard.tenantSelector, 
thanos.receive.dashboard.tenantDimensions, thanos.receive.dashboard.tenantSelector], + 'sum by (%s) (rate(http_request_size_bytes_sum{%s}[$interval]))/ sum by (%s) (rate(http_request_size_bytes_count{%s}[$interval]))' % [thanos.receive.dashboard.tenantDimensions, tenantHttpCode2XXSelector, thanos.receive.dashboard.tenantDimensions, tenantHttpCode2XXSelector], '{{tenant}}' ) ) - // TODO: filter HTTP codes .addPanel( g.panel('Average failed HTTP request size (per tenant and code, non 2XX)') + g.queryPanel( - 'sum by (%s) (rate(http_request_size_bytes_sum{%s}[$interval]))/ sum by (%s) (rate(http_request_size_bytes_count{%s}[$interval]))' % [thanos.receive.dashboard.tenantDimensions, thanos.receive.dashboard.tenantSelector, thanos.receive.dashboard.tenantDimensions, thanos.receive.dashboard.tenantSelector], + 'sum by (%s) (rate(http_request_size_bytes_sum{%s}[$interval]))/ sum by (%s) (rate(http_request_size_bytes_count{%s}[$interval]))' % [thanos.receive.dashboard.tenantDimensions, tenantHttpCodeNot2XXSelector, thanos.receive.dashboard.tenantDimensions, tenantHttpCodeNot2XXSelector], '{{tenant}}' ) ) @@ -104,19 +104,17 @@ local utils = import '../lib/utils.libsonnet'; ) .addRow( g.row('Series & Samples (tenant focus)') - // TODO: filter HTTP codes .addPanel( g.panel('Rate of series received (per tenant, only 2XX)') + g.queryPanel( - 'sum(rate(thanos_receive_write_timeseries_bucket{%s}[$interval])) by (%s) ' % [thanos.receive.dashboard.tenantSelector, thanos.receive.dashboard.tenantDimensions], + 'sum(rate(thanos_receive_write_timeseries_bucket{%s}[$interval])) by (%s) ' % [tenantHttpCode2XXSelector, thanos.receive.dashboard.tenantDimensions], '{{tenant}}' ) ) - // TODO: filter HTTP codes .addPanel( g.panel('Rate of series not written (per tenant and code, non 2XX)') + g.queryPanel( - 'sum(rate(thanos_receive_write_timeseries_bucket{%s}[$interval])) by (%s) ' % [thanos.receive.dashboard.tenantSelector, thanos.receive.dashboard.tenantDimensions + ', code'], + 
'sum(rate(thanos_receive_write_timeseries_bucket{%s}[$interval])) by (%s) ' % [tenantHttpCodeNot2XXSelector, thanos.receive.dashboard.tenantDimensions + ', code'], '{{code}} - {{tenant}}' ) ) From 24b0ba0d67425b34a75585b956a2e739e20f4f41 Mon Sep 17 00:00:00 2001 From: Douglas Camata <159076+douglascamata@users.noreply.github.com> Date: Tue, 5 Jul 2022 09:22:29 +0000 Subject: [PATCH 3/9] Fix tenant error rate chart Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> --- examples/dashboards/receive.json | 17 ++++++----- mixin/dashboards/receive.libsonnet | 45 ++++++++++++++++++++++++------ 2 files changed, 45 insertions(+), 17 deletions(-) diff --git a/examples/dashboards/receive.json b/examples/dashboards/receive.json index 4f48a25d87..f88d5b4900 100644 --- a/examples/dashboards/receive.json +++ b/examples/dashboards/receive.json @@ -395,14 +395,12 @@ ] }, { - "aliasColors": { - "error": "#E24D42" - }, + "aliasColors": { }, "bars": false, "dashLength": 10, "dashes": false, "datasource": "$datasource", - "fill": 10, + "fill": 1, "id": 5, "legend": { "avg": false, @@ -414,7 +412,7 @@ "values": false }, "lines": true, - "linewidth": 0, + "linewidth": 1, "links": [ ], "nullPointMode": "null as zero", "percentage": false, @@ -424,14 +422,15 @@ "seriesOverrides": [ ], "spaceLength": 10, "span": 4, - "stack": true, + "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum by (job, tenant) (rate(http_requests_total{job=~\"$job\", tenant=~\"$tenant\",code=~\"5..\"}[$interval])) / sum by (job, tenant) (rate(http_requests_total{job=~\"$job\", tenant=~\"$tenant\"}[$interval]))", + "expr": "sum by (job, tenant, code) (rate(http_requests_total{job=~\"$job\", tenant=~\"$tenant\", code!~\"2..\"}[$interval]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "error", + "legendFormat": "{{code}} - {{tenant}}", + "legendLink": null, "step": 10 } ], @@ -454,7 +453,7 @@ }, "yaxes": [ { - "format": "percentunit", + "format": "short", 
"label": null, "logBase": 1, "max": null, diff --git a/mixin/dashboards/receive.libsonnet b/mixin/dashboards/receive.libsonnet index 5b1cfd9afd..6394461c0f 100644 --- a/mixin/dashboards/receive.libsonnet +++ b/mixin/dashboards/receive.libsonnet @@ -65,15 +65,25 @@ local utils = import '../lib/utils.libsonnet'; '{{code}} - {{tenant}}' ) ) - // TODO: change this to a query panel .addPanel( g.panel('Number of errors (by tenant and code)') + - g.httpErrPanel('http_requests_total', thanos.receive.dashboard.tenantSelector, thanos.receive.dashboard.tenantDimensions) + g.queryPanel( + 'sum by (%s) (rate(http_requests_total{%s}[$interval]))' % [ + thanos.receive.dashboard.tenantDimensions + ', code', + tenantHttpCodeNot2XXSelector, + ], + '{{code}} - {{tenant}}' + ) ) .addPanel( g.panel('Average request duration (by tenant)') + g.queryPanel( - 'sum by (%s) (rate(http_request_duration_seconds_sum{%s}[$interval])) / sum by (%s) (http_request_duration_seconds_count{%s})' % [thanos.receive.dashboard.tenantDimensions, thanos.receive.dashboard.tenantSelector, thanos.receive.dashboard.tenantDimensions, thanos.receive.dashboard.tenantSelector], + 'sum by (%s) (rate(http_request_duration_seconds_sum{%s}[$interval])) / sum by (%s) (http_request_duration_seconds_count{%s})' % [ + thanos.receive.dashboard.tenantDimensions, + thanos.receive.dashboard.tenantSelector, + thanos.receive.dashboard.tenantDimensions, + thanos.receive.dashboard.tenantSelector, + ], '{{tenant}}' ) ) @@ -83,21 +93,34 @@ local utils = import '../lib/utils.libsonnet'; .addPanel( g.panel('Average successful HTTP request size (per tenant and code, only 2XX)') + g.queryPanel( - 'sum by (%s) (rate(http_request_size_bytes_sum{%s}[$interval]))/ sum by (%s) (rate(http_request_size_bytes_count{%s}[$interval]))' % [thanos.receive.dashboard.tenantDimensions, tenantHttpCode2XXSelector, thanos.receive.dashboard.tenantDimensions, tenantHttpCode2XXSelector], + 'sum by (%s) (rate(http_request_size_bytes_sum{%s}[$interval]))/ sum 
by (%s) (rate(http_request_size_bytes_count{%s}[$interval]))' % [ + thanos.receive.dashboard.tenantDimensions, + tenantHttpCode2XXSelector, + thanos.receive.dashboard.tenantDimensions, + tenantHttpCode2XXSelector, + ], '{{tenant}}' ) ) .addPanel( g.panel('Average failed HTTP request size (per tenant and code, non 2XX)') + g.queryPanel( - 'sum by (%s) (rate(http_request_size_bytes_sum{%s}[$interval]))/ sum by (%s) (rate(http_request_size_bytes_count{%s}[$interval]))' % [thanos.receive.dashboard.tenantDimensions, tenantHttpCodeNot2XXSelector, thanos.receive.dashboard.tenantDimensions, tenantHttpCodeNot2XXSelector], + 'sum by (%s) (rate(http_request_size_bytes_sum{%s}[$interval]))/ sum by (%s) (rate(http_request_size_bytes_count{%s}[$interval]))' % [ + thanos.receive.dashboard.tenantDimensions, + tenantHttpCodeNot2XXSelector, + thanos.receive.dashboard.tenantDimensions, + tenantHttpCodeNot2XXSelector, + ], '{{tenant}}' ) ) .addPanel( g.panel('Inflight requests (per tenant and method)') + g.queryPanel( - 'sum(http_inflight_requests{%s}) by (%s)' % [thanos.receive.dashboard.tenantSelector, thanos.receive.dashboard.tenantDimensions + ', code'], + 'sum(http_inflight_requests{%s}) by (%s)' % [ + thanos.receive.dashboard.tenantSelector, + thanos.receive.dashboard.tenantDimensions + ', code', + ], '{{method}} - {{tenant}}' ) ) @@ -107,14 +130,20 @@ local utils = import '../lib/utils.libsonnet'; .addPanel( g.panel('Rate of series received (per tenant, only 2XX)') + g.queryPanel( - 'sum(rate(thanos_receive_write_timeseries_bucket{%s}[$interval])) by (%s) ' % [tenantHttpCode2XXSelector, thanos.receive.dashboard.tenantDimensions], + 'sum(rate(thanos_receive_write_timeseries_bucket{%s}[$interval])) by (%s) ' % [ + tenantHttpCode2XXSelector, + thanos.receive.dashboard.tenantDimensions, + ], '{{tenant}}' ) ) .addPanel( g.panel('Rate of series not written (per tenant and code, non 2XX)') + g.queryPanel( - 'sum(rate(thanos_receive_write_timeseries_bucket{%s}[$interval])) by (%s) ' % 
[tenantHttpCodeNot2XXSelector, thanos.receive.dashboard.tenantDimensions + ', code'], + 'sum(rate(thanos_receive_write_timeseries_bucket{%s}[$interval])) by (%s) ' % [ + tenantHttpCodeNot2XXSelector, + thanos.receive.dashboard.tenantDimensions + ', code', + ], '{{code}} - {{tenant}}' ) ) From 105dfce4b552dfd17d3a3b3b888bc98e52e3460a Mon Sep 17 00:00:00 2001 From: Douglas Camata <159076+douglascamata@users.noreply.github.com> Date: Tue, 5 Jul 2022 09:42:19 +0000 Subject: [PATCH 4/9] Refactor to improve readability and consistency Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> --- examples/dashboards/receive.json | 8 ++++---- mixin/dashboards/receive.libsonnet | 11 ++++++----- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/examples/dashboards/receive.json b/examples/dashboards/receive.json index f88d5b4900..73ebe55a63 100644 --- a/examples/dashboards/receive.json +++ b/examples/dashboards/receive.json @@ -350,7 +350,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (job, tenant, code) (rate(http_requests_total{job=~\"$job\", tenant=~\"$tenant\"}[$interval]))", + "expr": "sum by (tenant, code) (rate(http_requests_total{job=~\"$job\", tenant=~\"$tenant\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{code}} - {{tenant}}", @@ -361,7 +361,7 @@ "thresholds": [ ], "timeFrom": null, "timeShift": null, - "title": "Rate of write requests (per code and tenant)", + "title": "Rate of write requests (by tenant and code)", "tooltip": { "shared": false, "sort": 0, @@ -426,7 +426,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (job, tenant, code) (rate(http_requests_total{job=~\"$job\", tenant=~\"$tenant\", code!~\"2..\"}[$interval]))", + "expr": "sum by (tenant, code) (rate(http_requests_total{job=~\"$job\", tenant=~\"$tenant\", code!~\"2..\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{code}} - {{tenant}}", @@ -742,7 +742,7 @@ "steppedLine": 
false, "targets": [ { - "expr": "sum(http_inflight_requests{job=~\"$job\", tenant=~\"$tenant\"}) by (job, tenant, code)", + "expr": "sum by (job, tenant ,method) (http_inflight_requests{job=~\"$job\", tenant=~\"$tenant\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{method}} - {{tenant}}", diff --git a/mixin/dashboards/receive.libsonnet b/mixin/dashboards/receive.libsonnet index 6394461c0f..ad34e95ccc 100644 --- a/mixin/dashboards/receive.libsonnet +++ b/mixin/dashboards/receive.libsonnet @@ -22,6 +22,7 @@ local utils = import '../lib/utils.libsonnet'; local grpcUnaryWriteSelector = utils.joinLabels([thanos.receive.dashboard.selector, 'grpc_type="unary"', 'grpc_method="RemoteWrite"']); local grpcUnaryReadSelector = utils.joinLabels([thanos.receive.dashboard.selector, 'grpc_type="unary"', 'grpc_method!="RemoteWrite"']); local grpcServerStreamSelector = utils.joinLabels([thanos.receive.dashboard.selector, 'grpc_type="server_stream"']); + local tenantWithHttpCodeSelector = std.join(', ', ['tenant', 'code']); local tenantHttpCode2XXSelector = std.join(', ', [thanos.receive.dashboard.tenantSelector, 'code=~"2.."']); local tenantHttpCodeNot2XXSelector = std.join(', ', [thanos.receive.dashboard.tenantSelector, 'code!~"2.."']); g.dashboard(thanos.receive.title) { @@ -59,9 +60,9 @@ local utils = import '../lib/utils.libsonnet'; .addRow( g.row('WRITE - Incoming Request (tenant focus)') .addPanel( - g.panel('Rate of write requests (per code and tenant)') + + g.panel('Rate of write requests (by tenant and code)') + g.queryPanel( - 'sum by (%s) (rate(http_requests_total{%s}[$interval]))' % [thanos.receive.dashboard.tenantDimensions + ', code', thanos.receive.dashboard.tenantSelector], + 'sum by (%s) (rate(http_requests_total{%s}[$interval]))' % [tenantWithHttpCodeSelector, thanos.receive.dashboard.tenantSelector], '{{code}} - {{tenant}}' ) ) @@ -69,7 +70,7 @@ local utils = import '../lib/utils.libsonnet'; g.panel('Number of errors (by tenant and code)') + 
g.queryPanel( 'sum by (%s) (rate(http_requests_total{%s}[$interval]))' % [ - thanos.receive.dashboard.tenantDimensions + ', code', + tenantWithHttpCodeSelector, tenantHttpCodeNot2XXSelector, ], '{{code}} - {{tenant}}' @@ -117,9 +118,9 @@ local utils = import '../lib/utils.libsonnet'; .addPanel( g.panel('Inflight requests (per tenant and method)') + g.queryPanel( - 'sum(http_inflight_requests{%s}) by (%s)' % [ + 'sum by (%s) (http_inflight_requests{%s})' % [ + std.join(' ,', [thanos.receive.dashboard.tenantDimensions, 'method']), thanos.receive.dashboard.tenantSelector, - thanos.receive.dashboard.tenantDimensions + ', code', ], '{{method}} - {{tenant}}' ) From ee1db1b662ba760a4424c6fa5375a6985b015e17 Mon Sep 17 00:00:00 2001 From: Douglas Camata <159076+douglascamata@users.noreply.github.com> Date: Tue, 5 Jul 2022 09:44:57 +0000 Subject: [PATCH 5/9] Refactor one more usage of code and tenant labels Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> --- examples/dashboards/receive.json | 2 +- mixin/dashboards/receive.libsonnet | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/dashboards/receive.json b/examples/dashboards/receive.json index 73ebe55a63..f989622fb9 100644 --- a/examples/dashboards/receive.json +++ b/examples/dashboards/receive.json @@ -906,7 +906,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(thanos_receive_write_timeseries_bucket{job=~\"$job\", tenant=~\"$tenant\", code!~\"2..\"}[$interval])) by (job, tenant, code) ", + "expr": "sum(rate(thanos_receive_write_timeseries_bucket{job=~\"$job\", tenant=~\"$tenant\", code!~\"2..\"}[$interval])) by (tenant, code) ", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{code}} - {{tenant}}", diff --git a/mixin/dashboards/receive.libsonnet b/mixin/dashboards/receive.libsonnet index ad34e95ccc..9d14fc4574 100644 --- a/mixin/dashboards/receive.libsonnet +++ b/mixin/dashboards/receive.libsonnet @@ -143,7 +143,7 @@ local utils = import 
'../lib/utils.libsonnet'; g.queryPanel( 'sum(rate(thanos_receive_write_timeseries_bucket{%s}[$interval])) by (%s) ' % [ tenantHttpCodeNot2XXSelector, - thanos.receive.dashboard.tenantDimensions + ', code', + tenantWithHttpCodeSelector, ], '{{code}} - {{tenant}}' ) From ed541046c37dcc2413a05d62b6ebbe173fb51884 Mon Sep 17 00:00:00 2001 From: Douglas Camata <159076+douglascamata@users.noreply.github.com> Date: Tue, 5 Jul 2022 10:04:25 +0000 Subject: [PATCH 6/9] Filter tenant metrics to the Receive handler Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> --- examples/dashboards/receive.json | 12 ++++++------ mixin/dashboards/receive.libsonnet | 27 +++++++++++++++------------ 2 files changed, 21 insertions(+), 18 deletions(-) diff --git a/examples/dashboards/receive.json b/examples/dashboards/receive.json index f989622fb9..c0222b2efe 100644 --- a/examples/dashboards/receive.json +++ b/examples/dashboards/receive.json @@ -350,7 +350,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (tenant, code) (rate(http_requests_total{job=~\"$job\", tenant=~\"$tenant\"}[$interval]))", + "expr": "sum by (tenant, code) (rate(http_requests_total{job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{code}} - {{tenant}}", @@ -426,7 +426,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (tenant, code) (rate(http_requests_total{job=~\"$job\", tenant=~\"$tenant\", code!~\"2..\"}[$interval]))", + "expr": "sum by (tenant, code) (rate(http_requests_total{job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\", code!~\"2..\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{code}} - {{tenant}}", @@ -502,7 +502,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (job, tenant) (rate(http_request_duration_seconds_sum{job=~\"$job\", tenant=~\"$tenant\"}[$interval])) / sum by (job, tenant) 
(http_request_duration_seconds_count{job=~\"$job\", tenant=~\"$tenant\"})", + "expr": "sum by (job, tenant) (rate(http_request_duration_seconds_sum{job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\"}[$interval])) / sum by (job, tenant) (http_request_duration_seconds_count{job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{tenant}}", @@ -590,7 +590,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (job, tenant) (rate(http_request_size_bytes_sum{job=~\"$job\", tenant=~\"$tenant\", code=~\"2..\"}[$interval]))/ sum by (job, tenant) (rate(http_request_size_bytes_count{job=~\"$job\", tenant=~\"$tenant\", code=~\"2..\"}[$interval]))", + "expr": "sum by (job, tenant) (rate(http_request_size_bytes_sum{job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\", code=~\"2..\"}[$interval]))/ sum by (job, tenant) (rate(http_request_size_bytes_count{job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\", code=~\"2..\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{tenant}}", @@ -666,7 +666,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (job, tenant) (rate(http_request_size_bytes_sum{job=~\"$job\", tenant=~\"$tenant\", code!~\"2..\"}[$interval]))/ sum by (job, tenant) (rate(http_request_size_bytes_count{job=~\"$job\", tenant=~\"$tenant\", code!~\"2..\"}[$interval]))", + "expr": "sum by (job, tenant) (rate(http_request_size_bytes_sum{job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\", code!~\"2..\"}[$interval]))/ sum by (job, tenant) (rate(http_request_size_bytes_count{job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\", code!~\"2..\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{tenant}}", @@ -742,7 +742,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (job, tenant ,method) (http_inflight_requests{job=~\"$job\", tenant=~\"$tenant\"})", + "expr": "sum by (job, tenant, method) 
(http_inflight_requests{job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{method}} - {{tenant}}", diff --git a/mixin/dashboards/receive.libsonnet b/mixin/dashboards/receive.libsonnet index 9d14fc4574..dff6571deb 100644 --- a/mixin/dashboards/receive.libsonnet +++ b/mixin/dashboards/receive.libsonnet @@ -22,9 +22,12 @@ local utils = import '../lib/utils.libsonnet'; local grpcUnaryWriteSelector = utils.joinLabels([thanos.receive.dashboard.selector, 'grpc_type="unary"', 'grpc_method="RemoteWrite"']); local grpcUnaryReadSelector = utils.joinLabels([thanos.receive.dashboard.selector, 'grpc_type="unary"', 'grpc_method!="RemoteWrite"']); local grpcServerStreamSelector = utils.joinLabels([thanos.receive.dashboard.selector, 'grpc_type="server_stream"']); - local tenantWithHttpCodeSelector = std.join(', ', ['tenant', 'code']); - local tenantHttpCode2XXSelector = std.join(', ', [thanos.receive.dashboard.tenantSelector, 'code=~"2.."']); - local tenantHttpCodeNot2XXSelector = std.join(', ', [thanos.receive.dashboard.tenantSelector, 'code!~"2.."']); + + local tenantReceiveHandlerSelector = utils.joinLabels([thanos.receive.dashboard.tenantSelector, 'handler="receive"']); + local tenantHttpCode2XXSelector = std.join(', ', [tenantReceiveHandlerSelector, 'code=~"2.."']); + local tenantHttpCodeNot2XXSelector = std.join(', ', [tenantReceiveHandlerSelector, 'code!~"2.."']); + + local tenantWithHttpCodeDimensions = std.join(', ', ['tenant', 'code']); g.dashboard(thanos.receive.title) { templating+: { list+: [ @@ -62,7 +65,7 @@ local utils = import '../lib/utils.libsonnet'; .addPanel( g.panel('Rate of write requests (by tenant and code)') + g.queryPanel( - 'sum by (%s) (rate(http_requests_total{%s}[$interval]))' % [tenantWithHttpCodeSelector, thanos.receive.dashboard.tenantSelector], + 'sum by (%s) (rate(http_requests_total{%s}[$interval]))' % [tenantWithHttpCodeDimensions, tenantReceiveHandlerSelector], 
'{{code}} - {{tenant}}' ) ) @@ -70,7 +73,7 @@ local utils = import '../lib/utils.libsonnet'; g.panel('Number of errors (by tenant and code)') + g.queryPanel( 'sum by (%s) (rate(http_requests_total{%s}[$interval]))' % [ - tenantWithHttpCodeSelector, + tenantWithHttpCodeDimensions, tenantHttpCodeNot2XXSelector, ], '{{code}} - {{tenant}}' @@ -81,9 +84,9 @@ local utils = import '../lib/utils.libsonnet'; g.queryPanel( 'sum by (%s) (rate(http_request_duration_seconds_sum{%s}[$interval])) / sum by (%s) (http_request_duration_seconds_count{%s})' % [ thanos.receive.dashboard.tenantDimensions, - thanos.receive.dashboard.tenantSelector, + tenantReceiveHandlerSelector, thanos.receive.dashboard.tenantDimensions, - thanos.receive.dashboard.tenantSelector, + tenantReceiveHandlerSelector, ], '{{tenant}}' ) @@ -119,8 +122,8 @@ local utils = import '../lib/utils.libsonnet'; g.panel('Inflight requests (per tenant and method)') + g.queryPanel( 'sum by (%s) (http_inflight_requests{%s})' % [ - std.join(' ,', [thanos.receive.dashboard.tenantDimensions, 'method']), - thanos.receive.dashboard.tenantSelector, + std.join(', ', [thanos.receive.dashboard.tenantDimensions, 'method']), + tenantReceiveHandlerSelector, ], '{{method}} - {{tenant}}' ) @@ -132,7 +135,7 @@ local utils = import '../lib/utils.libsonnet'; g.panel('Rate of series received (per tenant, only 2XX)') + g.queryPanel( 'sum(rate(thanos_receive_write_timeseries_bucket{%s}[$interval])) by (%s) ' % [ - tenantHttpCode2XXSelector, + utils.joinLabels([thanos.receive.dashboard.tenantSelector, 'code=~"2.."']), thanos.receive.dashboard.tenantDimensions, ], '{{tenant}}' @@ -142,8 +145,8 @@ local utils = import '../lib/utils.libsonnet'; g.panel('Rate of series not written (per tenant and code, non 2XX)') + g.queryPanel( 'sum(rate(thanos_receive_write_timeseries_bucket{%s}[$interval])) by (%s) ' % [ - tenantHttpCodeNot2XXSelector, - tenantWithHttpCodeSelector, + utils.joinLabels([thanos.receive.dashboard.tenantSelector, 'code!~"2.."']), 
+ tenantWithHttpCodeDimensions, ], '{{code}} - {{tenant}}' ) From 374e27704f79d09034caf256eabcafdcc2393498 Mon Sep 17 00:00:00 2001 From: Douglas Camata <159076+douglascamata@users.noreply.github.com> Date: Tue, 5 Jul 2022 12:26:02 +0000 Subject: [PATCH 7/9] Format math expression properly Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> --- examples/dashboards/receive.json | 4 ++-- mixin/dashboards/receive.libsonnet | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/dashboards/receive.json b/examples/dashboards/receive.json index c0222b2efe..60b97c5ba5 100644 --- a/examples/dashboards/receive.json +++ b/examples/dashboards/receive.json @@ -590,7 +590,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (job, tenant) (rate(http_request_size_bytes_sum{job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\", code=~\"2..\"}[$interval]))/ sum by (job, tenant) (rate(http_request_size_bytes_count{job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\", code=~\"2..\"}[$interval]))", + "expr": "sum by (job, tenant) (rate(http_request_size_bytes_sum{job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\", code=~\"2..\"}[$interval])) / sum by (job, tenant) (rate(http_request_size_bytes_count{job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\", code=~\"2..\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{tenant}}", @@ -666,7 +666,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (job, tenant) (rate(http_request_size_bytes_sum{job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\", code!~\"2..\"}[$interval]))/ sum by (job, tenant) (rate(http_request_size_bytes_count{job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\", code!~\"2..\"}[$interval]))", + "expr": "sum by (job, tenant) (rate(http_request_size_bytes_sum{job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\", code!~\"2..\"}[$interval])) / sum by (job, tenant) 
(rate(http_request_size_bytes_count{job=~\"$job\", tenant=~\"$tenant\", handler=\"receive\", code!~\"2..\"}[$interval]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{tenant}}", diff --git a/mixin/dashboards/receive.libsonnet b/mixin/dashboards/receive.libsonnet index dff6571deb..d521ee3b56 100644 --- a/mixin/dashboards/receive.libsonnet +++ b/mixin/dashboards/receive.libsonnet @@ -97,7 +97,7 @@ local utils = import '../lib/utils.libsonnet'; .addPanel( g.panel('Average successful HTTP request size (per tenant and code, only 2XX)') + g.queryPanel( - 'sum by (%s) (rate(http_request_size_bytes_sum{%s}[$interval]))/ sum by (%s) (rate(http_request_size_bytes_count{%s}[$interval]))' % [ + 'sum by (%s) (rate(http_request_size_bytes_sum{%s}[$interval])) / sum by (%s) (rate(http_request_size_bytes_count{%s}[$interval]))' % [ thanos.receive.dashboard.tenantDimensions, tenantHttpCode2XXSelector, thanos.receive.dashboard.tenantDimensions, @@ -109,7 +109,7 @@ local utils = import '../lib/utils.libsonnet'; .addPanel( g.panel('Average failed HTTP request size (per tenant and code, non 2XX)') + g.queryPanel( - 'sum by (%s) (rate(http_request_size_bytes_sum{%s}[$interval]))/ sum by (%s) (rate(http_request_size_bytes_count{%s}[$interval]))' % [ + 'sum by (%s) (rate(http_request_size_bytes_sum{%s}[$interval])) / sum by (%s) (rate(http_request_size_bytes_count{%s}[$interval]))' % [ thanos.receive.dashboard.tenantDimensions, tenantHttpCodeNot2XXSelector, thanos.receive.dashboard.tenantDimensions, From 513f7a1cfb835f6757b1599625de05cbcbc79420 Mon Sep 17 00:00:00 2001 From: Douglas Camata <159076+douglascamata@users.noreply.github.com> Date: Tue, 5 Jul 2022 14:12:26 +0000 Subject: [PATCH 8/9] Update CHANGELOG Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5fe29bf5a0..ceb7166fa1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 
@@ We use *breaking :warning:* to mark changes that are not backward compatible (re ### Added - [#5440](https://github.com/thanos-io/thanos/pull/5440) HTTP metrics: export number of in-flight HTTP requests. +- [#5472](https://github.com/thanos-io/thanos/pull/5472) Receive: add new tenant metrics to example dashboard. ### Changed From 4aa6aafe0d748bb0c77d78317f2684f717979ad2 Mon Sep 17 00:00:00 2001 From: Douglas Camata <159076+douglascamata@users.noreply.github.com> Date: Tue, 5 Jul 2022 14:17:39 +0000 Subject: [PATCH 9/9] Add samples charts to series & samples row Signed-off-by: Douglas Camata <159076+douglascamata@users.noreply.github.com> --- examples/dashboards/receive.json | 190 ++++++++++++++++++++++++++--- mixin/dashboards/receive.libsonnet | 20 +++ 2 files changed, 191 insertions(+), 19 deletions(-) diff --git a/examples/dashboards/receive.json b/examples/dashboards/receive.json index 60b97c5ba5..01352c42f9 100644 --- a/examples/dashboards/receive.json +++ b/examples/dashboards/receive.json @@ -825,7 +825,7 @@ "renderer": "flot", "seriesOverrides": [ ], "spaceLength": 10, - "span": 6, + "span": 3, "stack": false, "steppedLine": false, "targets": [ @@ -901,7 +901,7 @@ "renderer": "flot", "seriesOverrides": [ ], "spaceLength": 10, - "span": 6, + "span": 3, "stack": false, "steppedLine": false, "targets": [ @@ -949,6 +949,158 @@ "show": false } ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 12, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": 
"sum(rate(thanos_receive_write_samples_bucket{job=~\"$job\", tenant=~\"$tenant\", code=~\"2..\"}[$interval])) by (job, tenant) ", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{tenant}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate of samples received (per tenant, only 2XX)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 13, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(thanos_receive_write_samples_bucket{job=~\"$job\", tenant=~\"$tenant\", code!~\"2..\"}[$interval])) by (tenant, code) ", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{code}} - {{tenant}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Rate of samples not written (per tenant and code, non 2XX)", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + 
}, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] } ], "repeat": null, @@ -970,7 +1122,7 @@ "datasource": "$datasource", "description": "Shows rate of replications to other receive nodes.", "fill": 1, - "id": 12, + "id": 14, "legend": { "avg": false, "current": false, @@ -1049,7 +1201,7 @@ "datasource": "$datasource", "description": "Shows ratio of errors compared to the total number of replications to other receive nodes.", "fill": 10, - "id": 13, + "id": 15, "legend": { "avg": false, "current": false, @@ -1137,7 +1289,7 @@ "datasource": "$datasource", "description": "Shows rate of forwarded requests to other receive nodes.", "fill": 1, - "id": 14, + "id": 16, "legend": { "avg": false, "current": false, @@ -1216,7 +1368,7 @@ "datasource": "$datasource", "description": "Shows ratio of errors compared to the total number of forwareded requests to other receive nodes.", "fill": 10, - "id": 15, + "id": 17, "legend": { "avg": false, "current": false, @@ -1304,7 +1456,7 @@ "datasource": "$datasource", "description": "Shows rate of handled Unary gRPC requests from queriers.", "fill": 10, - "id": 16, + "id": 18, "legend": { "avg": false, "current": false, @@ -1455,7 +1607,7 @@ "datasource": "$datasource", "description": "Shows ratio of errors compared to the total number of handled requests from queriers.", "fill": 10, - "id": 17, + "id": 19, "legend": { "avg": false, "current": false, @@ -1531,7 +1683,7 @@ "datasource": "$datasource", "description": "Shows how long has it taken to handle requests from queriers, in quantiles.", "fill": 1, - "id": 18, + "id": 20, "legend": { "avg": false, "current": false, @@ -1661,7 +1813,7 @@ "datasource": "$datasource", "description": "Shows rate of handled Unary gRPC requests from queriers.", "fill": 10, - "id": 19, + "id": 21, "legend": { "avg": false, 
"current": false, @@ -1812,7 +1964,7 @@ "datasource": "$datasource", "description": "Shows ratio of errors compared to the total number of handled requests from queriers.", "fill": 10, - "id": 20, + "id": 22, "legend": { "avg": false, "current": false, @@ -1888,7 +2040,7 @@ "datasource": "$datasource", "description": "Shows how long has it taken to handle requests from queriers, in quantiles.", "fill": 1, - "id": 21, + "id": 23, "legend": { "avg": false, "current": false, @@ -2018,7 +2170,7 @@ "datasource": "$datasource", "description": "Shows rate of handled Streamed gRPC requests from queriers.", "fill": 10, - "id": 22, + "id": 24, "legend": { "avg": false, "current": false, @@ -2169,7 +2321,7 @@ "datasource": "$datasource", "description": "Shows ratio of errors compared to the total number of handled requests from queriers.", "fill": 10, - "id": 23, + "id": 25, "legend": { "avg": false, "current": false, @@ -2245,7 +2397,7 @@ "datasource": "$datasource", "description": "Shows how long has it taken to handle requests from queriers, in quantiles.", "fill": 1, - "id": 24, + "id": 26, "legend": { "avg": false, "current": false, @@ -2375,7 +2527,7 @@ "datasource": "$datasource", "description": "Shows the relative time of last successful upload to the object-store bucket.", "fill": 1, - "id": 25, + "id": 27, "legend": { "avg": false, "current": false, @@ -2499,7 +2651,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 26, + "id": 28, "legend": { "avg": false, "current": false, @@ -2615,7 +2767,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 27, + "id": 29, "legend": { "avg": false, "current": false, @@ -2691,7 +2843,7 @@ "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 28, + "id": 30, "legend": { "avg": false, "current": false, diff --git a/mixin/dashboards/receive.libsonnet b/mixin/dashboards/receive.libsonnet index d521ee3b56..e8bbe8ceda 100644 --- a/mixin/dashboards/receive.libsonnet +++ 
b/mixin/dashboards/receive.libsonnet @@ -151,6 +151,26 @@ local utils = import '../lib/utils.libsonnet'; '{{code}} - {{tenant}}' ) ) + .addPanel( + g.panel('Rate of samples received (per tenant, only 2XX)') + + g.queryPanel( + 'sum(rate(thanos_receive_write_samples_bucket{%s}[$interval])) by (%s) ' % [ + utils.joinLabels([thanos.receive.dashboard.tenantSelector, 'code=~"2.."']), + thanos.receive.dashboard.tenantDimensions, + ], + '{{tenant}}' + ) + ) + .addPanel( + g.panel('Rate of samples not written (per tenant and code, non 2XX)') + + g.queryPanel( + 'sum(rate(thanos_receive_write_samples_bucket{%s}[$interval])) by (%s) ' % [ + utils.joinLabels([thanos.receive.dashboard.tenantSelector, 'code!~"2.."']), + tenantWithHttpCodeDimensions, + ], + '{{code}} - {{tenant}}' + ) + ) ) .addRow( g.row('WRITE - Replication')