diff --git a/presto-mixin/.lint b/presto-mixin/.lint new file mode 100644 index 000000000..34fc10d39 --- /dev/null +++ b/presto-mixin/.lint @@ -0,0 +1,17 @@ +exclusions: + template-job-rule: + reason: "Prometheus datasource variable is being named as prometheus_datasource now while linter expects 'datasource'" + panel-datasource-rule: + reason: "Loki datasource variable is being named as loki_datasource now while linter expects 'datasource'" + template-datasource-rule: + reason: "Based on new convention we are using variable names prometheus_datasource and loki_datasource whereas linter expects 'datasource'" + template-instance-rule: + reason: "Based on new convention we are using variable names prometheus_datasource and loki_datasource whereas linter expects 'datasource'" + target-instance-rule: + reason: "The dashboard is a 'cluster' dashboard where the instance refers to nodes, this dashboard focuses only on the cluster view." + entries: + - dashboard: "Presto overview" + panel-title-description-rule: + reason: "Not required for logs volume" + panel-units-rule: + reason: "Logs volume has no unit" diff --git a/presto-mixin/Makefile b/presto-mixin/Makefile new file mode 100644 index 000000000..e8895a00e --- /dev/null +++ b/presto-mixin/Makefile @@ -0,0 +1,34 @@ +JSONNET_FMT := jsonnetfmt -n 2 --max-blank-lines 1 --string-style s --comment-style s + +.PHONY: all +all: build dashboards_out prometheus_alerts.yaml + +vendor: jsonnetfile.json + jb install + +.PHONY: build +build: vendor + +.PHONY: fmt +fmt: + find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \ + xargs -n 1 -- $(JSONNET_FMT) -i + +.PHONY: lint +lint: build + find . 
-name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \ + { rc=0; while read f; do \ + $(JSONNET_FMT) "$$f" | diff -u "$$f" - || rc=1; \ + done; exit $$rc; } + mixtool lint mixin.libsonnet + +dashboards_out: mixin.libsonnet config.libsonnet $(wildcard dashboards/*) + @mkdir -p dashboards_out + mixtool generate dashboards mixin.libsonnet -d dashboards_out + +prometheus_alerts.yaml: mixin.libsonnet config.libsonnet alerts/*.libsonnet # alert thresholds are read from config.libsonnet + mixtool generate alerts mixin.libsonnet -a prometheus_alerts.yaml + +.PHONY: clean +clean: + rm -rf dashboards_out prometheus_alerts.yaml diff --git a/presto-mixin/README.md b/presto-mixin/README.md new file mode 100644 index 000000000..15c60c113 --- /dev/null +++ b/presto-mixin/README.md @@ -0,0 +1,132 @@ +# Presto mixin + +The Presto mixin is a set of configurable Grafana dashboards and alerts. + +The Presto mixin contains the following dashboards: + +- Presto overview +- Presto coordinator +- Presto worker +- Presto logs + +and the following alerts: + +- PrestoHighInsufficientResources +- PrestoHighTaskFailuresWarning +- PrestoHighTaskFailuresCritical +- PrestoHighQueuedTaskCount +- PrestoHighBlockedNodes +- PrestoHighFailedQueriesWarning +- PrestoHighFailedQueriesCritical + +## Presto overview + +The Presto overview dashboard provides details on integration status/alerts, workers/coordinators, error failures, data throughput, blocked nodes, and distributed bytes. +![Presto overview dashboard (queries)](https://storage.googleapis.com/grafanalabs-integration-assets/presto/screenshots/presto_overview_1.png) +![Presto overview dashboard (processing)](https://storage.googleapis.com/grafanalabs-integration-assets/presto/screenshots/presto_overview_2.png) + +## Presto coordinator overview + +The Presto coordinator overview dashboard provides details on various query counts and rates, query execution time, CPU time consumed, CPU input throughput, error failures, JVM metrics, and memory pool information. 
+![Presto coordinator dashboard (queries)](https://storage.googleapis.com/grafanalabs-integration-assets/presto/screenshots/presto_coordinator_1.png) +![Presto coordinator dashboard (JVM)](https://storage.googleapis.com/grafanalabs-integration-assets/presto/screenshots/presto_coordinator_2.png) + +## Presto worker overview + +The Presto worker overview dashboard provides details on various task rates, pool sizes, output positions, data throughput, JVM metrics, and memory pool information. +![Presto worker dashboard (tasks)](https://storage.googleapis.com/grafanalabs-integration-assets/presto/screenshots/presto_worker_1.png) +![Presto worker dashboard (JVM)](https://storage.googleapis.com/grafanalabs-integration-assets/presto/screenshots/presto_worker_2.png) + +## Presto logs + +The Presto logs dashboard provides details on incoming system logs. +![Presto logs dashboard](https://storage.googleapis.com/grafanalabs-integration-assets/presto/screenshots/presto_logs_overview.png) + +Presto system logs are enabled by default in `config.libsonnet` and can be disabled by setting `enableLokiLogs` to `false`. Then run `make` again to regenerate the dashboards: + +``` +{ + _config+:: { + enableLokiLogs: false, + }, +} +``` + +For the dashboard selectors to work on system logs ingested into your logs datasource, add the matching `instance`, `job`, and `presto_cluster` labels to the [scrape configs](https://grafana.com/docs/loki/latest/clients/promtail/configuration/#scrape_configs) so that they match the labels of the ingested metrics. 
+ +```yaml +scrape_configs: + - job_name: integrations/presto + static_configs: + - targets: [localhost] + labels: + job: integrations/presto + instance: "<your-instance-name>" + presto_cluster: "<your-cluster-name>" + __path__: /var/presto/logs/*.log + pipeline_stages: + - multiline: + firstline: '\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}' + - regex: + expression: '\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z\s+(?P<level>\w+)(?P<message>.+)' + - labels: + level: +``` + +## Alerts overview + +- PrestoHighInsufficientResources: The amount of failures that are occurring due to insufficient resources are scaling, causing saturation in the system. +- PrestoHighTaskFailuresWarning: The amount of tasks that are failing is increasing, this might affect query processing and could result in incomplete or incorrect results. +- PrestoHighTaskFailuresCritical: The amount of tasks that are failing has reached a critical level. This might affect query processing and could result in incomplete or incorrect results. +- PrestoHighQueuedTaskCount: The amount of tasks that are being put in queue is increasing. A high number of queued tasks can lead to increased query latencies and degraded system performance. +- PrestoHighBlockedNodes: The amount of nodes that are blocked due to memory restrictions is increasing. Blocked nodes can cause performance degradation and resource starvation. +- PrestoHighFailedQueriesWarning: The amount of queries failing is increasing. Failed queries can prevent users from accessing data, disrupt analytics processes, and might indicate underlying issues with the system or data. +- PrestoHighFailedQueriesCritical: The amount of queries failing has increased to critical levels. Failed queries can prevent users from accessing data, disrupt analytics processes, and might indicate underlying issues with the system or data. + +Default thresholds can be configured in `config.libsonnet`. 
+ +```js +{ + _config+:: { + + // alerts thresholds + alertsHighInsufficientResourceErrors: 0, // count + alertsHighTaskFailuresWarning: 0, // count + alertsHighTaskFailuresCritical: 30, // percent + alertsHighQueuedTaskCount: 5, // count + alertsHighBlockedNodesCount: 0, // count + alertsHighFailedQueryCountWarning: 0, // count + alertsHighFailedQueryCountCritical: 30, // percent + } +} +``` + +## Install tools + +```bash +go install github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb@latest +go install github.com/monitoring-mixins/mixtool/cmd/mixtool@latest +``` + +For linting and formatting, you also need `jsonnetfmt` installed. If you +have a working Go development environment, it's easiest to run the following: + +```bash +go install github.com/google/go-jsonnet/cmd/jsonnetfmt@latest +``` + +The files in `dashboards_out` need to be imported +into your Grafana server. The exact details depend on your environment. + +`prometheus_alerts.yaml` needs to be imported into Prometheus. + +## Generate dashboards and alerts + +Edit `config.libsonnet` if required and then build JSON dashboard files for Grafana: + +```bash +make +``` + +For more advanced uses of mixins, see +https://github.com/monitoring-mixins/docs. 
diff --git a/presto-mixin/alerts/alerts.libsonnet b/presto-mixin/alerts/alerts.libsonnet new file mode 100644 index 000000000..fb0604d1c --- /dev/null +++ b/presto-mixin/alerts/alerts.libsonnet @@ -0,0 +1,130 @@ +{ /* Prometheus alert rules for Presto; every threshold is substituted from $._config. */ + prometheusAlerts+:: { + groups+: [ + { + name: 'presto-alerts', + rules: [ + { + alert: 'PrestoHighInsufficientResources', + expr: ||| + increase(presto_QueryManager_InsufficientResourcesFailures_TotalCount[5m]) > %(alertsHighInsufficientResourceErrors)s + ||| % $._config, + 'for': '5m', + labels: { + severity: 'critical', + }, + annotations: { + summary: 'The amount of failures that are occurring due to insufficient resources are scaling, causing saturation in the system.', + description: + ( + 'The number of insufficient resource failures on {{$labels.instance}} is {{ printf "%%.0f" $value }} which is greater than the threshold of %(alertsHighInsufficientResourceErrors)s.' + ) % $._config, + }, + }, + { + alert: 'PrestoHighTaskFailuresWarning', + expr: ||| + increase(presto_TaskManager_FailedTasks_TotalCount[5m]) > %(alertsHighTaskFailuresWarning)s + ||| % $._config, + 'for': '5m', + labels: { + severity: 'warning', + }, + annotations: { + summary: 'The amount of tasks that are failing is increasing, this might affect query processing and could result in incomplete or incorrect results.', + description: + ( + 'The number of task failures on {{$labels.instance}} is {{ printf "%%.0f" $value }} which is above the threshold of %(alertsHighTaskFailuresWarning)s.' + ) % $._config, + }, + }, + { + alert: 'PrestoHighTaskFailuresCritical', /* NOTE(review): the percentage compares the 5m increase with the 10m increase of the same failure counter - confirm this is the intended ratio */ + expr: ||| + increase(presto_TaskManager_FailedTasks_TotalCount[5m]) / clamp_min(increase(presto_TaskManager_FailedTasks_TotalCount[10m]), 1) * 100 > %(alertsHighTaskFailuresCritical)s + ||| % $._config, + 'for': '5m', + labels: { + severity: 'critical', + }, + annotations: { + summary: 'The amount of tasks that are failing has reached a critical level. 
This might affect query processing and could result in incomplete or incorrect results.', + description: + ( + 'The number of task failures on {{$labels.instance}} is {{ printf "%%.0f" $value }} which is above the threshold of %(alertsHighTaskFailuresCritical)s%%.' + ) % $._config, /* '%%' renders a literal percent sign after the threshold */ + }, + }, + { + alert: 'PrestoHighQueuedTaskCount', + expr: ||| + increase(presto_QueryExecution_Executor_QueuedTaskCount[5m]) > %(alertsHighQueuedTaskCount)s + ||| % $._config, + 'for': '5m', + labels: { + severity: 'warning', + }, + annotations: { + summary: 'The amount of tasks that are being put in queue is increasing. A high number of queued tasks can lead to increased query latencies and degraded system performance.', + description: + ( + 'The number of queued tasks on {{$labels.instance}} is {{ printf "%%.0f" $value }} which is greater than the threshold of %(alertsHighQueuedTaskCount)s' + ) % $._config, + }, + }, + { + alert: 'PrestoHighBlockedNodes', + expr: ||| + increase(presto_ClusterMemoryPool_general_BlockedNodes[5m]) > %(alertsHighBlockedNodesCount)s + ||| % $._config, + 'for': '5m', + labels: { + severity: 'critical', + }, + annotations: { + summary: 'The amount of nodes that are blocked due to memory restrictions is increasing. Blocked nodes can cause performance degradation and resource starvation.', + description: + ( + 'The number of blocked nodes on {{$labels.instance}} is {{ printf "%%.0f" $value }} which is greater than the threshold of %(alertsHighBlockedNodesCount)s' + ) % $._config, + }, + }, + { + alert: 'PrestoHighFailedQueriesWarning', + expr: ||| + increase(presto_QueryManager_FailedQueries_TotalCount[5m]) > %(alertsHighFailedQueryCountWarning)s + ||| % $._config, + 'for': '5m', + labels: { + severity: 'warning', + }, + annotations: { + summary: 'The amount of queries failing is increasing. 
Failed queries can prevent users from accessing data, disrupt analytics processes, and might indicate underlying issues with the system or data.', + description: + ( + 'The number of failed queries on {{$labels.instance}} is {{ printf "%%.0f" $value }} which is greater than the threshold of %(alertsHighFailedQueryCountWarning)s' + ) % $._config, + }, + }, + { + alert: 'PrestoHighFailedQueriesCritical', /* NOTE(review): the percentage compares the 5m increase with the 10m increase of the same failure counter - confirm this is the intended ratio */ + expr: ||| + increase(presto_QueryManager_FailedQueries_TotalCount[5m]) / clamp_min(increase(presto_QueryManager_FailedQueries_TotalCount[10m]), 1) * 100 > %(alertsHighFailedQueryCountCritical)s + ||| % $._config, + 'for': '5m', + labels: { + severity: 'critical', + }, + annotations: { + summary: 'The amount of queries failing has increased to critical levels. Failed queries can prevent users from accessing data, disrupt analytics processes, and might indicate underlying issues with the system or data.', + description: + ( + 'The number of failed queries on {{$labels.instance}} is {{ printf "%%.0f" $value }} which is greater than the threshold of %(alertsHighFailedQueryCountCritical)s%%.' 
+ ) % $._config, /* '%%' renders a literal percent sign after the threshold */ + }, + }, + ], + }, + ], + }, +} diff --git a/presto-mixin/config.libsonnet b/presto-mixin/config.libsonnet new file mode 100644 index 000000000..47a4bec50 --- /dev/null +++ b/presto-mixin/config.libsonnet @@ -0,0 +1,26 @@ +{ + _config+:: { + enableMultiCluster: false, + prestoOverviewSelector: if self.enableMultiCluster then 'job=~"$job", cluster=~"$cluster"' else 'job=~"$job"', + prestoSelector: if self.enableMultiCluster then 'job=~"$job", instance=~"$instance", cluster=~"$cluster"' else 'job=~"$job", instance=~"$instance"', + prestoAlertSelector: if self.enableMultiCluster then 'job=~"${job:regex}", cluster=~"${cluster:regex}"' else 'job=~"${job:regex}"', + prestoOverviewLegendSelector: if self.enableMultiCluster then '{{cluster}} - {{presto_cluster}}' else '{{presto_cluster}}', + prestoLegendSelector: if self.enableMultiCluster then '{{cluster}} - {{instance}}' else '{{instance}}', + filterSelector: 'job=~"integrations/presto"', + + dashboardTags: ['presto-mixin'], + dashboardPeriod: 'now-30m', + dashboardTimezone: 'default', + dashboardRefresh: '1m', + + // alerts thresholds + alertsHighInsufficientResourceErrors: 0, // count + alertsHighTaskFailuresWarning: 0, // count + alertsHighTaskFailuresCritical: 30, // percent + alertsHighQueuedTaskCount: 5, // count + alertsHighBlockedNodesCount: 0, // count + alertsHighFailedQueryCountWarning: 0, // count + alertsHighFailedQueryCountCritical: 30, // percent + enableLokiLogs: true, + }, +} diff --git a/presto-mixin/dashboards/dashboards.libsonnet b/presto-mixin/dashboards/dashboards.libsonnet new file mode 100644 index 000000000..4a71626ea --- /dev/null +++ b/presto-mixin/dashboards/dashboards.libsonnet @@ -0,0 +1,4 @@ +(import 'presto-overview.libsonnet') + +(import 'presto-coordinator.libsonnet') + +(import 'presto-worker.libsonnet') + +(import 'presto-logs-overview.libsonnet') diff --git a/presto-mixin/dashboards/presto-coordinator.libsonnet b/presto-mixin/dashboards/presto-coordinator.libsonnet 
new file mode 100644 index 000000000..8170fa77a --- /dev/null +++ b/presto-mixin/dashboards/presto-coordinator.libsonnet @@ -0,0 +1,1242 @@ +local g = (import 'grafana-builder/grafana.libsonnet'); +local grafana = (import 'grafonnet/grafana.libsonnet'); +local dashboard = grafana.dashboard; +local template = grafana.template; +local prometheus = grafana.prometheus; + +local dashboardUid = 'presto-coordinator'; + +local promDatasourceName = 'prometheus_datasource'; +local getMatcher(cfg) = '%(prestoSelector)s' % cfg; +local getLegendMatcher(cfg) = '%(prestoLegendSelector)s' % cfg; +local promDatasource = { + uid: '${%s}' % promDatasourceName, +}; + +local nonheapMemoryUsagePanel(legendMatcher, matcher) = { + datasource: promDatasource, + targets: [ + prometheus.target( + 'avg (jvm_nonheap_memory_used{' + matcher + ', presto_cluster=~"$presto_cluster"} / clamp_min((jvm_nonheap_memory_used{' + matcher + ', presto_cluster=~"$presto_cluster"} + jvm_nonheap_memory_committed{' + matcher + ', presto_cluster=~"$presto_cluster"}),1))', + datasource=promDatasource, + legendFormat='' + legendMatcher + '', + ), + ], + type: 'gauge', + title: 'Non-heap memory usage', + description: "An average gauge of the JVM's non-heap memory usage across coordinators.", + fieldConfig: { + defaults: { + color: { + mode: 'thresholds', + }, + mappings: [], + thresholds: { + mode: 'absolute', + steps: [ + { + color: 'green', + value: null, + }, + { + color: '#EAB839', + value: 0.7, + }, + { + color: 'red', + value: 0.8, + }, + ], + }, + unit: 'percentunit', + }, + overrides: [], + }, + options: { + minVizHeight: 75, + minVizWidth: 75, + orientation: 'auto', + reduceOptions: { + calcs: [ + 'lastNotNull', + ], + fields: '', + values: false, + }, + showThresholdLabels: false, + showThresholdMarkers: true, + }, + pluginVersion: '10.2.0-62263', +}; + +local heapMemoryUsagePanel(legendMatcher, matcher) = { + datasource: promDatasource, + targets: [ + prometheus.target( + 'avg (jvm_heap_memory_used{' + 
matcher + ', presto_cluster=~"$presto_cluster"} / clamp_min((jvm_heap_memory_used{' + matcher + ', presto_cluster=~"$presto_cluster"} + jvm_heap_memory_committed{' + matcher + ', presto_cluster=~"$presto_cluster"}),1))', + datasource=promDatasource, + legendFormat='' + legendMatcher + '', + ), + ], + type: 'gauge', + title: 'Heap memory usage', + description: "An average gauge of the JVM's heap memory usage across coordinators.", + fieldConfig: { + defaults: { + color: { + mode: 'thresholds', + }, + mappings: [], + thresholds: { + mode: 'absolute', + steps: [ + { + color: 'green', + value: null, + }, + { + color: '#EAB839', + value: 0.7, + }, + { + color: 'red', + value: 0.8, + }, + ], + }, + unit: 'percentunit', + }, + overrides: [], + }, + options: { + minVizHeight: 75, + minVizWidth: 75, + orientation: 'auto', + reduceOptions: { + calcs: [ + 'lastNotNull', + ], + fields: '', + values: false, + }, + showThresholdLabels: false, + showThresholdMarkers: true, + }, + pluginVersion: '10.2.0-62263', +}; + +local errorFailuresOneMinuteCountPanel(legendMatcher, matcher) = { + datasource: promDatasource, + targets: [ + prometheus.target( + 'presto_QueryManager_InternalFailures_OneMinute_Count{' + matcher + ', presto_cluster=~"$presto_cluster"}', + datasource=promDatasource, + legendFormat='' + legendMatcher + ' - internal', + format='time_series', + ), + prometheus.target( + 'presto_QueryManager_UserErrorFailures_OneMinute_Count{' + matcher + ', presto_cluster=~"$presto_cluster"}', + datasource=promDatasource, + legendFormat='' + legendMatcher + ' - user', + ), + ], + type: 'timeseries', + title: 'Error failures - one minute count', + description: 'The number of internal and user error failures occurring on the instance.', + fieldConfig: { + defaults: { + color: { + mode: 'palette-classic', + }, + custom: { + axisBorderShow: false, + axisCenteredZero: false, + axisColorMode: 'text', + axisLabel: '', + axisPlacement: 'auto', + barAlignment: 0, + drawStyle: 'line', + 
fillOpacity: 15, + gradientMode: 'none', + hideFrom: { + legend: false, + tooltip: false, + viz: false, + }, + insertNulls: false, + lineInterpolation: 'stepBefore', + lineWidth: 2, + pointSize: 5, + scaleDistribution: { + type: 'linear', + }, + showPoints: 'never', + spanNulls: false, + stacking: { + group: 'A', + mode: 'normal', + }, + thresholdsStyle: { + mode: 'off', + }, + }, + mappings: [], + thresholds: { + mode: 'absolute', + steps: [ + { + color: 'green', + value: null, + }, + ], + }, + unit: 'none', + }, + overrides: [], + }, + options: { + legend: { + calcs: [], + displayMode: 'list', + placement: 'bottom', + showLegend: true, + }, + tooltip: { + mode: 'multi', + sort: 'desc', + }, + }, +}; + +local normalQueryOneMinuteCountPanel(legendMatcher, matcher) = { + datasource: promDatasource, + targets: [ + prometheus.target( + 'presto_QueryManager_CompletedQueries_OneMinute_Count{' + matcher + ', presto_cluster=~"$presto_cluster"}', + datasource=promDatasource, + legendFormat='' + legendMatcher + ' - completed', + format='time_series', + ), + prometheus.target( + 'presto_QueryManager_RunningQueries{' + matcher + ', presto_cluster=~"$presto_cluster"}', + datasource=promDatasource, + legendFormat='' + legendMatcher + ' - running', + ), + prometheus.target( + 'presto_QueryManager_StartedQueries_OneMinute_Count{' + matcher + ', presto_cluster=~"$presto_cluster"}', + datasource=promDatasource, + legendFormat='' + legendMatcher + ' - started', + ), + ], + type: 'timeseries', + title: 'Normal query - one minute count', + description: 'A count of completed, running, and started queries.', + fieldConfig: { + defaults: { + color: { + fixedColor: '#C8F2C2', + mode: 'palette-classic', + }, + custom: { + axisBorderShow: false, + axisCenteredZero: false, + axisColorMode: 'text', + axisLabel: '', + axisPlacement: 'left', + barAlignment: 0, + drawStyle: 'line', + fillOpacity: 15, + gradientMode: 'none', + hideFrom: { + legend: false, + tooltip: false, + viz: false, + }, + 
insertNulls: false, + lineInterpolation: 'stepBefore', + lineWidth: 2, + pointSize: 5, + scaleDistribution: { + type: 'linear', + }, + showPoints: 'never', + spanNulls: false, + stacking: { + group: 'A', + mode: 'normal', + }, + thresholdsStyle: { + mode: 'off', + }, + }, + mappings: [], + thresholds: { + mode: 'absolute', + steps: [ + { + color: 'green', + value: null, + }, + ], + }, + unit: 'none', + }, + overrides: [], + }, + options: { + legend: { + calcs: [], + displayMode: 'list', + placement: 'bottom', + showLegend: true, + }, + tooltip: { + mode: 'multi', + sort: 'desc', + }, + }, +}; + +local abnormalQueryOneMinuteCountPanel(legendMatcher, matcher) = { + datasource: promDatasource, + targets: [ + prometheus.target( + 'presto_QueryManager_FailedQueries_OneMinute_Count{' + matcher + ', presto_cluster=~"$presto_cluster"}', + datasource=promDatasource, + legendFormat='' + legendMatcher + ' - failed', + format='time_series', + ), + prometheus.target( + 'presto_QueryManager_AbandonedQueries_OneMinute_Count{' + matcher + ', presto_cluster=~"$presto_cluster"}', + datasource=promDatasource, + legendFormat='' + legendMatcher + ' - abandoned', + ), + prometheus.target( + 'presto_QueryManager_CanceledQueries_OneMinute_Count{' + matcher + ', presto_cluster=~"$presto_cluster"}', + datasource=promDatasource, + legendFormat='' + legendMatcher + ' - canceled', + ), + ], + type: 'timeseries', + title: 'Abnormal query - one minute count', + description: 'A count of failed, abandoned, and canceled queries.', + fieldConfig: { + defaults: { + color: { + fixedColor: '#C8F2C2', + mode: 'palette-classic', + }, + custom: { + axisBorderShow: false, + axisCenteredZero: false, + axisColorMode: 'text', + axisLabel: '', + axisPlacement: 'left', + barAlignment: 0, + drawStyle: 'line', + fillOpacity: 15, + gradientMode: 'none', + hideFrom: { + legend: false, + tooltip: false, + viz: false, + }, + insertNulls: false, + lineInterpolation: 'stepBefore', + lineWidth: 2, + pointSize: 5, + 
scaleDistribution: { + type: 'linear', + }, + showPoints: 'never', + spanNulls: false, + stacking: { + group: 'A', + mode: 'normal', + }, + thresholdsStyle: { + mode: 'off', + }, + }, + mappings: [], + thresholds: { + mode: 'absolute', + steps: [ + { + color: 'green', + value: null, + }, + ], + }, + unit: 'none', + }, + overrides: [], + }, + options: { + legend: { + calcs: [], + displayMode: 'list', + placement: 'bottom', + showLegend: true, + }, + tooltip: { + mode: 'multi', + sort: 'desc', + }, + }, +}; + +local normalQueryOneMinuteRatePanel(legendMatcher, matcher) = { + datasource: promDatasource, + targets: [ + prometheus.target( + 'presto_QueryManager_CompletedQueries_OneMinute_Rate{' + matcher + ', presto_cluster=~"$presto_cluster"}', + datasource=promDatasource, + legendFormat='' + legendMatcher + ' - completed', + format='time_series', + ), + prometheus.target( + 'presto_QueryManager_RunningQueries{' + matcher + ', presto_cluster=~"$presto_cluster"}', + datasource=promDatasource, + legendFormat='' + legendMatcher + ' - running', + ), + prometheus.target( + 'presto_QueryManager_StartedQueries_OneMinute_Rate{' + matcher + ', presto_cluster=~"$presto_cluster"}', + datasource=promDatasource, + legendFormat='' + legendMatcher + ' - started', + ), + ], + type: 'timeseries', + title: 'Normal query - one minute rate', + description: 'The rate of normally operating queries such as the completed, running, and started queries.', + fieldConfig: { + defaults: { + color: { + fixedColor: '#C8F2C2', + mode: 'palette-classic', + }, + custom: { + axisBorderShow: false, + axisCenteredZero: false, + axisColorMode: 'text', + axisLabel: '', + axisPlacement: 'left', + barAlignment: 0, + drawStyle: 'line', + fillOpacity: 15, + gradientMode: 'none', + hideFrom: { + legend: false, + tooltip: false, + viz: false, + }, + insertNulls: false, + lineInterpolation: 'stepBefore', + lineWidth: 2, + pointSize: 5, + scaleDistribution: { + type: 'linear', + }, + showPoints: 'never', + 
spanNulls: false, + stacking: { + group: 'A', + mode: 'normal', + }, + thresholdsStyle: { + mode: 'off', + }, + }, + mappings: [], + thresholds: { + mode: 'absolute', + steps: [ + { + color: 'green', + value: null, + }, + ], + }, + unit: 'ops', + }, + overrides: [], + }, + options: { + legend: { + calcs: [], + displayMode: 'list', + placement: 'bottom', + showLegend: true, + }, + tooltip: { + mode: 'multi', + sort: 'desc', + }, + }, +}; + +local abnormalQueryOneMinuteRatePanel(legendMatcher, matcher) = { + datasource: promDatasource, + targets: [ + prometheus.target( + 'rate(presto_QueryManager_FailedQueries_TotalCount{' + matcher + ', presto_cluster=~"$presto_cluster"}[$__rate_interval])', + datasource=promDatasource, + legendFormat='' + legendMatcher + ' - failed', + format='time_series', + ), + prometheus.target( + 'rate(presto_QueryManager_AbandonedQueries_TotalCount{' + matcher + ', presto_cluster=~"$presto_cluster"}[$__rate_interval])', + datasource=promDatasource, + legendFormat='' + legendMatcher + ' - abandoned', + ), + prometheus.target( + 'rate(presto_QueryManager_CanceledQueries_TotalCount{' + matcher + ', presto_cluster=~"$presto_cluster"}[$__rate_interval])', + datasource=promDatasource, + legendFormat='' + legendMatcher + ' - canceled', + ), + ], + type: 'timeseries', + title: 'Abnormal query - one minute rate', + description: 'The rate of abnormal queries such as the failed, abandoned, and canceled queries.', + fieldConfig: { + defaults: { + color: { + fixedColor: '#C8F2C2', + mode: 'palette-classic', + }, + custom: { + axisBorderShow: false, + axisCenteredZero: false, + axisColorMode: 'text', + axisLabel: '', + axisPlacement: 'left', + barAlignment: 0, + drawStyle: 'line', + fillOpacity: 15, + gradientMode: 'none', + hideFrom: { + legend: false, + tooltip: false, + viz: false, + }, + insertNulls: false, + lineInterpolation: 'stepBefore', + lineWidth: 2, + pointSize: 5, + scaleDistribution: { + type: 'linear', + }, + showPoints: 'never', + 
spanNulls: false, + stacking: { + group: 'A', + mode: 'normal', + }, + thresholdsStyle: { + mode: 'off', + }, + }, + mappings: [], + thresholds: { + mode: 'absolute', + steps: [ + { + color: 'green', + value: null, + }, + ], + }, + unit: 'ops', + }, + overrides: [], + }, + options: { + legend: { + calcs: [], + displayMode: 'list', + placement: 'bottom', + showLegend: true, + }, + tooltip: { + mode: 'multi', + sort: 'desc', + }, + }, +}; + +local queryExecutionTimeOneMinuteCountPanel(legendMatcher, matcher) = { + datasource: promDatasource, + targets: [ + prometheus.target( + 'presto_QueryManager_ExecutionTime_OneMinute_P75{' + matcher + ', presto_cluster=~"$presto_cluster"}', + datasource=promDatasource, + legendFormat='' + legendMatcher + ' - p75', + format='time_series', + ), + prometheus.target( + 'presto_QueryManager_ExecutionTime_OneMinute_P95{' + matcher + ', presto_cluster=~"$presto_cluster"}', + datasource=promDatasource, + legendFormat='' + legendMatcher + ' - p95', + ), + prometheus.target( + 'presto_QueryManager_ExecutionTime_OneMinute_P99{' + matcher + ', presto_cluster=~"$presto_cluster"}', + datasource=promDatasource, + legendFormat='' + legendMatcher + ' - p99', + ), + prometheus.target( + 'presto_QueryManager_ExecutionTime_OneMinute_P50{' + matcher + ', presto_cluster=~"$presto_cluster"}', + datasource=promDatasource, + legendFormat='' + legendMatcher + ' - p50', + ), + ], + type: 'timeseries', + title: 'Query execution time - one minute count', + description: 'The time it took to run queries over the past one minute period.\n', + fieldConfig: { + defaults: { + color: { + mode: 'palette-classic', + }, + custom: { + axisBorderShow: false, + axisCenteredZero: false, + axisColorMode: 'text', + axisLabel: '', + axisPlacement: 'auto', + barAlignment: 0, + drawStyle: 'line', + fillOpacity: 15, + gradientMode: 'none', + hideFrom: { + legend: false, + tooltip: false, + viz: false, + }, + insertNulls: false, + lineInterpolation: 'stepBefore', + lineWidth: 2, 
+ pointSize: 5, + scaleDistribution: { + type: 'linear', + }, + showPoints: 'never', + spanNulls: false, + stacking: { + group: 'A', + mode: 'none', + }, + thresholdsStyle: { + mode: 'off', + }, + }, + mappings: [], + thresholds: { + mode: 'absolute', + steps: [ + { + color: 'green', + value: null, + }, + ], + }, + unit: 'ms', + }, + overrides: [], + }, + options: { + legend: { + calcs: [], + displayMode: 'list', + placement: 'bottom', + showLegend: true, + }, + tooltip: { + mode: 'multi', + sort: 'desc', + }, + }, +}; + +local cpuTimeConsumedOneMinuteRatePanel(legendMatcher, matcher) = { + datasource: promDatasource, + targets: [ + prometheus.target( + 'presto_QueryManager_ConsumedCpuTimeSecs_OneMinute_Count{' + matcher + ', presto_cluster=~"$presto_cluster"}', + datasource=promDatasource, + legendFormat='' + legendMatcher + ' ', + format='time_series', + ), + ], + type: 'timeseries', + title: 'CPU time consumed - one minute rate', + description: "CPU time consumed by Presto's QueryManager for executing queries over one-minute intervals, measured in CPU seconds used.", + fieldConfig: { + defaults: { + color: { + mode: 'palette-classic', + }, + custom: { + axisBorderShow: false, + axisCenteredZero: false, + axisColorMode: 'text', + axisLabel: '', + axisPlacement: 'auto', + barAlignment: 0, + drawStyle: 'line', + fillOpacity: 15, + gradientMode: 'none', + hideFrom: { + legend: false, + tooltip: false, + viz: false, + }, + insertNulls: false, + lineInterpolation: 'stepBefore', + lineWidth: 2, + pointSize: 5, + scaleDistribution: { + type: 'linear', + }, + showPoints: 'never', + spanNulls: false, + stacking: { + group: 'A', + mode: 'none', + }, + thresholdsStyle: { + mode: 'off', + }, + }, + mappings: [], + thresholds: { + mode: 'absolute', + steps: [ + { + color: 'green', + value: null, + }, + ], + }, + unit: 's', + }, + overrides: [], + }, + options: { + legend: { + calcs: [], + displayMode: 'list', + placement: 'bottom', + showLegend: true, + }, + tooltip: { + 
mode: 'multi', + sort: 'desc', + }, + }, +}; + +local cpuInputThroughputOneMinuteCountPanel(legendMatcher, matcher) = { + datasource: promDatasource, + targets: [ + prometheus.target( + 'presto_QueryManager_CpuInputByteRate_OneMinute_Total{' + matcher + ', presto_cluster=~"$presto_cluster"}', + datasource=promDatasource, + legendFormat='' + legendMatcher + ' ', + format='time_series', + ), + ], + type: 'timeseries', + title: 'CPU input throughput - one minute count', + description: 'The rate at which input data is being read and processed by the CPU.', + fieldConfig: { + defaults: { + color: { + mode: 'palette-classic', + }, + custom: { + axisBorderShow: false, + axisCenteredZero: false, + axisColorMode: 'text', + axisLabel: '', + axisPlacement: 'auto', + barAlignment: 0, + drawStyle: 'line', + fillOpacity: 15, + gradientMode: 'none', + hideFrom: { + legend: false, + tooltip: false, + viz: false, + }, + insertNulls: false, + lineInterpolation: 'smooth', + lineWidth: 2, + pointSize: 5, + scaleDistribution: { + type: 'linear', + }, + showPoints: 'never', + spanNulls: false, + stacking: { + group: 'A', + mode: 'none', + }, + thresholdsStyle: { + mode: 'off', + }, + }, + mappings: [], + thresholds: { + mode: 'absolute', + steps: [ + { + color: 'green', + value: null, + }, + ], + }, + unit: 'Bps', + }, + overrides: [], + }, + options: { + legend: { + calcs: [], + displayMode: 'list', + placement: 'bottom', + showLegend: true, + }, + tooltip: { + mode: 'multi', + sort: 'desc', + }, + }, +}; + +local jvmMetricsRow = { + datasource: promDatasource, + targets: [], + type: 'row', + title: 'JVM metrics', + collapsed: false, +}; + +local garbageCollectionCount(legendMatcher, matcher) = { + datasource: promDatasource, + targets: [ + prometheus.target( + 'increase(jvm_gc_collection_count{' + matcher + ', presto_cluster=~"$presto_cluster", name="G1 Young Generation"}[$__interval:])', + datasource=promDatasource, + legendFormat='' + legendMatcher + ' ', + interval='1m', + 
format='time_series', + ), + ], + type: 'timeseries', + title: 'Garbage collection count / $__interval', + description: 'The recent increase in the number of garbage collection events for the JVM.', + fieldConfig: { + defaults: { + color: { + mode: 'palette-classic', + }, + custom: { + axisBorderShow: false, + axisCenteredZero: false, + axisColorMode: 'text', + axisLabel: '', + axisPlacement: 'auto', + barAlignment: 0, + drawStyle: 'line', + fillOpacity: 15, + gradientMode: 'none', + hideFrom: { + legend: false, + tooltip: false, + viz: false, + }, + insertNulls: false, + lineInterpolation: 'smooth', + lineWidth: 2, + pointSize: 5, + scaleDistribution: { + type: 'linear', + }, + showPoints: 'never', + spanNulls: false, + stacking: { + group: 'A', + mode: 'normal', + }, + thresholdsStyle: { + mode: 'off', + }, + }, + mappings: [], + thresholds: { + mode: 'absolute', + steps: [ + { + color: 'green', + value: null, + }, + ], + }, + unit: 'none', + }, + overrides: [], + }, + options: { + legend: { + calcs: [], + displayMode: 'list', + placement: 'bottom', + showLegend: true, + }, + tooltip: { + mode: 'multi', + sort: 'desc', + }, + }, +}; + +local garbageCollectionDurationPanel(legendMatcher, matcher) = { + datasource: promDatasource, + targets: [ + prometheus.target( + 'jvm_gc_duration{' + matcher + ', presto_cluster=~"$presto_cluster", name="G1 Young Generation"}', + datasource=promDatasource, + legendFormat='' + legendMatcher + '', + format='time_series', + ), + ], + type: 'timeseries', + title: 'Garbage collection duration', + description: 'The average duration for each garbage collection operation in the JVM.', + fieldConfig: { + defaults: { + color: { + mode: 'palette-classic', + }, + custom: { + axisBorderShow: false, + axisCenteredZero: false, + axisColorMode: 'text', + axisLabel: '', + axisPlacement: 'auto', + barAlignment: 0, + drawStyle: 'line', + fillOpacity: 15, + gradientMode: 'none', + hideFrom: { + legend: false, + tooltip: false, + viz: false, + }, + 
insertNulls: false, + lineInterpolation: 'smooth', + lineWidth: 2, + pointSize: 5, + scaleDistribution: { + type: 'linear', + }, + showPoints: 'never', + spanNulls: false, + stacking: { + group: 'A', + mode: 'normal', + }, + thresholdsStyle: { + mode: 'off', + }, + }, + mappings: [], + thresholds: { + mode: 'absolute', + steps: [ + { + color: 'green', + value: null, + }, + ], + }, + unit: 'ms', + }, + overrides: [], + }, + options: { + legend: { + calcs: [], + displayMode: 'list', + placement: 'bottom', + showLegend: true, + }, + tooltip: { + mode: 'multi', + sort: 'desc', + }, + }, +}; + +local memoryUsedPanel(legendMatcher, matcher) = { + datasource: promDatasource, + targets: [ + prometheus.target( + 'jvm_nonheap_memory_used{' + matcher + ', presto_cluster=~"$presto_cluster"}', + datasource=promDatasource, + legendFormat='' + legendMatcher + ' - non heap', + format='time_series', + ), + prometheus.target( + 'jvm_heap_memory_used{' + matcher + ', presto_cluster=~"$presto_cluster"}', + datasource=promDatasource, + legendFormat='' + legendMatcher + ' - heap', + ), + ], + type: 'timeseries', + title: 'Memory used', + description: 'The heap and non-heap memory used by the JVM.', + fieldConfig: { + defaults: { + color: { + mode: 'palette-classic', + }, + custom: { + axisBorderShow: false, + axisCenteredZero: false, + axisColorMode: 'text', + axisLabel: '', + axisPlacement: 'auto', + barAlignment: 0, + drawStyle: 'line', + fillOpacity: 15, + gradientMode: 'none', + hideFrom: { + legend: false, + tooltip: false, + viz: false, + }, + insertNulls: false, + lineInterpolation: 'smooth', + lineWidth: 2, + pointSize: 5, + scaleDistribution: { + type: 'linear', + }, + showPoints: 'never', + spanNulls: false, + stacking: { + group: 'A', + mode: 'normal', + }, + thresholdsStyle: { + mode: 'off', + }, + }, + mappings: [], + thresholds: { + mode: 'absolute', + steps: [ + { + color: 'green', + value: null, + }, + ], + }, + unit: 'decbytes', + }, + overrides: [], + }, + options: { + 
legend: { + calcs: [], + displayMode: 'list', + placement: 'bottom', + showLegend: true, + }, + tooltip: { + mode: 'multi', + sort: 'desc', + }, + }, +}; + +local memoryCommittedPanel(legendMatcher, matcher) = { + datasource: promDatasource, + targets: [ + prometheus.target( + 'jvm_heap_memory_committed{' + matcher + ', presto_cluster=~"$presto_cluster"}', + datasource=promDatasource, + legendFormat='' + legendMatcher + ' - heap', + format='time_series', + ), + prometheus.target( + 'jvm_nonheap_memory_committed{' + matcher + ', presto_cluster=~"$presto_cluster"}', + datasource=promDatasource, + legendFormat='' + legendMatcher + ' - non heap', + ), + ], + type: 'timeseries', + title: 'Memory committed', + description: 'The heap and non-heap memory committed.', + fieldConfig: { + defaults: { + color: { + mode: 'palette-classic', + }, + custom: { + axisBorderShow: false, + axisCenteredZero: false, + axisColorMode: 'text', + axisLabel: '', + axisPlacement: 'auto', + barAlignment: 0, + drawStyle: 'line', + fillOpacity: 15, + gradientMode: 'none', + hideFrom: { + legend: false, + tooltip: false, + viz: false, + }, + insertNulls: false, + lineInterpolation: 'smooth', + lineWidth: 2, + pointSize: 5, + scaleDistribution: { + type: 'linear', + }, + showPoints: 'never', + spanNulls: false, + stacking: { + group: 'A', + mode: 'normal', + }, + thresholdsStyle: { + mode: 'off', + }, + }, + mappings: [], + thresholds: { + mode: 'absolute', + steps: [ + { + color: 'green', + value: null, + }, + ], + }, + unit: 'decbytes', + }, + overrides: [], + }, + options: { + legend: { + calcs: [], + displayMode: 'list', + placement: 'bottom', + showLegend: true, + }, + tooltip: { + mode: 'multi', + sort: 'desc', + }, + }, +}; + +{ + grafanaDashboards+:: { + 'presto-coordinator.json': + dashboard.new( + 'Presto coordinator', + time_from='%s' % $._config.dashboardPeriod, + tags=($._config.dashboardTags), + timezone='%s' % $._config.dashboardTimezone, + refresh='%s' % $._config.dashboardRefresh, 
+ description='', + uid=dashboardUid, + ) + .addLink(grafana.link.dashboards( + asDropdown=false, + title='Other Presto dashboards', + includeVars=true, + keepTime=true, + tags=($._config.dashboardTags), + )) + .addTemplates( + [ + template.datasource( + promDatasourceName, + 'prometheus', + null, + label='Data Source', + refresh='load' + ), + template.new( + 'job', + promDatasource, + 'label_values(presto_HeartbeatDetector_ActiveCount,job)', + label='Job', + refresh=2, + includeAll=true, + multi=true, + allValues='.+', + sort=0 + ), + template.new( + 'cluster', + promDatasource, + 'label_values(presto_HeartbeatDetector_ActiveCount{job=~"$job"}, cluster)', + label='Cluster', + refresh=2, + includeAll=true, + multi=true, + allValues='', + hide=if $._config.enableMultiCluster then '' else 'variable', + sort=0 + ), + template.new( + 'presto_cluster', + promDatasource, + 'label_values(presto_HeartbeatDetector_ActiveCount{job=~"$job"},presto_cluster)', + label='Presto cluster', + refresh=2, + includeAll=false, + multi=false, + allValues='.*', + sort=0 + ), + template.new( + 'instance', + promDatasource, + 'label_values(presto_HeartbeatDetector_ActiveCount{job=~"$job", presto_cluster=~"$presto_cluster"},instance)', + label='Instance', + refresh=2, + includeAll=false, + multi=true, + allValues='', + sort=0 + ), + ] + ) + .addPanels( + [ + nonheapMemoryUsagePanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 9, w: 6, x: 0, y: 0 } }, + heapMemoryUsagePanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 9, w: 6, x: 6, y: 0 } }, + errorFailuresOneMinuteCountPanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 9, w: 12, x: 12, y: 0 } }, + normalQueryOneMinuteCountPanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 9, w: 12, x: 0, y: 9 } }, + abnormalQueryOneMinuteCountPanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 9, w: 12, x: 12, y: 9 } }, + 
normalQueryOneMinuteRatePanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 9, w: 12, x: 0, y: 18 } }, + abnormalQueryOneMinuteRatePanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 9, w: 12, x: 12, y: 18 } }, + queryExecutionTimeOneMinuteCountPanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 8, w: 24, x: 0, y: 27 } }, + cpuTimeConsumedOneMinuteRatePanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 0, y: 35 } }, + cpuInputThroughputOneMinuteCountPanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 12, y: 35 } }, + jvmMetricsRow { gridPos: { h: 1, w: 24, x: 0, y: 43 } }, + garbageCollectionCount(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 0, y: 44 } }, + garbageCollectionDurationPanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 12, y: 44 } }, + memoryUsedPanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 0, y: 52 } }, + memoryCommittedPanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 12, y: 52 } }, + ] + ), + }, +} diff --git a/presto-mixin/dashboards/presto-logs-overview.libsonnet b/presto-mixin/dashboards/presto-logs-overview.libsonnet new file mode 100644 index 000000000..d963c7e89 --- /dev/null +++ b/presto-mixin/dashboards/presto-logs-overview.libsonnet @@ -0,0 +1,32 @@ +local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; +local logsDashboard = import 'github.com/grafana/jsonnet-libs/logs-lib/logs/main.libsonnet'; +{ + grafanaDashboards+:: + if $._config.enableLokiLogs then { + local prestoLogs = + logsDashboard.new( + 'Presto logs overview', + datasourceName='loki_datasource', + datasourceRegex='', + filterSelector=$._config.filterSelector, + labels=['job', 'presto_cluster', 'instance', 'level'], + formatParser=null, + showLogsVolume=true + ) + { 
+ panels+: + { + logs+: + // presto logs already have timestamp + g.panel.logs.options.withShowTime(false), + }, + dashboards+: + { + logs+: g.dashboard.withLinksMixin($.grafanaDashboards['presto-overview.json'].links) + + g.dashboard.withTags($._config.dashboardTags) + + g.dashboard.withRefresh($._config.dashboardRefresh), + }, + }, + 'presto-logs.json': prestoLogs.dashboards.logs, + } else {}, +} diff --git a/presto-mixin/dashboards/presto-overview.libsonnet b/presto-mixin/dashboards/presto-overview.libsonnet new file mode 100644 index 000000000..6304d9c91 --- /dev/null +++ b/presto-mixin/dashboards/presto-overview.libsonnet @@ -0,0 +1,993 @@ +local g = (import 'grafana-builder/grafana.libsonnet'); +local grafana = (import 'grafonnet/grafana.libsonnet'); +local dashboard = grafana.dashboard; +local template = grafana.template; +local prometheus = grafana.prometheus; + +local dashboardUid = 'presto-overview'; + +local promDatasourceName = 'prometheus_datasource'; +local getMatcher(cfg) = '%(prestoOverviewSelector)s' % cfg; +local getLegendMatcher(cfg) = '%(prestoOverviewLegendSelector)s' % cfg; +local getAlertMatcher(cfg) = '%(prestoAlertSelector)s' % cfg; +local promDatasource = { + uid: '${%s}' % promDatasourceName, +}; + +local activeResourceManagersPanel(legendMatcher, matcher) = { + datasource: promDatasource, + targets: [ + prometheus.target( + 'sum (max by (presto_cluster) (presto_metadata_DiscoveryNodeManager_ActiveResourceManagerCount{' + matcher + ', presto_cluster=~"$presto_cluster"}))', + datasource=promDatasource, + legendFormat='Resource manager', + format='time_series', + ), + ], + type: 'stat', + title: 'Active resource managers', + description: 'The number of active resource managers.', + fieldConfig: { + defaults: { + color: { + mode: 'thresholds', + }, + mappings: [], + thresholds: { + mode: 'absolute', + steps: [ + { + color: 'green', + value: null, + }, + { + color: 'text', + value: 0, + }, + { + color: 'green', + value: 1, + }, + ], + }, + 
unit: 'none', + }, + overrides: [], + }, + options: { + colorMode: 'value', + graphMode: 'none', + justifyMode: 'auto', + orientation: 'auto', + reduceOptions: { + calcs: [ + 'lastNotNull', + ], + fields: '', + values: false, + }, + textMode: 'auto', + }, + pluginVersion: '10.2.0-62263', +}; + +local activeCoordinatorsPanel(legendMatcher, matcher) = { + datasource: promDatasource, + targets: [ + prometheus.target( + 'sum(max by (presto_cluster) (presto_metadata_DiscoveryNodeManager_ActiveCoordinatorCount{' + matcher + ', presto_cluster=~"$presto_cluster"}))', + datasource=promDatasource, + legendFormat='Coordinator', + format='time_series', + ), + ], + type: 'stat', + title: 'Active coordinators', + description: 'The number of active coordinators.', + fieldConfig: { + defaults: { + color: { + mode: 'thresholds', + }, + mappings: [], + thresholds: { + mode: 'absolute', + steps: [ + { + color: 'green', + value: null, + }, + { + color: 'red', + value: 0, + }, + { + color: 'green', + value: 1, + }, + ], + }, + unit: 'none', + }, + overrides: [], + }, + options: { + colorMode: 'value', + graphMode: 'none', + justifyMode: 'auto', + orientation: 'auto', + reduceOptions: { + calcs: [ + 'lastNotNull', + ], + fields: '', + values: false, + }, + textMode: 'auto', + }, + pluginVersion: '10.2.0-62263', +}; + +local activeWorkersPanel(legendMatcher, matcher) = { + datasource: promDatasource, + targets: [ + prometheus.target( + 'sum(max by (presto_cluster) (presto_metadata_DiscoveryNodeManager_ActiveNodeCount{' + matcher + ', presto_cluster=~"$presto_cluster"}) - max by (presto_cluster) (presto_metadata_DiscoveryNodeManager_ActiveCoordinatorCount{' + matcher + ', presto_cluster=~"$presto_cluster"}) - max by (presto_cluster) (presto_metadata_DiscoveryNodeManager_ActiveResourceManagerCount{' + matcher + ', presto_cluster=~"$presto_cluster"}))', + datasource=promDatasource, + legendFormat='Worker', + format='time_series', + ), + ], + type: 'stat', + title: 'Active workers',
+ description: 'The number of active workers.', + fieldConfig: { + defaults: { + color: { + mode: 'thresholds', + }, + mappings: [], + thresholds: { + mode: 'absolute', + steps: [ + { + color: 'green', + value: null, + }, + { + color: 'red', + value: 0, + }, + { + color: 'green', + value: 1, + }, + ], + }, + unit: 'none', + }, + overrides: [], + }, + options: { + colorMode: 'value', + graphMode: 'none', + justifyMode: 'auto', + orientation: 'auto', + reduceOptions: { + calcs: [ + 'lastNotNull', + ], + fields: '', + values: false, + }, + textMode: 'auto', + }, + pluginVersion: '10.2.0-62263', +}; + +local inactiveWorkersPanel(legendMatcher, matcher) = { + datasource: promDatasource, + targets: [ + prometheus.target( + 'sum(max by (presto_cluster) (presto_metadata_DiscoveryNodeManager_InactiveNodeCount{' + matcher + ', presto_cluster=~"$presto_cluster"}))', + datasource=promDatasource, + legendFormat='Worker', + format='time_series', + ), + ], + type: 'stat', + title: 'Inactive workers', + description: 'The number of inactive workers.', + fieldConfig: { + defaults: { + color: { + mode: 'thresholds', + }, + mappings: [], + thresholds: { + mode: 'absolute', + steps: [ + { + color: 'green', + value: null, + }, + { + color: 'red', + value: 3, + }, + ], + }, + unit: 'none', + }, + overrides: [], + }, + options: { + colorMode: 'value', + graphMode: 'none', + justifyMode: 'auto', + orientation: 'auto', + reduceOptions: { + calcs: [ + 'lastNotNull', + ], + fields: '', + values: false, + }, + textMode: 'auto', + }, + pluginVersion: '10.2.0-62263', +}; + +local completedQueriesOneMinuteCountPanel(legendMatcher, matcher) = { + datasource: promDatasource, + targets: [ + prometheus.target( + 'presto_QueryManager_CompletedQueries_OneMinute_Count{' + matcher + ', presto_cluster=~"$presto_cluster"}', + datasource=promDatasource, + legendFormat='' + legendMatcher + '', + format='time_series', + ), + ], + type: 'timeseries', + title: 'Completed queries - one minute count', + 
description: 'The number of completed queries.', + fieldConfig: { + defaults: { + color: { + fixedColor: '#C8F2C2', + mode: 'palette-classic', + }, + custom: { + axisBorderShow: false, + axisCenteredZero: false, + axisColorMode: 'text', + axisLabel: '', + axisPlacement: 'auto', + barAlignment: 0, + drawStyle: 'line', + fillOpacity: 15, + gradientMode: 'none', + hideFrom: { + legend: false, + tooltip: false, + viz: false, + }, + insertNulls: false, + lineInterpolation: 'stepBefore', + lineWidth: 2, + pointSize: 5, + scaleDistribution: { + type: 'linear', + }, + showPoints: 'never', + spanNulls: false, + stacking: { + group: 'A', + mode: 'none', + }, + thresholdsStyle: { + mode: 'off', + }, + }, + mappings: [], + thresholds: { + mode: 'absolute', + steps: [ + { + color: 'green', + value: null, + }, + ], + }, + unit: 'none', + }, + overrides: [], + }, + options: { + legend: { + calcs: [], + displayMode: 'list', + placement: 'bottom', + showLegend: true, + }, + tooltip: { + mode: 'multi', + sort: 'desc', + }, + }, +}; + +local alertsPanel(matcher) = { + datasource: promDatasource, + targets: [ + prometheus.target( + '', + datasource=promDatasource, + legendFormat='', + format='time_series', + ), + ], + type: 'alertlist', + title: 'Alerts', + description: 'Reports firing alerts.', + options: { + alertInstanceLabelFilter: '{' + matcher + ', presto_cluster=~"${presto_cluster:regex}"}', + alertName: '', + dashboardAlerts: false, + datasource: 'Prometheus', + groupBy: [], + groupMode: 'default', + maxItems: 20, + sortOrder: 1, + stateFilter: { + 'error': true, + firing: true, + noData: false, + normal: true, + pending: true, + }, + viewMode: 'list', + }, +}; + +local userErrorFailuresOneMinuteRatePanel(legendMatcher, matcher) = { + datasource: promDatasource, + targets: [ + prometheus.target( + 'presto_QueryManager_UserErrorFailures_OneMinute_Rate{' + matcher + ', presto_cluster=~"$presto_cluster"}', + datasource=promDatasource, + legendFormat='' + legendMatcher + '', + 
format='time_series', + ), + ], + type: 'timeseries', + title: 'User error failures - one minute rate', + description: 'The rate of user error failures occurring across the clusters.', + fieldConfig: { + defaults: { + color: { + mode: 'palette-classic', + }, + custom: { + axisBorderShow: false, + axisCenteredZero: false, + axisColorMode: 'text', + axisLabel: '', + axisPlacement: 'auto', + barAlignment: 0, + drawStyle: 'line', + fillOpacity: 15, + gradientMode: 'none', + hideFrom: { + legend: false, + tooltip: false, + viz: false, + }, + insertNulls: false, + lineInterpolation: 'stepBefore', + lineWidth: 2, + pointSize: 5, + scaleDistribution: { + type: 'linear', + }, + showPoints: 'never', + spanNulls: false, + stacking: { + group: 'A', + mode: 'none', + }, + thresholdsStyle: { + mode: 'off', + }, + }, + mappings: [], + thresholds: { + mode: 'absolute', + steps: [ + { + color: 'green', + value: null, + }, + ], + }, + unit: 'err/s', + }, + overrides: [], + }, + options: { + legend: { + calcs: [], + displayMode: 'list', + placement: 'bottom', + showLegend: true, + }, + tooltip: { + mode: 'multi', + sort: 'desc', + }, + }, +}; + +local queuedQueriesPanel(legendMatcher, matcher) = { + datasource: promDatasource, + targets: [ + prometheus.target( + 'presto_QueryManager_QueuedQueries{' + matcher + ', presto_cluster=~"$presto_cluster"}', + datasource=promDatasource, + legendFormat='' + legendMatcher + '', + format='time_series', + ), + ], + type: 'timeseries', + title: 'Queued queries', + description: 'The number of queued queries.', + fieldConfig: { + defaults: { + color: { + mode: 'palette-classic', + }, + custom: { + axisBorderShow: false, + axisCenteredZero: false, + axisColorMode: 'text', + axisLabel: '', + axisPlacement: 'auto', + barAlignment: 0, + drawStyle: 'bars', + fillOpacity: 15, + gradientMode: 'none', + hideFrom: { + legend: false, + tooltip: false, + viz: false, + }, + insertNulls: false, + lineInterpolation: 'smooth', + lineWidth: 2, + pointSize: 5, + 
scaleDistribution: { + type: 'linear', + }, + showPoints: 'never', + spanNulls: false, + stacking: { + group: 'A', + mode: 'none', + }, + thresholdsStyle: { + mode: 'off', + }, + }, + mappings: [], + thresholds: { + mode: 'absolute', + steps: [ + { + color: 'green', + value: null, + }, + ], + }, + unit: 'none', + }, + overrides: [], + }, + options: { + legend: { + calcs: [], + displayMode: 'list', + placement: 'bottom', + showLegend: true, + }, + tooltip: { + mode: 'multi', + sort: 'desc', + }, + }, +}; + +local blockedNodesPanel(legendMatcher, matcher) = { + datasource: promDatasource, + targets: [ + prometheus.target( + 'presto_ClusterMemoryPool_general_BlockedNodes{' + matcher + ', presto_cluster=~"$presto_cluster"}', + datasource=promDatasource, + legendFormat='' + legendMatcher + '', + format='time_series', + ), + ], + type: 'timeseries', + title: 'Blocked nodes', + description: 'The number of nodes that are blocked due to memory restrictions.', + fieldConfig: { + defaults: { + color: { + mode: 'palette-classic', + }, + custom: { + axisBorderShow: false, + axisCenteredZero: false, + axisColorMode: 'text', + axisLabel: '', + axisPlacement: 'auto', + barAlignment: 0, + drawStyle: 'line', + fillOpacity: 15, + gradientMode: 'none', + hideFrom: { + legend: false, + tooltip: false, + viz: false, + }, + insertNulls: false, + lineInterpolation: 'stepBefore', + lineWidth: 2, + pointSize: 5, + scaleDistribution: { + type: 'linear', + }, + showPoints: 'never', + spanNulls: false, + stacking: { + group: 'A', + mode: 'normal', + }, + thresholdsStyle: { + mode: 'off', + }, + }, + mappings: [], + thresholds: { + mode: 'absolute', + steps: [ + { + color: 'green', + value: null, + }, + ], + }, + unit: 'none', + }, + overrides: [], + }, + options: { + legend: { + calcs: [], + displayMode: 'list', + placement: 'bottom', + showLegend: true, + }, + tooltip: { + mode: 'multi', + sort: 'desc', + }, + }, +}; + +local internalErrorFailuresOneMinuteRatePanel(legendMatcher, matcher) = { 
+ datasource: promDatasource, + targets: [ + prometheus.target( + 'presto_QueryManager_InternalFailures_OneMinute_Rate{' + matcher + ', presto_cluster=~"$presto_cluster"}', + datasource=promDatasource, + legendFormat='' + legendMatcher + '', + format='time_series', + ), + ], + type: 'timeseries', + title: 'Internal error failures - one minute rate', + description: 'The rate of internal failures occurring across the clusters.', + fieldConfig: { + defaults: { + color: { + mode: 'palette-classic', + }, + custom: { + axisBorderShow: false, + axisCenteredZero: false, + axisColorMode: 'text', + axisLabel: '', + axisPlacement: 'auto', + barAlignment: 0, + drawStyle: 'line', + fillOpacity: 15, + gradientMode: 'none', + hideFrom: { + legend: false, + tooltip: false, + viz: false, + }, + insertNulls: false, + lineInterpolation: 'stepBefore', + lineWidth: 2, + pointSize: 5, + scaleDistribution: { + type: 'linear', + }, + showPoints: 'never', + spanNulls: false, + stacking: { + group: 'A', + mode: 'none', + }, + thresholdsStyle: { + mode: 'off', + }, + }, + mappings: [], + thresholds: { + mode: 'absolute', + steps: [ + { + color: 'green', + value: null, + }, + ], + }, + unit: 'err/s', + }, + overrides: [], + }, + options: { + legend: { + calcs: [], + displayMode: 'list', + placement: 'bottom', + showLegend: true, + }, + tooltip: { + mode: 'multi', + sort: 'desc', + }, + }, +}; + +local clusterMemoryDistributedBytesPanel(legendMatcher, matcher) = { + datasource: promDatasource, + targets: [ + prometheus.target( + 'sum by (presto_cluster) (presto_ClusterMemoryPool_general_FreeDistributedBytes{' + matcher + ', presto_cluster=~"$presto_cluster"})', + datasource=promDatasource, + legendFormat='' + legendMatcher + ' - free', + format='time_series', + ), + prometheus.target( + 'sum by (presto_cluster) (presto_ClusterMemoryPool_reserved_FreeDistributedBytes{' + matcher + ', presto_cluster=~"$presto_cluster"})', + datasource=promDatasource, + legendFormat='' + legendMatcher + ' - 
reserved', + format='time_series', + ), + ], + type: 'timeseries', + title: 'Cluster memory distributed bytes', + description: 'The amount of memory available across the clusters.', + fieldConfig: { + defaults: { + color: { + mode: 'palette-classic', + }, + custom: { + axisBorderShow: false, + axisCenteredZero: false, + axisColorMode: 'text', + axisLabel: '', + axisPlacement: 'auto', + barAlignment: 0, + drawStyle: 'line', + fillOpacity: 15, + gradientMode: 'none', + hideFrom: { + legend: false, + tooltip: false, + viz: false, + }, + insertNulls: false, + lineInterpolation: 'smooth', + lineWidth: 2, + pointSize: 5, + scaleDistribution: { + type: 'linear', + }, + showPoints: 'never', + spanNulls: false, + stacking: { + group: 'A', + mode: 'normal', + }, + thresholdsStyle: { + mode: 'off', + }, + }, + mappings: [], + thresholds: { + mode: 'absolute', + steps: [ + { + color: 'green', + value: null, + }, + ], + }, + unit: 'decbytes', + }, + overrides: [], + }, + options: { + legend: { + calcs: [], + displayMode: 'list', + placement: 'bottom', + showLegend: true, + }, + tooltip: { + mode: 'multi', + sort: 'desc', + }, + }, +}; + +local InsufficientResourceFailuresOneMinuteRatePanel(legendMatcher, matcher) = { + datasource: promDatasource, + targets: [ + prometheus.target( + 'presto_QueryManager_InsufficientResourcesFailures_OneMinute_Rate{' + matcher + ', presto_cluster=~"$presto_cluster"}', + datasource=promDatasource, + legendFormat='' + legendMatcher + '', + format='time_series', + ), + ], + type: 'timeseries', + title: 'Insufficient resource failures - one minute rate', + description: 'The rate that failures are occurring due to insufficient resources.', + fieldConfig: { + defaults: { + color: { + mode: 'palette-classic', + }, + custom: { + axisBorderShow: false, + axisCenteredZero: false, + axisColorMode: 'text', + axisLabel: '', + axisPlacement: 'auto', + barAlignment: 0, + drawStyle: 'line', + fillOpacity: 15, + gradientMode: 'none', + hideFrom: { + legend:
false, + tooltip: false, + viz: false, + }, + insertNulls: false, + lineInterpolation: 'stepBefore', + lineWidth: 2, + pointSize: 5, + scaleDistribution: { + type: 'linear', + }, + showPoints: 'never', + spanNulls: false, + stacking: { + group: 'A', + mode: 'none', + }, + thresholdsStyle: { + mode: 'off', + }, + }, + mappings: [], + thresholds: { + mode: 'absolute', + steps: [ + { + color: 'green', + value: null, + }, + ], + }, + unit: 'err/s', + }, + overrides: [], + }, + options: { + legend: { + calcs: [], + displayMode: 'list', + placement: 'bottom', + showLegend: true, + }, + tooltip: { + mode: 'multi', + sort: 'desc', + }, + }, +}; + +local dataProcessingThroughputOneMinuteRatePanel(legendMatcher, matcher) = { + datasource: promDatasource, + targets: [ + prometheus.target( + 'sum by (presto_cluster) (presto_TaskManager_InputDataSize_OneMinute_Rate{' + matcher + ', presto_cluster=~"$presto_cluster"})', + datasource=promDatasource, + legendFormat='' + legendMatcher + ' - input', + format='time_series', + ), + prometheus.target( + 'sum by (presto_cluster) (presto_TaskManager_OutputDataSize_OneMinute_Rate{' + matcher + ', presto_cluster=~"$presto_cluster"})', + datasource=promDatasource, + legendFormat='' + legendMatcher + ' - output', + format='time_series', + ), + ], + type: 'timeseries', + title: 'Data processing throughput - one minute rate', + description: 'The rate at which volumes of data are being processed.', + fieldConfig: { + defaults: { + color: { + mode: 'palette-classic', + }, + custom: { + axisBorderShow: false, + axisCenteredZero: false, + axisColorMode: 'text', + axisLabel: '', + axisPlacement: 'auto', + barAlignment: 0, + drawStyle: 'line', + fillOpacity: 15, + gradientMode: 'none', + hideFrom: { + legend: false, + tooltip: false, + viz: false, + }, + insertNulls: false, + lineInterpolation: 'stepBefore', + lineWidth: 2, + pointSize: 5, + scaleDistribution: { + type: 'linear', + }, + showPoints: 'never', + spanNulls: false, + stacking: { + group:
'A', + mode: 'normal', + }, + thresholdsStyle: { + mode: 'off', + }, + }, + mappings: [], + thresholds: { + mode: 'absolute', + steps: [ + { + color: 'green', + value: null, + }, + ], + }, + unit: 'Bps', + }, + overrides: [], + }, + options: { + legend: { + calcs: [], + displayMode: 'list', + placement: 'bottom', + showLegend: true, + }, + tooltip: { + mode: 'multi', + sort: 'desc', + }, + }, +}; + +{ + grafanaDashboards+:: { + 'presto-overview.json': + dashboard.new( + 'Presto overview', + time_from='%s' % $._config.dashboardPeriod, + tags=($._config.dashboardTags), + timezone='%s' % $._config.dashboardTimezone, + refresh='%s' % $._config.dashboardRefresh, + description='', + uid=dashboardUid, + ) + .addLink(grafana.link.dashboards( + asDropdown=false, + title='Other Presto dashboards', + includeVars=true, + keepTime=true, + tags=($._config.dashboardTags), + )) + .addTemplates( + [ + template.datasource( + promDatasourceName, + 'prometheus', + null, + label='Data Source', + refresh='load' + ), + template.new( + 'job', + promDatasource, + 'label_values(presto_HeartbeatDetector_ActiveCount,job)', + label='Job', + refresh=2, + includeAll=true, + multi=true, + allValues='.+', + sort=0 + ), + template.new( + 'cluster', + promDatasource, + 'label_values(presto_HeartbeatDetector_ActiveCount{job=~"$job"}, cluster)', + label='Cluster', + refresh=2, + includeAll=true, + multi=true, + allValues='', + hide=if $._config.enableMultiCluster then '' else 'variable', + sort=0 + ), + template.new( + 'presto_cluster', + promDatasource, + 'label_values(presto_HeartbeatDetector_ActiveCount{job=~"$job"},presto_cluster)', + label='Presto cluster', + refresh=2, + includeAll=true, + multi=true, + allValues='.*', + sort=0 + ), + ] + ) + .addPanels( + [ + activeResourceManagersPanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 4, w: 6, x: 0, y: 0 } }, + activeCoordinatorsPanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 4, w: 6, x: 6, y: 0 } }, 
+ activeWorkersPanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 4, w: 6, x: 12, y: 0 } }, + inactiveWorkersPanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 4, w: 6, x: 18, y: 0 } }, + completedQueriesOneMinuteCountPanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 0, y: 4 } }, + alertsPanel(getAlertMatcher($._config)) { gridPos: { h: 8, w: 12, x: 12, y: 4 } }, + userErrorFailuresOneMinuteRatePanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 0, y: 12 } }, + queuedQueriesPanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 12, y: 12 } }, + blockedNodesPanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 0, y: 20 } }, + internalErrorFailuresOneMinuteRatePanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 12, y: 20 } }, + clusterMemoryDistributedBytesPanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 0, y: 28 } }, + InsufficientResourceFailuresOneMinuteRatePanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 12, y: 28 } }, + dataProcessingThroughputOneMinuteRatePanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 9, w: 24, x: 0, y: 36 } }, + ] + ), + }, +} diff --git a/presto-mixin/dashboards/presto-worker.libsonnet b/presto-mixin/dashboards/presto-worker.libsonnet new file mode 100644 index 000000000..5e22ce362 --- /dev/null +++ b/presto-mixin/dashboards/presto-worker.libsonnet @@ -0,0 +1,1083 @@ +local g = (import 'grafana-builder/grafana.libsonnet'); +local grafana = (import 'grafonnet/grafana.libsonnet'); +local dashboard = grafana.dashboard; +local template = grafana.template; +local prometheus = grafana.prometheus; + +local dashboardUid = 'presto-worker'; + +local promDatasourceName = 'prometheus_datasource'; +local getMatcher(cfg) = 
'%(prestoSelector)s' % cfg; +local getLegendMatcher(cfg) = '%(prestoLegendSelector)s' % cfg; +local promDatasource = { + uid: '${%s}' % promDatasourceName, +}; + +local nonheapMemoryUsagePanel(legendMatcher, matcher) = { + datasource: promDatasource, + targets: [ + prometheus.target( + 'avg (jvm_nonheap_memory_used{' + matcher + ', presto_cluster=~"$presto_cluster"} / clamp_min((jvm_nonheap_memory_used{' + matcher + ', presto_cluster=~"$presto_cluster"} + jvm_nonheap_memory_committed{' + matcher + ', presto_cluster=~"$presto_cluster"}),1))', + datasource=promDatasource, + ), + ], + type: 'gauge', + title: 'Non-heap memory usage', + description: "An average gauge of the JVM's non-heap memory usage across workers.", + fieldConfig: { + defaults: { + color: { + mode: 'thresholds', + }, + mappings: [], + thresholds: { + mode: 'absolute', + steps: [ + { + color: 'green', + value: null, + }, + { + color: '#EAB839', + value: 0.7, + }, + { + color: 'red', + value: 0.8, + }, + ], + }, + unit: 'percentunit', + }, + overrides: [], + }, + options: { + minVizHeight: 75, + minVizWidth: 75, + orientation: 'auto', + reduceOptions: { + calcs: [ + 'lastNotNull', + ], + fields: '', + values: false, + }, + showThresholdLabels: false, + showThresholdMarkers: true, + }, + pluginVersion: '10.2.0-62263', +}; + +local heapMemoryUsagePanel(legendMatcher, matcher) = { + datasource: promDatasource, + targets: [ + prometheus.target( + 'avg (jvm_heap_memory_used{' + matcher + ', presto_cluster=~"$presto_cluster"} / clamp_min((jvm_heap_memory_used{' + matcher + ', presto_cluster=~"$presto_cluster"} + jvm_heap_memory_committed{' + matcher + ', presto_cluster=~"$presto_cluster"}),1))', + datasource=promDatasource, + ), + ], + type: 'gauge', + title: 'Heap memory usage', + description: "An average gauge of the JVM's heap memory usage across workers.", + fieldConfig: { + defaults: { + color: { + mode: 'thresholds', + }, + mappings: [], + thresholds: { + mode: 'absolute', + steps: [ + { + color:
'green', + value: null, + }, + { + color: '#EAB839', + value: 0.7, + }, + { + color: 'red', + value: 0.8, + }, + ], + }, + unit: 'percentunit', + }, + overrides: [], + }, + options: { + minVizHeight: 75, + minVizWidth: 75, + orientation: 'auto', + reduceOptions: { + calcs: [ + 'lastNotNull', + ], + fields: '', + values: false, + }, + showThresholdLabels: false, + showThresholdMarkers: true, + }, + pluginVersion: '10.2.0-62263', +}; + +local queuedTasksPanel(legendMatcher, matcher) = { + datasource: promDatasource, + targets: [ + prometheus.target( + 'presto_TaskExecutor_ProcessorExecutor_QueuedTaskCount{' + matcher + ', presto_cluster=~"$presto_cluster"}', + datasource=promDatasource, + legendFormat='' + legendMatcher + '', + format='time_series', + ), + ], + type: 'timeseries', + title: 'Queued tasks', + description: 'The number of tasks that are being queued by the task executor.', + fieldConfig: { + defaults: { + color: { + fixedColor: '#C8F2C2', + mode: 'palette-classic', + }, + custom: { + axisBorderShow: false, + axisCenteredZero: false, + axisColorMode: 'text', + axisLabel: '', + axisPlacement: 'auto', + barAlignment: 0, + drawStyle: 'line', + fillOpacity: 15, + gradientMode: 'none', + hideFrom: { + legend: false, + tooltip: false, + viz: false, + }, + insertNulls: false, + lineInterpolation: 'smooth', + lineWidth: 2, + pointSize: 5, + scaleDistribution: { + type: 'linear', + }, + showPoints: 'never', + spanNulls: false, + stacking: { + group: 'A', + mode: 'normal', + }, + thresholdsStyle: { + mode: 'off', + }, + }, + mappings: [], + thresholds: { + mode: 'absolute', + steps: [ + { + color: 'green', + value: null, + }, + ], + }, + unit: 'none', + }, + overrides: [], + }, + options: { + legend: { + calcs: [], + displayMode: 'list', + placement: 'bottom', + showLegend: true, + }, + tooltip: { + mode: 'multi', + sort: 'desc', + }, + }, +}; + +local failedCompletedTasksPanel(legendMatcher, matcher) = { + datasource: promDatasource, + targets: [ + 
prometheus.target( + 'rate(presto_TaskManager_FailedTasks_TotalCount{' + matcher + ', presto_cluster=~"$presto_cluster"}[$__rate_interval])', + datasource=promDatasource, + legendFormat='' + legendMatcher + ' - failed', + format='time_series', + ), + prometheus.target( + 'rate(presto_TaskExecutor_ProcessorExecutor_CompletedTaskCount{' + matcher + ', presto_cluster=~"$presto_cluster"}[$__rate_interval])', + datasource=promDatasource, + legendFormat='' + legendMatcher + ' - completed', + ), + ], + type: 'timeseries', + title: 'Failed & Completed Tasks', + description: 'The rate at which tasks have failed and completed', + fieldConfig: { + defaults: { + color: { + mode: 'palette-classic', + }, + custom: { + axisBorderShow: false, + axisCenteredZero: false, + axisColorMode: 'text', + axisLabel: '', + axisPlacement: 'auto', + barAlignment: 0, + drawStyle: 'line', + fillOpacity: 15, + gradientMode: 'none', + hideFrom: { + legend: false, + tooltip: false, + viz: false, + }, + insertNulls: false, + lineInterpolation: 'smooth', + lineWidth: 2, + pointSize: 5, + scaleDistribution: { + type: 'linear', + }, + showPoints: 'never', + spanNulls: false, + stacking: { + group: 'A', + mode: 'normal', + }, + thresholdsStyle: { + mode: 'off', + }, + }, + mappings: [], + thresholds: { + mode: 'absolute', + steps: [ + { + color: 'green', + value: null, + }, + ], + }, + unit: 'ops', + }, + overrides: [ + { + matcher: { + id: 'byFrameRefID', + options: 'A', + }, + properties: [ + { + id: 'color', + value: { + fixedColor: 'red', + mode: 'fixed', + }, + }, + { + id: 'custom.axisPlacement', + value: 'left', + }, + ], + }, + { + matcher: { + id: 'byFrameRefID', + options: 'B', + }, + properties: [ + { + id: 'color', + value: { + fixedColor: 'green', + mode: 'fixed', + }, + }, + { + id: 'custom.axisPlacement', + value: 'right', + }, + ], + }, + ], + }, + options: { + legend: { + calcs: [], + displayMode: 'list', + placement: 'bottom', + showLegend: true, + }, + tooltip: { + mode: 'multi', + 
sort: 'desc', + }, + }, +}; + +local outputPositionsPanel(legendMatcher, matcher) = { + datasource: promDatasource, + targets: [ + prometheus.target( + 'presto_TaskManager_OutputPositions_OneMinute_Rate{' + matcher + ', presto_cluster=~"$presto_cluster"}', + datasource=promDatasource, + legendFormat='' + legendMatcher + '', + format='time_series', + ), + ], + type: 'timeseries', + title: 'Output positions - one minute rate', + description: 'The rate of rows (or records) produced by an operation.', + fieldConfig: { + defaults: { + color: { + mode: 'palette-classic', + }, + custom: { + axisBorderShow: false, + axisCenteredZero: false, + axisColorMode: 'text', + axisLabel: '', + axisPlacement: 'auto', + barAlignment: 0, + drawStyle: 'line', + fillOpacity: 15, + gradientMode: 'none', + hideFrom: { + legend: false, + tooltip: false, + viz: false, + }, + insertNulls: false, + lineInterpolation: 'stepBefore', + lineWidth: 2, + pointSize: 5, + scaleDistribution: { + type: 'linear', + }, + showPoints: 'never', + spanNulls: false, + stacking: { + group: 'A', + mode: 'normal', + }, + thresholdsStyle: { + mode: 'off', + }, + }, + mappings: [], + thresholds: { + mode: 'absolute', + steps: [ + { + color: 'green', + value: null, + }, + ], + }, + unit: 'rowsps', + }, + overrides: [], + }, + options: { + legend: { + calcs: [], + displayMode: 'list', + placement: 'bottom', + showLegend: true, + }, + tooltip: { + mode: 'multi', + sort: 'desc', + }, + }, +}; + +local executorPoolSizePanel(legendMatcher, matcher) = { + datasource: promDatasource, + targets: [ + prometheus.target( + 'presto_TaskManager_TaskNotificationExecutor_PoolSize{' + matcher + ', presto_cluster=~"$presto_cluster"}', + datasource=promDatasource, + legendFormat='' + legendMatcher + ' - task notification', + format='time_series', + ), + prometheus.target( + 'presto_TaskExecutor_ProcessorExecutor_CorePoolSize{' + matcher + ', presto_cluster=~"$presto_cluster"}', + datasource=promDatasource, + legendFormat='' + 
legendMatcher + ' - process executor core', + ), + prometheus.target( + 'presto_TaskExecutor_ProcessorExecutor_PoolSize{' + matcher + ', presto_cluster=~"$presto_cluster"}', + datasource=promDatasource, + legendFormat='' + legendMatcher + ' - process executor', + ), + ], + type: 'timeseries', + title: 'Executor pool size', + description: 'The pool size of the task notification executor and process executor.', + fieldConfig: { + defaults: { + color: { + mode: 'palette-classic', + }, + custom: { + axisBorderShow: false, + axisCenteredZero: false, + axisColorMode: 'text', + axisLabel: '', + axisPlacement: 'auto', + barAlignment: 0, + drawStyle: 'line', + fillOpacity: 15, + gradientMode: 'none', + hideFrom: { + legend: false, + tooltip: false, + viz: false, + }, + insertNulls: false, + lineInterpolation: 'smooth', + lineWidth: 2, + pointSize: 5, + scaleDistribution: { + type: 'linear', + }, + showPoints: 'never', + spanNulls: false, + stacking: { + group: 'A', + mode: 'normal', + }, + thresholdsStyle: { + mode: 'off', + }, + }, + mappings: [], + thresholds: { + mode: 'absolute', + steps: [ + { + color: 'green', + value: null, + }, + ], + }, + unit: 'none', + }, + overrides: [], + }, + options: { + legend: { + calcs: [], + displayMode: 'list', + placement: 'bottom', + showLegend: true, + }, + tooltip: { + mode: 'multi', + sort: 'desc', + }, + }, +}; + +local memoryPoolPanel(legendMatcher, matcher) = { + datasource: promDatasource, + targets: [ + prometheus.target( + 'sum by (instance, presto_cluster) (presto_MemoryPool_general_FreeBytes{' + matcher + ', presto_cluster=~"$presto_cluster"})', + datasource=promDatasource, + legendFormat='' + legendMatcher + ' - free', + format='time_series', + ), + prometheus.target( + 'sum by (instance, presto_cluster) (presto_MemoryPool_reserved_FreeBytes{' + matcher + ', presto_cluster=~"$presto_cluster"})', + datasource=promDatasource, + legendFormat='' + legendMatcher + ' - reserved', + ), + ], + type: 'timeseries', + title: 'Memory 
pool', + description: 'The amount of Presto memory available.', + fieldConfig: { + defaults: { + color: { + mode: 'palette-classic', + }, + custom: { + axisBorderShow: false, + axisCenteredZero: false, + axisColorMode: 'text', + axisLabel: '', + axisPlacement: 'auto', + barAlignment: 0, + drawStyle: 'line', + fillOpacity: 15, + gradientMode: 'none', + hideFrom: { + legend: false, + tooltip: false, + viz: false, + }, + insertNulls: false, + lineInterpolation: 'smooth', + lineWidth: 2, + pointSize: 5, + scaleDistribution: { + type: 'linear', + }, + showPoints: 'never', + spanNulls: false, + stacking: { + group: 'A', + mode: 'normal', + }, + thresholdsStyle: { + mode: 'off', + }, + }, + mappings: [], + thresholds: { + mode: 'absolute', + steps: [ + { + color: 'green', + value: null, + }, + ], + }, + unit: 'decbytes', + }, + overrides: [], + }, + options: { + legend: { + calcs: [], + displayMode: 'list', + placement: 'bottom', + showLegend: true, + }, + tooltip: { + mode: 'multi', + sort: 'desc', + }, + }, +}; + +local dataProcessingThroughputOneMinuteRatePanel(legendMatcher, matcher) = { + datasource: promDatasource, + targets: [ + prometheus.target( + 'presto_TaskManager_InputDataSize_OneMinute_Rate{' + matcher + ', presto_cluster=~"$presto_cluster"}', + datasource=promDatasource, + legendFormat='' + legendMatcher + ' - input', + format='time_series', + ), + prometheus.target( + 'presto_TaskManager_OutputDataSize_OneMinute_Rate{' + matcher + ', presto_cluster=~"$presto_cluster"}', + datasource=promDatasource, + legendFormat='' + legendMatcher + ' - output', + ), + ], + type: 'timeseries', + title: 'Data processing throughput - one minute rate', + description: 'The rate at which volumes of data are being processed', + fieldConfig: { + defaults: { + color: { + mode: 'palette-classic', + }, + custom: { + axisBorderShow: false, + axisCenteredZero: false, + axisColorMode: 'text', + axisLabel: '', + axisPlacement: 'auto', + barAlignment: 0, + drawStyle: 'line', + 
fillOpacity: 15, + gradientMode: 'none', + hideFrom: { + legend: false, + tooltip: false, + viz: false, + }, + insertNulls: false, + lineInterpolation: 'stepBefore', + lineWidth: 2, + pointSize: 5, + scaleDistribution: { + type: 'linear', + }, + showPoints: 'never', + spanNulls: false, + stacking: { + group: 'A', + mode: 'normal', + }, + thresholdsStyle: { + mode: 'off', + }, + }, + mappings: [], + thresholds: { + mode: 'absolute', + steps: [ + { + color: 'green', + value: null, + }, + ], + }, + unit: 'Bps', + }, + overrides: [], + }, + options: { + legend: { + calcs: [], + displayMode: 'list', + placement: 'bottom', + showLegend: true, + }, + tooltip: { + mode: 'multi', + sort: 'desc', + }, + }, +}; + +local jvmMetricsRow = { + datasource: promDatasource, + targets: [], + type: 'row', + title: 'JVM metrics', + collapsed: false, +}; + +local garbageCollectionCount(legendMatcher, matcher) = { + datasource: promDatasource, + targets: [ + prometheus.target( + 'increase(jvm_gc_collection_count{' + matcher + ', presto_cluster=~"$presto_cluster", name="G1 Young Generation"}[$__interval:])', + datasource=promDatasource, + legendFormat='' + legendMatcher + '', + interval='1m', + format='time_series', + ), + ], + type: 'timeseries', + title: 'Garbage collection count / $__interval', + description: 'The recent increase in the number of garbage collection events for the JVM.', + fieldConfig: { + defaults: { + color: { + mode: 'palette-classic', + }, + custom: { + axisBorderShow: false, + axisCenteredZero: false, + axisColorMode: 'text', + axisLabel: '', + axisPlacement: 'auto', + barAlignment: 0, + drawStyle: 'line', + fillOpacity: 15, + gradientMode: 'none', + hideFrom: { + legend: false, + tooltip: false, + viz: false, + }, + insertNulls: false, + lineInterpolation: 'smooth', + lineWidth: 2, + pointSize: 5, + scaleDistribution: { + type: 'linear', + }, + showPoints: 'never', + spanNulls: false, + stacking: { + group: 'A', + mode: 'normal', + }, + thresholdsStyle: { + mode: 
'off', + }, + }, + mappings: [], + thresholds: { + mode: 'absolute', + steps: [ + { + color: 'green', + value: null, + }, + ], + }, + unit: 'none', + }, + overrides: [], + }, + options: { + legend: { + calcs: [], + displayMode: 'list', + placement: 'bottom', + showLegend: true, + }, + tooltip: { + mode: 'multi', + sort: 'desc', + }, + }, +}; + +local garbageCollectionDurationPanel(legendMatcher, matcher) = { + datasource: promDatasource, + targets: [ + prometheus.target( + 'jvm_gc_duration{' + matcher + ', presto_cluster=~"$presto_cluster", name="G1 Young Generation"}', + datasource=promDatasource, + legendFormat='' + legendMatcher + '', + format='time_series', + ), + ], + type: 'timeseries', + title: 'Garbage collection duration', + description: 'The average duration for each garbage collection operation in the JVM.', + fieldConfig: { + defaults: { + color: { + mode: 'palette-classic', + }, + custom: { + axisBorderShow: false, + axisCenteredZero: false, + axisColorMode: 'text', + axisLabel: '', + axisPlacement: 'auto', + barAlignment: 0, + drawStyle: 'line', + fillOpacity: 15, + gradientMode: 'none', + hideFrom: { + legend: false, + tooltip: false, + viz: false, + }, + insertNulls: false, + lineInterpolation: 'smooth', + lineWidth: 2, + pointSize: 5, + scaleDistribution: { + type: 'linear', + }, + showPoints: 'never', + spanNulls: false, + stacking: { + group: 'A', + mode: 'normal', + }, + thresholdsStyle: { + mode: 'off', + }, + }, + mappings: [], + thresholds: { + mode: 'absolute', + steps: [ + { + color: 'green', + value: null, + }, + ], + }, + unit: 'ms', + }, + overrides: [], + }, + options: { + legend: { + calcs: [], + displayMode: 'list', + placement: 'bottom', + showLegend: true, + }, + tooltip: { + mode: 'multi', + sort: 'desc', + }, + }, +}; + +local memoryUsedPanel(legendMatcher, matcher) = { + datasource: promDatasource, + targets: [ + prometheus.target( + 'jvm_nonheap_memory_used{' + matcher + ', presto_cluster=~"$presto_cluster"}', + 
datasource=promDatasource, + legendFormat='' + legendMatcher + ' - non heap', + format='time_series', + ), + prometheus.target( + 'jvm_heap_memory_used{' + matcher + ', presto_cluster=~"$presto_cluster"}', + datasource=promDatasource, + legendFormat='' + legendMatcher + ' - heap', + ), + ], + type: 'timeseries', + title: 'Memory used', + description: 'The heap and non-heap memory used by the JVM.', + fieldConfig: { + defaults: { + color: { + mode: 'palette-classic', + }, + custom: { + axisBorderShow: false, + axisCenteredZero: false, + axisColorMode: 'text', + axisLabel: '', + axisPlacement: 'auto', + barAlignment: 0, + drawStyle: 'line', + fillOpacity: 15, + gradientMode: 'none', + hideFrom: { + legend: false, + tooltip: false, + viz: false, + }, + insertNulls: false, + lineInterpolation: 'smooth', + lineWidth: 2, + pointSize: 5, + scaleDistribution: { + type: 'linear', + }, + showPoints: 'never', + spanNulls: false, + stacking: { + group: 'A', + mode: 'normal', + }, + thresholdsStyle: { + mode: 'off', + }, + }, + mappings: [], + thresholds: { + mode: 'absolute', + steps: [ + { + color: 'green', + value: null, + }, + ], + }, + unit: 'decbytes', + }, + overrides: [], + }, + options: { + legend: { + calcs: [], + displayMode: 'list', + placement: 'bottom', + showLegend: true, + }, + tooltip: { + mode: 'multi', + sort: 'desc', + }, + }, +}; + +local memoryCommittedPanel(legendMatcher, matcher) = { + datasource: promDatasource, + targets: [ + prometheus.target( + 'jvm_heap_memory_committed{' + matcher + ', presto_cluster=~"$presto_cluster"}', + datasource=promDatasource, + legendFormat='' + legendMatcher + ' - heap', + format='time_series', + ), + prometheus.target( + 'jvm_nonheap_memory_committed{' + matcher + ', presto_cluster=~"$presto_cluster"}', + datasource=promDatasource, + legendFormat='' + legendMatcher + ' - non heap', + ), + ], + type: 'timeseries', + title: 'Memory committed', + description: 'The heap and non-heap memory committed.', + fieldConfig: { + 
defaults: { + color: { + mode: 'palette-classic', + }, + custom: { + axisBorderShow: false, + axisCenteredZero: false, + axisColorMode: 'text', + axisLabel: '', + axisPlacement: 'auto', + barAlignment: 0, + drawStyle: 'line', + fillOpacity: 15, + gradientMode: 'none', + hideFrom: { + legend: false, + tooltip: false, + viz: false, + }, + insertNulls: false, + lineInterpolation: 'smooth', + lineWidth: 2, + pointSize: 5, + scaleDistribution: { + type: 'linear', + }, + showPoints: 'never', + spanNulls: false, + stacking: { + group: 'A', + mode: 'normal', + }, + thresholdsStyle: { + mode: 'off', + }, + }, + mappings: [], + thresholds: { + mode: 'absolute', + steps: [ + { + color: 'green', + value: null, + }, + ], + }, + unit: 'decbytes', + }, + overrides: [], + }, + options: { + legend: { + calcs: [], + displayMode: 'list', + placement: 'bottom', + showLegend: true, + }, + tooltip: { + mode: 'multi', + sort: 'desc', + }, + }, +}; + +{ + grafanaDashboards+:: { + 'presto-worker.json': + dashboard.new( + 'Presto worker', + time_from='%s' % $._config.dashboardPeriod, + tags=($._config.dashboardTags), + timezone='%s' % $._config.dashboardTimezone, + refresh='%s' % $._config.dashboardRefresh, + description='', + uid=dashboardUid, + ) + .addLink(grafana.link.dashboards( + asDropdown=false, + title='Other Presto dashboards', + includeVars=true, + keepTime=true, + tags=($._config.dashboardTags), + )) + .addTemplates( + [ + template.datasource( + promDatasourceName, + 'prometheus', + null, + label='Data Source', + refresh='load' + ), + template.new( + 'job', + promDatasource, + 'label_values(presto_metadata_DiscoveryNodeManager_ActiveNodeCount,job)', + label='Job', + refresh=2, + includeAll=true, + multi=true, + allValues='.+', + sort=0 + ), + template.new( + 'cluster', + promDatasource, + 'label_values(presto_metadata_DiscoveryNodeManager_ActiveNodeCount{job=~"$job"}, cluster)', + label='Cluster', + refresh=2, + includeAll=true, + multi=true, + allValues='', + hide=if 
$._config.enableMultiCluster then '' else 'variable', + sort=0 + ), + template.new( + 'presto_cluster', + promDatasource, + 'label_values(presto_metadata_DiscoveryNodeManager_ActiveNodeCount{job=~"$job"},presto_cluster)', + label='Presto cluster', + refresh=2, + includeAll=false, + multi=false, + allValues='.*', + sort=0 + ), + template.new( + 'instance', + promDatasource, + 'label_values(presto_metadata_DiscoveryNodeManager_ActiveNodeCount{job=~"$job", presto_cluster=~"$presto_cluster"},instance)', + label='Instance', + refresh=2, + includeAll=false, + multi=true, + allValues='', + sort=0 + ), + ] + ) + .addPanels( + [ + nonheapMemoryUsagePanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 8, w: 3, x: 0, y: 0 } }, + heapMemoryUsagePanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 8, w: 3, x: 3, y: 0 } }, + queuedTasksPanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 8, w: 6, x: 6, y: 0 } }, + failedCompletedTasksPanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 12, y: 0 } }, + outputPositionsPanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 0, y: 8 } }, + executorPoolSizePanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 12, y: 8 } }, + memoryPoolPanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 0, y: 16 } }, + dataProcessingThroughputOneMinuteRatePanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 12, y: 16 } }, + jvmMetricsRow { gridPos: { h: 1, w: 24, x: 0, y: 24 } }, + garbageCollectionCount(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 0, y: 25 } }, + garbageCollectionDurationPanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 12, y: 25 } }, + memoryUsedPanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 0, 
y: 33 } }, + memoryCommittedPanel(getLegendMatcher($._config), getMatcher($._config)) { gridPos: { h: 8, w: 12, x: 12, y: 33 } }, + ] + ), + }, +} diff --git a/presto-mixin/jsonnetfile.json b/presto-mixin/jsonnetfile.json new file mode 100644 index 000000000..e8255b65e --- /dev/null +++ b/presto-mixin/jsonnetfile.json @@ -0,0 +1,33 @@ +{ + "version": 1, + "dependencies": [ + { + "source": { + "git": { + "remote": "https://github.com/grafana/grafonnet.git", + "subdir": "gen/grafonnet-latest" + } + }, + "version": "main" + }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/grafonnet-lib.git", + "subdir": "grafonnet" + } + }, + "version": "master" + }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs.git", + "subdir": "logs-lib" + } + }, + "version": "master" + } + ], + "legacyImports": true +} diff --git a/presto-mixin/mixin.libsonnet b/presto-mixin/mixin.libsonnet new file mode 100644 index 000000000..4d987cf31 --- /dev/null +++ b/presto-mixin/mixin.libsonnet @@ -0,0 +1,3 @@ +(import 'dashboards/dashboards.libsonnet') + +(import 'alerts/alerts.libsonnet') + +(import 'config.libsonnet')