From 78c08b89ed28f54f8868412668a37a7d25c4cfa0 Mon Sep 17 00:00:00 2001 From: ymao1 Date: Mon, 3 May 2021 19:18:13 -0400 Subject: [PATCH] [Alerting][Docs] Adding query to identify long running rules to docs (#98773) * Adding query to identify long running rules to docs * Wording suggestsion from PR review * Adding event.provider to query. Allowing copy to console * Adding note for system privileges * Adding runtime field to query * Removing extra dollar sign * PR fixes --- .../alerting-troubleshooting.asciidoc | 164 ++++++++++++++++++ 1 file changed, 164 insertions(+) diff --git a/docs/user/alerting/alerting-troubleshooting.asciidoc b/docs/user/alerting/alerting-troubleshooting.asciidoc index 6d4a0e9375678..b7fd98d1c674e 100644 --- a/docs/user/alerting/alerting-troubleshooting.asciidoc +++ b/docs/user/alerting/alerting-troubleshooting.asciidoc @@ -69,3 +69,167 @@ Configuration options are available to specialize connections to TLS servers, including ignoring server certificate validation, and providing certificate authority data to verify servers using custom certificates. For more details, see <<action-settings,Action settings>>. + +[float] +[[rules-long-execution-time]] +=== Identify long-running rules + +The following query can help you identify rules that are taking a long time to execute and might impact the overall health of your deployment. + +[IMPORTANT] +============================================== +By default, only users with a `superuser` role can query the {kib} event log because it is a system index. To enable additional users to execute this query, assign `read` privileges to the `.kibana-event-log*` index. 
+============================================== + +Query for a list of rule ids, bucketed by their execution times: + +[source,console] +-------------------------------------------------- +GET /.kibana-event-log*/_search +{ + "size": 0, + "query": { + "bool": { + "filter": [ + { + "range": { + "@timestamp": { + "gte": "now-1d", <1> + "lte": "now" + } + } + }, + { + "term": { + "event.action": { + "value": "execute" + } + } + }, + { + "term": { + "event.provider": { + "value": "alerting" <2> + } + } + } + ] + } + }, + "runtime_mappings": { <3> + "event.duration_in_seconds": { + "type": "double", + "script": { + "source": "emit(doc['event.duration'].value / 1E9)" + } + } + }, + "aggs": { + "ruleIdsByExecutionDuration": { + "histogram": { + "field": "event.duration_in_seconds", + "min_doc_count": 1, + "interval": 1 <4> + }, + "aggs": { + "ruleId": { + "nested": { + "path": "kibana.saved_objects" + }, + "aggs": { + "ruleId": { + "terms": { + "field": "kibana.saved_objects.id", + "size": 10 <5> + } + } + } + } + } + } + } +} +-------------------------------------------------- +// TEST + +<1> This queries for rules executed in the last day. Update the values of `lte` and `gte` to query over a different time range. +<2> Use `event.provider: actions` to query for long-running action executions. +<3> Execution durations are stored as nanoseconds. This adds a runtime field to convert that duration into seconds. +<4> This interval buckets the `event.duration_in_seconds` runtime field into 1-second intervals. Update this value to change the granularity of the buckets. If you are unable to use runtime fields, make sure this aggregation targets `event.duration` and use nanoseconds for the interval. +<5> This retrieves the top 10 rule ids for this duration interval. Update this value to retrieve more rule ids. 
+ +This query returns the following: + +[source,json] +-------------------------------------------------- +{ + "took" : 322, + "timed_out" : false, + "_shards" : { + "total" : 1, + "successful" : 1, + "skipped" : 0, + "failed" : 0 + }, + "hits" : { + "total" : { + "value" : 326, + "relation" : "eq" + }, + "max_score" : null, + "hits" : [ ] + }, + "aggregations" : { + "ruleIdsByExecutionDuration" : { + "buckets" : [ + { + "key" : 0.0, <1> + "doc_count" : 320, + "ruleId" : { + "doc_count" : 320, + "ruleId" : { + "doc_count_error_upper_bound" : 0, + "sum_other_doc_count" : 0, + "buckets" : [ + { + "key" : "1923ada0-a8f3-11eb-a04b-13d723cdfdc5", + "doc_count" : 140 + }, + { + "key" : "15415ecf-cdb0-4fef-950a-f824bd277fe4", + "doc_count" : 130 + }, + { + "key" : "dceeb5d0-6b41-11eb-802b-85b0c1bc8ba2", + "doc_count" : 50 + } + ] + } + } + }, + { + "key" : 30.0, <2> + "doc_count" : 6, + "ruleId" : { + "doc_count" : 6, + "ruleId" : { + "doc_count_error_upper_bound" : 0, + "sum_other_doc_count" : 0, + "buckets" : [ + { + "key" : "41893910-6bca-11eb-9e0d-85d233e3ee35", + "doc_count" : 6 + } + ] + } + } + } + ] + } + } +} +-------------------------------------------------- +<1> Most rule execution durations fall within the first bucket (0 - 1 seconds). +<2> A single rule with id `41893910-6bca-11eb-9e0d-85d233e3ee35` took between 30 and 31 seconds to execute. + +Use the <<get-rule-api,Get Rule API>> to retrieve additional information about rules that take a long time to execute. \ No newline at end of file