From 89ea991765565251f235676c06d4bc0bfff58235 Mon Sep 17 00:00:00 2001 From: James Rodewig <40268737+jrodewig@users.noreply.github.com> Date: Mon, 2 Aug 2021 09:11:23 -0400 Subject: [PATCH] [DOCS] Add docs for rejected requests and high CPU usage (#72640) Adds docs for rejected requests and high CPU usage. Closes #72468. Closes #69868. --- .../how-to/fix-common-cluster-issues.asciidoc | 147 ++++++++++++++++++ .../tab-widgets/cpu-usage-widget.asciidoc | 40 +++++ docs/reference/tab-widgets/cpu-usage.asciidoc | 30 ++++ 3 files changed, 217 insertions(+) create mode 100644 docs/reference/tab-widgets/cpu-usage-widget.asciidoc create mode 100644 docs/reference/tab-widgets/cpu-usage.asciidoc diff --git a/docs/reference/how-to/fix-common-cluster-issues.asciidoc b/docs/reference/how-to/fix-common-cluster-issues.asciidoc index 3eb67477c1e55..f5e58d1adc234 100644 --- a/docs/reference/how-to/fix-common-cluster-issues.asciidoc +++ b/docs/reference/how-to/fix-common-cluster-issues.asciidoc @@ -100,6 +100,108 @@ POST _cache/clear?fielddata=true ---- // TEST[s/^/PUT my-index\n/] +[discrete] +[[high-cpu-usage]] +=== High CPU usage + +{es} uses <<modules-threadpool,thread pools>> to manage CPU resources for +concurrent operations. High CPU usage typically means one or more thread pools +are running low. + +If a thread pool is depleted, {es} will <<rejected-requests,reject requests>> +related to the thread pool. For example, if the `search` thread pool is +depleted, {es} will reject search requests until more threads are available. + +[discrete] +[[diagnose-high-cpu-usage]] +==== Diagnose high CPU usage + +**Check CPU usage** + +include::{es-repo-dir}/tab-widgets/cpu-usage-widget.asciidoc[] + +**Check hot threads** + +If a node has high CPU usage, use the <<cluster-nodes-hot-threads,nodes hot +threads API>> to check for resource-intensive threads running on the node. + +[source,console] +---- +GET _nodes/my-node,my-other-node/hot_threads +---- +// TEST[s/\/my-node,my-other-node//] + +This API returns a breakdown of any hot threads in plain text. 
+ +[discrete] +[[reduce-cpu-usage]] +==== Reduce CPU usage + +The following tips outline the most common causes of high CPU usage and their +solutions. + +**Scale your cluster** + +Heavy indexing and search loads can deplete smaller thread pools. To better +handle heavy workloads, add more nodes to your cluster or upgrade your existing +nodes to increase capacity. + +**Spread out bulk requests** + +While more efficient than individual requests, large <<docs-bulk,bulk indexing>> +or <<search-multi-search,multi-search>> requests still require CPU resources. If +possible, submit smaller requests and allow more time between them. + +**Cancel long-running searches** + +Long-running searches can block threads in the `search` thread pool. To check +for these searches, use the <<tasks,task management API>>. + +[source,console] +---- +GET _tasks?actions=*search&detailed +---- + +The response's `description` contains the search request and its queries. +`running_time_in_nanos` shows how long the search has been running. + +[source,console-result] +---- +{ + "nodes" : { + "oTUltX4IQMOUUVeiohTt8A" : { + "name" : "my-node", + "transport_address" : "127.0.0.1:9300", + "host" : "127.0.0.1", + "ip" : "127.0.0.1:9300", + "tasks" : { + "oTUltX4IQMOUUVeiohTt8A:464" : { + "node" : "oTUltX4IQMOUUVeiohTt8A", + "id" : 464, + "type" : "transport", + "action" : "indices:data/read/search", + "description" : "indices[my-index], search_type[QUERY_THEN_FETCH], source[{\"query\":...}]", + "start_time_in_millis" : 4081771730000, + "running_time_in_nanos" : 13991383, + "cancellable" : true + } + } + } + } +} +---- +// TESTRESPONSE[skip: no way to get tasks] + +To cancel a search and free up resources, use the API's `_cancel` endpoint. + +[source,console] +---- +POST _tasks/oTUltX4IQMOUUVeiohTt8A:464/_cancel +---- + +For additional tips on how to track and avoid resource-intensive searches, see +<<avoid-expensive-searches,Avoid expensive searches>>. + [discrete] [[high-jvm-memory-pressure]] === High JVM memory pressure @@ -141,6 +243,7 @@ Every shard uses memory. 
In most cases, a small set of large shards uses fewer resources than many small shards. For tips on reducing your shard count, see <<size-your-shards>>. +[[avoid-expensive-searches]] **Avoid expensive searches** Expensive searches can use large amounts of memory. To better track expensive @@ -439,3 +542,47 @@ POST _cluster/reroute If you backed up the missing index data to a snapshot, use the <<restore-snapshot-api,restore snapshot API>> to restore the individual index. Alternatively, you can index the missing data from the original data source. + +[discrete] +[[rejected-requests]] +=== Rejected requests + +When {es} rejects a request, it stops the operation and returns an error with a +`429` response code. Rejected requests are commonly caused by: + +* A <<high-cpu-usage,depleted thread pool>>. A depleted `search` or `write` +thread pool returns a `TOO_MANY_REQUESTS` error message. + +* A <<circuit-breaker-errors,circuit breaker error>>. + +* High <<index-modules-indexing-pressure,indexing pressure>> that exceeds the +<<memory-limits,`indexing_pressure.memory.limit`>>. + +[discrete] +[[check-rejected-tasks]] +==== Check rejected tasks + +To check the number of rejected tasks for each thread pool, use the +<<cat-thread-pool,cat thread pool API>>. A high ratio of `rejected` to +`completed` tasks, particularly in the `search` and `write` thread pools, means +{es} regularly rejects requests. + +[source,console] +---- +GET /_cat/thread_pool?v=true&h=id,name,active,rejected,completed +---- + +[discrete] +[[prevent-rejected-requests]] +==== Prevent rejected requests + +**Fix high CPU and memory usage** + +If {es} regularly rejects requests and other tasks, your cluster likely has high +CPU usage or high JVM memory pressure. For tips, see <<high-cpu-usage>> and +<<high-jvm-memory-pressure>>. + +**Prevent circuit breaker errors** + +If you regularly trigger circuit breaker errors, see <<circuit-breaker-errors>> +for tips on diagnosing and preventing them. diff --git a/docs/reference/tab-widgets/cpu-usage-widget.asciidoc b/docs/reference/tab-widgets/cpu-usage-widget.asciidoc new file mode 100644 index 0000000000000..a57d45790d518 --- /dev/null +++ b/docs/reference/tab-widgets/cpu-usage-widget.asciidoc @@ -0,0 +1,40 @@ +++++ +
+
+ + +
+
+++++ + +include::cpu-usage.asciidoc[tag=cloud] + +++++ +
+ +
++++++ diff --git a/docs/reference/tab-widgets/cpu-usage.asciidoc b/docs/reference/tab-widgets/cpu-usage.asciidoc new file mode 100644 index 0000000000000..1c4913cc0f6d4 --- /dev/null +++ b/docs/reference/tab-widgets/cpu-usage.asciidoc @@ -0,0 +1,30 @@ +// tag::cloud[] +From your deployment menu, click **Performance**. The page's **CPU Usage** chart +shows your deployment's CPU usage as a percentage. + +High CPU usage can also deplete your CPU credits. CPU credits let {ess} provide +smaller clusters with a performance boost when needed. The **CPU credits** +chart shows your remaining CPU credits, measured in seconds of CPU time. + +You can also use the <<cat-nodes,cat nodes API>> to get the current CPU usage +for each node. + +// tag::cpu-usage-cat-nodes[] +[source,console] +---- +GET _cat/nodes?v=true&s=cpu:desc +---- + +The response's `cpu` column contains the current CPU usage as a percentage. The +`node` column contains the node's name. +// end::cpu-usage-cat-nodes[] + +// end::cloud[] + +// tag::self-managed[] + +Use the <<cat-nodes,cat nodes API>> to get the current CPU usage for each node. + +include::cpu-usage.asciidoc[tag=cpu-usage-cat-nodes] + +// end::self-managed[]