-
Notifications
You must be signed in to change notification settings - Fork 24.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Move monitoring collection timeouts to coordinator #67084
Changes from all commits
ab5e0d7
7b284a3
d0da26b
3c07511
7588ecd
f68ef2e
6b845ba
56ce978
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,103 @@ | ||
/* | ||
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one | ||
* or more contributor license agreements. Licensed under the Elastic License; | ||
* you may not use this file except in compliance with the Elastic License. | ||
*/ | ||
|
||
package org.elasticsearch.xpack.monitoring.collector; | ||
|
||
import org.elasticsearch.ElasticsearchException; | ||
import org.elasticsearch.ElasticsearchTimeoutException; | ||
import org.elasticsearch.action.FailedNodeException; | ||
import org.elasticsearch.action.support.DefaultShardOperationFailedException; | ||
import org.elasticsearch.action.support.broadcast.BroadcastResponse; | ||
import org.elasticsearch.action.support.nodes.BaseNodeResponse; | ||
import org.elasticsearch.action.support.nodes.BaseNodesResponse; | ||
import org.elasticsearch.action.support.tasks.BaseTasksResponse; | ||
import org.elasticsearch.common.unit.TimeValue; | ||
import org.elasticsearch.transport.ReceiveTimeoutTransportException; | ||
|
||
import java.util.HashSet; | ||
import java.util.concurrent.TimeoutException; | ||
|
||
/** | ||
* Utilities for identifying timeouts in responses to collection requests, since we prefer to fail the whole collection attempt if any of | ||
* the involved nodes times out. | ||
*/ | ||
public final class TimeoutUtils { | ||
private TimeoutUtils() { | ||
} | ||
|
||
/** | ||
* @throws ElasticsearchTimeoutException iff the {@code response} contains any node-level timeout. The exception message identifies the | ||
* nodes that timed out and mentions {@code collectionTimeout}. | ||
*/ | ||
public static <T extends BaseNodeResponse> void ensureNoTimeouts(TimeValue collectionTimeout, BaseNodesResponse<T> response) { | ||
HashSet<String> timedOutNodeIds = null; | ||
for (FailedNodeException failedNodeException : response.failures()) { | ||
if (isTimeoutFailure(failedNodeException)) { | ||
if (timedOutNodeIds == null) { | ||
timedOutNodeIds = new HashSet<>(); | ||
} | ||
timedOutNodeIds.add(failedNodeException.nodeId()); | ||
} | ||
} | ||
ensureNoTimeouts(collectionTimeout, timedOutNodeIds); | ||
} | ||
|
||
/** | ||
* @throws ElasticsearchTimeoutException iff the {@code response} contains any node-level timeout. The exception message identifies the | ||
* nodes that timed out and mentions {@code collectionTimeout}. | ||
*/ | ||
public static void ensureNoTimeouts(TimeValue collectionTimeout, BaseTasksResponse response) { | ||
HashSet<String> timedOutNodeIds = null; | ||
for (ElasticsearchException nodeFailure : response.getNodeFailures()) { | ||
if (nodeFailure instanceof FailedNodeException) { | ||
FailedNodeException failedNodeException = (FailedNodeException) nodeFailure; | ||
if (isTimeoutFailure(failedNodeException)) { | ||
if (timedOutNodeIds == null) { | ||
timedOutNodeIds = new HashSet<>(); | ||
} | ||
timedOutNodeIds.add(failedNodeException.nodeId()); | ||
} | ||
} | ||
} | ||
ensureNoTimeouts(collectionTimeout, timedOutNodeIds); | ||
} | ||
|
||
/** | ||
* @throws ElasticsearchTimeoutException iff the {@code response} contains any node-level timeout. The exception message identifies the | ||
* nodes that timed out and mentions {@code collectionTimeout}. | ||
*/ | ||
public static void ensureNoTimeouts(TimeValue collectionTimeout, BroadcastResponse response) { | ||
HashSet<String> timedOutNodeIds = null; | ||
for (DefaultShardOperationFailedException shardFailure : response.getShardFailures()) { | ||
final Throwable shardFailureCause = shardFailure.getCause(); | ||
if (shardFailureCause instanceof FailedNodeException) { | ||
FailedNodeException failedNodeException = (FailedNodeException) shardFailureCause; | ||
if (isTimeoutFailure(failedNodeException)) { | ||
if (timedOutNodeIds == null) { | ||
timedOutNodeIds = new HashSet<>(); | ||
} | ||
timedOutNodeIds.add(failedNodeException.nodeId()); | ||
} | ||
} | ||
} | ||
ensureNoTimeouts(collectionTimeout, timedOutNodeIds); | ||
} | ||
|
||
private static boolean isTimeoutFailure(FailedNodeException failedNodeException) { | ||
final Throwable cause = failedNodeException.getCause(); | ||
return cause instanceof ElasticsearchTimeoutException | ||
|| cause instanceof TimeoutException | ||
|| cause instanceof ReceiveTimeoutTransportException; | ||
} | ||
|
||
private static void ensureNoTimeouts(TimeValue collectionTimeout, HashSet<String> timedOutNodeIds) { | ||
if (timedOutNodeIds != null) { | ||
throw new ElasticsearchTimeoutException((timedOutNodeIds.size() == 1 ? "node " : "nodes ") + timedOutNodeIds + | ||
" did not respond within [" + collectionTimeout + "]"); | ||
} | ||
} | ||
|
||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,6 +23,8 @@ | |
import java.util.Collections; | ||
import java.util.Objects; | ||
|
||
import static org.elasticsearch.xpack.monitoring.collector.TimeoutUtils.ensureNoTimeouts; | ||
|
||
/** | ||
* Collector for nodes statistics. | ||
* <p> | ||
|
@@ -65,7 +67,7 @@ protected boolean shouldCollect(final boolean isElectedMaster) { | |
@Override | ||
protected Collection<MonitoringDoc> doCollect(final MonitoringDoc.Node node, | ||
final long interval, | ||
final ClusterState clusterState) throws Exception { | ||
final ClusterState clusterState) { | ||
NodesStatsRequest request = new NodesStatsRequest("_local"); | ||
request.indices(FLAGS); | ||
request.addMetrics( | ||
|
@@ -74,8 +76,10 @@ protected Collection<MonitoringDoc> doCollect(final MonitoringDoc.Node node, | |
NodesStatsRequest.Metric.PROCESS.metricName(), | ||
NodesStatsRequest.Metric.THREAD_POOL.metricName(), | ||
NodesStatsRequest.Metric.FS.metricName()); | ||
request.timeout(getCollectionTimeout()); | ||
|
||
final NodesStatsResponse response = client.admin().cluster().nodesStats(request).actionGet(getCollectionTimeout()); | ||
final NodesStatsResponse response = client.admin().cluster().nodesStats(request).actionGet(); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do we need an [...] There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Hmm, I was thinking no since we [...] There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ok done, see 56ce978. |
||
ensureNoTimeouts(getCollectionTimeout(), response); | ||
|
||
// if there's a failure, then we failed to work with the | ||
// _local node (guaranteed a single exception) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you add javadocs for this class as well as its public static methods please?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sure, see 6b845ba.