-
Notifications
You must be signed in to change notification settings - Fork 1.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Cancel in-flight search tasks due to search backpressure with 429 sta… #6634
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -50,13 +50,15 @@ | |
import org.opensearch.common.lease.Releasables; | ||
import org.opensearch.common.util.concurrent.AbstractRunnable; | ||
import org.opensearch.common.util.concurrent.AtomicArray; | ||
import org.opensearch.common.util.concurrent.OpenSearchRejectedExecutionException; | ||
import org.opensearch.index.shard.ShardId; | ||
import org.opensearch.search.SearchPhaseResult; | ||
import org.opensearch.search.SearchShardTarget; | ||
import org.opensearch.search.internal.AliasFilter; | ||
import org.opensearch.search.internal.InternalSearchResponse; | ||
import org.opensearch.search.internal.SearchContext; | ||
import org.opensearch.search.internal.ShardSearchRequest; | ||
import org.opensearch.tasks.TaskCancelledException; | ||
import org.opensearch.transport.Transport; | ||
|
||
import java.util.ArrayDeque; | ||
|
@@ -370,6 +372,15 @@ public final void executeNextPhase(SearchPhase currentPhase, SearchPhase nextPha | |
: OpenSearchException.guessRootCauses(shardSearchFailures[0].getCause())[0]; | ||
logger.debug(() -> new ParameterizedMessage("All shards failed for phase: [{}]", getName()), cause); | ||
onPhaseFailure(currentPhase, "all shards failed", cause); | ||
} else if (getTask().isCancelled()) { | ||
// checking if the task handling the search got cancelled. Adding this check only while starting the next phase to avoid | ||
// slowing down the search operation | ||
String reason = getTask().getReasonCancelled(); | ||
onPhaseFailure( | ||
currentPhase, | ||
"SearchTask was cancelled", | ||
new TaskCancelledException(new OpenSearchRejectedExecutionException("cancelled task with reason: " + reason)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This layered exceptions wrapping does not look right, I think we should introduce dedicated exception (fe |
||
); | ||
} else { | ||
Boolean allowPartialResults = request.allowPartialSearchResults(); | ||
assert allowPartialResults != null : "SearchRequest missing setting for allowPartialSearchResults"; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -43,6 +43,7 @@ | |
import org.opensearch.cluster.node.DiscoveryNode; | ||
import org.opensearch.common.io.stream.StreamInput; | ||
import org.opensearch.common.io.stream.StreamOutput; | ||
import org.opensearch.common.util.concurrent.OpenSearchRejectedExecutionException; | ||
import org.opensearch.threadpool.ThreadPool; | ||
import org.opensearch.transport.EmptyTransportResponseHandler; | ||
import org.opensearch.transport.TransportChannel; | ||
|
@@ -63,6 +64,8 @@ | |
*/ | ||
public class TaskCancellationService { | ||
public static final String BAN_PARENT_ACTION_NAME = "internal:admin/tasks/ban"; | ||
public static final String REASON_PARENT_CANCELLED_HIGH_RESOURCE_CONSUMPTION = | ||
"The parent task was cancelled due to high resource consumption"; | ||
private static final Logger logger = LogManager.getLogger(TaskCancellationService.class); | ||
private final TransportService transportService; | ||
private final TaskManager taskManager; | ||
|
@@ -88,7 +91,13 @@ void cancelTaskAndDescendants(CancellableTask task, String reason, boolean waitF | |
logger.trace("cancelling task [{}] and its descendants", taskId); | ||
StepListener<Void> completedListener = new StepListener<>(); | ||
GroupedActionListener<Void> groupedListener = new GroupedActionListener<>(ActionListener.map(completedListener, r -> null), 3); | ||
Collection<DiscoveryNode> childrenNodes = taskManager.startBanOnChildrenNodes(task.getId(), () -> { | ||
String cancelChildTaskReason = reason; | ||
// if parent task gets cancelled due to high resource consumption, child task should be cancelled saying parent task was | ||
// cancelled | ||
if (reason.contains("usage exceeded")) { | ||
cancelChildTaskReason = REASON_PARENT_CANCELLED_HIGH_RESOURCE_CONSUMPTION; | ||
} | ||
Collection<DiscoveryNode> childrenNodes = taskManager.startBanOnChildrenNodes(task.getId(), cancelChildTaskReason, () -> { | ||
logger.trace("child tasks of parent [{}] are completed", taskId); | ||
groupedListener.onResponse(null); | ||
}); | ||
|
@@ -97,7 +106,7 @@ void cancelTaskAndDescendants(CancellableTask task, String reason, boolean waitF | |
groupedListener.onResponse(null); | ||
}); | ||
StepListener<Void> banOnNodesListener = new StepListener<>(); | ||
setBanOnNodes(reason, waitForCompletion, task, childrenNodes, banOnNodesListener); | ||
setBanOnNodes(cancelChildTaskReason, waitForCompletion, task, childrenNodes, banOnNodesListener); | ||
banOnNodesListener.whenComplete(groupedListener::onResponse, groupedListener::onFailure); | ||
// If we start unbanning when the last child task completed and that child task executed with a specific user, then unban | ||
// requests are denied because internal requests can't run with a user. We need to remove bans with the current thread context. | ||
|
@@ -257,4 +266,15 @@ public void messageReceived(final BanParentTaskRequest request, final TransportC | |
} | ||
} | ||
} | ||
|
||
public static void throwTaskCancelledException(String reason) { | ||
if (isRejection(reason)) { | ||
throw new TaskCancelledException(new OpenSearchRejectedExecutionException("cancelled task with reason: " + reason)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can't we just throw |
||
} | ||
throw new TaskCancelledException(reason); | ||
} | ||
|
||
private static boolean isRejection(String reason) { | ||
return (reason.contains("usage exceeded") || REASON_PARENT_CANCELLED_HIGH_RESOURCE_CONSUMPTION.equals(reason)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. please make There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ack |
||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can we avoid
TaskCancelledException
inOpenSearchException
? This looks like an anti pattern.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
One alternate here is to make it generic and throw the underlying cause's status in all cases till level 2 . We need to think about whether it is the right behavior and also make sure it doesn't cause any regression in existing code (ITs would make sure of that). Something on the below lines.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
trying this
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The other better option is to override this in
TaskCancelledException
.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@PritLadani Can you explore the second suggestion to do the override in TaskCancelledException? The code snippet above doesn't look quite right because it calls an "unwrapCause()" method but then proceeds to do another level of cause unwrapping. I'm really concerned about unintended side effects of that approach, and I also don't have a ton of confidence that ITs would catch every possible regression.