Skip to content

Commit

Permalink
HBASE-28050 RSProcedureDispatcher to fail-fast for krb auth failures (#…
Browse files Browse the repository at this point in the history
…5391)

Signed-off-by: Duo Zhang <[email protected]>
Signed-off-by: Andrew Purtell <[email protected]>
Signed-off-by: Aman Poonia <[email protected]>
Signed-off-by: David Manning <[email protected]>
  • Loading branch information
virajjasani committed Sep 29, 2023
1 parent 31974ae commit 597da71
Show file tree
Hide file tree
Showing 3 changed files with 85 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -312,7 +312,7 @@ public void operationComplete(ChannelFuture future) throws Exception {
private void sendRequest0(Call call, HBaseRpcController hrc) throws IOException {
assert eventLoop.inEventLoop();
if (reloginInProgress) {
throw new IOException("Can not send request because relogin is in progress.");
throw new IOException(RpcConnectionConstants.RELOGIN_IS_IN_PROGRESS);
}
hrc.notifyOnCancel(new RpcCallback<Object>() {

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.ipc;

import org.apache.yetus.audience.InterfaceAudience;

/**
 * Holder for string constants shared between the RPC connection implementation and the
 * utilities that need to recognize its failures. Not instantiable.
 */
@InterfaceAudience.Private
public final class RpcConnectionConstants {

  /**
   * Message of the {@code IOException} raised when a request cannot be sent because a relogin is
   * currently in progress. Callers match on this text to detect that condition.
   */
  public static final String RELOGIN_IS_IN_PROGRESS =
    "Can not send request because relogin is in progress.";

  private RpcConnectionConstants() {
    // Constants holder; never instantiated.
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,12 @@
import java.util.List;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import javax.security.sasl.SaslException;
import org.apache.hadoop.hbase.CallQueueTooBigException;
import org.apache.hadoop.hbase.DoNotRetryIOException;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.ipc.RpcConnectionConstants;
import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException;
import org.apache.hadoop.hbase.master.MasterServices;
import org.apache.hadoop.hbase.master.ServerListener;
Expand Down Expand Up @@ -292,17 +294,15 @@ private boolean scheduleForRetry(IOException e) {
numberOfAttemptsSoFar);
return false;
}
// This exception is thrown in the rpc framework, where we can make sure that the call has not
// been executed yet, so it is safe to mark it as fail. Especially for open a region, we'd
// better choose another region server.
// This category of exceptions is thrown in the rpc framework, where we can make sure
// that the call has not been executed yet, so it is safe to mark it as fail.
// Especially for open a region, we'd better choose another region server.
// Notice that, it is safe to quit only if this is the first time we send request to region
// server. Maybe the region server has accepted our request the first time, and then there is
// a network error which prevents we receive the response, and the second time we hit a
// CallQueueTooBigException, obviously it is not safe to quit here, otherwise it may lead to a
// double assign...
if (e instanceof CallQueueTooBigException && numberOfAttemptsSoFar == 0) {
LOG.warn("request to {} failed due to {}, try={}, this usually because"
+ " server is overloaded, give up", serverName, e.toString(), numberOfAttemptsSoFar);
// a network error which prevents we receive the response, and the second time we hit
// this category of exceptions, obviously it is not safe to quit here, otherwise it may lead
// to a double assign...
if (numberOfAttemptsSoFar == 0 && unableToConnectToServer(e)) {
return false;
}
// Always retry for other exception types if the region server is not dead yet.
Expand Down Expand Up @@ -335,6 +335,47 @@ private boolean scheduleForRetry(IOException e) {
return true;
}

/**
 * Decides whether the given exception belongs to the category for which we can be sure the
 * request has not been received and/or processed by the target regionserver, so that it is
 * safe to pick a different regionserver as the target. A warning is logged for every
 * exception that falls into this category.
 * @param e IOException thrown by the underlying rpc framework.
 * @return true if the exception is a {@code CallQueueTooBigException} or is recognized by
 *         {@code isSaslError}; false otherwise.
 */
private boolean unableToConnectToServer(IOException e) {
  final boolean callQueueTooBig = e instanceof CallQueueTooBigException;
  // The SASL check is only evaluated when the exception is not a call-queue overflow.
  if (!callQueueTooBig && !isSaslError(e)) {
    return false;
  }
  if (callQueueTooBig) {
    LOG.warn("request to {} failed due to {}, try={}, this usually because"
      + " server is overloaded, give up", serverName, e, numberOfAttemptsSoFar);
  } else {
    // The exception is the trailing argument without a placeholder of its own.
    LOG.warn("{} is not reachable; give up after first attempt", serverName, e);
  }
  return true;
}

/**
 * Walks the cause chain of the given exception, starting with the exception itself, looking
 * for a SASL failure or a "relogin is in progress" rejection.
 * <p>
 * Every link of the chain that is an {@code IOException} is first passed through
 * {@code unwrapException}; the result matches if it is a {@code SaslException} or if its
 * message contains {@link RpcConnectionConstants#RELOGIN_IS_IN_PROGRESS}. Links that are not
 * an {@code IOException} are skipped.
 * @param e IOException thrown by the underlying rpc framework.
 * @return true as soon as a matching link is found; false once the chain is exhausted.
 */
private boolean isSaslError(IOException e) {
  Throwable current = e;
  do {
    if (current instanceof IOException) {
      IOException unwrapped = unwrapException((IOException) current);
      if (unwrapped instanceof SaslException) {
        return true;
      }
      String message = unwrapped.getMessage();
      if (message != null && message.contains(RpcConnectionConstants.RELOGIN_IS_IN_PROGRESS)) {
        return true;
      }
    }
    current = current.getCause();
  } while (current != null);
  return false;
}

private long getMaxWaitTime() {
if (this.maxWaitTime < 0) {
// This is the max attempts, not retries, so it should be at least 1.
Expand Down

0 comments on commit 597da71

Please sign in to comment.