diff --git a/pkg/kv/kvserver/liveness/liveness.go b/pkg/kv/kvserver/liveness/liveness.go index 150748ecc2d0..291e2daa5a1e 100644 --- a/pkg/kv/kvserver/liveness/liveness.go +++ b/pkg/kv/kvserver/liveness/liveness.go @@ -802,7 +802,7 @@ func (nl *NodeLiveness) Start(ctx context.Context, opts NodeLivenessStartOptions }) } -const heartbeatFailureLogFormat = `failed node liveness heartbeat: %+v +const heartbeatFailureLogFormat = `failed node liveness heartbeat: %v An inability to maintain liveness will prevent a node from participating in a cluster. If this problem persists, it may be a sign of resource starvation or diff --git a/pkg/rpc/context.go b/pkg/rpc/context.go index 320c71fa1ae0..097cb297c42e 100644 --- a/pkg/rpc/context.go +++ b/pkg/rpc/context.go @@ -2496,6 +2496,7 @@ func (rpcCtx *Context) runHeartbeat( } if err != nil { + log.VEventf(ctx, 2, "received error on ping response from n%d, %v", conn.remoteNodeID, err) return err } @@ -2604,8 +2605,8 @@ func (rpcCtx *Context) NewHeartbeatService() *HeartbeatService { } } -// VerifyDialback verifies connectivity from the recipient of a PingRequest -// back to the sender. If there is already a connection in place, it will return +// VerifyDialback verifies connectivity from the recipient of a PingRequest back +// to the sender. If there is already a connection in place, it will return // immediately without error. If there is no connection in place and the // NeedsDialback on the PingRequest is not set to NONE, then it will establish a // connection in either blocking or non-blocking mode. @@ -2614,6 +2615,9 @@ func (rpcCtx *Context) NewHeartbeatService() *HeartbeatService { // established. // NON_BLOCKING mode will attempt to establish a reverse connection and send the // result on the next PingRequest that is sent on this connection. +// This method keeps track of non blocking attempts in the dialbackMu and will +// clear out any pending attempts as soon as a successful connection is +// established. func (rpcCtx *Context) VerifyDialback( ctx context.Context, request *PingRequest, response *PingResponse, locality roachpb.Locality, ) error { @@ -2623,71 +2627,81 @@ func (rpcCtx *Context) VerifyDialback( baseAddr := util.UnresolvedAddr{NetworkField: "tcp", AddressField: request.OriginAddr} target := locality.LookupAddress(request.LocalityAddress, &baseAddr).AddressField + // nodeID may be null for "bootstrapping" requests. In that case we always + // assume blocking mode since we can't track connection attempts. nodeID := request.OriginNodeID - // Initially the nodeID might not be set since it is assigned by the cluster - // not the node. In that case, we can't look up if we have a connection to the - // node and instead need to always try dialback. - if nodeID != 0 { - prevErr, found := rpcCtx.previousAttempt(nodeID) - if found { - return prevErr - } + // Check in our regular connection map to see if we are healthy. We use the + // System class because that is what is important from a liveness perspective. + // If we are unable to maintain a healthy connection on the System class we + // will fail other connections also. + connHealthErr := rpcCtx.ConnHealth(target, nodeID, SystemClass) - // Check in our regular connection map to see if we are healthy. We only - // care about the SystemClass because that is what is important from a Raft - // liveness perspective. - err := rpcCtx.ConnHealth(target, nodeID, SystemClass) - // We have a successful connection, nothing else to do. - if err == nil { - return nil - } - log.VEventf(ctx, 2, "unable to verify health on open conn, trying dialback conn to %s, n%d, %v", target, nodeID, err) + // We have a successful connection so report success. Any ongoing attempts no + // longer need to be tracked. + if connHealthErr == nil { + rpcCtx.clearPreviousAttempt(nodeID) + return nil } + log.VEventf(ctx, 2, "unable to verify health on existing conn, trying dialback conn to %s, n%d mode %v, %v", + target, nodeID, request.NeedsDialback, connHealthErr) + if nodeID == 0 || request.NeedsDialback == PingRequest_BLOCKING { // Since we don't have a successful reverse connection, try and dial back // manually. We don't use the regular dialer pool since we don't want to wait // for heartbeats on this connection. // TODO(baptist): Consider using GRPCUnvalidatedDial and use the // WaitForStateChange to detect when the TCP connection is established. This - // will keep this connection in the pool after establishment. Note the class - // here matter since this connection is not added to a pool and immediately - // closed. + // will keep this connection in the pool after establishment. Wait until + // https://github.com/grpc/grpc-go/issues/5496 is completed. ctx := rpcCtx.makeDialCtx(target, 0, SystemClass) conn, err := rpcCtx.grpcDialRaw(ctx, target, SystemClass, grpc.WithBlock()) if err != nil { - log.Warningf(ctx, "dialback connection failed to %s, n%d, %v", target, nodeID, err) + log.Infof(ctx, "blocking dialback connection failed to %s, n%d, %v", target, nodeID, err) return err } + log.VEventf(ctx, 2, "blocking dialback connection to n%d succeeded", nodeID) + // Clear any previous attempts since we are known to be able to initiate a + // TCP connection. + rpcCtx.clearPreviousAttempt(nodeID) _ = conn.Close() // nolint:grpcconnclose return nil } else { - // We don't have a previous attempt and the current health was not healthy, - // but we can't just trust that because there might not be new connection - // attempts. Instead, establish a new connection using the standard - // GRPCDialNode, this connection is added to the connection pool. Always - // return success on this ping, but check this connection attempt on future - // pings. Use the SystemClass to ensure that Raft traffic is not - // interrupted. It is unusual for some classes to be affected and not others - // but the SystemClass is the one we really care about. + // If the previous attempt ended in an error, we can confidently report we + // are unable to dialback. If the attempt is still ongoing, then we want + // to allow it to finish. Once it has finished, we will leave this error + // here until the connection health becomes healthy either through + // checking the health manually or a blocking ping succeeding. + return rpcCtx.loadOrCreateConnAttempt(nodeID, func() *Connection { + return rpcCtx.GRPCDialNode(target, nodeID, SystemClass) + }) + } +} + +// clearPreviousAttempt will clear out any previous errors on connection +// attempts. This is only done after we have verified we have a healthy +// established connection to the sender of this ping. +func (rpcCtx *Context) clearPreviousAttempt(nodeID roachpb.NodeID) { + if nodeID > 0 { rpcCtx.dialbackMu.Lock() defer rpcCtx.dialbackMu.Unlock() - rpcCtx.dialbackMu.m[nodeID] = rpcCtx.GRPCDialNode(target, nodeID, SystemClass) - return nil + rpcCtx.dialbackMu.m[nodeID] = nil } } -// previousAttempt checks if any prior attempt that started but hadn't complete -// has now completed. Until this attempt completes, the Pings will continue to -// return success. Once this completes, we remove this from our map and return -// whatever error this attempt returned. -func (rpcCtx *Context) previousAttempt(nodeID roachpb.NodeID) (error, bool) { - // Check if there was a previous attempt and if so use that and clear out - // the previous attempt. +// loadOrCreateConnAttempt checks if we have an in-progress connection attempt +// to a store, and if not will create a connection and store it in the map. It +// takes a function to create a connection because the connection is only +// created in the case where it doesn't already exist. If there is already a +// ongoing connection attempt, it will instead check the status of that attempt. +// If it is completed and is in error, then it will return that error, if it is +// still ongoing, then it returns nil to signify that it might be healthy. +func (rpcCtx *Context) loadOrCreateConnAttempt( + nodeID roachpb.NodeID, createConnFunc func() *Connection, +) error { rpcCtx.dialbackMu.Lock() defer rpcCtx.dialbackMu.Unlock() - previousAttempt := rpcCtx.dialbackMu.m[nodeID] // Check if the previous connection is completed (successfully or not). This // happens only on subsequent pings after not detecting a healthy reverse @@ -2697,19 +2711,26 @@ func (rpcCtx *Context) previousAttempt(nodeID roachpb.NodeID) (error, bool) { // happen if our previous connect attempt failed between pings. Without this // protection we would continually try opening new dialback connections, but // never observe the result. - if previousAttempt != nil { + if previousAttempt := rpcCtx.dialbackMu.m[nodeID]; previousAttempt != nil { select { case <-previousAttempt.initialHeartbeatDone: // The connection attempt was completed, return the outcome of it. err, _ := previousAttempt.err.Load().(error) - rpcCtx.dialbackMu.m[nodeID] = nil - return err, true + if err == nil { + // If it completed without error then don't track the connection + // anymore. If it did have an error we need to track it until it later gets cleared. + rpcCtx.dialbackMu.m[nodeID] = nil + } + return err default: // We still don't know the outcome of the previous attempt. For now - // allow this Ping to continue and check on the following attempt. - return nil, true + // allow this attempt to continue and check in the future. + return nil } } - // There is no previous attempt in place. - return nil, false + + // There is no previous attempt in place. Create a connection and store it for + // the future, for now return success. + rpcCtx.dialbackMu.m[nodeID] = createConnFunc() + return nil } diff --git a/pkg/rpc/heartbeat.go b/pkg/rpc/heartbeat.go index 9653dbe9f9dc..ffe8b8d39b4a 100644 --- a/pkg/rpc/heartbeat.go +++ b/pkg/rpc/heartbeat.go @@ -171,6 +171,7 @@ func (hs *HeartbeatService) Ping(ctx context.Context, request *PingRequest) (*Pi if fn := hs.onHandlePing; fn != nil { if err := fn(ctx, request, &response); err != nil { + log.Infof(ctx, "failing ping request from node n%d", request.OriginNodeID) return nil, err } }