Skip to content

Commit

Permalink
rpc: ConnHealth now kicks off a connection attempt
Browse files Browse the repository at this point in the history
When #22658 changed ConnHealth to be pessimistic instead of
optimistic, it meant that distsql could theoretically get stuck in a
state without connections to the necessary nodes (distsql would never
initiate connections on its own; it only attempts to use connections
for which ConnHealth returns true so it was effectively relying on
raft/kv to initiate these connections).

Now ConnHealth will attempt to start a connection process if none is
in flight to ensure that we will eventually discover the health of any
address we are concerned about.

Updated the ConnHealth docstring to reflect this change and the change
to pessimistic behavior.

Fixes #23829

Release note: None
  • Loading branch information
bdarnell committed Apr 17, 2018
1 parent 96295ac commit 98b99aa
Show file tree
Hide file tree
Showing 4 changed files with 22 additions and 35 deletions.
2 changes: 1 addition & 1 deletion pkg/kv/transport.go
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ func grpcTransportFactoryImpl(
})
}

// Put known-unhealthy clients last.
// Put known-healthy clients first.
splitHealthy(clients)

return &grpcTransport{
Expand Down
34 changes: 11 additions & 23 deletions pkg/rpc/context.go
Original file line number Diff line number Diff line change
Expand Up @@ -529,38 +529,26 @@ func (ctx *Context) NewBreaker() *circuit.Breaker {
return newBreaker(&ctx.breakerClock)
}

// ErrNotConnected is returned by ConnHealth when there is no connection to the
// host (e.g. GRPCDial was never called for that address, or a connection has
// been closed and not reconnected).
var ErrNotConnected = errors.New("not connected")

// ErrNotHeartbeated is returned by ConnHealth when we have not yet performed
// the first heartbeat.
var ErrNotHeartbeated = errors.New("not yet heartbeated")

// ConnHealth returns whether the most recent heartbeat succeeded or not.
// This should not be used as a definite status of a node's health and just used
// to prioritize healthy nodes over unhealthy ones.
//
// NB: as of #22658, this does not work as you think. We kick
// connections out of the connection pool as soon as they run into an
// error, at which point their ConnHealth will reset to
// ErrNotConnected. ConnHealth does no more return a sane notion of
// "recent connection health". When it returns nil all seems well, but
// if it doesn't then this may mean that the node is simply refusing
// connections (and is thus unconnected most of the time), or that the
// node hasn't been connected to but is perfectly healthy.
//
// See #23829.
// ConnHealth returns nil if we have an open connection to the given
// target that succeeded on its most recent heartbeat. Otherwise, it
// kicks off a connection attempt (unless one is already in progress
// or we are in a backoff state) and returns an error (typically
// ErrNotHeartbeated). This is a conservative/pessimistic indicator:
// if we have not attempted to talk to the target node recently, an
// error will be returned. This method should therefore be used to
// prioritize among a list of candidate nodes, but not to filter out
// "unhealthy" nodes.
func (ctx *Context) ConnHealth(target string) error {
if ctx.GetLocalInternalServerForAddr(target) != nil {
// The local server is always considered healthy.
return nil
}
if value, ok := ctx.conns.Load(target); ok {
return value.(*Connection).heartbeatResult.Load().(heartbeatResult).err
}
return ErrNotConnected
conn := ctx.GRPCDial(target)
return conn.heartbeatResult.Load().(heartbeatResult).err
}

func (ctx *Context) runHeartbeat(
Expand Down
19 changes: 9 additions & 10 deletions pkg/rpc/context_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -254,18 +254,18 @@ func TestHeartbeatHealth(t *testing.T) {
return clientCtx.ConnHealth(remoteAddr)
})

if err := clientCtx.ConnHealth("non-existent connection"); err != ErrNotConnected {
t.Errorf("wanted ErrConnected, not %v", err)
if err := clientCtx.ConnHealth("non-existent connection"); err != ErrNotHeartbeated {
t.Errorf("wanted ErrNotHeartbeated, not %v", err)
}

if err := clientCtx.ConnHealth(clientCtx.Addr); err != ErrNotConnected {
t.Errorf("wanted ErrConnected, not %v", err)
if err := clientCtx.ConnHealth(clientCtx.Addr); err != ErrNotHeartbeated {
t.Errorf("wanted ErrNotHeartbeated, not %v", err)
}

clientCtx.SetLocalInternalServer(&internalServer{})

if err := clientCtx.ConnHealth(clientCtx.Addr); err != ErrNotConnected {
t.Errorf("wanted ErrConnected, not %v", err)
if err := clientCtx.ConnHealth(clientCtx.Addr); err != ErrNotHeartbeated {
t.Errorf("wanted ErrNotHeartbeated, not %v", err)
}
if err := clientCtx.ConnHealth(clientCtx.AdvertiseAddr); err != nil {
t.Error(err)
Expand Down Expand Up @@ -378,11 +378,10 @@ func TestHeartbeatHealthTransport(t *testing.T) {

isUnhealthy := func(err error) bool {
// Most of the time, an unhealthy connection will get
// ErrNotConnected, but there are brief periods during which we
// could get ErrNotHeartbeated (while we're trying to start a new
// connection) or one of the grpc errors below (while the old
// ErrNotHeartbeated, but there are brief periods during which we
// could get one of the grpc errors below (while the old
// connection is in the middle of closing).
if err == ErrNotConnected || err == ErrNotHeartbeated {
if err == ErrNotHeartbeated {
return true
}
// The expected code here is Unavailable, but at least on OSX you can also get
Expand Down
2 changes: 1 addition & 1 deletion pkg/sql/distsql_physical_planner.go
Original file line number Diff line number Diff line change
Expand Up @@ -578,7 +578,7 @@ func checkNodeHealth(
err = connHealth(addr)
}

if err != nil && err != rpc.ErrNotConnected && err != rpc.ErrNotHeartbeated {
if err != nil && err != rpc.ErrNotHeartbeated {
// This host is known to be unhealthy. Don't use it (use the gateway
// instead). Note: this can never happen for our nodeID (which
// always has its address in the nodeMap).
Expand Down

0 comments on commit 98b99aa

Please sign in to comment.