Skip to content

Commit

Permalink
rpc/nodedialer: avoid tripping breaker on context errors
Browse files Browse the repository at this point in the history
In #34026 we added logic to ensure that the context was not canclled before
calling in to GRPCDial. This change extends that to also avoid calling
breaker.Fail if the returned error was context.Canceled.

The motivation for this is an observation that breakers can trip due to context
cancellation which race with calls to Dial (from say DistSQL). This behavior
was observed after a node died due to an unrelated corruption bug. It appears
that this node failure triggered a context cancellation which then tripped a
breaker which then lead to a different flow to fail which then lead to another
cancellation which seems to have then tripped another breaker. The evidence for
this exact serioes of events is somewhat scant but we do know for certain that
we saw breakers tripped due to context cancelled which seems wrong.

```
ip-172-31-44-174> I190305 14:53:47.387863 150672 vendor/github.com/cockroachdb/circuitbreaker/circuitbreaker.go:322  [n6] circuitbreaker: rpc [::]:26257->2 tripped: failed to grpc dial n2 at ip-172-31-34-81:26257: context canceled
```

This change also cosmetically refactors DialInternalClient and Dial to share
some copy-pasted code which was becoming burdensome.

Lastly this change adds a bunch of unit testing to the nodedialer package.

Release note: None
  • Loading branch information
ajwerner committed Mar 7, 2019
1 parent f0f024d commit 72aeb7f
Show file tree
Hide file tree
Showing 3 changed files with 353 additions and 32 deletions.
67 changes: 35 additions & 32 deletions pkg/rpc/nodedialer/nodedialer.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,33 +83,13 @@ func (n *Dialer) Dial(ctx context.Context, nodeID roachpb.NodeID) (_ *grpc.Clien
return nil, ctxErr
}
breaker := n.getBreaker(nodeID)

if !breaker.Ready() {
err := errors.Wrapf(circuit.ErrBreakerOpen, "unable to dial n%d", nodeID)
return nil, err
}

defer func() {
// Enforce a minimum interval between warnings for failed connections.
if err != nil && breaker.ShouldLog() {
log.Infof(ctx, "unable to connect to n%d: %s", nodeID, err)
}
}()

addr, err := n.resolver(nodeID)
if err != nil {
err = errors.Wrapf(err, "failed to resolve n%d", nodeID)
breaker.Fail(err)
return nil, err
}
conn, err := n.rpcContext.GRPCDial(addr.String()).Connect(ctx)
if err != nil {
err = errors.Wrapf(err, "failed to grpc dial n%d at %v", nodeID, addr)
breaker.Fail(err)
return nil, err
}
breaker.Success()
return conn, nil
return n.dial(ctx, nodeID, addr, breaker)
}

// DialInternalClient is a specialization of Dial for callers that
Expand All @@ -124,10 +104,6 @@ func (n *Dialer) DialInternalClient(
if n == nil || n.resolver == nil {
return nil, nil, errors.New("no node dialer configured")
}
// Don't trip the breaker if we're already canceled.
if ctxErr := ctx.Err(); ctxErr != nil {
return nil, nil, ctxErr
}
addr, err := n.resolver(nodeID)
if err != nil {
return nil, nil, err
Expand All @@ -141,32 +117,59 @@ func (n *Dialer) DialInternalClient(

return localCtx, localClient, nil
}

breaker := n.getBreaker(nodeID)

log.VEventf(ctx, 2, "sending request to %s", addr)
conn, err := n.dial(ctx, nodeID, addr, n.getBreaker(nodeID))
if err != nil {
return nil, nil, err
}
return ctx, roachpb.NewInternalClient(conn), err
}

// dial performs the dialing of the remove connection.
func (n *Dialer) dial(
ctx context.Context, nodeID roachpb.NodeID, addr net.Addr, breaker *wrappedBreaker,
) (_ *grpc.ClientConn, err error) {
// Don't trip the breaker if we're already canceled.
if ctxErr := ctx.Err(); ctxErr != nil {
return nil, ctxErr
}
if !breaker.Ready() {
err = errors.Wrapf(circuit.ErrBreakerOpen, "unable to dial n%d", nodeID)
return nil, err
}
defer func() {
// Enforce a minimum interval between warnings for failed connections.
if err != nil && err != ctx.Err() && breaker.ShouldLog() {
log.Infof(ctx, "unable to connect to n%d: %s", nodeID, err)
}
}()
conn, err := n.rpcContext.GRPCDial(addr.String()).Connect(ctx)
if err != nil {
// If we were canceled during the dial, don't trip the breaker.
if ctxErr := ctx.Err(); ctxErr != nil {
return nil, ctxErr
}
err = errors.Wrapf(err, "failed to connect to n%d at %v", nodeID, addr)
breaker.Fail(err)
return nil, nil, err
return nil, err
}
// Check to see if the connection is in the transient failure state. This can
// happen if the connection already existed, but a recent heartbeat has
// failed and we haven't yet torn down the connection.
if err := grpcutil.ConnectionReady(conn); err != nil {
err = errors.Wrapf(err, "failed to check for connection ready to n%d at %v", nodeID, addr)
err = errors.Wrapf(err, "failed to check for ready connection to n%d at %v", nodeID, addr)
breaker.Fail(err)
return nil, nil, err
return nil, err
}

// TODO(bdarnell): Reconcile the different health checks and circuit breaker
// behavior in this file. Note that this different behavior causes problems
// for higher-levels in the system. For example, DistSQL checks for
// ConnHealth when scheduling processors, but can then see attempts to send
// RPCs fail when dial fails due to an open breaker. Reset the breaker here
// as a stop-gap before the reconciliation occurs.
breaker.Success()
return ctx, roachpb.NewInternalClient(conn), nil
return conn, nil
}

// ConnHealth returns nil if we have an open connection to the given node
Expand Down
Loading

0 comments on commit 72aeb7f

Please sign in to comment.