Skip to content

Commit

Permalink
[DNM] rpc/nodedialer: avoid tripping context on context.Canceled
Browse files Browse the repository at this point in the history
Marked DNM because Testing is coming, nodedialer currently has no testing and
I'm going to change that. Just want to get this change out there to start the
conversation and make myself more visibly accountable.

In cockroachdb#34026 we added logic to ensure that the context was not canclled before
calling in to GRPCDial. This change extends that to also avoid calling
breaker.Fail if the returned error was context.Canceled.

The motivation for this is an observation that breakers can trip due to context
cancellation which race with calls to Dial (from say DistSQL). This behavior
was observed after a node died due to an unrelated corruption bug. It appears
that this node failure triggered a context cancellation which then tripped a
breaker which then lead to a different flow to fail which then lead to another
cancellation which seems to have then tripped another breaker. The evidence for
this exact serioes of events is somewhat scant but we do know for certain that
we saw breakers tripped due to context cancelled which seems wrong.

```
ip-172-31-44-174> I190305 14:53:47.387863 150672 vendor/github.com/cockroachdb/circuitbreaker/circuitbreaker.go:322  [n6] circuitbreaker: rpc [::]:26257->2 tripped: failed to grpc dial n2 at ip-172-31-34-81:26257: context canceled
```

This change also cosmetically refactors DialInternalClient and Dial to share
some copy-pasted code which was becoming burdensome.

Release note: None
  • Loading branch information
ajwerner committed Mar 5, 2019
1 parent f0f024d commit 76c5ea1
Showing 1 changed file with 41 additions and 34 deletions.
75 changes: 41 additions & 34 deletions pkg/rpc/nodedialer/nodedialer.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,33 +83,13 @@ func (n *Dialer) Dial(ctx context.Context, nodeID roachpb.NodeID) (_ *grpc.Clien
return nil, ctxErr
}
breaker := n.getBreaker(nodeID)

if !breaker.Ready() {
err := errors.Wrapf(circuit.ErrBreakerOpen, "unable to dial n%d", nodeID)
return nil, err
}

defer func() {
// Enforce a minimum interval between warnings for failed connections.
if err != nil && breaker.ShouldLog() {
log.Infof(ctx, "unable to connect to n%d: %s", nodeID, err)
}
}()

addr, err := n.resolver(nodeID)
if err != nil {
err = errors.Wrapf(err, "failed to resolve n%d", nodeID)
breaker.Fail(err)
return nil, err
}
conn, err := n.rpcContext.GRPCDial(addr.String()).Connect(ctx)
if err != nil {
err = errors.Wrapf(err, "failed to grpc dial n%d at %v", nodeID, addr)
breaker.Fail(err)
return nil, err
}
breaker.Success()
return conn, nil
return n.dial(ctx, nodeID, addr, breaker)
}

// DialInternalClient is a specialization of Dial for callers that
Expand All @@ -124,10 +104,6 @@ func (n *Dialer) DialInternalClient(
if n == nil || n.resolver == nil {
return nil, nil, errors.New("no node dialer configured")
}
// Don't trip the breaker if we're already canceled.
if ctxErr := ctx.Err(); ctxErr != nil {
return nil, nil, ctxErr
}
addr, err := n.resolver(nodeID)
if err != nil {
return nil, nil, err
Expand All @@ -141,32 +117,63 @@ func (n *Dialer) DialInternalClient(

return localCtx, localClient, nil
}

breaker := n.getBreaker(nodeID)

log.VEventf(ctx, 2, "sending request to %s", addr)
conn, err := n.rpcContext.GRPCDial(addr.String()).Connect(ctx)
conn, err := n.dial(ctx, nodeID, addr, n.getBreaker(nodeID))
if err != nil {
err = errors.Wrapf(err, "failed to connect to n%d at %v", nodeID, addr)
breaker.Fail(err)
return nil, nil, err
}
return ctx, roachpb.NewInternalClient(conn), err
}

// dial performs the dialing of the remove connection.
func (n *Dialer) dial(
ctx context.Context, nodeID roachpb.NodeID, addr net.Addr, breaker *wrappedBreaker,
) (_ *grpc.ClientConn, err error) {
// Don't trip the breaker if we're already canceled.
if ctxErr := ctx.Err(); ctxErr != nil {
return nil, ctxErr
}
if !breaker.Ready() {
err = errors.Wrapf(circuit.ErrBreakerOpen, "unable to dial n%d", nodeID)
return nil, err
}
defer func() {
// Enforce a minimum interval between warnings for failed connections.
if err != nil && breaker.ShouldLog() {
log.Infof(ctx, "unable to connect to n%d: %s", nodeID, err)
}
}()
conn, err := n.rpcContext.GRPCDial(addr.String()).Connect(ctx)
if err != nil {
// If we were cancelled during the dial, don't trip the breaker.
// NB: If the underlying transport were to have its context cancelled it
// would carry an error wrapped with GRPC information. While it might seem
// that a context cancellation might imply something is amiss with the
// connection it is not a clear enough signal. We will however let
// ErrDeadlineExceeded trip the breaker.
if err != ctx.Err() || ctx.Err() == context.DeadlineExceeded {
err = errors.Wrapf(err, "failed to connect to n%d at %v", nodeID, addr)
breaker.Fail(err)
}
return nil, err
}
// Check to see if the connection is in the transient failure state. This can
// happen if the connection already existed, but a recent heartbeat has
// failed and we haven't yet torn down the connection.
if err := grpcutil.ConnectionReady(conn); err != nil {
err = errors.Wrapf(err, "failed to check for connection ready to n%d at %v", nodeID, addr)
err = errors.Wrapf(err, "failed to check for ready connection to n%d at %v", nodeID, addr)
breaker.Fail(err)
return nil, nil, err
return nil, err
}

// TODO(bdarnell): Reconcile the different health checks and circuit breaker
// behavior in this file. Note that this different behavior causes problems
// for higher-levels in the system. For example, DistSQL checks for
// ConnHealth when scheduling processors, but can then see attempts to send
// RPCs fail when dial fails due to an open breaker. Reset the breaker here
// as a stop-gap before the reconciliation occurs.
breaker.Success()
return ctx, roachpb.NewInternalClient(conn), nil
return conn, nil
}

// ConnHealth returns nil if we have an open connection to the given node
Expand Down

0 comments on commit 76c5ea1

Please sign in to comment.