[wip]: distsql: consult liveness during physical planning
WIP because it still needs a test.

----

The recent PR cockroachdb#22658 introduced a regression in
`(*rpcContext).ConnHealth` that caused DistSQL to keep planning on
unavailable nodes for about an hour (`ttlNodeDescriptorGossip`) whenever
the leaseholder cache happened not to be updated by other, non-DistSQL
requests.

Instead, consult node liveness and avoid planning on dead nodes. This
reduces the problem to a window of less than ten seconds. The defunct
`ConnHealth` mechanism still prevents planning on such nodes in some
cases (supposedly thanks to a once-per-second reconnection policy) and is
retained for that reason, with issue cockroachdb#23829 filed to decide
its future.

NB: I'm not adding a release note since this was introduced after 1.1.
We did release it in a beta, though, so it may be worth calling out there.

Touches cockroachdb#23601. (Not marked as fixing it, because that issue
should only be closed once there is a roachtest.)

Release note (bug fix): Avoid DistSQL errors caused by planning queries
on unavailable nodes. This fixes a regression introduced in a 2.0 beta;
it is not present in 1.1.
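
To summarize the mechanism described above, the per-node decision the planner ends up making is roughly the following. This is a condensed, hypothetical sketch of the logic added to `checkNodeHealth` below; the package, type, and function names are illustrative and not part of the change.

```go
package distsqlhealth // hypothetical package, for illustration only

import "errors"

// Sentinel errors standing in for rpc.ErrNotConnected / rpc.ErrNotHeartbeated.
var (
	errNotConnected   = errors.New("not connected")
	errNotHeartbeated = errors.New("not yet heartbeated")
)

// planningTarget is an illustrative stand-in for what checkNodeHealth
// consults about a candidate node; it is not a type in this commit.
type planningTarget struct {
	inGossip      bool  // is the node descriptor still gossiped?
	connHealthErr error // result of rpcContext.ConnHealth (unreliable, see #22658)
	live          bool  // does the node's liveness record say it is live?
}

// usableForDistSQL mirrors the shape of the new check: gossip presence,
// a best-effort ConnHealth check, and finally node liveness.
func usableForDistSQL(t planningTarget) error {
	if !t.inGossip {
		return errors.New("node not in gossip; it may have left the cluster")
	}
	// ConnHealth is only trusted when it reports a "real" error; the
	// not-connected / not-heartbeated cases are inconclusive.
	if err := t.connHealthErr; err != nil && err != errNotConnected && err != errNotHeartbeated {
		return err
	}
	// The liveness check is what bounds the stale-planning window to <10s.
	if !t.live {
		return errors.New("node is not live")
	}
	return nil
}
```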
tbg committed Mar 14, 2018
1 parent 090c805 commit 7129623
Showing 3 changed files with 58 additions and 13 deletions.
11 changes: 11 additions & 0 deletions pkg/rpc/context.go
@@ -541,6 +541,17 @@ var ErrNotHeartbeated = errors.New("not yet heartbeated")
// ConnHealth returns whether the most recent heartbeat succeeded or not.
// This should not be treated as a definitive statement of a node's health; it is
// only meant to prioritize healthy nodes over unhealthy ones.
//
// NB: as of #22658, this does not work as you might expect. We kick
// connections out of the connection pool as soon as they run into an
// error, at which point their ConnHealth resets to ErrNotConnected.
// ConnHealth therefore no longer returns a sane notion of "recent
// connection health". When it returns nil, all seems well, but when it
// returns an error, that may mean either that the node is simply
// refusing connections (and is thus unconnected most of the time), or
// that the node hasn't been connected to yet but is perfectly healthy.
//
// See #23829.
func (ctx *Context) ConnHealth(target string) error {
if ctx.GetLocalInternalServerForAddr(target) != nil {
// The local server is always considered healthy.
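
Given the revised semantics documented above, a caller that still consults `ConnHealth` has to treat `ErrNotConnected` and `ErrNotHeartbeated` as "unknown" rather than "unhealthy", which is what the DistSQL change below does. A minimal caller-side sketch; the helper package and function name are hypothetical and not part of this commit:

```go
package rpcutil // hypothetical helper package, for illustration only

import "github.com/cockroachdb/cockroach/pkg/rpc"

// ConnLooksUnhealthy interprets a ConnHealth result under the post-#22658
// semantics: only a "real" error is evidence of an unhealthy peer. The
// ErrNotConnected / ErrNotHeartbeated states are inconclusive, since errored
// connections are evicted from the pool immediately.
func ConnLooksUnhealthy(err error) bool {
	return err != nil && err != rpc.ErrNotConnected && err != rpc.ErrNotHeartbeated
}
```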
1 change: 1 addition & 0 deletions pkg/server/server.go
@@ -523,6 +523,7 @@ func NewServer(cfg Config, stopper *stop.Stopper) (*Server, error) {
s.distSender,
s.gossip,
s.stopper,
s.nodeLiveness,
sqlExecutorTestingKnobs.DistSQLPlannerKnobs,
),
ExecLogger: log.NewSecondaryLogger(nil, "sql-exec", true /*enableGc*/, false /*forceSyncWrites*/),
59 changes: 46 additions & 13 deletions pkg/sql/distsql_physical_planner.go
@@ -35,6 +35,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
"github.com/cockroachdb/cockroach/pkg/sql/sem/types"
"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
"github.com/cockroachdb/cockroach/pkg/storage"
"github.com/cockroachdb/cockroach/pkg/util"
"github.com/cockroachdb/cockroach/pkg/util/encoding"
"github.com/cockroachdb/cockroach/pkg/util/envutil"
@@ -83,6 +84,8 @@ type DistSQLPlanner struct {

// gossip handle used to check node version compatibility.
gossip *gossip.Gossip
// liveness is used to avoid planning on down nodes.
liveness *storage.NodeLiveness
}

const resolverPolicy = distsqlplan.BinPackingLeaseHolderChoice
@@ -123,8 +126,12 @@ func NewDistSQLPlanner(
distSender *kv.DistSender,
gossip *gossip.Gossip,
stopper *stop.Stopper,
liveness *storage.NodeLiveness,
testingKnobs DistSQLPlannerTestingKnobs,
) *DistSQLPlanner {
if liveness == nil {
panic("must specify liveness")
}
dsp := &DistSQLPlanner{
planVersion: planVersion,
st: st,
@@ -134,6 +141,7 @@
distSQLSrv: distSQLSrv,
gossip: gossip,
spanResolver: distsqlplan.NewSpanResolver(distSender, gossip, nodeDesc, resolverPolicy),
liveness: liveness,
testingKnobs: testingKnobs,
}
dsp.initRunners()
@@ -515,26 +523,51 @@ type spanPartition struct {
func (dsp *DistSQLPlanner) checkNodeHealth(
ctx context.Context, nodeID roachpb.NodeID, addr string,
) error {
// Check if the target's node descriptor is gossiped. If it isn't, the node
// is definitely gone and has been for a while.
//
// TODO(tschottdorf): it's not clear that this adds anything to the liveness
// check below. The node descriptor TTL is an hour at the time of writing.
if _, err := dsp.gossip.GetNodeIDAddress(nodeID); err != nil {
log.VEventf(ctx, 1, "not using n%d because gossip doesn't know about it. "+
"It might have gone away from the cluster. Gossip said: %s.", nodeID, err)
return err
}

{
// NB: as of #22658, ConnHealth does not work as expected; see the
// comment within. We still keep this code for now because in
// practice, once the node is down it will prevent using this node
// 90% of the time (it gets used around once per second as an
// artifact of rpcContext's reconnection mechanism at the time of
// writing). This is better than having it used in 100% of cases
// (until the liveness check below kicks in).
var err error
if dsp.testingKnobs.OverrideHealthCheck != nil {
err = dsp.testingKnobs.OverrideHealthCheck(nodeID, addr)
} else {
err = dsp.rpcContext.ConnHealth(addr)
}

if err != nil && err != rpc.ErrNotConnected && err != rpc.ErrNotHeartbeated {
// This host is known to be unhealthy. Don't use it (use the gateway
// instead). Note: this can never happen for our nodeID (which
// always has its address in the nodeMap).
log.VEventf(ctx, 1, "marking n%d as unhealthy for this plan: %v", nodeID, err)
return err
}
}

// NB: not all tests populate a NodeLiveness. Everything using the
// proper constructor NewDistSQLPlanner will, though.
if dsp.liveness != nil {
live, err := dsp.liveness.IsLive(nodeID)
if err == nil && !live {
err = errors.New("node is not live")
}
if err != nil {
return errors.Wrapf(err, "not using n%d due to liveness", nodeID)
}
}

// Check that the node is not draining.
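
Since the commit is marked WIP pending a test, one plausible hook is the existing `OverrideHealthCheck` testing knob that `checkNodeHealth` already consults. The sketch below shows how a test might inject an "unhealthy" node; the knob's field type is assumed from its call site above (it is not spelled out in this diff), and the package name is hypothetical.

```go
package sqltests // hypothetical test helper package, for illustration only

import (
	"errors"

	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/sql"
)

// distSQLKnobsFailingNode returns planner testing knobs under which health
// checks for the given node fail, so DistSQL planning avoids it. The field
// type of OverrideHealthCheck is assumed to match its call site in
// checkNodeHealth.
func distSQLKnobsFailingNode(deadNode roachpb.NodeID) sql.DistSQLPlannerTestingKnobs {
	return sql.DistSQLPlannerTestingKnobs{
		OverrideHealthCheck: func(node roachpb.NodeID, addr string) error {
			if node == deadNode {
				return errors.New("injected failure: node is unreachable")
			}
			return nil
		},
	}
}
```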
