diff --git a/docs/generated/http/full.md b/docs/generated/http/full.md
index 37034516a076..b48a91915456 100644
--- a/docs/generated/http/full.md
+++ b/docs/generated/http/full.md
@@ -6193,6 +6193,7 @@ If no NodeIDs are given, it targets the recipient node.
 | ----- | ---- | ----- | ----------- | -------------- |
 | node_ids | [int32](#cockroach.server.serverpb.DecommissionRequest-int32) | repeated |  | [reserved](#support-status) |
 | target_membership | [cockroach.kv.kvserver.liveness.livenesspb.MembershipStatus](#cockroach.server.serverpb.DecommissionRequest-cockroach.kv.kvserver.liveness.livenesspb.MembershipStatus) |  |  | [reserved](#support-status) |
+| num_replica_report | [int32](#cockroach.server.serverpb.DecommissionRequest-int32) |  | The number of decommissioning replicas to be reported. | [reserved](#support-status) |
@@ -6229,6 +6230,21 @@ DecommissionStatusResponse lists decommissioning statuses for a number of NodeID
 | replica_count | [int64](#cockroach.server.serverpb.DecommissionStatusResponse-int64) |  | The number of replicas on the node, computed by scanning meta2 ranges. | [reserved](#support-status) |
 | membership | [cockroach.kv.kvserver.liveness.livenesspb.MembershipStatus](#cockroach.server.serverpb.DecommissionStatusResponse-cockroach.kv.kvserver.liveness.livenesspb.MembershipStatus) |  | The membership status of the given node. | [reserved](#support-status) |
 | draining | [bool](#cockroach.server.serverpb.DecommissionStatusResponse-bool) |  |  | [reserved](#support-status) |
+| reported_replicas | [DecommissionStatusResponse.Replica](#cockroach.server.serverpb.DecommissionStatusResponse-cockroach.server.serverpb.DecommissionStatusResponse.Replica) | repeated | Decommissioning replicas on the given node to be reported. How many replicas are reported is determined by what was specified in the request. | [reserved](#support-status) |
+
+
+
+
+
+#### DecommissionStatusResponse.Replica
+
+
+
+| Field | Type | Label | Description | Support status |
+| ----- | ---- | ----- | ----------- | -------------- |
+| replica_id | [int32](#cockroach.server.serverpb.DecommissionStatusResponse-int32) |  |  | [reserved](#support-status) |
+| range_id | [int32](#cockroach.server.serverpb.DecommissionStatusResponse-int32) |  |  | [reserved](#support-status) |
@@ -6257,6 +6273,7 @@ specified or, if none are specified, all nodes.
 | Field | Type | Label | Description | Support status |
 | ----- | ---- | ----- | ----------- | -------------- |
 | node_ids | [int32](#cockroach.server.serverpb.DecommissionStatusRequest-int32) | repeated |  | [reserved](#support-status) |
+| num_replica_report | [int32](#cockroach.server.serverpb.DecommissionStatusRequest-int32) |  | The number of decommissioning replicas to be reported. | [reserved](#support-status) |
@@ -6293,6 +6310,21 @@ DecommissionStatusResponse lists decommissioning statuses for a number of NodeID
 | replica_count | [int64](#cockroach.server.serverpb.DecommissionStatusResponse-int64) |  | The number of replicas on the node, computed by scanning meta2 ranges. | [reserved](#support-status) |
 | membership | [cockroach.kv.kvserver.liveness.livenesspb.MembershipStatus](#cockroach.server.serverpb.DecommissionStatusResponse-cockroach.kv.kvserver.liveness.livenesspb.MembershipStatus) |  | The membership status of the given node. | [reserved](#support-status) |
 | draining | [bool](#cockroach.server.serverpb.DecommissionStatusResponse-bool) |  |  | [reserved](#support-status) |
+| reported_replicas | [DecommissionStatusResponse.Replica](#cockroach.server.serverpb.DecommissionStatusResponse-cockroach.server.serverpb.DecommissionStatusResponse.Replica) | repeated | Decommissioning replicas on the given node to be reported. How many replicas are reported is determined by what was specified in the request. | [reserved](#support-status) |
+
+
+
+
+
+#### DecommissionStatusResponse.Replica
+
+
+
+| Field | Type | Label | Description | Support status |
+| ----- | ---- | ----- | ----------- | -------------- |
+| replica_id | [int32](#cockroach.server.serverpb.DecommissionStatusResponse-int32) |  |  | [reserved](#support-status) |
+| range_id | [int32](#cockroach.server.serverpb.DecommissionStatusResponse-int32) |  |  | [reserved](#support-status) |
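For orientation, the generated tables above reflect the proto changes at the bottom of this diff. A minimal sketch of a client exercising the new field, not part of this change; it assumes a dialed `*grpc.ClientConn` with admin credentials and the generated `serverpb` gRPC client:

```go
package main

import (
	"context"
	"fmt"

	"github.com/cockroachdb/cockroach/pkg/server/serverpb"
	"google.golang.org/grpc"
)

// reportStalledReplicas asks the admin server for up to five decommissioning
// replicas per node and prints whatever comes back.
func reportStalledReplicas(ctx context.Context, conn *grpc.ClientConn) error {
	admin := serverpb.NewAdminClient(conn)
	resp, err := admin.DecommissionStatus(ctx, &serverpb.DecommissionStatusRequest{
		// Empty NodeIDs targets all nodes; NumReplicaReport caps the report size.
		NumReplicaReport: 5,
	})
	if err != nil {
		return err
	}
	for _, status := range resp.Status {
		for _, replica := range status.ReportedReplicas {
			fmt.Printf("n%d still has replica %d of range r%d\n",
				status.NodeID, replica.ReplicaID, replica.RangeID)
		}
	}
	return nil
}

func main() {} // placeholder so the sketch compiles standalone
```

Leaving `NumReplicaReport` at zero preserves the old wire behavior, so existing clients are unaffected.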
diff --git a/pkg/cli/node.go b/pkg/cli/node.go
index 421142b22f4f..1ddf911f62ed 100644
--- a/pkg/cli/node.go
+++ b/pkg/cli/node.go
@@ -457,6 +457,17 @@ func runDecommissionNodeImpl(
 		MaxBackoff: 20 * time.Second,
 	}
 
+	// Log verbosity is increased when a decommission stall is suspected. If the
+	// number of decommissioning replicas does not decrease after some time
+	// (i.e. the decommission status has not changed for sameStatusThreshold
+	// iterations), verbosity is automatically raised and some decommissioning
+	// replicas are reported to the operator.
+	const sameStatusThreshold = 15
+	var (
+		numReplicaReport = 0
+		sameStatusCount  = 0
+	)
+
 	// Decommissioning a node is driven by a three-step process.
 	// 1) Mark each node as 'decommissioning'. In doing so, all replicas are
 	// slowly moved off of these nodes.
@@ -472,6 +483,7 @@ func runDecommissionNodeImpl(
 		req := &serverpb.DecommissionRequest{
 			NodeIDs:          nodeIDs,
 			TargetMembership: livenesspb.MembershipStatus_DECOMMISSIONING,
+			NumReplicaReport: int32(numReplicaReport),
 		}
 		resp, err := c.Decommission(ctx, req)
 		if err != nil {
@@ -479,14 +491,33 @@
 			return errors.Wrap(err, "while trying to mark as decommissioning")
 		}
 
+		if numReplicaReport > 0 {
+			printDecommissionReplicas(ctx, *resp)
+		}
+
 		if !reflect.DeepEqual(&prevResponse, resp) {
 			fmt.Fprintln(stderr)
-			if err := printDecommissionStatus(*resp); err != nil {
+			if err = printDecommissionStatus(*resp); err != nil {
 				return err
 			}
 			prevResponse = *resp
+
+			// The decommissioning status changed. Set `sameStatusCount` back to zero.
+			sameStatusCount = 0
+			numReplicaReport = 0
 		} else {
+			// Print a marker indicating that there has been no progress,
+			// instead of printing the same status.
 			fmt.Fprintf(stderr, ".")
+
+			// Report decommissioning replicas if there has been no progress for a
+			// significant amount of time.
+			if sameStatusCount >= sameStatusThreshold && numReplicaReport == 0 {
+				// Configure the number of replicas to report.
+				numReplicaReport = 5
+			} else {
+				sameStatusCount++
+			}
 		}
 
 		anyActive := false
@@ -591,6 +622,21 @@ func printDecommissionStatus(resp serverpb.DecommissionStatusResponse) error {
 		clisqlexec.NewRowSliceIter(decommissionResponseValueToRows(resp.Status), decommissionResponseAlignment()))
 }
 
+func printDecommissionReplicas(ctx context.Context, resp serverpb.DecommissionStatusResponse) {
+	fmt.Fprintln(stderr, "\npossible decommission stall detected; reporting decommissioning replicas")
+
+	for _, nodeStatus := range resp.Status {
+		for _, replica := range nodeStatus.ReportedReplicas {
+			fmt.Fprintf(stderr,
+				"n%d still has replica id %d for range r%d\n",
+				nodeStatus.NodeID,
+				replica.ReplicaID,
+				replica.RangeID,
+			)
+		}
+	}
+}
+
 func runRecommissionNode(cmd *cobra.Command, args []string) error {
 	ctx, cancel := context.WithCancel(context.Background())
 	defer cancel()
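The retry loop above keeps stall detection entirely client-side: the CLI, not the server, decides when to escalate, and the server only ever answers "report up to N replicas" requests. A standalone sketch of that counter pattern, with illustrative names that are not part of the CLI:

```go
package main

import "fmt"

// stallDetector mirrors the CLI logic above: after threshold consecutive
// polls with an unchanged status, it requests a replica report, and it
// resets as soon as progress is observed again.
type stallDetector struct {
	threshold        int
	sameStatusCount  int
	numReplicaReport int
}

// observe takes whether the latest poll showed progress and returns how many
// replicas the next request should ask the server to report (0 = none).
func (s *stallDetector) observe(progressed bool) int {
	if progressed {
		s.sameStatusCount = 0
		s.numReplicaReport = 0
		return 0
	}
	if s.sameStatusCount >= s.threshold && s.numReplicaReport == 0 {
		s.numReplicaReport = 5 // matches the constant used by the CLI
	} else {
		s.sameStatusCount++
	}
	return s.numReplicaReport
}

func main() {
	d := stallDetector{threshold: 3}
	for i := 0; i < 6; i++ {
		fmt.Println(d.observe(false)) // prints 0 0 0 5 5 5
	}
}
```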
diff --git a/pkg/cmd/roachtest/tests/decommission.go b/pkg/cmd/roachtest/tests/decommission.go
index 71bee78138c1..062879e5d3a2 100644
--- a/pkg/cmd/roachtest/tests/decommission.go
+++ b/pkg/cmd/roachtest/tests/decommission.go
@@ -16,6 +16,7 @@ import (
 	"fmt"
 	"math/rand"
 	"reflect"
+	"regexp"
 	"strconv"
 	"strings"
 	"time"
@@ -26,6 +27,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry"
 	"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
 	"github.com/cockroachdb/cockroach/pkg/roachprod/install"
+	"github.com/cockroachdb/cockroach/pkg/testutils"
 	"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
 	"github.com/cockroachdb/cockroach/pkg/util/retry"
 	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
@@ -99,6 +101,17 @@ func registerDecommission(r registry.Registry) {
 			},
 		})
 	}
+	{
+		numNodes := 6
+		r.Add(registry.TestSpec{
+			Name:    "decommission/slow",
+			Owner:   registry.OwnerServer,
+			Cluster: r.MakeClusterSpec(numNodes),
+			Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
+				runDecommissionSlow(ctx, t, c)
+			},
+		})
+	}
 }
 
 // runDrainAndDecommission marks 3 nodes in the test cluster as "draining" and
@@ -1091,6 +1104,84 @@ func runDecommissionDrains(ctx context.Context, t test.Test, c cluster.Cluster)
 	require.NoError(t, cli.MatchCSV(o, expDecommissioned))
 }
 
+// runDecommissionSlow decommissions 5 nodes in a test cluster of 6
+// (with a replication factor of 5), which will guarantee a replica transfer
+// stall. This test is meant to ensure that decommissioning replicas are
+// reported when replica transfers stall.
+func runDecommissionSlow(ctx context.Context, t test.Test, c cluster.Cluster) {
+	const (
+		numNodes          = 6
+		pinnedNodeID      = 1
+		replicationFactor = 5
+	)
+
+	var verboseStoreLogRe = regexp.MustCompile("possible decommission stall detected")
+
+	err := c.PutE(ctx, t.L(), t.Cockroach(), "./cockroach", c.All())
+	require.NoError(t, err)
+
+	c.Start(ctx, t.L(), option.DefaultStartOpts(), install.MakeClusterSettings(), c.All())
+
+	run := func(db *gosql.DB, query string) {
+		_, err = db.ExecContext(ctx, query)
+		require.NoError(t, err)
+		t.L().Printf("run: %s\n", query)
+	}
+
+	{
+		db := c.Conn(ctx, t.L(), pinnedNodeID)
+		defer db.Close()
+
+		// Set the replication factor to 5.
+		run(db, fmt.Sprintf(`ALTER RANGE default CONFIGURE ZONE USING num_replicas=%d`, replicationFactor))
+		run(db, fmt.Sprintf(`ALTER DATABASE system CONFIGURE ZONE USING num_replicas=%d`, replicationFactor))
+
+		// Increase the speed of decommissioning.
+		run(db, `SET CLUSTER SETTING kv.snapshot_rebalance.max_rate='2GiB'`)
+		run(db, `SET CLUSTER SETTING kv.snapshot_recovery.max_rate='2GiB'`)
+
+		// Wait for initial up-replication.
+		err := WaitForReplication(ctx, t, db, replicationFactor)
+		require.NoError(t, err)
+	}
+
+	// Decommission 5 nodes from the cluster, resulting in immovable replicas.
+	// Be prepared to cancel the context for the processes running decommissions
+	// since the decommissions will stall.
+	decomCtx, cancel := context.WithTimeout(ctx, 10*time.Minute)
+	defer cancel()
+	m := c.NewMonitor(decomCtx)
+	for nodeID := 2; nodeID <= numNodes; nodeID++ {
+		id := nodeID
+		m.Go(func(ctx context.Context) error {
+			decom := func(id int) error {
+				t.Status(fmt.Sprintf("decommissioning node %d", id))
+				return c.RunE(ctx,
+					c.Node(id),
+					fmt.Sprintf("./cockroach node decommission %d --insecure", id),
+				)
+			}
+			return decom(id)
+		})
+	}
+
+	// Check for reported decommissioning replicas.
+	t.Status("checking for decommissioning replicas report...")
+	testutils.SucceedsWithin(t, func() error {
+		for nodeID := 1; nodeID <= numNodes; nodeID++ {
+			if err = c.RunE(ctx,
+				c.Node(nodeID),
+				fmt.Sprintf("grep -q '%s' logs/cockroach.log", verboseStoreLogRe),
+			); err == nil {
+				return nil
+			}
+		}
+		return errors.New("still waiting for decommissioning replicas report")
+	},
+		3*time.Minute,
+	)
+}
+
 // Header from the output of `cockroach node decommission`.
 var decommissionHeader = []string{
 	"id", "is_live", "replicas", "is_decommissioning", "membership", "is_draining",
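The test's configuration makes the stall deterministic rather than probabilistic: with a replication factor of 5 and only one non-decommissioning node left, no legal placement can move all replicas off the other five nodes, since at most one replica of a range may live on any node. A toy check of that invariant (not code from the test):

```go
package main

import "fmt"

// canFullyDecommission is a back-of-the-envelope check of the invariant the
// test relies on: every range needs replicationFactor distinct
// non-decommissioning nodes to land on, one replica per node.
func canFullyDecommission(numNodes, numDecommissioning, replicationFactor int) bool {
	return numNodes-numDecommissioning >= replicationFactor
}

func main() {
	// The test's configuration: 6 nodes, 5 decommissioning, RF 5.
	fmt.Println(canFullyDecommission(6, 5, 5)) // false: transfers must stall
}
```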
diff --git a/pkg/server/admin.go b/pkg/server/admin.go
index 05dcfa9f208c..d4d153c7cbd5 100644
--- a/pkg/server/admin.go
+++ b/pkg/server/admin.go
@@ -2302,6 +2302,7 @@ func (s *adminServer) decommissionStatusHelper(
 	// Get the number of replicas on each node. We *may* not need all of them,
 	// but that would be more complicated than seems worth it right now.
 	nodeIDs := req.NodeIDs
+	numReplicaReport := req.NumReplicaReport
 
 	// If no nodeIDs given, use all nodes.
 	if len(nodeIDs) == 0 {
@@ -2314,6 +2315,24 @@
 		}
 	}
 
+	// If the client specified a number of decommissioning replicas to report,
+	// prepare to collect decommissioning replicas to report to the operator.
+	// numReplicaReport is the number of replicas reported for each node.
+	var replicasToReport map[roachpb.NodeID][]*serverpb.DecommissionStatusResponse_Replica
+	if numReplicaReport > 0 {
+		log.Ops.Warning(ctx, "possible decommission stall detected; reporting decommissioning replicas")
+		replicasToReport = make(map[roachpb.NodeID][]*serverpb.DecommissionStatusResponse_Replica)
+	}
+
+	isDecommissioningNode := func(n roachpb.NodeID) bool {
+		for _, nodeID := range nodeIDs {
+			if n == nodeID {
+				return true
+			}
+		}
+		return false
+	}
+
 	// Compute the replica counts for the target nodes only. This map doubles as
 	// a lookup table to check whether we care about a given node.
 	var replicaCounts map[roachpb.NodeID]int64
@@ -2331,6 +2350,24 @@
 			return errors.Wrapf(err, "%s: unable to unmarshal range descriptor", row.Key)
 		}
 		for _, r := range rangeDesc.Replicas().Descriptors() {
+			if numReplicaReport > 0 {
+				if len(replicasToReport[r.NodeID]) < int(numReplicaReport) {
+					if isDecommissioningNode(r.NodeID) {
+						replicasToReport[r.NodeID] = append(replicasToReport[r.NodeID],
+							&serverpb.DecommissionStatusResponse_Replica{
+								ReplicaID: r.ReplicaID,
+								RangeID:   rangeDesc.RangeID,
+							},
+						)
+						log.Ops.Warningf(ctx,
+							"n%d still has replica id %d for range r%d",
+							r.NodeID,
+							r.ReplicaID,
+							rangeDesc.RangeID,
+						)
+					}
+				}
+			}
 			if _, ok := replicaCounts[r.NodeID]; ok {
 				replicaCounts[r.NodeID]++
 			}
@@ -2368,15 +2405,15 @@ func (s *adminServer) decommissionStatusHelper(
 			return nil, errors.Newf("unable to get liveness for %d", nodeID)
 		}
 		nodeResp := serverpb.DecommissionStatusResponse_Status{
-			NodeID:       l.NodeID,
-			ReplicaCount: replicaCounts[l.NodeID],
-			Membership:   l.Membership,
-			Draining:     l.Draining,
+			NodeID:           l.NodeID,
+			ReplicaCount:     replicaCounts[l.NodeID],
+			Membership:       l.Membership,
+			Draining:         l.Draining,
+			ReportedReplicas: replicasToReport[l.NodeID],
 		}
 		if l.IsLive(s.server.clock.Now().GoTime()) {
 			nodeResp.IsLive = true
 		}
-
 		res.Status = append(res.Status, nodeResp)
 	}
 
@@ -2415,7 +2452,7 @@ func (s *adminServer) Decommission(
 		return &serverpb.DecommissionStatusResponse{}, nil
 	}
 
-	return s.DecommissionStatus(ctx, &serverpb.DecommissionStatusRequest{NodeIDs: nodeIDs})
+	return s.DecommissionStatus(ctx, &serverpb.DecommissionStatusRequest{NodeIDs: nodeIDs, NumReplicaReport: req.NumReplicaReport})
 }
 
 // DataDistribution returns a count of replicas on each node for each table.
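The helper above caps the report at `numReplicaReport` entries per node while it is already walking meta2 for replica counts, so reporting adds no extra scan. A condensed sketch of that collect-and-cap pattern, with simplified stand-in types rather than the real `roachpb`/`serverpb` ones:

```go
package main

import "fmt"

// replicaInfo is a simplified stand-in for the fields pulled out of a
// range descriptor during the meta2 scan.
type replicaInfo struct {
	nodeID, replicaID, rangeID int
}

// collectReports records at most limit replicas per decommissioning node,
// mirroring the capping done in decommissionStatusHelper.
func collectReports(replicas []replicaInfo, decommissioning map[int]bool, limit int) map[int][]replicaInfo {
	reports := make(map[int][]replicaInfo)
	for _, r := range replicas {
		if decommissioning[r.nodeID] && len(reports[r.nodeID]) < limit {
			reports[r.nodeID] = append(reports[r.nodeID], r)
		}
	}
	return reports
}

func main() {
	replicas := []replicaInfo{
		{nodeID: 4, replicaID: 1, rangeID: 10},
		{nodeID: 4, replicaID: 2, rangeID: 11},
		{nodeID: 5, replicaID: 3, rangeID: 10}, // n5 is not decommissioning
		{nodeID: 4, replicaID: 4, rangeID: 12}, // dropped: n4 already at the cap
	}
	fmt.Println(collectReports(replicas, map[int]bool{4: true}, 2))
}
```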
diff --git a/pkg/server/serverpb/admin.proto b/pkg/server/serverpb/admin.proto
index 42e65aea9c98..197c942b58bc 100644
--- a/pkg/server/serverpb/admin.proto
+++ b/pkg/server/serverpb/admin.proto
@@ -476,6 +476,8 @@ message DrainResponse {
 message DecommissionStatusRequest {
   repeated int32 node_ids = 1 [(gogoproto.customname) = "NodeIDs",
     (gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/roachpb.NodeID"];
+  // The number of decommissioning replicas to be reported.
+  int32 num_replica_report = 2;
 }
 
 // DecommissionRequest requests the server to set the membership status on
@@ -486,10 +488,18 @@ message DecommissionRequest {
   repeated int32 node_ids = 1 [(gogoproto.customname) = "NodeIDs",
     (gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/roachpb.NodeID"];
   kv.kvserver.liveness.livenesspb.MembershipStatus target_membership = 2;
+  // The number of decommissioning replicas to be reported.
+  int32 num_replica_report = 3;
 }
 
 // DecommissionStatusResponse lists decommissioning statuses for a number of NodeIDs.
 message DecommissionStatusResponse {
+  message Replica {
+    int32 replica_id = 1 [ (gogoproto.customname) = "ReplicaID",
+      (gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/roachpb.ReplicaID"];
+    int32 range_id = 2 [ (gogoproto.customname) = "RangeID",
+      (gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/roachpb.RangeID"];
+  }
   message Status {
     int32 node_id = 1 [ (gogoproto.customname) = "NodeID",
      (gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/roachpb.NodeID"];
@@ -499,6 +509,10 @@ message DecommissionStatusResponse {
     // The membership status of the given node.
     kv.kvserver.liveness.livenesspb.MembershipStatus membership = 4;
     bool draining = 5;
+    // Decommissioning replicas on the given node to be reported.
+    // How many replicas are reported is determined by what was specified in the
+    // request.
+    repeated Replica reported_replicas = 6;
   }
   // Status of all affected nodes.
   repeated Status status = 2 [(gogoproto.nullable) = false];
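Finally, a small sketch of the resulting Go message shapes, assuming the `gogoproto` options above take effect (so the generated fields carry `roachpb` types rather than bare int32s):

```go
package main

import (
	"fmt"

	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/server/serverpb"
)

func main() {
	// The customname/casttype options map replica_id and range_id onto
	// roachpb.ReplicaID and roachpb.RangeID in the generated struct.
	status := serverpb.DecommissionStatusResponse_Status{
		NodeID: roachpb.NodeID(4),
		ReportedReplicas: []*serverpb.DecommissionStatusResponse_Replica{
			{ReplicaID: roachpb.ReplicaID(7), RangeID: roachpb.RangeID(42)},
		},
	}
	fmt.Printf("n%d reports %d stuck replica(s)\n",
		status.NodeID, len(status.ReportedReplicas))
}
```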