diff --git a/docs/generated/http/full.md b/docs/generated/http/full.md
index 88c4ba18c44d..c4af373f1845 100644
--- a/docs/generated/http/full.md
+++ b/docs/generated/http/full.md
@@ -6202,6 +6202,7 @@ If no NodeIDs are given, it targets the recipient node.
| ----- | ---- | ----- | ----------- | -------------- |
| node_ids | [int32](#cockroach.server.serverpb.DecommissionRequest-int32) | repeated | | [reserved](#support-status) |
| target_membership | [cockroach.kv.kvserver.liveness.livenesspb.MembershipStatus](#cockroach.server.serverpb.DecommissionRequest-cockroach.kv.kvserver.liveness.livenesspb.MembershipStatus) | | | [reserved](#support-status) |
+| num_replica_report | [int32](#cockroach.server.serverpb.DecommissionRequest-int32) | | The maximum number of decommissioning replicas to report per node. | [reserved](#support-status) |
@@ -6238,6 +6239,21 @@ DecommissionStatusResponse lists decommissioning statuses for a number of NodeID
| replica_count | [int64](#cockroach.server.serverpb.DecommissionStatusResponse-int64) | | The number of replicas on the node, computed by scanning meta2 ranges. | [reserved](#support-status) |
| membership | [cockroach.kv.kvserver.liveness.livenesspb.MembershipStatus](#cockroach.server.serverpb.DecommissionStatusResponse-cockroach.kv.kvserver.liveness.livenesspb.MembershipStatus) | | The membership status of the given node. | [reserved](#support-status) |
| draining | [bool](#cockroach.server.serverpb.DecommissionStatusResponse-bool) | | | [reserved](#support-status) |
+| reported_replicas | [DecommissionStatusResponse.Replica](#cockroach.server.serverpb.DecommissionStatusResponse-cockroach.server.serverpb.DecommissionStatusResponse.Replica) | repeated | Decommissioning replicas on the given node to be reported. At most num_replica_report replicas are reported per node, as specified in the request. | [reserved](#support-status) |
+
+
+
+
+
+
+#### DecommissionStatusResponse.Replica
+
+
+
+| Field | Type | Label | Description | Support status |
+| ----- | ---- | ----- | ----------- | -------------- |
+| replica_id | [int32](#cockroach.server.serverpb.DecommissionStatusResponse-int32) | | | [reserved](#support-status) |
+| range_id | [int32](#cockroach.server.serverpb.DecommissionStatusResponse-int32) | | | [reserved](#support-status) |
@@ -6266,6 +6282,7 @@ specified or, if none are specified, all nodes.
| Field | Type | Label | Description | Support status |
| ----- | ---- | ----- | ----------- | -------------- |
| node_ids | [int32](#cockroach.server.serverpb.DecommissionStatusRequest-int32) | repeated | | [reserved](#support-status) |
+| num_replica_report | [int32](#cockroach.server.serverpb.DecommissionStatusRequest-int32) | | The maximum number of decommissioning replicas to report per node. | [reserved](#support-status) |
@@ -6302,6 +6319,21 @@ DecommissionStatusResponse lists decommissioning statuses for a number of NodeID
| replica_count | [int64](#cockroach.server.serverpb.DecommissionStatusResponse-int64) | | The number of replicas on the node, computed by scanning meta2 ranges. | [reserved](#support-status) |
| membership | [cockroach.kv.kvserver.liveness.livenesspb.MembershipStatus](#cockroach.server.serverpb.DecommissionStatusResponse-cockroach.kv.kvserver.liveness.livenesspb.MembershipStatus) | | The membership status of the given node. | [reserved](#support-status) |
| draining | [bool](#cockroach.server.serverpb.DecommissionStatusResponse-bool) | | | [reserved](#support-status) |
+| reported_replicas | [DecommissionStatusResponse.Replica](#cockroach.server.serverpb.DecommissionStatusResponse-cockroach.server.serverpb.DecommissionStatusResponse.Replica) | repeated | Decommissioning replicas on the given node to be reported. At most num_replica_report replicas are reported per node, as specified in the request. | [reserved](#support-status) |
+
+
+
+
+
+
+#### DecommissionStatusResponse.Replica
+
+
+
+| Field | Type | Label | Description | Support status |
+| ----- | ---- | ----- | ----------- | -------------- |
+| replica_id | [int32](#cockroach.server.serverpb.DecommissionStatusResponse-int32) | | | [reserved](#support-status) |
+| range_id | [int32](#cockroach.server.serverpb.DecommissionStatusResponse-int32) | | | [reserved](#support-status) |
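
The new fields thread a per-node cap through the status API end to end: the request carries `num_replica_report`, and the response returns up to that many `reported_replicas` per decommissioning node. For context, a minimal sketch of a client consuming these fields (connection setup elided; `reportStalledReplicas` is a hypothetical helper, not part of this change):

```go
package example

import (
	"context"
	"fmt"

	"github.com/cockroachdb/cockroach/pkg/server/serverpb"
)

// reportStalledReplicas asks the server to include up to five decommissioning
// replicas per node in the status response and prints whatever comes back.
func reportStalledReplicas(ctx context.Context, admin serverpb.AdminClient) error {
	resp, err := admin.DecommissionStatus(ctx, &serverpb.DecommissionStatusRequest{
		NumReplicaReport: 5, // per-node cap on reported replicas
	})
	if err != nil {
		return err
	}
	for _, status := range resp.Status {
		for _, replica := range status.ReportedReplicas {
			fmt.Printf("n%d still has replica id %d for range r%d\n",
				status.NodeID, replica.ReplicaID, replica.RangeID)
		}
	}
	return nil
}
```
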
diff --git a/pkg/cli/node.go b/pkg/cli/node.go
index 421142b22f4f..1ddf911f62ed 100644
--- a/pkg/cli/node.go
+++ b/pkg/cli/node.go
@@ -457,6 +457,17 @@ func runDecommissionNodeImpl(
MaxBackoff: 20 * time.Second,
}
+ // Log verbosity is increased when a decommission stall is suspected.
+ // If the decommission status has not changed for sameStatusThreshold
+ // consecutive iterations (i.e. the number of decommissioning replicas is
+ // not decreasing), the next request asks the server to report some
+ // decommissioning replicas, which are then printed to the operator.
+ const sameStatusThreshold = 15
+ var (
+ numReplicaReport = 0
+ sameStatusCount = 0
+ )
+
// Decommissioning a node is driven by a three-step process.
// 1) Mark each node as 'decommissioning'. In doing so, all replicas are
// slowly moved off of these nodes.
@@ -472,6 +483,7 @@ func runDecommissionNodeImpl(
req := &serverpb.DecommissionRequest{
NodeIDs: nodeIDs,
TargetMembership: livenesspb.MembershipStatus_DECOMMISSIONING,
+ NumReplicaReport: int32(numReplicaReport),
}
resp, err := c.Decommission(ctx, req)
if err != nil {
@@ -479,14 +491,33 @@ func runDecommissionNodeImpl(
return errors.Wrap(err, "while trying to mark as decommissioning")
}
+ if numReplicaReport > 0 {
+ printDecommissionReplicas(ctx, *resp)
+ }
+
if !reflect.DeepEqual(&prevResponse, resp) {
fmt.Fprintln(stderr)
- if err := printDecommissionStatus(*resp); err != nil {
+ if err = printDecommissionStatus(*resp); err != nil {
return err
}
prevResponse = *resp
+
+ // The decommission status changed: reset the stall detector and stop
+ // reporting replicas.
+ sameStatusCount = 0
+ numReplicaReport = 0
} else {
+ // Print a marker indicating that there has been no progress,
+ // instead of printing the same status.
fmt.Fprintf(stderr, ".")
+
+ // Report decommissioning replicas if there has been no progress for a
+ // significant amount of time.
+ if sameStatusCount >= sameStatusThreshold && numReplicaReport == 0 {
+ // Configure a number of replicas to report.
+ numReplicaReport = 5
+ } else {
+ sameStatusCount++
+ }
}
anyActive := false
@@ -591,6 +622,21 @@ func printDecommissionStatus(resp serverpb.DecommissionStatusResponse) error {
clisqlexec.NewRowSliceIter(decommissionResponseValueToRows(resp.Status), decommissionResponseAlignment()))
}
+func printDecommissionReplicas(ctx context.Context, resp serverpb.DecommissionStatusResponse) {
+ fmt.Fprintln(stderr, "\npossible decommission stall detected; reporting decommissioning replicas")
+
+ for _, nodeStatus := range resp.Status {
+ for _, replica := range nodeStatus.ReportedReplicas {
+ fmt.Fprintf(stderr,
+ "n%d still has replica id %d for range r%d",
+ nodeStatus.NodeID,
+ replica.ReplicaID,
+ replica.RangeID,
+ )
+ }
+ }
+}
+
func runRecommissionNode(cmd *cobra.Command, args []string) error {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
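
The stall-detection loop above reduces to a small state machine. Here is a self-contained sketch of just that logic, with the RPC stubbed out (`pollOnce` is a hypothetical stand-in for the Decommission round trip):

```go
package main

import "fmt"

// pollOnce stands in for one Decommission round trip; it returns the current
// status so the caller can detect progress. Hypothetical stub for this sketch.
func pollOnce(numReplicaReport int) string { return "5 replicas remaining" }

func main() {
	const sameStatusThreshold = 15
	sameStatusCount, numReplicaReport := 0, 0
	prevStatus := ""

	for i := 0; i < 20; i++ {
		status := pollOnce(numReplicaReport)
		if status != prevStatus {
			// Progress was made: reset the stall detector and stop reporting.
			prevStatus, sameStatusCount, numReplicaReport = status, 0, 0
			continue
		}
		fmt.Print(".") // no-progress marker, as in the CLI
		if sameStatusCount >= sameStatusThreshold && numReplicaReport == 0 {
			numReplicaReport = 5 // the next poll asks the server to report replicas
		} else {
			sameStatusCount++
		}
	}
	fmt.Println()
}
```
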
diff --git a/pkg/cmd/roachtest/tests/decommission.go b/pkg/cmd/roachtest/tests/decommission.go
index 71bee78138c1..062879e5d3a2 100644
--- a/pkg/cmd/roachtest/tests/decommission.go
+++ b/pkg/cmd/roachtest/tests/decommission.go
@@ -16,6 +16,7 @@ import (
"fmt"
"math/rand"
"reflect"
+ "regexp"
"strconv"
"strings"
"time"
@@ -26,6 +27,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
"github.com/cockroachdb/cockroach/pkg/roachprod/install"
+ "github.com/cockroachdb/cockroach/pkg/testutils"
"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
"github.com/cockroachdb/cockroach/pkg/util/retry"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
@@ -99,6 +101,17 @@ func registerDecommission(r registry.Registry) {
},
})
}
+ {
+ numNodes := 6
+ r.Add(registry.TestSpec{
+ Name: "decommission/slow",
+ Owner: registry.OwnerServer,
+ Cluster: r.MakeClusterSpec(numNodes),
+ Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
+ runDecommissionSlow(ctx, t, c)
+ },
+ })
+ }
}
// runDrainAndDecommission marks 3 nodes in the test cluster as "draining" and
@@ -1091,6 +1104,84 @@ func runDecommissionDrains(ctx context.Context, t test.Test, c cluster.Cluster)
require.NoError(t, cli.MatchCSV(o, expDecommissioned))
}
+// runDecommissionSlow decommissions 5 nodes in a test cluster of 6
+// (with a replication factor of 5), which will guarantee a replica transfer
+// stall. This test is meant to ensure that decommissioning replicas are
+// reported when replica transfers stall.
+func runDecommissionSlow(ctx context.Context, t test.Test, c cluster.Cluster) {
+ const (
+ numNodes = 6
+ pinnedNodeID = 1
+ replicationFactor = 5
+ )
+
+ var verboseStoreLogRe = regexp.MustCompile("possible decommission stall detected")
+
+ err := c.PutE(ctx, t.L(), t.Cockroach(), "./cockroach", c.All())
+ require.NoError(t, err)
+
+ c.Start(ctx, t.L(), option.DefaultStartOpts(), install.MakeClusterSettings(), c.All())
+
+ run := func(db *gosql.DB, query string) {
+ _, err = db.ExecContext(ctx, query)
+ require.NoError(t, err)
+ t.L().Printf("run: %s\n", query)
+ }
+
+ {
+ db := c.Conn(ctx, t.L(), pinnedNodeID)
+ defer db.Close()
+
+ // Set the replication factor to 5.
+ run(db, fmt.Sprintf(`ALTER RANGE default CONFIGURE ZONE USING num_replicas=%d`, replicationFactor))
+ run(db, fmt.Sprintf(`ALTER DATABASE system CONFIGURE ZONE USING num_replicas=%d`, replicationFactor))
+
+ // Increase the speed of decommissioning.
+ run(db, `SET CLUSTER SETTING kv.snapshot_rebalance.max_rate='2GiB'`)
+ run(db, `SET CLUSTER SETTING kv.snapshot_recovery.max_rate='2GiB'`)
+
+ // Wait for initial up-replication.
+ err := WaitForReplication(ctx, t, db, replicationFactor)
+ require.NoError(t, err)
+ }
+
+ // Decommission 5 nodes from the cluster, resulting in immovable replicas.
+ // Be prepared to cancel the context for the processes running decommissions
+ // since the decommissions will stall.
+ decomCtx, cancel := context.WithTimeout(ctx, 10*time.Minute)
+ defer cancel()
+ m := c.NewMonitor(decomCtx)
+ for nodeID := 2; nodeID <= numNodes; nodeID++ {
+ id := nodeID
+ m.Go(func(ctx context.Context) error {
+ decom := func(id int) error {
+ t.Status(fmt.Sprintf("decommissioning node %d", id))
+ return c.RunE(ctx,
+ c.Node(id),
+ fmt.Sprintf("./cockroach node decommission %d --insecure", id),
+ )
+ }
+ return decom(id)
+ })
+ }
+
+ // Check for reported decommissioning replicas.
+ t.Status("checking for decommissioning replicas report...")
+ testutils.SucceedsWithin(t, func() error {
+ for nodeID := 1; nodeID <= numNodes; nodeID++ {
+ if err = c.RunE(ctx,
+ c.Node(nodeID),
+ fmt.Sprintf("grep -q '%s' logs/cockroach.log", verboseStoreLogRe),
+ ); err == nil {
+ return nil
+ }
+ }
+ return errors.New("still waiting for decommissioning replicas report")
+ },
+ 3*time.Minute,
+ )
+}
+
// Header from the output of `cockroach node decommission`.
var decommissionHeader = []string{
"id", "is_live", "replicas", "is_decommissioning", "membership", "is_draining",
diff --git a/pkg/server/admin.go b/pkg/server/admin.go
index 05dcfa9f208c..d4d153c7cbd5 100644
--- a/pkg/server/admin.go
+++ b/pkg/server/admin.go
@@ -2302,6 +2302,7 @@ func (s *adminServer) decommissionStatusHelper(
// Get the number of replicas on each node. We *may* not need all of them,
// but that would be more complicated than seems worth it right now.
nodeIDs := req.NodeIDs
+ numReplicaReport := req.NumReplicaReport
// If no nodeIDs given, use all nodes.
if len(nodeIDs) == 0 {
@@ -2314,6 +2315,24 @@ func (s *adminServer) decommissionStatusHelper(
}
}
+ // If the client specified a number of decommissioning replicas to report,
+ // prepare to collect them for the operator.
+ // numReplicaReport is the maximum number of replicas reported per node.
+ var replicasToReport map[roachpb.NodeID][]*serverpb.DecommissionStatusResponse_Replica
+ if numReplicaReport > 0 {
+ log.Ops.Warning(ctx, "possible decommission stall detected; reporting decommissioning replicas")
+ replicasToReport = make(map[roachpb.NodeID][]*serverpb.DecommissionStatusResponse_Replica)
+ }
+
+ isDecommissioningNode := func(n roachpb.NodeID) bool {
+ for _, nodeID := range nodeIDs {
+ if n == nodeID {
+ return true
+ }
+ }
+ return false
+ }
+
// Compute the replica counts for the target nodes only. This map doubles as
// a lookup table to check whether we care about a given node.
var replicaCounts map[roachpb.NodeID]int64
@@ -2331,6 +2350,24 @@ func (s *adminServer) decommissionStatusHelper(
return errors.Wrapf(err, "%s: unable to unmarshal range descriptor", row.Key)
}
for _, r := range rangeDesc.Replicas().Descriptors() {
+ if numReplicaReport > 0 {
+ if len(replicasToReport[r.NodeID]) < int(numReplicaReport) {
+ if isDecommissioningNode(r.NodeID) {
+ replicasToReport[r.NodeID] = append(replicasToReport[r.NodeID],
+ &serverpb.DecommissionStatusResponse_Replica{
+ ReplicaID: r.ReplicaID,
+ RangeID: rangeDesc.RangeID,
+ },
+ )
+ log.Ops.Warningf(ctx,
+ "n%d still has replica id %d for range r%d",
+ r.NodeID,
+ r.ReplicaID,
+ rangeDesc.RangeID,
+ )
+ }
+ }
+ }
if _, ok := replicaCounts[r.NodeID]; ok {
replicaCounts[r.NodeID]++
}
@@ -2368,15 +2405,15 @@ func (s *adminServer) decommissionStatusHelper(
return nil, errors.Newf("unable to get liveness for %d", nodeID)
}
nodeResp := serverpb.DecommissionStatusResponse_Status{
- NodeID: l.NodeID,
- ReplicaCount: replicaCounts[l.NodeID],
- Membership: l.Membership,
- Draining: l.Draining,
+ NodeID: l.NodeID,
+ ReplicaCount: replicaCounts[l.NodeID],
+ Membership: l.Membership,
+ Draining: l.Draining,
+ ReportedReplicas: replicasToReport[l.NodeID],
}
if l.IsLive(s.server.clock.Now().GoTime()) {
nodeResp.IsLive = true
}
-
res.Status = append(res.Status, nodeResp)
}
@@ -2415,7 +2452,7 @@ func (s *adminServer) Decommission(
return &serverpb.DecommissionStatusResponse{}, nil
}
- return s.DecommissionStatus(ctx, &serverpb.DecommissionStatusRequest{NodeIDs: nodeIDs})
+ return s.DecommissionStatus(ctx, &serverpb.DecommissionStatusRequest{NodeIDs: nodeIDs, NumReplicaReport: req.NumReplicaReport})
}
// DataDistribution returns a count of replicas on each node for each table.
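
The collection added to decommissionStatusHelper is, in essence, a capped per-node bucket fill over a single pass of the range descriptors. A stripped-down sketch with stand-in types (plain ints instead of the roachpb ID types, and the meta2 scan replaced by a slice):

```go
package main

import "fmt"

// Stand-ins for the roachpb types; plain ints keep the sketch self-contained.
type replicaInfo struct{ replicaID, rangeID int }

type rangeDesc struct {
	rangeID  int
	replicas []struct{ nodeID, replicaID int }
}

// collectReport gathers up to max replicas per decommissioning node in a
// single pass, mirroring the meta2 scan in decommissionStatusHelper.
func collectReport(ranges []rangeDesc, decommissioning map[int]bool, max int) map[int][]replicaInfo {
	report := make(map[int][]replicaInfo)
	for _, rd := range ranges {
		for _, r := range rd.replicas {
			if decommissioning[r.nodeID] && len(report[r.nodeID]) < max {
				report[r.nodeID] = append(report[r.nodeID], replicaInfo{r.replicaID, rd.rangeID})
			}
		}
	}
	return report
}

func main() {
	ranges := []rangeDesc{
		{rangeID: 42, replicas: []struct{ nodeID, replicaID int }{{4, 3}, {5, 1}}},
		{rangeID: 55, replicas: []struct{ nodeID, replicaID int }{{4, 7}}},
	}
	fmt.Println(collectReport(ranges, map[int]bool{4: true}, 5))
	// Output: map[4:[{3 42} {7 55}]]
}
```
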
diff --git a/pkg/server/serverpb/admin.proto b/pkg/server/serverpb/admin.proto
index 42e65aea9c98..197c942b58bc 100644
--- a/pkg/server/serverpb/admin.proto
+++ b/pkg/server/serverpb/admin.proto
@@ -476,6 +476,8 @@ message DrainResponse {
message DecommissionStatusRequest {
repeated int32 node_ids = 1 [(gogoproto.customname) = "NodeIDs",
(gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/roachpb.NodeID"];
+ // The maximum number of decommissioning replicas to report per node.
+ int32 num_replica_report = 2;
}
// DecommissionRequest requests the server to set the membership status on
@@ -486,10 +488,18 @@ message DecommissionRequest {
repeated int32 node_ids = 1 [(gogoproto.customname) = "NodeIDs",
(gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/roachpb.NodeID"];
kv.kvserver.liveness.livenesspb.MembershipStatus target_membership = 2;
+ // The maximum number of decommissioning replicas to report per node.
+ int32 num_replica_report = 3;
}
// DecommissionStatusResponse lists decommissioning statuses for a number of NodeIDs.
message DecommissionStatusResponse {
+ message Replica {
+ int32 replica_id = 1 [ (gogoproto.customname) = "ReplicaID",
+ (gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/roachpb.ReplicaID"];
+ int32 range_id = 2 [ (gogoproto.customname) = "RangeID",
+ (gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/roachpb.RangeID"];
+ }
message Status {
int32 node_id = 1 [ (gogoproto.customname) = "NodeID",
(gogoproto.casttype) = "github.com/cockroachdb/cockroach/pkg/roachpb.NodeID"];
@@ -499,6 +509,10 @@ message DecommissionStatusResponse {
// The membership status of the given node.
kv.kvserver.liveness.livenesspb.MembershipStatus membership = 4;
bool draining = 5;
+ // Decommissioning replicas on the given node to be reported.
+ // At most num_replica_report replicas are reported per node, as specified
+ // in the request.
+ repeated Replica reported_replicas = 6;
}
// Status of all affected nodes.
repeated Status status = 2 [(gogoproto.nullable) = false];
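
Tying the proto change back to the CLI: once a stall is suspected, runDecommissionNodeImpl sends a request of this shape on its next iteration (node IDs here are illustrative):

```go
package example

import (
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/liveness/livenesspb"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/server/serverpb"
)

// Illustrative request: the CLI sets NumReplicaReport only after
// sameStatusThreshold polls without progress; otherwise it stays 0.
var req = &serverpb.DecommissionRequest{
	NodeIDs:          []roachpb.NodeID{2, 3, 4, 5, 6},
	TargetMembership: livenesspb.MembershipStatus_DECOMMISSIONING,
	NumReplicaReport: 5, // upper bound on reported replicas per node
}
```
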