storage: use raft learners in replica addition, defaulted off
Release note: None
danhhz committed Jun 13, 2019
1 parent b5b57e1 commit 51925ce
Showing 11 changed files with 330 additions and 79 deletions.
3 changes: 2 additions & 1 deletion docs/generated/settings/settings.html
@@ -39,6 +39,7 @@
 <tr><td><code>kv.closed_timestamp.target_duration</code></td><td>duration</td><td><code>30s</code></td><td>if nonzero, attempt to provide closed timestamp notifications for timestamps trailing cluster time by approximately this duration</td></tr>
 <tr><td><code>kv.follower_read.target_multiple</code></td><td>float</td><td><code>3</code></td><td>if above 1, encourages the distsender to perform a read against the closest replica if a request is older than kv.closed_timestamp.target_duration * (1 + kv.closed_timestamp.close_fraction * this) less a clock uncertainty interval. This value also is used to create follower_timestamp(). (WARNING: may compromise cluster stability or correctness; do not edit without supervision)</td></tr>
 <tr><td><code>kv.import.batch_size</code></td><td>byte size</td><td><code>32 MiB</code></td><td>the maximum size of the payload in an AddSSTable request (WARNING: may compromise cluster stability or correctness; do not edit without supervision)</td></tr>
+<tr><td><code>kv.learner_replicas.enabled</code></td><td>boolean</td><td><code>false</code></td><td>use learner replicas for replica addition</td></tr>
 <tr><td><code>kv.raft.command.max_size</code></td><td>byte size</td><td><code>64 MiB</code></td><td>maximum size of a raft command</td></tr>
 <tr><td><code>kv.raft_log.disable_synchronization_unsafe</code></td><td>boolean</td><td><code>false</code></td><td>set to true to disable synchronization on Raft log writes to persistent storage. Setting to true risks data loss or data corruption on server crashes. The setting is meant for internal testing only and SHOULD NOT be used in production.</td></tr>
 <tr><td><code>kv.range.backpressure_range_size_multiplier</code></td><td>float</td><td><code>2</code></td><td>multiple of range_max_bytes that a range is allowed to grow to without splitting before writes to that range are blocked, or 0 to disable</td></tr>
@@ -120,6 +121,6 @@
 <tr><td><code>trace.debug.enable</code></td><td>boolean</td><td><code>false</code></td><td>if set, traces for recent requests can be seen in the /debug page</td></tr>
 <tr><td><code>trace.lightstep.token</code></td><td>string</td><td><code></code></td><td>if set, traces go to Lightstep using this token</td></tr>
 <tr><td><code>trace.zipkin.collector</code></td><td>string</td><td><code></code></td><td>if set, traces go to the given Zipkin instance (example: '127.0.0.1:9411'); ignored if trace.lightstep.token is set</td></tr>
-<tr><td><code>version</code></td><td>custom validation</td><td><code>19.1-4</code></td><td>set the active cluster version in the format '<major>.<minor>'</td></tr>
+<tr><td><code>version</code></td><td>custom validation</td><td><code>19.1-5</code></td><td>set the active cluster version in the format '<major>.<minor>'</td></tr>
 </tbody>
 </table>
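
For reference, boolean cluster settings such as kv.learner_replicas.enabled are registered in Go with settings.RegisterBoolSetting; a minimal sketch of what the registration for this setting presumably looks like (the variable name is hypothetical, not taken from this diff):

// Hypothetical registration of the setting documented above; the actual
// variable name and location in pkg/storage may differ.
var useLearnerReplicas = settings.RegisterBoolSetting(
	"kv.learner_replicas.enabled",
	"use learner replicas for replica addition",
	false, // defaulted off, per the commit title
)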
10 changes: 5 additions & 5 deletions pkg/roachpb/metadata.go
@@ -143,15 +143,15 @@ func (r *RangeDescriptor) AddReplica(toAdd ReplicaDescriptor) {
 	r.SetReplicas(rs)
 }
 
-// RemoveReplica removes the given replica from this range's set. If it wasn't
-// found to remove, false is returned.
-func (r *RangeDescriptor) RemoveReplica(toRemove ReplicaDescriptor) bool {
+// RemoveReplica removes the matching replica from this range's set and returns
+// it. If no matching replica is found, false is returned.
+func (r *RangeDescriptor) RemoveReplica(nodeID NodeID, storeID StoreID) (ReplicaDescriptor, bool) {
 	rs := r.Replicas()
-	found := rs.RemoveReplica(toRemove)
+	removed, found := rs.RemoveReplica(nodeID, storeID)
 	if found {
 		r.SetReplicas(rs)
 	}
-	return found
+	return removed, found
 }
 
 // GetReplicaDescriptor returns the replica which matches the specified store
13 changes: 7 additions & 6 deletions pkg/roachpb/metadata_replicas.go
@@ -89,23 +89,24 @@ func (d *ReplicaDescriptors) AddReplica(r ReplicaDescriptor) {
 	d.wrapped = append(d.wrapped, r)
 }
 
-// RemoveReplica removes the given replica from this set. If it wasn't found to
-// remove, false is returned.
-func (d *ReplicaDescriptors) RemoveReplica(r ReplicaDescriptor) bool {
+// RemoveReplica removes the matching replica from this set and returns it. If
+// no matching replica is found, false is returned.
+func (d *ReplicaDescriptors) RemoveReplica(nodeID NodeID, storeID StoreID) (ReplicaDescriptor, bool) {
 	idx := -1
 	for i := range d.wrapped {
-		if d.wrapped[i].Equal(r) {
+		if d.wrapped[i].NodeID == nodeID && d.wrapped[i].StoreID == storeID {
 			idx = i
 			break
 		}
 	}
 	if idx == -1 {
-		return false
+		return ReplicaDescriptor{}, false
 	}
 	// Swap with the last element so we can simply truncate the slice.
 	d.wrapped[idx], d.wrapped[len(d.wrapped)-1] = d.wrapped[len(d.wrapped)-1], d.wrapped[idx]
+	removed := d.wrapped[len(d.wrapped)-1]
 	d.wrapped = d.wrapped[:len(d.wrapped)-1]
-	return true
+	return removed, true
 }
 
 // QuorumSize returns the number of voter replicas required for quorum in a raft
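
The swap-with-last removal above deletes a slice element in constant time at the cost of element order. A minimal, runnable sketch of the same idiom with simplified stand-in types (not the actual roachpb definitions):

package main

import "fmt"

type replica struct{ nodeID, storeID int }

// removeReplica mirrors ReplicaDescriptors.RemoveReplica: find the matching
// element, swap it to the end, truncate the slice, and return what was removed.
func removeReplica(rs []replica, nodeID, storeID int) ([]replica, replica, bool) {
	for i := range rs {
		if rs[i].nodeID == nodeID && rs[i].storeID == storeID {
			rs[i], rs[len(rs)-1] = rs[len(rs)-1], rs[i]
			removed := rs[len(rs)-1]
			return rs[:len(rs)-1], removed, true
		}
	}
	return rs, replica{}, false
}

func main() {
	rs := []replica{{1, 1}, {2, 2}, {3, 3}}
	rs, removed, found := removeReplica(rs, 2, 2)
	fmt.Println(rs, removed, found) // [{1 1} {3 3}] {2 2} true
}

Note that element order is not preserved after removal; the surrounding code evidently tolerates this, which is what makes the O(1) variant attractive here.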
6 changes: 6 additions & 0 deletions pkg/settings/cluster/cockroach_versions.go
@@ -50,6 +50,7 @@ const (
 	VersionQueryTxnTimestamp
 	VersionStickyBit
 	VersionParallelCommits
+	VersionLearnerReplicas
 
 	// Add new versions here (step one of two).
 
@@ -479,6 +480,11 @@ var versionsSingleton = keyedVersions([]keyedVersion{
 		Key:     VersionParallelCommits,
 		Version: roachpb.Version{Major: 19, Minor: 1, Unstable: 4},
 	},
+	{
+		// VersionLearnerReplicas gates using raft learners during replica addition (WIP).
+		Key:     VersionLearnerReplicas,
+		Version: roachpb.Version{Major: 19, Minor: 1, Unstable: 5},
+	},
 
 	// Add new versions here (step two of two).
 
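
A new cluster version only takes effect once every node has upgraded, so code typically guards the new behavior behind an IsActive check. A hedged sketch of the gating pattern (useLearners is a hypothetical helper, not code from this commit; it combines the version gate with the useLearnerReplicas setting sketched earlier):

// (Assumes: import "github.com/cockroachdb/cockroach/pkg/settings/cluster")
//
// useLearners reports whether this node may add replicas via raft learners:
// the whole cluster must be past VersionLearnerReplicas AND the operator must
// have opted in via kv.learner_replicas.enabled.
func useLearners(st *cluster.Settings) bool {
	return st.Version.IsActive(cluster.VersionLearnerReplicas) &&
		useLearnerReplicas.Get(&st.SV)
}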
5 changes: 3 additions & 2 deletions pkg/settings/cluster/versionkey_string.go

(Generated file; diff not rendered.)

36 changes: 34 additions & 2 deletions pkg/storage/allocator.go
@@ -53,6 +53,7 @@ const (
 	addMissingReplicaPriority             float64 = 10000
 	addDecommissioningReplacementPriority float64 = 5000
 	removeDeadReplicaPriority             float64 = 1000
+	removeLearnerReplicaPriority          float64 = 300
 	removeDecommissioningReplicaPriority  float64 = 200
 	removeExtraReplicaPriority            float64 = 100
 )
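
These priorities determine which allocator action is attempted first when several apply: the new learner-removal priority (300) slots below dead-replica removal (1000) but above decommissioning removal (200). A runnable sketch of the resulting ordering (simplified; not the replicate queue's actual scheduling code):

package main

import (
	"fmt"
	"sort"
)

func main() {
	// Priorities copied from the constants above.
	priorities := map[string]float64{
		"add missing":                      10000,
		"add decommissioning replacement":  5000,
		"remove dead":                      1000,
		"remove learner":                   300,
		"remove decommissioning":           200,
		"remove extra":                     100,
	}
	actions := make([]string, 0, len(priorities))
	for a := range priorities {
		actions = append(actions, a)
	}
	// Higher priority is handled first: learner removal preempts
	// decommissioning removal but yields to dead-replica removal.
	sort.Slice(actions, func(i, j int) bool {
		return priorities[actions[i]] > priorities[actions[j]]
	})
	fmt.Println(actions)
	// [add missing add decommissioning replacement remove dead
	//  remove learner remove decommissioning remove extra]
}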
@@ -100,6 +101,7 @@ const (
 	AllocatorAdd
 	AllocatorRemoveDead
 	AllocatorRemoveDecommissioning
+	AllocatorRemoveLearner
 	AllocatorConsiderRebalance
 )

@@ -109,6 +111,7 @@ var allocatorActionNames = map[AllocatorAction]string{
 	AllocatorAdd:                   "add",
 	AllocatorRemoveDead:            "remove dead",
 	AllocatorRemoveDecommissioning: "remove decommissioning",
+	AllocatorRemoveLearner:         "remove learner",
 	AllocatorConsiderRebalance:     "consider rebalance",
 }

@@ -296,8 +299,37 @@ func (a *Allocator) ComputeAction(
 		// Do nothing if storePool is nil for some unittests.
 		return AllocatorNoop, 0
 	}
-	// TODO(mrtracy): Handle non-homogeneous and mismatched attribute sets.
 
+	// Seeing a learner replica at this point is unexpected because learners
+	// are a short-lived (ish) transient state in a learner+snapshot+voter
+	// cycle, which is always done atomically. Only two places could have added
+	// a learner: the replicate queue or an AdminChangeReplicas request.
+	//
+	// The replicate queue only operates on leaseholders, which means that only
+	// one node at a time is operating on a given range. Combined with the above
+	// atomicity, this means that if the replicate queue sees a learner, either
+	// the node that was adding it crashed somewhere in the learner+snapshot+voter
+	// cycle and we're the new leaseholder, or we caught a race with
+	// AdminChangeReplicas.
+	//
+	// In the first case, we could assume the node that was adding it knew what
+	// it was doing and finish the addition. Or we could leave it and do higher
+	// priority operations first, if there are any. However, this comes with
+	// code complexity and concept complexity (computing old vs new quorum
+	// sizes becomes ambiguous; the learner isn't in the quorum but it likely
+	// will be soon, so do you count it?). Instead, we do the simplest thing
+	// and remove it before doing any other operations on the range, and we'll
+	// revisit this decision if and when the complexity becomes necessary.
+	//
+	// TODO(dan): Address the race with the AdminChangeReplicas request.
+	if learners := rangeInfo.Desc.Replicas().Learners(); len(learners) > 0 {
+		// TODO(dan): Since this goes before anything else, the priority here
+		// should be influenced by whatever operations would happen right after
+		// the learner is removed.
+		return AllocatorRemoveLearner, removeLearnerReplicaPriority
+	}
+
+	// TODO(mrtracy): Handle non-homogeneous and mismatched attribute sets.
 	have := len(rangeInfo.Desc.Replicas().Unwrap())
 	decommissioningReplicas := a.storePool.decommissioningReplicas(
 		rangeInfo.Desc.RangeID, rangeInfo.Desc.Replicas().Unwrap())
@@ -654,7 +686,7 @@ func (a Allocator) RebalanceTarget(
 
 		log.VEventf(ctx, 2, "not rebalancing to s%d because we'd immediately remove it: %s",
 			target.store.StoreID, removeDetails)
-		rangeInfo.Desc.RemoveReplica(newReplica)
+		rangeInfo.Desc.RemoveReplica(newReplica.NodeID, newReplica.StoreID)
 	}
 
 	// Compile the details entry that will be persisted into system.rangelog for
34 changes: 34 additions & 0 deletions pkg/storage/allocator_test.go
@@ -46,6 +46,7 @@ import (
 	"github.com/olekukonko/tablewriter"
 	"github.com/pkg/errors"
+	"github.com/stretchr/testify/require"
 	"go.etcd.io/etcd/raft"
 )
 
 const firstRange = roachpb.RangeID(1)
@@ -4771,6 +4772,39 @@ func TestAllocatorComputeActionDecommission(t *testing.T) {
 	}
 }
 
+func TestAllocatorRemoveLearner(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+
+	zone := config.ZoneConfig{
+		NumReplicas: proto.Int32(3),
+	}
+	rangeWithLearnerDesc := roachpb.RangeDescriptor{
+		InternalReplicas: []roachpb.ReplicaDescriptor{
+			{
+				StoreID:   1,
+				NodeID:    1,
+				ReplicaID: 1,
+			},
+			{
+				StoreID:   2,
+				NodeID:    2,
+				ReplicaID: 2,
+				Type:      roachpb.ReplicaType_LEARNER,
+			},
+		},
+	}
+
+	// Removing a learner is prioritized over adding a new replica to an
+	// under-replicated range.
+	stopper, _, sp, a, _ := createTestAllocator(10, false /* deterministic */)
+	ctx := context.Background()
+	defer stopper.Stop(ctx)
+	live, dead := []roachpb.StoreID{1, 2}, []roachpb.StoreID{3}
+	mockStorePool(sp, live, nil, dead, nil, nil)
+	action, _ := a.ComputeAction(ctx, &zone, RangeInfo{Desc: &rangeWithLearnerDesc})
+	require.Equal(t, AllocatorRemoveLearner, action)
+}
+
 func TestAllocatorComputeActionDynamicNumReplicas(t *testing.T) {
 	defer leaktest.AfterTest(t)()
 