Merge pull request #33047 from a-robinson/backport-32949
release-2.1: storage: Fix GetNeededReplicas to count non-live, non-dead nodes
tbg authored Dec 12, 2018
2 parents 138369e + 4aa0b52 commit 67e4d98
Showing 10 changed files with 175 additions and 94 deletions.
1 change: 1 addition & 0 deletions pkg/server/server.go
@@ -545,6 +545,7 @@ func NewServer(cfg Config, stopper *stop.Stopper) (*Server, error) {
s.gossip,
s.recorder,
s.nodeLiveness,
+ s.storePool,
s.rpcContext,
s.node.stores,
s.stopper,
10 changes: 7 additions & 3 deletions pkg/server/status.go
@@ -35,6 +35,7 @@ import (
"strings"
"sync"

"github.com/cockroachdb/cockroach/pkg/config"
"github.com/cockroachdb/cockroach/pkg/storage/engine"
gwruntime "github.com/grpc-ecosystem/grpc-gateway/runtime"
"github.com/pkg/errors"
@@ -46,7 +47,6 @@

"github.com/cockroachdb/cockroach/pkg/base"
"github.com/cockroachdb/cockroach/pkg/build"
"github.com/cockroachdb/cockroach/pkg/config"
"github.com/cockroachdb/cockroach/pkg/gossip"
"github.com/cockroachdb/cockroach/pkg/internal/client"
"github.com/cockroachdb/cockroach/pkg/keys"
@@ -130,6 +130,7 @@ type statusServer struct {
gossip *gossip.Gossip
metricSource metricMarshaler
nodeLiveness *storage.NodeLiveness
+ storePool *storage.StorePool
rpcCtx *rpc.Context
stores *storage.Stores
stopper *stop.Stopper
@@ -146,6 +147,7 @@ func newStatusServer(
gossip *gossip.Gossip,
metricSource metricMarshaler,
nodeLiveness *storage.NodeLiveness,
+ storePool *storage.StorePool,
rpcCtx *rpc.Context,
stores *storage.Stores,
stopper *stop.Stopper,
@@ -161,6 +163,7 @@
gossip: gossip,
metricSource: metricSource,
nodeLiveness: nodeLiveness,
+ storePool: storePool,
rpcCtx: rpcCtx,
stores: stores,
stopper: stopper,
@@ -1230,6 +1233,7 @@ func (s *statusServer) Ranges(
cfg = config.SystemConfig{}
}
isLiveMap := s.nodeLiveness.GetIsLiveMap()
+ availableNodes := s.storePool.AvailableNodeCount()

err = s.stores.VisitStores(func(store *storage.Store) error {
timestamp := store.Clock().Now()
@@ -1249,7 +1253,7 @@
desc,
rep,
store.Ident.StoreID,
- rep.Metrics(ctx, timestamp, cfg, isLiveMap),
+ rep.Metrics(ctx, timestamp, cfg, isLiveMap, availableNodes),
))
return false, nil
})
@@ -1269,7 +1273,7 @@
*desc,
rep,
store.Ident.StoreID,
- rep.Metrics(ctx, timestamp, cfg, isLiveMap),
+ rep.Metrics(ctx, timestamp, cfg, isLiveMap, availableNodes),
))
}
return nil
26 changes: 15 additions & 11 deletions pkg/storage/allocator.go
@@ -239,17 +239,19 @@ func MakeAllocator(
}
}

- // GetNeededReplicas calculates the number of replicas a range should have given its zone config and
- // dynamic up and down replication.
- func GetNeededReplicas(
- zoneConfigReplicaCount int32, aliveReplicas int, decommissioningReplicas int,
- ) int {
+ // GetNeededReplicas calculates the number of replicas a range should
+ // have given its zone config and the number of nodes available for
+ // up-replication (i.e. not dead and not decommissioning).
+ func GetNeededReplicas(zoneConfigReplicaCount int32, availableNodes int) int {
numZoneReplicas := int(zoneConfigReplicaCount)
need := numZoneReplicas

- // We're adjusting the replication factor all ranges so that if there are less nodes than
- // replicas specified in the zone config, the cluster can still function.
- need = int(math.Min(float64(aliveReplicas-decommissioningReplicas), float64(need)))
+ // Adjust the replication factor for all ranges if there are fewer
+ // nodes than replicas specified in the zone config, so the cluster
+ // can still function.
+ if availableNodes < need {
+ need = availableNodes
+ }

// Ensure that we don't up- or down-replicate to an even number of replicas
// unless an even number of replicas was specifically requested by the user
@@ -264,7 +266,9 @@ func GetNeededReplicas(
if need%2 == 0 {
need = need - 1
}
- need = int(math.Max(3.0, float64(need)))
+ if need < 3 {
+ need = 3
+ }
if need > numZoneReplicas {
need = numZoneReplicas
}
@@ -286,8 +290,8 @@ func (a *Allocator) ComputeAction(

have := len(rangeInfo.Desc.Replicas)
decommissioningReplicas := a.storePool.decommissioningReplicas(rangeInfo.Desc.RangeID, rangeInfo.Desc.Replicas)
- _, aliveStoreCount, _ := a.storePool.getStoreList(rangeInfo.Desc.RangeID, storeFilterNone)
- need := GetNeededReplicas(zone.NumReplicas, aliveStoreCount, len(decommissioningReplicas))
+ availableNodes := a.storePool.AvailableNodeCount()
+ need := GetNeededReplicas(zone.NumReplicas, availableNodes)
desiredQuorum := computeQuorum(need)
quorum := computeQuorum(have)

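
For readers who want to see the new policy end to end, here is a minimal, self-contained Go sketch of the replica-count logic introduced in the allocator.go hunks above. It is illustrative only, not CockroachDB's actual package: the helper name neededReplicas is made up, and the early return that honors an explicitly requested even replica count sits in context collapsed between the hunks, so it is inferred from the comment that precedes it.

package main

import "fmt"

// neededReplicas mirrors the new GetNeededReplicas logic shown above: clamp
// the zone's requested replica count to the number of available (not dead,
// not decommissioning) nodes, then avoid even replica counts unless the zone
// explicitly asked for one.
func neededReplicas(zoneConfigReplicaCount int32, availableNodes int) int {
    numZoneReplicas := int(zoneConfigReplicaCount)
    need := numZoneReplicas

    // Fewer available nodes than requested replicas: shrink the target so
    // the cluster can still function.
    if availableNodes < need {
        need = availableNodes
    }

    // If the zone's request is already satisfiable, honor it exactly, even
    // if it is an even number. (This early return lies in the collapsed
    // context between the hunks and is inferred from the comment above it.)
    if need == numZoneReplicas {
        return need
    }

    // Otherwise avoid even replica counts, keep at least three replicas,
    // and never exceed what the zone asked for.
    if need%2 == 0 {
        need = need - 1
    }
    if need < 3 {
        need = 3
    }
    if need > numZoneReplicas {
        need = numZoneReplicas
    }
    return need
}

func main() {
    fmt.Println(neededReplicas(3, 5)) // 3: enough nodes, zone satisfied
    fmt.Println(neededReplicas(5, 4)) // 3: clamp to 4, then avoid the even count
    fmt.Println(neededReplicas(6, 6)) // 6: an explicitly requested even count is honored
}

Under the old signature, a node that was briefly non-live (but not yet dead) reduced aliveReplicas and could shrink the computed target; counting nodes that are simply not dead and not decommissioning keeps the target steady through short liveness blips, which matches the behavior change described in the commit title.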