storage: Add metrics tracking load-based rebalance operations
Release note: None
a-robinson committed Sep 6, 2018
1 parent 9d4d78d commit 8488d02
Showing 3 changed files with 46 additions and 7 deletions.
10 changes: 6 additions & 4 deletions pkg/storage/store.go
@@ -1021,10 +1021,10 @@ func NewStore(cfg StoreConfig, eng engine.Engine, nodeDesc *roachpb.NodeDescriptor
 			)
 			s.scanner.AddQueues(s.tsMaintenanceQueue)
 		}
-	}
 
-	s.storeRebalancer = NewStoreRebalancer(
-		s.cfg.AmbientCtx, cfg.Settings, s.replicateQueue, s.replRankings)
+		s.storeRebalancer = NewStoreRebalancer(
+			s.cfg.AmbientCtx, cfg.Settings, s.replicateQueue, s.replRankings)
+	}
 
 	if cfg.TestingKnobs.DisableGCQueue {
 		s.setGCQueueActive(false)
@@ -1560,7 +1560,9 @@ func (s *Store) Start(ctx context.Context, stopper *stop.Stopper) error {
 	// Connect rangefeeds to closed timestamp updates.
 	s.startClosedTimestampRangefeedSubscriber(ctx)
 
-	s.storeRebalancer.Start(ctx, s.stopper)
+	if s.storeRebalancer != nil {
+		s.storeRebalancer.Start(ctx, s.stopper)
+	}
 
 	// Start the storage engine compactor.
 	if envutil.EnvOrDefaultBool("COCKROACH_ENABLE_COMPACTOR", true) {
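The two hunks above are a prerequisite for the metrics change in the next file: NewStoreRebalancer now registers its metrics through the replicate queue's store registry, so the rebalancer is constructed only in the branch where that queue is set up, and Store.Start gains a nil check before starting it. Below is a minimal sketch of that construct-only-when-the-dependency-exists / guard-on-start pattern; the type and function names are hypothetical stand-ins, not CockroachDB code.

```go
package main

import (
	"context"
	"fmt"
)

// Hypothetical stand-ins: the rebalancer needs its queue at construction
// time (in the commit, to register metrics against the queue's store
// registry), so it can only be built when the queue was built.
type replQueue struct{}

type loadRebalancer struct{ rq *replQueue }

func newLoadRebalancer(rq *replQueue) *loadRebalancer {
	return &loadRebalancer{rq: rq}
}

func (lr *loadRebalancer) Start(ctx context.Context) {
	fmt.Println("rebalancer loop started")
}

func main() {
	var rq *replQueue // nil when the queues were never configured
	var lr *loadRebalancer

	if rq != nil {
		// Construct only inside the branch where the dependency exists,
		// mirroring the move of NewStoreRebalancer in NewStore.
		lr = newLoadRebalancer(rq)
	}

	// Startup must tolerate the rebalancer being absent, hence the nil
	// guard added around storeRebalancer.Start in Store.Start.
	if lr != nil {
		lr.Start(context.Background())
	}
	fmt.Println("store startup continues either way")
}
```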
41 changes: 38 additions & 3 deletions pkg/storage/store_rebalancer.go
@@ -26,6 +26,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
 	"github.com/cockroachdb/cockroach/pkg/util/hlc"
 	"github.com/cockroachdb/cockroach/pkg/util/log"
+	"github.com/cockroachdb/cockroach/pkg/util/metric"
 	"github.com/cockroachdb/cockroach/pkg/util/stop"
 )
 
@@ -42,6 +43,34 @@ const (
 	minQPSThresholdDifference = 100
 )
 
+var (
+	metaStoreRebalancerLeaseTransferCount = metric.Metadata{
+		Name:        "rebalancing.lease.transfers",
+		Help:        "Number of lease transfers motivated by store-level load imbalances",
+		Measurement: "Lease Transfers",
+		Unit:        metric.Unit_COUNT,
+	}
+	metaStoreRebalancerRangeRebalanceCount = metric.Metadata{
+		Name:        "rebalancing.range.rebalances",
+		Help:        "Number of range rebalance operations motivated by store-level load imbalances",
+		Measurement: "Range Rebalances",
+		Unit:        metric.Unit_COUNT,
+	}
+)
+
+// StoreRebalancerMetrics is the set of metrics for the store-level rebalancer.
+type StoreRebalancerMetrics struct {
+	LeaseTransferCount  *metric.Counter
+	RangeRebalanceCount *metric.Counter
+}
+
+func makeStoreRebalancerMetrics() StoreRebalancerMetrics {
+	return StoreRebalancerMetrics{
+		LeaseTransferCount:  metric.NewCounter(metaStoreRebalancerLeaseTransferCount),
+		RangeRebalanceCount: metric.NewCounter(metaStoreRebalancerRangeRebalanceCount),
+	}
+}
+
 // LoadBasedRebalancingMode controls whether range rebalancing takes
 // additional variables such as write load and disk usage into account.
 // If disabled, rebalancing is done purely based on replica count.
@@ -95,6 +124,7 @@ const (
 // will best accomplish the store-level goals.
 type StoreRebalancer struct {
 	log.AmbientContext
+	metrics      StoreRebalancerMetrics
 	st           *cluster.Settings
 	rq           *replicateQueue
 	replRankings *replicaRankings
@@ -109,12 +139,15 @@ func NewStoreRebalancer(
 	replRankings *replicaRankings,
 ) *StoreRebalancer {
 	ambientCtx.AddLogTag("store-rebalancer", nil)
-	return &StoreRebalancer{
+	sr := &StoreRebalancer{
 		AmbientContext: ambientCtx,
+		metrics:        makeStoreRebalancerMetrics(),
 		st:             st,
 		rq:             rq,
 		replRankings:   replRankings,
 	}
+	sr.rq.store.metrics.registry.AddMetricStruct(&sr.metrics)
+	return sr
 }
 
 // Start runs an infinite loop in a goroutine which regularly checks whether
@@ -217,6 +250,7 @@ func (sr *StoreRebalancer) rebalanceStore(
 			continue
 		}
 		cancel()
+		sr.metrics.LeaseTransferCount.Inc(1)
 
 		// Finally, update our local copies of the descriptors so that if
 		// additional transfers are needed we'll be making the decisions with more
@@ -271,14 +305,15 @@ func (sr *StoreRebalancer) rebalanceStore(
 		log.VEventf(ctx, 1, "rebalancing r%d (%.2f qps) from %v to %v to better balance load",
 			replWithStats.repl.RangeID, replWithStats.qps, descBeforeRebalance.Replicas, targets)
 		replCtx, cancel := context.WithTimeout(replWithStats.repl.AnnotateCtx(ctx), sr.rq.processTimeout)
-		// TODO: Either make RelocateRange production-ready or do the rebalancing
-		// another way.
+		// TODO(a-robinson): Either make RelocateRange production-ready or do the
+		// rebalancing another way.
 		if err := RelocateRange(replCtx, sr.rq.store.DB(), *descBeforeRebalance, targets); err != nil {
 			cancel()
 			log.Errorf(replCtx, "unable to relocate range to %v: %v", targets, err)
 			continue
 		}
 		cancel()
+		sr.metrics.RangeRebalanceCount.Inc(1)
 
 		// Finally, update our local copies of the descriptors so that if
 		// additional transfers are needed we'll be making the decisions with more
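For context on the pattern above: each counter is declared via a metric.Metadata, created with metric.NewCounter, grouped into a plain struct, and registered wholesale with AddMetricStruct so it is exported under the store's metrics (the dashboard hunk below charts them as cr.store.rebalancing.lease.transfers and cr.store.rebalancing.range.rebalances). The following is a small self-contained sketch of that flow, not part of the commit; it assumes metric.NewRegistry and Counter.Count behave as they do elsewhere in the util/metric package.

```go
package main

import (
	"fmt"

	"github.com/cockroachdb/cockroach/pkg/util/metric"
)

// demoMetrics mirrors the shape of StoreRebalancerMetrics: a plain struct of
// counters that a metric.Registry can discover via AddMetricStruct.
type demoMetrics struct {
	LeaseTransferCount  *metric.Counter
	RangeRebalanceCount *metric.Counter
}

func main() {
	leaseMeta := metric.Metadata{
		Name:        "rebalancing.lease.transfers",
		Help:        "Number of lease transfers motivated by store-level load imbalances",
		Measurement: "Lease Transfers",
		Unit:        metric.Unit_COUNT,
	}
	rangeMeta := metric.Metadata{
		Name:        "rebalancing.range.rebalances",
		Help:        "Number of range rebalance operations motivated by store-level load imbalances",
		Measurement: "Range Rebalances",
		Unit:        metric.Unit_COUNT,
	}

	m := demoMetrics{
		LeaseTransferCount:  metric.NewCounter(leaseMeta),
		RangeRebalanceCount: metric.NewCounter(rangeMeta),
	}

	// Registering the struct (rather than each counter) is what the commit
	// does with sr.rq.store.metrics.registry.AddMetricStruct(&sr.metrics).
	registry := metric.NewRegistry()
	registry.AddMetricStruct(&m)

	// Each successful lease transfer / range rebalance bumps its counter by 1.
	m.LeaseTransferCount.Inc(1)
	m.RangeRebalanceCount.Inc(1)
	m.RangeRebalanceCount.Inc(1)

	fmt.Println("lease transfers:", m.LeaseTransferCount.Count())   // 1
	fmt.Println("range rebalances:", m.RangeRebalanceCount.Count()) // 2
}
```

Registering the whole struct rather than individual counters means any counter later added to StoreRebalancerMetrics is picked up by the same AddMetricStruct call without further wiring.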
@@ -85,6 +85,8 @@ export default function (props: GraphDashboardProps) {
         <Metric name="cr.store.range.adds" title="Adds" nonNegativeRate />
         <Metric name="cr.store.range.removes" title="Removes" nonNegativeRate />
         <Metric name="cr.store.leases.transfers.success" title="Lease Transfers" nonNegativeRate />
+        <Metric name="cr.store.rebalancing.lease.transfers" title="Load-based Lease Transfers" nonNegativeRate />
+        <Metric name="cr.store.rebalancing.range.rebalances" title="Load-based Range Rebalances" nonNegativeRate />
       </Axis>
     </LineGraph>,
 
