From 8488d026f1c50794f9fc94f028b8452fa26bc78e Mon Sep 17 00:00:00 2001
From: Alex Robinson
Date: Wed, 5 Sep 2018 11:55:46 -0500
Subject: [PATCH] storage: Add metrics tracking load-based rebalance operations

Release note: None
---
 pkg/storage/store.go                              | 10 +++--
 pkg/storage/store_rebalancer.go                   | 41 +++++++++++++++++--
 .../nodeGraphs/dashboards/replication.tsx         |  2 +
 3 files changed, 46 insertions(+), 7 deletions(-)

diff --git a/pkg/storage/store.go b/pkg/storage/store.go
index 66ea4b7cd31d..23f692c23e17 100644
--- a/pkg/storage/store.go
+++ b/pkg/storage/store.go
@@ -1021,10 +1021,10 @@ func NewStore(cfg StoreConfig, eng engine.Engine, nodeDesc *roachpb.NodeDescript
 			)
 			s.scanner.AddQueues(s.tsMaintenanceQueue)
 		}
-	}
 
-	s.storeRebalancer = NewStoreRebalancer(
-		s.cfg.AmbientCtx, cfg.Settings, s.replicateQueue, s.replRankings)
+		s.storeRebalancer = NewStoreRebalancer(
+			s.cfg.AmbientCtx, cfg.Settings, s.replicateQueue, s.replRankings)
+	}
 
 	if cfg.TestingKnobs.DisableGCQueue {
 		s.setGCQueueActive(false)
@@ -1560,7 +1560,9 @@ func (s *Store) Start(ctx context.Context, stopper *stop.Stopper) error {
 	// Connect rangefeeds to closed timestamp updates.
 	s.startClosedTimestampRangefeedSubscriber(ctx)
 
-	s.storeRebalancer.Start(ctx, s.stopper)
+	if s.storeRebalancer != nil {
+		s.storeRebalancer.Start(ctx, s.stopper)
+	}
 
 	// Start the storage engine compactor.
 	if envutil.EnvOrDefaultBool("COCKROACH_ENABLE_COMPACTOR", true) {
diff --git a/pkg/storage/store_rebalancer.go b/pkg/storage/store_rebalancer.go
index 4627828b824c..41daaf277a25 100644
--- a/pkg/storage/store_rebalancer.go
+++ b/pkg/storage/store_rebalancer.go
@@ -26,6 +26,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
 	"github.com/cockroachdb/cockroach/pkg/util/hlc"
 	"github.com/cockroachdb/cockroach/pkg/util/log"
+	"github.com/cockroachdb/cockroach/pkg/util/metric"
 	"github.com/cockroachdb/cockroach/pkg/util/stop"
 )
 
@@ -42,6 +43,34 @@ const (
 	minQPSThresholdDifference = 100
 )
 
+var (
+	metaStoreRebalancerLeaseTransferCount = metric.Metadata{
+		Name:        "rebalancing.lease.transfers",
+		Help:        "Number of lease transfers motivated by store-level load imbalances",
+		Measurement: "Lease Transfers",
+		Unit:        metric.Unit_COUNT,
+	}
+	metaStoreRebalancerRangeRebalanceCount = metric.Metadata{
+		Name:        "rebalancing.range.rebalances",
+		Help:        "Number of range rebalance operations motivated by store-level load imbalances",
+		Measurement: "Range Rebalances",
+		Unit:        metric.Unit_COUNT,
+	}
+)
+
+// StoreRebalancerMetrics is the set of metrics for the store-level rebalancer.
+type StoreRebalancerMetrics struct {
+	LeaseTransferCount  *metric.Counter
+	RangeRebalanceCount *metric.Counter
+}
+
+func makeStoreRebalancerMetrics() StoreRebalancerMetrics {
+	return StoreRebalancerMetrics{
+		LeaseTransferCount:  metric.NewCounter(metaStoreRebalancerLeaseTransferCount),
+		RangeRebalanceCount: metric.NewCounter(metaStoreRebalancerRangeRebalanceCount),
+	}
+}
+
 // LoadBasedRebalancingMode controls whether range rebalancing takes
 // additional variables such as write load and disk usage into account.
 // If disabled, rebalancing is done purely based on replica count.
@@ -95,6 +124,7 @@ const (
 // will best accomplish the store-level goals.
 type StoreRebalancer struct {
 	log.AmbientContext
+	metrics      StoreRebalancerMetrics
 	st           *cluster.Settings
 	rq           *replicateQueue
 	replRankings *replicaRankings
@@ -109,12 +139,15 @@ func NewStoreRebalancer(
 	replRankings *replicaRankings,
 ) *StoreRebalancer {
 	ambientCtx.AddLogTag("store-rebalancer", nil)
-	return &StoreRebalancer{
+	sr := &StoreRebalancer{
 		AmbientContext: ambientCtx,
+		metrics:        makeStoreRebalancerMetrics(),
 		st:             st,
 		rq:             rq,
 		replRankings:   replRankings,
 	}
+	sr.rq.store.metrics.registry.AddMetricStruct(&sr.metrics)
+	return sr
 }
 
 // Start runs an infinite loop in a goroutine which regularly checks whether
@@ -217,6 +250,7 @@ func (sr *StoreRebalancer) rebalanceStore(
 			continue
 		}
 		cancel()
+		sr.metrics.LeaseTransferCount.Inc(1)
 
 		// Finally, update our local copies of the descriptors so that if
 		// additional transfers are needed we'll be making the decisions with more
@@ -271,14 +305,15 @@ func (sr *StoreRebalancer) rebalanceStore(
 		log.VEventf(ctx, 1, "rebalancing r%d (%.2f qps) from %v to %v to better balance load",
 			replWithStats.repl.RangeID, replWithStats.qps, descBeforeRebalance.Replicas, targets)
 		replCtx, cancel := context.WithTimeout(replWithStats.repl.AnnotateCtx(ctx), sr.rq.processTimeout)
-		// TODO: Either make RelocateRange production-ready or do the rebalancing
-		// another way.
+		// TODO(a-robinson): Either make RelocateRange production-ready or do the
+		// rebalancing another way.
 		if err := RelocateRange(replCtx, sr.rq.store.DB(), *descBeforeRebalance, targets); err != nil {
 			cancel()
 			log.Errorf(replCtx, "unable to relocate range to %v: %v", targets, err)
 			continue
 		}
 		cancel()
+		sr.metrics.RangeRebalanceCount.Inc(1)
 
 		// Finally, update our local copies of the descriptors so that if
 		// additional transfers are needed we'll be making the decisions with more
diff --git a/pkg/ui/src/views/cluster/containers/nodeGraphs/dashboards/replication.tsx b/pkg/ui/src/views/cluster/containers/nodeGraphs/dashboards/replication.tsx
index 135bd59f95a6..c81e41a2eb9d 100644
--- a/pkg/ui/src/views/cluster/containers/nodeGraphs/dashboards/replication.tsx
+++ b/pkg/ui/src/views/cluster/containers/nodeGraphs/dashboards/replication.tsx
@@ -85,6 +85,8 @@ export default function (props: GraphDashboardProps) {
+        <Metric name="cr.store.rebalancing.lease.transfers" title="Load-based Lease Transfers" nonNegativeRate />
+        <Metric name="cr.store.rebalancing.range.rebalances" title="Load-based Range Rebalances" nonNegativeRate />
       </Axis>
     </LineGraph>,