From d91c823a016a2bd5bb58bb36016d489ae4678bdc Mon Sep 17 00:00:00 2001 From: lhy1024 Date: Tue, 27 Dec 2022 02:14:19 +0800 Subject: [PATCH 1/5] *: reduce WithLabelValues of prometheus Signed-off-by: lhy1024 --- pkg/storage/kv/etcd_kv.go | 18 ++- server/cluster/cluster.go | 14 ++- server/cluster/coordinator.go | 11 +- server/metrics.go | 1 + server/region_syncer/history_buffer.go | 13 ++- .../schedule/checker/joint_state_checker.go | 21 +++- server/schedule/checker/learner_checker.go | 7 +- server/schedule/checker/merge_checker.go | 71 ++++++++---- server/schedule/checker/replica_checker.go | 71 ++++++++---- server/schedule/checker/rule_checker.go | 102 +++++++++++------ server/schedule/checker/split_checker.go | 12 +- server/schedule/filter/filters.go | 1 + server/schedule/hbstream/metric.go | 1 + server/schedule/metrics.go | 1 + server/schedule/region_scatterer.go | 37 +++--- server/schedulers/balance_leader.go | 32 ++++-- server/schedulers/balance_region.go | 29 +++-- server/schedulers/evict_leader.go | 19 +++- server/schedulers/evict_slow_store.go | 5 +- server/schedulers/grant_hot_region.go | 10 +- server/schedulers/grant_leader.go | 13 ++- server/schedulers/hot_region.go | 105 +++++++++++++++--- server/schedulers/label.go | 19 +++- server/schedulers/metrics.go | 2 + server/schedulers/random_merge.go | 22 +++- server/schedulers/scatter_range.go | 39 ++++--- server/schedulers/shuffle_hot_region.go | 13 ++- server/schedulers/shuffle_leader.go | 16 ++- server/schedulers/shuffle_region.go | 24 ++-- server/schedulers/split_bucket.go | 34 ++++-- server/schedulers/transfer_witness_leader.go | 13 ++- server/schedulers/utils.go | 2 + server/server.go | 13 ++- server/statistics/region_collection.go | 49 +++++--- server/statistics/store_collection.go | 2 +- server/statistics/store_hot_peers_infos.go | 1 + server/tso/metrics.go | 1 + tools/pd-simulator/simulator/task.go | 2 + 38 files changed, 612 insertions(+), 234 deletions(-) diff --git a/pkg/storage/kv/etcd_kv.go b/pkg/storage/kv/etcd_kv.go index 4a941e1abd4..5f8f92e4e98 100644 --- a/pkg/storage/kv/etcd_kv.go +++ b/pkg/storage/kv/etcd_kv.go @@ -34,6 +34,14 @@ const ( slowRequestTime = time.Second ) +var ( + // WithLabelValues is a heavy operation, define variable to avoid call it every time. + txnFailedCounter = txnCounter.WithLabelValues("failed") + txnSuccessCounter = txnCounter.WithLabelValues("success") + txnFailedDurationHist = txnDuration.WithLabelValues("failed") + txnSuccessDurationHist = txnDuration.WithLabelValues("success") +) + type etcdKVBase struct { client *clientv3.Client rootPath string @@ -162,12 +170,14 @@ func (t *SlowLogTxn) Commit() (*clientv3.TxnResponse, error) { zap.Duration("cost", cost), errs.ZapError(err)) } - label := "success" + if err != nil { - label = "failed" + txnFailedCounter.Inc() + txnFailedDurationHist.Observe(cost.Seconds()) + } else { + txnSuccessCounter.Inc() + txnSuccessDurationHist.Observe(cost.Seconds()) } - txnCounter.WithLabelValues(label).Inc() - txnDuration.WithLabelValues(label).Observe(cost.Seconds()) return resp, errors.WithStack(err) } diff --git a/server/cluster/cluster.go b/server/cluster/cluster.go index ae615dc35e8..0c22a595560 100644 --- a/server/cluster/cluster.go +++ b/server/cluster/cluster.go @@ -64,8 +64,12 @@ var ( // DefaultMinResolvedTSPersistenceInterval is the default value of min resolved ts persistence interval. 
// If interval in config is zero, it means not to persist resolved ts and check config with this DefaultMinResolvedTSPersistenceInterval DefaultMinResolvedTSPersistenceInterval = config.DefaultMinResolvedTSPersistenceInterval - regionUpdateCacheEventCounter = regionEventCounter.WithLabelValues("update_cache") - regionUpdateKVEventCounter = regionEventCounter.WithLabelValues("update_kv") + // WithLabelValues is a heavy operation, define variable to avoid call it every time. + regionUpdateCacheEventCounter = regionEventCounter.WithLabelValues("update_cache") + regionUpdateKVEventCounter = regionEventCounter.WithLabelValues("update_kv") + regionCacheMissCounter = bucketEventCounter.WithLabelValues("region_cache_miss") + versionNotMatchCounter = bucketEventCounter.WithLabelValues("version_not_match") + updateFailedCounter = bucketEventCounter.WithLabelValues("update_failed") ) // regionLabelGCInterval is the interval to run region-label's GC work. @@ -795,7 +799,7 @@ func (c *RaftCluster) HandleStoreHeartbeat(heartbeat *pdpb.StoreHeartbeatRequest func (c *RaftCluster) processReportBuckets(buckets *metapb.Buckets) error { region := c.core.GetRegion(buckets.GetRegionId()) if region == nil { - bucketEventCounter.WithLabelValues("region_cache_miss").Inc() + regionCacheMissCounter.Inc() return errors.Errorf("region %v not found", buckets.GetRegionId()) } // use CAS to update the bucket information. @@ -806,7 +810,7 @@ func (c *RaftCluster) processReportBuckets(buckets *metapb.Buckets) error { old := region.GetBuckets() // region should not update if the version of the buckets is less than the old one. if old != nil && buckets.GetVersion() <= old.GetVersion() { - bucketEventCounter.WithLabelValues("version_not_match").Inc() + versionNotMatchCounter.Inc() return nil } failpoint.Inject("concurrentBucketHeartbeat", func() { @@ -816,7 +820,7 @@ func (c *RaftCluster) processReportBuckets(buckets *metapb.Buckets) error { return nil } } - bucketEventCounter.WithLabelValues("update_failed").Inc() + updateFailedCounter.Inc() return nil } diff --git a/server/cluster/coordinator.go b/server/cluster/coordinator.go index 03f0182f2ba..1271e768992 100644 --- a/server/cluster/coordinator.go +++ b/server/cluster/coordinator.go @@ -58,6 +58,12 @@ const ( PluginUnload = "PluginUnload" ) +var ( + // WithLabelValues is a heavy operation, define variable to avoid call it every time. + waitingListGauge = regionListGauge.WithLabelValues("waiting_list") + priorityListGauge = regionListGauge.WithLabelValues("priority_list") +) + // coordinator is used to manage all schedulers and checkers to decide if the region needs to be scheduled. 
type coordinator struct { syncutil.RWMutex @@ -181,7 +187,7 @@ func (c *coordinator) checkSuspectRegions() { func (c *coordinator) checkWaitingRegions() { items := c.checkers.GetWaitingRegions() - regionListGauge.WithLabelValues("waiting_list").Set(float64(len(items))) + waitingListGauge.Set(float64(len(items))) for _, item := range items { region := c.cluster.GetRegion(item.Key) c.tryAddOperators(region) @@ -192,7 +198,7 @@ func (c *coordinator) checkWaitingRegions() { func (c *coordinator) checkPriorityRegions() { items := c.checkers.GetPriorityRegions() removes := make([]uint64, 0) - regionListGauge.WithLabelValues("priority_list").Set(float64(len(items))) + priorityListGauge.Set(float64(len(items))) for _, id := range items { region := c.cluster.GetRegion(id) if region == nil { @@ -569,6 +575,7 @@ func collectHotMetrics(cluster *RaftCluster, stores []*core.StoreInfo, typ stati status := statistics.CollectHotPeerInfos(stores, regionStats) // only returns TotalBytesRate,TotalKeysRate,TotalQueryRate,Count for _, s := range stores { + // todo: pre-allocate gauge metrics storeAddress := s.GetAddress() storeID := s.GetID() storeLabel := strconv.FormatUint(storeID, 10) diff --git a/server/metrics.go b/server/metrics.go index 4a3efaeebbc..338f61d421f 100644 --- a/server/metrics.go +++ b/server/metrics.go @@ -118,6 +118,7 @@ var ( Buckets: prometheus.ExponentialBuckets(0.0001, 2, 29), // 0.1ms ~ 7hours }, []string{"address", "store"}) + // todo: pre-allocate gauge metrics storeHeartbeatHandleDuration = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Namespace: "pd", diff --git a/server/region_syncer/history_buffer.go b/server/region_syncer/history_buffer.go index a8b07498458..a5b58149a4a 100644 --- a/server/region_syncer/history_buffer.go +++ b/server/region_syncer/history_buffer.go @@ -30,6 +30,13 @@ const ( defaultFlushCount = 100 ) +var ( + // WithLabelValues is a heavy operation, define variable to avoid call it every time. 
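+	// syncIndexGauge is set on every Record call; firstIndexGauge and lastIndexGauge on every persist.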
+ syncIndexGauge = regionSyncerStatus.WithLabelValues("sync_index") + firstIndexGauge = regionSyncerStatus.WithLabelValues("first_index") + lastIndexGauge = regionSyncerStatus.WithLabelValues("last_index") +) + type historyBuffer struct { syncutil.RWMutex index uint64 @@ -80,7 +87,7 @@ func (h *historyBuffer) firstIndex() uint64 { func (h *historyBuffer) Record(r *core.RegionInfo) { h.Lock() defer h.Unlock() - regionSyncerStatus.WithLabelValues("sync_index").Set(float64(h.index)) + syncIndexGauge.Set(float64(h.index)) h.records[h.tail] = r h.tail = (h.tail + 1) % h.size if h.tail == h.head { @@ -148,8 +155,8 @@ func (h *historyBuffer) reload() { } func (h *historyBuffer) persist() { - regionSyncerStatus.WithLabelValues("first_index").Set(float64(h.firstIndex())) - regionSyncerStatus.WithLabelValues("last_index").Set(float64(h.nextIndex())) + firstIndexGauge.Set(float64(h.firstIndex())) + lastIndexGauge.Set(float64(h.nextIndex())) err := h.kv.Save(historyKey, strconv.FormatUint(h.nextIndex(), 10)) if err != nil { log.Warn("persist history index failed", zap.Uint64("persist-index", h.nextIndex()), errs.ZapError(err)) diff --git a/server/schedule/checker/joint_state_checker.go b/server/schedule/checker/joint_state_checker.go index cc1edbea5ba..79c961179dd 100644 --- a/server/schedule/checker/joint_state_checker.go +++ b/server/schedule/checker/joint_state_checker.go @@ -28,6 +28,17 @@ type JointStateChecker struct { cluster schedule.Cluster } +const jointStateCheckerName = "joint_state_checker" + +var ( + // WithLabelValues is a heavy operation, define variable to avoid call it every time. + jointCheckCounter = checkerCounter.WithLabelValues(jointStateCheckerName, "check") + jointCheckerPausedCounter = checkerCounter.WithLabelValues(jointStateCheckerName, "paused") + jointCheckerFailedCounter = checkerCounter.WithLabelValues(jointStateCheckerName, "create-operator-fail") + jointCheckerNewOpCounter = checkerCounter.WithLabelValues(jointStateCheckerName, "new-operator") + jointCheckerTransferLeaderCounter = checkerCounter.WithLabelValues(jointStateCheckerName, "transfer-leader") +) + // NewJointStateChecker creates a joint state checker. func NewJointStateChecker(cluster schedule.Cluster) *JointStateChecker { return &JointStateChecker{ @@ -37,9 +48,9 @@ func NewJointStateChecker(cluster schedule.Cluster) *JointStateChecker { // Check verifies a region's role, creating an Operator if need. func (c *JointStateChecker) Check(region *core.RegionInfo) *operator.Operator { - checkerCounter.WithLabelValues("joint_state_checker", "check").Inc() + jointCheckCounter.Inc() if c.IsPaused() { - checkerCounter.WithLabelValues("joint_state_checker", "paused").Inc() + jointCheckerPausedCounter.Inc() return nil } if !core.IsInJointState(region.GetPeers()...) 
{ @@ -47,13 +58,13 @@ func (c *JointStateChecker) Check(region *core.RegionInfo) *operator.Operator { } op, err := operator.CreateLeaveJointStateOperator(operator.OpDescLeaveJointState, c.cluster, region) if err != nil { - checkerCounter.WithLabelValues("joint_state_checker", "create-operator-fail").Inc() + jointCheckerFailedCounter.Inc() log.Debug("fail to create leave joint state operator", errs.ZapError(err)) return nil } else if op != nil { - checkerCounter.WithLabelValues("joint_state_checker", "new-operator").Inc() + jointCheckerNewOpCounter.Inc() if op.Len() > 1 { - checkerCounter.WithLabelValues("joint_state_checker", "transfer-leader").Inc() + jointCheckerTransferLeaderCounter.Inc() } op.SetPriorityLevel(core.High) } diff --git a/server/schedule/checker/learner_checker.go b/server/schedule/checker/learner_checker.go index f120982dd39..8fb76873909 100644 --- a/server/schedule/checker/learner_checker.go +++ b/server/schedule/checker/learner_checker.go @@ -28,6 +28,11 @@ type LearnerChecker struct { cluster schedule.Cluster } +var ( + // WithLabelValues is a heavy operation, define variable to avoid call it every time. + learnerCheckerPausedCounter = checkerCounter.WithLabelValues("learner_checker", "paused") +) + // NewLearnerChecker creates a learner checker. func NewLearnerChecker(cluster schedule.Cluster) *LearnerChecker { return &LearnerChecker{ @@ -38,7 +43,7 @@ func NewLearnerChecker(cluster schedule.Cluster) *LearnerChecker { // Check verifies a region's role, creating an Operator if need. func (l *LearnerChecker) Check(region *core.RegionInfo) *operator.Operator { if l.IsPaused() { - checkerCounter.WithLabelValues("learner_checker", "paused").Inc() + learnerCheckerPausedCounter.Inc() return nil } for _, p := range region.GetLearners() { diff --git a/server/schedule/checker/merge_checker.go b/server/schedule/checker/merge_checker.go index f30d7ebb077..5789c54d363 100644 --- a/server/schedule/checker/merge_checker.go +++ b/server/schedule/checker/merge_checker.go @@ -41,10 +41,37 @@ const ( // When a region has label `merge_option=deny`, skip merging the region. // If label value is `allow` or other value, it will be treated as `allow`. const ( + mergeCheckerName = "merge_checker" mergeOptionLabel = "merge_option" mergeOptionValueDeny = "deny" ) +var ( + // WithLabelValues is a heavy operation, define variable to avoid call it every time. 
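+	// One pre-built counter per merge-check outcome, shared by Check and checkTarget below.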
+ mergeCheckerCounter = checkerCounter.WithLabelValues(mergeCheckerName, "check") + mergeCheckerPausedCounter = checkerCounter.WithLabelValues(mergeCheckerName, "paused") + mergeCheckerRecentlySplitCounter = checkerCounter.WithLabelValues(mergeCheckerName, "recently-split") + mergeCheckerRecentlyStartCounter = checkerCounter.WithLabelValues(mergeCheckerName, "recently-start") + mergeCheckerSkipUninitRegionCounter = checkerCounter.WithLabelValues(mergeCheckerName, "skip-uninit-region") + mergeCheckerNoNeedCounter = checkerCounter.WithLabelValues(mergeCheckerName, "no-need") + mergeCheckerSpecialPeerCounter = checkerCounter.WithLabelValues(mergeCheckerName, "special-peer") + mergeCheckerAbnormalReplicaCounter = checkerCounter.WithLabelValues(mergeCheckerName, "abnormal-replica") + mergeCheckerHotRegionCounter = checkerCounter.WithLabelValues(mergeCheckerName, "hot-region") + mergeCheckerNoTargetCounter = checkerCounter.WithLabelValues(mergeCheckerName, "no-target") + mergeCheckerTargetTooLargeCounter = checkerCounter.WithLabelValues(mergeCheckerName, "target-too-large") + mergeCheckerSplitSizeAfterMergeCounter = checkerCounter.WithLabelValues(mergeCheckerName, "split-size-after-merge") + mergeCheckerSplitKeysAfterMergeCounter = checkerCounter.WithLabelValues(mergeCheckerName, "split-keys-after-merge") + mergeCheckerNewOpCounter = checkerCounter.WithLabelValues(mergeCheckerName, "new-operator") + mergeCheckerLargerSourceCounter = checkerCounter.WithLabelValues(mergeCheckerName, "larger-source") + mergeCheckerAdjNotExistCounter = checkerCounter.WithLabelValues(mergeCheckerName, "adj-not-exist") + mergeCheckerAdjRecentlySplitCounter = checkerCounter.WithLabelValues(mergeCheckerName, "adj-recently-split") + mergeCheckerAdjRegionHotCounter = checkerCounter.WithLabelValues(mergeCheckerName, "adj-region-hot") + mergeCheckerAdjDisallowMergeCounter = checkerCounter.WithLabelValues(mergeCheckerName, "adj-disallow-merge") + mergeCheckerAdjAbnormalPeerStoreCounter = checkerCounter.WithLabelValues(mergeCheckerName, "adj-abnormal-peerstore") + mergeCheckerAdjSpecialPeerCounter = checkerCounter.WithLabelValues(mergeCheckerName, "adj-special-peer") + mergeCheckerAdjAbnormalReplicaCounter = checkerCounter.WithLabelValues(mergeCheckerName, "adj-abnormal-replica") +) + // MergeChecker ensures region to merge with adjacent region when size is small type MergeChecker struct { PauseController @@ -81,51 +108,51 @@ func (m *MergeChecker) RecordRegionSplit(regionIDs []uint64) { // Check verifies a region's replicas, creating an Operator if need. 
func (m *MergeChecker) Check(region *core.RegionInfo) []*operator.Operator { - checkerCounter.WithLabelValues("merge_checker", "check").Inc() + mergeCheckerCounter.Inc() if m.IsPaused() { - checkerCounter.WithLabelValues("merge_checker", "paused").Inc() + mergeCheckerPausedCounter.Inc() return nil } expireTime := m.startTime.Add(m.opts.GetSplitMergeInterval()) if time.Now().Before(expireTime) { - checkerCounter.WithLabelValues("merge_checker", "recently-start").Inc() + mergeCheckerRecentlyStartCounter.Inc() return nil } m.splitCache.UpdateTTL(m.opts.GetSplitMergeInterval()) if m.splitCache.Exists(region.GetID()) { - checkerCounter.WithLabelValues("merge_checker", "recently-split").Inc() + mergeCheckerRecentlySplitCounter.Inc() return nil } // when pd just started, it will load region meta from region storage, if region.GetLeader() == nil { - checkerCounter.WithLabelValues("merge_checker", "skip-uninit-region").Inc() + mergeCheckerSkipUninitRegionCounter.Inc() return nil } // region is not small enough if !region.NeedMerge(int64(m.opts.GetMaxMergeRegionSize()), int64(m.opts.GetMaxMergeRegionKeys())) { - checkerCounter.WithLabelValues("merge_checker", "no-need").Inc() + mergeCheckerNoNeedCounter.Inc() return nil } // skip region has down peers or pending peers if !filter.IsRegionHealthy(region) { - checkerCounter.WithLabelValues("merge_checker", "special-peer").Inc() + mergeCheckerSpecialPeerCounter.Inc() return nil } if !filter.IsRegionReplicated(m.cluster, region) { - checkerCounter.WithLabelValues("merge_checker", "abnormal-replica").Inc() + mergeCheckerAbnormalReplicaCounter.Inc() return nil } // skip hot region if m.cluster.IsRegionHot(region) { - checkerCounter.WithLabelValues("merge_checker", "hot-region").Inc() + mergeCheckerHotRegionCounter.Inc() return nil } @@ -142,7 +169,7 @@ func (m *MergeChecker) Check(region *core.RegionInfo) []*operator.Operator { } if target == nil { - checkerCounter.WithLabelValues("merge_checker", "no-target").Inc() + mergeCheckerNoTargetCounter.Inc() return nil } @@ -152,18 +179,18 @@ func (m *MergeChecker) Check(region *core.RegionInfo) []*operator.Operator { maxTargetRegionSizeThreshold = maxTargetRegionSize } if target.GetApproximateSize() > maxTargetRegionSizeThreshold { - checkerCounter.WithLabelValues("merge_checker", "target-too-large").Inc() + mergeCheckerTargetTooLargeCounter.Inc() return nil } if err := m.cluster.GetStoreConfig().CheckRegionSize(uint64(target.GetApproximateSize()+region.GetApproximateSize()), m.opts.GetMaxMergeRegionSize()); err != nil { - checkerCounter.WithLabelValues("merge_checker", "split-size-after-merge").Inc() + mergeCheckerSplitSizeAfterMergeCounter.Inc() return nil } if err := m.cluster.GetStoreConfig().CheckRegionKeys(uint64(target.GetApproximateKeys()+region.GetApproximateKeys()), m.opts.GetMaxMergeRegionKeys()); err != nil { - checkerCounter.WithLabelValues("merge_checker", "split-keys-after-merge").Inc() + mergeCheckerSplitKeysAfterMergeCounter.Inc() return nil } @@ -175,47 +202,47 @@ func (m *MergeChecker) Check(region *core.RegionInfo) []*operator.Operator { log.Warn("create merge region operator failed", errs.ZapError(err)) return nil } - checkerCounter.WithLabelValues("merge_checker", "new-operator").Inc() + mergeCheckerNewOpCounter.Inc() if region.GetApproximateSize() > target.GetApproximateSize() || region.GetApproximateKeys() > target.GetApproximateKeys() { - checkerCounter.WithLabelValues("merge_checker", "larger-source").Inc() + mergeCheckerLargerSourceCounter.Inc() } return ops } func (m *MergeChecker) 
checkTarget(region, adjacent *core.RegionInfo) bool { if adjacent == nil { - checkerCounter.WithLabelValues("merge_checker", "adj-not-exist").Inc() + mergeCheckerAdjNotExistCounter.Inc() return false } if m.splitCache.Exists(adjacent.GetID()) { - checkerCounter.WithLabelValues("merge_checker", "adj-recently-split").Inc() + mergeCheckerAdjRecentlySplitCounter.Inc() return false } if m.cluster.IsRegionHot(adjacent) { - checkerCounter.WithLabelValues("merge_checker", "adj-region-hot").Inc() + mergeCheckerAdjRegionHotCounter.Inc() return false } if !AllowMerge(m.cluster, region, adjacent) { - checkerCounter.WithLabelValues("merge_checker", "adj-disallow-merge").Inc() + mergeCheckerAdjDisallowMergeCounter.Inc() return false } if !checkPeerStore(m.cluster, region, adjacent) { - checkerCounter.WithLabelValues("merge_checker", "adj-abnormal-peerstore").Inc() + mergeCheckerAdjAbnormalPeerStoreCounter.Inc() return false } if !filter.IsRegionHealthy(adjacent) { - checkerCounter.WithLabelValues("merge_checker", "adj-special-peer").Inc() + mergeCheckerAdjSpecialPeerCounter.Inc() return false } if !filter.IsRegionReplicated(m.cluster, adjacent) { - checkerCounter.WithLabelValues("merge_checker", "adj-abnormal-replica").Inc() + mergeCheckerAdjAbnormalReplicaCounter.Inc() return false } diff --git a/server/schedule/checker/replica_checker.go b/server/schedule/checker/replica_checker.go index fe092a36c46..d2511453ac4 100644 --- a/server/schedule/checker/replica_checker.go +++ b/server/schedule/checker/replica_checker.go @@ -30,11 +30,27 @@ import ( const ( replicaCheckerName = "replica-checker" + replicaChecker = "replica_checker" + offlineStatus = "offline" + downStatus = "down" ) -const ( - offlineStatus = "offline" - downStatus = "down" +var ( + // WithLabelValues is a heavy operation, define variable to avoid call it every time. + replicaCheckerCounter = checkerCounter.WithLabelValues(replicaChecker, "check") + replicaCheckerPausedCounter = checkerCounter.WithLabelValues(replicaChecker, "paused") + replicaCheckerOpCounter = checkerCounter.WithLabelValues(replicaChecker, "new-operator") + replicaCheckerNoTargetStoreCounter = checkerCounter.WithLabelValues(replicaChecker, "no-target-store") + replicaCheckerNoWorstPeerCounter = checkerCounter.WithLabelValues(replicaChecker, "no-worst-peer") + replicaCheckerCreateOpFailedCounter = checkerCounter.WithLabelValues(replicaChecker, "create-operator-failed") + replicaCheckerAllRightCounter = checkerCounter.WithLabelValues(replicaChecker, "all-right") + replicaCheckerNotBetterCounter = checkerCounter.WithLabelValues(replicaChecker, "not-better") + replicaCheckerRemoveExtraOfflineFailedCounter = checkerCounter.WithLabelValues(replicaChecker, "remove-extra-offline-replica-failed") + replicaCheckerRemoveExtraDownFailedCounter = checkerCounter.WithLabelValues(replicaChecker, "remove-extra-down-replica-failed") + replicaCheckerNoStoreOfflineCounter = checkerCounter.WithLabelValues(replicaChecker, "no-store-offline") + replicaCheckerNoStoreDownCounter = checkerCounter.WithLabelValues(replicaChecker, "no-store-down") + replicaCheckerReplaceOfflineFailedCounter = checkerCounter.WithLabelValues(replicaChecker, "replace-offline-replica-failed") + replicaCheckerReplaceDownFailedCounter = checkerCounter.WithLabelValues(replicaChecker, "replace-down-replica-failed") ) // ReplicaChecker ensures region has the best replicas. 
@@ -60,37 +76,37 @@ func NewReplicaChecker(cluster schedule.Cluster, regionWaitingList cache.Cache) // GetType return ReplicaChecker's type func (r *ReplicaChecker) GetType() string { - return "replica-checker" + return replicaCheckerName } // Check verifies a region's replicas, creating an operator.Operator if need. func (r *ReplicaChecker) Check(region *core.RegionInfo) *operator.Operator { - checkerCounter.WithLabelValues("replica_checker", "check").Inc() + replicaCheckerCounter.Inc() if r.IsPaused() { - checkerCounter.WithLabelValues("replica_checker", "paused").Inc() + replicaCheckerPausedCounter.Inc() return nil } if op := r.checkDownPeer(region); op != nil { - checkerCounter.WithLabelValues("replica_checker", "new-operator").Inc() + replicaCheckerOpCounter.Inc() op.SetPriorityLevel(core.High) return op } if op := r.checkOfflinePeer(region); op != nil { - checkerCounter.WithLabelValues("replica_checker", "new-operator").Inc() + replicaCheckerOpCounter.Inc() op.SetPriorityLevel(core.High) return op } if op := r.checkMakeUpReplica(region); op != nil { - checkerCounter.WithLabelValues("replica_checker", "new-operator").Inc() + replicaCheckerOpCounter.Inc() op.SetPriorityLevel(core.High) return op } if op := r.checkRemoveExtraReplica(region); op != nil { - checkerCounter.WithLabelValues("replica_checker", "new-operator").Inc() + replicaCheckerOpCounter.Inc() return op } if op := r.checkLocationReplacement(region); op != nil { - checkerCounter.WithLabelValues("replica_checker", "new-operator").Inc() + replicaCheckerOpCounter.Inc() return op } return nil @@ -160,7 +176,7 @@ func (r *ReplicaChecker) checkMakeUpReplica(region *core.RegionInfo) *operator.O target, filterByTempState := r.strategy(region).SelectStoreToAdd(regionStores) if target == 0 { log.Debug("no store to add replica", zap.Uint64("region-id", region.GetID())) - checkerCounter.WithLabelValues("replica_checker", "no-target-store").Inc() + replicaCheckerNoTargetStoreCounter.Inc() if filterByTempState { r.regionWaitingList.Put(region.GetID(), nil) } @@ -188,13 +204,13 @@ func (r *ReplicaChecker) checkRemoveExtraReplica(region *core.RegionInfo) *opera regionStores := r.cluster.GetRegionStores(region) old := r.strategy(region).SelectStoreToRemove(regionStores) if old == 0 { - checkerCounter.WithLabelValues("replica_checker", "no-worst-peer").Inc() + replicaCheckerNoWorstPeerCounter.Inc() r.regionWaitingList.Put(region.GetID(), nil) return nil } op, err := operator.CreateRemovePeerOperator("remove-extra-replica", r.cluster, operator.OpReplica, region, old) if err != nil { - checkerCounter.WithLabelValues("replica_checker", "create-operator-fail").Inc() + replicaCheckerCreateOpFailedCounter.Inc() return nil } return op @@ -209,20 +225,20 @@ func (r *ReplicaChecker) checkLocationReplacement(region *core.RegionInfo) *oper regionStores := r.cluster.GetRegionStores(region) oldStore := strategy.SelectStoreToRemove(regionStores) if oldStore == 0 { - checkerCounter.WithLabelValues("replica_checker", "all-right").Inc() + replicaCheckerAllRightCounter.Inc() return nil } newStore, _ := strategy.SelectStoreToImprove(regionStores, oldStore) if newStore == 0 { log.Debug("no better peer", zap.Uint64("region-id", region.GetID())) - checkerCounter.WithLabelValues("replica_checker", "not-better").Inc() + replicaCheckerNotBetterCounter.Inc() return nil } newPeer := &metapb.Peer{StoreId: newStore} op, err := operator.CreateMovePeerOperator("move-to-better-location", r.cluster, region, operator.OpReplica, oldStore, newPeer) if err != nil { - 
checkerCounter.WithLabelValues("replica_checker", "create-operator-fail").Inc() + replicaCheckerCreateOpFailedCounter.Inc() return nil } return op @@ -234,8 +250,11 @@ func (r *ReplicaChecker) fixPeer(region *core.RegionInfo, storeID uint64, status removeExtra := fmt.Sprintf("remove-extra-%s-replica", status) op, err := operator.CreateRemovePeerOperator(removeExtra, r.cluster, operator.OpReplica, region, storeID) if err != nil { - reason := fmt.Sprintf("%s-fail", removeExtra) - checkerCounter.WithLabelValues("replica_checker", reason).Inc() + if status == offlineStatus { + replicaCheckerRemoveExtraOfflineFailedCounter.Inc() + } else if status == downStatus { + replicaCheckerRemoveExtraDownFailedCounter.Inc() + } return nil } return op @@ -244,8 +263,11 @@ func (r *ReplicaChecker) fixPeer(region *core.RegionInfo, storeID uint64, status regionStores := r.cluster.GetRegionStores(region) target, filterByTempState := r.strategy(region).SelectStoreToFix(regionStores, storeID) if target == 0 { - reason := fmt.Sprintf("no-store-%s", status) - checkerCounter.WithLabelValues("replica_checker", reason).Inc() + if status == offlineStatus { + replicaCheckerNoStoreOfflineCounter.Inc() + } else if status == downStatus { + replicaCheckerNoStoreDownCounter.Inc() + } log.Debug("no best store to add replica", zap.Uint64("region-id", region.GetID())) if filterByTempState { r.regionWaitingList.Put(region.GetID(), nil) @@ -256,8 +278,11 @@ func (r *ReplicaChecker) fixPeer(region *core.RegionInfo, storeID uint64, status replace := fmt.Sprintf("replace-%s-replica", status) op, err := operator.CreateMovePeerOperator(replace, r.cluster, region, operator.OpReplica, storeID, newPeer) if err != nil { - reason := fmt.Sprintf("%s-fail", replace) - checkerCounter.WithLabelValues("replica_checker", reason).Inc() + if status == offlineStatus { + replicaCheckerReplaceOfflineFailedCounter.Inc() + } else if status == downStatus { + replicaCheckerReplaceDownFailedCounter.Inc() + } return nil } return op diff --git a/server/schedule/checker/rule_checker.go b/server/schedule/checker/rule_checker.go index e3b02b96905..2d56e06df89 100644 --- a/server/schedule/checker/rule_checker.go +++ b/server/schedule/checker/rule_checker.go @@ -35,6 +35,12 @@ import ( "go.uber.org/zap" ) +const ( + maxPendingListLen = 100000 + ruleChecker = "rule_checker" + ruleCheckerName = "rule-checker" +) + var ( errNoStoreToAdd = errors.New("no store to add peer") errNoStoreToReplace = errors.New("no store to replace peer") @@ -42,10 +48,36 @@ var ( errPeerCannotBeWitness = errors.New("peer cannot be witness") errNoNewLeader = errors.New("no new leader") errRegionNoLeader = errors.New("region no leader") + // WithLabelValues is a heavy operation, define variable to avoid call it every time. 
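+	// One pre-built counter per rule-checker event, from the initial check through orphan-peer removal.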
+ ruleCheckerCounter = checkerCounter.WithLabelValues(ruleChecker, "check") + ruleCheckerPausedCounter = checkerCounter.WithLabelValues(ruleChecker, "paused") + ruleCheckerRegionNoLeaderCounter = checkerCounter.WithLabelValues(ruleChecker, "region-no-leader") + ruleCheckerGetCacheCounter = checkerCounter.WithLabelValues(ruleChecker, "get-cache") + ruleCheckerNeedSplitCounter = checkerCounter.WithLabelValues(ruleChecker, "need-split") + ruleCheckerSetCacheCounter = checkerCounter.WithLabelValues(ruleChecker, "set-cache") + ruleCheckerReplaceDownCounter = checkerCounter.WithLabelValues(ruleChecker, "replace-down") + ruleCheckerPromoteWitnessCounter = checkerCounter.WithLabelValues(ruleChecker, "promote-witness") + ruleCheckerReplaceOfflineCounter = checkerCounter.WithLabelValues(ruleChecker, "replace-offline") + ruleCheckerAddRulePeerCounter = checkerCounter.WithLabelValues(ruleChecker, "add-rule-peer") + ruleCheckerNoStoreAddCounter = checkerCounter.WithLabelValues(ruleChecker, "no-store-add") + ruleCheckerNoStoreReplaceCounter = checkerCounter.WithLabelValues(ruleChecker, "no-store-replace") + ruleCheckerFixPeerRoleCounter = checkerCounter.WithLabelValues(ruleChecker, "fix-peer-role") + ruleCheckerFixLeaderRoleCounter = checkerCounter.WithLabelValues(ruleChecker, "fix-leader-role") + ruleCheckerNotAllowLeaderCounter = checkerCounter.WithLabelValues(ruleChecker, "not-allow-leader") + ruleCheckerFixFollowerRoleCounter = checkerCounter.WithLabelValues(ruleChecker, "fix-follower-role") + ruleCheckerNoNewLeaderCounter = checkerCounter.WithLabelValues(ruleChecker, "no-new-leader") + ruleCheckerDemoteVoterRoleCounter = checkerCounter.WithLabelValues(ruleChecker, "demote-voter-role") + ruleCheckerRecentlyPromoteToNonWitnessCounter = checkerCounter.WithLabelValues(ruleChecker, "recently-promote-to-non-witness") + ruleCheckerCancelSwitchToWitnessCounter = checkerCounter.WithLabelValues(ruleChecker, "cancel-switch-to-witness") + ruleCheckerSetVoterWitnessCounter = checkerCounter.WithLabelValues(ruleChecker, "set-voter-witness") + ruleCheckerSetLearnerWitnessCounter = checkerCounter.WithLabelValues(ruleChecker, "set-learner-witness") + ruleCheckerSetVoterNonWitnessCounter = checkerCounter.WithLabelValues(ruleChecker, "set-voter-non-witness") + ruleCheckerSetLearnerNonWitnessCounter = checkerCounter.WithLabelValues(ruleChecker, "set-learner-non-witness") + ruleCheckerMoveToBetterLocationCounter = checkerCounter.WithLabelValues(ruleChecker, "move-to-better-location") + ruleCheckerSkipRemoveOrphanPeerCounter = checkerCounter.WithLabelValues(ruleChecker, "skip-remove-orphan-peer") + ruleCheckerRemoveOrphanPeerCounter = checkerCounter.WithLabelValues(ruleChecker, "remove-orphan-peer") ) -const maxPendingListLen = 100000 - // RuleChecker fix/improve region by placement rules. 
type RuleChecker struct { PauseController @@ -63,7 +95,7 @@ func NewRuleChecker(ctx context.Context, cluster schedule.Cluster, ruleManager * return &RuleChecker{ cluster: cluster, ruleManager: ruleManager, - name: "rule-checker", + name: ruleCheckerName, regionWaitingList: regionWaitingList, pendingList: cache.NewDefaultCache(maxPendingListLen), switchWitnessCache: cache.NewIDTTL(ctx, time.Minute, cluster.GetOpts().GetSwitchWitnessInterval()), @@ -73,7 +105,7 @@ func NewRuleChecker(ctx context.Context, cluster schedule.Cluster, ruleManager * // GetType returns RuleChecker's Type func (c *RuleChecker) GetType() string { - return "rule-checker" + return ruleCheckerName } // Check checks if the region matches placement rules and returns Operator to @@ -87,12 +119,12 @@ func (c *RuleChecker) Check(region *core.RegionInfo) *operator.Operator { func (c *RuleChecker) CheckWithFit(region *core.RegionInfo, fit *placement.RegionFit) (op *operator.Operator) { // checker is paused if c.IsPaused() { - checkerCounter.WithLabelValues("rule_checker", "paused").Inc() + ruleCheckerPausedCounter.Inc() return nil } // skip no leader region if region.GetLeader() == nil { - checkerCounter.WithLabelValues("rule_checker", "region-no-leader").Inc() + ruleCheckerRegionNoLeaderCounter.Inc() log.Debug("fail to check region", zap.Uint64("region-id", region.GetID()), zap.Error(errRegionNoLeader)) return } @@ -101,7 +133,7 @@ func (c *RuleChecker) CheckWithFit(region *core.RegionInfo, fit *placement.Regio failpoint.Inject("assertShouldNotCache", func() { panic("cached shouldn't be used") }) - checkerCounter.WithLabelValues("rule_checker", "get-cache").Inc() + ruleCheckerGetCacheCounter.Inc() return nil } failpoint.Inject("assertShouldCache", func() { @@ -112,11 +144,11 @@ func (c *RuleChecker) CheckWithFit(region *core.RegionInfo, fit *placement.Regio // invalid the cache if it exists c.ruleManager.InvalidCache(region.GetID()) - checkerCounter.WithLabelValues("rule_checker", "check").Inc() + ruleCheckerCounter.Inc() c.record.refresh(c.cluster) if len(fit.RuleFits) == 0 { - checkerCounter.WithLabelValues("rule_checker", "need-split").Inc() + ruleCheckerNeedSplitCounter.Inc() // If the region matches no rules, the most possible reason is it spans across // multiple rules. return nil @@ -143,7 +175,7 @@ func (c *RuleChecker) CheckWithFit(region *core.RegionInfo, fit *placement.Regio if placement.ValidateFit(fit) && placement.ValidateRegion(region) && placement.ValidateStores(fit.GetRegionStores()) { // If there is no need to fix, we will cache the fit c.ruleManager.SetRegionFitCache(region, fit) - checkerCounter.WithLabelValues("rule_checker", "set-cache").Inc() + ruleCheckerSetCacheCounter.Inc() } } return nil @@ -169,19 +201,19 @@ func (c *RuleChecker) fixRulePeer(region *core.RegionInfo, fit *placement.Region for _, peer := range rf.Peers { if c.isDownPeer(region, peer) { if c.isStoreDownTimeHitMaxDownTime(peer.GetStoreId()) { - checkerCounter.WithLabelValues("rule_checker", "replace-down").Inc() + ruleCheckerReplaceDownCounter.Inc() return c.replaceUnexpectRulePeer(region, rf, fit, peer, downStatus) } // When witness placement rule is enabled, promotes the witness to voter when region has down voter. 
if c.isWitnessEnabled() && core.IsVoter(peer) { if witness, ok := c.hasAvailableWitness(region, peer); ok { - checkerCounter.WithLabelValues("rule_checker", "promote-witness").Inc() + ruleCheckerPromoteWitnessCounter.Inc() return operator.CreateNonWitnessPeerOperator("promote-witness", c.cluster, region, witness) } } } if c.isOfflinePeer(peer) { - checkerCounter.WithLabelValues("rule_checker", "replace-offline").Inc() + ruleCheckerReplaceOfflineCounter.Inc() return c.replaceUnexpectRulePeer(region, rf, fit, peer, offlineStatus) } } @@ -199,11 +231,11 @@ func (c *RuleChecker) fixRulePeer(region *core.RegionInfo, fit *placement.Region } func (c *RuleChecker) addRulePeer(region *core.RegionInfo, rf *placement.RuleFit) (*operator.Operator, error) { - checkerCounter.WithLabelValues("rule_checker", "add-rule-peer").Inc() + ruleCheckerAddRulePeerCounter.Inc() ruleStores := c.getRuleFitStores(rf) store, filterByTempState := c.strategy(region, rf.Rule).SelectStoreToAdd(ruleStores) if store == 0 { - checkerCounter.WithLabelValues("rule_checker", "no-store-add").Inc() + ruleCheckerNoStoreAddCounter.Inc() c.handleFilterState(region, filterByTempState) return nil, errNoStoreToAdd } @@ -225,7 +257,7 @@ func (c *RuleChecker) replaceUnexpectRulePeer(region *core.RegionInfo, rf *place ruleStores := c.getRuleFitStores(rf) store, filterByTempState := c.strategy(region, rf.Rule).SelectStoreToFix(ruleStores, peer.GetStoreId()) if store == 0 { - checkerCounter.WithLabelValues("rule_checker", "no-store-replace").Inc() + ruleCheckerNoStoreReplaceCounter.Inc() c.handleFilterState(region, filterByTempState) return nil, errNoStoreToReplace } @@ -284,29 +316,29 @@ func (c *RuleChecker) replaceUnexpectRulePeer(region *core.RegionInfo, rf *place func (c *RuleChecker) fixLooseMatchPeer(region *core.RegionInfo, fit *placement.RegionFit, rf *placement.RuleFit, peer *metapb.Peer) (*operator.Operator, error) { if core.IsLearner(peer) && rf.Rule.Role != placement.Learner { - checkerCounter.WithLabelValues("rule_checker", "fix-peer-role").Inc() + ruleCheckerFixPeerRoleCounter.Inc() return operator.CreatePromoteLearnerOperator("fix-peer-role", c.cluster, region, peer) } if region.GetLeader().GetId() != peer.GetId() && rf.Rule.Role == placement.Leader { - checkerCounter.WithLabelValues("rule_checker", "fix-leader-role").Inc() + ruleCheckerFixLeaderRoleCounter.Inc() if c.allowLeader(fit, peer) { return operator.CreateTransferLeaderOperator("fix-leader-role", c.cluster, region, region.GetLeader().GetStoreId(), peer.GetStoreId(), []uint64{}, 0) } - checkerCounter.WithLabelValues("rule_checker", "not-allow-leader") + ruleCheckerNotAllowLeaderCounter.Inc() return nil, errPeerCannotBeLeader } if region.GetLeader().GetId() == peer.GetId() && rf.Rule.Role == placement.Follower { - checkerCounter.WithLabelValues("rule_checker", "fix-follower-role").Inc() + ruleCheckerFixFollowerRoleCounter.Inc() for _, p := range region.GetPeers() { if c.allowLeader(fit, p) { return operator.CreateTransferLeaderOperator("fix-follower-role", c.cluster, region, peer.GetStoreId(), p.GetStoreId(), []uint64{}, 0) } } - checkerCounter.WithLabelValues("rule_checker", "no-new-leader").Inc() + ruleCheckerNoNewLeaderCounter.Inc() return nil, errNoNewLeader } if core.IsVoter(peer) && rf.Rule.Role == placement.Learner { - checkerCounter.WithLabelValues("rule_checker", "demote-voter-role").Inc() + ruleCheckerDemoteVoterRoleCounter.Inc() return operator.CreateDemoteVoterOperator("fix-demote-voter", c.cluster, region, peer) } if region.GetLeader().GetId() == 
peer.GetId() && rf.Rule.IsWitness { @@ -315,25 +347,25 @@ func (c *RuleChecker) fixLooseMatchPeer(region *core.RegionInfo, fit *placement. if !core.IsWitness(peer) && rf.Rule.IsWitness && c.isWitnessEnabled() { c.switchWitnessCache.UpdateTTL(c.cluster.GetOpts().GetSwitchWitnessInterval()) if c.switchWitnessCache.Exists(region.GetID()) { - checkerCounter.WithLabelValues("rule_checker", "recently-promote-to-non-witness").Inc() + ruleCheckerRecentlyPromoteToNonWitnessCounter.Inc() return nil, nil } if len(region.GetPendingPeers()) > 0 { - checkerCounter.WithLabelValues("rule_checker", "cancel-switch-to-witness").Inc() + ruleCheckerCancelSwitchToWitnessCounter.Inc() return nil, nil } - lv := "set-voter-witness" if core.IsLearner(peer) { - lv = "set-learner-witness" + ruleCheckerSetLearnerWitnessCounter.Inc() + } else { + ruleCheckerSetVoterWitnessCounter.Inc() } - checkerCounter.WithLabelValues("rule_checker", lv).Inc() return operator.CreateWitnessPeerOperator("fix-witness-peer", c.cluster, region, peer) } else if core.IsWitness(peer) && (!rf.Rule.IsWitness || !c.isWitnessEnabled()) { - lv := "set-voter-non-witness" if core.IsLearner(peer) { - lv = "set-learner-non-witness" + ruleCheckerSetLearnerNonWitnessCounter.Inc() + } else { + ruleCheckerSetVoterNonWitnessCounter.Inc() } - checkerCounter.WithLabelValues("rule_checker", lv).Inc() return operator.CreateNonWitnessPeerOperator("fix-non-witness-peer", c.cluster, region, peer) } return nil, nil @@ -377,7 +409,7 @@ func (c *RuleChecker) fixBetterLocation(region *core.RegionInfo, rf *placement.R c.handleFilterState(region, filterByTempState) return nil, nil } - checkerCounter.WithLabelValues("rule_checker", "move-to-better-location").Inc() + ruleCheckerMoveToBetterLocationCounter.Inc() isWitness := rf.Rule.IsWitness if !c.isWitnessEnabled() { isWitness = false @@ -394,25 +426,25 @@ func (c *RuleChecker) fixOrphanPeers(region *core.RegionInfo, fit *placement.Reg // by RuleFits is not pending or down. for _, rf := range fit.RuleFits { if !rf.IsSatisfied() { - checkerCounter.WithLabelValues("rule_checker", "skip-remove-orphan-peer").Inc() + ruleCheckerSkipRemoveOrphanPeerCounter.Inc() return nil, nil } for _, p := range rf.Peers { for _, pendingPeer := range region.GetPendingPeers() { if pendingPeer.Id == p.Id { - checkerCounter.WithLabelValues("rule_checker", "skip-remove-orphan-peer").Inc() + ruleCheckerSkipRemoveOrphanPeerCounter.Inc() return nil, nil } } for _, downPeer := range region.GetDownPeers() { if downPeer.Peer.Id == p.Id { - checkerCounter.WithLabelValues("rule_checker", "skip-remove-orphan-peer").Inc() + ruleCheckerSkipRemoveOrphanPeerCounter.Inc() return nil, nil } } } } - checkerCounter.WithLabelValues("rule_checker", "remove-orphan-peer").Inc() + ruleCheckerRemoveOrphanPeerCounter.Inc() peer := fit.OrphanPeers[0] return operator.CreateRemovePeerOperator("remove-orphan-peer", c.cluster, 0, region, peer.StoreId) } diff --git a/server/schedule/checker/split_checker.go b/server/schedule/checker/split_checker.go index 1b0db44df47..ef3d2b089f1 100644 --- a/server/schedule/checker/split_checker.go +++ b/server/schedule/checker/split_checker.go @@ -33,6 +33,14 @@ type SplitChecker struct { labeler *labeler.RegionLabeler } +const splitCheckerName = "split_checker" + +var ( + // WithLabelValues is a heavy operation, define variable to avoid call it every time. 
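+	// The split checker only reports check and paused events.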
+ splitCheckerCounter = checkerCounter.WithLabelValues(splitCheckerName, "check") + splitCheckerPausedCounter = checkerCounter.WithLabelValues(splitCheckerName, "paused") +) + // NewSplitChecker creates a new SplitChecker. func NewSplitChecker(cluster schedule.Cluster, ruleManager *placement.RuleManager, labeler *labeler.RegionLabeler) *SplitChecker { return &SplitChecker{ @@ -49,10 +57,10 @@ func (c *SplitChecker) GetType() string { // Check checks whether the region need to split and returns Operator to fix. func (c *SplitChecker) Check(region *core.RegionInfo) *operator.Operator { - checkerCounter.WithLabelValues("split_checker", "check").Inc() + splitCheckerCounter.Inc() if c.IsPaused() { - checkerCounter.WithLabelValues("split_checker", "paused").Inc() + splitCheckerPausedCounter.Inc() return nil } diff --git a/server/schedule/filter/filters.go b/server/schedule/filter/filters.go index 1100defd549..1aaaf5bad8e 100644 --- a/server/schedule/filter/filters.go +++ b/server/schedule/filter/filters.go @@ -40,6 +40,7 @@ func SelectSourceStores(stores []*core.StoreInfo, filters []Filter, opt *config. counter.inc(source, filters[i].Type(), s.GetID(), 0) } else { sourceID := strconv.FormatUint(s.GetID(), 10) + // todo: pre-allocate gauge metrics filterCounter.WithLabelValues(source.String(), filters[i].Scope(), filters[i].Type().String(), sourceID, "").Inc() } if collector != nil { diff --git a/server/schedule/hbstream/metric.go b/server/schedule/hbstream/metric.go index 2b2c42bea00..624b6776b08 100644 --- a/server/schedule/hbstream/metric.go +++ b/server/schedule/hbstream/metric.go @@ -17,6 +17,7 @@ package hbstream import "github.com/prometheus/client_golang/prometheus" var ( + // todo: pre-allocate gauge metrics heartbeatStreamCounter = prometheus.NewCounterVec( prometheus.CounterOpts{ Namespace: "pd", diff --git a/server/schedule/metrics.go b/server/schedule/metrics.go index 89dabf8e74e..9005925568b 100644 --- a/server/schedule/metrics.go +++ b/server/schedule/metrics.go @@ -17,6 +17,7 @@ package schedule import "github.com/prometheus/client_golang/prometheus" var ( + // todo: pre-allocate gauge metrics operatorCounter = prometheus.NewCounterVec( prometheus.CounterOpts{ Namespace: "pd", diff --git a/server/schedule/region_scatterer.go b/server/schedule/region_scatterer.go index 26993f1292f..950394045ff 100644 --- a/server/schedule/region_scatterer.go +++ b/server/schedule/region_scatterer.go @@ -38,8 +38,19 @@ import ( const regionScatterName = "region-scatter" -var gcInterval = time.Minute -var gcTTL = time.Minute * 3 +var ( + gcInterval = time.Minute + gcTTL = time.Minute * 3 + // WithLabelValues is a heavy operation, define variable to avoid call it every time. 
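+	// Each scatter counter carries an (action, reason) label pair; only the skip counters use a reason.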
+ scatterSkipEmptyRegionCounter = scatterCounter.WithLabelValues("skip", "empty-region") + scatterSkipNoRegionCounter = scatterCounter.WithLabelValues("skip", "no-region") + scatterSkipNoLeaderCounter = scatterCounter.WithLabelValues("skip", "no-leader") + scatterSkipHotRegionCounter = scatterCounter.WithLabelValues("skip", "hot") + scatterSkipNotReplicatedCounter = scatterCounter.WithLabelValues("skip", "not-replicated") + scatterUnnecessaryCounter = scatterCounter.WithLabelValues("unnecessary", "") + scatterFailCounter = scatterCounter.WithLabelValues("fail", "") + scatterSuccessCounter = scatterCounter.WithLabelValues("success", "") +) type selectedStores struct { mu syncutil.RWMutex @@ -166,7 +177,7 @@ const maxRetryLimit = 30 func (r *RegionScatterer) ScatterRegionsByRange(startKey, endKey []byte, group string, retryLimit int) (int, map[uint64]error, error) { regions := r.cluster.ScanRegions(startKey, endKey, -1) if len(regions) < 1 { - scatterCounter.WithLabelValues("skip", "empty-region").Inc() + scatterSkipEmptyRegionCounter.Inc() return 0, nil, errors.New("empty region") } failures := make(map[uint64]error, len(regions)) @@ -185,7 +196,7 @@ func (r *RegionScatterer) ScatterRegionsByRange(startKey, endKey []byte, group s // ScatterRegionsByID directly scatter regions by ScatterRegions func (r *RegionScatterer) ScatterRegionsByID(regionsID []uint64, group string, retryLimit int) (int, map[uint64]error, error) { if len(regionsID) < 1 { - scatterCounter.WithLabelValues("skip", "empty-region").Inc() + scatterSkipEmptyRegionCounter.Inc() return 0, nil, errors.New("empty region") } failures := make(map[uint64]error, len(regionsID)) @@ -193,7 +204,7 @@ func (r *RegionScatterer) ScatterRegionsByID(regionsID []uint64, group string, r for _, id := range regionsID { region := r.cluster.GetRegion(id) if region == nil { - scatterCounter.WithLabelValues("skip", "no-region").Inc() + scatterSkipNoRegionCounter.Inc() log.Warn("failed to find region during scatter", zap.Uint64("region-id", id)) failures[id] = errors.New(fmt.Sprintf("failed to find region %v", id)) continue @@ -220,7 +231,7 @@ func (r *RegionScatterer) ScatterRegionsByID(regionsID []uint64, group string, r // and the value of the failures indicates the failure error. 
func (r *RegionScatterer) scatterRegions(regions map[uint64]*core.RegionInfo, failures map[uint64]error, group string, retryLimit int) (int, error) { if len(regions) < 1 { - scatterCounter.WithLabelValues("skip", "empty-region").Inc() + scatterSkipEmptyRegionCounter.Inc() return 0, errors.New("empty region") } if retryLimit > maxRetryLimit { @@ -269,19 +280,19 @@ func (r *RegionScatterer) scatterRegions(regions map[uint64]*core.RegionInfo, fa func (r *RegionScatterer) Scatter(region *core.RegionInfo, group string) (*operator.Operator, error) { if !filter.IsRegionReplicated(r.cluster, region) { r.cluster.AddSuspectRegions(region.GetID()) - scatterCounter.WithLabelValues("skip", "not-replicated").Inc() + scatterSkipNotReplicatedCounter.Inc() log.Warn("region not replicated during scatter", zap.Uint64("region-id", region.GetID())) return nil, errors.Errorf("region %d is not fully replicated", region.GetID()) } if region.GetLeader() == nil { - scatterCounter.WithLabelValues("skip", "no-leader").Inc() + scatterSkipNoLeaderCounter.Inc() log.Warn("region no leader during scatter", zap.Uint64("region-id", region.GetID())) return nil, errors.Errorf("region %d has no leader", region.GetID()) } if r.cluster.IsRegionHot(region) { - scatterCounter.WithLabelValues("skip", "hot").Inc() + scatterSkipHotRegionCounter.Inc() log.Warn("region too hot during scatter", zap.Uint64("region-id", region.GetID())) return nil, errors.Errorf("region %d is hot", region.GetID()) } @@ -339,7 +350,7 @@ func (r *RegionScatterer) scatterRegion(region *core.RegionInfo, group string) * // one engine, tiflash, which does not support the leader, so don't consider it for now. targetLeader := r.selectAvailableLeaderStore(group, region, targetPeers, r.ordinaryEngine) if targetLeader == 0 { - scatterCounter.WithLabelValues("no-leader", "").Inc() + scatterSkipNoLeaderCounter.Inc() return nil } @@ -355,13 +366,13 @@ func (r *RegionScatterer) scatterRegion(region *core.RegionInfo, group string) * } if isSameDistribution(region, targetPeers, targetLeader) { - scatterCounter.WithLabelValues("unnecessary", "").Inc() + scatterUnnecessaryCounter.Inc() r.Put(targetPeers, targetLeader, group) return nil } op, err := operator.CreateScatterRegionOperator("scatter-region", r.cluster, region, targetPeers, targetLeader) if err != nil { - scatterCounter.WithLabelValues("fail", "").Inc() + scatterFailCounter.Inc() for _, peer := range region.GetPeers() { targetPeers[peer.GetStoreId()] = peer } @@ -370,7 +381,7 @@ func (r *RegionScatterer) scatterRegion(region *core.RegionInfo, group string) * return nil } if op != nil { - scatterCounter.WithLabelValues("success", "").Inc() + scatterSuccessCounter.Inc() r.Put(targetPeers, targetLeader, group) op.SetPriorityLevel(core.High) } diff --git a/server/schedulers/balance_leader.go b/server/schedulers/balance_leader.go index cc2b2090dea..0629318ea30 100644 --- a/server/schedulers/balance_leader.go +++ b/server/schedulers/balance_leader.go @@ -57,6 +57,17 @@ const ( transferOut = "transfer-out" ) +var ( + // WithLabelValues is a heavy operation, define variable to avoid call it every time. 
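+	// One pre-built counter per balance-leader outcome, shared by the schedule and transfer-leader paths below.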
+ balanceLeaderScheduleCounter = schedulerCounter.WithLabelValues(BalanceLeaderName, "schedule") + balanceLeaderNoLeaderRegionCounter = schedulerCounter.WithLabelValues(BalanceLeaderName, "no-leader-region") + balanceLeaderRegionHotCounter = schedulerCounter.WithLabelValues(BalanceLeaderName, "region-hot") + balanceLeaderNoTargetStoreCounter = schedulerCounter.WithLabelValues(BalanceLeaderName, "no-target-store") + balanceLeaderNoFollowerRegionCounter = schedulerCounter.WithLabelValues(BalanceLeaderName, "no-follower-region") + balanceLeaderSkipCounter = schedulerCounter.WithLabelValues(BalanceLeaderName, "skip") + balanceLeaderNewOpCounter = schedulerCounter.WithLabelValues(BalanceLeaderName, "new-operator") +) + func init() { schedule.RegisterSliceDecoderBuilder(BalanceLeaderType, func(args []string) schedule.ConfigDecoder { return func(v interface{}) error { @@ -354,7 +365,7 @@ func (l *balanceLeaderScheduler) Schedule(cluster schedule.Cluster, dryRun bool) collector = plan.NewCollector(basePlan) } batch := l.conf.Batch - schedulerCounter.WithLabelValues(l.GetName(), "schedule").Inc() + balanceLeaderScheduleCounter.Inc() leaderSchedulePolicy := cluster.GetOpts().GetLeaderSchedulePolicy() opInfluence := l.opController.GetOpInfluence(cluster) @@ -456,7 +467,7 @@ func (l *balanceLeaderScheduler) transferLeaderOut(solver *solver, collector *pl collector, filter.NewRegionPendingFilter(), filter.NewRegionDownFilter()) if solver.region == nil { log.Debug("store has no leader", zap.String("scheduler", l.GetName()), zap.Uint64("store-id", solver.SourceStoreID())) - schedulerCounter.WithLabelValues(l.GetName(), "no-leader-region").Inc() + balanceLeaderNoLeaderRegionCounter.Inc() return nil } if solver.IsRegionHot(solver.region) { @@ -464,7 +475,7 @@ func (l *balanceLeaderScheduler) transferLeaderOut(solver *solver, collector *pl if collector != nil { collector.Collect(plan.SetResource(solver.region), plan.SetStatus(plan.NewStatus(plan.StatusRegionHot))) } - schedulerCounter.WithLabelValues(l.GetName(), "region-hot").Inc() + balanceLeaderRegionHotCounter.Inc() return nil } solver.step++ @@ -488,7 +499,7 @@ func (l *balanceLeaderScheduler) transferLeaderOut(solver *solver, collector *pl } } log.Debug("region has no target store", zap.String("scheduler", l.GetName()), zap.Uint64("region-id", solver.region.GetID())) - schedulerCounter.WithLabelValues(l.GetName(), "no-target-store").Inc() + balanceLeaderNoTargetStoreCounter.Inc() return nil } @@ -500,12 +511,12 @@ func (l *balanceLeaderScheduler) transferLeaderIn(solver *solver, collector *pla nil, filter.NewRegionPendingFilter(), filter.NewRegionDownFilter()) if solver.region == nil { log.Debug("store has no follower", zap.String("scheduler", l.GetName()), zap.Uint64("store-id", solver.TargetStoreID())) - schedulerCounter.WithLabelValues(l.GetName(), "no-follower-region").Inc() + balanceLeaderNoFollowerRegionCounter.Inc() return nil } if solver.IsRegionHot(solver.region) { log.Debug("region is hot region, ignore it", zap.String("scheduler", l.GetName()), zap.Uint64("region-id", solver.region.GetID())) - schedulerCounter.WithLabelValues(l.GetName(), "region-hot").Inc() + balanceLeaderRegionHotCounter.Inc() return nil } leaderStoreID := solver.region.GetLeader().GetStoreId() @@ -516,7 +527,7 @@ func (l *balanceLeaderScheduler) transferLeaderIn(solver *solver, collector *pla zap.Uint64("region-id", solver.region.GetID()), zap.Uint64("store-id", leaderStoreID), ) - schedulerCounter.WithLabelValues(l.GetName(), "no-leader").Inc() + 
balanceLeaderNoLeaderRegionCounter.Inc() return nil } finalFilters := l.filters @@ -529,7 +540,7 @@ func (l *balanceLeaderScheduler) transferLeaderIn(solver *solver, collector *pla PickFirst() if target == nil { log.Debug("region has no target store", zap.String("scheduler", l.GetName()), zap.Uint64("region-id", solver.region.GetID())) - schedulerCounter.WithLabelValues(l.GetName(), "no-target-store").Inc() + balanceLeaderNoTargetStoreCounter.Inc() return nil } return l.createOperator(solver, collector) @@ -544,7 +555,7 @@ func (l *balanceLeaderScheduler) createOperator(solver *solver, collector *plan. defer func() { solver.step-- }() solver.sourceScore, solver.targetScore = solver.sourceStoreScore(l.GetName()), solver.targetStoreScore(l.GetName()) if !solver.shouldBalance(l.GetName()) { - schedulerCounter.WithLabelValues(l.GetName(), "skip").Inc() + balanceLeaderSkipCounter.Inc() if collector != nil { collector.Collect(plan.SetStatus(plan.NewStatus(plan.StatusStoreScoreDisallowed))) } @@ -561,10 +572,11 @@ func (l *balanceLeaderScheduler) createOperator(solver *solver, collector *plan. return nil } op.Counters = append(op.Counters, - schedulerCounter.WithLabelValues(l.GetName(), "new-operator"), + balanceLeaderNewOpCounter, ) op.FinishedCounters = append(op.FinishedCounters, balanceDirectionCounter.WithLabelValues(l.GetName(), solver.SourceMetricLabel(), solver.TargetMetricLabel()), + // todo: pre-allocate gauge metrics l.counter.WithLabelValues("move-leader", solver.SourceMetricLabel()+"-out"), l.counter.WithLabelValues("move-leader", solver.TargetMetricLabel()+"-in"), ) diff --git a/server/schedulers/balance_region.go b/server/schedulers/balance_region.go index 4a78fc3e1b7..42943b6c7b1 100644 --- a/server/schedulers/balance_region.go +++ b/server/schedulers/balance_region.go @@ -65,6 +65,18 @@ const ( BalanceRegionType = "balance-region" ) +var ( + // WithLabelValues is a heavy operation, define variable to avoid call it every time. + balanceRegionScheduleCounter = balanceRegionCounter.WithLabelValues(BalanceRegionName, "schedule") + balanceRegionNoRegionCounter = balanceRegionCounter.WithLabelValues(BalanceRegionName, "no-region") + balanceRegionHotCounter = balanceRegionCounter.WithLabelValues(BalanceRegionName, "region-hot") + balanceRegionNoLeaderCounter = balanceRegionCounter.WithLabelValues(BalanceRegionName, "no-leader") + balanceRegionNewOpCounter = balanceRegionCounter.WithLabelValues(BalanceRegionName, "new-operator") + balanceRegionSkipCounter = balanceRegionCounter.WithLabelValues(BalanceRegionName, "skip") + balanceRegionCreateOpFailCounter = balanceRegionCounter.WithLabelValues(BalanceRegionName, "create-operator-fail") + balanceRegionNoReplacementCounter = balanceRegionCounter.WithLabelValues(BalanceRegionName, "no-replacement") +) + type balanceRegionSchedulerConfig struct { Name string `json:"name"` Ranges []core.KeyRange `json:"ranges"` @@ -145,7 +157,7 @@ func (s *balanceRegionScheduler) Schedule(cluster schedule.Cluster, dryRun bool) if dryRun { collector = plan.NewCollector(basePlan) } - schedulerCounter.WithLabelValues(s.GetName(), "schedule").Inc() + balanceRegionScheduleCounter.Inc() stores := cluster.GetStores() opts := cluster.GetOpts() faultTargets := filter.SelectUnavailableTargetStores(stores, s.filters, opts, collector, s.filterCounter) @@ -208,7 +220,7 @@ func (s *balanceRegionScheduler) Schedule(cluster schedule.Cluster, dryRun bool) append(baseRegionFilters, pendingFilter)...) 
} if solver.region == nil { - schedulerCounter.WithLabelValues(s.GetName(), "no-region").Inc() + balanceRegionNoRegionCounter.Inc() continue } log.Debug("select region", zap.String("scheduler", s.GetName()), zap.Uint64("region-id", solver.region.GetID())) @@ -218,7 +230,7 @@ func (s *balanceRegionScheduler) Schedule(cluster schedule.Cluster, dryRun bool) if collector != nil { collector.Collect(plan.SetResource(solver.region), plan.SetStatus(plan.NewStatus(plan.StatusRegionHot))) } - schedulerCounter.WithLabelValues(s.GetName(), "region-hot").Inc() + balanceRegionHotCounter.Inc() continue } // Check region leader @@ -227,7 +239,7 @@ func (s *balanceRegionScheduler) Schedule(cluster schedule.Cluster, dryRun bool) if collector != nil { collector.Collect(plan.SetResource(solver.region), plan.SetStatus(plan.NewStatus(plan.StatusRegionNoLeader))) } - schedulerCounter.WithLabelValues(s.GetName(), "no-leader").Inc() + balanceRegionNoLeaderCounter.Inc() continue } solver.step++ @@ -236,7 +248,7 @@ func (s *balanceRegionScheduler) Schedule(cluster schedule.Cluster, dryRun bool) solver.fit = replicaFilter.(*filter.RegionReplicatedFilter).GetFit() if op := s.transferPeer(solver, collector, sourceStores[sourceIndex+1:], faultTargets); op != nil { s.retryQuota.ResetLimit(solver.source) - op.Counters = append(op.Counters, schedulerCounter.WithLabelValues(s.GetName(), "new-operator")) + op.Counters = append(op.Counters, balanceRegionNewOpCounter) return []*operator.Operator{op}, collector.GetPlans() } solver.step-- @@ -276,7 +288,7 @@ func (s *balanceRegionScheduler) transferPeer(solver *solver, collector *plan.Co log.Debug("", zap.Uint64("region-id", regionID), zap.Uint64("source-store", sourceID), zap.Uint64("target-store", targetID)) if !solver.shouldBalance(s.GetName()) { - schedulerCounter.WithLabelValues(s.GetName(), "skip").Inc() + balanceRegionSkipCounter.Inc() if collector != nil { collector.Collect(plan.SetStatus(plan.NewStatus(plan.StatusStoreScoreDisallowed))) } @@ -288,7 +300,7 @@ func (s *balanceRegionScheduler) transferPeer(solver *solver, collector *plan.Co solver.step++ op, err := operator.CreateMovePeerOperator(BalanceRegionType, solver, solver.region, operator.OpRegion, oldPeer.GetStoreId(), newPeer) if err != nil { - schedulerCounter.WithLabelValues(s.GetName(), "create-operator-fail").Inc() + balanceRegionCreateOpFailCounter.Inc() if collector != nil { collector.Collect(plan.SetStatus(plan.NewStatus(plan.StatusCreateOperatorFailed))) } @@ -302,6 +314,7 @@ func (s *balanceRegionScheduler) transferPeer(solver *solver, collector *plan.Co targetLabel := strconv.FormatUint(targetID, 10) op.FinishedCounters = append(op.FinishedCounters, balanceDirectionCounter.WithLabelValues(s.GetName(), sourceLabel, targetLabel), + // todo: pre-allocate gauge metrics s.counter.WithLabelValues("move-peer", sourceLabel+"-out"), s.counter.WithLabelValues("move-peer", targetLabel+"-in"), ) @@ -310,7 +323,7 @@ func (s *balanceRegionScheduler) transferPeer(solver *solver, collector *plan.Co return op } - schedulerCounter.WithLabelValues(s.GetName(), "no-replacement").Inc() + balanceRegionNoReplacementCounter.Inc() if len(candidates.Stores) != 0 { solver.step-- } diff --git a/server/schedulers/evict_leader.go b/server/schedulers/evict_leader.go index 8249cb8bb1f..1c20e68667a 100644 --- a/server/schedulers/evict_leader.go +++ b/server/schedulers/evict_leader.go @@ -45,6 +45,15 @@ const ( lastStoreDeleteInfo = "The last store has been deleted" ) +var ( + // WithLabelValues is a heavy operation, define variable to 
avoid call it every time. + evictLeaderCounter = schedulerCounter.WithLabelValues(EvictLeaderName, "schedule") + evictLeaderNoLeaderCounter = schedulerCounter.WithLabelValues(EvictLeaderName, "no-leader") + evictLeaderPickUnhealthyCounter = schedulerCounter.WithLabelValues(EvictLeaderName, "pick-unhealthy-region") + evictLeaderNoTargetStoreCounter = schedulerCounter.WithLabelValues(EvictLeaderName, "no-target-store") + evictLeaderNewOperatorCounter = schedulerCounter.WithLabelValues(EvictLeaderName, "new-operator") +) + func init() { schedule.RegisterSliceDecoderBuilder(EvictLeaderType, func(args []string) schedule.ConfigDecoder { return func(v interface{}) error { @@ -257,7 +266,7 @@ func (s *evictLeaderScheduler) IsScheduleAllowed(cluster schedule.Cluster) bool } func (s *evictLeaderScheduler) Schedule(cluster schedule.Cluster, dryRun bool) ([]*operator.Operator, []plan.Plan) { - schedulerCounter.WithLabelValues(s.GetName(), "schedule").Inc() + evictLeaderCounter.Inc() return scheduleEvictLeaderBatch(s.GetName(), s.GetType(), cluster, s.conf, EvictLeaderBatchSize), nil } @@ -314,10 +323,10 @@ func scheduleEvictLeaderOnce(name, typ string, cluster schedule.Cluster, conf ev // try to pick unhealthy region region = filter.SelectOneRegion(cluster.RandLeaderRegions(storeID, ranges), nil) if region == nil { - schedulerCounter.WithLabelValues(name, "no-leader").Inc() + evictLeaderNoLeaderCounter.Inc() continue } - schedulerCounter.WithLabelValues(name, "pick-unhealthy-region").Inc() + evictLeaderPickUnhealthyCounter.Inc() unhealthyPeerStores := make(map[uint64]struct{}) for _, peer := range region.GetDownPeers() { unhealthyPeerStores[peer.GetPeer().GetStoreId()] = struct{}{} @@ -336,7 +345,7 @@ func scheduleEvictLeaderOnce(name, typ string, cluster schedule.Cluster, conf ev targets := candidates.PickAll() // `targets` MUST contains `target`, so only needs to check if `target` is nil here. if target == nil { - schedulerCounter.WithLabelValues(name, "no-target-store").Inc() + evictLeaderNoTargetStoreCounter.Inc() continue } targetIDs := make([]uint64, 0, len(targets)) @@ -349,7 +358,7 @@ func scheduleEvictLeaderOnce(name, typ string, cluster schedule.Cluster, conf ev continue } op.SetPriorityLevel(core.Urgent) - op.Counters = append(op.Counters, schedulerCounter.WithLabelValues(name, "new-operator")) + op.Counters = append(op.Counters, evictLeaderNewOperatorCounter) ops = append(ops, op) } return ops diff --git a/server/schedulers/evict_slow_store.go b/server/schedulers/evict_slow_store.go index e2fc4417441..ab88dd8f93d 100644 --- a/server/schedulers/evict_slow_store.go +++ b/server/schedulers/evict_slow_store.go @@ -36,6 +36,9 @@ const ( slowStoreRecoverThreshold = 1 ) +// WithLabelValues is a heavy operation, define variable to avoid call it every time. 
+var evictSlowStoreCounter = schedulerCounter.WithLabelValues(EvictSlowStoreName, "schedule") + func init() { schedule.RegisterSliceDecoderBuilder(EvictSlowStoreType, func(args []string) schedule.ConfigDecoder { return func(v interface{}) error { @@ -171,7 +174,7 @@ func (s *evictSlowStoreScheduler) IsScheduleAllowed(cluster schedule.Cluster) bo } func (s *evictSlowStoreScheduler) Schedule(cluster schedule.Cluster, dryRun bool) ([]*operator.Operator, []plan.Plan) { - schedulerCounter.WithLabelValues(s.GetName(), "schedule").Inc() + evictSlowStoreCounter.Inc() var ops []*operator.Operator if s.conf.evictStore() != 0 { diff --git a/server/schedulers/grant_hot_region.go b/server/schedulers/grant_hot_region.go index 39fde37eb4c..ea89421ff86 100644 --- a/server/schedulers/grant_hot_region.go +++ b/server/schedulers/grant_hot_region.go @@ -45,6 +45,12 @@ const ( GrantHotRegionType = "grant-hot-region" ) +var ( + // WithLabelValues is a heavy operation, define variable to avoid call it every time. + grantHotRegionCounter = schedulerCounter.WithLabelValues(GrantHotRegionName, "schedule") + grantHotRegionSkipCounter = schedulerCounter.WithLabelValues(GrantHotRegionName, "skip") +) + func init() { schedule.RegisterSliceDecoderBuilder(GrantHotRegionType, func(args []string) schedule.ConfigDecoder { return func(v interface{}) error { @@ -261,7 +267,7 @@ func newGrantHotRegionHandler(config *grantHotRegionSchedulerConfig) http.Handle } func (s *grantHotRegionScheduler) Schedule(cluster schedule.Cluster, dryRun bool) ([]*operator.Operator, []plan.Plan) { - schedulerCounter.WithLabelValues(s.GetName(), "schedule").Inc() + grantHotRegionCounter.Inc() rw := s.randomRWType() s.prepareForBalance(rw, cluster) return s.dispatch(rw, cluster), nil @@ -308,7 +314,7 @@ func (s *grantHotRegionScheduler) randomSchedule(cluster schedule.Cluster, srcSt return []*operator.Operator{op} } } - schedulerCounter.WithLabelValues(s.GetName(), "skip").Inc() + grantHotRegionSkipCounter.Inc() return nil } diff --git a/server/schedulers/grant_leader.go b/server/schedulers/grant_leader.go index 95fc4061c6e..5e8a8ecaf7d 100644 --- a/server/schedulers/grant_leader.go +++ b/server/schedulers/grant_leader.go @@ -40,6 +40,13 @@ const ( GrantLeaderType = "grant-leader" ) +var ( + // WithLabelValues is a heavy operation, define variable to avoid call it every time. 
+ grantLeaderCounter = schedulerCounter.WithLabelValues(GrantLeaderName, "schedule") + grantLeaderNoFollowerCounter = schedulerCounter.WithLabelValues(GrantLeaderName, "no-follower") + grantLeaderNewOperatorCounter = schedulerCounter.WithLabelValues(GrantLeaderName, "new-operator") +) + func init() { schedule.RegisterSliceDecoderBuilder(GrantLeaderType, func(args []string) schedule.ConfigDecoder { return func(v interface{}) error { @@ -233,7 +240,7 @@ func (s *grantLeaderScheduler) IsScheduleAllowed(cluster schedule.Cluster) bool } func (s *grantLeaderScheduler) Schedule(cluster schedule.Cluster, dryRun bool) ([]*operator.Operator, []plan.Plan) { - schedulerCounter.WithLabelValues(s.GetName(), "schedule").Inc() + grantLeaderCounter.Inc() s.conf.mu.RLock() defer s.conf.mu.RUnlock() ops := make([]*operator.Operator, 0, len(s.conf.StoreIDWithRanges)) @@ -242,7 +249,7 @@ func (s *grantLeaderScheduler) Schedule(cluster schedule.Cluster, dryRun bool) ( for id, ranges := range s.conf.StoreIDWithRanges { region := filter.SelectOneRegion(cluster.RandFollowerRegions(id, ranges), nil, pendingFilter, downFilter) if region == nil { - schedulerCounter.WithLabelValues(s.GetName(), "no-follower").Inc() + grantLeaderNoFollowerCounter.Inc() continue } @@ -251,7 +258,7 @@ func (s *grantLeaderScheduler) Schedule(cluster schedule.Cluster, dryRun bool) ( log.Debug("fail to create grant leader operator", errs.ZapError(err)) continue } - op.Counters = append(op.Counters, schedulerCounter.WithLabelValues(s.GetName(), "new-operator")) + op.Counters = append(op.Counters, grantLeaderNewOperatorCounter) op.SetPriorityLevel(core.High) ops = append(ops, op) } diff --git a/server/schedulers/hot_region.go b/server/schedulers/hot_region.go index c4f869acc6e..9e32422ab97 100644 --- a/server/schedulers/hot_region.go +++ b/server/schedulers/hot_region.go @@ -25,6 +25,7 @@ import ( "github.com/pingcap/kvproto/pkg/metapb" "github.com/pingcap/log" + "github.com/prometheus/client_golang/prometheus" "github.com/tikv/pd/pkg/errs" "github.com/tikv/pd/pkg/slice" "github.com/tikv/pd/pkg/storage/endpoint" @@ -38,7 +39,35 @@ import ( "go.uber.org/zap" ) -var statisticsInterval = time.Second +var ( + statisticsInterval = time.Second + // WithLabelValues is a heavy operation, define variable to avoid call it every time. 
+ hotSchedulerCounter = schedulerCounter.WithLabelValues(HotRegionName, "schedule") + hotSchedulerSkipCounter = schedulerCounter.WithLabelValues(HotRegionName, "skip") + hotSchedulerNeedSplitBeforeScheduleCounter = schedulerCounter.WithLabelValues(HotRegionName, "need_split_before_move_peer") + hotSchedulerSearchRevertRegionsCounter = schedulerCounter.WithLabelValues(HotRegionName, "search_revert_regions") + hotSchedulerNotSameEngineCounter = schedulerCounter.WithLabelValues(HotRegionName, "not_same_engine") + hotSchedulerNoRegionCounter = schedulerCounter.WithLabelValues(HotRegionName, "no_region") + hotSchedulerUnhealthyReplicaCounter = schedulerCounter.WithLabelValues(HotRegionName, "unhealthy_replica") + hotSchedulerAbnormalReplicaCounter = schedulerCounter.WithLabelValues(HotRegionName, "abnormal_replica") + hotSchedulerCreateOperatorFailedCounter = schedulerCounter.WithLabelValues(HotRegionName, "create_operator_failed") + hotSchedulerNewOperatorCounter = schedulerCounter.WithLabelValues(HotRegionName, "new_operator") + + hotSchedulerMoveLeaderCounter = schedulerCounter.WithLabelValues(HotRegionName, moveLeader.String()) + hotSchedulerMovePeerCounter = schedulerCounter.WithLabelValues(HotRegionName, movePeer.String()) + hotSchedulerTransferLeaderCounter = schedulerCounter.WithLabelValues(HotRegionName, transferLeader.String()) + + readSkipAllDimUniformStoreCounter = schedulerCounter.WithLabelValues(HotRegionName, "read-skip-all-dim-uniform-store") + writeSkipAllDimUniformStoreCounter = schedulerCounter.WithLabelValues(HotRegionName, "write-skip-all-dim-uniform-store") + readSkipByteDimUniformStoreCounter = schedulerCounter.WithLabelValues(HotRegionName, "read-skip-byte-uniform-store") + writeSkipByteDimUniformStoreCounter = schedulerCounter.WithLabelValues(HotRegionName, "write-skip-byte-uniform-store") + readSkipKeyDimUniformStoreCounter = schedulerCounter.WithLabelValues(HotRegionName, "read-skip-key-uniform-store") + writeSkipKeyDimUniformStoreCounter = schedulerCounter.WithLabelValues(HotRegionName, "write-skip-key-uniform-store") + readSkipQueryDimUniformStoreCounter = schedulerCounter.WithLabelValues(HotRegionName, "read-skip-query-uniform-store") + writeSkipQueryDimUniformStoreCounter = schedulerCounter.WithLabelValues(HotRegionName, "write-skip-query-uniform-store") + + pendingOpFails = schedulerStatus.WithLabelValues(HotRegionName, "pending_op_fails") +) type baseHotScheduler struct { *BaseScheduler @@ -254,7 +283,7 @@ func (h *hotScheduler) IsScheduleAllowed(cluster schedule.Cluster) bool { } func (h *hotScheduler) Schedule(cluster schedule.Cluster, dryRun bool) ([]*operator.Operator, []plan.Plan) { - schedulerCounter.WithLabelValues(h.GetName(), "schedule").Inc() + hotSchedulerCounter.Inc() rw := h.randomRWType() return h.dispatch(rw, cluster), nil } @@ -281,7 +310,7 @@ func (h *hotScheduler) tryAddPendingInfluence(op *operator.Operator, srcStore, d regionID := op.RegionID() _, ok := h.regionPendings[regionID] if ok { - schedulerStatus.WithLabelValues(h.GetName(), "pending_op_fails").Inc() + pendingOpFails.Inc() return false } @@ -300,21 +329,21 @@ func (h *hotScheduler) balanceHotReadRegions(cluster schedule.Cluster) []*operat peerSolver := newBalanceSolver(h, cluster, statistics.Read, movePeer) peerOps := peerSolver.solve() if len(leaderOps) == 0 && len(peerOps) == 0 { - schedulerCounter.WithLabelValues(h.GetName(), "skip").Inc() + hotSchedulerSkipCounter.Inc() return nil } if len(leaderOps) == 0 { if peerSolver.tryAddPendingInfluence() { return peerOps } - 
schedulerCounter.WithLabelValues(h.GetName(), "skip").Inc() + hotSchedulerSkipCounter.Inc() return nil } if len(peerOps) == 0 { if leaderSolver.tryAddPendingInfluence() { return leaderOps } - schedulerCounter.WithLabelValues(h.GetName(), "skip").Inc() + hotSchedulerSkipCounter.Inc() return nil } leaderSolver.cur = leaderSolver.best @@ -333,7 +362,7 @@ func (h *hotScheduler) balanceHotReadRegions(cluster schedule.Cluster) []*operat return leaderOps } } - schedulerCounter.WithLabelValues(h.GetName(), "skip").Inc() + hotSchedulerSkipCounter.Inc() return nil } @@ -356,7 +385,7 @@ func (h *hotScheduler) balanceHotWriteRegions(cluster schedule.Cluster) []*opera return ops } - schedulerCounter.WithLabelValues(h.GetName(), "skip").Inc() + hotSchedulerSkipCounter.Inc() return nil } @@ -604,7 +633,7 @@ func (bs *balanceSolver) solve() []*operator.Operator { bs.cur = &solution{} tryUpdateBestSolution := func() { if label, ok := bs.filterUniformStore(); ok { - schedulerCounter.WithLabelValues(bs.sche.GetName(), fmt.Sprintf("%s-skip-%s-uniform-store", bs.rwTy.String(), label)).Inc() + bs.skipCounter(label).Inc() return } if bs.isAvailable(bs.cur) && bs.betterThan(bs.best) { @@ -635,7 +664,7 @@ func (bs *balanceSolver) solve() []*operator.Operator { if bs.cur.region = bs.getRegion(mainPeerStat, srcStoreID); bs.cur.region == nil { continue } else if bs.opTy == movePeer && bs.cur.region.GetApproximateSize() > bs.GetOpts().GetMaxMovableHotPeerSize() { - schedulerCounter.WithLabelValues(bs.sche.GetName(), "need_split_before_move_peer").Inc() + hotSchedulerNeedSplitBeforeScheduleCounter.Inc() continue } bs.cur.mainPeerStat = mainPeerStat @@ -645,7 +674,7 @@ func (bs *balanceSolver) solve() []*operator.Operator { bs.calcProgressiveRank() tryUpdateBestSolution() if bs.needSearchRevertRegions() { - schedulerCounter.WithLabelValues(bs.sche.GetName(), "search-revert-regions").Inc() + hotSchedulerSearchRevertRegionsCounter.Inc() dstStoreID := dstStore.GetID() for _, revertPeerStat := range bs.filterHotPeers(bs.cur.dstStore) { revertRegion := bs.getRegion(revertPeerStat, dstStoreID) @@ -669,12 +698,37 @@ func (bs *balanceSolver) solve() []*operator.Operator { return bs.ops } +func (bs *balanceSolver) skipCounter(label string) prometheus.Counter { + if bs.rwTy == statistics.Read { + switch label { + case "byte": + return readSkipByteDimUniformStoreCounter + case "key": + return readSkipKeyDimUniformStoreCounter + case "query": + return readSkipQueryDimUniformStoreCounter + default: + return readSkipAllDimUniformStoreCounter + } + } + switch label { + case "byte": + return writeSkipByteDimUniformStoreCounter + case "key": + return writeSkipKeyDimUniformStoreCounter + case "query": + return writeSkipQueryDimUniformStoreCounter + default: + return writeSkipAllDimUniformStoreCounter + } +} + func (bs *balanceSolver) tryAddPendingInfluence() bool { if bs.best == nil || len(bs.ops) == 0 { return false } if bs.best.srcStore.IsTiFlash() != bs.best.dstStore.IsTiFlash() { - schedulerCounter.WithLabelValues(bs.sche.GetName(), "not-same-engine").Inc() + hotSchedulerNotSameEngineCounter.Inc() return false } maxZombieDur := bs.calcMaxZombieDur() @@ -832,18 +886,18 @@ func (bs *balanceSolver) sortHotPeers(ret []*statistics.HotPeerStat) map[*statis // isRegionAvailable checks whether the given region is not available to schedule. 
func (bs *balanceSolver) isRegionAvailable(region *core.RegionInfo) bool { if region == nil { - schedulerCounter.WithLabelValues(bs.sche.GetName(), "no-region").Inc() + hotSchedulerNoRegionCounter.Inc() return false } if !filter.IsRegionHealthyAllowPending(region) { - schedulerCounter.WithLabelValues(bs.sche.GetName(), "unhealthy-replica").Inc() + hotSchedulerUnhealthyReplicaCounter.Inc() return false } if !filter.IsRegionReplicated(bs.Cluster, region) { log.Debug("region has abnormal replica count", zap.String("scheduler", bs.sche.GetName()), zap.Uint64("region-id", region.GetID())) - schedulerCounter.WithLabelValues(bs.sche.GetName(), "abnormal-replica").Inc() + hotSchedulerAbnormalReplicaCounter.Inc() return false } @@ -966,6 +1020,7 @@ func (bs *balanceSolver) pickDstStores(filters []filter.Filter, candidates []*st } if filter.Target(bs.GetOpts(), store, filters) { id := store.GetID() + // todo if bs.checkDstByPriorityAndTolerance(detail.LoadPred.Max(), &detail.LoadPred.Expect, dstToleranceRatio) { ret[id] = detail hotSchedulerResultCounter.WithLabelValues("dst-store-succ", strconv.FormatUint(id, 10)).Inc() @@ -1341,7 +1396,7 @@ func (bs *balanceSolver) buildOperators() (ops []*operator.Operator) { if err != nil { log.Debug("fail to create operator", zap.Stringer("rw-type", bs.rwTy), zap.Stringer("op-type", bs.opTy), errs.ZapError(err)) - schedulerCounter.WithLabelValues(bs.sche.GetName(), "create-operator-fail").Inc() + hotSchedulerCreateOperatorFailedCounter.Inc() return nil } @@ -1418,8 +1473,8 @@ func (bs *balanceSolver) decorateOperator(op *operator.Operator, isRevert bool, hotDirectionCounter.WithLabelValues(typ, bs.rwTy.String(), targetLabel, "in", dim), balanceDirectionCounter.WithLabelValues(bs.sche.GetName(), sourceLabel, targetLabel)) op.Counters = append(op.Counters, - schedulerCounter.WithLabelValues(bs.sche.GetName(), "new-operator"), - schedulerCounter.WithLabelValues(bs.sche.GetName(), typ)) + hotSchedulerNewOperatorCounter, + opCounter(typ)) if isRevert { op.FinishedCounters = append(op.FinishedCounters, hotDirectionCounter.WithLabelValues(typ, bs.rwTy.String(), sourceLabel, "out-for-revert", dim), @@ -1427,6 +1482,17 @@ func (bs *balanceSolver) decorateOperator(op *operator.Operator, isRevert bool, } } +func opCounter(typ string) prometheus.Counter { + switch typ { + case "move-leader": + return hotSchedulerMoveLeaderCounter + case "move-peer": + return hotSchedulerMovePeerCounter + default: //transfer-leader + return hotSchedulerTransferLeaderCounter + } +} + func (bs *balanceSolver) logBestSolution() { best := bs.best if best == nil { @@ -1484,12 +1550,15 @@ type opType int const ( movePeer opType = iota transferLeader + moveLeader ) func (ty opType) String() string { switch ty { case movePeer: return "move-peer" + case moveLeader: + return "move-leader" case transferLeader: return "transfer-leader" default: diff --git a/server/schedulers/label.go b/server/schedulers/label.go index 7a772f1e814..4c6ed0bff98 100644 --- a/server/schedulers/label.go +++ b/server/schedulers/label.go @@ -34,6 +34,15 @@ const ( LabelType = "label" ) +var ( + // WithLabelValues is a heavy operation, define variable to avoid call it every time. 
+ labelCounter = schedulerCounter.WithLabelValues(LabelName, "schedule") + labelNewOperatorCounter = schedulerCounter.WithLabelValues(LabelName, "new-operator") + labelNoTargetCounter = schedulerCounter.WithLabelValues(LabelName, "no-target") + labelSkipCounter = schedulerCounter.WithLabelValues(LabelName, "skip") + labelNoRegionCounter = schedulerCounter.WithLabelValues(LabelName, "no-region") +) + func init() { schedule.RegisterSliceDecoderBuilder(LabelType, func(args []string) schedule.ConfigDecoder { return func(v interface{}) error { @@ -101,7 +110,7 @@ func (s *labelScheduler) IsScheduleAllowed(cluster schedule.Cluster) bool { } func (s *labelScheduler) Schedule(cluster schedule.Cluster, dryRun bool) ([]*operator.Operator, []plan.Plan) { - schedulerCounter.WithLabelValues(s.GetName(), "schedule").Inc() + labelCounter.Inc() stores := cluster.GetStores() rejectLeaderStores := make(map[uint64]struct{}) for _, s := range stores { @@ -110,7 +119,7 @@ func (s *labelScheduler) Schedule(cluster schedule.Cluster, dryRun bool) ([]*ope } } if len(rejectLeaderStores) == 0 { - schedulerCounter.WithLabelValues(s.GetName(), "skip").Inc() + labelSkipCounter.Inc() return nil, nil } log.Debug("label scheduler reject leader store list", zap.Reflect("stores", rejectLeaderStores)) @@ -131,7 +140,7 @@ func (s *labelScheduler) Schedule(cluster schedule.Cluster, dryRun bool) ([]*ope RandomPick() if target == nil { log.Debug("label scheduler no target found for region", zap.Uint64("region-id", region.GetID())) - schedulerCounter.WithLabelValues(s.GetName(), "no-target").Inc() + labelNoTargetCounter.Inc() continue } @@ -140,10 +149,10 @@ func (s *labelScheduler) Schedule(cluster schedule.Cluster, dryRun bool) ([]*ope log.Debug("fail to create transfer label reject leader operator", errs.ZapError(err)) return nil, nil } - op.Counters = append(op.Counters, schedulerCounter.WithLabelValues(s.GetName(), "new-operator")) + op.Counters = append(op.Counters, labelNewOperatorCounter) return []*operator.Operator{op}, nil } } - schedulerCounter.WithLabelValues(s.GetName(), "no-region").Inc() + labelNoRegionCounter.Inc() return nil, nil } diff --git a/server/schedulers/metrics.go b/server/schedulers/metrics.go index 21dd8bbe414..10247c6de03 100644 --- a/server/schedulers/metrics.go +++ b/server/schedulers/metrics.go @@ -64,6 +64,7 @@ var balanceRegionCounter = prometheus.NewCounterVec( Help: "Counter of balance region scheduler.", }, []string{"type", "store"}) +// todo: pre-allocate gauge metrics var hotSchedulerResultCounter = prometheus.NewCounterVec( prometheus.CounterOpts{ Namespace: "pd", @@ -80,6 +81,7 @@ var balanceDirectionCounter = prometheus.NewCounterVec( Help: "Counter of direction of balance related schedulers.", }, []string{"type", "source", "target"}) +// todo: pre-allocate gauge metrics var hotDirectionCounter = prometheus.NewCounterVec( prometheus.CounterOpts{ Namespace: "pd", diff --git a/server/schedulers/random_merge.go b/server/schedulers/random_merge.go index cfa8909f5c6..209cdf84a12 100644 --- a/server/schedulers/random_merge.go +++ b/server/schedulers/random_merge.go @@ -35,6 +35,16 @@ const ( RandomMergeType = "random-merge" ) +var ( + // WithLabelValues is a heavy operation, define variable to avoid call it every time. 
+ randomMergeCounter = schedulerCounter.WithLabelValues(RandomMergeName, "schedule") + randomMergeNewOperatorCounter = schedulerCounter.WithLabelValues(RandomMergeName, "new-operator") + randomMergeNoSourceStoreCounter = schedulerCounter.WithLabelValues(RandomMergeName, "no-source-store") + randomMergeNoRegionCounter = schedulerCounter.WithLabelValues(RandomMergeName, "no-region") + randomMergeNoTargetStoreCounter = schedulerCounter.WithLabelValues(RandomMergeName, "no-target-store") + randomMergeNotAllowedCounter = schedulerCounter.WithLabelValues(RandomMergeName, "not-allowed") +) + func init() { schedule.RegisterSliceDecoderBuilder(RandomMergeType, func(args []string) schedule.ConfigDecoder { return func(v interface{}) error { @@ -101,20 +111,20 @@ func (s *randomMergeScheduler) IsScheduleAllowed(cluster schedule.Cluster) bool } func (s *randomMergeScheduler) Schedule(cluster schedule.Cluster, dryRun bool) ([]*operator.Operator, []plan.Plan) { - schedulerCounter.WithLabelValues(s.GetName(), "schedule").Inc() + randomMergeCounter.Inc() store := filter.NewCandidates(cluster.GetStores()). FilterSource(cluster.GetOpts(), nil, nil, &filter.StoreStateFilter{ActionScope: s.conf.Name, MoveRegion: true}). RandomPick() if store == nil { - schedulerCounter.WithLabelValues(s.GetName(), "no-source-store").Inc() + randomMergeNoSourceStoreCounter.Inc() return nil, nil } pendingFilter := filter.NewRegionPendingFilter() downFilter := filter.NewRegionDownFilter() region := filter.SelectOneRegion(cluster.RandLeaderRegions(store.GetID(), s.conf.Ranges), nil, pendingFilter, downFilter) if region == nil { - schedulerCounter.WithLabelValues(s.GetName(), "no-region").Inc() + randomMergeNoRegionCounter.Inc() return nil, nil } @@ -123,12 +133,12 @@ func (s *randomMergeScheduler) Schedule(cluster schedule.Cluster, dryRun bool) ( target = other } if target == nil { - schedulerCounter.WithLabelValues(s.GetName(), "no-target-store").Inc() + randomMergeNoTargetStoreCounter.Inc() return nil, nil } if !s.allowMerge(cluster, region, target) { - schedulerCounter.WithLabelValues(s.GetName(), "not-allowed").Inc() + randomMergeNotAllowedCounter.Inc() return nil, nil } @@ -139,7 +149,7 @@ func (s *randomMergeScheduler) Schedule(cluster schedule.Cluster, dryRun bool) ( } ops[0].SetPriorityLevel(core.Low) ops[1].SetPriorityLevel(core.Low) - ops[0].Counters = append(ops[0].Counters, schedulerCounter.WithLabelValues(s.GetName(), "new-operator")) + ops[0].Counters = append(ops[0].Counters, randomMergeNewOperatorCounter) return ops, nil } diff --git a/server/schedulers/scatter_range.go b/server/schedulers/scatter_range.go index 8579c2d149e..4ac06fb38f8 100644 --- a/server/schedulers/scatter_range.go +++ b/server/schedulers/scatter_range.go @@ -31,6 +31,23 @@ import ( "github.com/unrolled/render" ) +const ( + // ScatterRangeType is scatter range scheduler type + ScatterRangeType = "scatter-range" + // ScatterRangeName is scatter range scheduler name + ScatterRangeName = "scatter-range" +) + +var ( + // WithLabelValues is a heavy operation, define variable to avoid call it every time. 
+ scatterRangeCounter = schedulerCounter.WithLabelValues(ScatterRangeName, "schedule") + scatterRangeNewOperatorCounter = schedulerCounter.WithLabelValues(ScatterRangeName, "new-operator") + scatterRangeNewLeaderOperatorCounter = schedulerCounter.WithLabelValues(ScatterRangeName, "new-leader-operator") + scatterRangeNewRegionOperatorCounter = schedulerCounter.WithLabelValues(ScatterRangeName, "new-region-operator") + scatterRangeNoNeedBalanceRegionCounter = schedulerCounter.WithLabelValues(ScatterRangeName, "no-need-balance-region") + scatterRangeNoNeedBalanceLeaderCounter = schedulerCounter.WithLabelValues(ScatterRangeName, "no-need-balance-leader") +) + func init() { // args: [start-key, end-key, range-name]. schedule.RegisterSliceDecoderBuilder(ScatterRangeType, func(args []string) schedule.ConfigDecoder { @@ -67,13 +84,6 @@ func init() { }) } -const ( - // ScatterRangeType is scatter range scheduler type - ScatterRangeType = "scatter-range" - // ScatterRangeName is scatter range scheduler name - ScatterRangeName = "scatter-range" -) - type scatterRangeSchedulerConfig struct { mu syncutil.RWMutex storage endpoint.ConfigStorage @@ -215,7 +225,7 @@ func (l *scatterRangeScheduler) allowBalanceRegion(cluster schedule.Cluster) boo } func (l *scatterRangeScheduler) Schedule(cluster schedule.Cluster, dryRun bool) ([]*operator.Operator, []plan.Plan) { - schedulerCounter.WithLabelValues(l.GetName(), "schedule").Inc() + scatterRangeCounter.Inc() // isolate a new cluster according to the key range c := schedule.GenRangeCluster(cluster, l.config.GetStartKey(), l.config.GetEndKey()) c.SetTolerantSizeRatio(2) @@ -225,11 +235,11 @@ func (l *scatterRangeScheduler) Schedule(cluster schedule.Cluster, dryRun bool) ops[0].SetDesc(fmt.Sprintf("scatter-range-leader-%s", l.config.RangeName)) ops[0].AttachKind(operator.OpRange) ops[0].Counters = append(ops[0].Counters, - schedulerCounter.WithLabelValues(l.GetName(), "new-operator"), - schedulerCounter.WithLabelValues(l.GetName(), "new-leader-operator")) + scatterRangeNewOperatorCounter, + scatterRangeNewLeaderOperatorCounter) return ops, nil } - schedulerCounter.WithLabelValues(l.GetName(), "no-need-balance-leader").Inc() + scatterRangeNoNeedBalanceLeaderCounter.Inc() } if l.allowBalanceRegion(cluster) { ops, _ := l.balanceRegion.Schedule(c, false) @@ -237,12 +247,11 @@ func (l *scatterRangeScheduler) Schedule(cluster schedule.Cluster, dryRun bool) ops[0].SetDesc(fmt.Sprintf("scatter-range-region-%s", l.config.RangeName)) ops[0].AttachKind(operator.OpRange) ops[0].Counters = append(ops[0].Counters, - schedulerCounter.WithLabelValues(l.GetName(), "new-operator"), - schedulerCounter.WithLabelValues(l.GetName(), "new-region-operator"), - ) + scatterRangeNewOperatorCounter, + scatterRangeNewRegionOperatorCounter) return ops, nil } - schedulerCounter.WithLabelValues(l.GetName(), "no-need-balance-region").Inc() + scatterRangeNoNeedBalanceRegionCounter.Inc() } return nil, nil diff --git a/server/schedulers/shuffle_hot_region.go b/server/schedulers/shuffle_hot_region.go index 79ec205bf48..fe5720393c2 100644 --- a/server/schedulers/shuffle_hot_region.go +++ b/server/schedulers/shuffle_hot_region.go @@ -37,6 +37,13 @@ const ( ShuffleHotRegionType = "shuffle-hot-region" ) +var ( + // WithLabelValues is a heavy operation, define variable to avoid call it every time. 
+ shuffleHotRegionCounter = schedulerCounter.WithLabelValues(ShuffleHotRegionName, "schedule") + shuffleHotRegionNewOperatorCounter = schedulerCounter.WithLabelValues(ShuffleHotRegionName, "new-operator") + shuffleHotRegionSkipCounter = schedulerCounter.WithLabelValues(ShuffleHotRegionName, "skip") +) + func init() { schedule.RegisterSliceDecoderBuilder(ShuffleHotRegionType, func(args []string) schedule.ConfigDecoder { return func(v interface{}) error { @@ -119,7 +126,7 @@ func (s *shuffleHotRegionScheduler) IsScheduleAllowed(cluster schedule.Cluster) } func (s *shuffleHotRegionScheduler) Schedule(cluster schedule.Cluster, dryRun bool) ([]*operator.Operator, []plan.Plan) { - schedulerCounter.WithLabelValues(s.GetName(), "schedule").Inc() + shuffleHotRegionCounter.Inc() rw := s.randomRWType() s.prepareForBalance(rw, cluster) operators := s.randomSchedule(cluster, s.stLoadInfos[buildResourceType(rw, core.LeaderKind)]) @@ -176,9 +183,9 @@ func (s *shuffleHotRegionScheduler) randomSchedule(cluster schedule.Cluster, loa return nil } op.SetPriorityLevel(core.Low) - op.Counters = append(op.Counters, schedulerCounter.WithLabelValues(s.GetName(), "new-operator")) + op.Counters = append(op.Counters, shuffleHotRegionNewOperatorCounter) return []*operator.Operator{op} } - schedulerCounter.WithLabelValues(s.GetName(), "skip").Inc() + shuffleHotRegionSkipCounter.Inc() return nil } diff --git a/server/schedulers/shuffle_leader.go b/server/schedulers/shuffle_leader.go index 038a6b0b3b2..52a7327d92a 100644 --- a/server/schedulers/shuffle_leader.go +++ b/server/schedulers/shuffle_leader.go @@ -32,6 +32,14 @@ const ( ShuffleLeaderType = "shuffle-leader" ) +var ( + // WithLabelValues is a heavy operation, define variable to avoid call it every time. + shuffleLeaderCounter = schedulerCounter.WithLabelValues(ShuffleLeaderName, "schedule") + shuffleLeaderNewOperatorCounter = schedulerCounter.WithLabelValues(ShuffleLeaderName, "new-operator") + shuffleLeaderNoTargetStoreCounter = schedulerCounter.WithLabelValues(ShuffleLeaderName, "no-target-store") + shuffleLeaderNoFollowerCounter = schedulerCounter.WithLabelValues(ShuffleLeaderName, "no-follower") +) + func init() { schedule.RegisterSliceDecoderBuilder(ShuffleLeaderType, func(args []string) schedule.ConfigDecoder { return func(v interface{}) error { @@ -108,19 +116,19 @@ func (s *shuffleLeaderScheduler) Schedule(cluster schedule.Cluster, dryRun bool) // We shuffle leaders between stores by: // 1. random select a valid store. // 2. transfer a leader to the store. - schedulerCounter.WithLabelValues(s.GetName(), "schedule").Inc() + shuffleLeaderCounter.Inc() targetStore := filter.NewCandidates(cluster.GetStores()). FilterTarget(cluster.GetOpts(), nil, nil, s.filters...). 
RandomPick() if targetStore == nil { - schedulerCounter.WithLabelValues(s.GetName(), "no-target-store").Inc() + shuffleLeaderNoTargetStoreCounter.Inc() return nil, nil } pendingFilter := filter.NewRegionPendingFilter() downFilter := filter.NewRegionDownFilter() region := filter.SelectOneRegion(cluster.RandFollowerRegions(targetStore.GetID(), s.conf.Ranges), nil, pendingFilter, downFilter) if region == nil { - schedulerCounter.WithLabelValues(s.GetName(), "no-follower").Inc() + shuffleLeaderNoFollowerCounter.Inc() return nil, nil } op, err := operator.CreateTransferLeaderOperator(ShuffleLeaderType, cluster, region, region.GetLeader().GetId(), targetStore.GetID(), []uint64{}, operator.OpAdmin) @@ -129,6 +137,6 @@ func (s *shuffleLeaderScheduler) Schedule(cluster schedule.Cluster, dryRun bool) return nil, nil } op.SetPriorityLevel(core.Low) - op.Counters = append(op.Counters, schedulerCounter.WithLabelValues(s.GetName(), "new-operator")) + op.Counters = append(op.Counters, shuffleLeaderNewOperatorCounter) return []*operator.Operator{op}, nil } diff --git a/server/schedulers/shuffle_region.go b/server/schedulers/shuffle_region.go index 919ce366233..30687de772c 100644 --- a/server/schedulers/shuffle_region.go +++ b/server/schedulers/shuffle_region.go @@ -34,6 +34,16 @@ const ( ShuffleRegionType = "shuffle-region" ) +var ( + // WithLabelValues is a heavy operation, define variable to avoid call it every time. + shuffleRegionCounter = schedulerCounter.WithLabelValues(ShuffleRegionName, "schedule") + shuffleRegionNewOperatorCounter = schedulerCounter.WithLabelValues(ShuffleRegionName, "new-operator") + shuffleRegionNoRegionCounter = schedulerCounter.WithLabelValues(ShuffleRegionName, "no-region") + shuffleRegionNoNewPeerCounter = schedulerCounter.WithLabelValues(ShuffleRegionName, "no-new-peer") + shuffleRegionCreateOperatorFailCounter = schedulerCounter.WithLabelValues(ShuffleRegionName, "create-operator-fail") + shuffleRegionNoSourceStoreCounter = schedulerCounter.WithLabelValues(ShuffleRegionName, "no-source-store") +) + func init() { schedule.RegisterSliceDecoderBuilder(ShuffleRegionType, func(args []string) schedule.ConfigDecoder { return func(v interface{}) error { @@ -105,25 +115,25 @@ func (s *shuffleRegionScheduler) IsScheduleAllowed(cluster schedule.Cluster) boo } func (s *shuffleRegionScheduler) Schedule(cluster schedule.Cluster, dryRun bool) ([]*operator.Operator, []plan.Plan) { - schedulerCounter.WithLabelValues(s.GetName(), "schedule").Inc() + shuffleRegionCounter.Inc() region, oldPeer := s.scheduleRemovePeer(cluster) if region == nil { - schedulerCounter.WithLabelValues(s.GetName(), "no-region").Inc() + shuffleRegionNoRegionCounter.Inc() return nil, nil } newPeer := s.scheduleAddPeer(cluster, region, oldPeer) if newPeer == nil { - schedulerCounter.WithLabelValues(s.GetName(), "no-new-peer").Inc() + shuffleRegionNoNewPeerCounter.Inc() return nil, nil } op, err := operator.CreateMovePeerOperator(ShuffleRegionType, cluster, region, operator.OpRegion, oldPeer.GetStoreId(), newPeer) if err != nil { - schedulerCounter.WithLabelValues(s.GetName(), "create-operator-fail").Inc() + shuffleRegionCreateOperatorFailCounter.Inc() return nil, nil } - op.Counters = append(op.Counters, schedulerCounter.WithLabelValues(s.GetName(), "new-operator")) + op.Counters = append(op.Counters, shuffleRegionNewOperatorCounter) op.SetPriorityLevel(core.Low) return []*operator.Operator{op}, nil } @@ -153,10 +163,10 @@ func (s *shuffleRegionScheduler) scheduleRemovePeer(cluster schedule.Cluster) (* if region != 
nil {
			return region, region.GetStorePeer(source.GetID())
		}
-		schedulerCounter.WithLabelValues(s.GetName(), "no-region").Inc()
+		shuffleRegionNoRegionCounter.Inc()
 	}
 
-	schedulerCounter.WithLabelValues(s.GetName(), "no-source-store").Inc()
+	shuffleRegionNoSourceStoreCounter.Inc()
 	return nil, nil
 }
 
diff --git a/server/schedulers/split_bucket.go b/server/schedulers/split_bucket.go
index be69fda23ba..aba93d8aa95 100644
--- a/server/schedulers/split_bucket.go
+++ b/server/schedulers/split_bucket.go
@@ -43,6 +43,20 @@ const (
 	defaultSplitLimit = 10
 )
 
+var (
+	// WithLabelValues is a heavy operation, define variable to avoid call it every time.
+	splitBucketDisableCounter            = schedulerCounter.WithLabelValues(SplitBucketName, "bucket-disable")
+	splitBucketSplitLimitCounter         = schedulerCounter.WithLabelValues(SplitBucketName, "split-limit")
+	splitBucketScheduleCounter           = schedulerCounter.WithLabelValues(SplitBucketName, "schedule")
+	splitBucketNoRegionCounter           = schedulerCounter.WithLabelValues(SplitBucketName, "no-region")
+	splitBucketRegionTooSmallCounter     = schedulerCounter.WithLabelValues(SplitBucketName, "region-too-small")
+	splitBucketOperatorExistCounter      = schedulerCounter.WithLabelValues(SplitBucketName, "operator-exist")
+	splitBucketKeyRangeNotMatchCounter   = schedulerCounter.WithLabelValues(SplitBucketName, "key-range-not-match")
+	splitBucketNoSplitKeysCounter        = schedulerCounter.WithLabelValues(SplitBucketName, "no-split-keys")
+	splitBucketCreateOperatorFailCounter = schedulerCounter.WithLabelValues(SplitBucketName, "create-operator-fail")
+	splitBucketNewOperatorCounter        = schedulerCounter.WithLabelValues(SplitBucketName, "new-operator")
+)
+
 func init() {
 	schedule.RegisterSliceDecoderBuilder(SplitBucketType, func(args []string) schedule.ConfigDecoder {
 		return func(v interface{}) error {
@@ -171,12 +185,12 @@ func (s *splitBucketScheduler) ServeHTTP(w http.ResponseWriter, r *http.Request)
 // IsScheduleAllowed return true if the sum of executing opSplit operator is less .
 func (s *splitBucketScheduler) IsScheduleAllowed(cluster schedule.Cluster) bool {
 	if !cluster.GetStoreConfig().IsEnableRegionBucket() {
-		schedulerCounter.WithLabelValues(s.GetName(), "bucket-disable").Inc()
+		splitBucketDisableCounter.Inc()
 		return false
 	}
 	allowed := s.BaseScheduler.OpController.OperatorCount(operator.OpSplit) < s.conf.SplitLimit
 	if !allowed {
-		schedulerCounter.WithLabelValues(s.GetName(), "split-limit").Inc()
+		splitBucketSplitLimitCounter.Inc()
 		operator.OperatorLimitCounter.WithLabelValues(s.GetType(), operator.OpSplit.String()).Inc()
 	}
 	return allowed
@@ -191,7 +205,7 @@ type splitBucketPlan struct {
 
 // Schedule return operators if some bucket is too hot.
func (s *splitBucketScheduler) Schedule(cluster schedule.Cluster, dryRun bool) ([]*operator.Operator, []plan.Plan) {
-	schedulerCounter.WithLabelValues(s.GetName(), "schedule").Inc()
+	splitBucketScheduleCounter.Inc()
 	conf := s.conf.Clone()
 	plan := &splitBucketPlan{
 		conf: conf,
@@ -208,16 +222,16 @@ func (s *splitBucketScheduler) splitBucket(plan *splitBucketPlan) []*operator.Op
 		region := plan.cluster.GetRegion(regionID)
 		// skip if the region doesn't exist
 		if region == nil {
-			schedulerCounter.WithLabelValues(s.GetName(), "no-region").Inc()
+			splitBucketNoRegionCounter.Inc()
 			continue
 		}
 		// region size is less than split region size
 		if region.GetApproximateSize() <= plan.hotRegionSplitSize {
-			schedulerCounter.WithLabelValues(s.GetName(), "region-too-small").Inc()
+			splitBucketRegionTooSmallCounter.Inc()
 			continue
 		}
 		if op := s.OpController.GetOperator(regionID); op != nil {
-			schedulerCounter.WithLabelValues(s.GetName(), "operator-exist").Inc()
+			splitBucketOperatorExistCounter.Inc()
 			continue
 		}
 		for _, bucket := range buckets {
@@ -225,11 +239,11 @@ func (s *splitBucketScheduler) splitBucket(plan *splitBucketPlan) []*operator.Op
 			// like bucket: [001 100] and region: [001 100] will not pass.
 			// like bucket: [003 100] and region: [002 100] will pass.
 			if bytes.Compare(bucket.StartKey, region.GetStartKey()) < 0 || bytes.Compare(bucket.EndKey, region.GetEndKey()) > 0 {
-				schedulerCounter.WithLabelValues(s.GetName(), "key-range-not-match").Inc()
+				splitBucketKeyRangeNotMatchCounter.Inc()
 				continue
 			}
 			if bytes.Equal(bucket.StartKey, region.GetStartKey()) && bytes.Equal(bucket.EndKey, region.GetEndKey()) {
-				schedulerCounter.WithLabelValues(s.GetName(), "no-split-keys").Inc()
+				splitBucketNoSplitKeysCounter.Inc()
 				continue
 			}
@@ -250,10 +264,10 @@ func (s *splitBucketScheduler) splitBucket(plan *splitBucketPlan) []*operator.Op
 	op, err := operator.CreateSplitRegionOperator(SplitBucketType, plan.cluster.GetRegion(splitBucket.RegionID), operator.OpSplit,
 		pdpb.CheckPolicy_USEKEY, splitKey)
 	if err != nil {
-		schedulerCounter.WithLabelValues(s.GetName(), "create-operator-fail").Inc()
+		splitBucketCreateOperatorFailCounter.Inc()
 		return nil
 	}
-	schedulerCounter.WithLabelValues(s.GetName(), "new-operator").Inc()
+	splitBucketNewOperatorCounter.Inc()
 	op.AdditionalInfos["region-start-key"] = core.HexRegionKeyStr(region.GetStartKey())
 	op.AdditionalInfos["region-end-key"] = core.HexRegionKeyStr(region.GetEndKey())
 	op.AdditionalInfos["hot-degree"] = strconv.FormatInt(int64(splitBucket.HotDegree), 10)
diff --git a/server/schedulers/transfer_witness_leader.go b/server/schedulers/transfer_witness_leader.go
index 5d2980fc0ce..52da7735254 100644
--- a/server/schedulers/transfer_witness_leader.go
+++ b/server/schedulers/transfer_witness_leader.go
@@ -39,6 +39,13 @@ const (
 	transferWitnessLeaderRecvMaxRegionSize = 1000
 )
 
+var (
+	// WithLabelValues is a heavy operation, define variable to avoid call it every time.
+ transferWitnessLeaderCounter = schedulerCounter.WithLabelValues(TransferWitnessLeaderName, "schedule") + transferWitnessLeaderNewOperatorCounter = schedulerCounter.WithLabelValues(TransferWitnessLeaderName, "new-operator") + transferWitnessLeaderNoTargetStoreCounter = schedulerCounter.WithLabelValues(TransferWitnessLeaderName, "no-target-store") +) + func init() { schedule.RegisterSliceDecoderBuilder(TransferWitnessLeaderType, func(args []string) schedule.ConfigDecoder { return func(v interface{}) error { @@ -82,7 +89,7 @@ func (s *trasferWitnessLeaderScheduler) IsScheduleAllowed(cluster schedule.Clust } func (s *trasferWitnessLeaderScheduler) Schedule(cluster schedule.Cluster, dryRun bool) ([]*operator.Operator, []plan.Plan) { - schedulerCounter.WithLabelValues(s.GetName(), "schedule").Inc() + transferWitnessLeaderCounter.Inc() return s.scheduleTransferWitnessLeaderBatch(s.GetName(), s.GetType(), cluster, transferWitnessLeaderBatchSize), nil } @@ -98,7 +105,7 @@ func (s *trasferWitnessLeaderScheduler) scheduleTransferWitnessLeaderBatch(name, } if op != nil { op.SetPriorityLevel(core.Urgent) - op.Counters = append(op.Counters, schedulerCounter.WithLabelValues(name, "new-operator")) + op.Counters = append(op.Counters, transferWitnessLeaderNewOperatorCounter) ops = append(ops, op) } default: @@ -124,7 +131,7 @@ func (s *trasferWitnessLeaderScheduler) scheduleTransferWitnessLeader(name, typ targets := candidates.PickAll() // `targets` MUST contains `target`, so only needs to check if `target` is nil here. if target == nil { - schedulerCounter.WithLabelValues(name, "no-target-store").Inc() + transferWitnessLeaderNoTargetStoreCounter.Inc() return nil, errors.New("no target store to schedule") } targetIDs := make([]uint64, 0, len(targets)) diff --git a/server/schedulers/utils.go b/server/schedulers/utils.go index 277ca904fdb..65ae36285e5 100644 --- a/server/schedulers/utils.go +++ b/server/schedulers/utils.go @@ -39,6 +39,8 @@ const ( defaultRetryQuotaAttenuation = 2 ) +// WithLabelValues is a heavy operation, define variable to avoid call it every time. + type solver struct { *balanceSchedulerPlan schedule.Cluster diff --git a/server/server.go b/server/server.go index 89049b8e25b..bd1803a878d 100644 --- a/server/server.go +++ b/server/server.go @@ -92,6 +92,13 @@ const ( // EtcdStartTimeout the timeout of the startup etcd. var EtcdStartTimeout = time.Minute * 5 +var ( + // WithLabelValues is a heavy operation, define variable to avoid call it every time. + etcdTermGauge = etcdStateGauge.WithLabelValues("term") + etcdAppliedIndexGauge = etcdStateGauge.WithLabelValues("appliedIndex") + etcdCommittedIndexGauge = etcdStateGauge.WithLabelValues("committedIndex") +) + // Server is the pd server. 
// nolint type Server struct { @@ -568,9 +575,9 @@ func (s *Server) encryptionKeyManagerLoop() { } func (s *Server) collectEtcdStateMetrics() { - etcdStateGauge.WithLabelValues("term").Set(float64(s.member.Etcd().Server.Term())) - etcdStateGauge.WithLabelValues("appliedIndex").Set(float64(s.member.Etcd().Server.AppliedIndex())) - etcdStateGauge.WithLabelValues("committedIndex").Set(float64(s.member.Etcd().Server.CommittedIndex())) + etcdTermGauge.Set(float64(s.member.Etcd().Server.Term())) + etcdAppliedIndexGauge.Set(float64(s.member.Etcd().Server.AppliedIndex())) + etcdCommittedIndexGauge.Set(float64(s.member.Etcd().Server.CommittedIndex())) } func (s *Server) bootstrapCluster(req *pdpb.BootstrapRequest) (*pdpb.BootstrapResponse, error) { diff --git a/server/statistics/region_collection.go b/server/statistics/region_collection.go index ab5e6b22d9c..d7561ea9d89 100644 --- a/server/statistics/region_collection.go +++ b/server/statistics/region_collection.go @@ -42,6 +42,25 @@ const ( const nonIsolation = "none" +var ( + // WithLabelValues is a heavy operation, define variable to avoid call it every time. + regionMissPeerRegionCounter = regionStatusGauge.WithLabelValues("miss-peer-region-count") + regionExtraPeerRegionCounter = regionStatusGauge.WithLabelValues("extra-peer-region-count") + regionDownPeerRegionCounter = regionStatusGauge.WithLabelValues("down-peer-region-count") + regionPendingPeerRegionCounter = regionStatusGauge.WithLabelValues("pending-peer-region-count") + regionLearnerPeerRegionCounter = regionStatusGauge.WithLabelValues("learner-peer-region-count") + regionEmptyRegionCounter = regionStatusGauge.WithLabelValues("empty-region-count") + regionOversizedRegionCounter = regionStatusGauge.WithLabelValues("oversized-region-count") + regionUndersizedRegionCounter = regionStatusGauge.WithLabelValues("undersized-region-count") + + offlineMissPeerRegionCounter = offlineRegionStatusGauge.WithLabelValues("miss-peer-region-count") + offlineExtraPeerRegionCounter = offlineRegionStatusGauge.WithLabelValues("extra-peer-region-count") + offlineDownPeerRegionCounter = offlineRegionStatusGauge.WithLabelValues("down-peer-region-count") + offlinePendingPeerRegionCounter = offlineRegionStatusGauge.WithLabelValues("pending-peer-region-count") + offlineLearnerPeerRegionCounter = offlineRegionStatusGauge.WithLabelValues("learner-peer-region-count") + offlineOfflinePeerRegionCounter = offlineRegionStatusGauge.WithLabelValues("offline-peer-region-count") +) + // RegionInfo is used to record the status of region. 
type RegionInfo struct { *core.RegionInfo @@ -274,21 +293,21 @@ func (r *RegionStatistics) ClearDefunctRegion(regionID uint64) { func (r *RegionStatistics) Collect() { r.RLock() defer r.RUnlock() - regionStatusGauge.WithLabelValues("miss-peer-region-count").Set(float64(len(r.stats[MissPeer]))) - regionStatusGauge.WithLabelValues("extra-peer-region-count").Set(float64(len(r.stats[ExtraPeer]))) - regionStatusGauge.WithLabelValues("down-peer-region-count").Set(float64(len(r.stats[DownPeer]))) - regionStatusGauge.WithLabelValues("pending-peer-region-count").Set(float64(len(r.stats[PendingPeer]))) - regionStatusGauge.WithLabelValues("learner-peer-region-count").Set(float64(len(r.stats[LearnerPeer]))) - regionStatusGauge.WithLabelValues("empty-region-count").Set(float64(len(r.stats[EmptyRegion]))) - regionStatusGauge.WithLabelValues("oversized-region-count").Set(float64(len(r.stats[OversizedRegion]))) - regionStatusGauge.WithLabelValues("undersized-region-count").Set(float64(len(r.stats[UndersizedRegion]))) - - offlineRegionStatusGauge.WithLabelValues("miss-peer-region-count").Set(float64(len(r.offlineStats[MissPeer]))) - offlineRegionStatusGauge.WithLabelValues("extra-peer-region-count").Set(float64(len(r.offlineStats[ExtraPeer]))) - offlineRegionStatusGauge.WithLabelValues("down-peer-region-count").Set(float64(len(r.offlineStats[DownPeer]))) - offlineRegionStatusGauge.WithLabelValues("pending-peer-region-count").Set(float64(len(r.offlineStats[PendingPeer]))) - offlineRegionStatusGauge.WithLabelValues("learner-peer-region-count").Set(float64(len(r.offlineStats[LearnerPeer]))) - offlineRegionStatusGauge.WithLabelValues("offline-peer-region-count").Set(float64(len(r.offlineStats[OfflinePeer]))) + regionMissPeerRegionCounter.Set(float64(len(r.stats[MissPeer]))) + regionExtraPeerRegionCounter.Set(float64(len(r.stats[ExtraPeer]))) + regionDownPeerRegionCounter.Set(float64(len(r.stats[DownPeer]))) + regionPendingPeerRegionCounter.Set(float64(len(r.stats[PendingPeer]))) + regionLearnerPeerRegionCounter.Set(float64(len(r.stats[LearnerPeer]))) + regionEmptyRegionCounter.Set(float64(len(r.stats[EmptyRegion]))) + regionOversizedRegionCounter.Set(float64(len(r.stats[OversizedRegion]))) + regionUndersizedRegionCounter.Set(float64(len(r.stats[UndersizedRegion]))) + + offlineMissPeerRegionCounter.Set(float64(len(r.offlineStats[MissPeer]))) + offlineExtraPeerRegionCounter.Set(float64(len(r.offlineStats[ExtraPeer]))) + offlineDownPeerRegionCounter.Set(float64(len(r.offlineStats[DownPeer]))) + offlinePendingPeerRegionCounter.Set(float64(len(r.offlineStats[PendingPeer]))) + offlineLearnerPeerRegionCounter.Set(float64(len(r.offlineStats[LearnerPeer]))) + offlineOfflinePeerRegionCounter.Set(float64(len(r.offlineStats[OfflinePeer]))) } // Reset resets the metrics of the regions' status. 
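
For reference, the change is the same shape in every file this series touches: resolve each label set against its metric vec once, at package init, and keep the returned child handle so the hot path never goes back through the vec. A minimal, self-contained sketch of that pattern follows, assuming the standard client_golang API; the names (exampleRequestCounter, requestSuccessCounter, handleRequest) are illustrative and not part of the PD codebase.

package metrics

import "github.com/prometheus/client_golang/prometheus"

// An illustrative vec; PD's schedulerCounter, regionStatusGauge, etc.
// have the same shape.
var exampleRequestCounter = prometheus.NewCounterVec(
	prometheus.CounterOpts{
		Namespace: "pd",
		Subsystem: "example",
		Name:      "requests_total",
		Help:      "Counter of example requests.",
	}, []string{"result"})

// WithLabelValues is a heavy operation: each call hashes the label values
// and looks the child up under a lock. Resolve the children once and keep
// the returned prometheus.Counter handles.
var (
	requestSuccessCounter = exampleRequestCounter.WithLabelValues("success")
	requestFailedCounter  = exampleRequestCounter.WithLabelValues("failed")
)

func init() {
	prometheus.MustRegister(exampleRequestCounter)
}

// handleRequest is the hot path: it only touches pre-resolved children.
func handleRequest(err error) {
	if err != nil {
		requestFailedCounter.Inc()
		return
	}
	requestSuccessCounter.Inc()
}

One trade-off worth noting: WithLabelValues creates the child eagerly, so every pre-resolved series is exported (at zero) from startup. That is normally fine for the small, fixed label sets used here, which is also why the per-store and per-region gauges above are only marked "todo: pre-allocate" rather than converted.
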
diff --git a/server/statistics/store_collection.go b/server/statistics/store_collection.go index 04138f61c58..7f18199f860 100644 --- a/server/statistics/store_collection.go +++ b/server/statistics/store_collection.go @@ -111,7 +111,7 @@ func (s *storeStatistics) Observe(store *core.StoreInfo, stats *StoresStats) { s.RegionCount += store.GetRegionCount() s.LeaderCount += store.GetLeaderCount() s.WitnessCount += store.GetWitnessCount() - + // todo: pre-allocate gauge metrics storeStatusGauge.WithLabelValues(storeAddress, id, "region_score").Set(store.RegionScore(s.opt.GetRegionScoreFormulaVersion(), s.opt.GetHighSpaceRatio(), s.opt.GetLowSpaceRatio(), 0)) storeStatusGauge.WithLabelValues(storeAddress, id, "leader_score").Set(store.LeaderScore(s.opt.GetLeaderSchedulePolicy(), 0)) storeStatusGauge.WithLabelValues(storeAddress, id, "region_size").Set(float64(store.GetRegionSize())) diff --git a/server/statistics/store_hot_peers_infos.go b/server/statistics/store_hot_peers_infos.go index 17ed6f47598..38db458b668 100644 --- a/server/statistics/store_hot_peers_infos.go +++ b/server/statistics/store_hot_peers_infos.go @@ -167,6 +167,7 @@ func summaryStoresLoadByEngine( } { // Metric for debug. + // todo: pre-allocate gauge metrics ty := "byte-rate-" + rwTy.String() + "-" + kind.String() hotPeerSummary.WithLabelValues(ty, fmt.Sprintf("%v", id)).Set(peerLoadSum[ByteDim]) ty = "key-rate-" + rwTy.String() + "-" + kind.String() diff --git a/server/tso/metrics.go b/server/tso/metrics.go index 17ff9dbb8e1..fa9c083cd90 100644 --- a/server/tso/metrics.go +++ b/server/tso/metrics.go @@ -22,6 +22,7 @@ const ( ) var ( + // todo: pre-allocate gauge metrics tsoCounter = prometheus.NewCounterVec( prometheus.CounterOpts{ Namespace: "pd", diff --git a/tools/pd-simulator/simulator/task.go b/tools/pd-simulator/simulator/task.go index 379707f4b3b..9662828620d 100644 --- a/tools/pd-simulator/simulator/task.go +++ b/tools/pd-simulator/simulator/task.go @@ -53,6 +53,8 @@ const ( finished ) +// WithLabelValues is a heavy operation, define variable to avoid call it every time. + func responseToTask(engine *RaftEngine, resp *pdpb.RegionHeartbeatResponse) *Task { var ( regionID = resp.GetRegionId() From 73b49a13d363e108c39f24a6a5a9dc4636c2feed Mon Sep 17 00:00:00 2001 From: lhy1024 Date: Tue, 27 Dec 2022 02:28:17 +0800 Subject: [PATCH 2/5] fix lint Signed-off-by: lhy1024 --- server/schedulers/hot_region.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/schedulers/hot_region.go b/server/schedulers/hot_region.go index 9e32422ab97..5915f5d10cd 100644 --- a/server/schedulers/hot_region.go +++ b/server/schedulers/hot_region.go @@ -1488,7 +1488,7 @@ func opCounter(typ string) prometheus.Counter { return hotSchedulerMoveLeaderCounter case "move-peer": return hotSchedulerMovePeerCounter - default: //transfer-leader + default: // transfer-leader return hotSchedulerTransferLeaderCounter } } From 88982d64610c7e2812e3e17d15bc47f6f3ca4196 Mon Sep 17 00:00:00 2001 From: lhy1024 Date: Fri, 6 Jan 2023 11:11:58 +0800 Subject: [PATCH 3/5] fix conflict Signed-off-by: lhy1024 --- server/schedule/checker/rule_checker.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/server/schedule/checker/rule_checker.go b/server/schedule/checker/rule_checker.go index 1d9eea4ad37..0224eb08243 100644 --- a/server/schedule/checker/rule_checker.go +++ b/server/schedule/checker/rule_checker.go @@ -453,7 +453,7 @@ loopFits: } // If hasUnhealthyFit is false, it is safe to delete the OrphanPeer. 
if !hasUnhealthyFit { - ruleCheckerSkipRemoveOrphanPeerCounter.Inc() + ruleCheckerRemoveOrphanPeerCounter.Inc() return operator.CreateRemovePeerOperator("remove-orphan-peer", c.cluster, 0, region, fit.OrphanPeers[0].StoreId) } // If hasUnhealthyFit is true, try to remove unhealthy orphan peers only if number of OrphanPeers is >= 2. @@ -461,7 +461,7 @@ loopFits: if len(fit.OrphanPeers) >= 2 { for _, orphanPeer := range fit.OrphanPeers { if isUnhealthyPeer(orphanPeer.GetId()) { - ruleCheckerSkipRemoveOrphanPeerCounter.Inc() + ruleCheckerRemoveOrphanPeerCounter.Inc() return operator.CreateRemovePeerOperator("remove-orphan-peer", c.cluster, 0, region, orphanPeer.StoreId) } } From aedc8ef94a8c123264b79847f44ff9ef19267993 Mon Sep 17 00:00:00 2001 From: lhy1024 Date: Mon, 9 Jan 2023 12:04:19 +0800 Subject: [PATCH 4/5] address comments Signed-off-by: lhy1024 --- tools/pd-simulator/simulator/task.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/tools/pd-simulator/simulator/task.go b/tools/pd-simulator/simulator/task.go index 9662828620d..379707f4b3b 100644 --- a/tools/pd-simulator/simulator/task.go +++ b/tools/pd-simulator/simulator/task.go @@ -53,8 +53,6 @@ const ( finished ) -// WithLabelValues is a heavy operation, define variable to avoid call it every time. - func responseToTask(engine *RaftEngine, resp *pdpb.RegionHeartbeatResponse) *Task { var ( regionID = resp.GetRegionId() From b826ddc6826bdff7f48c0fe2dbc0d65bc94f6214 Mon Sep 17 00:00:00 2001 From: lhy1024 Date: Tue, 10 Jan 2023 15:38:58 +0800 Subject: [PATCH 5/5] move comments Signed-off-by: lhy1024 --- server/schedule/checker/replica_checker.go | 12 ++++++------ server/schedulers/metrics.go | 2 ++ server/schedulers/utils.go | 2 -- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/server/schedule/checker/replica_checker.go b/server/schedule/checker/replica_checker.go index d2511453ac4..8ae4837fbeb 100644 --- a/server/schedule/checker/replica_checker.go +++ b/server/schedule/checker/replica_checker.go @@ -39,7 +39,7 @@ var ( // WithLabelValues is a heavy operation, define variable to avoid call it every time. 
replicaCheckerCounter = checkerCounter.WithLabelValues(replicaChecker, "check") replicaCheckerPausedCounter = checkerCounter.WithLabelValues(replicaChecker, "paused") - replicaCheckerOpCounter = checkerCounter.WithLabelValues(replicaChecker, "new-operator") + replicaCheckerNewOpCounter = checkerCounter.WithLabelValues(replicaChecker, "new-operator") replicaCheckerNoTargetStoreCounter = checkerCounter.WithLabelValues(replicaChecker, "no-target-store") replicaCheckerNoWorstPeerCounter = checkerCounter.WithLabelValues(replicaChecker, "no-worst-peer") replicaCheckerCreateOpFailedCounter = checkerCounter.WithLabelValues(replicaChecker, "create-operator-failed") @@ -87,26 +87,26 @@ func (r *ReplicaChecker) Check(region *core.RegionInfo) *operator.Operator { return nil } if op := r.checkDownPeer(region); op != nil { - replicaCheckerOpCounter.Inc() + replicaCheckerNewOpCounter.Inc() op.SetPriorityLevel(core.High) return op } if op := r.checkOfflinePeer(region); op != nil { - replicaCheckerOpCounter.Inc() + replicaCheckerNewOpCounter.Inc() op.SetPriorityLevel(core.High) return op } if op := r.checkMakeUpReplica(region); op != nil { - replicaCheckerOpCounter.Inc() + replicaCheckerNewOpCounter.Inc() op.SetPriorityLevel(core.High) return op } if op := r.checkRemoveExtraReplica(region); op != nil { - replicaCheckerOpCounter.Inc() + replicaCheckerNewOpCounter.Inc() return op } if op := r.checkLocationReplacement(region); op != nil { - replicaCheckerOpCounter.Inc() + replicaCheckerNewOpCounter.Inc() return op } return nil diff --git a/server/schedulers/metrics.go b/server/schedulers/metrics.go index e1fbe415452..1b7c5b1c0e9 100644 --- a/server/schedulers/metrics.go +++ b/server/schedulers/metrics.go @@ -32,6 +32,7 @@ var schedulerStatus = prometheus.NewGaugeVec( Help: "Inner status of the scheduler.", }, []string{"type", "name"}) +// todo: pre-allocate gauge metrics var opInfluenceStatus = prometheus.NewGaugeVec( prometheus.GaugeOpts{ Namespace: "pd", @@ -40,6 +41,7 @@ var opInfluenceStatus = prometheus.NewGaugeVec( Help: "Store status for schedule", }, []string{"scheduler", "store", "type"}) +// todo: pre-allocate gauge metrics var tolerantResourceStatus = prometheus.NewGaugeVec( prometheus.GaugeOpts{ Namespace: "pd", diff --git a/server/schedulers/utils.go b/server/schedulers/utils.go index a5bb80d4cae..37e1fc8c251 100644 --- a/server/schedulers/utils.go +++ b/server/schedulers/utils.go @@ -40,8 +40,6 @@ const ( defaultRetryQuotaAttenuation = 2 ) -// WithLabelValues is a heavy operation, define variable to avoid call it every time. - type solver struct { *balanceSchedulerPlan schedule.Cluster
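
To put a number behind the "heavy operation" comment this series repeats: CounterVec.WithLabelValues hashes the label values and takes a lock to find (or create) the child on every call, whereas a pre-resolved prometheus.Counter increments with a single atomic add. A rough benchmark sketch, assuming the standard client_golang and testing packages; the vec and label values are illustrative.

package metrics

import (
	"testing"

	"github.com/prometheus/client_golang/prometheus"
)

var benchVec = prometheus.NewCounterVec(
	prometheus.CounterOpts{Name: "bench_events_total"},
	[]string{"type", "name"})

// Resolves the child on every increment: hashes both label values and
// consults the vec's child map under a lock each time.
func BenchmarkIncWithLabelValues(b *testing.B) {
	for i := 0; i < b.N; i++ {
		benchVec.WithLabelValues("hot-region-scheduler", "schedule").Inc()
	}
}

// Resolves the child once, as this series does at package init; the loop
// body is then a single atomic add.
func BenchmarkIncPreResolved(b *testing.B) {
	counter := benchVec.WithLabelValues("hot-region-scheduler", "schedule")
	for i := 0; i < b.N; i++ {
		counter.Inc()
	}
}

Running these with go test -bench=. should show the pre-resolved path to be several times faster per operation, though the exact ratio depends on the machine and the client_golang version.
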