From 5c648dc365e0025c4b7c4544bc2c593fe5c76c0b Mon Sep 17 00:00:00 2001 From: Ryan Leung Date: Mon, 23 Sep 2019 11:20:47 +0800 Subject: [PATCH] *: improve more metrics (#1761) Signed-off-by: Ryan Leung --- server/checker/replica_checker.go | 10 ++++++++-- server/cluster.go | 3 +-- server/schedulers/balance_leader.go | 6 +++++- server/schedulers/balance_region.go | 2 +- server/schedulers/random_merge.go | 4 ++-- server/schedulers/shuffle_region.go | 2 +- server/statistics/store_collection.go | 10 +++++----- 7 files changed, 23 insertions(+), 14 deletions(-) diff --git a/server/checker/replica_checker.go b/server/checker/replica_checker.go index 8057daa19a0..8be95dbaf74 100644 --- a/server/checker/replica_checker.go +++ b/server/checker/replica_checker.go @@ -257,7 +257,8 @@ func (r *ReplicaChecker) fixPeer(region *core.RegionInfo, peer *metapb.Peer, sta if len(region.GetPeers()) > r.cluster.GetMaxReplicas() { op, err := operator.CreateRemovePeerOperator(removeExtra, r.cluster, operator.OpReplica, region, peer.GetStoreId()) if err != nil { - checkerCounter.WithLabelValues("replica_checker", "create-operator-fail").Inc() + reason := fmt.Sprintf("%s-fail", removeExtra) + checkerCounter.WithLabelValues("replica_checker", reason).Inc() return nil } return op @@ -271,7 +272,8 @@ func (r *ReplicaChecker) fixPeer(region *core.RegionInfo, peer *metapb.Peer, sta if region.GetPendingPeer(peer.GetId()) != nil { op, err := operator.CreateRemovePeerOperator(removePending, r.cluster, operator.OpReplica, region, peer.GetStoreId()) if err != nil { - checkerCounter.WithLabelValues("replica_checker", "create-operator-fail").Inc() + reason := fmt.Sprintf("%s-fail", removePending) + checkerCounter.WithLabelValues("replica_checker", reason).Inc() return nil } return op @@ -279,6 +281,8 @@ func (r *ReplicaChecker) fixPeer(region *core.RegionInfo, peer *metapb.Peer, sta storeID, _ := r.SelectBestReplacementStore(region, peer, filter.NewStorageThresholdFilter(r.name)) if storeID == 0 { + reason := fmt.Sprintf("no-store-%s", status) + checkerCounter.WithLabelValues("replica_checker", reason).Inc() log.Debug("no best store to add replica", zap.Uint64("region-id", region.GetID())) return nil } @@ -290,6 +294,8 @@ func (r *ReplicaChecker) fixPeer(region *core.RegionInfo, peer *metapb.Peer, sta replace := fmt.Sprintf("replace-%s-replica", status) op, err := operator.CreateMovePeerOperator(replace, r.cluster, region, operator.OpReplica, peer.GetStoreId(), newPeer.GetStoreId(), newPeer.GetId()) if err != nil { + reason := fmt.Sprintf("%s-fail", replace) + checkerCounter.WithLabelValues("replica_checker", reason).Inc() return nil } return op diff --git a/server/cluster.go b/server/cluster.go index b2f874f35ed..3bb04b5da16 100644 --- a/server/cluster.go +++ b/server/cluster.go @@ -928,7 +928,6 @@ func (c *RaftCluster) putStoreLocked(store *core.StoreInfo) error { func (c *RaftCluster) checkStores() { var offlineStores []*metapb.Store var upStoreCount int - stores := c.GetStores() for _, store := range stores { // the store has already been tombstone @@ -963,7 +962,7 @@ func (c *RaftCluster) checkStores() { if upStoreCount < c.GetMaxReplicas() { for _, offlineStore := range offlineStores { - log.Warn("store may not turn into Tombstone, there are no extra up node has enough space to accommodate the extra replica", zap.Stringer("store", offlineStore)) + log.Warn("store may not turn into Tombstone, there are no extra up store has enough space to accommodate the extra replica", zap.Stringer("store", offlineStore)) } } } diff --git a/server/schedulers/balance_leader.go b/server/schedulers/balance_leader.go index 06ed5e2d8fd..1aa8b9cae1a 100644 --- a/server/schedulers/balance_leader.go +++ b/server/schedulers/balance_leader.go @@ -112,7 +112,11 @@ func (l *balanceLeaderScheduler) Schedule(cluster schedule.Cluster) []*operator. // No store can be selected as source or target. if source == nil || target == nil { - schedulerCounter.WithLabelValues(l.GetName(), "no-store").Inc() + if source == nil { + schedulerCounter.WithLabelValues(l.GetName(), "no-source-store").Inc() + } else { + schedulerCounter.WithLabelValues(l.GetName(), "no-target-store").Inc() + } // When the cluster is balanced, all stores will be added to the cache once // all of them have been selected. This will cause the scheduler to not adapt // to sudden change of a store's leader. Here we clear the taint cache and diff --git a/server/schedulers/balance_region.go b/server/schedulers/balance_region.go index 5cb84b21fc9..4fcef549880 100644 --- a/server/schedulers/balance_region.go +++ b/server/schedulers/balance_region.go @@ -119,7 +119,7 @@ func (s *balanceRegionScheduler) Schedule(cluster schedule.Cluster) []*operator. f := s.hitsCounter.buildSourceFilter(s.GetName(), cluster) source := s.selector.SelectSource(cluster, stores, f) if source == nil { - schedulerCounter.WithLabelValues(s.GetName(), "no-store").Inc() + schedulerCounter.WithLabelValues(s.GetName(), "no-source-store").Inc() // Unlike the balanceLeaderScheduler, we don't need to clear the taintCache // here. Because normally region score won't change rapidly, and the region // balance requires lower sensitivity compare to leader balance. diff --git a/server/schedulers/random_merge.go b/server/schedulers/random_merge.go index 26d72481c1d..95a95a49045 100644 --- a/server/schedulers/random_merge.go +++ b/server/schedulers/random_merge.go @@ -69,7 +69,7 @@ func (s *randomMergeScheduler) Schedule(cluster schedule.Cluster) []*operator.Op stores := cluster.GetStores() store := s.selector.SelectSource(cluster, stores) if store == nil { - schedulerCounter.WithLabelValues(s.GetName(), "no-store").Inc() + schedulerCounter.WithLabelValues(s.GetName(), "no-source-store").Inc() return nil } region := cluster.RandLeaderRegion(store.GetID(), core.HealthRegion()) @@ -83,7 +83,7 @@ func (s *randomMergeScheduler) Schedule(cluster schedule.Cluster) []*operator.Op target = other } if target == nil { - schedulerCounter.WithLabelValues(s.GetName(), "no-adjacent").Inc() + schedulerCounter.WithLabelValues(s.GetName(), "no-target-store").Inc() return nil } diff --git a/server/schedulers/shuffle_region.go b/server/schedulers/shuffle_region.go index 1b63aab936b..ca57a7edf69 100644 --- a/server/schedulers/shuffle_region.go +++ b/server/schedulers/shuffle_region.go @@ -94,7 +94,7 @@ func (s *shuffleRegionScheduler) scheduleRemovePeer(cluster schedule.Cluster) (* source := s.selector.SelectSource(cluster, stores) if source == nil { - schedulerCounter.WithLabelValues(s.GetName(), "no-store").Inc() + schedulerCounter.WithLabelValues(s.GetName(), "no-source-store").Inc() return nil, nil } diff --git a/server/statistics/store_collection.go b/server/statistics/store_collection.go index 583be72302c..84da880e752 100644 --- a/server/statistics/store_collection.go +++ b/server/statistics/store_collection.go @@ -198,11 +198,11 @@ func (s *storeStatistics) Collect() { disableReplaceOfflineReplica = 1 } - configs["disable_makeup_replica"] = disableMakeUpReplica - configs["disable_learner"] = disableLearner - configs["disable_remove_down_replica"] = disableRemoveDownReplica - configs["disable_remove_extra_replica"] = disableRemoveExtraReplica - configs["disable_replace_offline_replica"] = disableReplaceOfflineReplica + configs["disable-makeup-replica"] = disableMakeUpReplica + configs["disable-learner"] = disableLearner + configs["disable-remove-down-replica"] = disableRemoveDownReplica + configs["disable-remove-extra-replica"] = disableRemoveExtraReplica + configs["disable-replace-offline-replica"] = disableReplaceOfflineReplica for typ, value := range configs { configStatusGauge.WithLabelValues(typ, s.namespace).Set(value)