From ce3e35d490ec6f7c14419e3ae2ec3a2d4b3daecb Mon Sep 17 00:00:00 2001 From: lysu Date: Mon, 3 Jun 2019 20:53:30 +0800 Subject: [PATCH] tikv: refine region-cache log and metric (#10589) (#10667) --- store/tikv/region_cache.go | 36 +++++++++++++++++++++++---------- store/tikv/region_cache_test.go | 8 ++++---- store/tikv/region_request.go | 2 +- 3 files changed, 30 insertions(+), 16 deletions(-) diff --git a/store/tikv/region_cache.go b/store/tikv/region_cache.go index 64005740fbc8a..c27c15c977a19 100644 --- a/store/tikv/region_cache.go +++ b/store/tikv/region_cache.go @@ -38,13 +38,14 @@ const ( ) var ( - tikvRegionCacheCounterWithDropRegionFromCacheOK = metrics.TiKVRegionCacheCounter.WithLabelValues("drop_region_from_cache", "ok") - tikvRegionCacheCounterWithGetRegionByIDOK = metrics.TiKVRegionCacheCounter.WithLabelValues("get_region_by_id", "ok") - tikvRegionCacheCounterWithGetRegionByIDError = metrics.TiKVRegionCacheCounter.WithLabelValues("get_region_by_id", "err") - tikvRegionCacheCounterWithGetRegionOK = metrics.TiKVRegionCacheCounter.WithLabelValues("get_region", "ok") - tikvRegionCacheCounterWithGetRegionError = metrics.TiKVRegionCacheCounter.WithLabelValues("get_region", "err") - tikvRegionCacheCounterWithGetStoreOK = metrics.TiKVRegionCacheCounter.WithLabelValues("get_store", "ok") - tikvRegionCacheCounterWithGetStoreError = metrics.TiKVRegionCacheCounter.WithLabelValues("get_store", "err") + tikvRegionCacheCounterWithInvalidateRegionFromCacheOK = metrics.TiKVRegionCacheCounter.WithLabelValues("invalidate_region_from_cache", "ok") + tikvRegionCacheCounterWithSendFail = metrics.TiKVRegionCacheCounter.WithLabelValues("send_fail", "ok") + tikvRegionCacheCounterWithGetRegionByIDOK = metrics.TiKVRegionCacheCounter.WithLabelValues("get_region_by_id", "ok") + tikvRegionCacheCounterWithGetRegionByIDError = metrics.TiKVRegionCacheCounter.WithLabelValues("get_region_by_id", "err") + tikvRegionCacheCounterWithGetRegionOK = metrics.TiKVRegionCacheCounter.WithLabelValues("get_region", "ok") + tikvRegionCacheCounterWithGetRegionError = metrics.TiKVRegionCacheCounter.WithLabelValues("get_region", "err") + tikvRegionCacheCounterWithGetStoreOK = metrics.TiKVRegionCacheCounter.WithLabelValues("get_store", "ok") + tikvRegionCacheCounterWithGetStoreError = metrics.TiKVRegionCacheCounter.WithLabelValues("get_store", "err") ) const ( @@ -121,6 +122,7 @@ func (r *Region) checkRegionCacheTTL(ts int64) bool { // invalidate invalidates a region, next time it will got null result. func (r *Region) invalidate() { + tikvRegionCacheCounterWithInvalidateRegionFromCacheOK.Inc() atomic.StoreInt64(&r.lastAccess, invalidatedLastAccessTime) } @@ -359,13 +361,18 @@ func (c *RegionCache) findRegionByKey(bo *Backoffer, key []byte, isEndKey bool) } // OnSendFail handles send request fail logic. -func (c *RegionCache) OnSendFail(bo *Backoffer, ctx *RPCContext, scheduleReload bool) { +func (c *RegionCache) OnSendFail(bo *Backoffer, ctx *RPCContext, scheduleReload bool, err error) { + tikvRegionCacheCounterWithSendFail.Inc() r := c.getCachedRegionWithRLock(ctx.Region) if r != nil { c.switchNextPeer(r, ctx.PeerIdx) if scheduleReload { r.scheduleReload() } + logutil.Logger(bo.ctx).Info("switch region peer to next due to send request fail", + zap.Stringer("current", ctx), + zap.Bool("needReload", scheduleReload), + zap.Error(err)) } } @@ -457,7 +464,6 @@ func (c *RegionCache) InvalidateCachedRegion(id RegionVerID) { if cachedRegion == nil { return } - tikvRegionCacheCounterWithDropRegionFromCacheOK.Inc() cachedRegion.invalidate() } @@ -473,14 +479,23 @@ func (c *RegionCache) UpdateLeader(regionID RegionVerID, leaderStoreID uint64, c if leaderStoreID == 0 { c.switchNextPeer(r, currentPeerIdx) + logutil.Logger(context.Background()).Info("switch region peer to next due to NotLeader with NULL leader", + zap.Int("currIdx", currentPeerIdx), + zap.Uint64("regionID", regionID.GetID())) return } if !c.switchToPeer(r, leaderStoreID) { - logutil.Logger(context.Background()).Debug("regionCache: cannot find peer when updating leader", + logutil.Logger(context.Background()).Info("invalidate region cache due to cannot find peer when updating leader", zap.Uint64("regionID", regionID.GetID()), + zap.Int("currIdx", currentPeerIdx), zap.Uint64("leaderStoreID", leaderStoreID)) r.invalidate() + } else { + logutil.Logger(context.Background()).Info("switch region leader to specific leader due to kv return NotLeader", + zap.Uint64("regionID", regionID.GetID()), + zap.Int("currIdx", currentPeerIdx), + zap.Uint64("leaderStoreID", leaderStoreID)) } } @@ -712,7 +727,6 @@ func (c *RegionCache) OnRegionEpochNotMatch(bo *Backoffer, ctx *RPCContext, curr if needInvalidateOld { cachedRegion, ok := c.mu.regions[ctx.Region] if ok { - tikvRegionCacheCounterWithDropRegionFromCacheOK.Inc() cachedRegion.invalidate() } } diff --git a/store/tikv/region_cache_test.go b/store/tikv/region_cache_test.go index dc78d01413eb9..ed3a28e03e1b7 100644 --- a/store/tikv/region_cache_test.go +++ b/store/tikv/region_cache_test.go @@ -239,7 +239,7 @@ func (s *testRegionCacheSuite) TestSendFailedButLeaderNotChange(c *C) { c.Assert(len(ctx.Meta.Peers), Equals, 3) // send fail leader switch to 2 - s.cache.OnSendFail(s.bo, ctx, false) + s.cache.OnSendFail(s.bo, ctx, false, nil) ctx, err = s.cache.GetRPCContext(s.bo, loc.Region) c.Assert(err, IsNil) c.Assert(ctx.Peer.Id, Equals, s.peer2) @@ -267,7 +267,7 @@ func (s *testRegionCacheSuite) TestSendFailedInHibernateRegion(c *C) { c.Assert(len(ctx.Meta.Peers), Equals, 3) // send fail leader switch to 2 - s.cache.OnSendFail(s.bo, ctx, false) + s.cache.OnSendFail(s.bo, ctx, false, nil) ctx, err = s.cache.GetRPCContext(s.bo, loc.Region) c.Assert(err, IsNil) c.Assert(ctx.Peer.Id, Equals, s.peer2) @@ -303,13 +303,13 @@ func (s *testRegionCacheSuite) TestSendFailedInMultipleNode(c *C) { c.Assert(len(ctx.Meta.Peers), Equals, 3) // send fail leader switch to 2 - s.cache.OnSendFail(s.bo, ctx, false) + s.cache.OnSendFail(s.bo, ctx, false, nil) ctx, err = s.cache.GetRPCContext(s.bo, loc.Region) c.Assert(err, IsNil) c.Assert(ctx.Peer.Id, Equals, s.peer2) // send 2 fail leader switch to 3 - s.cache.OnSendFail(s.bo, ctx, false) + s.cache.OnSendFail(s.bo, ctx, false, nil) ctx, err = s.cache.GetRPCContext(s.bo, loc.Region) c.Assert(err, IsNil) c.Assert(ctx.Peer.Id, Equals, peer3) diff --git a/store/tikv/region_request.go b/store/tikv/region_request.go index 78fde3854f853..c1a82fb344a44 100644 --- a/store/tikv/region_request.go +++ b/store/tikv/region_request.go @@ -172,7 +172,7 @@ func (s *RegionRequestSender) onSendFail(bo *Backoffer, ctx *RPCContext, err err } } - s.regionCache.OnSendFail(bo, ctx, s.needReloadRegion(ctx)) + s.regionCache.OnSendFail(bo, ctx, s.needReloadRegion(ctx), err) // Retry on send request failure when it's not canceled. // When a store is not available, the leader of related region should be elected quickly.