From 4a1573eb237dfbe1939cbcac22dcb192a1981728 Mon Sep 17 00:00:00 2001 From: crazycs520 Date: Wed, 26 Jul 2023 17:14:58 +0800 Subject: [PATCH 1/5] add more log for diagnose Signed-off-by: crazycs520 --- internal/locate/region_cache.go | 38 +++++++++++++++++++++++++++++++ internal/locate/region_request.go | 32 ++++++++++++++++++-------- internal/retry/backoff.go | 14 ++++++++++++ kv/store_vars.go | 16 +++++++++++++ 4 files changed, 91 insertions(+), 9 deletions(-) diff --git a/internal/locate/region_cache.go b/internal/locate/region_cache.go index a5fb471af..c62434f12 100644 --- a/internal/locate/region_cache.go +++ b/internal/locate/region_cache.go @@ -2473,6 +2473,24 @@ const ( tombstone ) +// String implements fmt.Stringer interface. +func (s resolveState) String() string { + switch s { + case unresolved: + return "unresolved" + case resolved: + return "resolved" + case needCheck: + return "needCheck" + case deleted: + return "deleted" + case tombstone: + return "tombstone" + default: + return fmt.Sprintf("unknown-%v", uint64(s)) + } +} + // IsTiFlash returns true if the storeType is TiFlash func (s *Store) IsTiFlash() bool { return s.storeType == tikvrpc.TiFlash @@ -2612,6 +2630,12 @@ func (s *Store) changeResolveStateTo(from, to resolveState) bool { return false } if atomic.CompareAndSwapUint64(&s.state, uint64(from), uint64(to)) { + logutil.BgLogger().Info("change store resolve state", + zap.Uint64("store", s.storeID), + zap.String("addr", s.addr), + zap.String("from", from.String()), + zap.String("to", to.String()), + zap.String("liveness-state", s.getLivenessState().String())) return true } } @@ -2712,6 +2736,20 @@ const ( unknown ) +// String implements fmt.Stringer interface. +func (s livenessState) String() string { + switch s { + case unreachable: + return "unreachable" + case reachable: + return "reachable" + case unknown: + return "unknown" + default: + return fmt.Sprintf("unknown-%v", uint32(s)) + } +} + func (s *Store) startHealthCheckLoopIfNeeded(c *RegionCache, liveness livenessState) { // This mechanism doesn't support non-TiKV stores currently. if s.storeType != tikvrpc.TiKV { diff --git a/internal/locate/region_request.go b/internal/locate/region_request.go index 4308dbb2d..89f598638 100644 --- a/internal/locate/region_request.go +++ b/internal/locate/region_request.go @@ -608,14 +608,15 @@ func (state *accessFollower) next(bo *retry.Backoffer, selector *replicaSelector } // If there is no candidate, fallback to the leader. if selector.targetIdx < 0 { + leader := selector.replicas[state.leaderIdx] + leaderInvalid := leader.isEpochStale() || (!state.option.leaderOnly && leader.isExhausted(1)) if len(state.option.labels) > 0 { - logutil.BgLogger().Warn( - "unable to find stores with given labels", - zap.Any("labels", state.option.labels), - ) + logutil.Logger(bo.GetCtx()).Warn("unable to find stores with given labels", + zap.Uint64("region", selector.region.GetID()), + zap.Bool("leader-invalid", leaderInvalid), + zap.Any("labels", state.option.labels)) } - leader := selector.replicas[state.leaderIdx] - if leader.isEpochStale() || (!state.option.leaderOnly && leader.isExhausted(1)) { + if leaderInvalid { metrics.TiKVReplicaSelectorFailureCounter.WithLabelValues("exhausted").Inc() selector.invalidateRegion() return nil, nil @@ -1221,9 +1222,22 @@ func (s *RegionRequestSender) SendReqCtx( // TODO: Change the returned error to something like "region missing in cache", // and handle this error like EpochNotMatch, which means to re-split the request and retry. 
- logutil.Logger(bo.GetCtx()).Debug( - "throwing pseudo region error due to region not found in cache", - zap.Stringer("region", ®ionID), + cacheRegionIsValid := "unknown" + if s.replicaSelector.region != nil { + if s.replicaSelector.region.isValid() { + cacheRegionIsValid = "true" + } else { + cacheRegionIsValid = "false" + } + } + logutil.Logger(bo.GetCtx()).Info("throwing pseudo region error due to no replica available", + zap.String("region", regionID.String()), + zap.String("region-is-valid", cacheRegionIsValid), + zap.Int("try-times", retryTimes), + zap.String("replica-read-type", req.ReplicaReadType.String()), + zap.Bool("stale-read", req.StaleRead), + zap.Int("total-backoff-ms", bo.GetTotalSleep()), + zap.Int("total-backoff-times", bo.GetTotalBackoffTimes()), ) resp, err = tikvrpc.GenRegionErrorResp(req, &errorpb.Error{EpochNotMatch: &errorpb.EpochNotMatch{}}) return resp, nil, retryTimes, err diff --git a/internal/retry/backoff.go b/internal/retry/backoff.go index 6a27d0593..bdefc7993 100644 --- a/internal/retry/backoff.go +++ b/internal/retry/backoff.go @@ -35,9 +35,11 @@ package retry import ( + "bytes" "context" "fmt" "math" + "strconv" "strings" "sync/atomic" "time" @@ -150,6 +152,18 @@ func (b *Backoffer) BackoffWithCfgAndMaxSleep(cfg *Config, maxSleepMs int, err e errMsg += "\n" + err.Error() } } + var backoffDetail bytes.Buffer + totalTimes := 0 + for name, times := range b.backoffTimes { + totalTimes += times + if backoffDetail.Len() > 0 { + backoffDetail.WriteString(", ") + } + backoffDetail.WriteString(name) + backoffDetail.WriteString(":") + backoffDetail.WriteString(strconv.Itoa(times)) + } + errMsg += fmt.Sprintf("\ntotal-backoff-times: %v, backoff-detail: %v", totalTimes, backoffDetail.String()) returnedErr := err if longestSleepCfg != nil { errMsg += fmt.Sprintf("\nlongest sleep type: %s, time: %dms", longestSleepCfg.String(), longestSleepTime) diff --git a/kv/store_vars.go b/kv/store_vars.go index cce3e146b..0e064a7f1 100644 --- a/kv/store_vars.go +++ b/kv/store_vars.go @@ -35,6 +35,8 @@ package kv import ( + "fmt" + "go.uber.org/atomic" ) @@ -72,3 +74,17 @@ const ( func (r ReplicaReadType) IsFollowerRead() bool { return r != ReplicaReadLeader } + +// String implements fmt.Stringer interface. +func (r ReplicaReadType) String() string { + switch r { + case ReplicaReadLeader: + return "leader" + case ReplicaReadFollower: + return "follower" + case ReplicaReadMixed: + return "mixed" + default: + return fmt.Sprintf("unknown-%v", byte(r)) + } +} From 84865ce0ee9b06b29a108b35f5ea4cbcfb0a9c2a Mon Sep 17 00:00:00 2001 From: crazycs520 Date: Wed, 26 Jul 2023 18:00:21 +0800 Subject: [PATCH 2/5] fix Signed-off-by: crazycs520 --- internal/locate/region_request.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/locate/region_request.go b/internal/locate/region_request.go index 89f598638..d45a32305 100644 --- a/internal/locate/region_request.go +++ b/internal/locate/region_request.go @@ -1223,7 +1223,7 @@ func (s *RegionRequestSender) SendReqCtx( // TODO: Change the returned error to something like "region missing in cache", // and handle this error like EpochNotMatch, which means to re-split the request and retry. 
cacheRegionIsValid := "unknown" - if s.replicaSelector.region != nil { + if s.replicaSelector != nil && s.replicaSelector.region != nil { if s.replicaSelector.region.isValid() { cacheRegionIsValid = "true" } else { From e57023fae3eeafdaaa77b9550baf1404de3831e4 Mon Sep 17 00:00:00 2001 From: crazycs520 Date: Thu, 27 Jul 2023 15:12:45 +0800 Subject: [PATCH 3/5] add more log for diagnose Signed-off-by: crazycs520 --- internal/locate/region_request.go | 92 +++++++++++++++++++++++++------ 1 file changed, 75 insertions(+), 17 deletions(-) diff --git a/internal/locate/region_request.go b/internal/locate/region_request.go index d45a32305..210b8526b 100644 --- a/internal/locate/region_request.go +++ b/internal/locate/region_request.go @@ -35,6 +35,7 @@ package locate import ( + "bytes" "context" "fmt" "math" @@ -1190,6 +1191,7 @@ func (s *RegionRequestSender) SendReqCtx( }() } + totalErrors := make(map[string]int) for { if retryTimes > 0 { req.IsRetryRequest = true @@ -1222,23 +1224,7 @@ func (s *RegionRequestSender) SendReqCtx( // TODO: Change the returned error to something like "region missing in cache", // and handle this error like EpochNotMatch, which means to re-split the request and retry. - cacheRegionIsValid := "unknown" - if s.replicaSelector != nil && s.replicaSelector.region != nil { - if s.replicaSelector.region.isValid() { - cacheRegionIsValid = "true" - } else { - cacheRegionIsValid = "false" - } - } - logutil.Logger(bo.GetCtx()).Info("throwing pseudo region error due to no replica available", - zap.String("region", regionID.String()), - zap.String("region-is-valid", cacheRegionIsValid), - zap.Int("try-times", retryTimes), - zap.String("replica-read-type", req.ReplicaReadType.String()), - zap.Bool("stale-read", req.StaleRead), - zap.Int("total-backoff-ms", bo.GetTotalSleep()), - zap.Int("total-backoff-times", bo.GetTotalBackoffTimes()), - ) + s.logSendReqError(bo, "throwing pseudo region error due to no replica available", regionID, retryTimes, req, totalErrors) resp, err = tikvrpc.GenRegionErrorResp(req, &errorpb.Error{EpochNotMatch: &errorpb.EpochNotMatch{}}) return resp, nil, retryTimes, err } @@ -1264,6 +1250,8 @@ func (s *RegionRequestSender) SendReqCtx( var retry bool resp, retry, err = s.sendReqToRegion(bo, rpcCtx, req, timeout) if err != nil { + msg := fmt.Sprintf("send request failed, err: %v", err.Error()) + s.logSendReqError(bo, msg, regionID, retryTimes, req, totalErrors) return nil, nil, retryTimes, err } @@ -1295,6 +1283,8 @@ func (s *RegionRequestSender) SendReqCtx( return nil, nil, retryTimes, err } if regionErr != nil { + regionErrLabel := regionErrorToLabel(regionErr) + totalErrors[regionErrLabel]++ retry, err = s.onRegionError(bo, rpcCtx, req, regionErr) if err != nil { return nil, nil, retryTimes, err @@ -1303,6 +1293,7 @@ func (s *RegionRequestSender) SendReqCtx( retryTimes++ continue } + s.logSendReqError(bo, "send request meet region error without retry", regionID, retryTimes, req, totalErrors) } else { if s.replicaSelector != nil { s.replicaSelector.onSendSuccess() @@ -1315,6 +1306,73 @@ func (s *RegionRequestSender) SendReqCtx( } } +func (s *RegionRequestSender) logSendReqError(bo *retry.Backoffer, msg string, regionID RegionVerID, retryTimes int, req *tikvrpc.Request, totalErrors map[string]int) { + var replicaStatus []string + replicaSelectorState := "nil" + cacheRegionIsValid := "unknown" + if s.replicaSelector != nil { + switch s.replicaSelector.state.(type) { + case *accessKnownLeader: + replicaSelectorState = "accessKnownLeader" + case *accessFollower: 
+ replicaSelectorState = "accessFollower" + case *accessByKnownProxy: + replicaSelectorState = "accessByKnownProxy" + case *tryFollower: + replicaSelectorState = "tryFollower" + case *tryNewProxy: + replicaSelectorState = "tryNewProxy" + case *invalidLeader: + replicaSelectorState = "invalidLeader" + case *invalidStore: + replicaSelectorState = "invalidStore" + case *stateBase: + replicaSelectorState = "stateBase" + case nil: + replicaSelectorState = "nil" + } + if s.replicaSelector.region != nil { + if s.replicaSelector.region.isValid() { + cacheRegionIsValid = "true" + } else { + cacheRegionIsValid = "false" + } + } + for _, replica := range s.replicaSelector.replicas { + replicaStatus = append(replicaStatus, fmt.Sprintf("peer: %v, store: %v, isEpochStale: %v, attempts: %v, replica-epoch: %v, store-epoch: %v, store-state: %v, store-liveness-state: %v", + replica.peer.GetId(), + replica.store.storeID, + replica.isEpochStale(), + replica.attempts, + replica.epoch, + atomic.LoadUint32(&replica.store.epoch), + replica.store.getResolveState(), + replica.store.getLivenessState(), + )) + } + } + var totalErrorStr bytes.Buffer + for err, cnt := range totalErrors { + if totalErrorStr.Len() > 0 { + totalErrorStr.WriteString(", ") + } + totalErrorStr.WriteString(err) + totalErrorStr.WriteString(":") + totalErrorStr.WriteString(strconv.Itoa(cnt)) + } + logutil.Logger(bo.GetCtx()).Info(msg, + zap.String("region", regionID.String()), + zap.String("region-is-valid", cacheRegionIsValid), + zap.Int("retry-times", retryTimes), + zap.String("replica-read-type", req.ReplicaReadType.String()), + zap.String("replica-selector-state", replicaSelectorState), + zap.Bool("stale-read", req.StaleRead), + zap.String("replica-status", strings.Join(replicaStatus, "; ")), + zap.Int("total-backoff-ms", bo.GetTotalSleep()), + zap.Int("total-backoff-times", bo.GetTotalBackoffTimes()), + zap.String("total-region-errors", totalErrorStr.String())) +} + // RPCCancellerCtxKey is context key attach rpc send cancelFunc collector to ctx. 
type RPCCancellerCtxKey struct{} From 2c3f28165d95fb65f8bb4c7ea7eb04025d066671 Mon Sep 17 00:00:00 2001 From: crazycs520 Date: Thu, 27 Jul 2023 16:57:05 +0800 Subject: [PATCH 4/5] add more log Signed-off-by: crazycs520 --- internal/locate/region_request.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/internal/locate/region_request.go b/internal/locate/region_request.go index 210b8526b..0be0ab900 100644 --- a/internal/locate/region_request.go +++ b/internal/locate/region_request.go @@ -1287,6 +1287,8 @@ func (s *RegionRequestSender) SendReqCtx( totalErrors[regionErrLabel]++ retry, err = s.onRegionError(bo, rpcCtx, req, regionErr) if err != nil { + msg := fmt.Sprintf("send request on region error failed, err: %v", err.Error()) + s.logSendReqError(bo, msg, regionID, retryTimes, req, totalErrors) return nil, nil, retryTimes, err } if retry { @@ -1361,6 +1363,8 @@ func (s *RegionRequestSender) logSendReqError(bo *retry.Backoffer, msg string, r totalErrorStr.WriteString(strconv.Itoa(cnt)) } logutil.Logger(bo.GetCtx()).Info(msg, + zap.Uint64("req-ts", req.GetStartTS()), + zap.String("req-type", req.Type.String()), zap.String("region", regionID.String()), zap.String("region-is-valid", cacheRegionIsValid), zap.Int("retry-times", retryTimes), From fa080256d9a0503099df4955b8b0e82409a6ce23 Mon Sep 17 00:00:00 2001 From: crazycs520 Date: Fri, 28 Jul 2023 17:12:45 +0800 Subject: [PATCH 5/5] address comment Signed-off-by: crazycs520 --- kv/store_vars.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kv/store_vars.go b/kv/store_vars.go index 0e064a7f1..184975ca2 100644 --- a/kv/store_vars.go +++ b/kv/store_vars.go @@ -84,6 +84,10 @@ func (r ReplicaReadType) String() string { return "follower" case ReplicaReadMixed: return "mixed" + case ReplicaReadLearner: + return "learner" + case ReplicaReadPreferLeader: + return "prefer-leader" default: return fmt.Sprintf("unknown-%v", byte(r)) }
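
Note (not part of the patches above): both BackoffWithCfgAndMaxSleep in internal/retry/backoff.go and logSendReqError in internal/locate/region_request.go render a map[string]int of counters as a "name:count, name:count" string via bytes.Buffer and strconv.Itoa. A minimal standalone sketch of that pattern follows; the package layout, the helper name formatIntMap, and the sample error labels are illustrative assumptions, and Go map iteration order is unspecified, so the field order in the rendered string can vary between calls.

package main

import (
	"bytes"
	"fmt"
	"strconv"
)

// formatIntMap renders a counter map in the same "name:count, name:count"
// form used for the backoff-detail and total-region-errors log fields.
func formatIntMap(m map[string]int) string {
	var buf bytes.Buffer
	for name, cnt := range m {
		if buf.Len() > 0 {
			buf.WriteString(", ")
		}
		buf.WriteString(name)
		buf.WriteString(":")
		buf.WriteString(strconv.Itoa(cnt))
	}
	return buf.String()
}

func main() {
	// Example: aggregate region-error labels the way SendReqCtx counts them
	// into totalErrors before logging.
	totalErrors := map[string]int{"epoch_not_match": 2, "not_leader": 1}
	fmt.Println(formatIntMap(totalErrors)) // e.g. "epoch_not_match:2, not_leader:1"
}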