Skip to content

Commit

Permalink
lightning: "no leader" should be a retryable error (pingcap#46300) (p…
Browse files Browse the repository at this point in the history
…ingcap#46860) (pingcap#9)

close pingcap#45673

Co-authored-by: Ti Chi Robot <[email protected]>
  • Loading branch information
2 people authored and GitHub Enterprise committed Sep 26, 2023
1 parent 52d8d15 commit 79400bf
Show file tree
Hide file tree
Showing 6 changed files with 20 additions and 4 deletions.
14 changes: 11 additions & 3 deletions br/pkg/lightning/backend/local/local.go
Original file line number Diff line number Diff line change
Expand Up @@ -943,6 +943,11 @@ func (local *local) WriteToTiKV(
}
}
}
leaderID := region.Leader.GetId()
if leaderID == 0 {
return nil, Range{}, rangeStats{}, common.ErrNoLeader.GenWithStackByArgs(region.Region.Id, leaderID)
}

begin := time.Now()
regionRange := intersectRange(region.Region, Range{start: start, end: end})
opt := &pebble.IterOptions{LowerBound: regionRange.start, UpperBound: regionRange.end}
Expand Down Expand Up @@ -984,7 +989,6 @@ func (local *local) WriteToTiKV(
// annotate the error with peer/store/region info to help debug.
return errors.Annotatef(in, "peer %d, store %d, region %d, epoch %s", peer.Id, peer.StoreId, region.Region.Id, region.Region.RegionEpoch.String())
}
leaderID := region.Leader.GetId()
clients := make([]sst.ImportSST_WriteClient, 0, len(region.Region.GetPeers()))
allPeers := make([]*metapb.Peer, 0, len(region.Region.GetPeers()))
requests := make([]*sst.WriteRequest, 0, len(region.Region.GetPeers()))
Expand Down Expand Up @@ -1106,13 +1110,17 @@ func (local *local) WriteToTiKV(
}
}

failpoint.Inject("NoLeader", func() {
log.FromContext(ctx).Warn("enter failpoint NoLeader")
leaderPeerMetas = nil
})

// if there is not leader currently, we should directly return an error
if len(leaderPeerMetas) == 0 {
log.FromContext(ctx).Warn("write to tikv no leader", logutil.Region(region.Region), logutil.Leader(region.Leader),
zap.Uint64("leader_id", leaderID), logutil.SSTMeta(meta),
zap.Int64("kv_pairs", totalCount), zap.Int64("total_bytes", size))
return nil, Range{}, stats, errors.Errorf("write to tikv with no leader returned, region '%d', leader: %d",
region.Region.Id, leaderID)
return nil, Range{}, stats, common.ErrNoLeader.GenWithStackByArgs(region.Region.Id, leaderID)
}

log.FromContext(ctx).Debug("write to kv", zap.Reflect("region", region), zap.Uint64("leader", leaderID),
Expand Down
1 change: 1 addition & 0 deletions br/pkg/lightning/common/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ var (
ErrKVReadIndexNotReady = errors.Normalize("read index not ready", errors.RFCCodeText("Lightning:KV:ReadIndexNotReady"))
ErrKVIngestFailed = errors.Normalize("ingest tikv failed", errors.RFCCodeText("Lightning:KV:ErrKVIngestFailed"))
ErrKVRaftProposalDropped = errors.Normalize("raft proposal dropped", errors.RFCCodeText("Lightning:KV:ErrKVRaftProposalDropped"))
ErrNoLeader = errors.Normalize("write to tikv with no leader returned, region '%d', leader: %d", errors.RFCCodeText("Lightning:KV:ErrNoLeader"))

ErrUnknownBackend = errors.Normalize("unknown backend %s", errors.RFCCodeText("Lightning:Restore:ErrUnknownBackend"))
ErrCheckLocalFile = errors.Normalize("cannot find local file for table: %s engineDir: %s", errors.RFCCodeText("Lightning:Restore:ErrCheckLocalFile"))
Expand Down
1 change: 1 addition & 0 deletions br/pkg/lightning/common/retry.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ func IsRetryableError(err error) bool {
var retryableErrorIDs = map[errors.ErrorID]struct{}{
ErrKVEpochNotMatch.ID(): {},
ErrKVNotLeader.ID(): {},
ErrNoLeader.ID(): {},
ErrKVRegionNotFound.ID(): {},
// common.ErrKVServerIsBusy is a little duplication with tmysql.ErrTiKVServerBusy
// it's because the response of sst.ingest gives us a sst.IngestResponse which doesn't contain error code,
Expand Down
1 change: 1 addition & 0 deletions br/tests/lightning_local_backend/config.toml
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
[lightning]
table-concurrency = 1
index-concurrency = 1

[checkpoint]
enable = true
Expand Down
2 changes: 1 addition & 1 deletion br/tests/lightning_local_backend/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ check_contains 'sum(c): 46'
run_sql 'DROP DATABASE cpeng;'
rm -f "/tmp/tidb_lightning_checkpoint_local_backend_test.pb"

export GO_FAILPOINTS='github.com/pingcap/tidb/br/pkg/lightning/backend/local/FailIngestMeta=2*return("epochnotmatch")'
export GO_FAILPOINTS='github.com/pingcap/tidb/br/pkg/lightning/backend/local/FailIngestMeta=2*return("epochnotmatch");github.com/pingcap/tidb/br/pkg/lightning/backend/local/NoLeader=1*return()'

run_lightning --backend local --enable-checkpoint=1 --log-file "$TEST_DIR/lightning-local.log" --config "tests/$TEST_NAME/config.toml"

Expand Down
5 changes: 5 additions & 0 deletions errors.toml
Original file line number Diff line number Diff line change
Expand Up @@ -406,6 +406,11 @@ error = '''
raft proposal dropped
'''

["Lightning:KV:ErrNoLeader"]
error = '''
write to tikv with no leader returned, region '%d', leader: %d
'''

["Lightning:KV:NotLeader"]
error = '''
not leader
Expand Down

0 comments on commit 79400bf

Please sign in to comment.