-
Notifications
You must be signed in to change notification settings - Fork 5.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
snap_restore: resend recover_region
while there are TiKV restarts
#45361
Merged
Merged
Changes from all commits
Commits
Show all changes
10 commits
Select commit
Hold shift + click to select a range
7fb2891
basic
YuJuncen 4bbb14a
fix the loop variable
YuJuncen 854d4e2
added retry
YuJuncen 02efee9
make bazel_prepare
YuJuncen 1ccae14
fix build
YuJuncen 978cff5
address comments
YuJuncen 59da449
fix build
YuJuncen 7812c83
Apply suggestions from code review
YuJuncen 4ac0023
make clippy happy
YuJuncen 071c0ca
make bazel_prepare
YuJuncen File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -4,6 +4,7 @@ package restore | |||||
import ( | ||||||
"context" | ||||||
"io" | ||||||
"time" | ||||||
|
||||||
"github.com/pingcap/errors" | ||||||
"github.com/pingcap/kvproto/pkg/metapb" | ||||||
|
@@ -12,7 +13,9 @@ import ( | |||||
"github.com/pingcap/tidb/br/pkg/common" | ||||||
"github.com/pingcap/tidb/br/pkg/conn" | ||||||
"github.com/pingcap/tidb/br/pkg/glue" | ||||||
"github.com/pingcap/tidb/br/pkg/logutil" | ||||||
"github.com/pingcap/tidb/br/pkg/utils" | ||||||
"github.com/pingcap/tidb/br/pkg/utils/storewatch" | ||||||
"github.com/pingcap/tidb/ddl" | ||||||
"github.com/pingcap/tidb/util/mathutil" | ||||||
tikvstore "github.com/tikv/client-go/v2/kv" | ||||||
|
@@ -48,6 +51,9 @@ func RecoverData(ctx context.Context, resolveTS uint64, allStores []*metapb.Stor | |||||
return totalRegions, errors.Trace(err) | ||||||
} | ||||||
|
||||||
// Once TiKV shuts down and reboot then, it may be left with no leader because of the recovery mode. | ||||||
// This wathcher will retrigger `RecoveryRegions` for those stores. | ||||||
recovery.SpawnTiKVShutDownWatchers(ctx) | ||||||
if err := recovery.RecoverRegions(ctx); err != nil { | ||||||
return totalRegions, errors.Trace(err) | ||||||
} | ||||||
|
@@ -213,6 +219,39 @@ func (recovery *Recovery) GetTotalRegions() int { | |||||
return len(regions) | ||||||
} | ||||||
|
||||||
func (recovery *Recovery) RecoverRegionOfStore(ctx context.Context, storeID uint64, plan []*recovpb.RecoverRegionRequest) error { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
storeAddr := getStoreAddress(recovery.allStores, storeID) | ||||||
recoveryClient, conn, err := recovery.newRecoveryClient(ctx, storeAddr) | ||||||
if err != nil { | ||||||
log.Error("create tikv client failed", zap.Uint64("store id", storeID)) | ||||||
return errors.Trace(err) | ||||||
} | ||||||
defer conn.Close() | ||||||
log.Info("send recover region to tikv", zap.String("tikv address", storeAddr), zap.Uint64("store id", storeID)) | ||||||
stream, err := recoveryClient.RecoverRegion(ctx) | ||||||
if err != nil { | ||||||
log.Error("create recover region failed", zap.Uint64("store id", storeID)) | ||||||
return errors.Trace(err) | ||||||
} | ||||||
|
||||||
// for a TiKV, send the stream | ||||||
for _, s := range plan { | ||||||
if err = stream.Send(s); err != nil { | ||||||
log.Error("send recover region failed", zap.Error(err)) | ||||||
return errors.Trace(err) | ||||||
} | ||||||
} | ||||||
|
||||||
reply, err := stream.CloseAndRecv() | ||||||
if err != nil { | ||||||
log.Error("close the stream failed") | ||||||
return errors.Trace(err) | ||||||
} | ||||||
recovery.progress.Inc() | ||||||
log.Info("recover region execution success", zap.Uint64("store id", reply.GetStoreId())) | ||||||
return nil | ||||||
} | ||||||
|
||||||
// RecoverRegions send the recovery plan to recovery region (force leader etc) | ||||||
// only tikvs have regions whose have to recover be sent | ||||||
func (recovery *Recovery) RecoverRegions(ctx context.Context) (err error) { | ||||||
|
@@ -224,46 +263,60 @@ func (recovery *Recovery) RecoverRegions(ctx context.Context) (err error) { | |||||
if err := ectx.Err(); err != nil { | ||||||
break | ||||||
} | ||||||
storeId := storeId | ||||||
plan := plan | ||||||
|
||||||
storeAddr := getStoreAddress(recovery.allStores, storeId) | ||||||
recoveryPlan := plan | ||||||
recoveryStoreId := storeId | ||||||
workers.ApplyOnErrorGroup(eg, func() error { | ||||||
recoveryClient, conn, err := recovery.newRecoveryClient(ectx, storeAddr) | ||||||
if err != nil { | ||||||
log.Error("create tikv client failed", zap.Uint64("store id", recoveryStoreId)) | ||||||
return errors.Trace(err) | ||||||
} | ||||||
defer conn.Close() | ||||||
log.Info("send recover region to tikv", zap.String("tikv address", storeAddr), zap.Uint64("store id", recoveryStoreId)) | ||||||
stream, err := recoveryClient.RecoverRegion(ectx) | ||||||
if err != nil { | ||||||
log.Error("create recover region failed", zap.Uint64("store id", recoveryStoreId)) | ||||||
return errors.Trace(err) | ||||||
} | ||||||
|
||||||
// for a TiKV, send the stream | ||||||
for _, s := range recoveryPlan { | ||||||
if err = stream.Send(s); err != nil { | ||||||
log.Error("send recover region failed", zap.Error(err)) | ||||||
return errors.Trace(err) | ||||||
} | ||||||
} | ||||||
|
||||||
reply, err := stream.CloseAndRecv() | ||||||
if err != nil { | ||||||
log.Error("close the stream failed") | ||||||
return errors.Trace(err) | ||||||
} | ||||||
recovery.progress.Inc() | ||||||
log.Info("recover region execution success", zap.Uint64("store id", reply.GetStoreId())) | ||||||
return nil | ||||||
return recovery.RecoverRegionOfStore(ectx, storeId, plan) | ||||||
}) | ||||||
} | ||||||
// Wait for all TiKV instances force leader and wait apply to last log. | ||||||
return eg.Wait() | ||||||
} | ||||||
|
||||||
func (recovery *Recovery) SpawnTiKVShutDownWatchers(ctx context.Context) { | ||||||
rebootStores := map[uint64]struct{}{} | ||||||
cb := storewatch.MakeCallback(storewatch.WithOnReboot(func(s *metapb.Store) { | ||||||
log.Info("Store reboot detected, will regenerate leaders.", zap.Uint64("id", s.GetId())) | ||||||
rebootStores[s.Id] = struct{}{} | ||||||
}), storewatch.WithOnDisconnect(func(s *metapb.Store) { | ||||||
log.Warn("A store disconnected.", zap.Uint64("id", s.GetId()), zap.String("addr", s.GetAddress())) | ||||||
}), storewatch.WithOnNewStoreRegistered(func(s *metapb.Store) { | ||||||
log.Info("Start to observing the state of store.", zap.Uint64("id", s.GetId())) | ||||||
})) | ||||||
watcher := storewatch.New(recovery.mgr.PDClient(), cb) | ||||||
tick := time.NewTicker(30 * time.Second) | ||||||
mainLoop := func() { | ||||||
for { | ||||||
select { | ||||||
case <-ctx.Done(): | ||||||
return | ||||||
case <-tick.C: | ||||||
err := watcher.Step(ctx) | ||||||
if err != nil { | ||||||
log.Warn("Failed to step watcher.", logutil.ShortError(err)) | ||||||
} | ||||||
for id := range rebootStores { | ||||||
plan, ok := recovery.RecoveryPlan[id] | ||||||
if !ok { | ||||||
log.Warn("Store reboot detected, but no recovery plan found.", zap.Uint64("id", id)) | ||||||
continue | ||||||
} | ||||||
err := recovery.RecoverRegionOfStore(ctx, id, plan) | ||||||
if err != nil { | ||||||
log.Warn("Store reboot detected, but failed to regenerate leader.", zap.Uint64("id", id), logutil.ShortError(err)) | ||||||
continue | ||||||
} | ||||||
log.Info("Succeed to reload the leader in store.", zap.Uint64("id", id)) | ||||||
delete(rebootStores, id) | ||||||
} | ||||||
} | ||||||
} | ||||||
} | ||||||
|
||||||
go mainLoop() | ||||||
} | ||||||
|
||||||
// WaitApply send wait apply to all tikv ensure all region peer apply log into the last | ||||||
func (recovery *Recovery) WaitApply(ctx context.Context) (err error) { | ||||||
eg, ectx := errgroup.WithContext(ctx) | ||||||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") | ||
|
||
go_library( | ||
name = "storewatch", | ||
srcs = ["watching.go"], | ||
importpath = "github.com/pingcap/tidb/br/pkg/utils/storewatch", | ||
visibility = ["//visibility:public"], | ||
deps = [ | ||
"//br/pkg/conn", | ||
"//br/pkg/conn/util", | ||
"@com_github_pingcap_errors//:errors", | ||
"@com_github_pingcap_kvproto//pkg/metapb", | ||
], | ||
) | ||
|
||
go_test( | ||
name = "storewatch_test", | ||
timeout = "short", | ||
srcs = ["watching_test.go"], | ||
flaky = True, | ||
shard_count = 3, | ||
deps = [ | ||
":storewatch", | ||
"//br/pkg/conn/util", | ||
"@com_github_pingcap_kvproto//pkg/metapb", | ||
"@com_github_stretchr_testify//require", | ||
"@com_github_tikv_pd_client//:client", | ||
], | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,136 @@ | ||
// package storewatch provides a `Watcher` type which allows | ||
// the user to listen the events of lifetime of stores. | ||
package storewatch | ||
|
||
import ( | ||
"context" | ||
|
||
"github.com/pingcap/errors" | ||
"github.com/pingcap/kvproto/pkg/metapb" | ||
"github.com/pingcap/tidb/br/pkg/conn" | ||
"github.com/pingcap/tidb/br/pkg/conn/util" | ||
) | ||
|
||
// Callback will be called the supported event triggered. | ||
type Callback interface { | ||
OnNewStoreRegistered(store *metapb.Store) | ||
OnDisconnect(store *metapb.Store) | ||
OnReboot(store *metapb.Store) | ||
} | ||
|
||
// DynCallback is a function based callback set. | ||
type DynCallback struct { | ||
onNewStoreRegistered func(*metapb.Store) | ||
onDisconnect func(*metapb.Store) | ||
onReboot func(*metapb.Store) | ||
} | ||
|
||
// OnNewStoreRegistered will be called once new region added to be watched. | ||
func (cb *DynCallback) OnNewStoreRegistered(store *metapb.Store) { | ||
if cb.onNewStoreRegistered != nil { | ||
cb.onNewStoreRegistered(store) | ||
} | ||
} | ||
|
||
// OnDisconnect will be called once the store is disconnected. | ||
func (cb *DynCallback) OnDisconnect(store *metapb.Store) { | ||
if cb.onDisconnect != nil { | ||
cb.onDisconnect(store) | ||
} | ||
} | ||
|
||
// OnReboot will be called once the store is rebooted. | ||
func (cb *DynCallback) OnReboot(store *metapb.Store) { | ||
if cb.onReboot != nil { | ||
cb.onReboot(store) | ||
} | ||
} | ||
|
||
// DynCallbackOpt is the option for DynCallback. | ||
type DynCallbackOpt func(*DynCallback) | ||
|
||
// WithOnNewStoreRegistered adds a hook to the callback. | ||
func WithOnNewStoreRegistered(f func(*metapb.Store)) DynCallbackOpt { | ||
return func(cb *DynCallback) { | ||
cb.onNewStoreRegistered = f | ||
} | ||
} | ||
|
||
// WithOnDisconnect adds a hook to the callback. | ||
func WithOnDisconnect(f func(*metapb.Store)) DynCallbackOpt { | ||
return func(cb *DynCallback) { | ||
cb.onDisconnect = f | ||
} | ||
} | ||
|
||
// WithOnReboot adds a hook to the callback. | ||
func WithOnReboot(f func(*metapb.Store)) DynCallbackOpt { | ||
return func(cb *DynCallback) { | ||
cb.onReboot = f | ||
} | ||
} | ||
|
||
// MakeCallback creates a callback with the given options. | ||
// Allowed options: WithOnNewStoreRegistered, WithOnDisconnect, WithOnReboot. | ||
func MakeCallback(opts ...DynCallbackOpt) Callback { | ||
cb := &DynCallback{} | ||
for _, opt := range opts { | ||
opt(cb) | ||
} | ||
return cb | ||
} | ||
|
||
// Watcher watches the lifetime of stores. | ||
// generally it should be advanced by calling the `Step` call. | ||
type Watcher struct { | ||
cli util.StoreMeta | ||
cb Callback | ||
|
||
lastStores map[uint64]*metapb.Store | ||
} | ||
|
||
func New(cli util.StoreMeta, cb Callback) *Watcher { | ||
return &Watcher{ | ||
cli: cli, | ||
cb: cb, | ||
lastStores: make(map[uint64]*metapb.Store), | ||
} | ||
} | ||
|
||
func (w *Watcher) Step(ctx context.Context) error { | ||
liveStores, err := conn.GetAllTiKVStoresWithRetry(ctx, w.cli, util.SkipTiFlash) | ||
if err != nil { | ||
return errors.Annotate(err, "failed to update store list") | ||
} | ||
recorded := map[uint64]struct{}{} | ||
for _, store := range liveStores { | ||
w.updateStore(store) | ||
recorded[store.GetId()] = struct{}{} | ||
} | ||
w.retain(recorded) | ||
return nil | ||
} | ||
|
||
// updateStore updates the current store. and call the hooks needed. | ||
func (w *Watcher) updateStore(newStore *metapb.Store) { | ||
lastStore, ok := w.lastStores[newStore.GetId()] | ||
w.lastStores[newStore.GetId()] = newStore | ||
if !ok { | ||
w.cb.OnNewStoreRegistered(newStore) | ||
return | ||
} | ||
if lastStore.GetState() == metapb.StoreState_Up && newStore.GetState() == metapb.StoreState_Offline { | ||
w.cb.OnDisconnect(newStore) | ||
} | ||
if lastStore.StartTimestamp != newStore.StartTimestamp { | ||
w.cb.OnReboot(newStore) | ||
} | ||
} | ||
|
||
func (w *Watcher) retain(storeSet map[uint64]struct{}) { | ||
for id := range w.lastStores { | ||
if _, ok := storeSet[id]; !ok { | ||
delete(w.lastStores, id) | ||
} | ||
} | ||
} |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.