From a742996245b0ce521ffebbb0204ad1d8396e70fc Mon Sep 17 00:00:00 2001
From: Ti Chi Robot
Date: Wed, 16 Aug 2023 12:02:01 +0800
Subject: [PATCH] DDL: Skip collecting TiFlash status when TiFlash is down
 (#40872) (#40888)

close pingcap/tidb#38484
---
 ddl/ddl_tiflash_api.go             |  8 ++++++++
 ddl/ddl_tiflash_test.go            | 20 ++++++++++++++++++++
 domain/infosync/tiflash_manager.go | 12 +++++++++++-
 3 files changed, 39 insertions(+), 1 deletion(-)

diff --git a/ddl/ddl_tiflash_api.go b/ddl/ddl_tiflash_api.go
index 6aa3d3a7fb9fc..0752b68058a84 100644
--- a/ddl/ddl_tiflash_api.go
+++ b/ddl/ddl_tiflash_api.go
@@ -424,6 +424,14 @@ func (d *ddl) refreshTiFlashTicker(ctx sessionctx.Context, pollTiFlashContext *T
 			return err
 		}
 	}
+
+	failpoint.Inject("OneTiFlashStoreDown", func() {
+		for storeID, store := range pollTiFlashContext.TiFlashStores {
+			store.Store.StateName = "Down"
+			pollTiFlashContext.TiFlashStores[storeID] = store
+			break
+		}
+	})
 	pollTiFlashContext.PollCounter++
 
 	// Start to process every table.
diff --git a/ddl/ddl_tiflash_test.go b/ddl/ddl_tiflash_test.go
index accf7cc038ebd..a23fda24927ed 100644
--- a/ddl/ddl_tiflash_test.go
+++ b/ddl/ddl_tiflash_test.go
@@ -1299,3 +1299,23 @@ func TestTiFlashAvailableAfterAddPartition(t *testing.T) {
 	require.NotNil(t, pi)
 	require.Equal(t, len(pi.Definitions), 2)
 }
+
+func TestTiFlashAvailableAfterDownOneStore(t *testing.T) {
+	s, teardown := createTiFlashContext(t)
+	defer teardown()
+	tk := testkit.NewTestKit(t, s.store)
+
+	tk.MustExec("use test")
+	tk.MustExec("drop table if exists ddltiflash")
+	tk.MustExec("create table ddltiflash(z int) PARTITION BY RANGE(z) (PARTITION p0 VALUES LESS THAN (10))")
+	require.NoError(t, failpoint.Enable("github.com/pingcap/tidb/ddl/OneTiFlashStoreDown", `return`))
+	require.NoError(t, failpoint.Enable("github.com/pingcap/tidb/domain/infosync/OneTiFlashStoreDown", `return`))
+	defer func() {
+		require.NoError(t, failpoint.Disable("github.com/pingcap/tidb/ddl/OneTiFlashStoreDown"))
+		require.NoError(t, failpoint.Disable("github.com/pingcap/tidb/domain/infosync/OneTiFlashStoreDown"))
+	}()
+
+	tk.MustExec("alter table ddltiflash set tiflash replica 1")
+	time.Sleep(ddl.PollTiFlashInterval * RoundToBeAvailable * 3)
+	CheckTableAvailable(s.dom, t, 1, []string{})
+}
diff --git a/domain/infosync/tiflash_manager.go b/domain/infosync/tiflash_manager.go
index 319ca84464557..840fca170973a 100644
--- a/domain/infosync/tiflash_manager.go
+++ b/domain/infosync/tiflash_manager.go
@@ -31,6 +31,7 @@ import (
 
 	"github.com/gorilla/mux"
 	"github.com/pingcap/errors"
+	"github.com/pingcap/failpoint"
 	"github.com/pingcap/tidb/ddl/placement"
 	"github.com/pingcap/tidb/store/helper"
 	"github.com/pingcap/tidb/tablecodec"
@@ -93,10 +94,19 @@ func getTiFlashPeerWithoutLagCount(tiFlashStores map[int64]helper.StoreStat, tab
 	for _, store := range tiFlashStores {
 		regionReplica := make(map[int64]int)
 		err := helper.CollectTiFlashStatus(store.Store.StatusAddress, tableID, &regionReplica)
+		failpoint.Inject("OneTiFlashStoreDown", func() {
+			if store.Store.StateName == "Down" {
+				err = errors.New("mock TiFlash down")
+			}
+		})
 		if err != nil {
 			logutil.BgLogger().Error("Fail to get peer status from TiFlash.",
 				zap.Int64("tableID", tableID))
-			return 0, err
+			// Just skip down, offline, or tombstone stores, because PD will migrate regions away from these stores.
+			if store.Store.StateName == "Up" || store.Store.StateName == "Disconnected" {
+				return 0, err
+			}
+			continue
 		}
 		flashPeerCount += len(regionReplica)
 	}
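
Reviewer sketch (not part of the patch): the standalone Go program below is a minimal illustration of the error-handling policy that the change to getTiFlashPeerWithoutLagCount introduces. A status-collection failure aborts the peer count only for stores reported as "Up" or "Disconnected"; stores in other states ("Down", "Offline", "Tombstone") are skipped, because PD migrates regions away from them. The fakeStore type and countPeers function here are hypothetical stand-ins for helper.StoreStat and the real collection loop, not code from this patch.

package main

import (
	"errors"
	"fmt"
)

// fakeStore is a hypothetical stand-in for helper.StoreStat, keeping only
// the fields needed for this illustration.
type fakeStore struct {
	ID        int64
	StateName string // "Up", "Disconnected", "Down", "Offline", "Tombstone"
	peerCount int    // peers this store would report for the table
	err       error  // simulated result of collecting status from this store
}

// countPeers mirrors the policy added by the patch: a collection error is
// propagated only when the failing store is "Up" or "Disconnected"; stores
// in any other state are skipped, since PD will migrate their regions to
// healthy stores anyway.
func countPeers(stores []fakeStore) (int, error) {
	total := 0
	for _, s := range stores {
		if s.err != nil {
			if s.StateName == "Up" || s.StateName == "Disconnected" {
				return 0, s.err
			}
			continue // skip Down/Offline/Tombstone stores
		}
		total += s.peerCount
	}
	return total, nil
}

func main() {
	stores := []fakeStore{
		{ID: 1, StateName: "Up", peerCount: 3},
		{ID: 2, StateName: "Down", err: errors.New("mock TiFlash down")},
	}
	// The down store is ignored, so the healthy store's peers still count
	// and the replica can still be marked available.
	n, err := countPeers(stores)
	fmt.Println(n, err) // 3 <nil>
}

This is the behavior TestTiFlashAvailableAfterDownOneStore exercises through the two OneTiFlashStoreDown failpoints: one store is marked "Down" and its mocked collection error is ignored, yet CheckTableAvailable still sees the replica become available.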