From aa70d33efb63efbbeac80a6b0b3a59e6c8ca9017 Mon Sep 17 00:00:00 2001 From: Jianjun Liao Date: Tue, 10 Sep 2024 17:09:42 +0800 Subject: [PATCH 01/15] snapshot restore checkpoint into table Signed-off-by: Jianjun Liao --- br/pkg/checkpoint/backup.go | 4 +- br/pkg/checkpoint/checkpoint.go | 174 ++++++++------- br/pkg/checkpoint/log_restore.go | 56 ++--- br/pkg/checkpoint/restore.go | 92 +++----- br/pkg/checkpoint/storage.go | 312 +++++++++++++++++++++++++++ br/pkg/restore/log_client/client.go | 26 +-- br/pkg/restore/snap_client/client.go | 11 +- br/pkg/task/restore.go | 17 +- br/pkg/task/stream.go | 4 +- 9 files changed, 502 insertions(+), 194 deletions(-) create mode 100644 br/pkg/checkpoint/storage.go diff --git a/br/pkg/checkpoint/backup.go b/br/pkg/checkpoint/backup.go index 0ae58babb3e8b..c5eff5fe65946 100644 --- a/br/pkg/checkpoint/backup.go +++ b/br/pkg/checkpoint/backup.go @@ -58,7 +58,7 @@ func StartCheckpointBackupRunnerForTest( timer GlobalTimer, ) (*CheckpointRunner[BackupKeyType, BackupValueType], error) { runner := newCheckpointRunner[BackupKeyType, BackupValueType]( - ctx, storage, cipher, timer, flushPositionForBackup(), valueMarshalerForBackup) + ctx, &externalCheckpointStorage{storage}, storage, cipher, timer, flushPositionForBackup(), valueMarshalerForBackup) err := runner.initialLock(ctx) if err != nil { @@ -75,7 +75,7 @@ func StartCheckpointRunnerForBackup( timer GlobalTimer, ) (*CheckpointRunner[BackupKeyType, BackupValueType], error) { runner := newCheckpointRunner[BackupKeyType, BackupValueType]( - ctx, storage, cipher, timer, flushPositionForBackup(), valueMarshalerForBackup) + ctx, &externalCheckpointStorage{storage}, storage, cipher, timer, flushPositionForBackup(), valueMarshalerForBackup) err := runner.initialLock(ctx) if err != nil { diff --git a/br/pkg/checkpoint/checkpoint.go b/br/pkg/checkpoint/checkpoint.go index 4b397a60e5eeb..772cacae929c0 100644 --- a/br/pkg/checkpoint/checkpoint.go +++ b/br/pkg/checkpoint/checkpoint.go @@ -156,9 +156,10 @@ type CheckpointRunner[K KeyType, V ValueType] struct { valueMarshaler func(*RangeGroup[K, V]) ([]byte, error) - storage storage.ExternalStorage - cipher *backuppb.CipherInfo - timer GlobalTimer + checkpointStorage checkpointStorage + lockStorage storage.ExternalStorage + cipher *backuppb.CipherInfo + timer GlobalTimer appendCh chan *CheckpointMessage[K, V] checksumCh chan *ChecksumItem @@ -175,7 +176,8 @@ type CheckpointRunner[K KeyType, V ValueType] struct { func newCheckpointRunner[K KeyType, V ValueType]( ctx context.Context, - storage storage.ExternalStorage, + checkpointStorage checkpointStorage, + lockStorage storage.ExternalStorage, cipher *backuppb.CipherInfo, timer GlobalTimer, f flushPosition, @@ -189,9 +191,10 @@ func newCheckpointRunner[K KeyType, V ValueType]( valueMarshaler: vm, - storage: storage, - cipher: cipher, - timer: timer, + checkpointStorage: checkpointStorage, + lockStorage: lockStorage, + cipher: cipher, + timer: timer, appendCh: make(chan *CheckpointMessage[K, V]), checksumCh: make(chan *ChecksumItem), @@ -274,7 +277,7 @@ func (r *CheckpointRunner[K, V]) WaitForFinish(ctx context.Context, flush bool) r.wg.Wait() // remove the checkpoint lock if r.lockId > 0 { - err := r.storage.DeleteFile(ctx, r.CheckpointLockPath) + err := r.lockStorage.DeleteFile(ctx, r.CheckpointLockPath) if err != nil { log.Warn("failed to remove the checkpoint lock", zap.Error(err)) } @@ -502,7 +505,7 @@ func (r *CheckpointRunner[K, V]) doChecksumFlush(ctx context.Context, checksumIt } fname := 
fmt.Sprintf("%s/t%d_and__.cpt", r.CheckpointChecksumDir, checksumItems.Items[0].TableID) - if err = r.storage.WriteFile(ctx, fname, data); err != nil { + if err = r.checkpointStorage.flushCheckpointChecksum(ctx, fname, data); err != nil { return errors.Annotatef(err, "failed to write file %s for checkpoint checksum", fname) } @@ -565,7 +568,7 @@ func (r *CheckpointRunner[K, V]) doFlush(ctx context.Context, meta map[K]*RangeG checksum := sha256.Sum256(fname) checksumEncoded := base64.URLEncoding.EncodeToString(checksum[:]) path := fmt.Sprintf("%s/%s_%d.cpt", r.CheckpointDataDir, checksumEncoded, rand.Uint64()) - if err := r.storage.WriteFile(ctx, path, data); err != nil { + if err := r.checkpointStorage.flushCheckpointData(ctx, path, data); err != nil { return errors.Trace(err) } } @@ -616,13 +619,13 @@ func (r *CheckpointRunner[K, V]) flushLock(ctx context.Context, p int64) error { return errors.Trace(err) } - err = r.storage.WriteFile(ctx, r.CheckpointLockPath, data) + err = r.lockStorage.WriteFile(ctx, r.CheckpointLockPath, data) return errors.Trace(err) } // check whether this lock belongs to this BR func (r *CheckpointRunner[K, V]) checkLockFile(ctx context.Context, now int64) error { - data, err := r.storage.ReadFile(ctx, r.CheckpointLockPath) + data, err := r.lockStorage.ReadFile(ctx, r.CheckpointLockPath) if err != nil { return errors.Trace(err) } @@ -644,7 +647,7 @@ func (r *CheckpointRunner[K, V]) checkLockFile(ctx context.Context, now int64) e return errors.Errorf("The existing lock will expire in %d seconds. "+ "There may be another BR(%d) running. If not, you can wait for the lock to expire, "+ "or delete the file `%s%s` manually.", - (lock.ExpireAt-now)/1000, lock.LockId, strings.TrimRight(r.storage.URI(), "/"), r.CheckpointLockPath) + (lock.ExpireAt-now)/1000, lock.LockId, strings.TrimRight(r.lockStorage.URI(), "/"), r.CheckpointLockPath) } return nil @@ -677,7 +680,7 @@ func (r *CheckpointRunner[K, V]) initialLock(ctx context.Context) error { return errors.Trace(err) } r.lockId = oracle.ComposeTS(p, l) - exist, err := r.storage.FileExists(ctx, r.CheckpointLockPath) + exist, err := r.lockStorage.FileExists(ctx, r.CheckpointLockPath) if err != nil { return errors.Trace(err) } @@ -696,6 +699,48 @@ func (r *CheckpointRunner[K, V]) initialLock(ctx context.Context) error { return errors.Trace(err) } +func parseCheckpointData[K KeyType, V ValueType]( + content []byte, + pastDureTime *time.Duration, + cipher *backuppb.CipherInfo, + fn func(groupKey K, value V), +) error { + checkpointData := &CheckpointData{} + if err := json.Unmarshal(content, checkpointData); err != nil { + log.Error("failed to unmarshal the checkpoint data info, skip it", zap.Error(err)) + return nil + } + + if checkpointData.DureTime > *pastDureTime { + *pastDureTime = checkpointData.DureTime + } + for _, meta := range checkpointData.RangeGroupMetas { + decryptContent, err := metautil.Decrypt(meta.RangeGroupsEncriptedData, cipher, meta.CipherIv) + if err != nil { + return errors.Trace(err) + } + + checksum := sha256.Sum256(decryptContent) + if !bytes.Equal(meta.Checksum, checksum[:]) { + log.Error("checkpoint checksum info's checksum mismatch, skip it", + zap.ByteString("expect", meta.Checksum), + zap.ByteString("got", checksum[:]), + ) + continue + } + + group := &RangeGroup[K, V]{} + if err = json.Unmarshal(decryptContent, group); err != nil { + return errors.Trace(err) + } + + for _, g := range group.Group { + fn(group.GroupKey, g) + } + } + return nil +} + // walk the whole checkpoint range files and 
retrieve the metadata of backed up/restored ranges // and return the total time cost in the past executions func walkCheckpointFile[K KeyType, V ValueType]( @@ -713,39 +758,8 @@ func walkCheckpointFile[K KeyType, V ValueType]( if err != nil { return errors.Trace(err) } - - checkpointData := &CheckpointData{} - if err = json.Unmarshal(content, checkpointData); err != nil { - log.Error("failed to unmarshal the checkpoint data info, skip it", zap.Error(err)) - return nil - } - - if checkpointData.DureTime > pastDureTime { - pastDureTime = checkpointData.DureTime - } - for _, meta := range checkpointData.RangeGroupMetas { - decryptContent, err := metautil.Decrypt(meta.RangeGroupsEncriptedData, cipher, meta.CipherIv) - if err != nil { - return errors.Trace(err) - } - - checksum := sha256.Sum256(decryptContent) - if !bytes.Equal(meta.Checksum, checksum[:]) { - log.Error("checkpoint checksum info's checksum mismatch, skip it", - zap.ByteString("expect", meta.Checksum), - zap.ByteString("got", checksum[:]), - ) - continue - } - - group := &RangeGroup[K, V]{} - if err = json.Unmarshal(decryptContent, group); err != nil { - return errors.Trace(err) - } - - for _, g := range group.Group { - fn(group.GroupKey, g) - } + if err := parseCheckpointData(content, &pastDureTime, cipher, fn); err != nil { + return errors.Trace(err) } } return nil @@ -765,6 +779,44 @@ func loadCheckpointMeta[T any](ctx context.Context, s storage.ExternalStorage, p return errors.Trace(err) } +func parseCheckpointChecksum( + data []byte, + checkpointChecksum map[int64]*ChecksumItem, + pastDureTime *time.Duration, +) error { + info := &ChecksumInfo{} + err := json.Unmarshal(data, info) + if err != nil { + log.Error("failed to unmarshal the checkpoint checksum info, skip it", zap.Error(err)) + return nil + } + + checksum := sha256.Sum256(info.Content) + if !bytes.Equal(info.Checksum, checksum[:]) { + log.Error("checkpoint checksum info's checksum mismatch, skip it", + zap.ByteString("expect", info.Checksum), + zap.ByteString("got", checksum[:]), + ) + return nil + } + + if info.DureTime > *pastDureTime { + *pastDureTime = info.DureTime + } + + items := &ChecksumItems{} + err = json.Unmarshal(info.Content, items) + if err != nil { + return errors.Trace(err) + } + + for _, c := range items.Items { + checkpointChecksum[c.TableID] = c + } + + return nil +} + // walk the whole checkpoint checksum files and retrieve checksum information of tables calculated func loadCheckpointChecksum( ctx context.Context, @@ -778,35 +830,9 @@ func loadCheckpointChecksum( if err != nil { return errors.Trace(err) } - info := &ChecksumInfo{} - err = json.Unmarshal(data, info) - if err != nil { - log.Error("failed to unmarshal the checkpoint checksum info, skip it", zap.Error(err)) - return nil - } - - checksum := sha256.Sum256(info.Content) - if !bytes.Equal(info.Checksum, checksum[:]) { - log.Error("checkpoint checksum info's checksum mismatch, skip it", - zap.ByteString("expect", info.Checksum), - zap.ByteString("got", checksum[:]), - ) - return nil - } - - if info.DureTime > pastDureTime { - pastDureTime = info.DureTime - } - - items := &ChecksumItems{} - err = json.Unmarshal(info.Content, items) - if err != nil { + if err = parseCheckpointChecksum(data, checkpointChecksum, &pastDureTime); err != nil { return errors.Trace(err) } - - for _, c := range items.Items { - checkpointChecksum[c.TableID] = c - } return nil }) return checkpointChecksum, pastDureTime, errors.Trace(err) diff --git a/br/pkg/checkpoint/log_restore.go 
b/br/pkg/checkpoint/log_restore.go index ebe82083aa503..420486be54346 100644 --- a/br/pkg/checkpoint/log_restore.go +++ b/br/pkg/checkpoint/log_restore.go @@ -21,11 +21,12 @@ import ( "time" "github.com/pingcap/errors" - backuppb "github.com/pingcap/kvproto/pkg/brpb" "github.com/pingcap/log" + "github.com/pingcap/tidb/br/pkg/glue" "github.com/pingcap/tidb/br/pkg/storage" "github.com/pingcap/tidb/pkg/meta/model" pmodel "github.com/pingcap/tidb/pkg/parser/model" + "github.com/pingcap/tidb/pkg/util/sqlexec" "go.uber.org/zap" ) @@ -99,28 +100,32 @@ func valueMarshalerForLogRestore(group *RangeGroup[LogRestoreKeyType, LogRestore }) } +func newTableCheckpointStorage(se glue.Session, checkpointDBName string) *tableCheckpointStorage { + return &tableCheckpointStorage{ + se: se, + checkpointDBName: checkpointDBName, + } +} + // only for test func StartCheckpointLogRestoreRunnerForTest( ctx context.Context, - storage storage.ExternalStorage, - cipher *backuppb.CipherInfo, + se glue.Session, tick time.Duration, - taskName string, ) (*CheckpointRunner[LogRestoreKeyType, LogRestoreValueType], error) { runner := newCheckpointRunner[LogRestoreKeyType, LogRestoreValueType]( - ctx, storage, cipher, nil, flushPositionForRestore(taskName), valueMarshalerForLogRestore) + ctx, newTableCheckpointStorage(se, logRestoreCheckpointDatabaseName), nil, nil, nil, flushPosition{}, valueMarshalerForLogRestore) runner.startCheckpointMainLoop(ctx, tick, tick, 0) return runner, nil } -func StartCheckpointRunnerForLogRestore(ctx context.Context, - storage storage.ExternalStorage, - cipher *backuppb.CipherInfo, - taskName string, +func StartCheckpointRunnerForLogRestore( + ctx context.Context, + se glue.Session, ) (*CheckpointRunner[LogRestoreKeyType, LogRestoreValueType], error) { runner := newCheckpointRunner[LogRestoreKeyType, LogRestoreValueType]( - ctx, storage, cipher, nil, flushPositionForRestore(taskName), valueMarshalerForLogRestore) + ctx, newTableCheckpointStorage(se, logRestoreCheckpointDatabaseName), nil, nil, nil, flushPosition{}, valueMarshalerForLogRestore) // for restore, no need to set lock runner.startCheckpointMainLoop(ctx, defaultTickDurationForFlush, defaultTckDurationForChecksum, 0) @@ -147,17 +152,14 @@ func AppendRangeForLogRestore( }) } -const ( - CheckpointTaskInfoForLogRestorePathFormat = CheckpointDir + "/restore-%d/taskInfo.meta" - CheckpointIngestIndexRepairSQLPathFormat = CheckpointDir + "/restore-%s/ingest-repair.meta" -) - -func getCheckpointTaskInfoPathByID(clusterID uint64) string { - return fmt.Sprintf(CheckpointTaskInfoForLogRestorePathFormat, clusterID) -} - -func getCheckpointIngestIndexRepairPathByTaskName(taskName string) string { - return fmt.Sprintf(CheckpointIngestIndexRepairSQLPathFormat, taskName) +// load the whole checkpoint range data and retrieve the metadata of restored ranges +// and return the total time cost in the past executions +func LoadCheckpointDataForLogRestore[K KeyType, V ValueType]( + ctx context.Context, + execCtx sqlexec.RestrictedSQLExecutor, + fn func(K, V), +) (time.Duration, error) { + return selectCheckpointData(ctx, execCtx, logRestoreCheckpointDatabaseName, fn) } // A progress type for snapshot + log restore. 
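
// Caller-side sketch (illustrative, not part of this patch): replaying the persisted
// log-restore checkpoint after a restart. `execCtx` is assumed to come from a glue
// session via se.GetSessionCtx().GetRestrictedSQLExecutor(); the counting logic is
// hypothetical, while the callback signature mirrors how generateKvFilesSkipMap
// consumes these records later in this series. Imports (context, time, sqlexec,
// checkpoint) are elided.
func countCheckpointedKvFiles(ctx context.Context, execCtx sqlexec.RestrictedSQLExecutor) (int, time.Duration, error) {
	skipped := 0
	pastCost, err := checkpoint.LoadCheckpointDataForLogRestore(
		ctx, execCtx,
		func(groupKey checkpoint.LogRestoreKeyType, v checkpoint.LogRestoreValueMarshaled) {
			// one entry per file offset that was already restored under this metadata group
			for _, foffs := range v.Foffs {
				skipped += len(foffs)
			}
		})
	return skipped, pastCost, err
}
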
@@ -196,6 +198,8 @@ const ( type CheckpointTaskInfoForLogRestore struct { // the progress for this task Progress RestoreProgress `json:"progress"` + // the upstream cluster id + UpstreamClusterID uint64 `json:"upstream-cluster-id"` // a task marker to distinguish the different tasks StartTS uint64 `json:"start-ts"` RestoreTS uint64 `json:"restore-ts"` @@ -207,21 +211,19 @@ type CheckpointTaskInfoForLogRestore struct { func LoadCheckpointTaskInfoForLogRestore( ctx context.Context, - s storage.ExternalStorage, - clusterID uint64, + execCtx sqlexec.RestrictedSQLExecutor, ) (*CheckpointTaskInfoForLogRestore, error) { m := &CheckpointTaskInfoForLogRestore{} - err := loadCheckpointMeta(ctx, s, getCheckpointTaskInfoPathByID(clusterID), m) + err := selectCheckpointMeta(ctx, execCtx, logRestoreCheckpointDatabaseName, checkpointMetaTypeTaskInfo, m) return m, err } func SaveCheckpointTaskInfoForLogRestore( ctx context.Context, - s storage.ExternalStorage, + se glue.Session, meta *CheckpointTaskInfoForLogRestore, - clusterID uint64, ) error { - return saveCheckpointMetadata(ctx, s, meta, getCheckpointTaskInfoPathByID(clusterID)) + return insertCheckpointMeta(ctx, se, logRestoreCheckpointDatabaseName, checkpointMetaTypeTaskInfo, meta) } func ExistsCheckpointTaskInfo( diff --git a/br/pkg/checkpoint/restore.go b/br/pkg/checkpoint/restore.go index 06fa45c052122..1f8cf40e13482 100644 --- a/br/pkg/checkpoint/restore.go +++ b/br/pkg/checkpoint/restore.go @@ -17,12 +17,14 @@ package checkpoint import ( "context" "encoding/json" - "fmt" "time" - backuppb "github.com/pingcap/kvproto/pkg/brpb" + "github.com/pingcap/errors" + "github.com/pingcap/tidb/br/pkg/glue" "github.com/pingcap/tidb/br/pkg/pdutil" - "github.com/pingcap/tidb/br/pkg/storage" + "github.com/pingcap/tidb/pkg/domain" + "github.com/pingcap/tidb/pkg/parser/model" + "github.com/pingcap/tidb/pkg/util/sqlexec" ) type RestoreKeyType = int64 @@ -35,32 +37,6 @@ func (rv RestoreValueType) IdentKey() []byte { return []byte(rv.RangeKey) } -const ( - CheckpointRestoreDirFormat = CheckpointDir + "/restore-%s" - CheckpointDataDirForRestoreFormat = CheckpointRestoreDirFormat + "/data" - CheckpointChecksumDirForRestoreFormat = CheckpointRestoreDirFormat + "/checksum" - CheckpointMetaPathForRestoreFormat = CheckpointRestoreDirFormat + "/checkpoint.meta" -) - -func getCheckpointMetaPathByName(taskName string) string { - return fmt.Sprintf(CheckpointMetaPathForRestoreFormat, taskName) -} - -func getCheckpointDataDirByName(taskName string) string { - return fmt.Sprintf(CheckpointDataDirForRestoreFormat, taskName) -} - -func getCheckpointChecksumDirByName(taskName string) string { - return fmt.Sprintf(CheckpointChecksumDirForRestoreFormat, taskName) -} - -func flushPositionForRestore(taskName string) flushPosition { - return flushPosition{ - CheckpointDataDir: getCheckpointDataDirByName(taskName), - CheckpointChecksumDir: getCheckpointChecksumDirByName(taskName), - } -} - func valueMarshalerForRestore(group *RangeGroup[RestoreKeyType, RestoreValueType]) ([]byte, error) { return json.Marshal(group) } @@ -68,13 +44,11 @@ func valueMarshalerForRestore(group *RangeGroup[RestoreKeyType, RestoreValueType // only for test func StartCheckpointRestoreRunnerForTest( ctx context.Context, - storage storage.ExternalStorage, - cipher *backuppb.CipherInfo, + se glue.Session, tick time.Duration, - taskName string, ) (*CheckpointRunner[RestoreKeyType, RestoreValueType], error) { runner := newCheckpointRunner[RestoreKeyType, RestoreValueType]( - ctx, storage, cipher, nil, 
flushPositionForRestore(taskName), valueMarshalerForRestore) + ctx, newTableCheckpointStorage(se, snapshotRestoreCheckpointDatabaseName), nil, nil, nil, flushPosition{}, valueMarshalerForRestore) runner.startCheckpointMainLoop(ctx, tick, tick, 0) return runner, nil @@ -82,12 +56,11 @@ func StartCheckpointRestoreRunnerForTest( func StartCheckpointRunnerForRestore( ctx context.Context, - storage storage.ExternalStorage, - cipher *backuppb.CipherInfo, + se glue.Session, taskName string, ) (*CheckpointRunner[RestoreKeyType, RestoreValueType], error) { runner := newCheckpointRunner[RestoreKeyType, RestoreValueType]( - ctx, storage, cipher, nil, flushPositionForRestore(taskName), valueMarshalerForRestore) + ctx, newTableCheckpointStorage(se, snapshotRestoreCheckpointDatabaseName), nil, nil, nil, flushPosition{}, valueMarshalerForRestore) // for restore, no need to set lock runner.startCheckpointMainLoop(ctx, defaultTickDurationForFlush, defaultTckDurationForChecksum, 0) @@ -108,24 +81,21 @@ func AppendRangesForRestore( }) } -// walk the whole checkpoint range files and retrieve the metadata of restored ranges +// load the whole checkpoint range data and retrieve the metadata of restored ranges // and return the total time cost in the past executions -func WalkCheckpointFileForRestore[K KeyType, V ValueType]( +func LoadCheckpointDataForSnapshotRestore[K KeyType, V ValueType]( ctx context.Context, - s storage.ExternalStorage, - cipher *backuppb.CipherInfo, - taskName string, + execCtx sqlexec.RestrictedSQLExecutor, fn func(K, V), ) (time.Duration, error) { - return walkCheckpointFile(ctx, s, cipher, getCheckpointDataDirByName(taskName), fn) + return selectCheckpointData(ctx, execCtx, snapshotRestoreCheckpointDatabaseName, fn) } func LoadCheckpointChecksumForRestore( ctx context.Context, - s storage.ExternalStorage, - taskName string, + execCtx sqlexec.RestrictedSQLExecutor, ) (map[int64]*ChecksumItem, time.Duration, error) { - return loadCheckpointChecksum(ctx, s, getCheckpointChecksumDirByName(taskName)) + return selectCheckpointChecksum(ctx, execCtx, logRestoreCheckpointDatabaseName) } type CheckpointMetadataForRestore struct { @@ -133,34 +103,34 @@ type CheckpointMetadataForRestore struct { GcRatio string `json:"gc-ratio,omitempty"` } -func LoadCheckpointMetadataForRestore( +func LoadCheckpointMetadataForSnapshotRestore( ctx context.Context, - s storage.ExternalStorage, - taskName string, + execCtx sqlexec.RestrictedSQLExecutor, ) (*CheckpointMetadataForRestore, error) { m := &CheckpointMetadataForRestore{} - err := loadCheckpointMeta(ctx, s, getCheckpointMetaPathByName(taskName), m) + err := selectCheckpointMeta(ctx, execCtx, snapshotRestoreCheckpointDatabaseName, checkpointMetaTableName, m) return m, err } -func SaveCheckpointMetadataForRestore( +func SaveCheckpointMetadataForSnapshotRestore( ctx context.Context, - s storage.ExternalStorage, + se glue.Session, meta *CheckpointMetadataForRestore, - taskName string, ) error { - return saveCheckpointMetadata(ctx, s, meta, getCheckpointMetaPathByName(taskName)) + err := initCheckpointTable(ctx, se, snapshotRestoreCheckpointDatabaseName, []string{checkpointDataTableName, checkpointChecksumTableName}) + if err != nil { + return errors.Trace(err) + } + return insertCheckpointMeta(ctx, se, snapshotRestoreCheckpointDatabaseName, checkpointMetaTableName, meta) } -func ExistsRestoreCheckpoint( +func ExistsSnapshotRestoreCheckpoint( ctx context.Context, - s storage.ExternalStorage, - taskName string, -) (bool, error) { - return s.FileExists(ctx, 
getCheckpointMetaPathByName(taskName)) + dom *domain.Domain, +) bool { + return dom.InfoSchema().TableExists(model.NewCIStr(snapshotRestoreCheckpointDatabaseName), model.NewCIStr(checkpointMetaTableName)) } -func RemoveCheckpointDataForRestore(ctx context.Context, s storage.ExternalStorage, taskName string) error { - prefix := fmt.Sprintf(CheckpointRestoreDirFormat, taskName) - return removeCheckpointData(ctx, s, prefix) +func RemoveCheckpointDataForSnapshotRestore(ctx context.Context, dom *domain.Domain, se glue.Session) error { + return dropCheckpointTables(ctx, dom, se, snapshotRestoreCheckpointDatabaseName, []string{checkpointDataTableName, checkpointChecksumTableName, checkpointMetaTableName}) } diff --git a/br/pkg/checkpoint/storage.go b/br/pkg/checkpoint/storage.go new file mode 100644 index 0000000000000..825700431cd82 --- /dev/null +++ b/br/pkg/checkpoint/storage.go @@ -0,0 +1,312 @@ +// Copyright 2024 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package checkpoint + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "time" + + "github.com/google/uuid" + "github.com/pingcap/errors" + "github.com/pingcap/log" + "github.com/pingcap/tidb/br/pkg/glue" + "github.com/pingcap/tidb/br/pkg/storage" + "github.com/pingcap/tidb/pkg/domain" + "github.com/pingcap/tidb/pkg/kv" + "github.com/pingcap/tidb/pkg/parser/model" + "github.com/pingcap/tidb/pkg/util/sqlexec" + "go.uber.org/zap" +) + +type checkpointStorage interface { + flushCheckpointData(ctx context.Context, fname string, data []byte) error + flushCheckpointChecksum(ctx context.Context, fname string, data []byte) error +} + +type externalCheckpointStorage struct { + storage storage.ExternalStorage +} + +func (s *externalCheckpointStorage) flushCheckpointData(ctx context.Context, fname string, data []byte) error { + return s.storage.WriteFile(ctx, fname, data) +} + +func (s *externalCheckpointStorage) flushCheckpointChecksum(ctx context.Context, fname string, data []byte) error { + return s.storage.WriteFile(ctx, fname, data) +} + +// Notice that: +// 1. the checkpoint table only records one task checkpoint. +// 2. BR regards the metadata table as a file so that it is not empty if the table exists. +// 3. BR regards the checkpoint table as a directory which is managed by metadata table. 
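
// Illustrative sketch (not part of this patch): the row layout behind the "directory
// level" tables declared below. Each flushed payload gets a fresh uuid and is cut into
// segments of at most CheckpointIdMapBlockSize bytes, so one logical checkpoint file
// becomes rows (uuid, 0, chunk0), (uuid, 1, chunk1), ... that readers re-concatenate in
// segment_id order. The helper name exampleRowLayout is hypothetical; it only prints
// what flushCheckpointData would insert.
func exampleRowLayout(data []byte) {
	id := uuid.New()
	_ = chunkInsertCheckpointData(data, func(segmentId uint64, chunk []byte) error {
		fmt.Printf("row: uuid=%x segment_id=%d bytes=%d\n", id[:], segmentId, len(chunk))
		return nil
	})
}
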
+const ( + logRestoreCheckpointDatabaseName string = "__TiDB_BR_Temporary_Log_Restore_Checkpoint" + snapshotRestoreCheckpointDatabaseName string = "__TiDB_BR_Temporary_Snapshot_Restore_Checkpoint" + + // directory level table + checkpointDataTableName string = "cpt_data" + checkpointChecksumTableName string = "cpt_checksum" + // file level table + checkpointMetaTableName string = "cpt_metadata" + checkpointTaskInfoTableName string = "cpt_taskinfo" + checkpointIngestTableName string = "cpt_ingest" + + createCheckpointTable string = ` + CREATE TABLE IF NOT EXISTS %n.%n ( + uuid varchar(32) NOT NULL, + segment_id BIGINT NOT NULL, + data BLOB(524288) NOT NULL, + update_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + PRIMARY KEY(uuid, segment_id));` + + insertCheckpointSQLTemplate string = ` + REPLACE INTO %s.%s + (uuid, segment_id, data) VALUES (%?, %?, %?);` + + selectCheckpointSQLTemplate string = ` + SELECT uuid, segment_id, data FROM %s.%s ORDER BY uuid, segment_id;` + + createCheckpointMetaTable string = ` + CREATE TABLE IF NOT EXISTS %n.%n ( + segment_id BIGINT NOT NULL, + data BLOB(524288) NOT NULL, + update_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + PRIMARY KEY(segment_id));` + + insertCheckpointMetaSQLTemplate string = ` + REPLACE INTO %n.%n (segment_id, data) VALUES (%?, %?);` + + selectCheckpointMetaSQLTemplate string = `SELECT segment_id, data FROM %n.%n;` +) + +const CheckpointIdMapBlockSize int = 524288 + +func chunkInsertCheckpointData(data []byte, fn func(segmentId uint64, chunk []byte) error) error { + for startIdx, segmentId := 0, uint64(0); startIdx < len(data); segmentId += 1 { + endIdx := startIdx + CheckpointIdMapBlockSize + if endIdx > len(data) { + endIdx = len(data) + } + if err := fn(segmentId, data[startIdx:endIdx]); err != nil { + return errors.Trace(err) + } + startIdx = endIdx + } + return nil +} + +func chunkInsertCheckpointSQLs(dbName, tableName string, data []byte) ([]string, [][]any) { + sqls := make([]string, 0, len(data)/CheckpointIdMapBlockSize+1) + argss := make([][]any, 0, len(data)/CheckpointIdMapBlockSize+1) + uuid := uuid.New() + _ = chunkInsertCheckpointData(data, func(segmentId uint64, chunk []byte) error { + sqls = append(sqls, fmt.Sprintf(insertCheckpointSQLTemplate, dbName, tableName)) + argss = append(argss, []any{uuid[:], segmentId, chunk}) + return nil + }) + return sqls, argss +} + +type tableCheckpointStorage struct { + se glue.Session + checkpointDBName string +} + +func (s *tableCheckpointStorage) flushCheckpointData(ctx context.Context, _ string, data []byte) error { + sqls, argss := chunkInsertCheckpointSQLs(s.checkpointDBName, checkpointDataTableName, data) + for i, sql := range sqls { + args := argss[i] + if err := s.se.ExecuteInternal(ctx, sql, args...); err != nil { + return errors.Trace(err) + } + } + return nil +} + +func (s *tableCheckpointStorage) flushCheckpointChecksum(ctx context.Context, _ string, data []byte) error { + sqls, argss := chunkInsertCheckpointSQLs(s.checkpointDBName, checkpointChecksumTableName, data) + for i, sql := range sqls { + args := argss[i] + if err := s.se.ExecuteInternal(ctx, sql, args...); err != nil { + return errors.Trace(err) + } + } + return nil +} + +func mergeSelectCheckpoint( + ctx context.Context, + execCtx sqlexec.RestrictedSQLExecutor, + dbName, tableName string, +) ([][]byte, error) { + rows, _, errSQL := execCtx.ExecRestrictedSQL( + kv.WithInternalSourceType(ctx, kv.InternalTxnBR), + nil, + fmt.Sprintf(selectCheckpointSQLTemplate, dbName, tableName), + ) + if errSQL != nil { + return nil, 
errors.Annotatef(errSQL, "failed to get checkpoint data from table %s.%s", dbName, tableName) + } + + var ( + retData [][]byte = make([][]byte, 0, len(rows)) + rowData []byte = nil + lastUUID []byte = nil + lastUUIDInvalid bool = false + nextSegmentID uint64 = 0 + ) + for _, row := range rows { + uuid, segment_id, data := row.GetBytes(0), row.GetUint64(1), row.GetBytes(2) + if len(uuid) == 0 { + log.Warn("get the empty uuid, but just skip it") + continue + } + if bytes.Equal(uuid, lastUUID) { + if lastUUIDInvalid { + continue + } + if nextSegmentID != segment_id { + lastUUIDInvalid = true + continue + } + rowData = append(rowData, data...) + nextSegmentID += 1 + } else { + if !lastUUIDInvalid && len(rowData) > 0 { + retData = append(retData, rowData) + } + rowData = nil + lastUUIDInvalid = false + nextSegmentID = 0 + lastUUID = uuid + } + } + return retData, nil +} + +func selectCheckpointData[K KeyType, V ValueType]( + ctx context.Context, + execCtx sqlexec.RestrictedSQLExecutor, + dbName string, + fn func(groupKey K, value V), +) (time.Duration, error) { + // records the total time cost in the past executions + var pastDureTime time.Duration = 0 + checkpointDatas, err := mergeSelectCheckpoint(ctx, execCtx, dbName, checkpointDataTableName) + if err != nil { + return pastDureTime, errors.Trace(err) + } + for _, content := range checkpointDatas { + if err := parseCheckpointData(content, &pastDureTime, nil, fn); err != nil { + return pastDureTime, errors.Trace(err) + } + } + return pastDureTime, nil +} + +func selectCheckpointChecksum( + ctx context.Context, + execCtx sqlexec.RestrictedSQLExecutor, + dbName string, +) (map[int64]*ChecksumItem, time.Duration, error) { + var pastDureTime time.Duration = 0 + checkpointChecksum := make(map[int64]*ChecksumItem) + checkpointChecksums, err := mergeSelectCheckpoint(ctx, execCtx, dbName, checkpointChecksumTableName) + if err != nil { + return checkpointChecksum, pastDureTime, errors.Trace(err) + } + for _, content := range checkpointChecksums { + if err := parseCheckpointChecksum(content, checkpointChecksum, &pastDureTime); err != nil { + return checkpointChecksum, pastDureTime, errors.Trace(err) + } + } + return checkpointChecksum, pastDureTime, nil +} + +func initCheckpointTable(ctx context.Context, se glue.Session, dbName string, checkpointTableNames []string) error { + if err := se.ExecuteInternal(ctx, "CREATE DATABASE %n IF NOT EXISTS;", dbName); err != nil { + return errors.Trace(err) + } + for _, tableName := range checkpointTableNames { + if err := se.ExecuteInternal(ctx, createCheckpointTable, dbName, tableName); err != nil { + return errors.Trace(err) + } + } + return nil +} + +func insertCheckpointMeta[T any](ctx context.Context, se glue.Session, dbName string, tableName string, meta *T) error { + data, err := json.Marshal(meta) + if err != nil { + return errors.Trace(err) + } + if err := se.ExecuteInternal(ctx, createCheckpointMetaTable, dbName, tableName); err != nil { + return errors.Trace(err) + } + err = chunkInsertCheckpointData(data, func(segmentId uint64, chunk []byte) error { + err := se.ExecuteInternal(ctx, insertCheckpointMetaSQLTemplate, dbName, tableName, segmentId, chunk) + return errors.Trace(err) + }) + return errors.Trace(err) +} + +func selectCheckpointMeta[T any](ctx context.Context, execCtx sqlexec.RestrictedSQLExecutor, dbName string, tableName string, meta *T) error { + rows, _, errSQL := execCtx.ExecRestrictedSQL( + kv.WithInternalSourceType(ctx, kv.InternalTxnBR), + nil, + 
fmt.Sprintf(selectCheckpointMetaSQLTemplate, dbName, tableName), + ) + if errSQL != nil { + return errors.Annotatef(errSQL, "failed to get checkpoint metadata from table %s.%s", dbName, tableName) + } + if len(rows) == 0 { + return errors.Errorf("get the empty checkpoint meta, the checkpoint is incomplete from table %s.%s", dbName, tableName) + } + + data := make([]byte, 0, len(rows)*CheckpointIdMapBlockSize) + for i, row := range rows { + segmentId, chunk := row.GetUint64(0), row.GetBytes(1) + if uint64(i) != segmentId { + return errors.Errorf("the checkpoint metadata is incomplete from table %s.%s at segment %d", dbName, tableName, segmentId) + } + data = append(data, chunk...) + } + err := json.Unmarshal(data, meta) + return errors.Trace(err) +} + +func dropCheckpointTables(ctx context.Context, dom *domain.Domain, se glue.Session, dbName string, tableNames []string) error { + for _, tableName := range tableNames { + if err := se.ExecuteInternal(ctx, "DROP TABLE IF EXISTS %n.%n;", dbName, tableName); err != nil { + return errors.Trace(err) + } + } + // check if any user table is created in the checkpoint database + tables, err := dom.InfoSchema().SchemaTableInfos(ctx, model.NewCIStr(dbName)) + if err != nil { + return errors.Trace(err) + } + if len(tables) > 0 { + log.Warn("user tables in the checkpoint database, skip drop the database", zap.String("db", dbName)) + return nil + } + if err := se.ExecuteInternal(ctx, "DROP DATABASE %n;", dbName); err != nil { + return errors.Trace(err) + } + return nil +} diff --git a/br/pkg/restore/log_client/client.go b/br/pkg/restore/log_client/client.go index 6c4666725ef4f..f3b1a396300a8 100644 --- a/br/pkg/restore/log_client/client.go +++ b/br/pkg/restore/log_client/client.go @@ -200,8 +200,8 @@ func (rc *LogClient) CleanUpKVFiles( return rc.fileImporter.ClearFiles(ctx, rc.pdClient, "v1") } -func (rc *LogClient) StartCheckpointRunnerForLogRestore(ctx context.Context, taskName string) (*checkpoint.CheckpointRunner[checkpoint.LogRestoreKeyType, checkpoint.LogRestoreValueType], error) { - runner, err := checkpoint.StartCheckpointRunnerForLogRestore(ctx, rc.storage, rc.cipher, taskName) +func (rc *LogClient) StartCheckpointRunnerForLogRestore(ctx context.Context) (*checkpoint.CheckpointRunner[checkpoint.LogRestoreKeyType, checkpoint.LogRestoreValueType], error) { + runner, err := checkpoint.StartCheckpointRunnerForLogRestore(ctx, rc.se) return runner, errors.Trace(err) } @@ -1206,18 +1206,19 @@ func (rc *LogClient) WrapLogFilesIterWithSplitHelper(logIter LogIter, rules map[ return NewLogFilesIterWithSplitHelper(logIter, rules, client, splitSize, splitKeys), nil } -func (rc *LogClient) generateKvFilesSkipMap(ctx context.Context, downstreamIdset map[int64]struct{}, taskName string) (*LogFilesSkipMap, error) { +func (rc *LogClient) generateKvFilesSkipMap(ctx context.Context, downstreamIdset map[int64]struct{}) (*LogFilesSkipMap, error) { skipMap := NewLogFilesSkipMap() - t, err := checkpoint.WalkCheckpointFileForRestore(ctx, rc.storage, rc.cipher, taskName, func(groupKey checkpoint.LogRestoreKeyType, off checkpoint.LogRestoreValueMarshaled) { - for tableID, foffs := range off.Foffs { - // filter out the checkpoint data of dropped table - if _, exists := downstreamIdset[tableID]; exists { - for _, foff := range foffs { - skipMap.Insert(groupKey, off.Goff, foff) + t, err := checkpoint.LoadCheckpointDataForSnapshotRestore( + ctx, rc.se.GetSessionCtx().GetRestrictedSQLExecutor(), func(groupKey checkpoint.LogRestoreKeyType, off 
checkpoint.LogRestoreValueMarshaled) { + for tableID, foffs := range off.Foffs { + // filter out the checkpoint data of dropped table + if _, exists := downstreamIdset[tableID]; exists { + for _, foff := range foffs { + skipMap.Insert(groupKey, off.Goff, foff) + } } } - } - }) + }) if err != nil { return nil, errors.Trace(err) } @@ -1229,11 +1230,10 @@ func (rc *LogClient) WrapLogFilesIterWithCheckpoint( ctx context.Context, logIter LogIter, downstreamIdset map[int64]struct{}, - taskName string, updateStats func(kvCount, size uint64), onProgress func(), ) (LogIter, error) { - skipMap, err := rc.generateKvFilesSkipMap(ctx, downstreamIdset, taskName) + skipMap, err := rc.generateKvFilesSkipMap(ctx, downstreamIdset) if err != nil { return nil, errors.Trace(err) } diff --git a/br/pkg/restore/snap_client/client.go b/br/pkg/restore/snap_client/client.go index be54507c23ed0..1c530680d5e3f 100644 --- a/br/pkg/restore/snap_client/client.go +++ b/br/pkg/restore/snap_client/client.go @@ -315,8 +315,9 @@ func (rc *SnapClient) InitCheckpoint( ) if !checkpointFirstRun { + execCtx := rc.db.Session().GetSessionCtx().GetRestrictedSQLExecutor() // load the checkpoint since this is not the first time to restore - meta, err := checkpoint.LoadCheckpointMetadataForRestore(ctx, s, taskName) + meta, err := checkpoint.LoadCheckpointMetadataForSnapshotRestore(ctx, execCtx) if err != nil { return checkpointSetWithTableID, nil, errors.Trace(err) } @@ -329,7 +330,7 @@ func (rc *SnapClient) InitCheckpoint( } // t1 is the latest time the checkpoint ranges persisted to the external storage. - t1, err := checkpoint.WalkCheckpointFileForRestore(ctx, s, rc.cipher, taskName, func(tableID int64, rangeKey checkpoint.RestoreValueType) { + t1, err := checkpoint.LoadCheckpointDataForSnapshotRestore(ctx, execCtx, func(tableID int64, rangeKey checkpoint.RestoreValueType) { checkpointSet, exists := checkpointSetWithTableID[tableID] if !exists { checkpointSet = make(map[string]struct{}) @@ -341,7 +342,7 @@ func (rc *SnapClient) InitCheckpoint( return checkpointSetWithTableID, nil, errors.Trace(err) } // t2 is the latest time the checkpoint checksum persisted to the external storage. 
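
// Hedged sketch of how the loaded checksum map is meant to be consumed later in the
// restore flow (that code is outside this patch): entries are keyed by table ID, so the
// checksum phase can reuse a result persisted by a previous run instead of recomputing
// it. `tbl`, `useRecordedChecksum` and `recomputeChecksum` are hypothetical names.
//
//	if item, ok := checkpointChecksum[tbl.Table.ID]; ok {
//		useRecordedChecksum(item) // already verified in a previous attempt
//	} else {
//		recomputeChecksum(tbl) // result is flushed by the checkpoint runner for the next retry
//	}
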
- checkpointChecksum, t2, err := checkpoint.LoadCheckpointChecksumForRestore(ctx, s, taskName) + checkpointChecksum, t2, err := checkpoint.LoadCheckpointChecksumForRestore(ctx, execCtx) if err != nil { return checkpointSetWithTableID, nil, errors.Trace(err) } @@ -359,12 +360,12 @@ func (rc *SnapClient) InitCheckpoint( if config != nil { meta.SchedulersConfig = &pdutil.ClusterConfig{Schedulers: config.Schedulers, ScheduleCfg: config.ScheduleCfg} } - if err = checkpoint.SaveCheckpointMetadataForRestore(ctx, s, meta, taskName); err != nil { + if err = checkpoint.SaveCheckpointMetadataForSnapshotRestore(ctx, rc.db.Session(), meta); err != nil { return checkpointSetWithTableID, nil, errors.Trace(err) } } - rc.checkpointRunner, err = checkpoint.StartCheckpointRunnerForRestore(ctx, s, rc.cipher, taskName) + rc.checkpointRunner, err = checkpoint.StartCheckpointRunnerForRestore(ctx, rc.db.Session(), taskName) return checkpointSetWithTableID, checkpointClusterConfig, errors.Trace(err) } diff --git a/br/pkg/task/restore.go b/br/pkg/task/restore.go index 3463df2cbba97..e0e3cbf74fc2f 100644 --- a/br/pkg/task/restore.go +++ b/br/pkg/task/restore.go @@ -890,10 +890,7 @@ func runRestore(c context.Context, g glue.Glue, cmdName string, cfg *RestoreConf checkpointTaskName = cfg.generateSnapshotRestoreTaskName(client.GetClusterID(ctx)) // if the checkpoint metadata exists in the external storage, the restore is not // for the first time. - existsCheckpointMetadata, err := checkpoint.ExistsRestoreCheckpoint(ctx, s, checkpointTaskName) - if err != nil { - return errors.Trace(err) - } + existsCheckpointMetadata := checkpoint.ExistsSnapshotRestoreCheckpoint(ctx, mgr.GetDomain()) checkpointFirstRun = !existsCheckpointMetadata } @@ -923,6 +920,12 @@ func runRestore(c context.Context, g glue.Glue, cmdName string, cfg *RestoreConf } } + // preallocate the table id, because any ddl job or database creation(include checkpoint) also allocates the global ID + err = client.AllocTableIDs(ctx, tables) + if err != nil { + return errors.Trace(err) + } + // reload or register the checkpoint var checkpointSetWithTableID map[int64]map[string]struct{} if cfg.UseCheckpoint { @@ -1012,12 +1015,6 @@ func runRestore(c context.Context, g glue.Glue, cmdName string, cfg *RestoreConf } } - // preallocate the table id, because any ddl job or database creation also allocates the global ID - err = client.AllocTableIDs(ctx, tables) - if err != nil { - return errors.Trace(err) - } - // execute DDL first err = client.ExecDDLs(ctx, ddlJobs) if err != nil { diff --git a/br/pkg/task/stream.go b/br/pkg/task/stream.go index 1704fbc832711..ae8b61f44b012 100644 --- a/br/pkg/task/stream.go +++ b/br/pkg/task/stream.go @@ -1324,7 +1324,7 @@ func restoreStream( } oldRatio = oldRatioFromCheckpoint - checkpointRunner, err = client.StartCheckpointRunnerForLogRestore(ctx, taskName) + checkpointRunner, err = client.StartCheckpointRunnerForLogRestore(ctx) if err != nil { return errors.Trace(err) } @@ -1421,7 +1421,7 @@ func restoreStream( checkpointTotalKVCount += kvCount checkpointTotalSize += size } - logFilesIter, err = client.WrapLogFilesIterWithCheckpoint(ctx, logFilesIter, downstreamIdset, taskName, updateStatsWithCheckpoint, p.Inc) + logFilesIter, err = client.WrapLogFilesIterWithCheckpoint(ctx, logFilesIter, downstreamIdset, updateStatsWithCheckpoint, p.Inc) if err != nil { return errors.Trace(err) } From d78cad13005f9624d65d5ec1f5214701c63aa57a Mon Sep 17 00:00:00 2001 From: Jianjun Liao Date: Wed, 11 Sep 2024 14:21:58 +0800 Subject: [PATCH 02/15] 
wip: log restore checkpoint into table Signed-off-by: Jianjun Liao --- br/pkg/checkpoint/restore.go | 8 ++++++-- br/pkg/checkpoint/storage.go | 8 ++++---- br/pkg/restore/snap_client/client.go | 25 ++++++++++++++++++++++--- br/pkg/task/restore.go | 2 +- 4 files changed, 33 insertions(+), 10 deletions(-) diff --git a/br/pkg/checkpoint/restore.go b/br/pkg/checkpoint/restore.go index 1f8cf40e13482..19b54aa56c17d 100644 --- a/br/pkg/checkpoint/restore.go +++ b/br/pkg/checkpoint/restore.go @@ -57,7 +57,6 @@ func StartCheckpointRestoreRunnerForTest( func StartCheckpointRunnerForRestore( ctx context.Context, se glue.Session, - taskName string, ) (*CheckpointRunner[RestoreKeyType, RestoreValueType], error) { runner := newCheckpointRunner[RestoreKeyType, RestoreValueType]( ctx, newTableCheckpointStorage(se, snapshotRestoreCheckpointDatabaseName), nil, nil, nil, flushPosition{}, valueMarshalerForRestore) @@ -99,8 +98,13 @@ func LoadCheckpointChecksumForRestore( } type CheckpointMetadataForRestore struct { + UpstreamClusterID uint64 `json:"upstream-cluster-id"` + RestoredTS uint64 `json:"restored-ts"` + // only for log restore + StartTS uint64 `json:"start-ts,omitempty"` + GcRatio string `json:"gc-ratio,omitempty"` + // only for snapshot restore SchedulersConfig *pdutil.ClusterConfig `json:"schedulers-config,omitempty"` - GcRatio string `json:"gc-ratio,omitempty"` } func LoadCheckpointMetadataForSnapshotRestore( diff --git a/br/pkg/checkpoint/storage.go b/br/pkg/checkpoint/storage.go index 825700431cd82..a1a21fa801bae 100644 --- a/br/pkg/checkpoint/storage.go +++ b/br/pkg/checkpoint/storage.go @@ -55,8 +55,8 @@ func (s *externalCheckpointStorage) flushCheckpointChecksum(ctx context.Context, // 2. BR regards the metadata table as a file so that it is not empty if the table exists. // 3. BR regards the checkpoint table as a directory which is managed by metadata table. 
const ( - logRestoreCheckpointDatabaseName string = "__TiDB_BR_Temporary_Log_Restore_Checkpoint" - snapshotRestoreCheckpointDatabaseName string = "__TiDB_BR_Temporary_Snapshot_Restore_Checkpoint" + LogRestoreCheckpointDatabaseName string = "__TiDB_BR_Temporary_Log_Restore_Checkpoint" + SnapshotRestoreCheckpointDatabaseName string = "__TiDB_BR_Temporary_Snapshot_Restore_Checkpoint" // directory level table checkpointDataTableName string = "cpt_data" @@ -67,7 +67,7 @@ const ( checkpointIngestTableName string = "cpt_ingest" createCheckpointTable string = ` - CREATE TABLE IF NOT EXISTS %n.%n ( + CREATE TABLE %n.%n ( uuid varchar(32) NOT NULL, segment_id BIGINT NOT NULL, data BLOB(524288) NOT NULL, @@ -82,7 +82,7 @@ const ( SELECT uuid, segment_id, data FROM %s.%s ORDER BY uuid, segment_id;` createCheckpointMetaTable string = ` - CREATE TABLE IF NOT EXISTS %n.%n ( + CREATE TABLE %n.%n ( segment_id BIGINT NOT NULL, data BLOB(524288) NOT NULL, update_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP, diff --git a/br/pkg/restore/snap_client/client.go b/br/pkg/restore/snap_client/client.go index 1c530680d5e3f..0f6f0570464a8 100644 --- a/br/pkg/restore/snap_client/client.go +++ b/br/pkg/restore/snap_client/client.go @@ -301,7 +301,6 @@ func (rc *SnapClient) AllocTableIDs(ctx context.Context, tables []*metautil.Tabl func (rc *SnapClient) InitCheckpoint( ctx context.Context, s storage.ExternalStorage, - taskName string, config *pdutil.ClusterConfig, checkpointFirstRun bool, ) (map[int64]map[string]struct{}, *pdutil.ClusterConfig, error) { @@ -322,6 +321,23 @@ func (rc *SnapClient) InitCheckpoint( return checkpointSetWithTableID, nil, errors.Trace(err) } + if meta.UpstreamClusterID != rc.backupMeta.ClusterId { + return checkpointSetWithTableID, nil, errors.Errorf( + "The upstream cluster id[%d] of the current snapshot restore does not match that[%d] recorded in checkpoint. "+ + "Perhaps you should specify the last full backup storage instead, "+ + "or just clean the checkpoint database[%s] if the cluster has been cleaned up.", + rc.backupMeta.ClusterId, meta.UpstreamClusterID, checkpoint.SnapshotRestoreCheckpointDatabaseName) + } + + if meta.RestoredTS != rc.backupMeta.EndVersion { + return checkpointSetWithTableID, nil, errors.Errorf( + "The current snapshot restore want to restore cluster to the BackupTS[%d], which is different from that[%d] recorded in checkpoint. "+ + "Perhaps you should specify the last full backup storage instead, "+ + "or just clean the checkpoint database[%s] if the cluster has been cleaned up.", + rc.backupMeta.EndVersion, meta.RestoredTS, checkpoint.SnapshotRestoreCheckpointDatabaseName, + ) + } + // The schedulers config is nil, so the restore-schedulers operation is just nil. // Then the undo function would use the result undo of `remove schedulers` operation, // instead of that in checkpoint meta. @@ -355,7 +371,10 @@ func (rc *SnapClient) InitCheckpoint( } } else { // initialize the checkpoint metadata since it is the first time to restore. 
- meta := &checkpoint.CheckpointMetadataForRestore{} + meta := &checkpoint.CheckpointMetadataForRestore{ + UpstreamClusterID: rc.backupMeta.ClusterId, + RestoredTS: rc.backupMeta.EndVersion, + } // a nil config means undo function if config != nil { meta.SchedulersConfig = &pdutil.ClusterConfig{Schedulers: config.Schedulers, ScheduleCfg: config.ScheduleCfg} @@ -365,7 +384,7 @@ func (rc *SnapClient) InitCheckpoint( } } - rc.checkpointRunner, err = checkpoint.StartCheckpointRunnerForRestore(ctx, rc.db.Session(), taskName) + rc.checkpointRunner, err = checkpoint.StartCheckpointRunnerForRestore(ctx, rc.db.Session()) return checkpointSetWithTableID, checkpointClusterConfig, errors.Trace(err) } diff --git a/br/pkg/task/restore.go b/br/pkg/task/restore.go index e0e3cbf74fc2f..fc088af3ddcdc 100644 --- a/br/pkg/task/restore.go +++ b/br/pkg/task/restore.go @@ -888,7 +888,7 @@ func runRestore(c context.Context, g glue.Glue, cmdName string, cfg *RestoreConf var checkpointFirstRun bool = true if cfg.UseCheckpoint { checkpointTaskName = cfg.generateSnapshotRestoreTaskName(client.GetClusterID(ctx)) - // if the checkpoint metadata exists in the external storage, the restore is not + // if the checkpoint metadata exists in the checkpoint storage, the restore is not // for the first time. existsCheckpointMetadata := checkpoint.ExistsSnapshotRestoreCheckpoint(ctx, mgr.GetDomain()) checkpointFirstRun = !existsCheckpointMetadata From 5e6903b32c5abbf944f57d7410d05d24a277df7c Mon Sep 17 00:00:00 2001 From: Jianjun Liao Date: Wed, 11 Sep 2024 18:42:59 +0800 Subject: [PATCH 03/15] wip: log restore checkpoint into table Signed-off-by: Jianjun Liao --- br/pkg/checkpoint/log_restore.go | 157 +++++++++++++++++++-------- br/pkg/checkpoint/restore.go | 38 +++---- br/pkg/checkpoint/storage.go | 2 +- br/pkg/restore/log_client/client.go | 74 ++++++------- br/pkg/restore/snap_client/client.go | 2 +- br/pkg/task/restore.go | 5 +- br/pkg/task/stream.go | 112 +++++++++---------- br/pkg/utils/db.go | 5 +- 8 files changed, 219 insertions(+), 176 deletions(-) diff --git a/br/pkg/checkpoint/log_restore.go b/br/pkg/checkpoint/log_restore.go index 420486be54346..efc9648603474 100644 --- a/br/pkg/checkpoint/log_restore.go +++ b/br/pkg/checkpoint/log_restore.go @@ -24,6 +24,7 @@ import ( "github.com/pingcap/log" "github.com/pingcap/tidb/br/pkg/glue" "github.com/pingcap/tidb/br/pkg/storage" + "github.com/pingcap/tidb/pkg/domain" "github.com/pingcap/tidb/pkg/meta/model" pmodel "github.com/pingcap/tidb/pkg/parser/model" "github.com/pingcap/tidb/pkg/util/sqlexec" @@ -114,7 +115,7 @@ func StartCheckpointLogRestoreRunnerForTest( tick time.Duration, ) (*CheckpointRunner[LogRestoreKeyType, LogRestoreValueType], error) { runner := newCheckpointRunner[LogRestoreKeyType, LogRestoreValueType]( - ctx, newTableCheckpointStorage(se, logRestoreCheckpointDatabaseName), nil, nil, nil, flushPosition{}, valueMarshalerForLogRestore) + ctx, newTableCheckpointStorage(se, LogRestoreCheckpointDatabaseName), nil, nil, nil, flushPosition{}, valueMarshalerForLogRestore) runner.startCheckpointMainLoop(ctx, tick, tick, 0) return runner, nil @@ -125,7 +126,7 @@ func StartCheckpointRunnerForLogRestore( se glue.Session, ) (*CheckpointRunner[LogRestoreKeyType, LogRestoreValueType], error) { runner := newCheckpointRunner[LogRestoreKeyType, LogRestoreValueType]( - ctx, newTableCheckpointStorage(se, logRestoreCheckpointDatabaseName), nil, nil, nil, flushPosition{}, valueMarshalerForLogRestore) + ctx, newTableCheckpointStorage(se, LogRestoreCheckpointDatabaseName), nil, 
nil, nil, flushPosition{}, valueMarshalerForLogRestore) // for restore, no need to set lock runner.startCheckpointMainLoop(ctx, defaultTickDurationForFlush, defaultTckDurationForChecksum, 0) @@ -159,7 +160,45 @@ func LoadCheckpointDataForLogRestore[K KeyType, V ValueType]( execCtx sqlexec.RestrictedSQLExecutor, fn func(K, V), ) (time.Duration, error) { - return selectCheckpointData(ctx, execCtx, logRestoreCheckpointDatabaseName, fn) + return selectCheckpointData(ctx, execCtx, LogRestoreCheckpointDatabaseName, fn) +} + +type CheckpointMetadataForLogRestore struct { + UpstreamClusterID uint64 `json:"upstream-cluster-id"` + RestoredTS uint64 `json:"restored-ts"` + StartTS uint64 `json:"start-ts"` + RewriteTS uint64 `json:"rewrite-ts"` + GcRatio string `json:"gc-ratio"` + // tiflash recorder items with snapshot restore records + TiFlashItems map[int64]model.TiFlashReplicaInfo `json:"tiflash-recorder,omitempty"` +} + +func LoadCheckpointMetadataForLogRestore( + ctx context.Context, + execCtx sqlexec.RestrictedSQLExecutor, +) (*CheckpointMetadataForLogRestore, error) { + m := &CheckpointMetadataForLogRestore{} + err := selectCheckpointMeta(ctx, execCtx, LogRestoreCheckpointDatabaseName, checkpointMetaTableName, m) + return m, err +} + +func SaveCheckpointMetadataForLogRestore( + ctx context.Context, + se glue.Session, + meta *CheckpointMetadataForLogRestore, +) error { + err := initCheckpointTable(ctx, se, LogRestoreCheckpointDatabaseName, []string{checkpointDataTableName, checkpointChecksumTableName}) + if err != nil { + return errors.Trace(err) + } + return insertCheckpointMeta(ctx, se, LogRestoreCheckpointDatabaseName, checkpointMetaTableName, meta) +} + +func ExistsLogRestoreCheckpointMetadata( + ctx context.Context, + dom *domain.Domain, +) bool { + return dom.InfoSchema().TableExists(pmodel.NewCIStr(LogRestoreCheckpointDatabaseName), pmodel.NewCIStr(checkpointMetaTableName)) } // A progress type for snapshot + log restore. @@ -193,60 +232,72 @@ const ( InLogRestoreAndIdMapPersist ) -// CheckpointTaskInfo is unique information within the same cluster id. It represents the last -// restore task executed for this cluster. 
-type CheckpointTaskInfoForLogRestore struct { - // the progress for this task +type CheckpointProgress struct { Progress RestoreProgress `json:"progress"` - // the upstream cluster id - UpstreamClusterID uint64 `json:"upstream-cluster-id"` - // a task marker to distinguish the different tasks - StartTS uint64 `json:"start-ts"` - RestoreTS uint64 `json:"restore-ts"` - // updated in the progress of `InLogRestoreAndIdMapPersist` - RewriteTS uint64 `json:"rewrite-ts"` - // tiflash recorder items with snapshot restore records - TiFlashItems map[int64]model.TiFlashReplicaInfo `json:"tiflash-recorder,omitempty"` } -func LoadCheckpointTaskInfoForLogRestore( +func LoadCheckpointProgress( ctx context.Context, execCtx sqlexec.RestrictedSQLExecutor, -) (*CheckpointTaskInfoForLogRestore, error) { - m := &CheckpointTaskInfoForLogRestore{} - err := selectCheckpointMeta(ctx, execCtx, logRestoreCheckpointDatabaseName, checkpointMetaTypeTaskInfo, m) - return m, err +) (*CheckpointProgress, error) { + m := &CheckpointProgress{} + err := selectCheckpointMeta(ctx, execCtx, LogRestoreCheckpointDatabaseName, checkpointProgressTableName, m) + return m, errors.Trace(err) } -func SaveCheckpointTaskInfoForLogRestore( +func SaveCheckpointProgress( ctx context.Context, se glue.Session, - meta *CheckpointTaskInfoForLogRestore, + meta *CheckpointProgress, ) error { - return insertCheckpointMeta(ctx, se, logRestoreCheckpointDatabaseName, checkpointMetaTypeTaskInfo, meta) + return insertCheckpointMeta(ctx, se, LogRestoreCheckpointDatabaseName, checkpointProgressTableName, meta) } -func ExistsCheckpointTaskInfo( +func ExistsCheckpointProgress( ctx context.Context, - s storage.ExternalStorage, - clusterID uint64, -) (bool, error) { - return s.FileExists(ctx, getCheckpointTaskInfoPathByID(clusterID)) + dom *domain.Domain, +) bool { + return dom.InfoSchema().TableExists(pmodel.NewCIStr(LogRestoreCheckpointDatabaseName), pmodel.NewCIStr(checkpointProgressTableName)) } -func removeCheckpointTaskInfoForLogRestore(ctx context.Context, s storage.ExternalStorage, clusterID uint64) error { - fileName := getCheckpointTaskInfoPathByID(clusterID) - exists, err := s.FileExists(ctx, fileName) - if err != nil { - return errors.Trace(err) - } +// CheckpointTaskInfo is unique information within the same cluster id. It represents the last +// restore task executed for this cluster. 
+type CheckpointTaskInfoForLogRestore struct { + Metadata *CheckpointMetadataForLogRestore + // the progress for this task + Progress RestoreProgress +} - if !exists { - log.Warn("the task info file doesn't exist", zap.String("file", fileName)) - return nil +func TryToGetCheckpointTaskInfo( + ctx context.Context, + dom *domain.Domain, + execCtx sqlexec.RestrictedSQLExecutor, +) (*CheckpointTaskInfoForLogRestore, error) { + var ( + metadata *CheckpointMetadataForLogRestore + progress RestoreProgress + err error + ) + // get the progress + if ExistsCheckpointProgress(ctx, dom) { + checkpointProgress, err := LoadCheckpointProgress(ctx, execCtx) + if err != nil { + return nil, errors.Trace(err) + } + progress = checkpointProgress.Progress + } + // get the checkpoint metadata + if ExistsLogRestoreCheckpointMetadata(ctx, dom) { + metadata, err = LoadCheckpointMetadataForLogRestore(ctx, execCtx) + if err != nil { + return nil, errors.Trace(err) + } } - return s.DeleteFile(ctx, fileName) + return &CheckpointTaskInfoForLogRestore{ + Metadata: metadata, + Progress: progress, + }, nil } type CheckpointIngestIndexRepairSQL struct { @@ -272,21 +323,31 @@ func LoadCheckpointIngestIndexRepairSQLs( return m, err } -func ExistsCheckpointIngestIndexRepairSQLs( - ctx context.Context, - s storage.ExternalStorage, - taskName string, -) (bool, error) { - return s.FileExists(ctx, getCheckpointIngestIndexRepairPathByTaskName(taskName)) +func ExistsCheckpointIngestIndexRepairSQLs(ctx context.Context, dom *domain.Domain) bool { + return dom.InfoSchema().TableExists(pmodel.NewCIStr(LogRestoreCheckpointDatabaseName), pmodel.NewCIStr(checkpointIngestTableName)) } func SaveCheckpointIngestIndexRepairSQLs( ctx context.Context, - s storage.ExternalStorage, + se glue.Session, meta *CheckpointIngestIndexRepairSQLs, - taskName string, ) error { - return saveCheckpointMetadata(ctx, s, meta, getCheckpointIngestIndexRepairPathByTaskName(taskName)) + return insertCheckpointMeta(ctx, se, LogRestoreCheckpointDatabaseName, checkpointIngestTableName, meta) +} + +func removeCheckpointTaskInfoForLogRestore(ctx context.Context, s storage.ExternalStorage, clusterID uint64) error { + fileName := getCheckpointTaskInfoPathByID(clusterID) + exists, err := s.FileExists(ctx, fileName) + if err != nil { + return errors.Trace(err) + } + + if !exists { + log.Warn("the task info file doesn't exist", zap.String("file", fileName)) + return nil + } + + return s.DeleteFile(ctx, fileName) } func RemoveCheckpointDataForLogRestore( diff --git a/br/pkg/checkpoint/restore.go b/br/pkg/checkpoint/restore.go index 19b54aa56c17d..11c35503b2bf2 100644 --- a/br/pkg/checkpoint/restore.go +++ b/br/pkg/checkpoint/restore.go @@ -23,7 +23,7 @@ import ( "github.com/pingcap/tidb/br/pkg/glue" "github.com/pingcap/tidb/br/pkg/pdutil" "github.com/pingcap/tidb/pkg/domain" - "github.com/pingcap/tidb/pkg/parser/model" + pmodel "github.com/pingcap/tidb/pkg/parser/model" "github.com/pingcap/tidb/pkg/util/sqlexec" ) @@ -48,7 +48,7 @@ func StartCheckpointRestoreRunnerForTest( tick time.Duration, ) (*CheckpointRunner[RestoreKeyType, RestoreValueType], error) { runner := newCheckpointRunner[RestoreKeyType, RestoreValueType]( - ctx, newTableCheckpointStorage(se, snapshotRestoreCheckpointDatabaseName), nil, nil, nil, flushPosition{}, valueMarshalerForRestore) + ctx, newTableCheckpointStorage(se, SnapshotRestoreCheckpointDatabaseName), nil, nil, nil, flushPosition{}, valueMarshalerForRestore) runner.startCheckpointMainLoop(ctx, tick, tick, 0) return runner, nil @@ -59,7 +59,7 @@ 
func StartCheckpointRunnerForRestore( se glue.Session, ) (*CheckpointRunner[RestoreKeyType, RestoreValueType], error) { runner := newCheckpointRunner[RestoreKeyType, RestoreValueType]( - ctx, newTableCheckpointStorage(se, snapshotRestoreCheckpointDatabaseName), nil, nil, nil, flushPosition{}, valueMarshalerForRestore) + ctx, newTableCheckpointStorage(se, SnapshotRestoreCheckpointDatabaseName), nil, nil, nil, flushPosition{}, valueMarshalerForRestore) // for restore, no need to set lock runner.startCheckpointMainLoop(ctx, defaultTickDurationForFlush, defaultTckDurationForChecksum, 0) @@ -87,54 +87,50 @@ func LoadCheckpointDataForSnapshotRestore[K KeyType, V ValueType]( execCtx sqlexec.RestrictedSQLExecutor, fn func(K, V), ) (time.Duration, error) { - return selectCheckpointData(ctx, execCtx, snapshotRestoreCheckpointDatabaseName, fn) + return selectCheckpointData(ctx, execCtx, SnapshotRestoreCheckpointDatabaseName, fn) } func LoadCheckpointChecksumForRestore( ctx context.Context, execCtx sqlexec.RestrictedSQLExecutor, ) (map[int64]*ChecksumItem, time.Duration, error) { - return selectCheckpointChecksum(ctx, execCtx, logRestoreCheckpointDatabaseName) + return selectCheckpointChecksum(ctx, execCtx, LogRestoreCheckpointDatabaseName) } -type CheckpointMetadataForRestore struct { - UpstreamClusterID uint64 `json:"upstream-cluster-id"` - RestoredTS uint64 `json:"restored-ts"` - // only for log restore - StartTS uint64 `json:"start-ts,omitempty"` - GcRatio string `json:"gc-ratio,omitempty"` - // only for snapshot restore - SchedulersConfig *pdutil.ClusterConfig `json:"schedulers-config,omitempty"` +type CheckpointMetadataForSnapshotRestore struct { + UpstreamClusterID uint64 `json:"upstream-cluster-id"` + RestoredTS uint64 `json:"restored-ts"` + SchedulersConfig *pdutil.ClusterConfig `json:"schedulers-config"` } func LoadCheckpointMetadataForSnapshotRestore( ctx context.Context, execCtx sqlexec.RestrictedSQLExecutor, -) (*CheckpointMetadataForRestore, error) { - m := &CheckpointMetadataForRestore{} - err := selectCheckpointMeta(ctx, execCtx, snapshotRestoreCheckpointDatabaseName, checkpointMetaTableName, m) +) (*CheckpointMetadataForSnapshotRestore, error) { + m := &CheckpointMetadataForSnapshotRestore{} + err := selectCheckpointMeta(ctx, execCtx, SnapshotRestoreCheckpointDatabaseName, checkpointMetaTableName, m) return m, err } func SaveCheckpointMetadataForSnapshotRestore( ctx context.Context, se glue.Session, - meta *CheckpointMetadataForRestore, + meta *CheckpointMetadataForSnapshotRestore, ) error { - err := initCheckpointTable(ctx, se, snapshotRestoreCheckpointDatabaseName, []string{checkpointDataTableName, checkpointChecksumTableName}) + err := initCheckpointTable(ctx, se, SnapshotRestoreCheckpointDatabaseName, []string{checkpointDataTableName, checkpointChecksumTableName}) if err != nil { return errors.Trace(err) } - return insertCheckpointMeta(ctx, se, snapshotRestoreCheckpointDatabaseName, checkpointMetaTableName, meta) + return insertCheckpointMeta(ctx, se, SnapshotRestoreCheckpointDatabaseName, checkpointMetaTableName, meta) } func ExistsSnapshotRestoreCheckpoint( ctx context.Context, dom *domain.Domain, ) bool { - return dom.InfoSchema().TableExists(model.NewCIStr(snapshotRestoreCheckpointDatabaseName), model.NewCIStr(checkpointMetaTableName)) + return dom.InfoSchema().TableExists(pmodel.NewCIStr(SnapshotRestoreCheckpointDatabaseName), pmodel.NewCIStr(checkpointMetaTableName)) } func RemoveCheckpointDataForSnapshotRestore(ctx context.Context, dom *domain.Domain, se glue.Session) error 
{ - return dropCheckpointTables(ctx, dom, se, snapshotRestoreCheckpointDatabaseName, []string{checkpointDataTableName, checkpointChecksumTableName, checkpointMetaTableName}) + return dropCheckpointTables(ctx, dom, se, SnapshotRestoreCheckpointDatabaseName, []string{checkpointDataTableName, checkpointChecksumTableName, checkpointMetaTableName}) } diff --git a/br/pkg/checkpoint/storage.go b/br/pkg/checkpoint/storage.go index a1a21fa801bae..e0b32f78fb848 100644 --- a/br/pkg/checkpoint/storage.go +++ b/br/pkg/checkpoint/storage.go @@ -63,7 +63,7 @@ const ( checkpointChecksumTableName string = "cpt_checksum" // file level table checkpointMetaTableName string = "cpt_metadata" - checkpointTaskInfoTableName string = "cpt_taskinfo" + checkpointProgressTableName string = "cpt_progress" checkpointIngestTableName string = "cpt_ingest" createCheckpointTable string = ` diff --git a/br/pkg/restore/log_client/client.go b/br/pkg/restore/log_client/client.go index f3b1a396300a8..879f78fc0ea9d 100644 --- a/br/pkg/restore/log_client/client.go +++ b/br/pkg/restore/log_client/client.go @@ -176,8 +176,12 @@ func (rc *LogClient) SetStorage(ctx context.Context, backend *backuppb.StorageBa return nil } -func (rc *LogClient) SetCurrentTS(ts uint64) { +func (rc *LogClient) SetCurrentTS(ts uint64) error { + if ts == 0 { + return errors.Errorf("set rewrite ts to an invalid ts", zap.Uint64("ts", ts)) + } rc.currentTS = ts + return nil } // GetClusterID gets the cluster id from down-stream cluster. @@ -238,36 +242,38 @@ func (rc *LogClient) InitClients(ctx context.Context, backend *backuppb.StorageB rc.fileImporter = NewLogFileImporter(metaClient, importCli, backend) } -func (rc *LogClient) InitCheckpointMetadataForLogRestore(ctx context.Context, taskName string, gcRatio string) (string, error) { +func (rc *LogClient) InitCheckpointMetadataForLogRestore(ctx context.Context, gcRatio string, tiflashRecorder *tiflashrec.TiFlashRecorder) (string, error) { rc.useCheckpoint = true - // it shows that the user has modified gc-ratio, if `gcRatio` doesn't equal to "1.1". - // update the `gcRatio` for checkpoint metadata. - if gcRatio == utils.DefaultGcRatioVal { - // if the checkpoint metadata exists in the external storage, the restore is not - // for the first time. - exists, err := checkpoint.ExistsRestoreCheckpoint(ctx, rc.storage, taskName) + // if the checkpoint metadata exists in the external storage, the restore is not + // for the first time. + if checkpoint.ExistsLogRestoreCheckpointMetadata(ctx, rc.dom) { + // load the checkpoint since this is not the first time to restore + meta, err := checkpoint.LoadCheckpointMetadataForLogRestore(ctx, rc.se.GetSessionCtx().GetRestrictedSQLExecutor()) if err != nil { return "", errors.Trace(err) } - if exists { - // load the checkpoint since this is not the first time to restore - meta, err := checkpoint.LoadCheckpointMetadataForRestore(ctx, rc.storage, taskName) - if err != nil { - return "", errors.Trace(err) - } - - log.Info("reuse gc ratio from checkpoint metadata", zap.String("gc-ratio", gcRatio)) - return meta.GcRatio, nil - } + log.Info("reuse gc ratio from checkpoint metadata", zap.String("gc-ratio", gcRatio)) + return meta.GcRatio, nil } // initialize the checkpoint metadata since it is the first time to restore. 
- log.Info("save gc ratio into checkpoint metadata", zap.String("gc-ratio", gcRatio)) - if err := checkpoint.SaveCheckpointMetadataForRestore(ctx, rc.storage, &checkpoint.CheckpointMetadataForRestore{ - GcRatio: gcRatio, - }, taskName); err != nil { + var items map[int64]model.TiFlashReplicaInfo + if tiflashRecorder != nil { + items = tiflashRecorder.GetItems() + } + log.Info("save gc ratio into checkpoint metadata", + zap.Uint64("start-ts", rc.startTS), zap.Uint64("restored-ts", rc.restoreTS), zap.Uint64("rewrite-ts", rc.currentTS), + zap.String("gc-ratio", gcRatio), zap.Int("tiflash-item-count", len(items))) + if err := checkpoint.SaveCheckpointMetadataForLogRestore(ctx, rc.se, &checkpoint.CheckpointMetadataForLogRestore{ + UpstreamClusterID: rc.UpstreamClusterID, + RestoredTS: rc.restoreTS, + StartTS: rc.startTS, + RewriteTS: rc.currentTS, + GcRatio: gcRatio, + TiFlashItems: items, + }); err != nil { return gcRatio, errors.Trace(err) } @@ -1261,11 +1267,7 @@ func (rc *LogClient) generateRepairIngestIndexSQLs( ) ([]checkpoint.CheckpointIngestIndexRepairSQL, bool, error) { var sqls []checkpoint.CheckpointIngestIndexRepairSQL if rc.useCheckpoint { - exists, err := checkpoint.ExistsCheckpointIngestIndexRepairSQLs(ctx, rc.storage, taskName) - if err != nil { - return sqls, false, errors.Trace(err) - } - if exists { + if checkpoint.ExistsCheckpointIngestIndexRepairSQLs(ctx, rc.dom) { checkpointSQLs, err := checkpoint.LoadCheckpointIngestIndexRepairSQLs(ctx, rc.storage, taskName) if err != nil { return sqls, false, errors.Trace(err) @@ -1331,9 +1333,9 @@ func (rc *LogClient) generateRepairIngestIndexSQLs( } if rc.useCheckpoint && len(sqls) > 0 { - if err := checkpoint.SaveCheckpointIngestIndexRepairSQLs(ctx, rc.storage, &checkpoint.CheckpointIngestIndexRepairSQLs{ + if err := checkpoint.SaveCheckpointIngestIndexRepairSQLs(ctx, rc.se, &checkpoint.CheckpointIngestIndexRepairSQLs{ SQLs: sqls, - }, taskName); err != nil { + }); err != nil { return sqls, false, errors.Trace(err) } } @@ -1510,18 +1512,10 @@ func (rc *LogClient) SaveIDMap( return errors.Trace(err) } if rc.useCheckpoint { - var items map[int64]model.TiFlashReplicaInfo - if sr.TiflashRecorder != nil { - items = sr.TiflashRecorder.GetItems() - } log.Info("save checkpoint task info with InLogRestoreAndIdMapPersist status") - if err := checkpoint.SaveCheckpointTaskInfoForLogRestore(ctx, rc.storage, &checkpoint.CheckpointTaskInfoForLogRestore{ - Progress: checkpoint.InLogRestoreAndIdMapPersist, - StartTS: rc.startTS, - RestoreTS: rc.restoreTS, - RewriteTS: rc.currentTS, - TiFlashItems: items, - }, rc.GetClusterID(ctx)); err != nil { + if err := checkpoint.SaveCheckpointProgress(ctx, rc.se, &checkpoint.CheckpointProgress{ + Progress: checkpoint.InLogRestoreAndIdMapPersist, + }); err != nil { return errors.Trace(err) } } diff --git a/br/pkg/restore/snap_client/client.go b/br/pkg/restore/snap_client/client.go index 0f6f0570464a8..0c8ebec216223 100644 --- a/br/pkg/restore/snap_client/client.go +++ b/br/pkg/restore/snap_client/client.go @@ -371,7 +371,7 @@ func (rc *SnapClient) InitCheckpoint( } } else { // initialize the checkpoint metadata since it is the first time to restore. 
- meta := &checkpoint.CheckpointMetadataForRestore{ + meta := &checkpoint.CheckpointMetadataForSnapshotRestore{ UpstreamClusterID: rc.backupMeta.ClusterId, RestoredTS: rc.backupMeta.EndVersion, } diff --git a/br/pkg/task/restore.go b/br/pkg/task/restore.go index fc088af3ddcdc..72186b74bfa90 100644 --- a/br/pkg/task/restore.go +++ b/br/pkg/task/restore.go @@ -884,10 +884,9 @@ func runRestore(c context.Context, g glue.Glue, cmdName string, cfg *RestoreConf log.Info("finish removing pd scheduler") }() - var checkpointTaskName string var checkpointFirstRun bool = true if cfg.UseCheckpoint { - checkpointTaskName = cfg.generateSnapshotRestoreTaskName(client.GetClusterID(ctx)) + _ = cfg.generateSnapshotRestoreTaskName(client.GetClusterID(ctx)) // if the checkpoint metadata exists in the checkpoint storage, the restore is not // for the first time. existsCheckpointMetadata := checkpoint.ExistsSnapshotRestoreCheckpoint(ctx, mgr.GetDomain()) @@ -929,7 +928,7 @@ func runRestore(c context.Context, g glue.Glue, cmdName string, cfg *RestoreConf // reload or register the checkpoint var checkpointSetWithTableID map[int64]map[string]struct{} if cfg.UseCheckpoint { - sets, restoreSchedulersConfigFromCheckpoint, err := client.InitCheckpoint(ctx, s, checkpointTaskName, schedulersConfig, checkpointFirstRun) + sets, restoreSchedulersConfigFromCheckpoint, err := client.InitCheckpoint(ctx, s, schedulersConfig, checkpointFirstRun) if err != nil { return errors.Trace(err) } diff --git a/br/pkg/task/stream.go b/br/pkg/task/stream.go index ae8b61f44b012..274bc6f8c7433 100644 --- a/br/pkg/task/stream.go +++ b/br/pkg/task/stream.go @@ -470,18 +470,11 @@ func KeepGcDisabled(g glue.Glue, store kv.Storage) (RestoreFunc, string, error) return nil, "", errors.Trace(err) } - newRatio := "-1.0" - err = utils.SetGcRatio(execCtx, newRatio) + err = utils.SetGcRatio(execCtx, utils.DisabledGcRatioVal) if err != nil { return nil, "", errors.Trace(err) } - // If the oldRatio is negative, which is not normal status. - // It should set default value "1.1" after PiTR finished. - if strings.HasPrefix(oldRatio, "-") { - oldRatio = utils.DefaultGcRatioVal - } - return func(ratio string) error { return utils.SetGcRatio(execCtx, ratio) }, oldRatio, nil @@ -1194,12 +1187,12 @@ func RunStreamRestore( if _, err := glue.GetConsole(g).Out().Write(skipMsg); err != nil { return errors.Trace(err) } - if checkInfo.CheckpointInfo != nil && checkInfo.CheckpointInfo.TiFlashItems != nil { + if checkInfo.CheckpointInfo != nil && checkInfo.CheckpointInfo.Metadata != nil && checkInfo.CheckpointInfo.Metadata.TiFlashItems != nil { log.Info("load tiflash records of snapshot restore from checkpoint") if err != nil { return errors.Trace(err) } - cfg.tiflashRecorder.Load(checkInfo.CheckpointInfo.TiFlashItems) + cfg.tiflashRecorder.Load(checkInfo.CheckpointInfo.Metadata.TiFlashItems) } } // restore log. 
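A note on the GC handling touched above: KeepGcDisabled now flips GC off via the shared utils.DisabledGcRatioVal constant and no longer normalizes a negative original ratio itself; that "negative means an earlier PiTR left GC disabled, fall back to the default 1.1" reset moves to the point where GC is restored (see the restoreStream hunk below). A compact standalone sketch of the pattern, with a hypothetical in-memory stand-in for the SQL-backed ratio variable:

package main

import (
	"fmt"
	"strings"
)

const (
	defaultGcRatio  = "1.1"  // corresponds to utils.DefaultGcRatioVal
	disabledGcRatio = "-1.0" // corresponds to utils.DisabledGcRatioVal
)

// gcConfig is a hypothetical stand-in for the GC ratio reached through the
// restricted SQL executor in the real code.
type gcConfig struct{ ratio string }

// keepGcDisabled records the current ratio, disables GC, and hands back a
// closure that restores whatever ratio the caller passes in later.
func keepGcDisabled(cfg *gcConfig) (restore func(string), oldRatio string) {
	oldRatio = cfg.ratio
	cfg.ratio = disabledGcRatio
	return func(r string) { cfg.ratio = r }, oldRatio
}

func main() {
	cfg := &gcConfig{ratio: "-1.0"} // e.g. an earlier PiTR run left GC disabled
	restore, oldRatio := keepGcDisabled(cfg)

	// ... restore work runs here ...

	// a negative ratio is not a normal steady state, so reset it to the
	// default before handing GC back (the check moved into restoreStream)
	if strings.HasPrefix(oldRatio, "-") {
		oldRatio = defaultGcRatio
	}
	restore(oldRatio)
	fmt.Println(cfg.ratio) // 1.1
}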
@@ -1272,17 +1265,19 @@ func restoreStream( } defer client.Close() - if taskInfo != nil && taskInfo.RewriteTS > 0 { + if taskInfo != nil && taskInfo.Metadata != nil { // reuse the task's rewrite ts - log.Info("reuse the task's rewrite ts", zap.Uint64("rewrite-ts", taskInfo.RewriteTS)) - currentTS = taskInfo.RewriteTS + log.Info("reuse the task's rewrite ts", zap.Uint64("rewrite-ts", taskInfo.Metadata.RewriteTS)) + currentTS = taskInfo.Metadata.RewriteTS } else { currentTS, err = restore.GetTSWithRetry(ctx, mgr.GetPDClient()) if err != nil { return errors.Trace(err) } } - client.SetCurrentTS(currentTS) + if err := client.SetCurrentTS(currentTS); err != nil { + return errors.Trace(err) + } importModeSwitcher := restore.NewImportModeSwitcher(mgr.GetPDClient(), cfg.Config.SwitchModeInterval, mgr.GetTLSConfig()) restoreSchedulers, _, err := restore.RestorePreWork(ctx, mgr, importModeSwitcher, cfg.Online, false) @@ -1307,6 +1302,12 @@ func restoreStream( return } + // If the oldRatio is negative, which is not normal status. + // It should set default value "1.1" after PiTR finished. + if strings.HasPrefix(oldRatio, "-") { + log.Warn("the original gc-ratio is negative, reset by default value 1.1", zap.String("old-gc-ratio", oldRatio)) + oldRatio = utils.DefaultGcRatioVal + } log.Info("start to restore gc", zap.String("ratio", oldRatio)) if err := restoreGc(oldRatio); err != nil { log.Error("failed to set gc enabled", zap.Error(err)) @@ -1318,7 +1319,7 @@ func restoreStream( var checkpointRunner *checkpoint.CheckpointRunner[checkpoint.LogRestoreKeyType, checkpoint.LogRestoreValueType] if cfg.UseCheckpoint { taskName = cfg.generateLogRestoreTaskName(client.GetClusterID(ctx), cfg.StartTS, cfg.RestoreTS) - oldRatioFromCheckpoint, err := client.InitCheckpointMetadataForLogRestore(ctx, taskName, oldRatio) + oldRatioFromCheckpoint, err := client.InitCheckpointMetadataForLogRestore(ctx, oldRatio, cfg.tiflashRecorder) if err != nil { return errors.Trace(err) } @@ -1758,7 +1759,6 @@ func checkPiTRTaskInfo( var ( doFullRestore = (len(cfg.FullBackupStorage) > 0) curTaskInfo *checkpoint.CheckpointTaskInfoForLogRestore - errTaskMsg string ) checkInfo := &PiTRTaskInfo{} @@ -1771,36 +1771,46 @@ func checkPiTRTaskInfo( clusterID := mgr.GetPDClient().GetClusterID(ctx) if cfg.UseCheckpoint { - exists, err := checkpoint.ExistsCheckpointTaskInfo(ctx, s, clusterID) + se, err := g.CreateSession(mgr.GetStorage()) + if err != nil { + return nil, errors.Trace(err) + } + + execCtx := se.GetSessionCtx().GetRestrictedSQLExecutor() + curTaskInfo, err = checkpoint.TryToGetCheckpointTaskInfo(ctx, mgr.GetDomain(), execCtx) if err != nil { return checkInfo, errors.Trace(err) } - if exists { - curTaskInfo, err = checkpoint.LoadCheckpointTaskInfoForLogRestore(ctx, s, clusterID) - if err != nil { - return checkInfo, errors.Trace(err) - } + // the log restore checkpoint metadata is persist, so the PITR is in the log restore stage. + if curTaskInfo.Metadata != nil { // TODO: check whether user has manually modified the cluster(ddl). If so, regard the behavior // as restore from scratch. (update `curTaskInfo.RewriteTs` to 0 as an uninitial value) - // The task info is written to external storage without status `InSnapshotRestore` only when - // id-maps is persist into external storage, so there is no need to do snapshot restore again. 
- if curTaskInfo.StartTS == cfg.StartTS && curTaskInfo.RestoreTS == cfg.RestoreTS { - // the same task, check whether skip snapshot restore - doFullRestore = doFullRestore && (curTaskInfo.Progress == checkpoint.InSnapshotRestore) - // update the snapshot restore task name to clean up in final - if !doFullRestore && (len(cfg.FullBackupStorage) > 0) { - _ = cfg.generateSnapshotRestoreTaskName(clusterID) - } - log.Info("the same task", zap.Bool("skip-snapshot-restore", !doFullRestore)) - } else { - // not the same task, so overwrite the taskInfo with a new task - log.Info("not the same task, start to restore from scratch") - errTaskMsg = fmt.Sprintf( - "a new task [start-ts=%d] [restored-ts=%d] while the last task info: [start-ts=%d] [restored-ts=%d] [skip-snapshot-restore=%t]", - cfg.StartTS, cfg.RestoreTS, curTaskInfo.StartTS, curTaskInfo.RestoreTS, curTaskInfo.Progress == checkpoint.InLogRestoreAndIdMapPersist) - - curTaskInfo = nil + if curTaskInfo.Metadata.UpstreamClusterID != cfg.UpstreamClusterID { + return checkInfo, errors.Errorf( + "The upstream cluster id[%d] of the current log restore does not match that[%d] recorded in checkpoint. "+ + "Perhaps you should specify the last log backup storage instead, "+ + "or just clean the checkpoint database[%s] if the cluster has been cleaned up.", + cfg.UpstreamClusterId, curTaskInfo.Metadata.UpstreamClusterID, checkpoint.LogRestoreCheckpointDatabaseName) + } + + if curTaskInfo.Metadata.StartTS != cfg.StartTS || curTaskInfo.Metadata.RestoredTS != cfg.RestoreTS { + return checkInfo, errors.Errorf( + "The current log restore want to restore cluster from %d to %d, "+ + "which is different from that from %d to %d recorded in checkpoint. "+ + "Perhaps you should specify the last full backup storage to match the start-ts and "+ + "the parameter --restored-ts to match the restored-ts. "+ + "or just clean the checkpoint database[%s] if the cluster has been cleaned up.", + cfg.StartTS, cfg.RestoreTS, curTaskInfo.Metadata.StartTS, curTaskInfo.Metadata.RestoredTS, checkpoint.LogRestoreCheckpointDatabaseName, + ) + } + + log.Info("the same task, skip snapshot restore") + // the same task, skip full restore because it is already in the log restore stage. + doFullRestore = false + // update the snapshot restore task name to clean up in final + if !doFullRestore && (len(cfg.FullBackupStorage) > 0) { + _ = cfg.generateSnapshotRestoreTaskName(clusterID) } } } @@ -1808,17 +1818,11 @@ func checkPiTRTaskInfo( checkInfo.NeedFullRestore = doFullRestore // restore full snapshot precheck. if doFullRestore { - if !(cfg.UseCheckpoint && curTaskInfo != nil) { + if !(cfg.UseCheckpoint && curTaskInfo.Metadata != nil) { // Only when use checkpoint and not the first execution, // skip checking requirements. log.Info("check pitr requirements for the first execution") if err := checkPiTRRequirements(mgr); err != nil { - if len(errTaskMsg) > 0 { - err = errors.Annotatef(err, "The current restore task is regarded as %s. "+ - "If you ensure that no changes have been made to the cluster since the last execution, "+ - "you can adjust the `start-ts` or `restored-ts` to continue with the previous execution. "+ - "Otherwise, if you want to restore from scratch, please clean the cluster at first", errTaskMsg) - } // delay cluster checks after we get the backupmeta. // for the case that the restore inc + log backup, // we can still restore them. 
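The rewritten checkPiTRTaskInfo above treats a mismatch between the persisted checkpoint metadata and the current task parameters as a hard error instead of silently restarting as a new task. A standalone sketch of that consistency check, assuming an illustrative subset of the recorded fields (names here are not the exact struct fields):

package main

import "fmt"

// recordedMeta is an illustrative subset of the metadata persisted by the
// log restore checkpoint.
type recordedMeta struct {
	upstreamClusterID uint64
	startTS           uint64
	restoredTS        uint64
}

// validateAgainstCheckpoint fails when the current run does not describe the
// same PiTR task as the one recorded in the checkpoint database.
func validateAgainstCheckpoint(rec recordedMeta, upstreamID, startTS, restoredTS uint64) error {
	if rec.upstreamClusterID != upstreamID {
		return fmt.Errorf("upstream cluster id %d does not match %d recorded in checkpoint; "+
			"specify the last log backup storage or clean the checkpoint database",
			upstreamID, rec.upstreamClusterID)
	}
	if rec.startTS != startTS || rec.restoredTS != restoredTS {
		return fmt.Errorf("restore range [%d, %d] differs from [%d, %d] recorded in checkpoint; "+
			"adjust the full backup storage / --restored-ts or clean the checkpoint database",
			startTS, restoredTS, rec.startTS, rec.restoredTS)
	}
	// same task: the snapshot restore stage already ran, so it can be skipped
	return nil
}

func main() {
	rec := recordedMeta{upstreamClusterID: 123, startTS: 111, restoredTS: 222}
	fmt.Println(validateAgainstCheckpoint(rec, 123, 111, 222)) // <nil>
	fmt.Println(validateAgainstCheckpoint(rec, 123, 111, 333)) // mismatch error
}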
@@ -1828,19 +1832,5 @@ func checkPiTRTaskInfo( } } - // persist the new task info - if cfg.UseCheckpoint && curTaskInfo == nil { - log.Info("save checkpoint task info with `InSnapshotRestore` status") - if err := checkpoint.SaveCheckpointTaskInfoForLogRestore(ctx, s, &checkpoint.CheckpointTaskInfoForLogRestore{ - Progress: checkpoint.InSnapshotRestore, - StartTS: cfg.StartTS, - RestoreTS: cfg.RestoreTS, - // updated in the stage of `InLogRestoreAndIdMapPersist` - RewriteTS: 0, - TiFlashItems: nil, - }, clusterID); err != nil { - return checkInfo, errors.Trace(err) - } - } return checkInfo, nil } diff --git a/br/pkg/utils/db.go b/br/pkg/utils/db.go index f6d0b66dc7a54..cc51b899b3f97 100644 --- a/br/pkg/utils/db.go +++ b/br/pkg/utils/db.go @@ -110,7 +110,10 @@ func GetGcRatio(ctx sqlexec.RestrictedSQLExecutor) (string, error) { return d.ToString() } -const DefaultGcRatioVal = "1.1" +const ( + DefaultGcRatioVal = "1.1" + DisabledGcRatioVal = "-1.0" +) func SetGcRatio(ctx sqlexec.RestrictedSQLExecutor, ratio string) error { _, _, err := ctx.ExecRestrictedSQL( From 179343e3ae1478d0b2584747b50e55c6f4a0207a Mon Sep 17 00:00:00 2001 From: Jianjun Liao Date: Thu, 12 Sep 2024 13:55:17 +0800 Subject: [PATCH 04/15] log restore checkpoint into table Signed-off-by: Jianjun Liao --- br/pkg/checkpoint/BUILD.bazel | 6 +++++ br/pkg/checkpoint/log_restore.go | 42 +++++--------------------------- br/pkg/checkpoint/storage.go | 10 +++++--- 3 files changed, 18 insertions(+), 40 deletions(-) diff --git a/br/pkg/checkpoint/BUILD.bazel b/br/pkg/checkpoint/BUILD.bazel index a9edaf1334ef0..dac194013a72b 100644 --- a/br/pkg/checkpoint/BUILD.bazel +++ b/br/pkg/checkpoint/BUILD.bazel @@ -7,11 +7,13 @@ go_library( "checkpoint.go", "log_restore.go", "restore.go", + "storage.go", "ticker.go", ], importpath = "github.com/pingcap/tidb/br/pkg/checkpoint", visibility = ["//visibility:public"], deps = [ + "//br/pkg/glue", "//br/pkg/logutil", "//br/pkg/metautil", "//br/pkg/pdutil", @@ -19,9 +21,13 @@ go_library( "//br/pkg/storage", "//br/pkg/summary", "//br/pkg/utils", + "//pkg/domain", + "//pkg/kv", "//pkg/meta/model", "//pkg/parser/model", "//pkg/util", + "//pkg/util/sqlexec", + "@com_github_google_uuid//:uuid", "@com_github_pingcap_errors//:errors", "@com_github_pingcap_failpoint//:failpoint", "@com_github_pingcap_kvproto//pkg/brpb", diff --git a/br/pkg/checkpoint/log_restore.go b/br/pkg/checkpoint/log_restore.go index efc9648603474..ceaf413e674d3 100644 --- a/br/pkg/checkpoint/log_restore.go +++ b/br/pkg/checkpoint/log_restore.go @@ -23,12 +23,10 @@ import ( "github.com/pingcap/errors" "github.com/pingcap/log" "github.com/pingcap/tidb/br/pkg/glue" - "github.com/pingcap/tidb/br/pkg/storage" "github.com/pingcap/tidb/pkg/domain" "github.com/pingcap/tidb/pkg/meta/model" pmodel "github.com/pingcap/tidb/pkg/parser/model" "github.com/pingcap/tidb/pkg/util/sqlexec" - "go.uber.org/zap" ) type LogRestoreKeyType = string @@ -315,12 +313,11 @@ type CheckpointIngestIndexRepairSQLs struct { func LoadCheckpointIngestIndexRepairSQLs( ctx context.Context, - s storage.ExternalStorage, - taskName string, + execCtx sqlexec.RestrictedSQLExecutor, ) (*CheckpointIngestIndexRepairSQLs, error) { m := &CheckpointIngestIndexRepairSQLs{} - err := loadCheckpointMeta(ctx, s, getCheckpointIngestIndexRepairPathByTaskName(taskName), m) - return m, err + err := selectCheckpointMeta(ctx, execCtx, LogRestoreCheckpointDatabaseName, checkpointIngestTableName, m) + return m, errors.Trace(err) } func ExistsCheckpointIngestIndexRepairSQLs(ctx context.Context, dom 
*domain.Domain) bool { @@ -335,34 +332,7 @@ func SaveCheckpointIngestIndexRepairSQLs( return insertCheckpointMeta(ctx, se, LogRestoreCheckpointDatabaseName, checkpointIngestTableName, meta) } -func removeCheckpointTaskInfoForLogRestore(ctx context.Context, s storage.ExternalStorage, clusterID uint64) error { - fileName := getCheckpointTaskInfoPathByID(clusterID) - exists, err := s.FileExists(ctx, fileName) - if err != nil { - return errors.Trace(err) - } - - if !exists { - log.Warn("the task info file doesn't exist", zap.String("file", fileName)) - return nil - } - - return s.DeleteFile(ctx, fileName) -} - -func RemoveCheckpointDataForLogRestore( - ctx context.Context, - s storage.ExternalStorage, - taskName string, - clusterID uint64, -) error { - if err := removeCheckpointTaskInfoForLogRestore(ctx, s, clusterID); err != nil { - return errors.Annotatef(err, - "failed to remove the task info file: clusterId is %d, taskName is %s", - clusterID, - taskName, - ) - } - prefix := fmt.Sprintf(CheckpointRestoreDirFormat, taskName) - return removeCheckpointData(ctx, s, prefix) +func RemoveCheckpointDataForLogRestore(ctx context.Context, dom *domain.Domain, se glue.Session) error { + return dropCheckpointTables(ctx, dom, se, LogRestoreCheckpointDatabaseName, + []string{checkpointDataTableName, checkpointMetaTableName, checkpointProgressTableName, checkpointIngestTableName}) } diff --git a/br/pkg/checkpoint/storage.go b/br/pkg/checkpoint/storage.go index e0b32f78fb848..a453bcd430a60 100644 --- a/br/pkg/checkpoint/storage.go +++ b/br/pkg/checkpoint/storage.go @@ -76,10 +76,10 @@ const ( insertCheckpointSQLTemplate string = ` REPLACE INTO %s.%s - (uuid, segment_id, data) VALUES (%?, %?, %?);` + (uuid, segment_id, data) VALUES (%%?, %%?, %%?);` selectCheckpointSQLTemplate string = ` - SELECT uuid, segment_id, data FROM %s.%s ORDER BY uuid, segment_id;` + SELECT uuid, segment_id, data FROM %n.%n ORDER BY uuid, segment_id;` createCheckpointMetaTable string = ` CREATE TABLE %n.%n ( @@ -157,7 +157,8 @@ func mergeSelectCheckpoint( rows, _, errSQL := execCtx.ExecRestrictedSQL( kv.WithInternalSourceType(ctx, kv.InternalTxnBR), nil, - fmt.Sprintf(selectCheckpointSQLTemplate, dbName, tableName), + selectCheckpointSQLTemplate, + dbName, tableName, ) if errSQL != nil { return nil, errors.Annotatef(errSQL, "failed to get checkpoint data from table %s.%s", dbName, tableName) @@ -269,7 +270,8 @@ func selectCheckpointMeta[T any](ctx context.Context, execCtx sqlexec.Restricted rows, _, errSQL := execCtx.ExecRestrictedSQL( kv.WithInternalSourceType(ctx, kv.InternalTxnBR), nil, - fmt.Sprintf(selectCheckpointMetaSQLTemplate, dbName, tableName), + selectCheckpointMetaSQLTemplate, + dbName, tableName, ) if errSQL != nil { return errors.Annotatef(errSQL, "failed to get checkpoint metadata from table %s.%s", dbName, tableName) From 670b7dd2fc3bbabcba44103784b1e65e0b50ff87 Mon Sep 17 00:00:00 2001 From: Jianjun Liao Date: Thu, 12 Sep 2024 14:39:18 +0800 Subject: [PATCH 05/15] draft Signed-off-by: Jianjun Liao --- br/pkg/checkpoint/checkpoint_test.go | 106 +++++++++++++++------------ 1 file changed, 59 insertions(+), 47 deletions(-) diff --git a/br/pkg/checkpoint/checkpoint_test.go b/br/pkg/checkpoint/checkpoint_test.go index 17af80dacc48f..6275c61dc6af7 100644 --- a/br/pkg/checkpoint/checkpoint_test.go +++ b/br/pkg/checkpoint/checkpoint_test.go @@ -24,15 +24,17 @@ import ( backuppb "github.com/pingcap/kvproto/pkg/brpb" "github.com/pingcap/kvproto/pkg/encryptionpb" "github.com/pingcap/tidb/br/pkg/checkpoint" + 
"github.com/pingcap/tidb/br/pkg/gluetidb" "github.com/pingcap/tidb/br/pkg/pdutil" "github.com/pingcap/tidb/br/pkg/storage" + "github.com/pingcap/tidb/br/pkg/utiltest" "github.com/pingcap/tidb/pkg/meta/model" pmodel "github.com/pingcap/tidb/pkg/parser/model" "github.com/stretchr/testify/require" "github.com/tikv/client-go/v2/oracle" ) -func TestCheckpointMeta(t *testing.T) { +func TestCheckpointMetaForBackup(t *testing.T) { ctx := context.Background() base := t.TempDir() s, err := storage.NewLocalStorage(base) @@ -50,9 +52,19 @@ func TestCheckpointMeta(t *testing.T) { require.NoError(t, err) require.Equal(t, checkpointMeta.ConfigHash, checkpointMeta2.ConfigHash) require.Equal(t, checkpointMeta.BackupTS, checkpointMeta2.BackupTS) +} + +func TestCheckpointMetaForRestore(t *testing.T) { + ctx := context.Background() + s := utiltest.CreateRestoreSchemaSuite(t) + dom := s.Mock.Domain + g := gluetidb.New() + se, err := g.CreateSession(s.Mock.Storage) + require.NoError(t, err) - taskName := "test123" - checkpointMetaForRestore := &checkpoint.CheckpointMetadataForRestore{ + checkpointMetaForSnapshotRestore := &checkpoint.CheckpointMetadataForSnapshotRestore{ + UpstreamClusterID: 123, + RestoredTS: 321, SchedulersConfig: &pdutil.ClusterConfig{ Schedulers: []string{"1", "2"}, ScheduleCfg: map[string]any{ @@ -60,39 +72,47 @@ func TestCheckpointMeta(t *testing.T) { "2": "1", }, }, - GcRatio: "123", } - err = checkpoint.SaveCheckpointMetadataForRestore(ctx, s, checkpointMetaForRestore, taskName) + err = checkpoint.SaveCheckpointMetadataForSnapshotRestore(ctx, se, checkpointMetaForSnapshotRestore) require.NoError(t, err) - - checkpointMetaForRestore2, err := checkpoint.LoadCheckpointMetadataForRestore(ctx, s, taskName) + checkpointMetaForSnapshotRestore2, err := checkpoint.LoadCheckpointMetadataForSnapshotRestore(ctx, se.GetSessionCtx().GetRestrictedSQLExecutor()) require.NoError(t, err) - require.Equal(t, checkpointMetaForRestore.SchedulersConfig, checkpointMetaForRestore2.SchedulersConfig) - require.Equal(t, checkpointMetaForRestore.GcRatio, checkpointMetaForRestore2.GcRatio) - - exists, err := checkpoint.ExistsCheckpointTaskInfo(ctx, s, 123) + require.Equal(t, checkpointMetaForSnapshotRestore.SchedulersConfig, checkpointMetaForSnapshotRestore2.SchedulersConfig) + require.Equal(t, checkpointMetaForSnapshotRestore.UpstreamClusterID, checkpointMetaForSnapshotRestore2.UpstreamClusterID) + require.Equal(t, checkpointMetaForSnapshotRestore.RestoredTS, checkpointMetaForSnapshotRestore2.RestoredTS) + + checkpointMetaForLogRestore := &checkpoint.CheckpointMetadataForLogRestore{ + UpstreamClusterID: 123, + RestoredTS: 222, + StartTS: 111, + RewriteTS: 333, + GcRatio: "1.0", + TiFlashItems: map[int64]model.TiFlashReplicaInfo{1: {Count: 1}}, + } + err = checkpoint.SaveCheckpointMetadataForLogRestore(ctx, se, checkpointMetaForLogRestore) + require.NoError(t, err) + checkpointMetaForLogRestore2, err := checkpoint.LoadCheckpointMetadataForLogRestore(ctx, se.GetSessionCtx().GetRestrictedSQLExecutor()) require.NoError(t, err) + require.Equal(t, checkpointMetaForLogRestore.UpstreamClusterID, checkpointMetaForLogRestore2.UpstreamClusterID) + require.Equal(t, checkpointMetaForLogRestore.RestoredTS, checkpointMetaForLogRestore2.RestoredTS) + require.Equal(t, checkpointMetaForLogRestore.StartTS, checkpointMetaForLogRestore2.StartTS) + require.Equal(t, checkpointMetaForLogRestore.RewriteTS, checkpointMetaForLogRestore2.RewriteTS) + require.Equal(t, checkpointMetaForLogRestore.GcRatio, checkpointMetaForLogRestore2.GcRatio) + 
require.Equal(t, checkpointMetaForLogRestore.TiFlashItems, checkpointMetaForLogRestore2.TiFlashItems) + + exists := checkpoint.ExistsCheckpointProgress(ctx, dom) require.False(t, exists) - err = checkpoint.SaveCheckpointTaskInfoForLogRestore(ctx, s, &checkpoint.CheckpointTaskInfoForLogRestore{ - Progress: checkpoint.InLogRestoreAndIdMapPersist, - StartTS: 1, - RestoreTS: 2, - RewriteTS: 3, - TiFlashItems: map[int64]model.TiFlashReplicaInfo{1: {Count: 1}}, - }, 123) + err = checkpoint.SaveCheckpointProgress(ctx, se, &checkpoint.CheckpointProgress{ + Progress: checkpoint.InLogRestoreAndIdMapPersist, + }) require.NoError(t, err) - taskInfo, err := checkpoint.LoadCheckpointTaskInfoForLogRestore(ctx, s, 123) + taskInfo, err := checkpoint.LoadCheckpointProgress(ctx, se.GetSessionCtx().GetRestrictedSQLExecutor()) require.NoError(t, err) require.Equal(t, taskInfo.Progress, checkpoint.InLogRestoreAndIdMapPersist) - require.Equal(t, taskInfo.StartTS, uint64(1)) - require.Equal(t, taskInfo.RestoreTS, uint64(2)) - require.Equal(t, taskInfo.RewriteTS, uint64(3)) - require.Equal(t, taskInfo.TiFlashItems[1].Count, uint64(1)) - exists, err = checkpoint.ExistsCheckpointIngestIndexRepairSQLs(ctx, s, "123") - require.NoError(t, err) + exists = checkpoint.ExistsCheckpointIngestIndexRepairSQLs(ctx, dom) require.False(t, exists) - err = checkpoint.SaveCheckpointIngestIndexRepairSQLs(ctx, s, &checkpoint.CheckpointIngestIndexRepairSQLs{ + err = checkpoint.SaveCheckpointIngestIndexRepairSQLs(ctx, se, &checkpoint.CheckpointIngestIndexRepairSQLs{ SQLs: []checkpoint.CheckpointIngestIndexRepairSQL{ { IndexID: 1, @@ -103,9 +123,9 @@ func TestCheckpointMeta(t *testing.T) { AddArgs: []any{"6", "7", "8"}, }, }, - }, "123") + }) require.NoError(t, err) - repairSQLs, err := checkpoint.LoadCheckpointIngestIndexRepairSQLs(ctx, s, "123") + repairSQLs, err := checkpoint.LoadCheckpointIngestIndexRepairSQLs(ctx, se.GetSessionCtx().GetRestrictedSQLExecutor()) require.NoError(t, err) require.Equal(t, repairSQLs.SQLs[0].IndexID, int64(1)) require.Equal(t, repairSQLs.SQLs[0].SchemaName, pmodel.NewCIStr("2")) @@ -241,16 +261,12 @@ func TestCheckpointBackupRunner(t *testing.T) { func TestCheckpointRestoreRunner(t *testing.T) { ctx := context.Background() - base := t.TempDir() - s, err := storage.NewLocalStorage(base) + s := utiltest.CreateRestoreSchemaSuite(t) + g := gluetidb.New() + se, err := g.CreateSession(s.Mock.Storage) require.NoError(t, err) - taskName := "test" - cipher := &backuppb.CipherInfo{ - CipherType: encryptionpb.EncryptionMethod_AES256_CTR, - CipherKey: []byte("01234567890123456789012345678901"), - } - checkpointRunner, err := checkpoint.StartCheckpointRestoreRunnerForTest(ctx, s, cipher, 5*time.Second, taskName) + checkpointRunner, err := checkpoint.StartCheckpointRestoreRunnerForTest(ctx, se, 5*time.Second) require.NoError(t, err) data := map[string]struct { @@ -309,10 +325,10 @@ func TestCheckpointRestoreRunner(t *testing.T) { require.Equal(t, d.RangeKey, resp.RangeKey) } - _, err = checkpoint.WalkCheckpointFileForRestore(ctx, s, cipher, taskName, checker) + _, err = checkpoint.LoadCheckpointDataForSnapshotRestore(ctx, se.GetSessionCtx().GetRestrictedSQLExecutor(), checker) require.NoError(t, err) - checksum, _, err := checkpoint.LoadCheckpointChecksumForRestore(ctx, s, taskName) + checksum, _, err := checkpoint.LoadCheckpointChecksumForRestore(ctx, se.GetSessionCtx().GetRestrictedSQLExecutor()) require.NoError(t, err) var i int64 @@ -323,16 +339,12 @@ func TestCheckpointRestoreRunner(t *testing.T) { func 
TestCheckpointLogRestoreRunner(t *testing.T) { ctx := context.Background() - base := t.TempDir() - s, err := storage.NewLocalStorage(base) + s := utiltest.CreateRestoreSchemaSuite(t) + g := gluetidb.New() + se, err := g.CreateSession(s.Mock.Storage) require.NoError(t, err) - taskName := "test" - cipher := &backuppb.CipherInfo{ - CipherType: encryptionpb.EncryptionMethod_AES256_CTR, - CipherKey: []byte("01234567890123456789012345678901"), - } - checkpointRunner, err := checkpoint.StartCheckpointLogRestoreRunnerForTest(ctx, s, cipher, 5*time.Second, taskName) + checkpointRunner, err := checkpoint.StartCheckpointLogRestoreRunnerForTest(ctx, se, 5*time.Second) require.NoError(t, err) data := map[string]map[int][]struct { @@ -400,7 +412,7 @@ func TestCheckpointLogRestoreRunner(t *testing.T) { require.FailNow(t, "not found in the original data") } - _, err = checkpoint.WalkCheckpointFileForRestore(ctx, s, cipher, taskName, checker) + _, err = checkpoint.LoadCheckpointDataForLogRestore(ctx, se.GetSessionCtx().GetRestrictedSQLExecutor(), checker) require.NoError(t, err) } From 819cf2fbe54c824fb8cf74b10bfde230c7ad8639 Mon Sep 17 00:00:00 2001 From: Jianjun Liao Date: Fri, 13 Sep 2024 18:32:51 +0800 Subject: [PATCH 06/15] add unit test Signed-off-by: Jianjun Liao --- br/pkg/checkpoint/checkpoint_test.go | 39 +++++++++- br/pkg/checkpoint/log_restore.go | 2 +- br/pkg/checkpoint/restore.go | 2 +- br/pkg/checkpoint/storage.go | 34 +++++---- br/pkg/conn/conn.go | 13 ++-- br/pkg/restore/log_client/client.go | 9 ++- br/pkg/task/restore.go | 104 ++++++++++----------------- br/pkg/task/stream.go | 36 +++------- 8 files changed, 119 insertions(+), 120 deletions(-) diff --git a/br/pkg/checkpoint/checkpoint_test.go b/br/pkg/checkpoint/checkpoint_test.go index 6275c61dc6af7..84803453c1aa3 100644 --- a/br/pkg/checkpoint/checkpoint_test.go +++ b/br/pkg/checkpoint/checkpoint_test.go @@ -106,9 +106,18 @@ func TestCheckpointMetaForRestore(t *testing.T) { Progress: checkpoint.InLogRestoreAndIdMapPersist, }) require.NoError(t, err) - taskInfo, err := checkpoint.LoadCheckpointProgress(ctx, se.GetSessionCtx().GetRestrictedSQLExecutor()) + progress, err := checkpoint.LoadCheckpointProgress(ctx, se.GetSessionCtx().GetRestrictedSQLExecutor()) require.NoError(t, err) - require.Equal(t, taskInfo.Progress, checkpoint.InLogRestoreAndIdMapPersist) + require.Equal(t, checkpoint.InLogRestoreAndIdMapPersist, progress.Progress) + + taskInfo, err := checkpoint.TryToGetCheckpointTaskInfo(ctx, s.Mock.Domain, se.GetSessionCtx().GetRestrictedSQLExecutor()) + require.NoError(t, err) + require.Equal(t, uint64(123), taskInfo.Metadata.UpstreamClusterID) + require.Equal(t, uint64(222), taskInfo.Metadata.RestoredTS) + require.Equal(t, uint64(111), taskInfo.Metadata.StartTS) + require.Equal(t, uint64(333), taskInfo.Metadata.RewriteTS) + require.Equal(t, "1.0", taskInfo.Metadata.GcRatio) + require.Equal(t, checkpoint.InLogRestoreAndIdMapPersist, taskInfo.Progress) exists = checkpoint.ExistsCheckpointIngestIndexRepairSQLs(ctx, dom) require.False(t, exists) @@ -266,6 +275,8 @@ func TestCheckpointRestoreRunner(t *testing.T) { se, err := g.CreateSession(s.Mock.Storage) require.NoError(t, err) + err = checkpoint.SaveCheckpointMetadataForSnapshotRestore(ctx, se, &checkpoint.CheckpointMetadataForSnapshotRestore{}) + require.NoError(t, err) checkpointRunner, err := checkpoint.StartCheckpointRestoreRunnerForTest(ctx, se, 5*time.Second) require.NoError(t, err) @@ -312,6 +323,7 @@ func TestCheckpointRestoreRunner(t *testing.T) { 
checkpointRunner.WaitForFinish(ctx, true) + respCount := 0 checker := func(tableID int64, resp checkpoint.RestoreValueType) { require.NotNil(t, resp) d, ok := data[resp.RangeKey] @@ -323,10 +335,12 @@ func TestCheckpointRestoreRunner(t *testing.T) { require.Equal(t, tableID, int64(1)) } require.Equal(t, d.RangeKey, resp.RangeKey) + respCount += 1 } _, err = checkpoint.LoadCheckpointDataForSnapshotRestore(ctx, se.GetSessionCtx().GetRestrictedSQLExecutor(), checker) require.NoError(t, err) + require.Equal(t, 4, respCount) checksum, _, err := checkpoint.LoadCheckpointChecksumForRestore(ctx, se.GetSessionCtx().GetRestrictedSQLExecutor()) require.NoError(t, err) @@ -335,6 +349,14 @@ func TestCheckpointRestoreRunner(t *testing.T) { for i = 1; i <= 4; i++ { require.Equal(t, checksum[i].Crc64xor, uint64(i)) } + + err = checkpoint.RemoveCheckpointDataForSnapshotRestore(ctx, s.Mock.Domain, se) + require.NoError(t, err) + + exists := checkpoint.ExistsSnapshotRestoreCheckpoint(ctx, s.Mock.Domain) + require.False(t, exists) + exists = s.Mock.Domain.InfoSchema().SchemaExists(pmodel.NewCIStr(checkpoint.SnapshotRestoreCheckpointDatabaseName)) + require.False(t, exists) } func TestCheckpointLogRestoreRunner(t *testing.T) { @@ -344,6 +366,8 @@ func TestCheckpointLogRestoreRunner(t *testing.T) { se, err := g.CreateSession(s.Mock.Storage) require.NoError(t, err) + err = checkpoint.SaveCheckpointMetadataForLogRestore(ctx, se, &checkpoint.CheckpointMetadataForLogRestore{}) + require.NoError(t, err) checkpointRunner, err := checkpoint.StartCheckpointLogRestoreRunnerForTest(ctx, se, 5*time.Second) require.NoError(t, err) @@ -389,6 +413,7 @@ func TestCheckpointLogRestoreRunner(t *testing.T) { checkpointRunner.WaitForFinish(ctx, true) + respCount := 0 checker := func(metaKey string, resp checkpoint.LogRestoreValueMarshaled) { require.NotNil(t, resp) d, ok := data[metaKey] @@ -405,6 +430,7 @@ func TestCheckpointLogRestoreRunner(t *testing.T) { } for _, foff := range foffs { if f.foff == foff { + respCount += 1 return } } @@ -414,6 +440,15 @@ func TestCheckpointLogRestoreRunner(t *testing.T) { _, err = checkpoint.LoadCheckpointDataForLogRestore(ctx, se.GetSessionCtx().GetRestrictedSQLExecutor(), checker) require.NoError(t, err) + require.Equal(t, 4, respCount) + + err = checkpoint.RemoveCheckpointDataForLogRestore(ctx, s.Mock.Domain, se) + require.NoError(t, err) + + exists := checkpoint.ExistsLogRestoreCheckpointMetadata(ctx, s.Mock.Domain) + require.False(t, exists) + exists = s.Mock.Domain.InfoSchema().SchemaExists(pmodel.NewCIStr(checkpoint.LogRestoreCheckpointDatabaseName)) + require.False(t, exists) } func getLockData(p, l int64) ([]byte, error) { diff --git a/br/pkg/checkpoint/log_restore.go b/br/pkg/checkpoint/log_restore.go index ceaf413e674d3..8ba6d906bbc2e 100644 --- a/br/pkg/checkpoint/log_restore.go +++ b/br/pkg/checkpoint/log_restore.go @@ -185,7 +185,7 @@ func SaveCheckpointMetadataForLogRestore( se glue.Session, meta *CheckpointMetadataForLogRestore, ) error { - err := initCheckpointTable(ctx, se, LogRestoreCheckpointDatabaseName, []string{checkpointDataTableName, checkpointChecksumTableName}) + err := initCheckpointTable(ctx, se, LogRestoreCheckpointDatabaseName, []string{checkpointDataTableName}) if err != nil { return errors.Trace(err) } diff --git a/br/pkg/checkpoint/restore.go b/br/pkg/checkpoint/restore.go index 11c35503b2bf2..421b40aa50f29 100644 --- a/br/pkg/checkpoint/restore.go +++ b/br/pkg/checkpoint/restore.go @@ -94,7 +94,7 @@ func LoadCheckpointChecksumForRestore( ctx context.Context, 
execCtx sqlexec.RestrictedSQLExecutor, ) (map[int64]*ChecksumItem, time.Duration, error) { - return selectCheckpointChecksum(ctx, execCtx, LogRestoreCheckpointDatabaseName) + return selectCheckpointChecksum(ctx, execCtx, SnapshotRestoreCheckpointDatabaseName) } type CheckpointMetadataForSnapshotRestore struct { diff --git a/br/pkg/checkpoint/storage.go b/br/pkg/checkpoint/storage.go index a453bcd430a60..3281b7e5e0941 100644 --- a/br/pkg/checkpoint/storage.go +++ b/br/pkg/checkpoint/storage.go @@ -66,9 +66,10 @@ const ( checkpointProgressTableName string = "cpt_progress" checkpointIngestTableName string = "cpt_ingest" + // the primary key (uuid: uuid, segment_id:0) records the number of segment createCheckpointTable string = ` CREATE TABLE %n.%n ( - uuid varchar(32) NOT NULL, + uuid binary(32) NOT NULL, segment_id BIGINT NOT NULL, data BLOB(524288) NOT NULL, update_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP, @@ -177,17 +178,7 @@ func mergeSelectCheckpoint( log.Warn("get the empty uuid, but just skip it") continue } - if bytes.Equal(uuid, lastUUID) { - if lastUUIDInvalid { - continue - } - if nextSegmentID != segment_id { - lastUUIDInvalid = true - continue - } - rowData = append(rowData, data...) - nextSegmentID += 1 - } else { + if !bytes.Equal(uuid, lastUUID) { if !lastUUIDInvalid && len(rowData) > 0 { retData = append(retData, rowData) } @@ -196,6 +187,21 @@ func mergeSelectCheckpoint( nextSegmentID = 0 lastUUID = uuid } + + if lastUUIDInvalid { + continue + } + + if nextSegmentID != segment_id { + lastUUIDInvalid = true + continue + } + + rowData = append(rowData, data...) + nextSegmentID += 1 + } + if !lastUUIDInvalid && len(rowData) > 0 { + retData = append(retData, rowData) } return retData, nil } @@ -240,7 +246,7 @@ func selectCheckpointChecksum( } func initCheckpointTable(ctx context.Context, se glue.Session, dbName string, checkpointTableNames []string) error { - if err := se.ExecuteInternal(ctx, "CREATE DATABASE %n IF NOT EXISTS;", dbName); err != nil { + if err := se.ExecuteInternal(ctx, "CREATE DATABASE IF NOT EXISTS %n;", dbName); err != nil { return errors.Trace(err) } for _, tableName := range checkpointTableNames { @@ -304,7 +310,7 @@ func dropCheckpointTables(ctx context.Context, dom *domain.Domain, se glue.Sessi return errors.Trace(err) } if len(tables) > 0 { - log.Warn("user tables in the checkpoint database, skip drop the database", zap.String("db", dbName)) + log.Warn("user tables in the checkpoint database, skip drop the database", zap.String("db", dbName), zap.String("table", tables[0].Name.L)) return nil } if err := se.ExecuteInternal(ctx, "DROP DATABASE %n;", dbName); err != nil { diff --git a/br/pkg/conn/conn.go b/br/pkg/conn/conn.go index cdb81a011c8a5..f34a6c276c29b 100644 --- a/br/pkg/conn/conn.go +++ b/br/pkg/conn/conn.go @@ -63,6 +63,8 @@ const ( NormalVersionChecker VersionCheckerType = iota // version checker for PiTR StreamVersionChecker + // no check + NoVersionChecker ) // Mgr manages connections to a TiDB cluster. 
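The restructured mergeSelectCheckpoint above walks the rows ordered by (uuid, segment_id): a new uuid flushes the previous group, a gap in segment ids invalidates the group, and the last group is now flushed after the loop as well. A standalone sketch of that merge, assuming the rows arrive already sorted:

package main

import (
	"bytes"
	"fmt"
)

// row mirrors one (uuid, segment_id, data) record returned in order by
// SELECT ... ORDER BY uuid, segment_id.
type row struct {
	uuid      []byte
	segmentID uint64
	data      []byte
}

// merge concatenates contiguous segments per uuid and drops any uuid whose
// segment ids have a gap; the final group is flushed after the loop.
func merge(rows []row) [][]byte {
	var (
		out       [][]byte
		cur       []byte
		lastUUID  []byte
		invalid   bool
		nextSegID uint64
	)
	for _, r := range rows {
		if !bytes.Equal(r.uuid, lastUUID) {
			if !invalid && len(cur) > 0 {
				out = append(out, cur)
			}
			cur, invalid, nextSegID, lastUUID = nil, false, 0, r.uuid
		}
		if invalid {
			continue
		}
		if r.segmentID != nextSegID {
			invalid = true // missing segment: discard this uuid's data
			continue
		}
		cur = append(cur, r.data...)
		nextSegID++
	}
	if !invalid && len(cur) > 0 {
		out = append(out, cur) // don't lose the last uuid's group
	}
	return out
}

func main() {
	rows := []row{
		{uuid: []byte("a"), segmentID: 0, data: []byte("he")},
		{uuid: []byte("a"), segmentID: 1, data: []byte("llo")},
		{uuid: []byte("b"), segmentID: 1, data: []byte("gap")}, // segment 0 missing, dropped
	}
	for _, d := range merge(rows) {
		fmt.Println(string(d)) // hello
	}
}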
@@ -170,17 +172,18 @@ func NewMgr( return nil, errors.Trace(err) } if checkRequirements { - var checker version.VerChecker + var versionErr error switch versionCheckerType { case NormalVersionChecker: - checker = version.CheckVersionForBR + versionErr = version.CheckClusterVersion(ctx, controller.GetPDClient(), version.CheckVersionForBR) case StreamVersionChecker: - checker = version.CheckVersionForBRPiTR + versionErr = version.CheckClusterVersion(ctx, controller.GetPDClient(), version.CheckVersionForBRPiTR) + case NoVersionChecker: + versionErr = nil default: return nil, errors.Errorf("unknown command type, comman code is %d", versionCheckerType) } - err = version.CheckClusterVersion(ctx, controller.GetPDClient(), checker) - if err != nil { + if versionErr != nil { return nil, errors.Annotate(err, "running BR in incompatible version of cluster, "+ "if you believe it's OK, use --check-requirements=false to skip.") } diff --git a/br/pkg/restore/log_client/client.go b/br/pkg/restore/log_client/client.go index a7d5fabaa86a2..2959635bb9c5c 100644 --- a/br/pkg/restore/log_client/client.go +++ b/br/pkg/restore/log_client/client.go @@ -275,7 +275,7 @@ func (rc *LogClient) InitCheckpointMetadataForLogRestore(ctx context.Context, gc zap.Uint64("start-ts", rc.startTS), zap.Uint64("restored-ts", rc.restoreTS), zap.Uint64("rewrite-ts", rc.currentTS), zap.String("gc-ratio", gcRatio), zap.Int("tiflash-item-count", len(items))) if err := checkpoint.SaveCheckpointMetadataForLogRestore(ctx, rc.se, &checkpoint.CheckpointMetadataForLogRestore{ - UpstreamClusterID: rc.UpstreamClusterID, + UpstreamClusterID: rc.upstreamClusterID, RestoredTS: rc.restoreTS, StartTS: rc.startTS, RewriteTS: rc.currentTS, @@ -1285,12 +1285,11 @@ const ( func (rc *LogClient) generateRepairIngestIndexSQLs( ctx context.Context, ingestRecorder *ingestrec.IngestRecorder, - taskName string, ) ([]checkpoint.CheckpointIngestIndexRepairSQL, bool, error) { var sqls []checkpoint.CheckpointIngestIndexRepairSQL if rc.useCheckpoint { if checkpoint.ExistsCheckpointIngestIndexRepairSQLs(ctx, rc.dom) { - checkpointSQLs, err := checkpoint.LoadCheckpointIngestIndexRepairSQLs(ctx, rc.storage, taskName) + checkpointSQLs, err := checkpoint.LoadCheckpointIngestIndexRepairSQLs(ctx, rc.se.GetSessionCtx().GetRestrictedSQLExecutor()) if err != nil { return sqls, false, errors.Trace(err) } @@ -1365,8 +1364,8 @@ func (rc *LogClient) generateRepairIngestIndexSQLs( } // RepairIngestIndex drops the indexes from IngestRecorder and re-add them. 
-func (rc *LogClient) RepairIngestIndex(ctx context.Context, ingestRecorder *ingestrec.IngestRecorder, g glue.Glue, taskName string) error { - sqls, fromCheckpoint, err := rc.generateRepairIngestIndexSQLs(ctx, ingestRecorder, taskName) +func (rc *LogClient) RepairIngestIndex(ctx context.Context, ingestRecorder *ingestrec.IngestRecorder, g glue.Glue) error { + sqls, fromCheckpoint, err := rc.generateRepairIngestIndexSQLs(ctx, ingestRecorder) if err != nil { return errors.Trace(err) } diff --git a/br/pkg/task/restore.go b/br/pkg/task/restore.go index 4284e8214f160..beb7156021b48 100644 --- a/br/pkg/task/restore.go +++ b/br/pkg/task/restore.go @@ -254,12 +254,9 @@ type RestoreConfig struct { PitrBatchSize uint32 `json:"pitr-batch-size" toml:"pitr-batch-size"` PitrConcurrency uint32 `json:"-" toml:"-"` - UseCheckpoint bool `json:"use-checkpoint" toml:"use-checkpoint"` - checkpointSnapshotRestoreTaskName string `json:"-" toml:"-"` - checkpointLogRestoreTaskName string `json:"-" toml:"-"` - checkpointTaskInfoClusterID uint64 `json:"-" toml:"-"` - upstreamClusterID uint64 `json:"-" toml:"-"` - WaitTiflashReady bool `json:"wait-tiflash-ready" toml:"wait-tiflash-ready"` + UseCheckpoint bool `json:"use-checkpoint" toml:"use-checkpoint"` + upstreamClusterID uint64 `json:"-" toml:"-"` + WaitTiflashReady bool `json:"wait-tiflash-ready" toml:"wait-tiflash-ready"` // for ebs-based restore FullBackupType FullBackupType `json:"full-backup-type" toml:"full-backup-type"` @@ -531,19 +528,6 @@ func (cfg *RestoreConfig) adjustRestoreConfigForStreamRestore() { cfg.Config.Concurrency = cfg.PitrConcurrency } -// generateLogRestoreTaskName generates the log restore taskName for checkpoint -func (cfg *RestoreConfig) generateLogRestoreTaskName(clusterID, startTS, restoreTs uint64) string { - cfg.checkpointTaskInfoClusterID = clusterID - cfg.checkpointLogRestoreTaskName = fmt.Sprintf("%d/%d.%d", clusterID, startTS, restoreTs) - return cfg.checkpointLogRestoreTaskName -} - -// generateSnapshotRestoreTaskName generates the snapshot restore taskName for checkpoint -func (cfg *RestoreConfig) generateSnapshotRestoreTaskName(clusterID uint64) string { - cfg.checkpointSnapshotRestoreTaskName = fmt.Sprint(clusterID) - return cfg.checkpointSnapshotRestoreTaskName -} - func configureRestoreClient(ctx context.Context, client *snapclient.SnapClient, cfg *RestoreConfig) error { client.SetRateLimit(cfg.RateLimit) client.SetCrypter(&cfg.CipherInfo) @@ -656,22 +640,6 @@ func registerTaskToPD(ctx context.Context, etcdCLI *clientv3.Client) (closeF fun return register.Close, errors.Trace(err) } -func removeCheckpointDataForSnapshotRestore(ctx context.Context, storageName string, taskName string, config *Config) error { - _, s, err := GetStorage(ctx, storageName, config) - if err != nil { - return errors.Trace(err) - } - return errors.Trace(checkpoint.RemoveCheckpointDataForRestore(ctx, s, taskName)) -} - -func removeCheckpointDataForLogRestore(ctx context.Context, storageName string, taskName string, clusterID uint64, config *Config) error { - _, s, err := GetStorage(ctx, storageName, config) - if err != nil { - return errors.Trace(err) - } - return errors.Trace(checkpoint.RemoveCheckpointDataForLogRestore(ctx, s, taskName, clusterID)) -} - func DefaultRestoreConfig() RestoreConfig { fs := pflag.NewFlagSet("dummy", pflag.ContinueOnError) DefineCommonFlags(fs) @@ -715,43 +683,57 @@ func RunRestore(c context.Context, g glue.Glue, cmdName string, cfg *RestoreConf conf.KeyspaceName = cfg.KeyspaceName }) + // TODO: remove version checker 
from `NewMgr` + mgr, err := NewMgr(c, g, cfg.PD, cfg.TLS, GetKeepalive(&cfg.Config), cfg.CheckRequirements, true, conn.NormalVersionChecker) + if err != nil { + return errors.Trace(err) + } + defer mgr.Close() + var restoreError error if IsStreamRestore(cmdName) { - restoreError = RunStreamRestore(c, g, cmdName, cfg) + if err := version.CheckClusterVersion(c, mgr.GetPDClient(), version.CheckVersionForBRPiTR); err != nil { + return errors.Trace(err) + } + restoreError = RunStreamRestore(c, mgr, g, cmdName, cfg) } else { - restoreError = runRestore(c, g, cmdName, cfg, nil) + if err := version.CheckClusterVersion(c, mgr.GetPDClient(), version.CheckVersionForBR); err != nil { + return errors.Trace(err) + } + restoreError = runRestore(c, mgr, g, cmdName, cfg, nil) } if restoreError != nil { return errors.Trace(restoreError) } // Clear the checkpoint data if cfg.UseCheckpoint { - if len(cfg.checkpointLogRestoreTaskName) > 0 { - log.Info("start to remove checkpoint data for log restore") - err = removeCheckpointDataForLogRestore(c, cfg.Config.Storage, cfg.checkpointLogRestoreTaskName, cfg.checkpointTaskInfoClusterID, &cfg.Config) - if err != nil { - log.Warn("failed to remove checkpoint data for log restore", zap.Error(err)) - } - } - if len(cfg.checkpointSnapshotRestoreTaskName) > 0 { - log.Info("start to remove checkpoint data for snapshot restore.") - var storage string + se, err := g.CreateSession(mgr.GetStorage()) + if err != nil { + log.Warn("failed to remove checkpoint data", zap.Error(err)) + } else { if IsStreamRestore(cmdName) { - storage = cfg.FullBackupStorage + log.Info("start to remove checkpoint data for PITR restore") + err = checkpoint.RemoveCheckpointDataForLogRestore(c, mgr.GetDomain(), se) + if err != nil { + log.Warn("failed to remove checkpoint data for log restore", zap.Error(err)) + } + err = checkpoint.RemoveCheckpointDataForSnapshotRestore(c, mgr.GetDomain(), se) + if err != nil { + log.Warn("failed to remove checkpoint data for snapshot restore", zap.Error(err)) + } } else { - storage = cfg.Config.Storage - } - err = removeCheckpointDataForSnapshotRestore(c, storage, cfg.checkpointSnapshotRestoreTaskName, &cfg.Config) - if err != nil { - log.Warn("failed to remove checkpoint data for snapshot restore", zap.Error(err)) + err = checkpoint.RemoveCheckpointDataForSnapshotRestore(c, mgr.GetDomain(), se) + if err != nil { + log.Warn("failed to remove checkpoint data for snapshot restore", zap.Error(err)) + } } + log.Info("all the checkpoint data is removed.") } - log.Info("all the checkpoint data is removed.") } return nil } -func runRestore(c context.Context, g glue.Glue, cmdName string, cfg *RestoreConfig, checkInfo *PiTRTaskInfo) error { +func runRestore(c context.Context, mgr *conn.Mgr, g glue.Glue, cmdName string, cfg *RestoreConfig, checkInfo *PiTRTaskInfo) error { cfg.Adjust() defer summary.Summary(cmdName) ctx, cancel := context.WithCancel(c) @@ -763,14 +745,6 @@ func runRestore(c context.Context, g glue.Glue, cmdName string, cfg *RestoreConf ctx = opentracing.ContextWithSpan(ctx, span1) } - // Restore needs domain to do DDL. - needDomain := true - keepaliveCfg := GetKeepalive(&cfg.Config) - mgr, err := NewMgr(ctx, g, cfg.PD, cfg.TLS, keepaliveCfg, cfg.CheckRequirements, needDomain, conn.NormalVersionChecker) - if err != nil { - return errors.Trace(err) - } - defer mgr.Close() codec := mgr.GetStorage().GetCodec() // need retrieve these configs from tikv if not set in command. 
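With the checkpoint data living in cluster tables, cleanup after a successful restore goes through a SQL session rather than external storage: a PiTR run drops both the log-restore and the snapshot-restore checkpoint databases, a plain snapshot restore only the latter, and failures only produce warnings. A rough sketch of that ordering, with hypothetical stand-ins for the remove helpers:

package main

import (
	"fmt"
	"log"
)

// removeLogRestoreCheckpoint / removeSnapshotRestoreCheckpoint are hypothetical
// stand-ins for checkpoint.RemoveCheckpointDataFor{Log,Snapshot}Restore, which
// drop the corresponding checkpoint databases through a SQL session.
func removeLogRestoreCheckpoint() error {
	fmt.Println("drop log restore checkpoint database")
	return nil
}

func removeSnapshotRestoreCheckpoint() error {
	fmt.Println("drop snapshot restore checkpoint database")
	return nil
}

// cleanupCheckpoints mirrors the post-restore cleanup order: a PiTR run removes
// both checkpoint databases, a plain snapshot restore only its own, and errors
// are logged as warnings instead of failing the already-finished restore.
func cleanupCheckpoints(isPiTR bool) {
	if isPiTR {
		if err := removeLogRestoreCheckpoint(); err != nil {
			log.Printf("failed to remove checkpoint data for log restore: %v", err)
		}
	}
	if err := removeSnapshotRestoreCheckpoint(); err != nil {
		log.Printf("failed to remove checkpoint data for snapshot restore: %v", err)
	}
}

func main() {
	cleanupCheckpoints(true)
}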
@@ -785,11 +759,12 @@ func runRestore(c context.Context, g glue.Glue, cmdName string, cfg *RestoreConf httpCli := httputil.NewClient(mgr.GetTLSConfig()) mgr.ProcessTiKVConfigs(ctx, kvConfigs, httpCli) + keepaliveCfg := GetKeepalive(&cfg.Config) keepaliveCfg.PermitWithoutStream = true client := snapclient.NewRestoreClient(mgr.GetPDClient(), mgr.GetPDHTTPClient(), mgr.GetTLSConfig(), keepaliveCfg) // using tikv config to set the concurrency-per-store for client. client.SetConcurrencyPerStore(kvConfigs.ImportGoroutines.Value) - err = configureRestoreClient(ctx, client, cfg) + err := configureRestoreClient(ctx, client, cfg) if err != nil { return errors.Trace(err) } @@ -887,7 +862,6 @@ func runRestore(c context.Context, g glue.Glue, cmdName string, cfg *RestoreConf var checkpointFirstRun bool = true if cfg.UseCheckpoint { - _ = cfg.generateSnapshotRestoreTaskName(client.GetClusterID(ctx)) // if the checkpoint metadata exists in the checkpoint storage, the restore is not // for the first time. existsCheckpointMetadata := checkpoint.ExistsSnapshotRestoreCheckpoint(ctx, mgr.GetDomain()) diff --git a/br/pkg/task/stream.go b/br/pkg/task/stream.go index ac9623c332cfa..17a1277a9591f 100644 --- a/br/pkg/task/stream.go +++ b/br/pkg/task/stream.go @@ -1111,6 +1111,7 @@ func checkIncompatibleChangefeed(ctx context.Context, backupTS uint64, etcdCLI * // RunStreamRestore restores stream log. func RunStreamRestore( c context.Context, + mgr *conn.Mgr, g glue.Glue, cmdName string, cfg *RestoreConfig, @@ -1163,7 +1164,7 @@ func RunStreamRestore( return errors.Trace(err) } - checkInfo, err := checkPiTRTaskInfo(ctx, g, s, cfg) + checkInfo, err := checkPiTRTaskInfo(ctx, mgr, g, s, cfg) if err != nil { return errors.Trace(err) } @@ -1179,7 +1180,7 @@ func RunStreamRestore( logStorage := cfg.Config.Storage cfg.Config.Storage = cfg.FullBackupStorage // TiFlash replica is restored to down-stream on 'pitr' currently. - if err = runRestore(ctx, g, FullRestoreCmd, cfg, checkInfo); err != nil { + if err = runRestore(ctx, mgr, g, FullRestoreCmd, cfg, checkInfo); err != nil { return errors.Trace(err) } cfg.Config.Storage = logStorage @@ -1198,7 +1199,7 @@ func RunStreamRestore( } // restore log. 
cfg.adjustRestoreConfigForStreamRestore() - if err := restoreStream(ctx, g, cfg, checkInfo.CheckpointInfo); err != nil { + if err := restoreStream(ctx, mgr, g, cfg, checkInfo.CheckpointInfo); err != nil { return errors.Trace(err) } return nil @@ -1207,6 +1208,7 @@ func RunStreamRestore( // RunStreamRestore start restore job func restoreStream( c context.Context, + mgr *conn.Mgr, g glue.Glue, cfg *RestoreConfig, taskInfo *checkpoint.CheckpointTaskInfoForLogRestore, @@ -1253,13 +1255,6 @@ func restoreStream( ctx = opentracing.ContextWithSpan(ctx, span1) } - mgr, err := NewMgr(ctx, g, cfg.PD, cfg.TLS, GetKeepalive(&cfg.Config), - cfg.CheckRequirements, true, conn.StreamVersionChecker) - if err != nil { - return errors.Trace(err) - } - defer mgr.Close() - client, err := createRestoreClient(ctx, g, cfg, mgr) if err != nil { return errors.Annotate(err, "failed to create restore client") @@ -1316,10 +1311,8 @@ func restoreStream( log.Info("finish restoring gc") }() - var taskName string var checkpointRunner *checkpoint.CheckpointRunner[checkpoint.LogRestoreKeyType, checkpoint.LogRestoreValueType] if cfg.UseCheckpoint { - taskName = cfg.generateLogRestoreTaskName(client.GetClusterID(ctx), cfg.StartTS, cfg.RestoreTS) oldRatioFromCheckpoint, err := client.InitCheckpointMetadataForLogRestore(ctx, oldRatio, cfg.tiflashRecorder) if err != nil { return errors.Trace(err) @@ -1447,7 +1440,7 @@ func restoreStream( return errors.Annotate(err, "failed to insert rows into gc_delete_range") } - if err = client.RepairIngestIndex(ctx, ingestRecorder, g, taskName); err != nil { + if err = client.RepairIngestIndex(ctx, ingestRecorder, g); err != nil { return errors.Annotate(err, "failed to repair ingest index") } @@ -1754,6 +1747,7 @@ type PiTRTaskInfo struct { func checkPiTRTaskInfo( ctx context.Context, + mgr *conn.Mgr, g glue.Glue, s storage.ExternalStorage, cfg *RestoreConfig, @@ -1764,14 +1758,6 @@ func checkPiTRTaskInfo( ) checkInfo := &PiTRTaskInfo{} - mgr, err := NewMgr(ctx, g, cfg.PD, cfg.TLS, GetKeepalive(&cfg.Config), - cfg.CheckRequirements, true, conn.StreamVersionChecker) - if err != nil { - return checkInfo, errors.Trace(err) - } - defer mgr.Close() - - clusterID := mgr.GetPDClient().GetClusterID(ctx) if cfg.UseCheckpoint { se, err := g.CreateSession(mgr.GetStorage()) if err != nil { @@ -1788,12 +1774,12 @@ func checkPiTRTaskInfo( // TODO: check whether user has manually modified the cluster(ddl). If so, regard the behavior // as restore from scratch. (update `curTaskInfo.RewriteTs` to 0 as an uninitial value) - if curTaskInfo.Metadata.UpstreamClusterID != cfg.UpstreamClusterID { + if curTaskInfo.Metadata.UpstreamClusterID != cfg.upstreamClusterID { return checkInfo, errors.Errorf( "The upstream cluster id[%d] of the current log restore does not match that[%d] recorded in checkpoint. "+ "Perhaps you should specify the last log backup storage instead, "+ "or just clean the checkpoint database[%s] if the cluster has been cleaned up.", - cfg.UpstreamClusterId, curTaskInfo.Metadata.UpstreamClusterID, checkpoint.LogRestoreCheckpointDatabaseName) + cfg.upstreamClusterID, curTaskInfo.Metadata.UpstreamClusterID, checkpoint.LogRestoreCheckpointDatabaseName) } if curTaskInfo.Metadata.StartTS != cfg.StartTS || curTaskInfo.Metadata.RestoredTS != cfg.RestoreTS { @@ -1810,10 +1796,6 @@ func checkPiTRTaskInfo( log.Info("the same task, skip snapshot restore") // the same task, skip full restore because it is already in the log restore stage. 
doFullRestore = false - // update the snapshot restore task name to clean up in final - if !doFullRestore && (len(cfg.FullBackupStorage) > 0) { - _ = cfg.generateSnapshotRestoreTaskName(clusterID) - } } } checkInfo.CheckpointInfo = curTaskInfo From b6fac6ffb8dfe511a3b7447f3c09ae023cb84ecc Mon Sep 17 00:00:00 2001 From: Jianjun Liao Date: Sat, 14 Sep 2024 15:34:23 +0800 Subject: [PATCH 07/15] bazel prepare Signed-off-by: Jianjun Liao --- br/pkg/checkpoint/BUILD.bazel | 4 +++- br/pkg/checkpoint/log_restore.go | 15 ++++++++++----- br/pkg/checkpoint/restore.go | 15 ++++++++++----- br/pkg/checkpoint/storage.go | 27 ++++++++++++++++++++------- br/pkg/restore/log_client/client.go | 13 +++++++++---- br/pkg/task/stream.go | 2 +- br/tests/br_other/run.sh | 2 +- 7 files changed, 54 insertions(+), 24 deletions(-) diff --git a/br/pkg/checkpoint/BUILD.bazel b/br/pkg/checkpoint/BUILD.bazel index dac194013a72b..22c5604bbac27 100644 --- a/br/pkg/checkpoint/BUILD.bazel +++ b/br/pkg/checkpoint/BUILD.bazel @@ -44,11 +44,13 @@ go_test( srcs = ["checkpoint_test.go"], flaky = True, race = "on", - shard_count = 5, + shard_count = 6, deps = [ ":checkpoint", + "//br/pkg/gluetidb", "//br/pkg/pdutil", "//br/pkg/storage", + "//br/pkg/utiltest", "//pkg/meta/model", "//pkg/parser/model", "@com_github_pingcap_kvproto//pkg/brpb", diff --git a/br/pkg/checkpoint/log_restore.go b/br/pkg/checkpoint/log_restore.go index 8ba6d906bbc2e..98afc436deb68 100644 --- a/br/pkg/checkpoint/log_restore.go +++ b/br/pkg/checkpoint/log_restore.go @@ -113,7 +113,8 @@ func StartCheckpointLogRestoreRunnerForTest( tick time.Duration, ) (*CheckpointRunner[LogRestoreKeyType, LogRestoreValueType], error) { runner := newCheckpointRunner[LogRestoreKeyType, LogRestoreValueType]( - ctx, newTableCheckpointStorage(se, LogRestoreCheckpointDatabaseName), nil, nil, nil, flushPosition{}, valueMarshalerForLogRestore) + ctx, newTableCheckpointStorage(se, LogRestoreCheckpointDatabaseName), + nil, nil, nil, flushPosition{}, valueMarshalerForLogRestore) runner.startCheckpointMainLoop(ctx, tick, tick, 0) return runner, nil @@ -124,7 +125,8 @@ func StartCheckpointRunnerForLogRestore( se glue.Session, ) (*CheckpointRunner[LogRestoreKeyType, LogRestoreValueType], error) { runner := newCheckpointRunner[LogRestoreKeyType, LogRestoreValueType]( - ctx, newTableCheckpointStorage(se, LogRestoreCheckpointDatabaseName), nil, nil, nil, flushPosition{}, valueMarshalerForLogRestore) + ctx, newTableCheckpointStorage(se, LogRestoreCheckpointDatabaseName), + nil, nil, nil, flushPosition{}, valueMarshalerForLogRestore) // for restore, no need to set lock runner.startCheckpointMainLoop(ctx, defaultTickDurationForFlush, defaultTckDurationForChecksum, 0) @@ -196,7 +198,8 @@ func ExistsLogRestoreCheckpointMetadata( ctx context.Context, dom *domain.Domain, ) bool { - return dom.InfoSchema().TableExists(pmodel.NewCIStr(LogRestoreCheckpointDatabaseName), pmodel.NewCIStr(checkpointMetaTableName)) + return dom.InfoSchema(). + TableExists(pmodel.NewCIStr(LogRestoreCheckpointDatabaseName), pmodel.NewCIStr(checkpointMetaTableName)) } // A progress type for snapshot + log restore. @@ -255,7 +258,8 @@ func ExistsCheckpointProgress( ctx context.Context, dom *domain.Domain, ) bool { - return dom.InfoSchema().TableExists(pmodel.NewCIStr(LogRestoreCheckpointDatabaseName), pmodel.NewCIStr(checkpointProgressTableName)) + return dom.InfoSchema(). 
+ TableExists(pmodel.NewCIStr(LogRestoreCheckpointDatabaseName), pmodel.NewCIStr(checkpointProgressTableName)) } // CheckpointTaskInfo is unique information within the same cluster id. It represents the last @@ -321,7 +325,8 @@ func LoadCheckpointIngestIndexRepairSQLs( } func ExistsCheckpointIngestIndexRepairSQLs(ctx context.Context, dom *domain.Domain) bool { - return dom.InfoSchema().TableExists(pmodel.NewCIStr(LogRestoreCheckpointDatabaseName), pmodel.NewCIStr(checkpointIngestTableName)) + return dom.InfoSchema(). + TableExists(pmodel.NewCIStr(LogRestoreCheckpointDatabaseName), pmodel.NewCIStr(checkpointIngestTableName)) } func SaveCheckpointIngestIndexRepairSQLs( diff --git a/br/pkg/checkpoint/restore.go b/br/pkg/checkpoint/restore.go index 421b40aa50f29..12f1aeae25437 100644 --- a/br/pkg/checkpoint/restore.go +++ b/br/pkg/checkpoint/restore.go @@ -48,7 +48,8 @@ func StartCheckpointRestoreRunnerForTest( tick time.Duration, ) (*CheckpointRunner[RestoreKeyType, RestoreValueType], error) { runner := newCheckpointRunner[RestoreKeyType, RestoreValueType]( - ctx, newTableCheckpointStorage(se, SnapshotRestoreCheckpointDatabaseName), nil, nil, nil, flushPosition{}, valueMarshalerForRestore) + ctx, newTableCheckpointStorage(se, SnapshotRestoreCheckpointDatabaseName), + nil, nil, nil, flushPosition{}, valueMarshalerForRestore) runner.startCheckpointMainLoop(ctx, tick, tick, 0) return runner, nil @@ -59,7 +60,8 @@ func StartCheckpointRunnerForRestore( se glue.Session, ) (*CheckpointRunner[RestoreKeyType, RestoreValueType], error) { runner := newCheckpointRunner[RestoreKeyType, RestoreValueType]( - ctx, newTableCheckpointStorage(se, SnapshotRestoreCheckpointDatabaseName), nil, nil, nil, flushPosition{}, valueMarshalerForRestore) + ctx, newTableCheckpointStorage(se, SnapshotRestoreCheckpointDatabaseName), + nil, nil, nil, flushPosition{}, valueMarshalerForRestore) // for restore, no need to set lock runner.startCheckpointMainLoop(ctx, defaultTickDurationForFlush, defaultTckDurationForChecksum, 0) @@ -117,7 +119,8 @@ func SaveCheckpointMetadataForSnapshotRestore( se glue.Session, meta *CheckpointMetadataForSnapshotRestore, ) error { - err := initCheckpointTable(ctx, se, SnapshotRestoreCheckpointDatabaseName, []string{checkpointDataTableName, checkpointChecksumTableName}) + err := initCheckpointTable(ctx, se, SnapshotRestoreCheckpointDatabaseName, + []string{checkpointDataTableName, checkpointChecksumTableName}) if err != nil { return errors.Trace(err) } @@ -128,9 +131,11 @@ func ExistsSnapshotRestoreCheckpoint( ctx context.Context, dom *domain.Domain, ) bool { - return dom.InfoSchema().TableExists(pmodel.NewCIStr(SnapshotRestoreCheckpointDatabaseName), pmodel.NewCIStr(checkpointMetaTableName)) + return dom.InfoSchema(). 
+ TableExists(pmodel.NewCIStr(SnapshotRestoreCheckpointDatabaseName), pmodel.NewCIStr(checkpointMetaTableName)) } func RemoveCheckpointDataForSnapshotRestore(ctx context.Context, dom *domain.Domain, se glue.Session) error { - return dropCheckpointTables(ctx, dom, se, SnapshotRestoreCheckpointDatabaseName, []string{checkpointDataTableName, checkpointChecksumTableName, checkpointMetaTableName}) + return dropCheckpointTables(ctx, dom, se, SnapshotRestoreCheckpointDatabaseName, + []string{checkpointDataTableName, checkpointChecksumTableName, checkpointMetaTableName}) } diff --git a/br/pkg/checkpoint/storage.go b/br/pkg/checkpoint/storage.go index 3281b7e5e0941..8a18288a3eaf2 100644 --- a/br/pkg/checkpoint/storage.go +++ b/br/pkg/checkpoint/storage.go @@ -167,7 +167,7 @@ func mergeSelectCheckpoint( var ( retData [][]byte = make([][]byte, 0, len(rows)) - rowData []byte = nil + rowData []byte = make([]byte, 0) lastUUID []byte = nil lastUUIDInvalid bool = false nextSegmentID uint64 = 0 @@ -182,7 +182,7 @@ func mergeSelectCheckpoint( if !lastUUIDInvalid && len(rowData) > 0 { retData = append(retData, rowData) } - rowData = nil + rowData = make([]byte, 0) lastUUIDInvalid = false nextSegmentID = 0 lastUUID = uuid @@ -272,7 +272,12 @@ func insertCheckpointMeta[T any](ctx context.Context, se glue.Session, dbName st return errors.Trace(err) } -func selectCheckpointMeta[T any](ctx context.Context, execCtx sqlexec.RestrictedSQLExecutor, dbName string, tableName string, meta *T) error { +func selectCheckpointMeta[T any]( + ctx context.Context, + execCtx sqlexec.RestrictedSQLExecutor, + dbName string, tableName string, + meta *T, +) error { rows, _, errSQL := execCtx.ExecRestrictedSQL( kv.WithInternalSourceType(ctx, kv.InternalTxnBR), nil, @@ -283,14 +288,16 @@ func selectCheckpointMeta[T any](ctx context.Context, execCtx sqlexec.Restricted return errors.Annotatef(errSQL, "failed to get checkpoint metadata from table %s.%s", dbName, tableName) } if len(rows) == 0 { - return errors.Errorf("get the empty checkpoint meta, the checkpoint is incomplete from table %s.%s", dbName, tableName) + return errors.Errorf( + "get the empty checkpoint meta, the checkpoint is incomplete from table %s.%s", dbName, tableName) } data := make([]byte, 0, len(rows)*CheckpointIdMapBlockSize) for i, row := range rows { segmentId, chunk := row.GetUint64(0), row.GetBytes(1) if uint64(i) != segmentId { - return errors.Errorf("the checkpoint metadata is incomplete from table %s.%s at segment %d", dbName, tableName, segmentId) + return errors.Errorf( + "the checkpoint metadata is incomplete from table %s.%s at segment %d", dbName, tableName, segmentId) } data = append(data, chunk...) 
} @@ -298,7 +305,12 @@ func selectCheckpointMeta[T any](ctx context.Context, execCtx sqlexec.Restricted return errors.Trace(err) } -func dropCheckpointTables(ctx context.Context, dom *domain.Domain, se glue.Session, dbName string, tableNames []string) error { +func dropCheckpointTables( + ctx context.Context, + dom *domain.Domain, + se glue.Session, + dbName string, tableNames []string, +) error { for _, tableName := range tableNames { if err := se.ExecuteInternal(ctx, "DROP TABLE IF EXISTS %n.%n;", dbName, tableName); err != nil { return errors.Trace(err) @@ -310,7 +322,8 @@ func dropCheckpointTables(ctx context.Context, dom *domain.Domain, se glue.Sessi return errors.Trace(err) } if len(tables) > 0 { - log.Warn("user tables in the checkpoint database, skip drop the database", zap.String("db", dbName), zap.String("table", tables[0].Name.L)) + log.Warn("user tables in the checkpoint database, skip drop the database", + zap.String("db", dbName), zap.String("table", tables[0].Name.L)) return nil } if err := se.ExecuteInternal(ctx, "DROP DATABASE %n;", dbName); err != nil { diff --git a/br/pkg/restore/log_client/client.go b/br/pkg/restore/log_client/client.go index 2959635bb9c5c..fe7c2cdfc25bf 100644 --- a/br/pkg/restore/log_client/client.go +++ b/br/pkg/restore/log_client/client.go @@ -250,7 +250,12 @@ func (rc *LogClient) InitClients(ctx context.Context, backend *backuppb.StorageB rc.fileImporter = NewLogFileImporter(metaClient, importCli, backend) } -func (rc *LogClient) InitCheckpointMetadataForLogRestore(ctx context.Context, gcRatio string, tiflashRecorder *tiflashrec.TiFlashRecorder) (string, error) { +func (rc *LogClient) InitCheckpointMetadataForLogRestore( + ctx context.Context, + startTS, restoredTS uint64, + gcRatio string, + tiflashRecorder *tiflashrec.TiFlashRecorder, +) (string, error) { rc.useCheckpoint = true // if the checkpoint metadata exists in the external storage, the restore is not @@ -272,12 +277,12 @@ func (rc *LogClient) InitCheckpointMetadataForLogRestore(ctx context.Context, gc items = tiflashRecorder.GetItems() } log.Info("save gc ratio into checkpoint metadata", - zap.Uint64("start-ts", rc.startTS), zap.Uint64("restored-ts", rc.restoreTS), zap.Uint64("rewrite-ts", rc.currentTS), + zap.Uint64("start-ts", startTS), zap.Uint64("restored-ts", restoredTS), zap.Uint64("rewrite-ts", rc.currentTS), zap.String("gc-ratio", gcRatio), zap.Int("tiflash-item-count", len(items))) if err := checkpoint.SaveCheckpointMetadataForLogRestore(ctx, rc.se, &checkpoint.CheckpointMetadataForLogRestore{ UpstreamClusterID: rc.upstreamClusterID, - RestoredTS: rc.restoreTS, - StartTS: rc.startTS, + RestoredTS: restoredTS, + StartTS: startTS, RewriteTS: rc.currentTS, GcRatio: gcRatio, TiFlashItems: items, diff --git a/br/pkg/task/stream.go b/br/pkg/task/stream.go index 17a1277a9591f..9329341cd6258 100644 --- a/br/pkg/task/stream.go +++ b/br/pkg/task/stream.go @@ -1313,7 +1313,7 @@ func restoreStream( var checkpointRunner *checkpoint.CheckpointRunner[checkpoint.LogRestoreKeyType, checkpoint.LogRestoreValueType] if cfg.UseCheckpoint { - oldRatioFromCheckpoint, err := client.InitCheckpointMetadataForLogRestore(ctx, oldRatio, cfg.tiflashRecorder) + oldRatioFromCheckpoint, err := client.InitCheckpointMetadataForLogRestore(ctx, cfg.StartTS, cfg.RestoreTS, oldRatio, cfg.tiflashRecorder) if err != nil { return errors.Trace(err) } diff --git a/br/tests/br_other/run.sh b/br/tests/br_other/run.sh index 0e60c816c5318..71f937793bcbf 100644 --- a/br/tests/br_other/run.sh +++ b/br/tests/br_other/run.sh @@ 
-163,7 +163,7 @@ done # check is there still exists scheduler in pause. -pause_schedulers=$(curl https://$PD_ADDR/pd/api/v1/schedulers?status="paused" | grep "scheduler" | wc -l) +pause_schedulers=$(run_curl https://$PD_ADDR/pd/api/v1/schedulers?status="paused" | grep "scheduler" | wc -l) # There shouldn't be any paused schedulers since BR gracfully shutdown. if [ "$pause_schedulers" -ne "0" ];then echo "TEST: [$TEST_NAME] failed because paused scheduler has changed" From 681bb2534c00222f036a0496eca3a55cc897a821 Mon Sep 17 00:00:00 2001 From: Jianjun Liao Date: Sat, 14 Sep 2024 17:13:27 +0800 Subject: [PATCH 08/15] bazel prepare Signed-off-by: Jianjun Liao --- br/pkg/checkpoint/storage.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/br/pkg/checkpoint/storage.go b/br/pkg/checkpoint/storage.go index 8a18288a3eaf2..31fbc87e346e6 100644 --- a/br/pkg/checkpoint/storage.go +++ b/br/pkg/checkpoint/storage.go @@ -167,7 +167,7 @@ func mergeSelectCheckpoint( var ( retData [][]byte = make([][]byte, 0, len(rows)) - rowData []byte = make([]byte, 0) + rowData []byte = make([]byte, 0, CheckpointIdMapBlockSize) lastUUID []byte = nil lastUUIDInvalid bool = false nextSegmentID uint64 = 0 @@ -182,7 +182,7 @@ func mergeSelectCheckpoint( if !lastUUIDInvalid && len(rowData) > 0 { retData = append(retData, rowData) } - rowData = make([]byte, 0) + rowData = rowData[:0] lastUUIDInvalid = false nextSegmentID = 0 lastUUID = uuid From c6d033085699c149e3166f3a1b5cf60b199d3132 Mon Sep 17 00:00:00 2001 From: Jianjun Liao Date: Thu, 19 Sep 2024 14:58:36 +0800 Subject: [PATCH 09/15] add integration test Signed-off-by: Jianjun Liao --- br/pkg/checkpoint/checkpoint_test.go | 1 + br/pkg/checkpoint/log_restore.go | 9 +- br/pkg/checkpoint/storage.go | 4 +- br/pkg/restore/log_client/client.go | 28 +++- br/pkg/restore/snap_client/tikv_sender.go | 31 ++++- br/pkg/task/stream.go | 12 +- br/tests/br_restore_checkpoint/run.sh | 149 ++++++++++++++++++++++ br/tests/run_group_br_tests.sh | 2 +- 8 files changed, 222 insertions(+), 14 deletions(-) create mode 100644 br/tests/br_restore_checkpoint/run.sh diff --git a/br/pkg/checkpoint/checkpoint_test.go b/br/pkg/checkpoint/checkpoint_test.go index 84803453c1aa3..1904017feb47e 100644 --- a/br/pkg/checkpoint/checkpoint_test.go +++ b/br/pkg/checkpoint/checkpoint_test.go @@ -117,6 +117,7 @@ func TestCheckpointMetaForRestore(t *testing.T) { require.Equal(t, uint64(111), taskInfo.Metadata.StartTS) require.Equal(t, uint64(333), taskInfo.Metadata.RewriteTS) require.Equal(t, "1.0", taskInfo.Metadata.GcRatio) + require.Equal(t, true, taskInfo.HasSnapshotMetadata) require.Equal(t, checkpoint.InLogRestoreAndIdMapPersist, taskInfo.Progress) exists = checkpoint.ExistsCheckpointIngestIndexRepairSQLs(ctx, dom) diff --git a/br/pkg/checkpoint/log_restore.go b/br/pkg/checkpoint/log_restore.go index 98afc436deb68..155ae01766ba7 100644 --- a/br/pkg/checkpoint/log_restore.go +++ b/br/pkg/checkpoint/log_restore.go @@ -265,7 +265,8 @@ func ExistsCheckpointProgress( // CheckpointTaskInfo is unique information within the same cluster id. It represents the last // restore task executed for this cluster. 
type CheckpointTaskInfoForLogRestore struct { - Metadata *CheckpointMetadataForLogRestore + Metadata *CheckpointMetadataForLogRestore + HasSnapshotMetadata bool // the progress for this task Progress RestoreProgress } @@ -295,10 +296,12 @@ func TryToGetCheckpointTaskInfo( return nil, errors.Trace(err) } } + hasSnapshotMetadata := ExistsSnapshotRestoreCheckpoint(ctx, dom) return &CheckpointTaskInfoForLogRestore{ - Metadata: metadata, - Progress: progress, + Metadata: metadata, + HasSnapshotMetadata: hasSnapshotMetadata, + Progress: progress, }, nil } diff --git a/br/pkg/checkpoint/storage.go b/br/pkg/checkpoint/storage.go index 31fbc87e346e6..a7fd5be5f0d94 100644 --- a/br/pkg/checkpoint/storage.go +++ b/br/pkg/checkpoint/storage.go @@ -167,7 +167,7 @@ func mergeSelectCheckpoint( var ( retData [][]byte = make([][]byte, 0, len(rows)) - rowData []byte = make([]byte, 0, CheckpointIdMapBlockSize) + rowData = []byte{} lastUUID []byte = nil lastUUIDInvalid bool = false nextSegmentID uint64 = 0 @@ -182,7 +182,7 @@ func mergeSelectCheckpoint( if !lastUUIDInvalid && len(rowData) > 0 { retData = append(retData, rowData) } - rowData = rowData[:0] + rowData = make([]byte, 0) lastUUIDInvalid = false nextSegmentID = 0 lastUUID = uuid diff --git a/br/pkg/restore/log_client/client.go b/br/pkg/restore/log_client/client.go index fe7c2cdfc25bf..c764c9415722d 100644 --- a/br/pkg/restore/log_client/client.go +++ b/br/pkg/restore/log_client/client.go @@ -186,7 +186,7 @@ func (rc *LogClient) SetStorage(ctx context.Context, backend *backuppb.StorageBa func (rc *LogClient) SetCurrentTS(ts uint64) error { if ts == 0 { - return errors.Errorf("set rewrite ts to an invalid ts", zap.Uint64("ts", ts)) + return errors.Errorf("set rewrite ts to an invalid ts: %d", ts) } rc.currentTS = ts return nil @@ -1241,7 +1241,7 @@ func (rc *LogClient) WrapLogFilesIterWithSplitHelper(logIter LogIter, rules map[ func (rc *LogClient) generateKvFilesSkipMap(ctx context.Context, downstreamIdset map[int64]struct{}) (*LogFilesSkipMap, error) { skipMap := NewLogFilesSkipMap() - t, err := checkpoint.LoadCheckpointDataForSnapshotRestore( + t, err := checkpoint.LoadCheckpointDataForLogRestore( ctx, rc.se.GetSessionCtx().GetRestrictedSQLExecutor(), func(groupKey checkpoint.LogRestoreKeyType, off checkpoint.LogRestoreValueMarshaled) { for tableID, foffs := range off.Foffs { // filter out the checkpoint data of dropped table @@ -1259,6 +1259,30 @@ func (rc *LogClient) generateKvFilesSkipMap(ctx context.Context, downstreamIdset return skipMap, nil } +func WrapLogFilesIterWithCheckpointFailpoint( + v failpoint.Value, + logIter LogIter, +) (LogIter, error) { + if cmd, ok := v.(string); ok { + switch cmd { + case "corrupt-last-table-files": // skip some files and eventually return an error to make the restore fail + newLogIter := iter.FilterOut(logIter, func(d *LogDataFileInfo) bool { + return d.OffsetInMergedGroup&1 > 0 + }) + return newLogIter, errors.Errorf("skip the last table files") + case "only-last-table-files": // check whether all the files, except files skipped before, are skipped by checkpoint + newLogIter := iter.FilterOut(logIter, func(d *LogDataFileInfo) bool { + if d.OffsetInMergedGroup&1 == 0 { + log.Panic("has files but not the files skipped before") + } + return false + }) + return newLogIter, nil + } + } + return logIter, nil +} + func (rc *LogClient) WrapLogFilesIterWithCheckpoint( ctx context.Context, logIter LogIter, diff --git a/br/pkg/restore/snap_client/tikv_sender.go b/br/pkg/restore/snap_client/tikv_sender.go index 
e6e2336fdc724..a5f27a87c4050 100644 --- a/br/pkg/restore/snap_client/tikv_sender.go +++ b/br/pkg/restore/snap_client/tikv_sender.go @@ -22,6 +22,7 @@ import ( "time" "github.com/pingcap/errors" + "github.com/pingcap/failpoint" backuppb "github.com/pingcap/kvproto/pkg/brpb" "github.com/pingcap/log" "github.com/pingcap/tidb/br/pkg/checkpoint" @@ -363,16 +364,42 @@ func getFileRangeKey(f string) string { return f[:idx] } -// RestoreSSTFiles tries to restore the files. +// RestoreSSTFiles does some preparation work, such as setting the speed limit, and then restores the files. func (rc *SnapClient) RestoreSSTFiles( ctx context.Context, tableIDWithFilesGroup [][]TableIDWithFiles, updateCh glue.Progress, -) error { +) (retErr error) { if err := rc.setSpeedLimit(ctx, rc.rateLimit); err != nil { return errors.Trace(err) } + failpoint.Inject("corrupt-files", func(v failpoint.Value) { + if cmd, ok := v.(string); ok { + switch cmd { + case "corrupt-last-table-files": // skip some files and eventually return an error to make the restore fail + tableIDWithFilesGroup = tableIDWithFilesGroup[:len(tableIDWithFilesGroup)-1] + defer func() { retErr = errors.Errorf("skip the last table files") }() + case "only-last-table-files": // check whether all the files, except last table files, are skipped by checkpoint + for _, tableIDWithFiless := range tableIDWithFilesGroup[:len(tableIDWithFilesGroup)-1] { + for _, tableIDWithFiles := range tableIDWithFiless { + if len(tableIDWithFiles.Files) > 0 { + log.Panic("has files but not the last table files") + } + } + } + } + } + }) + + return rc.restoreSSTFilesInternal(ctx, tableIDWithFilesGroup, updateCh) +} + +func (rc *SnapClient) restoreSSTFilesInternal( + ctx context.Context, + tableIDWithFilesGroup [][]TableIDWithFiles, + updateCh glue.Progress, +) error { eg, ectx := errgroup.WithContext(ctx) for _, tableIDWithFiles := range tableIDWithFilesGroup { if ectx.Err() != nil { diff --git a/br/pkg/task/stream.go b/br/pkg/task/stream.go index 9329341cd6258..7be42f0d36323 100644 --- a/br/pkg/task/stream.go +++ b/br/pkg/task/stream.go @@ -1164,7 +1164,7 @@ func RunStreamRestore( return errors.Trace(err) } - checkInfo, err := checkPiTRTaskInfo(ctx, mgr, g, s, cfg) + checkInfo, err := checkPiTRTaskInfo(ctx, mgr, g, cfg) if err != nil { return errors.Trace(err) } @@ -1406,7 +1406,7 @@ func restoreStream( return errors.Trace(err) } pd := g.StartProgress(ctx, "Restore KV Files", int64(dataFileCount), !cfg.LogProgress) - err = withProgress(pd, func(p glue.Progress) error { + err = withProgress(pd, func(p glue.Progress) (pErr error) { if cfg.UseCheckpoint { updateStatsWithCheckpoint := func(kvCount, size uint64) { mu.Lock() @@ -1420,6 +1420,11 @@ func restoreStream( if err != nil { return errors.Trace(err) } + failpoint.Inject("corrupt-files", func(v failpoint.Value) { + var retErr error + logFilesIter, retErr = logclient.WrapLogFilesIterWithCheckpointFailpoint(v, logFilesIter) + defer func() { pErr = retErr }() + }) } logFilesIterWithSplit, err := client.WrapLogFilesIterWithSplitHelper(logFilesIter, rewriteRules, g, mgr.GetStorage()) if err != nil { @@ -1749,7 +1754,6 @@ func checkPiTRTaskInfo( ctx context.Context, mgr *conn.Mgr, g glue.Glue, - s storage.ExternalStorage, cfg *RestoreConfig, ) (*PiTRTaskInfo, error) { var ( @@ -1802,7 +1806,7 @@ func checkPiTRTaskInfo( checkInfo.NeedFullRestore = doFullRestore // restore full snapshot precheck.
if doFullRestore { - if !(cfg.UseCheckpoint && curTaskInfo.Metadata != nil) { + if !(cfg.UseCheckpoint && (curTaskInfo.Metadata != nil || curTaskInfo.HasSnapshotMetadata)) { // Only when use checkpoint and not the first execution, // skip checking requirements. log.Info("check pitr requirements for the first execution") diff --git a/br/tests/br_restore_checkpoint/run.sh b/br/tests/br_restore_checkpoint/run.sh new file mode 100644 index 0000000000000..76ce7a95b095f --- /dev/null +++ b/br/tests/br_restore_checkpoint/run.sh @@ -0,0 +1,149 @@ +#!/bin/bash +# +# Copyright 2024 PingCAP, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -eu +. run_services +CUR=$(cd `dirname $0`; pwd) + +# const value +PREFIX="checkpoint" # NOTICE: don't start with 'br' because `restart services` would remove file/directory br*. +DB=$TEST_NAME +res_file="$TEST_DIR/sql_res.$TEST_NAME.txt" + +# start a new cluster +echo "restart a services" +restart_services + +# prepare snapshot data +echo "prepare the data" +run_sql "CREATE DATABASE IF NOT EXISTS $DB;" +run_sql "CREATE TABLE IF NOT EXISTS $DB.tbl1 (id int, val varchar(20));" +run_sql "CREATE TABLE IF NOT EXISTS $DB.tbl2 (id int, val varchar(20));" +run_sql "INSERT INTO $DB.tbl1 values (1, 'a');" +run_sql "INSERT INTO $DB.tbl2 values (2, 'b');" + +# start the log backup task +echo "start log task" +run_br --pd $PD_ADDR log start --task-name integration_test -s "local://$TEST_DIR/$PREFIX/log" + +# run snapshot backup +echo "run snapshot backup" +run_br --pd $PD_ADDR backup db --db $DB -s "local://$TEST_DIR/$PREFIX/full" + +# prepare incremental data +echo "prepare the incremental data" +run_sql "RENAME TABLE $DB.tbl2 TO $DB.tbl4;" +run_sql "CREATE TABLE IF NOT EXISTS $DB.tbl3 (id int, val varchar(20));" +run_sql "INSERT INTO $DB.tbl1 values (11, 'aa');" +run_sql "INSERT INTO $DB.tbl4 values (22, 'bb');" +run_sql "INSERT INTO $DB.tbl3 values (33, 'cc');" + +# wait checkpoint advance +echo "wait checkpoint advance" +sleep 10 +current_ts=$(echo $(($(date +%s%3N) << 18))) +echo "current ts: $current_ts" +i=0 +while true; do + # extract the checkpoint ts of the log backup task. 
If there is some error, the checkpoint ts should be empty + log_backup_status=$(unset BR_LOG_TO_TERM && run_br --skip-goleak --pd $PD_ADDR log status --task-name integration_test --json 2>br.log) + echo "log backup status: $log_backup_status" + checkpoint_ts=$(echo "$log_backup_status" | head -n 1 | jq 'if .[0].last_errors | length == 0 then .[0].checkpoint else empty end') + echo "checkpoint ts: $checkpoint_ts" + + # check whether the checkpoint ts is a number + if [ $checkpoint_ts -gt 0 ] 2>/dev/null; then + # check whether the checkpoint has advanced + if [ $checkpoint_ts -gt $current_ts ]; then + echo "the checkpoint has advanced" + break + fi + # the checkpoint hasn't advanced + echo "the checkpoint hasn't advanced" + i=$((i+1)) + if [ "$i" -gt 50 ]; then + echo 'the checkpoint lag is too large' + exit 1 + fi + sleep 10 + else + # unknown status, maybe somewhere is wrong + echo "TEST: [$TEST_NAME] failed to wait checkpoint advance!" + exit 1 + fi +done + +# start a new cluster +echo "restart a services" +restart_services + +# PITR but failed in the snapshot restore stage +export GO_FAILPOINTS="github.com/pingcap/tidb/br/pkg/restore/snapclient/corrupt-files=return(\"corrupt-last-table-files\")" +restore_fail=0 +run_br --pd $PD_ADDR restore point --full-backup-storage "local://$TEST_DIR/$PREFIX/full" -s "local://$TEST_DIR/$PREFIX/log" || restore_fail=1 +export GO_FAILPOINTS="" +if [ $restore_fail -ne 1 ]; then + echo 'PITR success' + exit 1 +fi + +# PITR with checkpoint but failed in the log restore metakv stage +export GO_FAILPOINTS="github.com/pingcap/tidb/br/pkg/restore/snapclient/corrupt-files=return(\"only-last-table-files\");\ +github.com/pingcap/tidb/br/pkg/restore/logclient/failed-after-id-maps-saved=return(true)" +restore_fail=0 +run_br --pd $PD_ADDR restore point --full-backup-storage "local://$TEST_DIR/$PREFIX/full" -s "local://$TEST_DIR/$PREFIX/log" || restore_fail=1 +export GO_FAILPOINTS="" +if [ $restore_fail -ne 1 ]; then + echo 'PITR success' + exit 1 +fi + +# PITR with checkpoint but failed in the log restore datakv stage +# skip the snapshot restore stage +export GO_FAILPOINTS="github.com/pingcap/tidb/br/pkg/task/corrupt-files=return(\"corrupt-last-table-files\");" +restore_fail=0 +run_br --pd $PD_ADDR restore point --full-backup-storage "local://$TEST_DIR/$PREFIX/full" -s "local://$TEST_DIR/$PREFIX/log" || restore_fail=1 +export GO_FAILPOINTS="" +if [ $restore_fail -ne 1 ]; then + echo 'PITR success' + exit 1 +fi + +# PITR with checkpoint +export GO_FAILPOINTS="github.com/pingcap/tidb/br/pkg/task/corrupt-files=return(\"only-last-table-files\");" +run_br --pd $PD_ADDR restore point --full-backup-storage "local://$TEST_DIR/$PREFIX/full" -s "local://$TEST_DIR/$PREFIX/log" +export GO_FAILPOINTS="" + +# check the data consistency +# $DB.tbl1 has (1, 'a'), (11, 'aa') +# $DB.tbl4 has (2, 'b'), (22, 'bb') +# $DB.tbl3 has (33, 'cc') +run_sql "SELECT count(*) AS RESCNT FROM $DB.tbl1;" +check_contains "RESCNT: 2" +run_sql "SELECT count(*) AS RESCNT FROM $DB.tbl4;" +check_contains "RESCNT: 2" +run_sql "SELECT count(*) AS RESCNT FROM $DB.tbl3;" +check_contains "RESCNT: 1" +run_sql "SELECT id, val FROM $DB.tbl1 WHERE val = 'a';" +check_contains "id: 1" +run_sql "SELECT id, val FROM $DB.tbl1 WHERE val = 'aa';" +check_contains "id: 11" +run_sql "SELECT id, val FROM $DB.tbl4 WHERE val = 'b';" +check_contains "id: 2" +run_sql "SELECT id, val FROM $DB.tbl4 WHERE val = 'bb';" +check_contains "id: 22" +run_sql "SELECT id, val FROM $DB.tbl3 WHERE val = 'cc';" +check_contains "id: 33" diff 
--git a/br/tests/run_group_br_tests.sh b/br/tests/run_group_br_tests.sh index ae9f17d7a462b..0e44dd73f7864 100755 --- a/br/tests/run_group_br_tests.sh +++ b/br/tests/run_group_br_tests.sh @@ -27,7 +27,7 @@ groups=( ["G04"]='br_range br_replica_read br_restore_TDE_enable br_restore_log_task_enable br_s3 br_shuffle_leader br_shuffle_region br_single_table' ["G05"]='br_skip_checksum br_split_region_fail br_systables br_table_filter br_txn br_stats br_clustered_index br_crypter' ["G06"]='br_tikv_outage br_tikv_outage3' - ["G07"]='br_pitr' + ["G07"]='br_pitr br_restore_checkpoint' ["G08"]='br_tikv_outage2 br_ttl br_views_and_sequences br_z_gc_safepoint br_autorandom' ) From 8c70a2a5a07135df2aa87fd4b50e1b1a85a6112e Mon Sep 17 00:00:00 2001 From: Jianjun Liao Date: Thu, 19 Sep 2024 14:59:37 +0800 Subject: [PATCH 10/15] add integration test Signed-off-by: Jianjun Liao --- br/tests/run_group_br_tests.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/br/tests/run_group_br_tests.sh b/br/tests/run_group_br_tests.sh index 0e44dd73f7864..e8df4d9e097ab 100755 --- a/br/tests/run_group_br_tests.sh +++ b/br/tests/run_group_br_tests.sh @@ -26,8 +26,8 @@ groups=( ["G03"]='br_incompatible_tidb_config br_incremental br_incremental_index br_incremental_only_ddl br_incremental_same_table br_insert_after_restore br_key_locked br_log_test br_move_backup br_mv_index br_other br_partition_add_index br_tidb_placement_policy br_tiflash br_tiflash_conflict' ["G04"]='br_range br_replica_read br_restore_TDE_enable br_restore_log_task_enable br_s3 br_shuffle_leader br_shuffle_region br_single_table' ["G05"]='br_skip_checksum br_split_region_fail br_systables br_table_filter br_txn br_stats br_clustered_index br_crypter' - ["G06"]='br_tikv_outage br_tikv_outage3' - ["G07"]='br_pitr br_restore_checkpoint' + ["G06"]='br_tikv_outage br_tikv_outage3 br_restore_checkpoint' + ["G07"]='br_pitr' ["G08"]='br_tikv_outage2 br_ttl br_views_and_sequences br_z_gc_safepoint br_autorandom' ) From 115d03c7ada6d207a734cd31ad7cecaa7b4216b4 Mon Sep 17 00:00:00 2001 From: Jianjun Liao Date: Thu, 19 Sep 2024 17:48:12 +0800 Subject: [PATCH 11/15] fix integration test Signed-off-by: Jianjun Liao --- br/tests/br_pitr/run.sh | 1 + br/tests/br_restore_checkpoint/run.sh | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/br/tests/br_pitr/run.sh b/br/tests/br_pitr/run.sh index 994a9ef343c3d..096004c634f5e 100644 --- a/br/tests/br_pitr/run.sh +++ b/br/tests/br_pitr/run.sh @@ -136,6 +136,7 @@ fi # PITR restore echo "run pitr" +run_sql "DROP DATABASE __TiDB_BR_Temporary_Log_Restore_Checkpoint;" run_br --pd $PD_ADDR restore point -s "local://$TEST_DIR/$PREFIX/log" --full-backup-storage "local://$TEST_DIR/$PREFIX/full" > $res_file 2>&1 check_result diff --git a/br/tests/br_restore_checkpoint/run.sh b/br/tests/br_restore_checkpoint/run.sh index 76ce7a95b095f..d35c2ce2538a3 100644 --- a/br/tests/br_restore_checkpoint/run.sh +++ b/br/tests/br_restore_checkpoint/run.sh @@ -91,7 +91,7 @@ echo "restart a services" restart_services # PITR but failed in the snapshot restore stage -export GO_FAILPOINTS="github.com/pingcap/tidb/br/pkg/restore/snapclient/corrupt-files=return(\"corrupt-last-table-files\")" +export GO_FAILPOINTS="github.com/pingcap/tidb/br/pkg/restore/snap_client/corrupt-files=return(\"corrupt-last-table-files\")" restore_fail=0 run_br --pd $PD_ADDR restore point --full-backup-storage "local://$TEST_DIR/$PREFIX/full" -s "local://$TEST_DIR/$PREFIX/log" || restore_fail=1 export GO_FAILPOINTS="" @@ -101,7 +101,7 @@ 
if [ $restore_fail -ne 1 ]; then fi # PITR with checkpoint but failed in the log restore metakv stage -export GO_FAILPOINTS="github.com/pingcap/tidb/br/pkg/restore/snapclient/corrupt-files=return(\"only-last-table-files\");\ +export GO_FAILPOINTS="github.com/pingcap/tidb/br/pkg/restore/snap_client/corrupt-files=return(\"only-last-table-files\");\ github.com/pingcap/tidb/br/pkg/restore/logclient/failed-after-id-maps-saved=return(true)" restore_fail=0 run_br --pd $PD_ADDR restore point --full-backup-storage "local://$TEST_DIR/$PREFIX/full" -s "local://$TEST_DIR/$PREFIX/log" || restore_fail=1 From 5c220b7102be44dcf624b7787dedd3c0ecd2506f Mon Sep 17 00:00:00 2001 From: Jianjun Liao Date: Thu, 19 Sep 2024 18:32:49 +0800 Subject: [PATCH 12/15] fix integration test Signed-off-by: Jianjun Liao --- br/pkg/utils/backoff.go | 3 +++ br/tests/br_restore_checkpoint/run.sh | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/br/pkg/utils/backoff.go b/br/pkg/utils/backoff.go index fda272606e5c7..ee97292ef7fc7 100644 --- a/br/pkg/utils/backoff.go +++ b/br/pkg/utils/backoff.go @@ -108,6 +108,9 @@ func (rs *RetryState) ShouldRetry() bool { // Get the exponential backoff durion and transform the state. func (rs *RetryState) ExponentialBackoff() time.Duration { rs.retryTimes++ + failpoint.Inject("set-import-attempt-to-one", func(_ failpoint.Value) { + rs.retryTimes = rs.maxRetry + }) backoff := rs.nextBackoff rs.nextBackoff *= 2 if rs.nextBackoff > rs.maxBackoff { diff --git a/br/tests/br_restore_checkpoint/run.sh b/br/tests/br_restore_checkpoint/run.sh index d35c2ce2538a3..ef5a0ca07cbe1 100644 --- a/br/tests/br_restore_checkpoint/run.sh +++ b/br/tests/br_restore_checkpoint/run.sh @@ -102,7 +102,7 @@ fi # PITR with checkpoint but failed in the log restore metakv stage export GO_FAILPOINTS="github.com/pingcap/tidb/br/pkg/restore/snap_client/corrupt-files=return(\"only-last-table-files\");\ -github.com/pingcap/tidb/br/pkg/restore/logclient/failed-after-id-maps-saved=return(true)" +github.com/pingcap/tidb/br/pkg/restore/log_client/failed-after-id-maps-saved=return(true)" restore_fail=0 run_br --pd $PD_ADDR restore point --full-backup-storage "local://$TEST_DIR/$PREFIX/full" -s "local://$TEST_DIR/$PREFIX/log" || restore_fail=1 export GO_FAILPOINTS="" From c8f4ce4c8496496ffd2aee7f81b42d42677e3b6d Mon Sep 17 00:00:00 2001 From: Jianjun Liao Date: Fri, 20 Sep 2024 17:30:00 +0800 Subject: [PATCH 13/15] fix integration test Signed-off-by: Jianjun Liao --- br/pkg/restore/log_client/client.go | 4 +++- br/pkg/task/stream.go | 2 +- br/tests/br_restore_checkpoint/run.sh | 4 ++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/br/pkg/restore/log_client/client.go b/br/pkg/restore/log_client/client.go index c764c9415722d..05aa19ace19b1 100644 --- a/br/pkg/restore/log_client/client.go +++ b/br/pkg/restore/log_client/client.go @@ -1262,6 +1262,7 @@ func (rc *LogClient) generateKvFilesSkipMap(ctx context.Context, downstreamIdset func WrapLogFilesIterWithCheckpointFailpoint( v failpoint.Value, logIter LogIter, + rules map[int64]*restoreutils.RewriteRules, ) (LogIter, error) { if cmd, ok := v.(string); ok { switch cmd { @@ -1272,7 +1273,8 @@ func WrapLogFilesIterWithCheckpointFailpoint( return newLogIter, errors.Errorf("skip the last table files") case "only-last-table-files": // check whether all the files, except files skipped before, are skipped by checkpoint newLogIter := iter.FilterOut(logIter, func(d *LogDataFileInfo) bool { - if d.OffsetInMergedGroup&1 == 0 { + _, exists := rules[d.TableId] + if 
d.OffsetInMergedGroup&1 == 0 && exists { log.Panic("has files but not the files skipped before") } return false diff --git a/br/pkg/task/stream.go b/br/pkg/task/stream.go index 7be42f0d36323..731cc6381011d 100644 --- a/br/pkg/task/stream.go +++ b/br/pkg/task/stream.go @@ -1422,7 +1422,7 @@ func restoreStream( } failpoint.Inject("corrupt-files", func(v failpoint.Value) { var retErr error - logFilesIter, retErr = logclient.WrapLogFilesIterWithCheckpointFailpoint(v, logFilesIter) + logFilesIter, retErr = logclient.WrapLogFilesIterWithCheckpointFailpoint(v, logFilesIter, rewriteRules) defer func() { pErr = retErr }() }) } diff --git a/br/tests/br_restore_checkpoint/run.sh b/br/tests/br_restore_checkpoint/run.sh index ef5a0ca07cbe1..2a4b1104916de 100644 --- a/br/tests/br_restore_checkpoint/run.sh +++ b/br/tests/br_restore_checkpoint/run.sh @@ -113,7 +113,7 @@ fi # PITR with checkpoint but failed in the log restore datakv stage # skip the snapshot restore stage -export GO_FAILPOINTS="github.com/pingcap/tidb/br/pkg/task/corrupt-files=return(\"corrupt-last-table-files\");" +export GO_FAILPOINTS="github.com/pingcap/tidb/br/pkg/task/corrupt-files=return(\"corrupt-last-table-files\")" restore_fail=0 run_br --pd $PD_ADDR restore point --full-backup-storage "local://$TEST_DIR/$PREFIX/full" -s "local://$TEST_DIR/$PREFIX/log" || restore_fail=1 export GO_FAILPOINTS="" @@ -123,7 +123,7 @@ if [ $restore_fail -ne 1 ]; then fi # PITR with checkpoint -export GO_FAILPOINTS="github.com/pingcap/tidb/br/pkg/task/corrupt-files=return(\"only-last-table-files\");" +export GO_FAILPOINTS="github.com/pingcap/tidb/br/pkg/task/corrupt-files=return(\"only-last-table-files\")" run_br --pd $PD_ADDR restore point --full-backup-storage "local://$TEST_DIR/$PREFIX/full" -s "local://$TEST_DIR/$PREFIX/log" export GO_FAILPOINTS="" From 41aa77b173134cc6fc3fcbf1163243982c4baa6b Mon Sep 17 00:00:00 2001 From: Jianjun Liao Date: Tue, 24 Sep 2024 11:06:30 +0800 Subject: [PATCH 14/15] commit some suggestions Signed-off-by: Jianjun Liao --- br/pkg/checkpoint/BUILD.bazel | 1 + br/pkg/checkpoint/backup.go | 18 +-- br/pkg/checkpoint/checkpoint.go | 164 +--------------------- br/pkg/checkpoint/external_storage.go | 192 ++++++++++++++++++++++++++ br/pkg/checkpoint/log_restore.go | 8 +- br/pkg/checkpoint/restore.go | 8 +- br/pkg/checkpoint/storage.go | 47 ++++--- br/pkg/restore/snap_client/client.go | 14 +- br/pkg/task/restore.go | 3 + br/pkg/task/stream.go | 2 +- 10 files changed, 252 insertions(+), 205 deletions(-) create mode 100644 br/pkg/checkpoint/external_storage.go diff --git a/br/pkg/checkpoint/BUILD.bazel b/br/pkg/checkpoint/BUILD.bazel index 22c5604bbac27..f4eae48f7e57d 100644 --- a/br/pkg/checkpoint/BUILD.bazel +++ b/br/pkg/checkpoint/BUILD.bazel @@ -5,6 +5,7 @@ go_library( srcs = [ "backup.go", "checkpoint.go", + "external_storage.go", "log_restore.go", "restore.go", "storage.go", diff --git a/br/pkg/checkpoint/backup.go b/br/pkg/checkpoint/backup.go index c5eff5fe65946..2725cdc65666a 100644 --- a/br/pkg/checkpoint/backup.go +++ b/br/pkg/checkpoint/backup.go @@ -57,13 +57,13 @@ func StartCheckpointBackupRunnerForTest( tick time.Duration, timer GlobalTimer, ) (*CheckpointRunner[BackupKeyType, BackupValueType], error) { - runner := newCheckpointRunner[BackupKeyType, BackupValueType]( - ctx, &externalCheckpointStorage{storage}, storage, cipher, timer, flushPositionForBackup(), valueMarshalerForBackup) - - err := runner.initialLock(ctx) + checkpointStorage, err := newExternalCheckpointStorage(ctx, storage, timer) if err != nil { - 
return nil, errors.Annotate(err, "Failed to initialize checkpoint lock.") + return nil, errors.Trace(err) } + runner := newCheckpointRunner[BackupKeyType, BackupValueType]( + checkpointStorage, cipher, valueMarshalerForBackup) + runner.startCheckpointMainLoop(ctx, tick, tick, tick) return runner, nil } @@ -74,13 +74,13 @@ func StartCheckpointRunnerForBackup( cipher *backuppb.CipherInfo, timer GlobalTimer, ) (*CheckpointRunner[BackupKeyType, BackupValueType], error) { - runner := newCheckpointRunner[BackupKeyType, BackupValueType]( - ctx, &externalCheckpointStorage{storage}, storage, cipher, timer, flushPositionForBackup(), valueMarshalerForBackup) - - err := runner.initialLock(ctx) + checkpointStorage, err := newExternalCheckpointStorage(ctx, storage, timer) if err != nil { return nil, errors.Trace(err) } + runner := newCheckpointRunner[BackupKeyType, BackupValueType]( + checkpointStorage, cipher, valueMarshalerForBackup) + runner.startCheckpointMainLoop( ctx, defaultTickDurationForFlush, diff --git a/br/pkg/checkpoint/checkpoint.go b/br/pkg/checkpoint/checkpoint.go index 772cacae929c0..493334d748e64 100644 --- a/br/pkg/checkpoint/checkpoint.go +++ b/br/pkg/checkpoint/checkpoint.go @@ -18,10 +18,7 @@ import ( "bytes" "context" "crypto/sha256" - "encoding/base64" "encoding/json" - "fmt" - "math/rand" "strings" "sync" "sync/atomic" @@ -36,9 +33,7 @@ import ( "github.com/pingcap/tidb/br/pkg/rtree" "github.com/pingcap/tidb/br/pkg/storage" "github.com/pingcap/tidb/br/pkg/summary" - "github.com/pingcap/tidb/br/pkg/utils" "github.com/pingcap/tidb/pkg/util" - "github.com/tikv/client-go/v2/oracle" "go.uber.org/zap" "golang.org/x/sync/errgroup" ) @@ -148,18 +143,13 @@ type GlobalTimer interface { } type CheckpointRunner[K KeyType, V ValueType] struct { - flushPosition - lockId uint64 - meta map[K]*RangeGroup[K, V] checksum ChecksumItems valueMarshaler func(*RangeGroup[K, V]) ([]byte, error) checkpointStorage checkpointStorage - lockStorage storage.ExternalStorage cipher *backuppb.CipherInfo - timer GlobalTimer appendCh chan *CheckpointMessage[K, V] checksumCh chan *ChecksumItem @@ -175,26 +165,18 @@ type CheckpointRunner[K KeyType, V ValueType] struct { } func newCheckpointRunner[K KeyType, V ValueType]( - ctx context.Context, checkpointStorage checkpointStorage, - lockStorage storage.ExternalStorage, cipher *backuppb.CipherInfo, - timer GlobalTimer, - f flushPosition, vm func(*RangeGroup[K, V]) ([]byte, error), ) *CheckpointRunner[K, V] { return &CheckpointRunner[K, V]{ - flushPosition: f, - meta: make(map[K]*RangeGroup[K, V]), checksum: ChecksumItems{Items: make([]*ChecksumItem, 0)}, valueMarshaler: vm, checkpointStorage: checkpointStorage, - lockStorage: lockStorage, cipher: cipher, - timer: timer, appendCh: make(chan *CheckpointMessage[K, V]), checksumCh: make(chan *ChecksumItem), @@ -276,12 +258,7 @@ func (r *CheckpointRunner[K, V]) WaitForFinish(ctx context.Context, flush bool) // wait the range flusher exit r.wg.Wait() // remove the checkpoint lock - if r.lockId > 0 { - err := r.lockStorage.DeleteFile(ctx, r.CheckpointLockPath) - if err != nil { - log.Warn("failed to remove the checkpoint lock", zap.Error(err)) - } - } + r.checkpointStorage.deleteLock(ctx) } // Send the checksum to the flush goroutine, and reset the CheckpointRunner's checksum @@ -363,7 +340,7 @@ func (r *CheckpointRunner[K, V]) startCheckpointFlushLoop(ctx context.Context, w log.Info("stop checkpoint flush worker") return } - if err := r.updateLock(ctx); err != nil { + if err := r.checkpointStorage.updateLock(ctx); err != 
nil { errCh <- errors.Annotate(err, "failed to update checkpoint lock.") return } @@ -504,9 +481,8 @@ func (r *CheckpointRunner[K, V]) doChecksumFlush(ctx context.Context, checksumIt return errors.Trace(err) } - fname := fmt.Sprintf("%s/t%d_and__.cpt", r.CheckpointChecksumDir, checksumItems.Items[0].TableID) - if err = r.checkpointStorage.flushCheckpointChecksum(ctx, fname, data); err != nil { - return errors.Annotatef(err, "failed to write file %s for checkpoint checksum", fname) + if err = r.checkpointStorage.flushCheckpointChecksum(ctx, data); err != nil { + return errors.Trace(err) } failpoint.Inject("failed-after-checkpoint-flushes-checksum", func(_ failpoint.Value) { @@ -526,18 +502,11 @@ func (r *CheckpointRunner[K, V]) doFlush(ctx context.Context, meta map[K]*RangeG RangeGroupMetas: make([]*RangeGroupData, 0, len(meta)), } - var fname []byte = nil - for _, group := range meta { if len(group.Group) == 0 { continue } - // use the first item's group-key and sub-range-key as the filename - if len(fname) == 0 { - fname = append([]byte(fmt.Sprint(group.GroupKey, '.', '.')), group.Group[0].IdentKey()...) - } - // Flush the metaFile to storage content, err := r.valueMarshaler(group) if err != nil { @@ -565,10 +534,7 @@ func (r *CheckpointRunner[K, V]) doFlush(ctx context.Context, meta map[K]*RangeG return errors.Trace(err) } - checksum := sha256.Sum256(fname) - checksumEncoded := base64.URLEncoding.EncodeToString(checksum[:]) - path := fmt.Sprintf("%s/%s_%d.cpt", r.CheckpointDataDir, checksumEncoded, rand.Uint64()) - if err := r.checkpointStorage.flushCheckpointData(ctx, path, data); err != nil { + if err := r.checkpointStorage.flushCheckpointData(ctx, data); err != nil { return errors.Trace(err) } } @@ -579,126 +545,6 @@ func (r *CheckpointRunner[K, V]) doFlush(ctx context.Context, meta map[K]*RangeG return nil } -type CheckpointLock struct { - LockId uint64 `json:"lock-id"` - ExpireAt int64 `json:"expire-at"` -} - -// get ts with retry -func (r *CheckpointRunner[K, V]) getTS(ctx context.Context) (int64, int64, error) { - var ( - p int64 = 0 - l int64 = 0 - retry int = 0 - ) - errRetry := utils.WithRetry(ctx, func() error { - var err error - p, l, err = r.timer.GetTS(ctx) - if err != nil { - retry++ - log.Info("failed to get ts", zap.Int("retry", retry), zap.Error(err)) - return err - } - - return nil - }, utils.NewPDReqBackoffer()) - - return p, l, errors.Trace(errRetry) -} - -// flush the lock to the external storage -func (r *CheckpointRunner[K, V]) flushLock(ctx context.Context, p int64) error { - lock := &CheckpointLock{ - LockId: r.lockId, - ExpireAt: p + lockTimeToLive.Milliseconds(), - } - log.Info("start to flush the checkpoint lock", zap.Int64("lock-at", p), - zap.Int64("expire-at", lock.ExpireAt)) - data, err := json.Marshal(lock) - if err != nil { - return errors.Trace(err) - } - - err = r.lockStorage.WriteFile(ctx, r.CheckpointLockPath, data) - return errors.Trace(err) -} - -// check whether this lock belongs to this BR -func (r *CheckpointRunner[K, V]) checkLockFile(ctx context.Context, now int64) error { - data, err := r.lockStorage.ReadFile(ctx, r.CheckpointLockPath) - if err != nil { - return errors.Trace(err) - } - lock := &CheckpointLock{} - err = json.Unmarshal(data, lock) - if err != nil { - return errors.Trace(err) - } - if lock.ExpireAt <= now { - if lock.LockId > r.lockId { - return errors.Errorf("There are another BR(%d) running after but setting lock before this one(%d). "+ - "Please check whether the BR is running. 
If not, you can retry.", lock.LockId, r.lockId) - } - if lock.LockId == r.lockId { - log.Warn("The lock has expired.", - zap.Int64("expire-at(ms)", lock.ExpireAt), zap.Int64("now(ms)", now)) - } - } else if lock.LockId != r.lockId { - return errors.Errorf("The existing lock will expire in %d seconds. "+ - "There may be another BR(%d) running. If not, you can wait for the lock to expire, "+ - "or delete the file `%s%s` manually.", - (lock.ExpireAt-now)/1000, lock.LockId, strings.TrimRight(r.lockStorage.URI(), "/"), r.CheckpointLockPath) - } - - return nil -} - -// generate a new lock and flush the lock to the external storage -func (r *CheckpointRunner[K, V]) updateLock(ctx context.Context) error { - p, _, err := r.getTS(ctx) - if err != nil { - return errors.Trace(err) - } - if err = r.checkLockFile(ctx, p); err != nil { - return errors.Trace(err) - } - if err = r.flushLock(ctx, p); err != nil { - return errors.Trace(err) - } - - failpoint.Inject("failed-after-checkpoint-updates-lock", func(_ failpoint.Value) { - failpoint.Return(errors.Errorf("failpoint: failed after checkpoint updates lock")) - }) - - return nil -} - -// Attempt to initialize the lock. Need to stop the backup when there is an unexpired locks. -func (r *CheckpointRunner[K, V]) initialLock(ctx context.Context) error { - p, l, err := r.getTS(ctx) - if err != nil { - return errors.Trace(err) - } - r.lockId = oracle.ComposeTS(p, l) - exist, err := r.lockStorage.FileExists(ctx, r.CheckpointLockPath) - if err != nil { - return errors.Trace(err) - } - if exist { - if err := r.checkLockFile(ctx, p); err != nil { - return errors.Trace(err) - } - } - if err = r.flushLock(ctx, p); err != nil { - return errors.Trace(err) - } - - // wait for 3 seconds to check whether the lock file is overwritten by another BR - time.Sleep(3 * time.Second) - err = r.checkLockFile(ctx, p) - return errors.Trace(err) -} - func parseCheckpointData[K KeyType, V ValueType]( content []byte, pastDureTime *time.Duration, diff --git a/br/pkg/checkpoint/external_storage.go b/br/pkg/checkpoint/external_storage.go new file mode 100644 index 0000000000000..2e0576e4ab218 --- /dev/null +++ b/br/pkg/checkpoint/external_storage.go @@ -0,0 +1,192 @@ +// Copyright 2024 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package checkpoint + +import ( + "context" + "encoding/json" + "fmt" + "strings" + "time" + + "github.com/google/uuid" + "github.com/pingcap/errors" + "github.com/pingcap/failpoint" + "github.com/pingcap/log" + "github.com/pingcap/tidb/br/pkg/storage" + "github.com/pingcap/tidb/br/pkg/utils" + "github.com/tikv/client-go/v2/oracle" + "go.uber.org/zap" +) + +type externalCheckpointStorage struct { + flushPosition + storage storage.ExternalStorage + + lockId uint64 + timer GlobalTimer +} + +func newExternalCheckpointStorage(ctx context.Context, s storage.ExternalStorage, timer GlobalTimer) (*externalCheckpointStorage, error) { + checkpointStorage := &externalCheckpointStorage{ + flushPosition: flushPositionForBackup(), + storage: s, + timer: timer, + } + if timer != nil { + if err := checkpointStorage.initialLock(ctx); err != nil { + return nil, errors.Trace(err) + } + } + return checkpointStorage, nil +} + +func (s *externalCheckpointStorage) flushCheckpointData(ctx context.Context, data []byte) error { + fname := fmt.Sprintf("%s/%x.cpt", s.CheckpointDataDir, uuid.New()) + return s.storage.WriteFile(ctx, fname, data) +} + +func (s *externalCheckpointStorage) flushCheckpointChecksum(ctx context.Context, data []byte) error { + fname := fmt.Sprintf("%s/%x.cpt", s.CheckpointChecksumDir, uuid.New()) + return s.storage.WriteFile(ctx, fname, data) +} + +func (s *externalCheckpointStorage) getTS(ctx context.Context) (int64, int64, error) { + var ( + p int64 = 0 + l int64 = 0 + retry int = 0 + ) + errRetry := utils.WithRetry(ctx, func() error { + var err error + p, l, err = s.timer.GetTS(ctx) + if err != nil { + retry++ + log.Info("failed to get ts", zap.Int("retry", retry), zap.Error(err)) + return err + } + + return nil + }, utils.NewPDReqBackoffer()) + + return p, l, errors.Trace(errRetry) +} + +type CheckpointLock struct { + LockId uint64 `json:"lock-id"` + ExpireAt int64 `json:"expire-at"` +} + +// flush the lock to the external storage +func (s *externalCheckpointStorage) flushLock(ctx context.Context, p int64) error { + lock := &CheckpointLock{ + LockId: s.lockId, + ExpireAt: p + lockTimeToLive.Milliseconds(), + } + log.Info("start to flush the checkpoint lock", zap.Int64("lock-at", p), + zap.Int64("expire-at", lock.ExpireAt)) + data, err := json.Marshal(lock) + if err != nil { + return errors.Trace(err) + } + + err = s.storage.WriteFile(ctx, s.CheckpointLockPath, data) + return errors.Trace(err) +} + +// check whether this lock belongs to this BR +func (s *externalCheckpointStorage) checkLockFile(ctx context.Context, now int64) error { + data, err := s.storage.ReadFile(ctx, s.CheckpointLockPath) + if err != nil { + return errors.Trace(err) + } + lock := &CheckpointLock{} + err = json.Unmarshal(data, lock) + if err != nil { + return errors.Trace(err) + } + if lock.ExpireAt <= now { + if lock.LockId > s.lockId { + return errors.Errorf("There are another BR(%d) running after but setting lock before this one(%d). "+ + "Please check whether the BR is running. If not, you can retry.", lock.LockId, s.lockId) + } + if lock.LockId == s.lockId { + log.Warn("The lock has expired.", + zap.Int64("expire-at(ms)", lock.ExpireAt), zap.Int64("now(ms)", now)) + } + } else if lock.LockId != s.lockId { + return errors.Errorf("The existing lock will expire in %d seconds. "+ + "There may be another BR(%d) running. 
If not, you can wait for the lock to expire, "+ + "or delete the file `%s%s` manually.", + (lock.ExpireAt-now)/1000, lock.LockId, strings.TrimRight(s.storage.URI(), "/"), s.CheckpointLockPath) + } + + return nil +} + +// Attempt to initialize the lock. Need to stop the backup when there is an unexpired locks. +func (s *externalCheckpointStorage) initialLock(ctx context.Context) error { + p, l, err := s.getTS(ctx) + if err != nil { + return errors.Trace(err) + } + s.lockId = oracle.ComposeTS(p, l) + exist, err := s.storage.FileExists(ctx, s.CheckpointLockPath) + if err != nil { + return errors.Trace(err) + } + if exist { + if err := s.checkLockFile(ctx, p); err != nil { + return errors.Trace(err) + } + } + if err = s.flushLock(ctx, p); err != nil { + return errors.Trace(err) + } + + // wait for 3 seconds to check whether the lock file is overwritten by another BR + time.Sleep(3 * time.Second) + err = s.checkLockFile(ctx, p) + return errors.Trace(err) +} + +// generate a new lock and flush the lock to the external storage +func (s *externalCheckpointStorage) updateLock(ctx context.Context) error { + p, _, err := s.getTS(ctx) + if err != nil { + return errors.Trace(err) + } + if err = s.checkLockFile(ctx, p); err != nil { + return errors.Trace(err) + } + if err = s.flushLock(ctx, p); err != nil { + return errors.Trace(err) + } + + failpoint.Inject("failed-after-checkpoint-updates-lock", func(_ failpoint.Value) { + failpoint.Return(errors.Errorf("failpoint: failed after checkpoint updates lock")) + }) + + return nil +} + +func (s *externalCheckpointStorage) deleteLock(ctx context.Context) { + if s.lockId > 0 { + err := s.storage.DeleteFile(ctx, s.CheckpointLockPath) + if err != nil { + log.Warn("failed to remove the checkpoint lock", zap.Error(err)) + } + } +} diff --git a/br/pkg/checkpoint/log_restore.go b/br/pkg/checkpoint/log_restore.go index 155ae01766ba7..138eb86965eac 100644 --- a/br/pkg/checkpoint/log_restore.go +++ b/br/pkg/checkpoint/log_restore.go @@ -113,8 +113,8 @@ func StartCheckpointLogRestoreRunnerForTest( tick time.Duration, ) (*CheckpointRunner[LogRestoreKeyType, LogRestoreValueType], error) { runner := newCheckpointRunner[LogRestoreKeyType, LogRestoreValueType]( - ctx, newTableCheckpointStorage(se, LogRestoreCheckpointDatabaseName), - nil, nil, nil, flushPosition{}, valueMarshalerForLogRestore) + newTableCheckpointStorage(se, LogRestoreCheckpointDatabaseName), + nil, valueMarshalerForLogRestore) runner.startCheckpointMainLoop(ctx, tick, tick, 0) return runner, nil @@ -125,8 +125,8 @@ func StartCheckpointRunnerForLogRestore( se glue.Session, ) (*CheckpointRunner[LogRestoreKeyType, LogRestoreValueType], error) { runner := newCheckpointRunner[LogRestoreKeyType, LogRestoreValueType]( - ctx, newTableCheckpointStorage(se, LogRestoreCheckpointDatabaseName), - nil, nil, nil, flushPosition{}, valueMarshalerForLogRestore) + newTableCheckpointStorage(se, LogRestoreCheckpointDatabaseName), + nil, valueMarshalerForLogRestore) // for restore, no need to set lock runner.startCheckpointMainLoop(ctx, defaultTickDurationForFlush, defaultTckDurationForChecksum, 0) diff --git a/br/pkg/checkpoint/restore.go b/br/pkg/checkpoint/restore.go index 12f1aeae25437..971938ba45e5b 100644 --- a/br/pkg/checkpoint/restore.go +++ b/br/pkg/checkpoint/restore.go @@ -48,8 +48,8 @@ func StartCheckpointRestoreRunnerForTest( tick time.Duration, ) (*CheckpointRunner[RestoreKeyType, RestoreValueType], error) { runner := newCheckpointRunner[RestoreKeyType, RestoreValueType]( - ctx, newTableCheckpointStorage(se, 
SnapshotRestoreCheckpointDatabaseName), - nil, nil, nil, flushPosition{}, valueMarshalerForRestore) + newTableCheckpointStorage(se, SnapshotRestoreCheckpointDatabaseName), + nil, valueMarshalerForRestore) runner.startCheckpointMainLoop(ctx, tick, tick, 0) return runner, nil @@ -60,8 +60,8 @@ func StartCheckpointRunnerForRestore( se glue.Session, ) (*CheckpointRunner[RestoreKeyType, RestoreValueType], error) { runner := newCheckpointRunner[RestoreKeyType, RestoreValueType]( - ctx, newTableCheckpointStorage(se, SnapshotRestoreCheckpointDatabaseName), - nil, nil, nil, flushPosition{}, valueMarshalerForRestore) + newTableCheckpointStorage(se, SnapshotRestoreCheckpointDatabaseName), + nil, valueMarshalerForRestore) // for restore, no need to set lock runner.startCheckpointMainLoop(ctx, defaultTickDurationForFlush, defaultTckDurationForChecksum, 0) diff --git a/br/pkg/checkpoint/storage.go b/br/pkg/checkpoint/storage.go index a7fd5be5f0d94..b297bc6781a8b 100644 --- a/br/pkg/checkpoint/storage.go +++ b/br/pkg/checkpoint/storage.go @@ -25,29 +25,21 @@ import ( "github.com/pingcap/errors" "github.com/pingcap/log" "github.com/pingcap/tidb/br/pkg/glue" - "github.com/pingcap/tidb/br/pkg/storage" "github.com/pingcap/tidb/pkg/domain" "github.com/pingcap/tidb/pkg/kv" "github.com/pingcap/tidb/pkg/parser/model" + pmodel "github.com/pingcap/tidb/pkg/parser/model" "github.com/pingcap/tidb/pkg/util/sqlexec" "go.uber.org/zap" ) type checkpointStorage interface { - flushCheckpointData(ctx context.Context, fname string, data []byte) error - flushCheckpointChecksum(ctx context.Context, fname string, data []byte) error -} - -type externalCheckpointStorage struct { - storage storage.ExternalStorage -} - -func (s *externalCheckpointStorage) flushCheckpointData(ctx context.Context, fname string, data []byte) error { - return s.storage.WriteFile(ctx, fname, data) -} + flushCheckpointData(ctx context.Context, data []byte) error + flushCheckpointChecksum(ctx context.Context, data []byte) error -func (s *externalCheckpointStorage) flushCheckpointChecksum(ctx context.Context, fname string, data []byte) error { - return s.storage.WriteFile(ctx, fname, data) + initialLock(ctx context.Context) error + updateLock(ctx context.Context) error + deleteLock(ctx context.Context) } // Notice that: @@ -95,6 +87,11 @@ const ( selectCheckpointMetaSQLTemplate string = `SELECT segment_id, data FROM %n.%n;` ) +// IsCheckpointDB checks whether the dbname is checkpoint database. 
+func IsCheckpointDB(dbname pmodel.CIStr) bool { + return dbname.O == LogRestoreCheckpointDatabaseName || dbname.O == SnapshotRestoreCheckpointDatabaseName +} + const CheckpointIdMapBlockSize int = 524288 func chunkInsertCheckpointData(data []byte, fn func(segmentId uint64, chunk []byte) error) error { @@ -128,7 +125,21 @@ type tableCheckpointStorage struct { checkpointDBName string } -func (s *tableCheckpointStorage) flushCheckpointData(ctx context.Context, _ string, data []byte) error { +func (s *tableCheckpointStorage) initialLock(ctx context.Context) error { + log.Fatal("unimplement!") + return nil +} + +func (s *tableCheckpointStorage) updateLock(ctx context.Context) error { + log.Fatal("unimplement!") + return nil +} + +func (s *tableCheckpointStorage) deleteLock(ctx context.Context) { + log.Fatal("unimplement!") +} + +func (s *tableCheckpointStorage) flushCheckpointData(ctx context.Context, data []byte) error { sqls, argss := chunkInsertCheckpointSQLs(s.checkpointDBName, checkpointDataTableName, data) for i, sql := range sqls { args := argss[i] @@ -139,7 +150,7 @@ func (s *tableCheckpointStorage) flushCheckpointData(ctx context.Context, _ stri return nil } -func (s *tableCheckpointStorage) flushCheckpointChecksum(ctx context.Context, _ string, data []byte) error { +func (s *tableCheckpointStorage) flushCheckpointChecksum(ctx context.Context, data []byte) error { sqls, argss := chunkInsertCheckpointSQLs(s.checkpointDBName, checkpointChecksumTableName, data) for i, sql := range sqls { args := argss[i] @@ -272,11 +283,11 @@ func insertCheckpointMeta[T any](ctx context.Context, se glue.Session, dbName st return errors.Trace(err) } -func selectCheckpointMeta[T any]( +func selectCheckpointMeta( ctx context.Context, execCtx sqlexec.RestrictedSQLExecutor, dbName string, tableName string, - meta *T, + meta any, ) error { rows, _, errSQL := execCtx.ExecRestrictedSQL( kv.WithInternalSourceType(ctx, kv.InternalTxnBR), diff --git a/br/pkg/restore/snap_client/client.go b/br/pkg/restore/snap_client/client.go index 0c8ebec216223..50712e230877a 100644 --- a/br/pkg/restore/snap_client/client.go +++ b/br/pkg/restore/snap_client/client.go @@ -303,15 +303,9 @@ func (rc *SnapClient) InitCheckpoint( s storage.ExternalStorage, config *pdutil.ClusterConfig, checkpointFirstRun bool, -) (map[int64]map[string]struct{}, *pdutil.ClusterConfig, error) { - var ( - // checkpoint sets distinguished by range key - checkpointSetWithTableID = make(map[int64]map[string]struct{}) - - checkpointClusterConfig *pdutil.ClusterConfig - - err error - ) +) (checkpointSetWithTableID map[int64]map[string]struct{}, checkpointClusterConfig *pdutil.ClusterConfig, err error) { + // checkpoint sets distinguished by range key + checkpointSetWithTableID = make(map[int64]map[string]struct{}) if !checkpointFirstRun { execCtx := rc.db.Session().GetSessionCtx().GetRestrictedSQLExecutor() @@ -379,7 +373,7 @@ func (rc *SnapClient) InitCheckpoint( if config != nil { meta.SchedulersConfig = &pdutil.ClusterConfig{Schedulers: config.Schedulers, ScheduleCfg: config.ScheduleCfg} } - if err = checkpoint.SaveCheckpointMetadataForSnapshotRestore(ctx, rc.db.Session(), meta); err != nil { + if err := checkpoint.SaveCheckpointMetadataForSnapshotRestore(ctx, rc.db.Session(), meta); err != nil { return checkpointSetWithTableID, nil, errors.Trace(err) } } diff --git a/br/pkg/task/restore.go b/br/pkg/task/restore.go index beb7156021b48..be6b655c08b63 100644 --- a/br/pkg/task/restore.go +++ b/br/pkg/task/restore.go @@ -1389,6 +1389,9 @@ func 
filterRestoreFiles( if name, ok := utils.GetSysDBName(db.Info.Name); utils.IsSysDB(name) && ok { dbName = name } + if checkpoint.IsCheckpointDB(db.Info.Name) { + continue + } if !cfg.TableFilter.MatchSchema(dbName) { continue } diff --git a/br/pkg/task/stream.go b/br/pkg/task/stream.go index 731cc6381011d..7b167a474f0bb 100644 --- a/br/pkg/task/stream.go +++ b/br/pkg/task/stream.go @@ -1797,7 +1797,7 @@ func checkPiTRTaskInfo( ) } - log.Info("the same task, skip snapshot restore") + log.Info("detected log restore checkpoint, so skip snapshot restore and start log restore from the checkpoint") // the same task, skip full restore because it is already in the log restore stage. doFullRestore = false } From 6d3e630af21bf5ef87d981b07b3b72fb495d6fa7 Mon Sep 17 00:00:00 2001 From: Jianjun Liao Date: Tue, 24 Sep 2024 17:48:47 +0800 Subject: [PATCH 15/15] fix integration test Signed-off-by: Jianjun Liao --- br/pkg/checkpoint/external_storage.go | 6 +++++- br/pkg/checkpoint/storage.go | 7 ++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/br/pkg/checkpoint/external_storage.go b/br/pkg/checkpoint/external_storage.go index 2e0576e4ab218..d6d8e30c67996 100644 --- a/br/pkg/checkpoint/external_storage.go +++ b/br/pkg/checkpoint/external_storage.go @@ -39,7 +39,11 @@ type externalCheckpointStorage struct { timer GlobalTimer } -func newExternalCheckpointStorage(ctx context.Context, s storage.ExternalStorage, timer GlobalTimer) (*externalCheckpointStorage, error) { +func newExternalCheckpointStorage( + ctx context.Context, + s storage.ExternalStorage, + timer GlobalTimer, +) (*externalCheckpointStorage, error) { checkpointStorage := &externalCheckpointStorage{ flushPosition: flushPositionForBackup(), storage: s, timer: timer, } if timer != nil { if err := checkpointStorage.initialLock(ctx); err != nil { return nil, errors.Trace(err) } } return checkpointStorage, nil } diff --git a/br/pkg/checkpoint/storage.go b/br/pkg/checkpoint/storage.go index b297bc6781a8b..c46d955209306 100644 --- a/br/pkg/checkpoint/storage.go +++ b/br/pkg/checkpoint/storage.go @@ -27,7 +27,6 @@ import ( "github.com/pingcap/tidb/br/pkg/glue" "github.com/pingcap/tidb/pkg/domain" "github.com/pingcap/tidb/pkg/kv" - "github.com/pingcap/tidb/pkg/parser/model" pmodel "github.com/pingcap/tidb/pkg/parser/model" "github.com/pingcap/tidb/pkg/util/sqlexec" "go.uber.org/zap" ) @@ -135,9 +134,7 @@ func (s *tableCheckpointStorage) updateLock(ctx context.Context) error { return nil } -func (s *tableCheckpointStorage) deleteLock(ctx context.Context) { - log.Fatal("unimplement!") -} +func (s *tableCheckpointStorage) deleteLock(ctx context.Context) {} func (s *tableCheckpointStorage) flushCheckpointData(ctx context.Context, data []byte) error { sqls, argss := chunkInsertCheckpointSQLs(s.checkpointDBName, checkpointDataTableName, data) @@ -328,7 +325,7 @@ func dropCheckpointTables( } } // check if any user table is created in the checkpoint database - tables, err := dom.InfoSchema().SchemaTableInfos(ctx, model.NewCIStr(dbName)) + tables, err := dom.InfoSchema().SchemaTableInfos(ctx, pmodel.NewCIStr(dbName)) if err != nil { return errors.Trace(err) }