From 56c943b47c7f2e33cf5a0db0e17b693899ca2af0 Mon Sep 17 00:00:00 2001 From: 3pointer Date: Thu, 19 Aug 2021 17:00:02 +0800 Subject: [PATCH] lightning: move invalid and dirty checkpoint's check in dataCheck. (#27248) --- br/pkg/lightning/restore/check_info.go | 37 ++++++ br/pkg/lightning/restore/restore.go | 111 ++---------------- .../lightning_checkpoint_dirty_tableid/run.sh | 20 ++-- br/tests/lightning_error_summary/run.sh | 18 +-- 4 files changed, 71 insertions(+), 115 deletions(-) diff --git a/br/pkg/lightning/restore/check_info.go b/br/pkg/lightning/restore/check_info.go index a7507cf6a9222..023231691ec36 100644 --- a/br/pkg/lightning/restore/check_info.go +++ b/br/pkg/lightning/restore/check_info.go @@ -446,6 +446,43 @@ func (rc *Controller) CheckpointIsValid(ctx context.Context, tableInfo *mydump.M return nil, false, nil } + if tableCheckPoint.Status <= checkpoints.CheckpointStatusMaxInvalid { + failedStep := tableCheckPoint.Status * 10 + var action strings.Builder + action.WriteString("./tidb-lightning-ctl --checkpoint-error-") + switch failedStep { + case checkpoints.CheckpointStatusAlteredAutoInc, checkpoints.CheckpointStatusAnalyzed: + action.WriteString("ignore") + default: + action.WriteString("destroy") + } + action.WriteString("='") + action.WriteString(uniqueName) + action.WriteString("' --config=...") + + msgs = append(msgs, fmt.Sprintf("TiDB Lightning has failed last time. To prevent data loss, this run will stop now, "+ + "%s failed in step(%s), please run command %s,"+ + "You may also run `./tidb-lightning-ctl --checkpoint-error-destroy=all --config=...` to start from scratch,"+ + "For details of this failure, read the log file from the PREVIOUS run", + uniqueName, failedStep.MetricName(), action.String())) + return msgs, false, nil + } + + dbInfo, ok := rc.dbInfos[tableInfo.DB] + if ok { + t, ok := dbInfo.Tables[tableInfo.Name] + if ok { + if tableCheckPoint.TableID > 0 && tableCheckPoint.TableID != t.ID { + msgs = append(msgs, fmt.Sprintf("TiDB Lightning has detected tables with illegal checkpoints. To prevent data loss, this run will stop now,"+ + "please run command \"./tidb-lightning-ctl --checkpoint-remove='%s' --config=...\""+ + "You may also run `./tidb-lightning-ctl --checkpoint-error-destroy=all --config=...` to start from scratch,"+ + "For details of this failure, read the log file from the PREVIOUS run", + uniqueName)) + return msgs, false, nil + } + } + } + var permFromCheckpoint []int var columns []string for _, eng := range tableCheckPoint.Engines { diff --git a/br/pkg/lightning/restore/restore.go b/br/pkg/lightning/restore/restore.go index d375c9588229d..85757927134aa 100644 --- a/br/pkg/lightning/restore/restore.go +++ b/br/pkg/lightning/restore/restore.go @@ -734,16 +734,10 @@ func (rc *Controller) restoreSchema(ctx context.Context) error { } rc.dbInfos = dbInfos - if rc.cfg.App.CheckRequirements && rc.tidbGlue.OwnsSQLExecutor() { + if rc.tidbGlue.OwnsSQLExecutor() { if err = rc.DataCheck(ctx); err != nil { return errors.Trace(err) } - // print check template only if check requirements is true. - fmt.Println(rc.checkTemplate.Output()) - if !rc.checkTemplate.Success() { - return errors.Errorf("tidb-lightning pre-check failed." + - " Please fix the failed check(s) or set --check-requirements=false to skip checks") - } } // Load new checkpoints @@ -1312,87 +1306,6 @@ func (rc *Controller) restoreTables(ctx context.Context) error { }() } - // first collect all tables where the checkpoint is invalid - allInvalidCheckpoints := make(map[string]checkpoints.CheckpointStatus) - // collect all tables whose checkpoint's tableID can't match current tableID - allDirtyCheckpoints := make(map[string]struct{}) - for _, dbMeta := range rc.dbMetas { - dbInfo, ok := rc.dbInfos[dbMeta.Name] - if !ok { - return errors.Errorf("database %s not found in rc.dbInfos", dbMeta.Name) - } - for _, tableMeta := range dbMeta.Tables { - tableInfo, ok := dbInfo.Tables[tableMeta.Name] - if !ok { - return errors.Errorf("table info %s.%s not found", dbMeta.Name, tableMeta.Name) - } - - tableName := common.UniqueTable(dbInfo.Name, tableInfo.Name) - cp, err := rc.checkpointsDB.Get(ctx, tableName) - if err != nil { - return errors.Trace(err) - } - if cp.Status <= checkpoints.CheckpointStatusMaxInvalid { - allInvalidCheckpoints[tableName] = cp.Status - } else if cp.TableID > 0 && cp.TableID != tableInfo.ID { - allDirtyCheckpoints[tableName] = struct{}{} - } - } - } - - if len(allInvalidCheckpoints) != 0 { - logger := log.L() - logger.Error( - "TiDB Lightning has failed last time. To prevent data loss, this run will stop now. Please resolve errors first", - zap.Int("count", len(allInvalidCheckpoints)), - ) - - for tableName, status := range allInvalidCheckpoints { - failedStep := status * 10 - var action strings.Builder - action.WriteString("./tidb-lightning-ctl --checkpoint-error-") - switch failedStep { - case checkpoints.CheckpointStatusAlteredAutoInc, checkpoints.CheckpointStatusAnalyzed: - action.WriteString("ignore") - default: - action.WriteString("destroy") - } - action.WriteString("='") - action.WriteString(tableName) - action.WriteString("' --config=...") - - logger.Info("-", - zap.String("table", tableName), - zap.Uint8("status", uint8(status)), - zap.String("failedStep", failedStep.MetricName()), - zap.Stringer("recommendedAction", &action), - ) - } - - logger.Info("You may also run `./tidb-lightning-ctl --checkpoint-error-destroy=all --config=...` to start from scratch") - logger.Info("For details of this failure, read the log file from the PREVIOUS run") - - return errors.New("TiDB Lightning has failed last time; please resolve these errors first") - } - if len(allDirtyCheckpoints) > 0 { - logger := log.L() - logger.Error( - "TiDB Lightning has detected tables with illegal checkpoints. To prevent data mismatch, this run will stop now. Please remove these checkpoints first", - zap.Int("count", len(allDirtyCheckpoints)), - ) - - for tableName := range allDirtyCheckpoints { - logger.Info("-", - zap.String("table", tableName), - zap.String("recommendedAction", "./tidb-lightning-ctl --checkpoint-remove='"+tableName+"' --config=..."), - ) - } - - logger.Info("You may also run `./tidb-lightning-ctl --checkpoint-remove=all --config=...` to start from scratch") - - return errors.New("TiDB Lightning has detected tables with illegal checkpoints; please remove these checkpoints first") - } - for _, dbMeta := range rc.dbMetas { dbInfo := rc.dbInfos[dbMeta.Name] for _, tableMeta := range dbMeta.Tables { @@ -1811,10 +1724,11 @@ func (rc *Controller) preCheckRequirements(ctx context.Context) error { } } } - if rc.cfg.App.CheckRequirements && rc.tidbGlue.OwnsSQLExecutor() { - // print check template only if check requirements is true. + if rc.tidbGlue.OwnsSQLExecutor() { + // print check info at any time. fmt.Print(rc.checkTemplate.Output()) - if !rc.checkTemplate.Success() { + if rc.cfg.App.CheckRequirements && !rc.checkTemplate.Success() { + // if check requirements is true, return error. if !taskExist && rc.taskMgr != nil { rc.taskMgr.CleanupTask(ctx) } @@ -1827,14 +1741,12 @@ func (rc *Controller) preCheckRequirements(ctx context.Context) error { // DataCheck checks the data schema which needs #rc.restoreSchema finished. func (rc *Controller) DataCheck(ctx context.Context) error { - if !rc.cfg.App.CheckRequirements { - log.L().Info("skip data check due to user requirement") - return nil - } var err error - err = rc.HasLargeCSV(rc.dbMetas) - if err != nil { - return errors.Trace(err) + if rc.cfg.App.CheckRequirements { + err = rc.HasLargeCSV(rc.dbMetas) + if err != nil { + return errors.Trace(err) + } } checkPointCriticalMsgs := make([]string, 0, len(rc.dbMetas)) schemaCriticalMsgs := make([]string, 0, len(rc.dbMetas)) @@ -1852,7 +1764,8 @@ func (rc *Controller) DataCheck(ctx context.Context) error { checkPointCriticalMsgs = append(checkPointCriticalMsgs, msgs...) } } - if noCheckpoint && rc.cfg.TikvImporter.Backend != config.BackendTiDB { + + if rc.cfg.App.CheckRequirements && noCheckpoint && rc.cfg.TikvImporter.Backend != config.BackendTiDB { if msgs, err = rc.SchemaIsValid(ctx, tableInfo); err != nil { return errors.Trace(err) } diff --git a/br/tests/lightning_checkpoint_dirty_tableid/run.sh b/br/tests/lightning_checkpoint_dirty_tableid/run.sh index dfd320508d4e2..eeddfd493c263 100755 --- a/br/tests/lightning_checkpoint_dirty_tableid/run.sh +++ b/br/tests/lightning_checkpoint_dirty_tableid/run.sh @@ -27,14 +27,16 @@ run_sql 'DROP DATABASE IF EXISTS cpdt' export GO_FAILPOINTS="" set +e -run_lightning --enable-checkpoint=1 --log-file "$TEST_DIR/lightning-checkpoint-dirty-tableid.log" --config "tests/$TEST_NAME/mysql.toml" -d "tests/$TEST_NAME/data" +# put stdout to log file for next grep +run_lightning --enable-checkpoint=1 --log-file "$TEST_DIR/lightning-checkpoint-dirty-tableid.log" --config "tests/$TEST_NAME/mysql.toml" -d "tests/$TEST_NAME/data" >> "$TEST_DIR/lightning-checkpoint-dirty-tableid.log" set -e -ILLEGAL_CP_COUNT=$(grep "TiDB Lightning has detected tables with illegal checkpoints. To prevent data mismatch, this run will stop now. Please remove these checkpoints first" "$TEST_DIR/lightning-checkpoint-dirty-tableid.log" | wc -l) -TABLE_SUGGEST=$(grep "./tidb-lightning-ctl --checkpoint-remove=" "$TEST_DIR/lightning-checkpoint-dirty-tableid.log" | wc -l) +# some msg will split into two lines when put them into chart. +ILLEGAL_CP_COUNT=$(grep "TiDB Lightning has detected tables with illegal checkpoints. To prevent data loss, this run will stop now." "$TEST_DIR/lightning-checkpoint-dirty-tableid.log" | wc -l) +TABLE_SUGGEST=$(grep "checkpoint-remove=" "$TEST_DIR/lightning-checkpoint-dirty-tableid.log" | wc -l) [ $ILLEGAL_CP_COUNT -eq 1 ] -[ $TABLE_SUGGEST -eq 2 ] +[ $TABLE_SUGGEST -eq 1 ] # Try again with the file checkpoints @@ -50,11 +52,13 @@ run_sql 'DROP DATABASE IF EXISTS cpdt' export GO_FAILPOINTS="" set +e -run_lightning --enable-checkpoint=1 --log-file "$TEST_DIR/lightning-checkpoint-dirty-tableid.log" --config "tests/$TEST_NAME/file.toml" -d "tests/$TEST_NAME/data" +# put stdout to log file for next grep +run_lightning --enable-checkpoint=1 --log-file "$TEST_DIR/lightning-checkpoint-dirty-tableid.log" --config "tests/$TEST_NAME/file.toml" -d "tests/$TEST_NAME/data" >> "$TEST_DIR/lightning-checkpoint-dirty-tableid.log" set -e -ILLEGAL_CP_COUNT=$(grep "TiDB Lightning has detected tables with illegal checkpoints. To prevent data mismatch, this run will stop now. Please remove these checkpoints first" "$TEST_DIR/lightning-checkpoint-dirty-tableid.log" | wc -l) -TABLE_SUGGEST=$(grep "./tidb-lightning-ctl --checkpoint-remove=" "$TEST_DIR/lightning-checkpoint-dirty-tableid.log" | wc -l) +# some msg will split into two lines when put them into chart. +ILLEGAL_CP_COUNT=$(grep "TiDB Lightning has detected tables with illegal checkpoints. To prevent data loss, this run will stop now." "$TEST_DIR/lightning-checkpoint-dirty-tableid.log" | wc -l) +TABLE_SUGGEST=$(grep "checkpoint-remove=" "$TEST_DIR/lightning-checkpoint-dirty-tableid.log" | wc -l) [ $ILLEGAL_CP_COUNT -eq 1 ] -[ $TABLE_SUGGEST -eq 2 ] +[ $TABLE_SUGGEST -eq 1 ] diff --git a/br/tests/lightning_error_summary/run.sh b/br/tests/lightning_error_summary/run.sh index 674268a54c6b2..dcb06d6bf8c2f 100755 --- a/br/tests/lightning_error_summary/run.sh +++ b/br/tests/lightning_error_summary/run.sh @@ -53,16 +53,18 @@ grep -Fq '[-] [table=`error_summary`.`c`] [status=checksum] [error="checksum mis # Now check the error log when the checkpoint is not cleaned. set +e -run_lightning --enable-checkpoint=1 --log-file "$TEST_DIR/lightning-error-summary.log" +# put stdout to log for next grep +run_lightning --enable-checkpoint=1 --log-file "$TEST_DIR/lightning-error-summary.log" >> "$TEST_DIR/lightning-error-summary.log" ERRORCODE=$? set -e [ "$ERRORCODE" -ne 0 ] -tail -20 "$TEST_DIR/lightning-error-summary.log" > "$TEST_DIR/lightning-error-summary.tail" -grep -Fq '["TiDB Lightning has failed last time. To prevent data loss, this run will stop now. Please resolve errors first"] [count=2]' "$TEST_DIR/lightning-error-summary.tail" -grep -Fq '[-] [table=`error_summary`.`a`] [status=18] [failedStep=checksum] [recommendedAction="./tidb-lightning-ctl --checkpoint-error-destroy='"'"'`error_summary`.`a`'"'"' --config=..."]' "$TEST_DIR/lightning-error-summary.tail" -grep -Fq '[-] [table=`error_summary`.`c`] [status=18] [failedStep=checksum] [recommendedAction="./tidb-lightning-ctl --checkpoint-error-destroy='"'"'`error_summary`.`c`'"'"' --config=..."]' "$TEST_DIR/lightning-error-summary.tail" -! grep -Fq '[-] [table=`error_summary`.`b`] [status=18] [failedStep=checksum]' "$TEST_DIR/lightning-error-summary.tail" -grep -Fq '["You may also run `./tidb-lightning-ctl --checkpoint-error-destroy=all --config=...` to start from scratch"]' "$TEST_DIR/lightning-error-summary.tail" -grep -Fq '["For details of this failure, read the log file from the PREVIOUS run"]' "$TEST_DIR/lightning-error-summary.tail" +tail -100 "$TEST_DIR/lightning-error-summary.log" > "$TEST_DIR/lightning-error-summary.tail" +grep -Fq 'TiDB Lightning has failed last time. To prevent data loss, this run will stop now' "$TEST_DIR/lightning-error-summary.tail" +grep -Fq './tidb-lightning-ctl --checkpoint-error-destroy='"'"'`error_summary`.`a`'"'"' --config=...' "$TEST_DIR/lightning-error-summary.tail" +grep -Fq './tidb-lightning-ctl --checkpoint-error-destroy='"'"'`error_summary`.`c`'"'"' --config=...' "$TEST_DIR/lightning-error-summary.tail" +! grep -Fq './tidb-lightning-ctl --checkpoint-error-destroy='"'"'`error_summary`.`b`'"'"' --config=...' "$TEST_DIR/lightning-error-summary.tail" +grep -Fq 'checkpoint-error-destroy=all --config=...` to start from scratch' "$TEST_DIR/lightning-error-summary.tail" +grep -Fq 'For details of this failure, read the log file' "$TEST_DIR/lightning-error-summary.tail" +grep -Fq 'PREVIOUS run' "$TEST_DIR/lightning-error-summary.tail"