br: restore checksum shouldn't rely on backup checksum #56712
@@ -166,11 +166,6 @@ type Table struct {
 	StatsFileIndexes []*backuppb.StatsFileIndex
 }

-// NoChecksum checks whether the table has a calculated checksum.
-func (tbl *Table) NoChecksum() bool {
-	return tbl.Crc64Xor == 0 && tbl.TotalKvs == 0 && tbl.TotalBytes == 0
-}
-
 // MetaReader wraps a reader to read both old and new version of backupmeta.
 type MetaReader struct {
 	storage storage.ExternalStorage
@@ -235,14 +230,41 @@ func (reader *MetaReader) readDataFiles(ctx context.Context, output func(*backup
 }

 // ArchiveSize return the size of Archive data
-func (*MetaReader) ArchiveSize(_ context.Context, files []*backuppb.File) uint64 {
+func ArchiveSize(files []*backuppb.File) uint64 {
 	total := uint64(0)
 	for _, file := range files {
 		total += file.Size_
 	}
 	return total
 }
+
+type ChecksumStats struct {
+	Crc64Xor   uint64
+	TotalKvs   uint64
+	TotalBytes uint64
+}
+
+func (stats *ChecksumStats) ChecksumExists() bool {
+	if stats == nil {
+		return false
+	}
+	if stats.Crc64Xor == 0 && stats.TotalKvs == 0 && stats.TotalBytes == 0 {
+		return false
+	}
+	return true
+}

Comment: It seems a pointer receiver isn't needed here, as I cannot find where a nil receiver could be passed in.

+
+// CalculateChecksumStatsOnFiles returns the ChecksumStats for the given files
+func CalculateChecksumStatsOnFiles(files []*backuppb.File) ChecksumStats {
+	var stats ChecksumStats
+	for _, file := range files {
+		stats.Crc64Xor ^= file.Crc64Xor
+		stats.TotalKvs += file.TotalKvs
+		stats.TotalBytes += file.TotalBytes
+	}
+	return stats
+}
+
 // ReadDDLs reads the ddls from the backupmeta.
 // This function is compatible with the old backupmeta.
 func (reader *MetaReader) ReadDDLs(ctx context.Context) ([]byte, error) {
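A minimal sketch of the value-receiver variant the reviewer is hinting at, assuming no caller ever holds a *ChecksumStats: the nil guard disappears because the zero value already means "no checksum recorded".

package metautil

// ChecksumStats mirrors the struct introduced in this PR.
type ChecksumStats struct {
	Crc64Xor   uint64
	TotalKvs   uint64
	TotalBytes uint64
}

// ChecksumExists with a value receiver: a zero ChecksumStats means no
// checksum was recorded, so no nil check is required.
func (stats ChecksumStats) ChecksumExists() bool {
	return stats.Crc64Xor != 0 || stats.TotalKvs != 0 || stats.TotalBytes != 0
}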
@@ -467,8 +467,8 @@ func (rc *SnapClient) needLoadSchemas(backupMeta *backuppb.BackupMeta) bool {
 	return !(backupMeta.IsRawKv || backupMeta.IsTxnKv)
 }

-// InitBackupMeta loads schemas from BackupMeta to initialize RestoreClient.
-func (rc *SnapClient) InitBackupMeta(
+// LoadSchemaIfNeededAndInitClient loads schemas from BackupMeta to initialize RestoreClient.
+func (rc *SnapClient) LoadSchemaIfNeededAndInitClient(
 	c context.Context,
 	backupMeta *backuppb.BackupMeta,
 	backend *backuppb.StorageBackend,
@@ -989,7 +989,7 @@ func (rc *SnapClient) setSpeedLimit(ctx context.Context, rateLimit uint64) error
 	return nil
 }

-func (rc *SnapClient) execChecksum(
+func (rc *SnapClient) execAndValidateChecksum(
 	ctx context.Context,
 	tbl *CreatedTable,
 	kvClient kv.Client,
@@ -1000,13 +1000,14 @@ func (rc *SnapClient) execChecksum(
 		zap.String("table", tbl.OldTable.Info.Name.O),
 	)

-	if tbl.OldTable.NoChecksum() {
-		logger.Warn("table has no checksum, skipping checksum")
+	expectedChecksumStats := metautil.CalculateChecksumStatsOnFiles(tbl.OldTable.Files)
+	if !expectedChecksumStats.ChecksumExists() {
+		logger.Error("table has no checksum, skipping checksum")
 		return nil
 	}

Comment: Not sure if we need to do the check here; if an empty table is backed up, should we check after restore that it's still empty?
Reply: Yes, the error log is misleading.
Reply: Should I remove the check here altogether?

Comment: Would you also print the table and database name here? Also I think this can be a …
Reply: Right, I think the line above adds the db and table as fields for all subsequent logging.
Comment: I'm a bit skeptical about this logic; should we just remove it? If an empty table is restored (would there be such a case?), we should still run the checksum to make sure it's empty. What do you think?

 	if span := opentracing.SpanFromContext(ctx); span != nil && span.Tracer() != nil {
-		span1 := span.Tracer().StartSpan("Client.execChecksum", opentracing.ChildOf(span.Context()))
+		span1 := span.Tracer().StartSpan("Client.execAndValidateChecksum", opentracing.ChildOf(span.Context()))
 		defer span1.Finish()
 		ctx = opentracing.ContextWithSpan(ctx, span1)
 	}
@@ -1046,21 +1047,24 @@ func (rc *SnapClient) execChecksum(
 			}
 		}
 	}
-	table := tbl.OldTable
-	if item.Crc64xor != table.Crc64Xor ||
-		item.TotalKvs != table.TotalKvs ||
-		item.TotalBytes != table.TotalBytes {
+	checksumMatch := item.Crc64xor == expectedChecksumStats.Crc64Xor &&
+		item.TotalKvs == expectedChecksumStats.TotalKvs &&
+		item.TotalBytes == expectedChecksumStats.TotalBytes
+	failpoint.Inject("full-restore-validate-checksum", func(_ failpoint.Value) {
+		checksumMatch = false
+	})
+	if !checksumMatch {
 		logger.Error("failed in validate checksum",
-			zap.Uint64("origin tidb crc64", table.Crc64Xor),
+			zap.Uint64("expected tidb crc64", expectedChecksumStats.Crc64Xor),
 			zap.Uint64("calculated crc64", item.Crc64xor),
-			zap.Uint64("origin tidb total kvs", table.TotalKvs),
+			zap.Uint64("expected tidb total kvs", expectedChecksumStats.TotalKvs),
 			zap.Uint64("calculated total kvs", item.TotalKvs),
-			zap.Uint64("origin tidb total bytes", table.TotalBytes),
+			zap.Uint64("expected tidb total bytes", expectedChecksumStats.TotalBytes),
 			zap.Uint64("calculated total bytes", item.TotalBytes),
 		)
 		return errors.Annotate(berrors.ErrRestoreChecksumMismatch, "failed to validate checksum")
 	}
-	logger.Info("success in validate checksum")
+	logger.Info("success in validating checksum")

Comment: Would you also add the table / database name to this log?
Reply: Same as above.

 	return nil
 }
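For reference, a hedged sketch of how a Go unit test could trip the same failpoint directly, rather than through the GO_FAILPOINTS environment variable used by the integration test further below. The failpoint path is the one that appears in that script; the package and test names are illustrative, and the failpoint only fires when the code has been built with failpoints activated (e.g. via failpoint-ctl).

package snapclient_test

import (
	"testing"

	"github.com/pingcap/failpoint"
	"github.com/stretchr/testify/require"
)

// Sketch only: force the checksum comparison to report a mismatch.
func TestChecksumMismatchFailpoint(t *testing.T) {
	fp := "github.com/pingcap/tidb/br/pkg/restore/snap_client/full-restore-validate-checksum"
	require.NoError(t, failpoint.Enable(fp, "return(true)"))
	defer func() {
		require.NoError(t, failpoint.Disable(fp))
	}()

	// ... call execAndValidateChecksum here and assert it returns
	// berrors.ErrRestoreChecksumMismatch ...
}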
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -800,6 +800,13 @@ func DefaultBackupConfig() BackupConfig {
 	if err != nil {
 		log.Panic("infallible operation failed.", zap.Error(err))
 	}
+
+	// Check if the checksum flag was set by the user
+	if !fs.Changed("checksum") {
+		// If not set, disable it for backup
+		cfg.Checksum = false
+	}
+
 	return cfg
 }

Comment: Use …
Reply: Oh right... my mistake.
Comment: It seems in this context, …
Comment: BTW, can we override the default value of …
Reply: I feel like having the override in …
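As a reference for the Changed-based pattern above, here is a small stand-alone spf13/pflag sketch (not BR's actual flag wiring): Changed reports whether the user explicitly set --checksum, which is what lets backup flip the effective default to false while the declared default stays true.

package main

import (
	"fmt"

	"github.com/spf13/pflag"
)

func main() {
	fs := pflag.NewFlagSet("br-backup", pflag.ContinueOnError)
	// Declared default is true, shared with other subcommands.
	fs.Bool("checksum", true, "run checksum after the operation")

	// Simulate a user invocation that does not mention --checksum.
	_ = fs.Parse([]string{})

	checksum, _ := fs.GetBool("checksum")
	if !fs.Changed("checksum") {
		// The user did not set the flag explicitly: backup turns it off.
		checksum = false
	}
	fmt.Println("effective checksum:", checksum) // prints false
}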
@@ -3,6 +3,7 @@
 package task

 import (
+	"os"
 	"testing"
 	"time"

@@ -222,3 +223,27 @@ func TestBackupConfigHash(t *testing.T) {
 		hashCheck(t, &testCfg, originalHash, true)
 	}
 }
+
+func TestDefaultBackupConfigDisableChecksum(t *testing.T) {
+	// Test the default configuration
+	cfg := DefaultBackupConfig()
+
+	// Check some default values
+	require.Equal(t, uint32(4), cfg.Concurrency)
+	require.Equal(t, uint32(2), cfg.ChecksumConcurrency)
+	require.False(t, cfg.SendCreds)
+	require.False(t, cfg.Checksum)
+
+	// Test with checksum flag set
+	os.Args = []string{"cmd", "--checksum=true"}
+	cfg = DefaultBackupConfig()
+	require.True(t, cfg.Checksum)

Comment: I don't think the …
Reply: Yeah, you are absolutely right, should revisit before making the PR ready for review. It did pass locally and on CI, which is weird.

+
+	// Test with checksum flag explicitly set to false
+	os.Args = []string{"cmd", "--checksum=false"}
+	cfg = DefaultBackupConfig()
+	require.False(t, cfg.Checksum)
+
+	// Reset os.Args
+	os.Args = []string{"cmd"}
+}
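The thread above suggests the test isn't exercising the flag path it thinks it is, presumably because DefaultBackupConfig never parses os.Args. A small stand-alone pflag sketch (not BR code) showing why mutating os.Args alone changes nothing until something actually calls Parse:

package main

import (
	"fmt"
	"os"

	"github.com/spf13/pflag"
)

func main() {
	fs := pflag.NewFlagSet("backup", pflag.ContinueOnError)
	checksum := fs.Bool("checksum", true, "run checksum")

	// Mutating os.Args by itself has no effect on the flag set.
	os.Args = []string{"cmd", "--checksum=false"}
	fmt.Println(*checksum, fs.Changed("checksum")) // true false

	// Only an explicit Parse of those arguments updates the flag.
	_ = fs.Parse(os.Args[1:])
	fmt.Println(*checksum, fs.Changed("checksum")) // false true
}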
@@ -801,7 +801,7 @@ func runSnapshotRestore(c context.Context, mgr *conn.Mgr, g glue.Glue, cmdName s
 	}

 	reader := metautil.NewMetaReader(backupMeta, s, &cfg.CipherInfo)
-	if err = client.InitBackupMeta(c, backupMeta, u, reader, cfg.LoadStats); err != nil {
+	if err = client.LoadSchemaIfNeededAndInitClient(c, backupMeta, u, reader, cfg.LoadStats); err != nil {
 		return errors.Trace(err)
 	}
@@ -822,7 +822,7 @@ func runSnapshotRestore(c context.Context, mgr *conn.Mgr, g glue.Glue, cmdName s
 		}
 	}

-	archiveSize := reader.ArchiveSize(ctx, files)
+	archiveSize := metautil.ArchiveSize(files)
 	g.Record(summary.RestoreDataSize, archiveSize)
 	//restore from tidb will fetch a general Size issue https://github.com/pingcap/tidb/issues/27247
 	g.Record("Size", archiveSize)
@@ -1108,8 +1108,9 @@ func runSnapshotRestore(c context.Context, mgr *conn.Mgr, g glue.Glue, cmdName s
 	errCh := make(chan error, 32)
 	postHandleCh := afterTableRestoredCh(ctx, createdTables)

-	// pipeline checksum
-	if cfg.Checksum {
+	// pipeline checksum only when enabled and is not incremental snapshot repair mode cuz incremental doesn't have
+	// enough information in backup meta to validate checksum
+	if cfg.Checksum && !client.IsIncremental() {
 		postHandleCh = client.GoValidateChecksum(
 			ctx, postHandleCh, mgr.GetStorage().GetClient(), errCh, updateCh, cfg.ChecksumConcurrency)
 	}

Comment: We don't use the collection of …
Reply: Could you point me to the code? I thought the table-level checksum for incremental backup is not calculated during backup.
Comment: Got it.
@@ -1124,7 +1125,7 @@ func runSnapshotRestore(c context.Context, mgr *conn.Mgr, g glue.Glue, cmdName s

 	finish := dropToBlackhole(ctx, postHandleCh, errCh)

-	// Reset speed limit. ResetSpeedLimit must be called after client.InitBackupMeta has been called.
+	// Reset speed limit. ResetSpeedLimit must be called after client.LoadSchemaIfNeededAndInitClient has been called.
 	defer func() {
 		var resetErr error
 		// In future we may need a mechanism to set speed limit in ttl. like what we do in switchmode. TODO
@@ -22,33 +22,62 @@ CUR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)

 run_sql "CREATE DATABASE $DB;"
 go-ycsb load mysql -P $CUR/workload -p mysql.host=$TIDB_IP -p mysql.port=$TIDB_PORT -p mysql.user=root -p mysql.db=$DB
-run_br --pd $PD_ADDR backup full -s "local://$TEST_DIR/$DB"
+run_br --pd $PD_ADDR backup full -s "local://$TEST_DIR/$DB" --checksum=false

-filename=$(find $TEST_DIR/$DB -regex ".*.sst" | head -n 1)
-filename_temp=$filename"_temp"
-filename_bak=$filename"_bak"
-echo "corruption" > $filename_temp
-cat $filename >> $filename_temp
+# Replace the single file manipulation with a loop over all .sst files
+for filename in $(find $TEST_DIR/$DB -name "*.sst"); do
+    filename_temp="${filename}_temp"
+    filename_bak="${filename}_bak"
+    echo "corruption" > "$filename_temp"
+    cat "$filename" >> "$filename_temp"
+    mv "$filename" "$filename_bak"
+done

Comment: We need to corrupt every SST file because not all SSTs are used during restore: the backup covers a bunch of tables, but some of them are filtered out during restore.

+
+# need to drop db otherwise restore will fail because of cluster not fresh but not the expected issue
+run_sql "DROP DATABASE IF EXISTS $DB;"

Comment: The previous br_file_corruption test was not testing what it should: restore failed because the cluster was not fresh, not because of the corruption.

 # file lost
-mv $filename $filename_bak
 export GO_FAILPOINTS="github.com/pingcap/tidb/br/pkg/utils/set-import-attempt-to-one=return(true)"
 restore_fail=0
 run_br --pd $PD_ADDR restore full -s "local://$TEST_DIR/$DB" || restore_fail=1
 export GO_FAILPOINTS=""
 if [ $restore_fail -ne 1 ]; then
-    echo 'restore success'
+    echo 'expect restore to fail on file lost but succeed'
     exit 1
 fi
+run_sql "DROP DATABASE IF EXISTS $DB;"

 # file corruption
-mv $filename_temp $filename
-truncate --size=-11 $filename
+for filename in $(find $TEST_DIR/$DB -name "*.sst_temp"); do
+    mv "$filename" "${filename%_temp}"
+    truncate -s 11 "${filename%_temp}"
+done

Comment: --size=-11 doesn't work on macOS.

 export GO_FAILPOINTS="github.com/pingcap/tidb/br/pkg/utils/set-import-attempt-to-one=return(true)"
 restore_fail=0
 run_br --pd $PD_ADDR restore full -s "local://$TEST_DIR/$DB" || restore_fail=1
 export GO_FAILPOINTS=""
 if [ $restore_fail -ne 1 ]; then
-    echo 'restore success'
+    echo 'expect restore to fail on file corruption but succeed'
     exit 1
 fi
+run_sql "DROP DATABASE IF EXISTS $DB;"

+# verify validating checksum is still performed even backup didn't enable it
+for filename in $(find $TEST_DIR/$DB -name "*.sst_bak"); do
+    mv "$filename" "${filename%_bak}"
+done
+
+export GO_FAILPOINTS="github.com/pingcap/tidb/br/pkg/restore/snap_client/full-restore-validate-checksum=return(true)"
+restore_fail=0
+run_br --pd $PD_ADDR restore full -s "local://$TEST_DIR/$DB" --checksum=true || restore_fail=1
+export GO_FAILPOINTS=""
+if [ $restore_fail -ne 1 ]; then
+    echo 'expect restore to fail on checksum mismatch but succeed'
+    exit 1
+fi

Comment: nit: I guess if we have a corrupted SST file, the restoration fails though with …
Reply: It's not testing a corrupted SST file here; the valid files are moved back after the previous corruption test. This test just verifies the checksum process runs even when backup disables it, by injecting the failpoint into the checksum routine. Right after this test there is a sanity test that restores successfully, verifying all files are valid.

+run_sql "DROP DATABASE IF EXISTS $DB;"
+
+# sanity check restore can succeed
+run_br --pd $PD_ADDR restore full -s "local://$TEST_DIR/$DB" --checksum=true
+echo 'file corruption tests passed'
Comment: Keep consistency with other fields?