From 80f96ed0e4d703b63dcaa94a2c435d19b82bc403 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Mon, 11 Dec 2023 13:31:48 +0800 Subject: [PATCH] ebs br: control the snapshots batch size for fsr enable/disable (#48506) (#48526) close pingcap/tidb#48505 --- br/pkg/aws/ebs.go | 79 ++++++++++++++++++++------------- br/pkg/task/restore_ebs_meta.go | 5 ++- 2 files changed, 51 insertions(+), 33 deletions(-) diff --git a/br/pkg/aws/ebs.go b/br/pkg/aws/ebs.go index eb7b51e26e40f..ff90e50851fa7 100644 --- a/br/pkg/aws/ebs.go +++ b/br/pkg/aws/ebs.go @@ -27,6 +27,7 @@ import ( const ( pollingPendingSnapshotInterval = 30 * time.Second errCodeTooManyPendingSnapshots = "PendingSnapshotLimitExceeded" + FsrApiSnapshotsThreshold = 10 ) type EC2Session struct { @@ -294,24 +295,32 @@ func (e *EC2Session) EnableDataFSR(meta *config.EBSBasedBRMeta, targetAZ string) for availableZone := range snapshotsIDsMap { targetAZ := availableZone - eg.Go(func() error { - log.Info("enable fsr for snapshots", zap.String("available zone", targetAZ)) - resp, err := e.ec2.EnableFastSnapshotRestores(&ec2.EnableFastSnapshotRestoresInput{ - AvailabilityZones: []*string{&targetAZ}, - SourceSnapshotIds: snapshotsIDsMap[targetAZ], - }) - - if err != nil { - return errors.Trace(err) + // We have to control the batch size to avoid the error of "parameter SourceSnapshotIds must be less than or equal to 10" + for i := 0; i < len(snapshotsIDsMap[targetAZ]); i += FsrApiSnapshotsThreshold { + start := i + end := i + FsrApiSnapshotsThreshold + if end > len(snapshotsIDsMap[targetAZ]) { + end = len(snapshotsIDsMap[targetAZ]) } + eg.Go(func() error { + log.Info("enable fsr for snapshots", zap.String("available zone", targetAZ), zap.Any("snapshots", snapshotsIDsMap[targetAZ][start:end])) + resp, err := e.ec2.EnableFastSnapshotRestores(&ec2.EnableFastSnapshotRestoresInput{ + AvailabilityZones: []*string{&targetAZ}, + SourceSnapshotIds: snapshotsIDsMap[targetAZ][start:end], + }) - if len(resp.Unsuccessful) > 0 { - log.Warn("not all snapshots enabled FSR") - return errors.Errorf("Some snapshot fails to enable FSR for available zone %s, such as %s, error code is %v", targetAZ, *resp.Unsuccessful[0].SnapshotId, resp.Unsuccessful[0].FastSnapshotRestoreStateErrors) - } + if err != nil { + return errors.Trace(err) + } - return e.waitDataFSREnabled(snapshotsIDsMap[targetAZ], targetAZ) - }) + if len(resp.Unsuccessful) > 0 { + log.Warn("not all snapshots enabled FSR") + return errors.Errorf("Some snapshot fails to enable FSR for available zone %s, such as %s, error code is %v", targetAZ, *resp.Unsuccessful[0].SnapshotId, resp.Unsuccessful[0].FastSnapshotRestoreStateErrors) + } + + return e.waitDataFSREnabled(snapshotsIDsMap[targetAZ][start:end], targetAZ) + }) + } } return snapshotsIDsMap, eg.Wait() } @@ -329,7 +338,7 @@ func (e *EC2Session) waitDataFSREnabled(snapShotIDs []*string, targetAZ string) log.Info("starts check fsr pending snapshots", zap.Any("snapshots", pendingSnapshots), zap.String("available zone", targetAZ)) for { if len(pendingSnapshots) == 0 { - log.Info("all snapshots fsr enablement is finished", zap.String("available zone", targetAZ)) + log.Info("all snapshots in current batch fsr enablement is finished", zap.String("available zone", targetAZ), zap.Any("snapshots", snapShotIDs)) return nil } @@ -380,25 +389,33 @@ func (e *EC2Session) DisableDataFSR(snapshotsIDsMap map[string][]*string) error for availableZone := range snapshotsIDsMap { targetAZ := availableZone - eg.Go(func() error { - resp, err := e.ec2.DisableFastSnapshotRestores(&ec2.DisableFastSnapshotRestoresInput{ - AvailabilityZones: []*string{&targetAZ}, - SourceSnapshotIds: snapshotsIDsMap[targetAZ], - }) - - if err != nil { - return errors.Trace(err) + // We have to control the batch size to avoid the error of "parameter SourceSnapshotIds must be less than or equal to 10" + for i := 0; i < len(snapshotsIDsMap[targetAZ]); i += FsrApiSnapshotsThreshold { + start := i + end := i + FsrApiSnapshotsThreshold + if end > len(snapshotsIDsMap[targetAZ]) { + end = len(snapshotsIDsMap[targetAZ]) } + eg.Go(func() error { + resp, err := e.ec2.DisableFastSnapshotRestores(&ec2.DisableFastSnapshotRestoresInput{ + AvailabilityZones: []*string{&targetAZ}, + SourceSnapshotIds: snapshotsIDsMap[targetAZ][start:end], + }) - if len(resp.Unsuccessful) > 0 { - log.Warn("not all snapshots disabled FSR", zap.String("available zone", targetAZ)) - return errors.Errorf("Some snapshot fails to disable FSR for available zone %s, such as %s, error code is %v", targetAZ, *resp.Unsuccessful[0].SnapshotId, resp.Unsuccessful[0].FastSnapshotRestoreStateErrors) - } + if err != nil { + return errors.Trace(err) + } - log.Info("Disable FSR issued", zap.String("available zone", targetAZ)) + if len(resp.Unsuccessful) > 0 { + log.Warn("not all snapshots disabled FSR", zap.String("available zone", targetAZ)) + return errors.Errorf("Some snapshot fails to disable FSR for available zone %s, such as %s, error code is %v", targetAZ, *resp.Unsuccessful[0].SnapshotId, resp.Unsuccessful[0].FastSnapshotRestoreStateErrors) + } - return nil - }) + log.Info("Disable FSR issued", zap.String("available zone", targetAZ), zap.Any("snapshots", snapshotsIDsMap[targetAZ][start:end])) + + return nil + }) + } } return eg.Wait() } diff --git a/br/pkg/task/restore_ebs_meta.go b/br/pkg/task/restore_ebs_meta.go index b8a5fda16c54a..a7317770fcc96 100644 --- a/br/pkg/task/restore_ebs_meta.go +++ b/br/pkg/task/restore_ebs_meta.go @@ -241,10 +241,11 @@ func (h *restoreEBSMetaHelper) restoreVolumes(progress glue.Progress) (map[strin log.Error("failed to create all volumes, cleaning up created volume") ec2Session.DeleteVolumes(volumeIDMap) } - if h.cfg.UseFSR { err = ec2Session.DisableDataFSR(snapshotsIDsMap) - log.Error("disable fsr failed", zap.Error(err)) + if err != nil { + log.Error("disable fsr failed", zap.Error(err)) + } } }()