From 21416b1d9af2ac79791573f0a5e457e32039417b Mon Sep 17 00:00:00 2001 From: Ishan Tyagi <42602577+ishan16696@users.noreply.github.com> Date: Fri, 30 Aug 2024 00:00:57 +0530 Subject: [PATCH] Retry to take full snapshot if prev full snapshot fails due to some reason. (#765) * Retry to take full snapshot if prev full snapshot fails. * Retry with backoff. * Move checking inside func IsFullSnapshotRequiredAtStartup(). * Fixed unit tests. * Added unit test. Address review comments. --- pkg/server/backuprestoreserver.go | 5 ++- pkg/snapshot/snapshotter/snapshotter.go | 46 +++++++++++--------- pkg/snapshot/snapshotter/snapshotter_test.go | 19 ++++++++ 3 files changed, 49 insertions(+), 21 deletions(-) diff --git a/pkg/server/backuprestoreserver.go b/pkg/server/backuprestoreserver.go index a32bd8fa7..b8ba2d128 100644 --- a/pkg/server/backuprestoreserver.go +++ b/pkg/server/backuprestoreserver.go @@ -422,14 +422,17 @@ func (b *BackupRestoreServer) runEtcdProbeLoopWithSnapshotter(ctx context.Contex if !initialDeltaSnapshotTaken { // need to take a full snapshot here // if initial deltaSnapshot is not taken + // or previous full snapshot wasn't successful var snapshot *brtypes.Snapshot metrics.SnapshotRequired.With(prometheus.Labels{metrics.LabelKind: brtypes.SnapshotKindDelta}).Set(0) metrics.SnapshotRequired.With(prometheus.Labels{metrics.LabelKind: brtypes.SnapshotKindFull}).Set(1) if snapshot, err = ssr.TakeFullSnapshotAndResetTimer(false); err != nil { metrics.SnapshotterOperationFailure.With(prometheus.Labels{metrics.LabelError: err.Error()}).Inc() - b.logger.Errorf("Failed to take substitute first full snapshot: %v", err) + ssr.PrevFullSnapshotSucceeded = false + b.logger.Errorf("Failed to take substitute full snapshot: %v", err) continue } + ssr.PrevFullSnapshotSucceeded = true if b.config.HealthConfig.SnapshotLeaseRenewalEnabled { leaseUpdatectx, cancel := context.WithTimeout(ctx, brtypes.LeaseUpdateTimeoutDuration) defer cancel() diff --git a/pkg/snapshot/snapshotter/snapshotter.go b/pkg/snapshot/snapshotter/snapshotter.go index 79c9e6842..6e9331868 100644 --- a/pkg/snapshot/snapshotter/snapshotter.go +++ b/pkg/snapshot/snapshotter/snapshotter.go @@ -109,6 +109,7 @@ type Snapshotter struct { K8sClientset client.Client snapstoreConfig *brtypes.SnapstoreConfig lastSecretModifiedTime time.Time + PrevFullSnapshotSucceeded bool } // NewSnapshotter returns the snapshotter object. @@ -149,25 +150,26 @@ func NewSnapshotter(logger *logrus.Entry, config *brtypes.SnapshotterConfig, sto } return &Snapshotter{ - logger: logger.WithField("actor", "snapshotter"), - store: store, - config: config, - etcdConnectionConfig: etcdConnectionConfig, - compressionConfig: compressionConfig, - HealthConfig: healthConfig, - schedule: sdl, - PrevSnapshot: prevSnapshot, - PrevFullSnapshot: fullSnap, - PrevDeltaSnapshots: deltaSnapList, - SsrState: brtypes.SnapshotterInactive, - SsrStateMutex: &sync.Mutex{}, - fullSnapshotReqCh: make(chan bool), - deltaSnapshotReqCh: make(chan struct{}), - fullSnapshotAckCh: make(chan result), - deltaSnapshotAckCh: make(chan result), - cancelWatch: func() {}, - K8sClientset: clientSet, - snapstoreConfig: storeConfig, + logger: logger.WithField("actor", "snapshotter"), + store: store, + config: config, + etcdConnectionConfig: etcdConnectionConfig, + compressionConfig: compressionConfig, + HealthConfig: healthConfig, + schedule: sdl, + PrevSnapshot: prevSnapshot, + PrevFullSnapshot: fullSnap, + PrevDeltaSnapshots: deltaSnapList, + SsrState: brtypes.SnapshotterInactive, + SsrStateMutex: &sync.Mutex{}, + fullSnapshotReqCh: make(chan bool), + deltaSnapshotReqCh: make(chan struct{}), + fullSnapshotAckCh: make(chan result), + deltaSnapshotAckCh: make(chan result), + cancelWatch: func() {}, + K8sClientset: clientSet, + snapstoreConfig: storeConfig, + PrevFullSnapshotSucceeded: true, }, nil } @@ -648,8 +650,10 @@ func (ssr *Snapshotter) snapshotEventHandler(stopCh <-chan struct{}) error { } ssr.fullSnapshotAckCh <- res if err != nil { + ssr.PrevFullSnapshotSucceeded = false return err } + ssr.PrevFullSnapshotSucceeded = true if ssr.HealthConfig.SnapshotLeaseRenewalEnabled { ssr.FullSnapshotLeaseUpdateTimer.Stop() ssr.FullSnapshotLeaseUpdateTimer.Reset(time.Nanosecond) @@ -675,8 +679,10 @@ func (ssr *Snapshotter) snapshotEventHandler(stopCh <-chan struct{}) error { case <-ssr.fullSnapshotTimer.C: if _, err := ssr.TakeFullSnapshotAndResetTimer(false); err != nil { + ssr.PrevFullSnapshotSucceeded = false return err } + ssr.PrevFullSnapshotSucceeded = true if ssr.HealthConfig.SnapshotLeaseRenewalEnabled { ssr.FullSnapshotLeaseUpdateTimer.Stop() ssr.FullSnapshotLeaseUpdateTimer.Reset(time.Nanosecond) @@ -764,7 +770,7 @@ func (ssr *Snapshotter) hasSnapStoreSecretUpdated() (bool, error) { // IsFullSnapshotRequiredAtStartup checks whether to take a full snapshot or not during the startup of backup-restore. func (ssr *Snapshotter) IsFullSnapshotRequiredAtStartup(timeWindow float64) bool { - if ssr.PrevFullSnapshot == nil || ssr.PrevFullSnapshot.IsFinal || time.Since(ssr.PrevFullSnapshot.CreatedOn).Hours() > timeWindow { + if ssr.PrevFullSnapshot == nil || ssr.PrevFullSnapshot.IsFinal || time.Since(ssr.PrevFullSnapshot.CreatedOn).Hours() > timeWindow || !ssr.PrevFullSnapshotSucceeded { return true } diff --git a/pkg/snapshot/snapshotter/snapshotter_test.go b/pkg/snapshot/snapshotter/snapshotter_test.go index 4b573d601..ce85159a6 100644 --- a/pkg/snapshot/snapshotter/snapshotter_test.go +++ b/pkg/snapshot/snapshotter/snapshotter_test.go @@ -790,6 +790,22 @@ var _ = Describe("Snapshotter", func() { }) }) + Context("Previous full snapshot was not successful", func() { + It("should return true", func() { + snapshotterConfig := &brtypes.SnapshotterConfig{ + FullSnapshotSchedule: fmt.Sprintf("%d %d * * *", (currentMin+1)%60, (currentHour+2)%24), + } + + ssr, err = NewSnapshotter(logger, snapshotterConfig, store, etcdConnectionConfig, compressionConfig, healthConfig, snapstoreConfig) + Expect(err).ShouldNot(HaveOccurred()) + + // previous full snapshot wasn't successful + ssr.PrevFullSnapshotSucceeded = false + isFullSnapMissed := ssr.IsFullSnapshotRequiredAtStartup(fullSnapshotTimeWindow) + Expect(isFullSnapMissed).Should(BeTrue()) + }) + }) + Context("Previous full snapshot was taken exactly at scheduled snapshot time, no FullSnapshot was missed", func() { It("should return false", func() { snapshotterConfig := &brtypes.SnapshotterConfig{ @@ -803,6 +819,7 @@ var _ = Describe("Snapshotter", func() { ssr.PrevFullSnapshot = &brtypes.Snapshot{ CreatedOn: time.Date(time.Now().Year(), time.Now().Month(), time.Now().Day()-1, (currentHour+2)%24, (currentMin+1)%60, 0, 0, time.Local), } + ssr.PrevFullSnapshotSucceeded = true isFullSnapMissed := ssr.IsFullSnapshotRequiredAtStartup(fullSnapshotTimeWindow) Expect(isFullSnapMissed).Should(BeFalse()) @@ -823,6 +840,7 @@ var _ = Describe("Snapshotter", func() { ssr.PrevFullSnapshot = &brtypes.Snapshot{ CreatedOn: time.Date(time.Now().Year(), time.Now().Month(), time.Now().Day(), (currentHour-4)%24, (currentMin-10)%60, 0, 0, time.Local), } + ssr.PrevFullSnapshotSucceeded = true isFullSnapCanBeMissed := ssr.IsFullSnapshotRequiredAtStartup(fullSnapshotTimeWindow) Expect(isFullSnapCanBeMissed).Should(BeFalse()) }) @@ -842,6 +860,7 @@ var _ = Describe("Snapshotter", func() { ssr.PrevFullSnapshot = &brtypes.Snapshot{ CreatedOn: time.Date(time.Now().Year(), time.Now().Month(), time.Now().Day(), (currentHour-18)%24, (currentMin)%60, 0, 0, time.Local), } + ssr.PrevFullSnapshotSucceeded = true isFullSnapCanBeMissed := ssr.IsFullSnapshotRequiredAtStartup(fullSnapshotTimeWindow) Expect(isFullSnapCanBeMissed).Should(BeTrue()) })