Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enhances the decision to take a full snapshot during startup to avoid missing any full snapshot. #574

Merged
16 changes: 15 additions & 1 deletion pkg/miscellaneous/miscellaneous.go
Original file line number Diff line number Diff line change
Expand Up @@ -357,7 +357,6 @@ func IsMultiNode(logger *logrus.Entry) bool {
}

config := map[string]interface{}{}
err = yaml.Unmarshal([]byte(configYML), &config)
if err := yaml.Unmarshal([]byte(configYML), &config); err != nil {
return false
}
Expand Down Expand Up @@ -550,3 +549,18 @@ func IsPeerURLTLSEnabled() (bool, error) {

return peerURL.Scheme == https, nil
}

// GetPrevScheduledSnapTime returns the previous scheduled snapshot time, obtained by
// moving nextSnapSchedule back by timeWindow hours on the wall clock.
// Only the whole-hour part of timeWindow is applied (it is truncated to an int), and the
// result is expressed in the same location as nextSnapSchedule.
// TODO: Previous full snapshot time should be calculated on basis of previous cron schedule of full snapshot.
func GetPrevScheduledSnapTime(nextSnapSchedule time.Time, timeWindow float64) time.Time {
	year, mon, day := nextSnapSchedule.Date()
	hh, mm, ss := nextSnapSchedule.Clock()

	// time.Date normalizes out-of-range fields, so a negative hour rolls back
	// into the preceding day(s).
	return time.Date(year, mon, day, hh-int(timeWindow), mm, ss, nextSnapSchedule.Nanosecond(), nextSnapSchedule.Location())
}
63 changes: 28 additions & 35 deletions pkg/server/backuprestoreserver.go
Original file line number Diff line number Diff line change
Expand Up @@ -307,8 +307,8 @@ func (b *BackupRestoreServer) runServer(ctx context.Context, restoreOpts *brtype
// for the case when backup-restore becomes leading sidecar.
func (b *BackupRestoreServer) runEtcdProbeLoopWithSnapshotter(ctx context.Context, handler *HTTPHandler, ssr *snapshotter.Snapshotter, ss brtypes.SnapStore, ssrStopCh chan struct{}, ackCh chan struct{}) {
var (
err error
initialDeltaSnapshotTaken bool
err error
initialFullSnapshotTaken bool
)

for {
Expand Down Expand Up @@ -353,16 +353,32 @@ func (b *BackupRestoreServer) runEtcdProbeLoopWithSnapshotter(ctx context.Contex
// the delta snapshot memory limit), after which a full snapshot
// is taken and the regular snapshot schedule comes into effect.

// TODO: write code to find out if prev full snapshot is older than it is
// supposed to be, according to the given cron schedule, instead of the
// hard-coded "24 hours" full snapshot interval
fullSnapshotMaxTimeWindowInHours := ssr.GetFullSnapshotMaxTimeWindow(b.config.SnapshotterConfig.FullSnapshotSchedule)
initialFullSnapshotTaken = false
if ssr.IsFullSnapshotRequiredAtStartup(fullSnapshotMaxTimeWindowInHours) {
// need to take a full snapshot here
var snapshot *brtypes.Snapshot
metrics.SnapshotRequired.With(prometheus.Labels{metrics.LabelKind: brtypes.SnapshotKindDelta}).Set(0)
metrics.SnapshotRequired.With(prometheus.Labels{metrics.LabelKind: brtypes.SnapshotKindFull}).Set(1)
if snapshot, err = ssr.TakeFullSnapshotAndResetTimer(false); err != nil {
metrics.SnapshotterOperationFailure.With(prometheus.Labels{metrics.LabelError: err.Error()}).Inc()
b.logger.Errorf("Failed to take substitute first full snapshot: %v", err)
continue
}
initialFullSnapshotTaken = true
if b.config.HealthConfig.SnapshotLeaseRenewalEnabled {
leaseUpdatectx, cancel := context.WithTimeout(ctx, brtypes.LeaseUpdateTimeoutDuration)
defer cancel()
if err = heartbeat.FullSnapshotCaseLeaseUpdate(leaseUpdatectx, b.logger, snapshot, ssr.K8sClientset, b.config.HealthConfig.FullSnapshotLeaseName, b.config.HealthConfig.DeltaSnapshotLeaseName); err != nil {
b.logger.Warnf("Snapshot lease update failed : %v", err)
}
}
if b.backoffConfig.Start {
b.backoffConfig.ResetExponentialBackoff()
}
}

// Temporary fix for missing alternate full snapshots for Gardener shoots
// with hibernation schedule set: change value from 24 to 23.5 to
// accommodate for slight pod spin-up delays on shoot wake-up
const recentFullSnapshotPeriodInHours = 23.5
initialDeltaSnapshotTaken = false
if ssr.PrevFullSnapshot != nil && !ssr.PrevFullSnapshot.IsFinal && time.Since(ssr.PrevFullSnapshot.CreatedOn).Hours() <= recentFullSnapshotPeriodInHours {
if !initialFullSnapshotTaken {
ssrStopped, err := ssr.CollectEventsSincePrevSnapshot(ssrStopCh)
if ssrStopped {
b.logger.Info("Snapshotter stopped.")
Expand All @@ -375,7 +391,6 @@ func (b *BackupRestoreServer) runEtcdProbeLoopWithSnapshotter(ctx context.Contex
b.logger.Warnf("Failed to take first delta snapshot: snapshotter failed with error: %v", err)
continue
}
initialDeltaSnapshotTaken = true
if b.config.HealthConfig.SnapshotLeaseRenewalEnabled {
leaseUpdatectx, cancel := context.WithTimeout(ctx, brtypes.LeaseUpdateTimeoutDuration)
defer cancel()
Expand All @@ -391,28 +406,6 @@ func (b *BackupRestoreServer) runEtcdProbeLoopWithSnapshotter(ctx context.Contex
}
}

if !initialDeltaSnapshotTaken {
// need to take a full snapshot here
var snapshot *brtypes.Snapshot
metrics.SnapshotRequired.With(prometheus.Labels{metrics.LabelKind: brtypes.SnapshotKindDelta}).Set(0)
metrics.SnapshotRequired.With(prometheus.Labels{metrics.LabelKind: brtypes.SnapshotKindFull}).Set(1)
if snapshot, err = ssr.TakeFullSnapshotAndResetTimer(false); err != nil {
metrics.SnapshotterOperationFailure.With(prometheus.Labels{metrics.LabelError: err.Error()}).Inc()
b.logger.Errorf("Failed to take substitute first full snapshot: %v", err)
continue
}
if b.config.HealthConfig.SnapshotLeaseRenewalEnabled {
leaseUpdatectx, cancel := context.WithTimeout(ctx, brtypes.LeaseUpdateTimeoutDuration)
defer cancel()
if err = heartbeat.FullSnapshotCaseLeaseUpdate(leaseUpdatectx, b.logger, snapshot, ssr.K8sClientset, b.config.HealthConfig.FullSnapshotLeaseName, b.config.HealthConfig.DeltaSnapshotLeaseName); err != nil {
b.logger.Warnf("Snapshot lease update failed : %v", err)
}
}
if b.backoffConfig.Start {
b.backoffConfig.ResetExponentialBackoff()
}
}

// Set snapshotter state to Active
ssr.SetSnapshotterActive()

Expand All @@ -423,7 +416,7 @@ func (b *BackupRestoreServer) runEtcdProbeLoopWithSnapshotter(ctx context.Contex

// Start snapshotter
b.logger.Infof("Starting snapshotter...")
startWithFullSnapshot := ssr.PrevFullSnapshot == nil || ssr.PrevFullSnapshot.IsFinal || !(time.Since(ssr.PrevFullSnapshot.CreatedOn).Hours() <= recentFullSnapshotPeriodInHours)
startWithFullSnapshot := ssr.IsFullSnapshotRequiredAtStartup(fullSnapshotMaxTimeWindowInHours)
if err := ssr.Run(ssrStopCh, startWithFullSnapshot); err != nil {
if etcdErr, ok := err.(*errors.EtcdError); ok {
metrics.SnapshotterOperationFailure.With(prometheus.Labels{metrics.LabelError: etcdErr.Error()}).Inc()
Expand Down
71 changes: 68 additions & 3 deletions pkg/snapshot/snapshotter/snapshotter.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ import (
"fmt"
"io"
"path"
"strconv"
"strings"
"sync"
"time"

Expand All @@ -42,6 +44,15 @@ import (
"sigs.k8s.io/controller-runtime/pkg/client"
)

// Indices of the fields in a whitespace-split, five-field cron schedule spec.
const (
	min        = iota // Minutes field
	hour              // Hours field
	dayOfMonth        // Day of month field
	month             // Month field
	dayOfWeek         // Day of week field
)

// defaultFullSnapMaxTimeWindow is the default full snapshot time window in hours.
const defaultFullSnapMaxTimeWindow = 24

var (
emptyStruct struct{}
snapstoreHash = make(map[string]interface{})
Expand Down Expand Up @@ -104,7 +115,7 @@ func NewSnapshotter(logger *logrus.Entry, config *brtypes.SnapshotterConfig, sto
sdl, err := cron.ParseStandard(config.FullSnapshotSchedule)
if err != nil {
// Ideally this should be validated before.
return nil, fmt.Errorf("invalid schedule provied %s : %v", config.FullSnapshotSchedule, err)
return nil, fmt.Errorf("invalid full snapshot schedule provided %s : %v", config.FullSnapshotSchedule, err)
}

var prevSnapshot *brtypes.Snapshot
Expand Down Expand Up @@ -473,12 +484,12 @@ func (ssr *Snapshotter) TakeDeltaSnapshot() (*brtypes.Snapshot, error) {
defer rc.Close()

if err := ssr.store.Save(*snap, rc); err != nil {
timeTaken := time.Now().Sub(startTime).Seconds()
timeTaken := time.Since(startTime).Seconds()
metrics.SnapshotDurationSeconds.With(prometheus.Labels{metrics.LabelKind: brtypes.SnapshotKindDelta, metrics.LabelSucceeded: metrics.ValueSucceededFalse}).Observe(timeTaken)
ssr.logger.Errorf("Error saving delta snapshots. %v", err)
return nil, err
}
timeTaken := time.Now().Sub(startTime).Seconds()
timeTaken := time.Since(startTime).Seconds()
metrics.SnapshotDurationSeconds.With(prometheus.Labels{metrics.LabelKind: brtypes.SnapshotKindDelta, metrics.LabelSucceeded: metrics.ValueSucceededTrue}).Observe(timeTaken)
logrus.Infof("Total time to save delta snapshot: %f seconds.", timeTaken)
ssr.prevSnapshot = snap
Expand Down Expand Up @@ -740,3 +751,57 @@ func (ssr *Snapshotter) checkSnapstoreSecretUpdate() bool {
snapstoreHash[ssr.snapstoreConfig.Provider] = newSnapstoreSecretHash
return true
}

// IsFullSnapshotRequiredAtStartup reports whether a full snapshot must be taken during
// the startup of backup-restore. timeWindow is the maximum allowed age, in hours, of the
// previous full snapshot.
//
// A full snapshot is required when there is no previous full snapshot, when the previous
// one is marked final, or when it is older than timeWindow. Otherwise it is required only
// if the preceding scheduled full snapshot was missed and the next scheduled one would
// fall beyond timeWindow.
func (ssr *Snapshotter) IsFullSnapshotRequiredAtStartup(timeWindow float64) bool {
	prev := ssr.PrevFullSnapshot
	if prev == nil {
		return true
	}
	if prev.IsFinal {
		return true
	}
	if age := time.Since(prev.CreatedOn); age.Hours() > timeWindow {
		return true
	}

	// The second check runs only when the first one reports a missed snapshot.
	return ssr.WasScheduledFullSnapshotMissed(timeWindow) && ssr.IsNextFullSnapshotBeyondTimeWindow(timeWindow)
}

// WasScheduledFullSnapshotMissed determines whether the preceding full-snapshot was missed or not.
func (ssr *Snapshotter) WasScheduledFullSnapshotMissed(timeWindow float64) bool {
aaronfern marked this conversation as resolved.
Show resolved Hide resolved
now := time.Now()
nextSnapSchedule := ssr.schedule.Next(now)

if miscellaneous.GetPrevScheduledSnapTime(nextSnapSchedule, timeWindow) == ssr.PrevFullSnapshot.CreatedOn {
ssr.logger.Info("previous full snapshot was taken at scheduled time, skipping the full snapshot at startup")
return false
}
return true
}

// IsNextFullSnapshotBeyondTimeWindow reports whether waiting for the next scheduled full
// snapshot would exceed timeWindow (in hours): the time remaining until the next scheduled
// run plus the time elapsed since ssr.PrevFullSnapshot was created is compared with
// timeWindow.
//
// ssr.PrevFullSnapshot must be non-nil; it is dereferenced without a check.
func (ssr *Snapshotter) IsNextFullSnapshotBeyondTimeWindow(timeWindow float64) bool {
	current := time.Now()

	hoursUntilNext := ssr.schedule.Next(current).Sub(current).Hours()
	hoursSincePrev := time.Since(ssr.PrevFullSnapshot.CreatedOn).Hours()

	return hoursUntilNext+hoursSincePrev > timeWindow
}

// GetFullSnapshotMaxTimeWindow returns the maximum time period in hours for which backup-restore must take atleast one full snapshot.
func (ssr *Snapshotter) GetFullSnapshotMaxTimeWindow(fullSnapScheduleSpec string) float64 {
ishan16696 marked this conversation as resolved.
Show resolved Hide resolved
// Split on whitespace.
schedule := strings.Fields(fullSnapScheduleSpec)
if len(schedule) < 5 {
return defaultFullSnapMaxTimeWindow
}

if schedule[dayOfWeek] != "*" {
return defaultFullSnapMaxTimeWindow * 7
}

if schedule[dayOfMonth] == "*" && schedule[dayOfWeek] == "*" && strings.Contains(schedule[hour], "/") {
if timeWindow, err := strconv.ParseFloat(schedule[hour][strings.Index(schedule[hour], "/")+1:], 64); err == nil {
return timeWindow
}
}

return defaultFullSnapMaxTimeWindow
}
Loading