Skip to content

Commit

Permalink
Improve computation of full snapshot interval for BackupReady condi…
Browse files Browse the repository at this point in the history
…tion (#729)
  • Loading branch information
shreyas-s-rao authored Dec 1, 2023
1 parent c28db3b commit d682519
Show file tree
Hide file tree
Showing 13 changed files with 177 additions and 76 deletions.
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ require (
github.com/onsi/ginkgo/v2 v2.11.0
github.com/onsi/gomega v1.27.8
github.com/prometheus/client_golang v1.14.0
github.com/robfig/cron/v3 v3.0.1
github.com/spf13/pflag v1.0.5
go.uber.org/zap v1.24.0
golang.org/x/exp v0.0.0-20230213192124-5e25df0256eb
Expand Down Expand Up @@ -99,7 +100,6 @@ require (
github.com/prometheus/client_model v0.3.0 // indirect
github.com/prometheus/common v0.37.0 // indirect
github.com/prometheus/procfs v0.8.0 // indirect
github.com/robfig/cron/v3 v3.0.1 // indirect
github.com/russross/blackfriday/v2 v2.1.0 // indirect
github.com/sirupsen/logrus v1.8.1 // indirect
github.com/spf13/afero v1.8.2 // indirect
Expand Down
4 changes: 2 additions & 2 deletions pkg/health/condition/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// skipMergeConditions contain the list of conditions we dont want to add to the list if not recalculated
// skipMergeConditions contain the list of conditions we don't want to add to the list if not recalculated
var skipMergeConditions = map[druidv1alpha1.ConditionType]struct{}{
druidv1alpha1.ConditionTypeReady: {},
druidv1alpha1.ConditionTypeAllMembersReady: {},
Expand Down Expand Up @@ -111,7 +111,7 @@ func (b *defaultBuilder) Build(replicas int32) []druidv1alpha1.Condition {
if condition.Status == "" {
condition.Status = druidv1alpha1.ConditionUnknown
}
condition.Reason = ConditionNotChecked
condition.Reason = NotChecked
condition.Message = "etcd cluster has been scaled down"
} else {
condition.Status = res.Status()
Expand Down
3 changes: 2 additions & 1 deletion pkg/health/condition/check_all_members.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,13 @@ import (
"context"

druidv1alpha1 "github.com/gardener/etcd-druid/api/v1alpha1"
"github.com/go-logr/logr"
"sigs.k8s.io/controller-runtime/pkg/client"
)

type allMembersReady struct{}

func (a *allMembersReady) Check(_ context.Context, etcd druidv1alpha1.Etcd) Result {
func (a *allMembersReady) Check(_ context.Context, _ logr.Logger, etcd druidv1alpha1.Etcd) Result {
if len(etcd.Status.Members) == 0 {
return &result{
conType: druidv1alpha1.ConditionTypeAllMembersReady,
Expand Down
15 changes: 10 additions & 5 deletions pkg/health/condition/check_all_members_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,16 @@ import (

druidv1alpha1 "github.com/gardener/etcd-druid/api/v1alpha1"
. "github.com/gardener/etcd-druid/pkg/health/condition"
"github.com/go-logr/logr"
)

var _ = Describe("AllMembersReadyCheck", func() {
Describe("#Check", func() {
var readyMember, notReadyMember druidv1alpha1.EtcdMemberStatus
var (
readyMember, notReadyMember druidv1alpha1.EtcdMemberStatus

logger = logr.Discard()
)

BeforeEach(func() {
readyMember = druidv1alpha1.EtcdMemberStatus{
Expand All @@ -53,7 +58,7 @@ var _ = Describe("AllMembersReadyCheck", func() {
}
check := AllMembersCheck(nil)

result := check.Check(context.TODO(), etcd)
result := check.Check(context.TODO(), logger, etcd)

Expect(result.ConditionType()).To(Equal(druidv1alpha1.ConditionTypeAllMembersReady))
Expect(result.Status()).To(Equal(druidv1alpha1.ConditionTrue))
Expand All @@ -74,7 +79,7 @@ var _ = Describe("AllMembersReadyCheck", func() {
}
check := AllMembersCheck(nil)

result := check.Check(context.TODO(), etcd)
result := check.Check(context.TODO(), logger, etcd)

Expect(result.ConditionType()).To(Equal(druidv1alpha1.ConditionTypeAllMembersReady))
Expect(result.Status()).To(Equal(druidv1alpha1.ConditionFalse))
Expand All @@ -94,7 +99,7 @@ var _ = Describe("AllMembersReadyCheck", func() {
}
check := AllMembersCheck(nil)

result := check.Check(context.TODO(), etcd)
result := check.Check(context.TODO(), logger, etcd)

Expect(result.ConditionType()).To(Equal(druidv1alpha1.ConditionTypeAllMembersReady))
Expect(result.Status()).To(Equal(druidv1alpha1.ConditionFalse))
Expand All @@ -114,7 +119,7 @@ var _ = Describe("AllMembersReadyCheck", func() {
}
check := AllMembersCheck(nil)

result := check.Check(context.TODO(), etcd)
result := check.Check(context.TODO(), logger, etcd)

Expect(result.ConditionType()).To(Equal(druidv1alpha1.ConditionTypeAllMembersReady))
Expect(result.Status()).To(Equal(druidv1alpha1.ConditionUnknown))
Expand Down
59 changes: 36 additions & 23 deletions pkg/health/condition/check_backup_ready.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,10 @@ import (
"time"

druidv1alpha1 "github.com/gardener/etcd-druid/api/v1alpha1"
coordinationv1 "k8s.io/api/coordination/v1"
"github.com/gardener/etcd-druid/pkg/utils"

"github.com/go-logr/logr"
coordinationv1 "k8s.io/api/coordination/v1"
"k8s.io/apimachinery/pkg/types"
"sigs.k8s.io/controller-runtime/pkg/client"
)
Expand All @@ -37,11 +39,11 @@ const (
BackupFailed string = "BackupFailed"
// Unknown is a constant that means that the etcd backup status is currently not known
Unknown string = "Unknown"
// ConditionNotChecked is a constant that means that the etcd backup status has not been updated or rechecked
ConditionNotChecked string = "ConditionNotChecked"
// NotChecked is a constant that means that the etcd backup status has not been updated or rechecked
NotChecked string = "NotChecked"
)

func (a *backupReadyCheck) Check(ctx context.Context, etcd druidv1alpha1.Etcd) Result {
func (a *backupReadyCheck) Check(ctx context.Context, logger logr.Logger, etcd druidv1alpha1.Etcd) Result {
//Default case
result := &result{
conType: druidv1alpha1.ConditionTypeBackupReady,
Expand All @@ -56,52 +58,63 @@ func (a *backupReadyCheck) Check(ctx context.Context, etcd druidv1alpha1.Etcd) R
return nil
}

//Fetch snapshot leases
// Fetch snapshot leases
var (
fullSnapErr, incrSnapErr error
fullSnapLease = &coordinationv1.Lease{}
deltaSnapLease = &coordinationv1.Lease{}
err, fullSnapErr, deltaSnapErr error
fullSnapshotInterval = 1 * time.Hour
fullSnapLease = &coordinationv1.Lease{}
deltaSnapLease = &coordinationv1.Lease{}
)
fullSnapErr = a.cl.Get(ctx, types.NamespacedName{Name: getFullSnapLeaseName(&etcd), Namespace: etcd.ObjectMeta.Namespace}, fullSnapLease)
incrSnapErr = a.cl.Get(ctx, types.NamespacedName{Name: getDeltaSnapLeaseName(&etcd), Namespace: etcd.ObjectMeta.Namespace}, deltaSnapLease)
fullSnapErr = a.cl.Get(ctx, types.NamespacedName{Name: getFullSnapLeaseName(&etcd), Namespace: etcd.Namespace}, fullSnapLease)
deltaSnapErr = a.cl.Get(ctx, types.NamespacedName{Name: getDeltaSnapLeaseName(&etcd), Namespace: etcd.Namespace}, deltaSnapLease)

//Set status to Unknown if errors in fetching snapshot leases or lease never renewed
if fullSnapErr != nil || incrSnapErr != nil || (fullSnapLease.Spec.RenewTime == nil && deltaSnapLease.Spec.RenewTime == nil) {
// Set status to Unknown if errors in fetching snapshot leases or lease never renewed
if fullSnapErr != nil || deltaSnapErr != nil || (fullSnapLease.Spec.RenewTime == nil && deltaSnapLease.Spec.RenewTime == nil) {
return result
}

deltaLeaseRenewTime := deltaSnapLease.Spec.RenewTime
fullLeaseRenewTime := fullSnapLease.Spec.RenewTime
fullLeaseCreateTime := &fullSnapLease.ObjectMeta.CreationTimestamp

// TODO: make etcd.Spec.Backup.FullSnapshotSchedule non-optional, since it is mandatory to
// set the full snapshot schedule, or introduce defaulting webhook to add default value for this field
if etcd.Spec.Backup.FullSnapshotSchedule != nil {
if fullSnapshotInterval, err = utils.ComputeScheduleInterval(*etcd.Spec.Backup.FullSnapshotSchedule); err != nil {
logger.Error(err, "unable to compute full snapshot duration from full snapshot schedule", "fullSnapshotSchedule", *etcd.Spec.Backup.FullSnapshotSchedule)
return result
}
}

if fullLeaseRenewTime == nil && deltaLeaseRenewTime != nil {
// Most probable during reconcile of existing clusters if fresh leases are created
// Treat backup as succeeded if delta snap lease renewal happens in the required time window and full snap lease is not older than 24h.
if time.Since(deltaLeaseRenewTime.Time) < 2*etcd.Spec.Backup.DeltaSnapshotPeriod.Duration && time.Since(fullLeaseCreateTime.Time) < 24*time.Hour {
// Treat backup as succeeded if delta snap lease renewal happens in the required time window
// and full snap lease is not older than full snapshot duration.
if time.Since(deltaLeaseRenewTime.Time) < 2*etcd.Spec.Backup.DeltaSnapshotPeriod.Duration && time.Since(fullLeaseCreateTime.Time) < fullSnapshotInterval {
result.reason = BackupSucceeded
result.message = "Delta snapshot backup succeeded"
result.status = druidv1alpha1.ConditionTrue
return result
}
} else if deltaLeaseRenewTime == nil && fullLeaseRenewTime != nil {
//Most probable during a startup scenario for new clusters
//Special case. Return Unknown condition for some time to allow delta backups to start up
// Most probable during a startup scenario for new clusters
// Special case. Return Unknown condition for some time to allow delta backups to start up
if time.Since(fullLeaseRenewTime.Time) > 5*etcd.Spec.Backup.DeltaSnapshotPeriod.Duration {
result.message = "Periodic delta snapshots not started yet"
return result
}
} else if deltaLeaseRenewTime != nil && fullLeaseRenewTime != nil {
//Both snap leases are maintained. Both are expected to be renewed periodically
if time.Since(deltaLeaseRenewTime.Time) < 2*etcd.Spec.Backup.DeltaSnapshotPeriod.Duration && time.Since(fullLeaseRenewTime.Time) < 24*time.Hour {
// Both snap leases are maintained. Both are expected to be renewed periodically
if time.Since(deltaLeaseRenewTime.Time) < 2*etcd.Spec.Backup.DeltaSnapshotPeriod.Duration && time.Since(fullLeaseRenewTime.Time) < fullSnapshotInterval {
result.reason = BackupSucceeded
result.message = "Snapshot backup succeeded"
result.status = druidv1alpha1.ConditionTrue
return result
}
}

//Cases where snapshot leases are not updated for a long time
//If snapshot leases are present and leases aren't updated, it is safe to assume that backup is not healthy
// Cases where snapshot leases are not updated for a long time
// If snapshot leases are present and leases aren't updated, it is safe to assume that backup is not healthy

if etcd.Status.Conditions != nil {
var prevBackupReadyStatus druidv1alpha1.Condition
Expand All @@ -122,16 +135,16 @@ func (a *backupReadyCheck) Check(ctx context.Context, etcd druidv1alpha1.Etcd) R
}
}

//Transition to "Unknown" state is we cannot prove a "True" state
// Transition to "Unknown" state is we cannot prove a "True" state
return result
}

func getDeltaSnapLeaseName(etcd *druidv1alpha1.Etcd) string {
return fmt.Sprintf("%s-delta-snap", string(etcd.ObjectMeta.Name))
return fmt.Sprintf("%s-delta-snap", etcd.Name)
}

func getFullSnapLeaseName(etcd *druidv1alpha1.Etcd) string {
return fmt.Sprintf("%s-full-snap", string(etcd.ObjectMeta.Name))
return fmt.Sprintf("%s-full-snap", etcd.Name)
}

// BackupReadyCheck returns a check for the "BackupReady" condition.
Expand Down
37 changes: 20 additions & 17 deletions pkg/health/condition/check_backup_ready_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,20 +18,21 @@ import (
"context"
"time"

druidv1alpha1 "github.com/gardener/etcd-druid/api/v1alpha1"
. "github.com/gardener/etcd-druid/pkg/health/condition"
mockclient "github.com/gardener/etcd-druid/pkg/mock/controller-runtime/client"

"github.com/golang/mock/gomock"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"

"k8s.io/apimachinery/pkg/types"
"sigs.k8s.io/controller-runtime/pkg/client"
druidv1alpha1 "github.com/gardener/etcd-druid/api/v1alpha1"
. "github.com/gardener/etcd-druid/pkg/health/condition"
mockclient "github.com/gardener/etcd-druid/pkg/mock/controller-runtime/client"

coordinationv1 "k8s.io/api/coordination/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/utils/pointer"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
)

var _ = Describe("BackupReadyCheck", func() {
Expand All @@ -47,6 +48,7 @@ var _ = Describe("BackupReadyCheck", func() {
},
}
deltaSnapshotDuration = 2 * time.Minute
logger = ctrl.Log.WithName("backup-ready-checker")

etcd = druidv1alpha1.Etcd{
ObjectMeta: v1.ObjectMeta{
Expand All @@ -56,6 +58,7 @@ var _ = Describe("BackupReadyCheck", func() {
Spec: druidv1alpha1.EtcdSpec{
Replicas: 1,
Backup: druidv1alpha1.BackupSpec{
FullSnapshotSchedule: pointer.String("0 0 * * *"), // at 00:00 every day
DeltaSnapshotPeriod: &v1.Duration{
Duration: deltaSnapshotDuration,
},
Expand Down Expand Up @@ -90,15 +93,15 @@ var _ = Describe("BackupReadyCheck", func() {
})

Context("With no snapshot leases present", func() {
It("Should return Unknown rediness", func() {
It("Should return Unknown readiness", func() {
cl.EXPECT().Get(context.TODO(), gomock.Any(), gomock.Any(), gomock.Any()).DoAndReturn(
func(_ context.Context, _ client.ObjectKey, er *coordinationv1.Lease, _ ...client.GetOption) error {
return &noLeaseError
},
).AnyTimes()

check := BackupReadyCheck(cl)
result := check.Check(context.TODO(), etcd)
result := check.Check(context.TODO(), logger, etcd)

Expect(result).ToNot(BeNil())
Expect(result.ConditionType()).To(Equal(druidv1alpha1.ConditionTypeBackupReady))
Expand All @@ -117,15 +120,15 @@ var _ = Describe("BackupReadyCheck", func() {
).AnyTimes()

check := BackupReadyCheck(cl)
result := check.Check(context.TODO(), etcd)
result := check.Check(context.TODO(), logger, etcd)

Expect(result).ToNot(BeNil())
Expect(result.ConditionType()).To(Equal(druidv1alpha1.ConditionTypeBackupReady))
Expect(result.Status()).To(Equal(druidv1alpha1.ConditionTrue))
Expect(result.Reason()).To(Equal(BackupSucceeded))
})

It("Should set status to BackupSucceeded if delta snap lease is recently created and empty full snap lease has been created in the last 24h", func() {
It("Should set status to BackupSucceeded if delta snap lease is recently created and empty full snap lease has been created recently", func() {
cl.EXPECT().Get(context.TODO(), types.NamespacedName{Name: "test-etcd-full-snap", Namespace: "default"}, gomock.Any(), gomock.Any()).DoAndReturn(
func(_ context.Context, _ client.ObjectKey, le *coordinationv1.Lease, _ ...client.GetOption) error {
*le = lease
Expand All @@ -143,7 +146,7 @@ var _ = Describe("BackupReadyCheck", func() {
).AnyTimes()

check := BackupReadyCheck(cl)
result := check.Check(context.TODO(), etcd)
result := check.Check(context.TODO(), logger, etcd)

Expect(result).ToNot(BeNil())
Expect(result.ConditionType()).To(Equal(druidv1alpha1.ConditionTypeBackupReady))
Expand All @@ -169,7 +172,7 @@ var _ = Describe("BackupReadyCheck", func() {
).AnyTimes()

check := BackupReadyCheck(cl)
result := check.Check(context.TODO(), etcd)
result := check.Check(context.TODO(), logger, etcd)

Expect(result).ToNot(BeNil())
Expect(result.ConditionType()).To(Equal(druidv1alpha1.ConditionTypeBackupReady))
Expand Down Expand Up @@ -198,7 +201,7 @@ var _ = Describe("BackupReadyCheck", func() {
}

check := BackupReadyCheck(cl)
result := check.Check(context.TODO(), etcd)
result := check.Check(context.TODO(), logger, etcd)

Expect(result).ToNot(BeNil())
Expect(result.ConditionType()).To(Equal(druidv1alpha1.ConditionTypeBackupReady))
Expand Down Expand Up @@ -226,7 +229,7 @@ var _ = Describe("BackupReadyCheck", func() {
}

check := BackupReadyCheck(cl)
result := check.Check(context.TODO(), etcd)
result := check.Check(context.TODO(), logger, etcd)

Expect(result).ToNot(BeNil())
Expect(result.ConditionType()).To(Equal(druidv1alpha1.ConditionTypeBackupReady))
Expand All @@ -245,7 +248,7 @@ var _ = Describe("BackupReadyCheck", func() {

etcd.Spec.Backup.Store = nil
check := BackupReadyCheck(cl)
result := check.Check(context.TODO(), etcd)
result := check.Check(context.TODO(), logger, etcd)

Expect(result).To(BeNil())
etcd.Spec.Backup.Store = &druidv1alpha1.StoreSpec{
Expand All @@ -265,7 +268,7 @@ var _ = Describe("BackupReadyCheck", func() {

etcd.Spec.Backup.Store.Provider = nil
check := BackupReadyCheck(cl)
result := check.Check(context.TODO(), etcd)
result := check.Check(context.TODO(), logger, etcd)

Expect(result).To(BeNil())
etcd.Spec.Backup.Store.Provider = &storageProvider
Expand Down
3 changes: 2 additions & 1 deletion pkg/health/condition/check_ready.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,13 @@ import (
"context"

druidv1alpha1 "github.com/gardener/etcd-druid/api/v1alpha1"
"github.com/go-logr/logr"
"sigs.k8s.io/controller-runtime/pkg/client"
)

type readyCheck struct{}

func (r *readyCheck) Check(_ context.Context, etcd druidv1alpha1.Etcd) Result {
func (r *readyCheck) Check(_ context.Context, _ logr.Logger, etcd druidv1alpha1.Etcd) Result {

// TODO: remove this case as soon as leases are completely supported by etcd-backup-restore
if len(etcd.Status.Members) == 0 {
Expand Down
Loading

0 comments on commit d682519

Please sign in to comment.