From 1a1406074c8463e20c8df005c947533ac227d053 Mon Sep 17 00:00:00 2001 From: madhav bhargava Date: Mon, 30 Sep 2024 22:30:41 +0530 Subject: [PATCH] fixes #881, #877 Added ability to handle unknown CLI args to allow switching between v0.22 and v0.23 Added use-etcd-wrapper cli arg for etcdbr container --- api/v1alpha1/constants.go | 2 + api/v1alpha1/etcd.go | 8 +- .../component/clientservice/clientservice.go | 6 +- .../clientservice/clientservice_test.go | 8 +- internal/component/configmap/configmap.go | 12 +- .../component/configmap/configmap_test.go | 8 +- internal/component/memberlease/memberlease.go | 6 +- .../component/memberlease/memberlease_test.go | 8 +- internal/component/peerservice/peerservice.go | 6 +- .../component/peerservice/peerservice_test.go | 8 +- .../poddisruptionbudget.go | 6 +- .../poddisruptionbudget_test.go | 8 +- internal/component/role/role.go | 6 +- internal/component/role/role_test.go | 6 +- internal/component/rolebinding/rolebinding.go | 6 +- .../component/rolebinding/rolebinding_test.go | 6 +- .../serviceaccount/serviceaccount.go | 6 +- .../serviceaccount/serviceaccount_test.go | 6 +- .../component/snapshotlease/snapshotlease.go | 10 +- .../snapshotlease/snapshotlease_test.go | 8 +- internal/component/statefulset/builder.go | 144 +++++--- internal/component/statefulset/statefulset.go | 336 ++++++++++-------- internal/component/types.go | 12 + internal/controller/etcd/reconcile_delete.go | 2 +- internal/controller/etcd/reconcile_spec.go | 14 +- .../etcdcopybackupstask/reconciler.go | 6 +- internal/controller/utils/etcdstatus.go | 13 +- internal/errors/errors.go | 18 +- internal/errors/errors_test.go | 35 +- internal/health/condition/check_ready_test.go | 5 + internal/health/etcdmember/check_ready.go | 2 +- internal/images/images.yaml | 4 +- internal/utils/labels.go | 17 + internal/utils/labels_test.go | 131 +++++++ internal/utils/lease.go | 84 +++-- internal/utils/lease_test.go | 101 +++++- internal/utils/statefulset.go | 32 ++ internal/webhook/etcdcomponents/handler.go | 8 +- main.go | 62 ++-- 39 files changed, 769 insertions(+), 397 deletions(-) diff --git a/api/v1alpha1/constants.go b/api/v1alpha1/constants.go index 87f2daf57..91ae7447a 100644 --- a/api/v1alpha1/constants.go +++ b/api/v1alpha1/constants.go @@ -16,6 +16,8 @@ const ( LabelPartOfKey = "app.kubernetes.io/part-of" // LabelComponentKey is a key for a label that sets the component type on resources provisioned for an etcd cluster. LabelComponentKey = "app.kubernetes.io/component" + // LabelEtcdClusterSizeKey is the label key used to store the size of the etcd cluster on the etcd pods. + LabelEtcdClusterSizeKey = "druid.gardener.cloud/etcd-cluster-size" ) // Annotation keys that can be placed on an Etcd custom resource. diff --git a/api/v1alpha1/etcd.go b/api/v1alpha1/etcd.go index 5b3a2cfed..b079f16a4 100644 --- a/api/v1alpha1/etcd.go +++ b/api/v1alpha1/etcd.go @@ -443,6 +443,8 @@ const ( LastOperationStateSucceeded LastOperationState = "Succeeded" // LastOperationStateError indicates that an operation is completed with errors and will be retried. LastOperationStateError LastOperationState = "Error" + // LastOperationStateRequeue indicates that an operation is not completed and either due to an error or unfulfilled conditions will be retried. + LastOperationStateRequeue LastOperationState = "Requeue" ) // LastOperation holds the information on the last operation done on the Etcd resource. 
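// [Illustrative sketch, not part of this patch] The new LabelEtcdClusterSizeKey added in constants.go above is
// stamped onto every etcd member pod: getStatefulSetPodLabels in builder.go (further down in this patch) writes
// strconv.Itoa(int(etcd.Spec.Replicas)) under that key. A consumer (e.g. a webhook or a health check) could read
// the cluster size back from a pod roughly as follows; the helper name clusterSizeFromPod is hypothetical, and the
// assumed imports are fmt, strconv, corev1 "k8s.io/api/core/v1" and druidv1alpha1 "github.com/gardener/etcd-druid/api/v1alpha1".
func clusterSizeFromPod(pod *corev1.Pod) (int, error) {
	sizeStr, ok := pod.Labels[druidv1alpha1.LabelEtcdClusterSizeKey]
	if !ok {
		return 0, fmt.Errorf("pod %s/%s is missing label %s", pod.Namespace, pod.Name, druidv1alpha1.LabelEtcdClusterSizeKey)
	}
	return strconv.Atoi(sizeStr)
}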
@@ -486,7 +488,8 @@ func (e *Etcd) IsReconciliationInProgress() bool { return e.Status.LastOperation != nil && e.Status.LastOperation.Type == LastOperationTypeReconcile && (e.Status.LastOperation.State == LastOperationStateProcessing || - e.Status.LastOperation.State == LastOperationStateError) + e.Status.LastOperation.State == LastOperationStateError || + e.Status.LastOperation.State == LastOperationStateRequeue) } // IsDeletionInProgress returns true if the Etcd resource is currently being reconciled, else returns false. @@ -494,5 +497,6 @@ func (e *Etcd) IsDeletionInProgress() bool { return e.Status.LastOperation != nil && e.Status.LastOperation.Type == LastOperationTypeDelete && (e.Status.LastOperation.State == LastOperationStateProcessing || - e.Status.LastOperation.State == LastOperationStateError) + e.Status.LastOperation.State == LastOperationStateError || + e.Status.LastOperation.State == LastOperationStateRequeue) } diff --git a/internal/component/clientservice/clientservice.go b/internal/component/clientservice/clientservice.go index 88b1002f4..8e740bda5 100644 --- a/internal/component/clientservice/clientservice.go +++ b/internal/component/clientservice/clientservice.go @@ -54,7 +54,7 @@ func (r _resource) GetExistingResourceNames(ctx component.OperatorContext, etcdO } return resourceNames, druiderr.WrapError(err, ErrGetClientService, - "GetExistingResourceNames", + component.OperationGetExistingResourceNames, fmt.Sprintf("Error getting client service: %v for etcd: %v", svcObjectKey, druidv1alpha1.GetNamespaceName(etcdObjMeta))) } if metav1.IsControlledBy(objMeta, &etcdObjMeta) { @@ -77,7 +77,7 @@ func (r _resource) Sync(ctx component.OperatorContext, etcd *druidv1alpha1.Etcd) if err != nil { return druiderr.WrapError(err, ErrSyncClientService, - "Sync", + component.OperationSync, fmt.Sprintf("Error during create or update of client service: %v for etcd: %v", objectKey, druidv1alpha1.GetNamespaceName(etcd.ObjectMeta)), ) } @@ -97,7 +97,7 @@ func (r _resource) TriggerDelete(ctx component.OperatorContext, etcdObjMeta meta return druiderr.WrapError( err, ErrDeleteClientService, - "TriggerDelete", + component.OperationTriggerDelete, "Failed to delete client service", ) } diff --git a/internal/component/clientservice/clientservice_test.go b/internal/component/clientservice/clientservice_test.go index 9fe28facb..001fcdec1 100644 --- a/internal/component/clientservice/clientservice_test.go +++ b/internal/component/clientservice/clientservice_test.go @@ -56,7 +56,7 @@ func TestGetExistingResourceNames(t *testing.T) { expectedErr: &druiderr.DruidError{ Code: ErrGetClientService, Cause: testutils.TestAPIInternalErr, - Operation: "GetExistingResourceNames", + Operation: component.OperationGetExistingResourceNames, }, }, } @@ -106,7 +106,7 @@ func TestSyncWhenNoServiceExists(t *testing.T) { expectedErr: &druiderr.DruidError{ Code: ErrSyncClientService, Cause: testutils.TestAPIInternalErr, - Operation: "Sync", + Operation: component.OperationSync, }, }, } @@ -159,7 +159,7 @@ func TestSyncWhenServiceExists(t *testing.T) { expectedError: &druiderr.DruidError{ Code: ErrSyncClientService, Cause: testutils.TestAPIInternalErr, - Operation: "Sync", + Operation: component.OperationSync, }, }, } @@ -211,7 +211,7 @@ func TestTriggerDelete(t *testing.T) { expectedErr: &druiderr.DruidError{ Code: ErrDeleteClientService, Cause: testutils.TestAPIInternalErr, - Operation: "TriggerDelete", + Operation: component.OperationTriggerDelete, }, deleteErr: testutils.TestAPIInternalErr, }, diff --git 
a/internal/component/configmap/configmap.go b/internal/component/configmap/configmap.go index 68612b848..0eb5e0e46 100644 --- a/internal/component/configmap/configmap.go +++ b/internal/component/configmap/configmap.go @@ -55,7 +55,7 @@ func (r _resource) GetExistingResourceNames(ctx component.OperatorContext, etcdO } return nil, druiderr.WrapError(err, ErrGetConfigMap, - "GetExistingResourceNames", + component.OperationGetExistingResourceNames, fmt.Sprintf("Error getting ConfigMap: %v for etcd: %v", objKey, druidv1alpha1.GetNamespaceName(etcdObjMeta))) } if metav1.IsControlledBy(objMeta, &etcdObjMeta) { @@ -65,7 +65,9 @@ func (r _resource) GetExistingResourceNames(ctx component.OperatorContext, etcdO } // PreSync is a no-op for the configmap component. -func (r _resource) PreSync(_ component.OperatorContext, _ *druidv1alpha1.Etcd) error { return nil } +func (r _resource) PreSync(_ component.OperatorContext, _ *druidv1alpha1.Etcd) error { + return nil +} // Sync creates or updates the configmap for the given Etcd. func (r _resource) Sync(ctx component.OperatorContext, etcd *druidv1alpha1.Etcd) error { @@ -76,14 +78,14 @@ func (r _resource) Sync(ctx component.OperatorContext, etcd *druidv1alpha1.Etcd) if err != nil { return druiderr.WrapError(err, ErrSyncConfigMap, - "Sync", + component.OperationSync, fmt.Sprintf("Error during create or update of configmap for etcd: %v", druidv1alpha1.GetNamespaceName(etcd.ObjectMeta))) } checkSum, err := computeCheckSum(cm) if err != nil { return druiderr.WrapError(err, ErrSyncConfigMap, - "Sync", + component.OperationSync, fmt.Sprintf("Error when computing CheckSum for configmap for etcd: %v", druidv1alpha1.GetNamespaceName(etcd.ObjectMeta))) } ctx.Data[common.CheckSumKeyConfigMap] = checkSum @@ -103,7 +105,7 @@ func (r _resource) TriggerDelete(ctx component.OperatorContext, etcdObjMeta meta return druiderr.WrapError( err, ErrDeleteConfigMap, - "TriggerDelete", + component.OperationTriggerDelete, "Failed to delete configmap", ) } diff --git a/internal/component/configmap/configmap_test.go b/internal/component/configmap/configmap_test.go index f3775c994..3fa558a65 100644 --- a/internal/component/configmap/configmap_test.go +++ b/internal/component/configmap/configmap_test.go @@ -56,7 +56,7 @@ func TestGetExistingResourceNames(t *testing.T) { expectedErr: &druiderr.DruidError{ Code: ErrGetConfigMap, Cause: testutils.TestAPIInternalErr, - Operation: "GetExistingResourceNames", + Operation: component.OperationGetExistingResourceNames, }, }, } @@ -115,7 +115,7 @@ func TestSyncWhenNoConfigMapExists(t *testing.T) { expectedErr: &druiderr.DruidError{ Code: ErrSyncConfigMap, Cause: testutils.TestAPIInternalErr, - Operation: "Sync", + Operation: component.OperationSync, }, }, } @@ -211,7 +211,7 @@ func TestSyncWhenConfigMapExists(t *testing.T) { expectedErr: &druiderr.DruidError{ Code: ErrSyncConfigMap, Cause: testutils.TestAPIInternalErr, - Operation: "Sync", + Operation: component.OperationSync, }, }, } @@ -264,7 +264,7 @@ func TestTriggerDelete(t *testing.T) { expectedErr: &druiderr.DruidError{ Code: ErrDeleteConfigMap, Cause: testutils.TestAPIInternalErr, - Operation: "TriggerDelete", + Operation: component.OperationTriggerDelete, }, }, } diff --git a/internal/component/memberlease/memberlease.go b/internal/component/memberlease/memberlease.go index 532953250..151551cde 100644 --- a/internal/component/memberlease/memberlease.go +++ b/internal/component/memberlease/memberlease.go @@ -53,7 +53,7 @@ func (r _resource) GetExistingResourceNames(ctx 
component.OperatorContext, etcdO ); err != nil { return resourceNames, druiderr.WrapError(err, ErrListMemberLease, - "GetExistingResourceNames", + component.OperationGetExistingResourceNames, fmt.Sprintf("Error listing member leases for etcd: %v", druidv1alpha1.GetNamespaceName(etcdObjMeta))) } for _, lease := range objMetaList.Items { @@ -98,7 +98,7 @@ func (r _resource) doCreateOrUpdate(ctx component.OperatorContext, etcd *druidv1 if err != nil { return druiderr.WrapError(err, ErrSyncMemberLease, - "Sync", + component.OperationSync, fmt.Sprintf("Error syncing member lease: %v for etcd: %v", objKey, druidv1alpha1.GetNamespaceName(etcd.ObjectMeta))) } ctx.Logger.Info("triggered create or update of member lease", "objectKey", objKey, "operationResult", opResult) @@ -114,7 +114,7 @@ func (r _resource) TriggerDelete(ctx component.OperatorContext, etcdObjMeta meta client.MatchingLabels(getSelectorLabelsForAllMemberLeases(etcdObjMeta))); err != nil { return druiderr.WrapError(err, ErrDeleteMemberLease, - "TriggerDelete", + component.OperationTriggerDelete, fmt.Sprintf("Failed to delete member leases for etcd: %v", druidv1alpha1.GetNamespaceName(etcdObjMeta))) } ctx.Logger.Info("deleted", "component", "member-leases") diff --git a/internal/component/memberlease/memberlease_test.go b/internal/component/memberlease/memberlease_test.go index 8dbc22111..2ada1803e 100644 --- a/internal/component/memberlease/memberlease_test.go +++ b/internal/component/memberlease/memberlease_test.go @@ -69,7 +69,7 @@ func TestGetExistingResourceNames(t *testing.T) { expectedErr: &druiderr.DruidError{ Code: ErrListMemberLease, Cause: testutils.TestAPIInternalErr, - Operation: "GetExistingResourceNames", + Operation: component.OperationGetExistingResourceNames, }, }, } @@ -135,7 +135,7 @@ func TestSync(t *testing.T) { expectedErr: &druiderr.DruidError{ Code: ErrSyncMemberLease, Cause: testutils.TestAPIInternalErr, - Operation: "Sync", + Operation: component.OperationSync, }, }, { @@ -146,7 +146,7 @@ func TestSync(t *testing.T) { expectedErr: &druiderr.DruidError{ Code: ErrSyncMemberLease, Cause: testutils.TestAPIInternalErr, - Operation: "Sync", + Operation: component.OperationSync, }, }, } @@ -229,7 +229,7 @@ func TestTriggerDelete(t *testing.T) { expectedErr: &druiderr.DruidError{ Code: ErrDeleteMemberLease, Cause: testutils.TestAPIInternalErr, - Operation: "TriggerDelete", + Operation: component.OperationTriggerDelete, }, }, } diff --git a/internal/component/peerservice/peerservice.go b/internal/component/peerservice/peerservice.go index 9696da8d4..966145e9c 100644 --- a/internal/component/peerservice/peerservice.go +++ b/internal/component/peerservice/peerservice.go @@ -54,7 +54,7 @@ func (r _resource) GetExistingResourceNames(ctx component.OperatorContext, etcdO } return resourceNames, druiderr.WrapError(err, ErrGetPeerService, - "GetExistingResourceNames", + component.OperationGetExistingResourceNames, fmt.Sprintf("Error getting peer service: %s for etcd: %v", svcObjectKey.Name, druidv1alpha1.GetNamespaceName(etcdObjMeta))) } if metav1.IsControlledBy(objMeta, &etcdObjMeta) { @@ -77,7 +77,7 @@ func (r _resource) Sync(ctx component.OperatorContext, etcd *druidv1alpha1.Etcd) if err != nil { return druiderr.WrapError(err, ErrSyncPeerService, - "Sync", + component.OperationSync, fmt.Sprintf("Error during create or update of peer service: %v for etcd: %v", objectKey, druidv1alpha1.GetNamespaceName(etcd.ObjectMeta)), ) } @@ -97,7 +97,7 @@ func (r _resource) TriggerDelete(ctx component.OperatorContext, etcdObjMeta meta 
return druiderr.WrapError( err, ErrDeletePeerService, - "TriggerDelete", + component.OperationTriggerDelete, fmt.Sprintf("Failed to delete peer service: %v for etcd: %v", objectKey, druidv1alpha1.GetNamespaceName(etcdObjMeta)), ) } diff --git a/internal/component/peerservice/peerservice_test.go b/internal/component/peerservice/peerservice_test.go index 44b200b66..fee6cf88b 100644 --- a/internal/component/peerservice/peerservice_test.go +++ b/internal/component/peerservice/peerservice_test.go @@ -55,7 +55,7 @@ func TestGetExistingResourceNames(t *testing.T) { expectedErr: &druiderr.DruidError{ Code: ErrGetPeerService, Cause: testutils.TestAPIInternalErr, - Operation: "GetExistingResourceNames", + Operation: component.OperationGetExistingResourceNames, }, }, } @@ -105,7 +105,7 @@ func TestSyncWhenNoServiceExists(t *testing.T) { expectedError: &druiderr.DruidError{ Code: ErrSyncPeerService, Cause: testutils.TestAPIInternalErr, - Operation: "Sync", + Operation: component.OperationSync, }, }, } @@ -155,7 +155,7 @@ func TestSyncWhenServiceExists(t *testing.T) { expectedError: &druiderr.DruidError{ Code: ErrSyncPeerService, Cause: testutils.TestAPIInternalErr, - Operation: "Sync", + Operation: component.OperationSync, }, }, } @@ -209,7 +209,7 @@ func TestPeerServiceTriggerDelete(t *testing.T) { expectError: &druiderr.DruidError{ Code: ErrDeletePeerService, Cause: deleteInternalErr, - Operation: "TriggerDelete", + Operation: component.OperationTriggerDelete, }, }, } diff --git a/internal/component/poddistruptionbudget/poddisruptionbudget.go b/internal/component/poddistruptionbudget/poddisruptionbudget.go index 074643e7e..14e47bccc 100644 --- a/internal/component/poddistruptionbudget/poddisruptionbudget.go +++ b/internal/component/poddistruptionbudget/poddisruptionbudget.go @@ -60,7 +60,7 @@ func (r _resource) GetExistingResourceNames(ctx component.OperatorContext, etcdO } return resourceNames, druiderr.WrapError(err, ErrGetPodDisruptionBudget, - "GetExistingResourceNames", + component.OperationGetExistingResourceNames, fmt.Sprintf("Error getting PDB: %v for etcd: %v", objectKey, druidv1alpha1.GetNamespaceName(etcdObjMeta))) } if metav1.IsControlledBy(objMeta, &etcdObjMeta) { @@ -83,7 +83,7 @@ func (r _resource) Sync(ctx component.OperatorContext, etcd *druidv1alpha1.Etcd) if err != nil { return druiderr.WrapError(err, ErrSyncPodDisruptionBudget, - "Sync", + component.OperationSync, fmt.Sprintf("Error during create or update of PDB: %v for etcd: %v", objectKey, druidv1alpha1.GetNamespaceName(etcd.ObjectMeta)), ) } @@ -98,7 +98,7 @@ func (r _resource) TriggerDelete(ctx component.OperatorContext, etcdObjMeta meta if err := client.IgnoreNotFound(r.client.Delete(ctx, emptyPodDisruptionBudget(pdbObjectKey))); err != nil { return druiderr.WrapError(err, ErrDeletePodDisruptionBudget, - "TriggerDelete", + component.OperationTriggerDelete, fmt.Sprintf("Failed to delete PDB: %v for etcd: %v", pdbObjectKey, druidv1alpha1.GetNamespaceName(etcdObjMeta))) } ctx.Logger.Info("deleted", "component", "pod-disruption-budget", "objectKey", pdbObjectKey) diff --git a/internal/component/poddistruptionbudget/poddisruptionbudget_test.go b/internal/component/poddistruptionbudget/poddisruptionbudget_test.go index 34aadbbd2..8ae3f6c0b 100644 --- a/internal/component/poddistruptionbudget/poddisruptionbudget_test.go +++ b/internal/component/poddistruptionbudget/poddisruptionbudget_test.go @@ -53,7 +53,7 @@ func TestGetExistingResourceNames(t *testing.T) { expectedErr: &druiderr.DruidError{ Code: ErrGetPodDisruptionBudget, 
Cause: testutils.TestAPIInternalErr, - Operation: "GetExistingResourceNames", + Operation: component.OperationGetExistingResourceNames, }, }, } @@ -107,7 +107,7 @@ func TestSyncWhenNoPDBExists(t *testing.T) { expectedErr: &druiderr.DruidError{ Code: ErrSyncPodDisruptionBudget, Cause: testutils.TestAPIInternalErr, - Operation: "Sync", + Operation: component.OperationSync, }, }, } @@ -161,7 +161,7 @@ func TestSyncWhenPDBExists(t *testing.T) { expectedErr: &druiderr.DruidError{ Code: ErrSyncPodDisruptionBudget, Cause: testutils.TestAPIInternalErr, - Operation: "Sync", + Operation: component.OperationSync, }, }, } @@ -214,7 +214,7 @@ func TestTriggerDelete(t *testing.T) { expectedErr: &druiderr.DruidError{ Code: ErrDeletePodDisruptionBudget, Cause: testutils.TestAPIInternalErr, - Operation: "TriggerDelete", + Operation: component.OperationTriggerDelete, }, }, } diff --git a/internal/component/role/role.go b/internal/component/role/role.go index d040ceb31..a323bb2ff 100644 --- a/internal/component/role/role.go +++ b/internal/component/role/role.go @@ -53,7 +53,7 @@ func (r _resource) GetExistingResourceNames(ctx component.OperatorContext, etcdO } return resourceNames, druiderr.WrapError(err, ErrGetRole, - "GetExistingResourceNames", + component.OperationGetExistingResourceNames, fmt.Sprintf("Error getting role: %v for etcd: %v", objectKey, druidv1alpha1.GetNamespaceName(etcdObjMeta))) } if metav1.IsControlledBy(objMeta, &etcdObjMeta) { @@ -76,7 +76,7 @@ func (r _resource) Sync(ctx component.OperatorContext, etcd *druidv1alpha1.Etcd) if err != nil { return druiderr.WrapError(err, ErrSyncRole, - "Sync", + component.OperationSync, fmt.Sprintf("Error during create or update of role %v for etcd: %v", objectKey, druidv1alpha1.GetNamespaceName(etcd.ObjectMeta)), ) } @@ -95,7 +95,7 @@ func (r _resource) TriggerDelete(ctx component.OperatorContext, etcdObjMeta meta } return druiderr.WrapError(err, ErrDeleteRole, - "TriggerDelete", + component.OperationTriggerDelete, fmt.Sprintf("Failed to delete role: %v for etcd: %v", objectKey, druidv1alpha1.GetNamespaceName(etcdObjMeta)), ) } diff --git a/internal/component/role/role_test.go b/internal/component/role/role_test.go index dc19718df..b1093b1ce 100644 --- a/internal/component/role/role_test.go +++ b/internal/component/role/role_test.go @@ -52,7 +52,7 @@ func TestGetExistingResourceNames(t *testing.T) { expectedErr: &druiderr.DruidError{ Code: ErrGetRole, Cause: testutils.TestAPIInternalErr, - Operation: "GetExistingResourceNames", + Operation: component.OperationGetExistingResourceNames, }, }, } @@ -98,7 +98,7 @@ func TestSync(t *testing.T) { expectedErr: &druiderr.DruidError{ Code: ErrSyncRole, Cause: testutils.TestAPIInternalErr, - Operation: "Sync", + Operation: component.OperationSync, }, }, } @@ -147,7 +147,7 @@ func TestTriggerDelete(t *testing.T) { expectedErr: &druiderr.DruidError{ Code: ErrDeleteRole, Cause: testutils.TestAPIInternalErr, - Operation: "TriggerDelete", + Operation: component.OperationTriggerDelete, }, }, { diff --git a/internal/component/rolebinding/rolebinding.go b/internal/component/rolebinding/rolebinding.go index f59ebc003..64c8e2918 100644 --- a/internal/component/rolebinding/rolebinding.go +++ b/internal/component/rolebinding/rolebinding.go @@ -53,7 +53,7 @@ func (r _resource) GetExistingResourceNames(ctx component.OperatorContext, etcdO } return resourceNames, druiderr.WrapError(err, ErrGetRoleBinding, - "GetExistingResourceNames", + component.OperationGetExistingResourceNames, fmt.Sprintf("Error getting role-binding: %v for 
etcd: %v", objectKey, druidv1alpha1.GetNamespaceName(etcdObjMeta))) } if metav1.IsControlledBy(objMeta, &etcdObjMeta) { @@ -76,7 +76,7 @@ func (r _resource) Sync(ctx component.OperatorContext, etcd *druidv1alpha1.Etcd) if err != nil { return druiderr.WrapError(err, ErrSyncRoleBinding, - "Sync", + component.OperationSync, fmt.Sprintf("Error during create or update of role-binding %v for etcd: %v", objectKey, druidv1alpha1.GetNamespaceName(etcd.ObjectMeta)), ) } @@ -95,7 +95,7 @@ func (r _resource) TriggerDelete(ctx component.OperatorContext, etcdObjMeta meta } return druiderr.WrapError(err, ErrDeleteRoleBinding, - "TriggerDelete", + component.OperationTriggerDelete, fmt.Sprintf("Failed to delete role-binding: %v for etcd: %v", objectKey, druidv1alpha1.GetNamespaceName(etcdObjMeta)), ) } diff --git a/internal/component/rolebinding/rolebinding_test.go b/internal/component/rolebinding/rolebinding_test.go index b6c9e781b..a860c0c77 100644 --- a/internal/component/rolebinding/rolebinding_test.go +++ b/internal/component/rolebinding/rolebinding_test.go @@ -52,7 +52,7 @@ func TestGetExistingResourceNames(t *testing.T) { expectedErr: &druiderr.DruidError{ Code: ErrGetRoleBinding, Cause: testutils.TestAPIInternalErr, - Operation: "GetExistingResourceNames", + Operation: component.OperationGetExistingResourceNames, }, }, } @@ -98,7 +98,7 @@ func TestSync(t *testing.T) { expectedErr: &druiderr.DruidError{ Code: ErrSyncRoleBinding, Cause: testutils.TestAPIInternalErr, - Operation: "Sync", + Operation: component.OperationSync, }, }, } @@ -147,7 +147,7 @@ func TestTriggerDelete(t *testing.T) { expectedErr: &druiderr.DruidError{ Code: ErrDeleteRoleBinding, Cause: testutils.TestAPIInternalErr, - Operation: "TriggerDelete", + Operation: component.OperationTriggerDelete, }, }, { diff --git a/internal/component/serviceaccount/serviceaccount.go b/internal/component/serviceaccount/serviceaccount.go index c67b83e88..dc94b4dec 100644 --- a/internal/component/serviceaccount/serviceaccount.go +++ b/internal/component/serviceaccount/serviceaccount.go @@ -55,7 +55,7 @@ func (r _resource) GetExistingResourceNames(ctx component.OperatorContext, etcdO } return resourceNames, druiderr.WrapError(err, ErrGetServiceAccount, - "GetExistingResourceNames", + component.OperationGetExistingResourceNames, fmt.Sprintf("Error getting service account: %v for etcd: %v", objectKey, druidv1alpha1.GetNamespaceName(etcdObjMeta))) } if metav1.IsControlledBy(objMeta, &etcdObjMeta) { @@ -78,7 +78,7 @@ func (r _resource) Sync(ctx component.OperatorContext, etcd *druidv1alpha1.Etcd) if err != nil { return druiderr.WrapError(err, ErrSyncServiceAccount, - "Sync", + component.OperationSync, fmt.Sprintf("Error during create or update of service account: %v for etcd: %v", objectKey, druidv1alpha1.GetNamespaceName(etcd.ObjectMeta)), ) } @@ -97,7 +97,7 @@ func (r _resource) TriggerDelete(ctx component.OperatorContext, etcdObjMeta meta } return druiderr.WrapError(err, ErrDeleteServiceAccount, - "TriggerDelete", + component.OperationTriggerDelete, fmt.Sprintf("Failed to delete service account: %v for etcd: %v", objectKey, druidv1alpha1.GetNamespaceName(etcdObjMeta))) } ctx.Logger.Info("deleted", "component", "service-account", "objectKey", objectKey) diff --git a/internal/component/serviceaccount/serviceaccount_test.go b/internal/component/serviceaccount/serviceaccount_test.go index ba8868118..b8cb5e522 100644 --- a/internal/component/serviceaccount/serviceaccount_test.go +++ b/internal/component/serviceaccount/serviceaccount_test.go @@ -51,7 +51,7 @@ 
func TestGetExistingResourceNames(t *testing.T) { expectedErr: &druiderr.DruidError{ Code: ErrGetServiceAccount, Cause: testutils.TestAPIInternalErr, - Operation: "GetExistingResourceNames", + Operation: component.OperationGetExistingResourceNames, }, }, } @@ -106,7 +106,7 @@ func TestSync(t *testing.T) { expectedErr: &druiderr.DruidError{ Code: ErrSyncServiceAccount, Cause: testutils.TestAPIInternalErr, - Operation: "Sync", + Operation: component.OperationSync, }, }, } @@ -157,7 +157,7 @@ func TestTriggerDelete(t *testing.T) { expectedErr: &druiderr.DruidError{ Code: ErrDeleteServiceAccount, Cause: testutils.TestAPIInternalErr, - Operation: "TriggerDelete", + Operation: component.OperationTriggerDelete, }, }, } diff --git a/internal/component/snapshotlease/snapshotlease.go b/internal/component/snapshotlease/snapshotlease.go index 11ea29f4c..253ace68e 100644 --- a/internal/component/snapshotlease/snapshotlease.go +++ b/internal/component/snapshotlease/snapshotlease.go @@ -54,7 +54,7 @@ func (r _resource) GetExistingResourceNames(ctx component.OperatorContext, etcdO if err != nil { return resourceNames, druiderr.WrapError(err, ErrGetSnapshotLease, - "GetExistingResourceNames", + component.OperationGetExistingResourceNames, fmt.Sprintf("Error getting delta snapshot lease: %v for etcd: %v", deltaSnapshotObjectKey, druidv1alpha1.GetNamespaceName(etcdObjMeta)), ) } @@ -66,7 +66,7 @@ func (r _resource) GetExistingResourceNames(ctx component.OperatorContext, etcdO if err != nil { return resourceNames, druiderr.WrapError(err, ErrGetSnapshotLease, - "GetExistingResourceNames", + component.OperationGetExistingResourceNames, fmt.Sprintf("Error getting full snapshot lease: %v for etcd: %v", fullSnapshotObjectKey, druidv1alpha1.GetNamespaceName(etcdObjMeta)), ) } @@ -87,7 +87,7 @@ func (r _resource) Sync(ctx component.OperatorContext, etcd *druidv1alpha1.Etcd) return r.deleteAllSnapshotLeases(ctx, etcd.ObjectMeta, func(err error) error { return druiderr.WrapError(err, ErrSyncSnapshotLease, - "Sync", + component.OperationSync, fmt.Sprintf("Failed to delete existing snapshot leases (due to backup being disabled for etcd) due to reason: %v", druidv1alpha1.GetNamespaceName(etcd.ObjectMeta))) }) } @@ -112,7 +112,7 @@ func (r _resource) TriggerDelete(ctx component.OperatorContext, etcdObjMeta meta if err := r.deleteAllSnapshotLeases(ctx, etcdObjMeta, func(err error) error { return druiderr.WrapError(err, ErrDeleteSnapshotLease, - "TriggerDelete", + component.OperationTriggerDelete, fmt.Sprintf("Failed to delete snapshot leases for etcd: %v", druidv1alpha1.GetNamespaceName(etcdObjMeta))) }); err != nil { return err @@ -152,7 +152,7 @@ func (r _resource) doCreateOrUpdate(ctx component.OperatorContext, etcd *druidv1 if err != nil { return druiderr.WrapError(err, ErrSyncSnapshotLease, - "Sync", + component.OperationSync, fmt.Sprintf("Error syncing snapshot lease: %v for etcd: %v", leaseObjectKey, druidv1alpha1.GetNamespaceName(etcd.ObjectMeta))) } ctx.Logger.Info("triggered create or update of snapshot lease", "objectKey", leaseObjectKey, "operationResult", opResult) diff --git a/internal/component/snapshotlease/snapshotlease_test.go b/internal/component/snapshotlease/snapshotlease_test.go index 8f87c1f85..d10fc9a92 100644 --- a/internal/component/snapshotlease/snapshotlease_test.go +++ b/internal/component/snapshotlease/snapshotlease_test.go @@ -62,7 +62,7 @@ func TestGetExistingResourceNames(t *testing.T) { expectedErr: &druiderr.DruidError{ Code: ErrGetSnapshotLease, Cause: testutils.TestAPIInternalErr, - 
Operation: "GetExistingResourceNames", + Operation: component.OperationGetExistingResourceNames, }, }, } @@ -111,7 +111,7 @@ func TestSyncWhenBackupIsEnabled(t *testing.T) { expectedErr: &druiderr.DruidError{ Code: ErrSyncSnapshotLease, Cause: testutils.TestAPIInternalErr, - Operation: "Sync", + Operation: component.OperationSync, }, }, } @@ -156,7 +156,7 @@ func TestSyncWhenBackupHasBeenDisabled(t *testing.T) { expectedErr: &druiderr.DruidError{ Code: ErrSyncSnapshotLease, Cause: testutils.TestAPIInternalErr, - Operation: "Sync", + Operation: component.OperationSync, }, }, } @@ -217,7 +217,7 @@ func TestTriggerDelete(t *testing.T) { expectedErr: &druiderr.DruidError{ Code: ErrDeleteSnapshotLease, Cause: testutils.TestAPIInternalErr, - Operation: "TriggerDelete", + Operation: component.OperationTriggerDelete, }, }, } diff --git a/internal/component/statefulset/builder.go b/internal/component/statefulset/builder.go index 7a8902e31..acadfaaf6 100644 --- a/internal/component/statefulset/builder.go +++ b/internal/component/statefulset/builder.go @@ -29,7 +29,6 @@ import ( // defaults // ----------------------------------------------------------------------------------------- const ( - defaultWrapperPort int = 9095 defaultMaxBackupsLimitBasedGC int32 = 7 defaultQuota int64 = 8 * 1024 * 1024 * 1024 // 8Gi defaultSnapshotMemoryLimit int64 = 100 * 1024 * 1024 // 100Mi @@ -70,6 +69,10 @@ type stsBuilder struct { clientPort int32 serverPort int32 backupPort int32 + // skipSetOrUpdateForbiddenFields if its true then it will set/update values to fields which are forbidden to be updated for an existing StatefulSet. + // Updates to statefulset spec for fields other than 'replicas', 'ordinals', 'template', 'updateStrategy', 'persistentVolumeClaimRetentionPolicy' and 'minReadySeconds' are forbidden. + // Only for a new StatefulSet should this be set to true. + skipSetOrUpdateForbiddenFields bool } func newStsBuilder(client client.Client, @@ -78,6 +81,7 @@ func newStsBuilder(client client.Client, replicas int32, useEtcdWrapper bool, imageVector imagevector.ImageVector, + skipSetOrUpdateForbiddenFields bool, sts *appsv1.StatefulSet) (*stsBuilder, error) { etcdImage, etcdBackupRestoreImage, initContainerImage, err := utils.GetEtcdImages(etcd, imageVector, useEtcdWrapper) if err != nil { @@ -88,26 +92,30 @@ func newStsBuilder(client client.Client, return nil, err } return &stsBuilder{ - client: client, - logger: logger, - etcd: etcd, - replicas: replicas, - useEtcdWrapper: useEtcdWrapper, - provider: provider, - etcdImage: etcdImage, - etcdBackupRestoreImage: etcdBackupRestoreImage, - initContainerImage: initContainerImage, - sts: sts, - clientPort: ptr.Deref(etcd.Spec.Etcd.ClientPort, common.DefaultPortEtcdClient), - serverPort: ptr.Deref(etcd.Spec.Etcd.ServerPort, common.DefaultPortEtcdPeer), - backupPort: ptr.Deref(etcd.Spec.Backup.Port, common.DefaultPortEtcdBackupRestore), + client: client, + logger: logger, + etcd: etcd, + replicas: replicas, + useEtcdWrapper: useEtcdWrapper, + provider: provider, + etcdImage: etcdImage, + etcdBackupRestoreImage: etcdBackupRestoreImage, + initContainerImage: initContainerImage, + sts: sts, + clientPort: ptr.Deref(etcd.Spec.Etcd.ClientPort, common.DefaultPortEtcdClient), + serverPort: ptr.Deref(etcd.Spec.Etcd.ServerPort, common.DefaultPortEtcdPeer), + backupPort: ptr.Deref(etcd.Spec.Backup.Port, common.DefaultPortEtcdBackupRestore), + skipSetOrUpdateForbiddenFields: skipSetOrUpdateForbiddenFields, }, nil } // Build builds the StatefulSet for the given Etcd. 
func (b *stsBuilder) Build(ctx component.OperatorContext) error { b.createStatefulSetObjectMeta() - return b.createStatefulSetSpec(ctx) + if err := b.createStatefulSetSpec(ctx); err != nil { + return fmt.Errorf("[stsBuilder]: error in creating StatefulSet spec: %w", err) + } + return nil } func (b *stsBuilder) createStatefulSetObjectMeta() { @@ -128,50 +136,74 @@ func (b *stsBuilder) getStatefulSetLabels() map[string]string { } func (b *stsBuilder) createStatefulSetSpec(ctx component.OperatorContext) error { - podVolumes, err := b.getPodVolumes(ctx) + err := b.createPodTemplateSpec(ctx) + b.sts.Spec.Replicas = ptr.To(b.replicas) + b.sts.Spec.UpdateStrategy = defaultUpdateStrategy if err != nil { return err } + if !b.skipSetOrUpdateForbiddenFields { + b.sts.Spec.Selector = &metav1.LabelSelector{ + MatchLabels: druidv1alpha1.GetDefaultLabels(b.etcd.ObjectMeta), + } + b.sts.Spec.PodManagementPolicy = defaultPodManagementPolicy + b.sts.Spec.ServiceName = druidv1alpha1.GetPeerServiceName(b.etcd.ObjectMeta) + b.sts.Spec.VolumeClaimTemplates = b.getVolumeClaimTemplates() + } + return nil +} +func (b *stsBuilder) createPodTemplateSpec(ctx component.OperatorContext) error { + podVolumes, err := b.getPodVolumes(ctx) + if err != nil { + return err + } backupRestoreContainer, err := b.getBackupRestoreContainer() if err != nil { return err } - - b.sts.Spec = appsv1.StatefulSetSpec{ - Replicas: ptr.To(b.replicas), - Selector: &metav1.LabelSelector{ - MatchLabels: druidv1alpha1.GetDefaultLabels(b.etcd.ObjectMeta), - }, - PodManagementPolicy: defaultPodManagementPolicy, - UpdateStrategy: defaultUpdateStrategy, - VolumeClaimTemplates: b.getVolumeClaimTemplates(), - ServiceName: druidv1alpha1.GetPeerServiceName(b.etcd.ObjectMeta), - Template: corev1.PodTemplateSpec{ - ObjectMeta: metav1.ObjectMeta{ - Labels: utils.MergeMaps(b.etcd.Spec.Labels, b.getStatefulSetLabels()), - Annotations: b.getPodTemplateAnnotations(ctx), - }, - Spec: corev1.PodSpec{ - HostAliases: b.getHostAliases(), - ServiceAccountName: druidv1alpha1.GetServiceAccountName(b.etcd.ObjectMeta), - ShareProcessNamespace: ptr.To(true), - InitContainers: b.getPodInitContainers(), - Containers: []corev1.Container{ - b.getEtcdContainer(), - backupRestoreContainer, - }, - SecurityContext: b.getPodSecurityContext(), - Affinity: b.etcd.Spec.SchedulingConstraints.Affinity, - TopologySpreadConstraints: b.etcd.Spec.SchedulingConstraints.TopologySpreadConstraints, - Volumes: podVolumes, - PriorityClassName: ptr.Deref(b.etcd.Spec.PriorityClassName, ""), + podTemplateSpec := corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + HostAliases: b.getHostAliases(), + ServiceAccountName: druidv1alpha1.GetServiceAccountName(b.etcd.ObjectMeta), + ShareProcessNamespace: ptr.To(true), + InitContainers: b.getPodInitContainers(), + Containers: []corev1.Container{ + b.getEtcdContainer(), + backupRestoreContainer, }, + SecurityContext: b.getPodSecurityContext(), + Affinity: b.etcd.Spec.SchedulingConstraints.Affinity, + TopologySpreadConstraints: b.etcd.Spec.SchedulingConstraints.TopologySpreadConstraints, + Volumes: podVolumes, + PriorityClassName: ptr.Deref(b.etcd.Spec.PriorityClassName, ""), }, } + podTemplateLabels := b.getStatefulSetPodLabels() + selectorMatchesLabels, err := utils.DoesLabelSelectorMatchLabels(b.sts.Spec.Selector, podTemplateLabels) + if err != nil { + return err + } + if !selectorMatchesLabels { + podTemplateLabels = utils.MergeMaps(podTemplateLabels, b.sts.Spec.Template.Labels) + } + podTemplateSpec.ObjectMeta = metav1.ObjectMeta{ + Labels: 
podTemplateLabels, + Annotations: b.getPodTemplateAnnotations(ctx), + } + b.sts.Spec.Template = podTemplateSpec return nil } +func (b *stsBuilder) getStatefulSetPodLabels() map[string]string { + return utils.MergeMaps( + b.etcd.Spec.Labels, + b.getStatefulSetLabels(), + map[string]string{ + druidv1alpha1.LabelEtcdClusterSizeKey: strconv.Itoa(int(b.etcd.Spec.Replicas)), + }) +} + func (b *stsBuilder) getHostAliases() []corev1.HostAlias { return []corev1.HostAlias{ { @@ -261,7 +293,7 @@ func (b *stsBuilder) getPodInitContainers() []corev1.Container { func (b *stsBuilder) getEtcdContainerVolumeMounts() []corev1.VolumeMount { etcdVolumeMounts := make([]corev1.VolumeMount, 0, 7) etcdVolumeMounts = append(etcdVolumeMounts, b.getEtcdDataVolumeMount()) - etcdVolumeMounts = append(etcdVolumeMounts, b.getEtcdContainerSecretVolumeMounts()...) + etcdVolumeMounts = append(etcdVolumeMounts, getEtcdContainerSecretVolumeMounts(b.etcd)...) return etcdVolumeMounts } @@ -274,7 +306,7 @@ func (b *stsBuilder) getBackupRestoreContainerVolumeMounts() []corev1.VolumeMoun MountPath: etcdConfigFileMountPath, }, ) - brVolumeMounts = append(brVolumeMounts, b.getBackupRestoreContainerSecretVolumeMounts()...) + brVolumeMounts = append(brVolumeMounts, getBackupRestoreContainerSecretVolumeMounts(b.etcd)...) if b.etcd.IsBackupStoreEnabled() { etcdBackupVolumeMount := b.getEtcdBackupVolumeMount() @@ -285,9 +317,9 @@ func (b *stsBuilder) getBackupRestoreContainerVolumeMounts() []corev1.VolumeMoun return brVolumeMounts } -func (b *stsBuilder) getBackupRestoreContainerSecretVolumeMounts() []corev1.VolumeMount { +func getBackupRestoreContainerSecretVolumeMounts(etcd *druidv1alpha1.Etcd) []corev1.VolumeMount { secretVolumeMounts := make([]corev1.VolumeMount, 0, 3) - if b.etcd.Spec.Backup.TLS != nil { + if etcd.Spec.Backup.TLS != nil { secretVolumeMounts = append(secretVolumeMounts, corev1.VolumeMount{ Name: common.VolumeNameBackupRestoreServerTLS, @@ -295,7 +327,7 @@ func (b *stsBuilder) getBackupRestoreContainerSecretVolumeMounts() []corev1.Volu }, ) } - if b.etcd.Spec.Etcd.ClientUrlTLS != nil { + if etcd.Spec.Etcd.ClientUrlTLS != nil { secretVolumeMounts = append(secretVolumeMounts, corev1.VolumeMount{ Name: common.VolumeNameEtcdCA, @@ -466,6 +498,10 @@ func (b *stsBuilder) getBackupRestoreContainerCommandArgs() []string { commandArgs = append(commandArgs, fmt.Sprintf("--snapstore-temp-directory=%s/temp", common.VolumeMountPathEtcdData)) commandArgs = append(commandArgs, fmt.Sprintf("--etcd-connection-timeout=%s", defaultEtcdConnectionTimeout)) commandArgs = append(commandArgs, "--enable-member-lease-renewal=true") + // Enable/Disable use Etcd Wrapper in BackupRestore container. Once `use-etcd-wrapper` feature-gate is GA then this value will always be true. 
+ if b.useEtcdWrapper { + commandArgs = append(commandArgs, "--use-etcd-wrapper=true") + } var quota = defaultQuota if b.etcd.Spec.Etcd.Quota != nil { @@ -657,9 +693,9 @@ func (b *stsBuilder) getPodSecurityContext() *corev1.PodSecurityContext { } } -func (b *stsBuilder) getEtcdContainerSecretVolumeMounts() []corev1.VolumeMount { +func getEtcdContainerSecretVolumeMounts(etcd *druidv1alpha1.Etcd) []corev1.VolumeMount { secretVolumeMounts := make([]corev1.VolumeMount, 0, 6) - if b.etcd.Spec.Etcd.ClientUrlTLS != nil { + if etcd.Spec.Etcd.ClientUrlTLS != nil { secretVolumeMounts = append(secretVolumeMounts, corev1.VolumeMount{ Name: common.VolumeNameEtcdCA, @@ -675,7 +711,7 @@ func (b *stsBuilder) getEtcdContainerSecretVolumeMounts() []corev1.VolumeMount { }, ) } - if b.etcd.Spec.Etcd.PeerUrlTLS != nil { + if etcd.Spec.Etcd.PeerUrlTLS != nil { secretVolumeMounts = append(secretVolumeMounts, corev1.VolumeMount{ Name: common.VolumeNameEtcdPeerCA, @@ -687,7 +723,7 @@ func (b *stsBuilder) getEtcdContainerSecretVolumeMounts() []corev1.VolumeMount { }, ) } - if b.etcd.Spec.Backup.TLS != nil { + if etcd.Spec.Backup.TLS != nil { secretVolumeMounts = append(secretVolumeMounts, corev1.VolumeMount{ Name: common.VolumeNameBackupRestoreCA, diff --git a/internal/component/statefulset/statefulset.go b/internal/component/statefulset/statefulset.go index 96eb35f5f..fc575ff98 100644 --- a/internal/component/statefulset/statefulset.go +++ b/internal/component/statefulset/statefulset.go @@ -6,6 +6,7 @@ package statefulset import ( "fmt" + "slices" druidv1alpha1 "github.com/gardener/etcd-druid/api/v1alpha1" "github.com/gardener/etcd-druid/internal/common" @@ -16,6 +17,7 @@ import ( "github.com/gardener/gardener/pkg/controllerutils" "github.com/gardener/gardener/pkg/utils/imagevector" + "github.com/go-logr/logr" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" @@ -28,8 +30,6 @@ import ( const ( // ErrGetStatefulSet indicates an error in getting the statefulset resource. ErrGetStatefulSet druidv1alpha1.ErrorCode = "ERR_GET_STATEFULSET" - // ErrPreSyncStatefulSet indicates an error in pre-sync operations for the statefulset resource. - ErrPreSyncStatefulSet druidv1alpha1.ErrorCode = "ERR_PRESYNC_STATEFULSET" // ErrSyncStatefulSet indicates an error in syncing the statefulset resource. ErrSyncStatefulSet druidv1alpha1.ErrorCode = "ERR_SYNC_STATEFULSET" // ErrDeleteStatefulSet indicates an error in deleting the statefulset resource. @@ -40,6 +40,7 @@ type _resource struct { client client.Client imageVector imagevector.ImageVector useEtcdWrapper bool + logger logr.Logger } // New returns a new statefulset component operator. @@ -63,7 +64,7 @@ func (r _resource) GetExistingResourceNames(ctx component.OperatorContext, etcdO } return nil, druiderr.WrapError(err, ErrGetStatefulSet, - "GetExistingResourceNames", + component.OperationGetExistingResourceNames, fmt.Sprintf("Error getting StatefulSet: %v for etcd: %v", objectKey, druidv1alpha1.GetNamespaceName(etcdObjMeta))) } if metav1.IsControlledBy(objMeta, &etcdObjMeta) { @@ -72,90 +73,55 @@ func (r _resource) GetExistingResourceNames(ctx component.OperatorContext, etcdO return resourceNames, nil } -// PreSync recreates the statefulset for the given Etcd, if label selector for the existing statefulset -// is different from the label selector required to be applied on it. This is because the statefulset's -// spec.selector field is immutable and cannot be updated on the existing statefulset. 
-func (r _resource) PreSync(ctx component.OperatorContext, etcd *druidv1alpha1.Etcd) error { - ctx.Logger.Info("Running pre-sync for StatefulSet", "name", druidv1alpha1.GetStatefulSetName(etcd.ObjectMeta), "namespace", druidv1alpha1.GetNamespaceName(etcd.ObjectMeta)) - - sts, err := r.getExistingStatefulSet(ctx, etcd.ObjectMeta) - if err != nil { - return druiderr.WrapError(err, - ErrPreSyncStatefulSet, - "PreSync", - fmt.Sprintf("Error getting StatefulSet: %v for etcd: %v", getObjectKey(etcd.ObjectMeta), druidv1alpha1.GetNamespaceName(etcd.ObjectMeta))) - } - // if no sts exists, this method is a no-op. - if sts == nil { - return nil - } - - // patch sts with new pod labels. - if err = r.checkAndPatchStsPodLabelsOnMismatch(ctx, etcd, sts); err != nil { - return druiderr.WrapError(err, - ErrPreSyncStatefulSet, - "PreSync", - fmt.Sprintf("Error checking and patching StatefulSet pods with new labels for etcd: %v", druidv1alpha1.GetNamespaceName(etcd.ObjectMeta))) - } - - // check if pods have been updated with new labels. - podsHaveDesiredLabels, err := r.doStatefulSetPodsHaveDesiredLabels(ctx, etcd, sts) - if err != nil { - return druiderr.WrapError(err, - ErrPreSyncStatefulSet, - "PreSync", - fmt.Sprintf("Error checking if StatefulSet pods are updated for etcd: %v", druidv1alpha1.GetNamespaceName(etcd.ObjectMeta))) - } - if !podsHaveDesiredLabels { - return druiderr.New(druiderr.ErrRequeueAfter, - "PreSync", - fmt.Sprintf("StatefulSet pods are not yet updated with new labels, for StatefulSet: %v for etcd: %v", getObjectKey(sts.ObjectMeta), druidv1alpha1.GetNamespaceName(etcd.ObjectMeta)), - ) - } else { - ctx.Logger.Info("StatefulSet pods have all the desired labels", "objectKey", getObjectKey(etcd.ObjectMeta)) - } - - // if sts label selector needs to be changed, then delete the statefulset, but keeping the pods intact. - if err = r.checkAndDeleteStsWithOrphansOnLabelSelectorMismatch(ctx, etcd, sts); err != nil { - return druiderr.WrapError(err, - ErrPreSyncStatefulSet, - "PreSync", - fmt.Sprintf("Error checking and deleting StatefulSet with orphans for etcd: %v", druidv1alpha1.GetNamespaceName(etcd.ObjectMeta))) - } - +// PreSync is a no-op for the statefulset component. +func (r _resource) PreSync(_ component.OperatorContext, _ *druidv1alpha1.Etcd) error { return nil } // Sync creates or updates the statefulset for the given Etcd. func (r _resource) Sync(ctx component.OperatorContext, etcd *druidv1alpha1.Etcd) error { - var ( - existingSTS *appsv1.StatefulSet - err error - ) + r.logger = ctx.Logger.WithValues("component", component.StatefulSetKind, "operation", component.OperationSync) objectKey := getObjectKey(etcd.ObjectMeta) - if existingSTS, err = r.getExistingStatefulSet(ctx, etcd.ObjectMeta); err != nil { + existingSTS, err := r.getExistingStatefulSet(ctx, etcd.ObjectMeta) + if err != nil { return druiderr.WrapError(err, ErrSyncStatefulSet, - "Sync", + component.OperationSync, fmt.Sprintf("Error getting StatefulSet: %v for etcd: %v", objectKey, druidv1alpha1.GetNamespaceName(etcd.ObjectMeta))) } // There is no StatefulSet present. Create one. if existingSTS == nil { - return r.createOrPatch(ctx, etcd) + // Check etcd observed generation to determine if the etcd cluster is new or not. 
+ if etcd.Status.ObservedGeneration == nil { + r.logger.Info("ObservedGeneration has not yet been set, triggering the creation of StatefulSet assuming a new etcd cluster") + return r.createOrPatch(ctx, etcd) + } + // If the Etcd resource has previously been reconciled successfully (indicated by a non-nil etcd.Status.ObservedGeneration) + // then check if the STS is missing due to it being orphan deleted in the previous reconcile run. If so, recreate the STS. + if err = r.checkAndRecreateOrphanDeletedSts(ctx, etcd); err != nil { + return err + } } - // StatefulSet exists, check if TLS has been enabled for peer communication, if yes then it is currently a multistep - // process to ensure that all members are updated and establish peer TLS communication. - if err = r.handlePeerTLSChanges(ctx, etcd, existingSTS); err != nil { - return err + if existingSTS != nil { + if err = r.handleTLSChanges(ctx, etcd, existingSTS); err != nil { + return err + } + if !labels.Equals(existingSTS.Spec.Selector.MatchLabels, druidv1alpha1.GetDefaultLabels(etcd.ObjectMeta)) { + if err = r.handleStsLabelSelectorOnMismatch(ctx, etcd, existingSTS); err != nil { + return err + } + } } + return r.createOrPatch(ctx, etcd) } // TriggerDelete triggers the deletion of the statefulset for the given Etcd. func (r _resource) TriggerDelete(ctx component.OperatorContext, etcdObjMeta metav1.ObjectMeta) error { + r.logger = ctx.Logger.WithValues("component", component.StatefulSetKind, "operation", component.OperationTriggerDelete) objectKey := getObjectKey(etcdObjMeta) - ctx.Logger.Info("Triggering deletion of StatefulSet", "objectKey", objectKey) + r.logger.Info("Triggering deletion of StatefulSet") if err := r.client.Delete(ctx, emptyStatefulSet(etcdObjMeta)); err != nil { if apierrors.IsNotFound(err) { ctx.Logger.Info("No StatefulSet found, Deletion is a No-Op", "objectKey", objectKey.Name) @@ -163,13 +129,84 @@ func (r _resource) TriggerDelete(ctx component.OperatorContext, etcdObjMeta meta } return druiderr.WrapError(err, ErrDeleteStatefulSet, - "TriggerDelete", + component.OperationTriggerDelete, fmt.Sprintf("Failed to delete StatefulSet: %v for etcd %v", objectKey, druidv1alpha1.GetNamespaceName(etcdObjMeta))) } - ctx.Logger.Info("deleted", "component", "statefulset", "objectKey", objectKey) + r.logger.Info("deletion successful") return nil } +func (r _resource) handleStsLabelSelectorOnMismatch(ctx component.OperatorContext, etcd *druidv1alpha1.Etcd, sts *appsv1.StatefulSet) error { + r.logger.Info("Orphan deleting StatefulSet for recreation later, as label selector has changed", "oldSelector.MatchLabels", sts.Spec.Selector.MatchLabels, "newSelector.MatchLabels", druidv1alpha1.GetDefaultLabels(etcd.ObjectMeta)) + if err := r.client.Delete(ctx, sts, client.PropagationPolicy(metav1.DeletePropagationOrphan)); err != nil { + return druiderr.WrapError(err, + ErrSyncStatefulSet, + component.OperationSync, + fmt.Sprintf("Error orphan deleting StatefulSet: %v for etcd: %v", client.ObjectKeyFromObject(sts), client.ObjectKeyFromObject(etcd))) + } + // Requeue the reconcile request to ensure that the STS is orphan deleted.
+ return druiderr.New( + druiderr.ErrRequeueAfter, + component.OperationSync, + fmt.Sprintf("StatefulSet has not been orphan deleted: %v for etcd: %v, requeuing reconcile request", client.ObjectKeyFromObject(sts), client.ObjectKeyFromObject(etcd))) +} + +func (r _resource) checkAndRecreateOrphanDeletedSts(ctx component.OperatorContext, etcd *druidv1alpha1.Etcd) error { + numOrphanedPods, err := r.determineNumOrphanedPods(ctx, etcd) + if err != nil { + return err + } + if numOrphanedPods > 0 { + r.logger.Info("Recreating StatefulSet with previous replicas to adopt orphan pods", "numOrphanedPods", numOrphanedPods) + sts := emptyStatefulSet(etcd.ObjectMeta) + if err = r.createOrPatchWithReplicas(ctx, etcd, sts, int32(numOrphanedPods), false); err != nil { + return druiderr.WrapError(err, + ErrSyncStatefulSet, + component.OperationSync, + fmt.Sprintf("Error recreating StatefulSet with previous replicas for orphan pods adoption for etcd: %v", druidv1alpha1.GetNamespaceName(etcd.ObjectMeta))) + } + return druiderr.New( + druiderr.ErrRequeueAfter, + component.OperationSync, + fmt.Sprintf("StatefulSet has not yet been created or is not ready with previous replicas for etcd: %v, requeuing reconcile request", druidv1alpha1.GetNamespaceName(etcd.ObjectMeta))) + } + r.logger.Info("There is no STS and no orphaned pods found. Skipping recreation as none is required.") + return nil +} + +func (r _resource) determineNumOrphanedPods(ctx component.OperatorContext, etcd *druidv1alpha1.Etcd) (int, error) { + orphanedPodsObjMeta, err := r.getStsPodsObjMeta(ctx, etcd) + if err != nil { + return 0, druiderr.WrapError(err, + ErrSyncStatefulSet, + component.OperationSync, + fmt.Sprintf("Error getting orphaned pods for etcd: %v", client.ObjectKeyFromObject(etcd))) + } + if len(orphanedPodsObjMeta) > 0 { + r.logger.Info("Orphaned pods found", "numOrphanedPods", len(orphanedPodsObjMeta)) + // If there are orphaned pods, then determine the correct number from etcd.status.members instead of depending + // on the number of orphaned pods, since it is quite possible that a subset of the orphaned pods have been evicted due to a node crash or other reasons. + return len(etcd.Status.Members), nil + } + return 0, nil +} + +func (r _resource) getStsPodsObjMeta(ctx component.OperatorContext, etcd *druidv1alpha1.Etcd) ([]metav1.PartialObjectMetadata, error) { + objMetaList := &metav1.PartialObjectMetadataList{} + objMetaList.SetGroupVersionKind(corev1.SchemeGroupVersion.WithKind("Pod")) + if err := r.client.List(ctx, + objMetaList, + client.InNamespace(etcd.Namespace), + client.MatchingLabels(druidv1alpha1.GetDefaultLabels(etcd.ObjectMeta)), + ); err != nil { + if apierrors.IsNotFound(err) { + return []metav1.PartialObjectMetadata{}, nil + } + return nil, err + } + return objMetaList.Items, nil +} + // getExistingStatefulSet gets the existing statefulset if it exists. // If it is not found, it simply returns nil. Any other errors are returned as is. func (r _resource) getExistingStatefulSet(ctx component.OperatorContext, etcdObjMeta metav1.ObjectMeta) (*appsv1.StatefulSet, error) { @@ -185,130 +222,113 @@ func (r _resource) getExistingStatefulSet(ctx component.OperatorContext, etcdObj // createOrPatchWithReplicas ensures that the StatefulSet is updated with all changes from passed in etcd but the replicas set on the StatefulSet // are taken from the passed in replicas and not from the etcd component.
-func (r _resource) createOrPatchWithReplicas(ctx component.OperatorContext, etcd *druidv1alpha1.Etcd, replicas int32) error { - desiredStatefulSet := emptyStatefulSet(etcd.ObjectMeta) +func (r _resource) createOrPatchWithReplicas(ctx component.OperatorContext, etcd *druidv1alpha1.Etcd, sts *appsv1.StatefulSet, replicas int32, skipSetOrUpdateForbiddenFields bool) error { + stsClone := sts.DeepCopy() mutatingFn := func() error { - if builder, err := newStsBuilder(r.client, ctx.Logger, etcd, replicas, r.useEtcdWrapper, r.imageVector, desiredStatefulSet); err != nil { - return druiderr.WrapError(err, - ErrSyncStatefulSet, - "Sync", - fmt.Sprintf("Error initializing StatefulSet builder for etcd %v", druidv1alpha1.GetNamespaceName(etcd.ObjectMeta))) + if builder, err := newStsBuilder(r.client, ctx.Logger, etcd, replicas, r.useEtcdWrapper, r.imageVector, skipSetOrUpdateForbiddenFields, stsClone); err != nil { + return err } else { return builder.Build(ctx) } } - opResult, err := controllerutils.GetAndCreateOrStrategicMergePatch(ctx, r.client, desiredStatefulSet, mutatingFn) + opResult, err := controllerutils.GetAndCreateOrStrategicMergePatch(ctx, r.client, stsClone, mutatingFn) if err != nil { - return druiderr.WrapError(err, - ErrSyncStatefulSet, - "Sync", - fmt.Sprintf("Error creating or patching StatefulSet: %s for etcd: %v", desiredStatefulSet.Name, druidv1alpha1.GetNamespaceName(etcd.ObjectMeta))) + return err } - - ctx.Logger.Info("triggered create/patch of statefulSet", "statefulSet", getObjectKey(etcd.ObjectMeta), "operationResult", opResult) + r.logger.Info("triggered create/patch of statefulSet", "operationResult", opResult) return nil } // createOrPatch updates StatefulSet taking changes from passed in etcd component. func (r _resource) createOrPatch(ctx component.OperatorContext, etcd *druidv1alpha1.Etcd) error { - return r.createOrPatchWithReplicas(ctx, etcd, etcd.Spec.Replicas) + sts := emptyStatefulSet(etcd.ObjectMeta) + if err := r.createOrPatchWithReplicas(ctx, etcd, sts, etcd.Spec.Replicas, false); err != nil { + return druiderr.WrapError(err, + ErrSyncStatefulSet, + component.OperationSync, + fmt.Sprintf("Error creating or patching [StatefulSet: %v, Replicas: %d] for etcd: %v", client.ObjectKeyFromObject(etcd), etcd.Spec.Replicas, client.ObjectKeyFromObject(etcd))) + } + return nil } -func (r _resource) handlePeerTLSChanges(ctx component.OperatorContext, etcd *druidv1alpha1.Etcd, existingSts *appsv1.StatefulSet) error { - if etcd.Spec.Etcd.PeerUrlTLS == nil { +func (r _resource) handleTLSChanges(ctx component.OperatorContext, etcd *druidv1alpha1.Etcd, existingSts *appsv1.StatefulSet) error { + // If there are no replicas then there is no need to handle any TLS changes. Once replicas are increased, new pods will automatically pick up the TLS changes. + if etcd.Spec.Replicas == 0 { + r.logger.Info("Skipping handling TLS changes for StatefulSet as replicas are set to 0") return nil } - peerTLSEnabledForMembers, err := utils.IsPeerURLTLSEnabledForMembers(ctx, r.client, ctx.Logger, etcd.Namespace, etcd.Name, *existingSts.Spec.Replicas) + isSTSTLSConfigInSync := isStatefulSetTLSConfigInSync(etcd, existingSts) + if !isSTSTLSConfigInSync { + // Check if the etcd cluster is in a state where it can handle TLS changes. + // If the peer URL TLS has changed and there is more than one replica in the etcd cluster, then wait for all members to be ready.
+ // If we do not wait for all members to be ready patching STS to reflect peer TLS changes will cause rolling update which will never finish + // and the cluster will be stuck in a bad state. Updating peer URL is a cluster wide operation as all members will need to know that a peer TLS has changed. + // If not all members are ready then rolling-update of StatefulSet can potentially cause a healthy node to be restarted causing loss of quorum from which + // there will not be an automatic recovery. + if shouldRequeueForMultiNodeEtcdIfPodsNotReady(existingSts) { + return druiderr.New( + druiderr.ErrRequeueAfter, + component.OperationSync, + fmt.Sprintf("Not all etcd cluster members are ready. It is not safe to patch STS for Peer URL TLS changes. Replicas: %d, ReadyReplicas: %d", *existingSts.Spec.Replicas, existingSts.Status.ReadyReplicas)) + } + r.logger.Info("TLS configuration is not in sync, updating StatefulSet with TLS changes") + if err := r.createOrPatchWithReplicas(ctx, etcd, existingSts, *existingSts.Spec.Replicas, true); err != nil { + return druiderr.WrapError(err, + ErrSyncStatefulSet, + component.OperationSync, + fmt.Sprintf("Error creating or patching StatefulSet with TLS changes for StatefulSet: %v, etcd: %v", client.ObjectKeyFromObject(existingSts), client.ObjectKeyFromObject(etcd))) + } + return druiderr.New( + druiderr.ErrRequeueAfter, + component.OperationSync, + fmt.Sprintf("Updated TLS config for etcd: %v, requeuing reconcile request", client.ObjectKeyFromObject(etcd))) + } + peerTLSInSyncForAllMembers, err := utils.IsPeerURLInSyncForAllMembers(ctx, r.client, ctx.Logger, etcd, *existingSts.Spec.Replicas) if err != nil { return druiderr.WrapError(err, ErrSyncStatefulSet, - "Sync", + component.OperationSync, fmt.Sprintf("Error checking if peer TLS is enabled for statefulset: %v, etcd: %v", client.ObjectKeyFromObject(existingSts), client.ObjectKeyFromObject(etcd))) } - - if !peerTLSEnabledForMembers { - if !isStatefulSetPatchedWithPeerTLSVolMount(existingSts) { - // This step ensures that only STS is updated with secret volume mounts which gets added to the etcd component due to - // enabling of TLS for peer communication. It preserves the current STS replicas. - if err = r.createOrPatchWithReplicas(ctx, etcd, *existingSts.Spec.Replicas); err != nil { - return druiderr.WrapError(err, - ErrSyncStatefulSet, - "Sync", - fmt.Sprintf("Error creating or patching StatefulSet with TLS enabled for StatefulSet: %v, etcd: %v", client.ObjectKeyFromObject(existingSts), client.ObjectKeyFromObject(etcd))) - } - } else { - ctx.Logger.Info("Secret volume mounts to enable Peer URL TLS have already been mounted. 
-		}
+	if peerTLSInSyncForAllMembers {
+		r.logger.Info("Peer URL TLS configuration is reflected on all currently running members")
+		return nil
+	} else {
 		return druiderr.New(
 			druiderr.ErrRequeueAfter,
-			"Sync",
-			fmt.Sprintf("Peer URL TLS not enabled for #%d members for etcd: %v, requeuing reconcile request", existingSts.Spec.Replicas, client.ObjectKeyFromObject(etcd)))
+			component.OperationSync,
+			fmt.Sprintf("Peer URL TLS is not yet in sync for #%d members for etcd: %v, requeuing reconcile request", *existingSts.Spec.Replicas, client.ObjectKeyFromObject(etcd)))
 	}
-
-	ctx.Logger.Info("Peer URL TLS has been enabled for all currently running members")
-	return nil
 }
 
-func isStatefulSetPatchedWithPeerTLSVolMount(sts *appsv1.StatefulSet) bool {
-	volumes := sts.Spec.Template.Spec.Volumes
-	var peerURLCAEtcdVolPresent, peerURLEtcdServerTLSVolPresent bool
-	for _, vol := range volumes {
-		if vol.Name == common.VolumeNameEtcdPeerCA {
-			peerURLCAEtcdVolPresent = true
-		}
-		if vol.Name == common.VolumeNameEtcdPeerServerTLS {
-			peerURLEtcdServerTLSVolPresent = true
-		}
-	}
-	return peerURLCAEtcdVolPresent && peerURLEtcdServerTLSVolPresent
+func shouldRequeueForMultiNodeEtcdIfPodsNotReady(sts *appsv1.StatefulSet) bool {
+	return sts.Spec.Replicas != nil &&
+		*sts.Spec.Replicas > 1 &&
+		sts.Status.ReadyReplicas > 0 &&
+		sts.Status.ReadyReplicas < *sts.Spec.Replicas
 }
 
-func (r _resource) checkAndPatchStsPodLabelsOnMismatch(ctx component.OperatorContext, etcd *druidv1alpha1.Etcd, sts *appsv1.StatefulSet) error {
-	desiredPodTemplateLabels := getDesiredPodTemplateLabels(etcd)
-	if !utils.ContainsAllDesiredLabels(sts.Spec.Template.Labels, desiredPodTemplateLabels) {
-		ctx.Logger.Info("Patching StatefulSet with new pod labels", "objectKey", getObjectKey(etcd.ObjectMeta))
-		originalSts := sts.DeepCopy()
-		sts.Spec.Template.Labels = utils.MergeMaps(sts.Spec.Template.Labels, desiredPodTemplateLabels)
-		if err := r.client.Patch(ctx, sts, client.MergeFrom(originalSts)); err != nil {
-			return err
-		}
-	}
-	return nil
+func isStatefulSetTLSConfigInSync(etcd *druidv1alpha1.Etcd, sts *appsv1.StatefulSet) bool {
+	newEtcdbrTLSVolMounts := getBackupRestoreContainerSecretVolumeMounts(etcd)
+	newEtcdWrapperTLSVolMounts := getEtcdContainerSecretVolumeMounts(etcd)
+	containerTLSVolMounts := utils.GetStatefulSetContainerTLSVolumeMounts(sts)
+	return !hasTLSVolumeMountsChanged(containerTLSVolMounts[common.ContainerNameEtcd], newEtcdWrapperTLSVolMounts) &&
+		!hasTLSVolumeMountsChanged(containerTLSVolMounts[common.ContainerNameEtcdBackupRestore], newEtcdbrTLSVolMounts)
 }
 
-func getDesiredPodTemplateLabels(etcd *druidv1alpha1.Etcd) map[string]string {
-	return utils.MergeMaps(etcd.Spec.Labels, getStatefulSetLabels(etcd.Name))
-}
-
-func (r _resource) doStatefulSetPodsHaveDesiredLabels(ctx component.OperatorContext, etcd *druidv1alpha1.Etcd, sts *appsv1.StatefulSet) (bool, error) {
-	// sts.spec.replicas is more accurate than Etcd.spec.replicas, specifically when
-	// Etcd.spec.replicas is updated but not yet reflected in the etcd cluster
-	if sts.Spec.Replicas == nil {
-		return false, fmt.Errorf("statefulset %s does not have a replicas count defined", sts.Name)
-	}
-	podNames := druidv1alpha1.GetAllPodNames(etcd.ObjectMeta, *sts.Spec.Replicas)
-	desiredLabels := getDesiredPodTemplateLabels(etcd)
-	for _, podName := range podNames {
-		pod := &corev1.Pod{}
-		if err := r.client.Get(ctx, client.ObjectKey{Name: podName, Namespace: etcd.Namespace}, pod); err != nil {
-			return false, err
-		}
-		if !utils.ContainsAllDesiredLabels(pod.Labels, desiredLabels) {
-			return false, nil
-		}
+func hasTLSVolumeMountsChanged(existingVolMounts, newVolMounts []corev1.VolumeMount) bool {
+	if len(existingVolMounts) != len(newVolMounts) {
+		return true
 	}
-	return true, nil
-}
-
-func (r _resource) checkAndDeleteStsWithOrphansOnLabelSelectorMismatch(ctx component.OperatorContext, etcd *druidv1alpha1.Etcd, sts *appsv1.StatefulSet) error {
-	if !labels.Equals(sts.Spec.Selector.MatchLabels, druidv1alpha1.GetDefaultLabels(etcd.ObjectMeta)) {
-		ctx.Logger.Info("Deleting StatefulSet for recreation later, as label selector has changed", "objectKey", getObjectKey(etcd.ObjectMeta))
-		if err := r.client.Delete(ctx, sts, client.PropagationPolicy(metav1.DeletePropagationOrphan)); err != nil {
-			return err
+	for _, newVolMount := range newVolMounts {
+		if !slices.ContainsFunc(existingVolMounts, func(existingVolMount corev1.VolumeMount) bool {
+			return existingVolMount.Name == newVolMount.Name && existingVolMount.MountPath == newVolMount.MountPath
+		}) {
+			return true
 		}
 	}
-	return nil
+	return false
 }
 
 func emptyStatefulSet(obj metav1.ObjectMeta) *appsv1.StatefulSet {
diff --git a/internal/component/types.go b/internal/component/types.go
index 12813d272..56315a826 100644
--- a/internal/component/types.go
+++ b/internal/component/types.go
@@ -13,6 +13,18 @@ import (
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 )
 
+// Constants for common operations that an Operator can perform. These are used across components to provide context when reporting errors.
+// However, they can also be used wherever an operation context needs to be specified.
+// Each component can in turn define its own fine-grained operation labels as well.
+const (
+	// OperationSync is the Sync operation of the Operator.
+	OperationSync = "Sync"
+	// OperationTriggerDelete is the TriggerDelete operation of the Operator.
+	OperationTriggerDelete = "TriggerDelete"
+	// OperationGetExistingResourceNames is the GetExistingResourceNames operation of the Operator.
+	OperationGetExistingResourceNames = "GetExistingResourceNames"
+)
+
 // OperatorContext holds the underline context.Context along with additional data that needs to be passed from one reconcile-step to another in a multistep reconciliation run.
type OperatorContext struct { context.Context diff --git a/internal/controller/etcd/reconcile_delete.go b/internal/controller/etcd/reconcile_delete.go index 46ec656ca..44c52648e 100644 --- a/internal/controller/etcd/reconcile_delete.go +++ b/internal/controller/etcd/reconcile_delete.go @@ -108,7 +108,7 @@ func (r *Reconciler) recordIncompleteDeletionOperation(ctx component.OperatorCon if result := ctrlutils.GetLatestEtcdPartialObjectMeta(ctx, r.client, etcdObjKey, etcdObjMeta); ctrlutils.ShortCircuitReconcileFlow(result) { return result } - if err := r.lastOpErrRecorder.RecordErrors(ctx, etcdObjKey, druidv1alpha1.LastOperationTypeDelete, exitReconcileStepResult.GetDescription(), exitReconcileStepResult.GetErrors()...); err != nil { + if err := r.lastOpErrRecorder.RecordErrors(ctx, etcdObjKey, druidv1alpha1.LastOperationTypeDelete, exitReconcileStepResult); err != nil { logger.Error(err, "failed to record last operation and last errors for etcd deletion") return ctrlutils.ReconcileWithError(err) } diff --git a/internal/controller/etcd/reconcile_spec.go b/internal/controller/etcd/reconcile_spec.go index 80e99c3d1..ef98371ee 100644 --- a/internal/controller/etcd/reconcile_spec.go +++ b/internal/controller/etcd/reconcile_spec.go @@ -92,8 +92,8 @@ func (r *Reconciler) preSyncEtcdResources(ctx component.OperatorContext, etcdObj for _, kind := range resourceOperators { op := r.operatorRegistry.GetOperator(kind) if err := op.PreSync(ctx, etcd); err != nil { - if druiderr.IsRequeueAfterError(err) { - ctx.Logger.Info("retrying pre-sync of component", "kind", kind, "syncRetryInterval", syncRetryInterval.String()) + if derr := druiderr.AsDruidError(err); derr != nil && derr.Code == druiderr.ErrRequeueAfter { + ctx.Logger.Info("retrying pre-sync of component", "kind", kind, "syncRetryInterval", syncRetryInterval.String(), "reason", derr.Message) return ctrlutils.ReconcileAfter(syncRetryInterval, fmt.Sprintf("requeueing pre-sync of component %s to be retried after %s", kind, syncRetryInterval.String())) } ctx.Logger.Error(err, "failed to sync etcd resource", "kind", kind) @@ -112,8 +112,8 @@ func (r *Reconciler) syncEtcdResources(ctx component.OperatorContext, etcdObjKey for _, kind := range resourceOperators { op := r.operatorRegistry.GetOperator(kind) if err := op.Sync(ctx, etcd); err != nil { - if druiderr.IsRequeueAfterError(err) { - ctx.Logger.Info("retrying sync of component", "kind", kind, "syncRetryInterval", syncRetryInterval.String()) + if derr := druiderr.AsDruidError(err); derr != nil && derr.Code == druiderr.ErrRequeueAfter { + ctx.Logger.Info("retrying sync of component", "kind", kind, "syncRetryInterval", syncRetryInterval.String(), "reason", derr.Message) return ctrlutils.ReconcileAfter(syncRetryInterval, fmt.Sprintf("retrying sync of component %s after %s", kind, syncRetryInterval.String())) } ctx.Logger.Error(err, "failed to sync etcd resource", "kind", kind) @@ -155,7 +155,7 @@ func (r *Reconciler) recordReconcileSuccessOperation(ctx component.OperatorConte } func (r *Reconciler) recordIncompleteReconcileOperation(ctx component.OperatorContext, etcdObjKey client.ObjectKey, exitReconcileStepResult ctrlutils.ReconcileStepResult) ctrlutils.ReconcileStepResult { - if err := r.lastOpErrRecorder.RecordErrors(ctx, etcdObjKey, druidv1alpha1.LastOperationTypeReconcile, exitReconcileStepResult.GetDescription(), exitReconcileStepResult.GetErrors()...); err != nil { + if err := r.lastOpErrRecorder.RecordErrors(ctx, etcdObjKey, druidv1alpha1.LastOperationTypeReconcile, exitReconcileStepResult); 
err != nil { ctx.Logger.Error(err, "failed to record last operation and last errors for etcd reconciliation") return ctrlutils.ReconcileWithError(err) } @@ -214,9 +214,7 @@ func (r *Reconciler) recordEtcdSpecReconcileSuspension(etcd *druidv1alpha1.Etcd, } func (r *Reconciler) getOrderedOperatorsForPreSync() []component.Kind { - return []component.Kind{ - component.StatefulSetKind, - } + return []component.Kind{} } func (r *Reconciler) getOrderedOperatorsForSync() []component.Kind { diff --git a/internal/controller/etcdcopybackupstask/reconciler.go b/internal/controller/etcdcopybackupstask/reconciler.go index 2239084be..45219814c 100644 --- a/internal/controller/etcdcopybackupstask/reconciler.go +++ b/internal/controller/etcdcopybackupstask/reconciler.go @@ -304,7 +304,9 @@ func (r *Reconciler) createJobObject(ctx context.Context, task *druidv1alpha1.Et env := append(createEnvVarsFromStore(&sourceStore, sourceProvider, "SOURCE_", sourcePrefix), createEnvVarsFromStore(&targetStore, targetProvider, "", "")...) // Formulate the job's volume mounts. - volumeMounts := append(createVolumeMountsFromStore(&sourceStore, sourceProvider, sourcePrefix, r.Config.FeatureGates[features.UseEtcdWrapper]), createVolumeMountsFromStore(&targetStore, targetProvider, targetPrefix, r.Config.FeatureGates[features.UseEtcdWrapper])...) + volumeMounts := append( + createVolumeMountsFromStore(&sourceStore, sourceProvider, sourcePrefix, r.Config.FeatureGates[features.UseEtcdWrapper]), + createVolumeMountsFromStore(&targetStore, targetProvider, "", r.Config.FeatureGates[features.UseEtcdWrapper])...) // Formulate the job's volumes from the source store. sourceVolumes, err := r.createVolumesFromStore(ctx, &sourceStore, task.Namespace, sourceProvider, sourcePrefix) @@ -313,7 +315,7 @@ func (r *Reconciler) createJobObject(ctx context.Context, task *druidv1alpha1.Et } // Formulate the job's volumes from the target store. - targetVolumes, err := r.createVolumesFromStore(ctx, &targetStore, task.Namespace, targetProvider, targetPrefix) + targetVolumes, err := r.createVolumesFromStore(ctx, &targetStore, task.Namespace, targetProvider, "") if err != nil { return nil, err } diff --git a/internal/controller/utils/etcdstatus.go b/internal/controller/utils/etcdstatus.go index 850e69f90..9052249b2 100644 --- a/internal/controller/utils/etcdstatus.go +++ b/internal/controller/utils/etcdstatus.go @@ -5,11 +5,13 @@ package utils import ( + "fmt" "time" druidv1alpha1 "github.com/gardener/etcd-druid/api/v1alpha1" "github.com/gardener/etcd-druid/internal/component" druiderr "github.com/gardener/etcd-druid/internal/errors" + "github.com/gardener/etcd-druid/internal/utils" "github.com/go-logr/logr" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -23,7 +25,7 @@ type LastOperationAndLastErrorsRecorder interface { // RecordSuccess records the success of an operation in the Etcd status. RecordSuccess(ctx component.OperatorContext, etcdObjectKey client.ObjectKey, operationType druidv1alpha1.LastOperationType) error // RecordErrors records errors encountered in the last operation in the Etcd status. - RecordErrors(ctx component.OperatorContext, etcdObjectKey client.ObjectKey, operationType druidv1alpha1.LastOperationType, description string, errs ...error) error + RecordErrors(ctx component.OperatorContext, etcdObjectKey client.ObjectKey, operationType druidv1alpha1.LastOperationType, operationResult ReconcileStepResult) error } // NewLastOperationAndLastErrorsRecorder returns a new LastOperationAndLastErrorsRecorder. 
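
[Editor's note — illustration, not part of the patch: the hunk below reworks RecordErrors to accept the whole ReconcileStepResult and derive the last-operation state from it. A minimal sketch of the intended mapping, assuming ReconcileStepResult exposes HasErrors() (as used in the hunk) and that utils.IfConditionOr is the generic ternary helper used elsewhere in this patch; the function name here is hypothetical:

    // lastOpStateFor spells out the mapping: a step result that requeues
    // without carrying errors is recorded as Requeue rather than Error,
    // so status consumers can tell "waiting on a condition" apart from "failing".
    func lastOpStateFor(stepResult ctrlutils.ReconcileStepResult) druidv1alpha1.LastOperationState {
        return utils.IfConditionOr(stepResult.HasErrors(),
            druidv1alpha1.LastOperationStateError,
            druidv1alpha1.LastOperationStateRequeue)
    }

This pairs with the new LastOperationStateRequeue value, keeping reconciliation reported as in-progress while a requeue is pending.]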
@@ -70,10 +72,11 @@ func (l *lastOpErrRecorder) RecordSuccess(ctx component.OperatorContext, etcdObj return l.recordLastOperationAndErrors(ctx, etcdObjectKey, operationType, druidv1alpha1.LastOperationStateSucceeded, description) } -func (l *lastOpErrRecorder) RecordErrors(ctx component.OperatorContext, etcdObjectKey client.ObjectKey, operationType druidv1alpha1.LastOperationType, description string, errs ...error) error { - description += " Operation will be retried." - lastErrors := druiderr.MapToLastErrors(errs) - return l.recordLastOperationAndErrors(ctx, etcdObjectKey, operationType, druidv1alpha1.LastOperationStateError, description, lastErrors...) +func (l *lastOpErrRecorder) RecordErrors(ctx component.OperatorContext, etcdObjectKey client.ObjectKey, operationType druidv1alpha1.LastOperationType, operationResult ReconcileStepResult) error { + description := fmt.Sprintf("%s Operation will be retried.", operationResult.description) + lastErrors := druiderr.MapToLastErrors(operationResult.GetErrors()) + lastOpState := utils.IfConditionOr(operationResult.HasErrors(), druidv1alpha1.LastOperationStateError, druidv1alpha1.LastOperationStateRequeue) + return l.recordLastOperationAndErrors(ctx, etcdObjectKey, operationType, lastOpState, description, lastErrors...) } func (l *lastOpErrRecorder) recordLastOperationAndErrors(ctx component.OperatorContext, etcdObjectKey client.ObjectKey, operationType druidv1alpha1.LastOperationType, operationState druidv1alpha1.LastOperationState, description string, lastErrors ...druidv1alpha1.LastError) error { diff --git a/internal/errors/errors.go b/internal/errors/errors.go index 693bd815b..6627fb7c3 100644 --- a/internal/errors/errors.go +++ b/internal/errors/errors.go @@ -51,15 +51,6 @@ func (e *DruidError) WithCause(err error) error { return e } -// IsRequeueAfterError checks if the given error is of type DruidError and has the given error code. -func IsRequeueAfterError(err error) bool { - druidErr := &DruidError{} - if errors.As(err, &druidErr) { - return druidErr.Code == ErrRequeueAfter - } - return false -} - // New creates a new DruidError with the given error code, operation and message. func New(code druidv1alpha1.ErrorCode, operation string, message string) error { return &DruidError{ @@ -106,3 +97,12 @@ func MapToLastErrors(errs []error) []druidv1alpha1.LastError { } return lastErrs } + +// AsDruidError returns the given error as a DruidError if it is of type DruidError, otherwise returns nil. 
+func AsDruidError(err error) *DruidError { + druidErr := &DruidError{} + if errors.As(err, &druidErr) { + return druidErr + } + return nil +} diff --git a/internal/errors/errors_test.go b/internal/errors/errors_test.go index 1f04a21af..0772a115d 100644 --- a/internal/errors/errors_test.go +++ b/internal/errors/errors_test.go @@ -25,21 +25,26 @@ func TestWrapError(t *testing.T) { g.Expect(druidErr.Message).To(Equal("testMsg")) } -func TestIsRequeueAfterError(t *testing.T) { +func TestAsDruidError(t *testing.T) { testCases := []struct { - name string - code druidv1alpha1.ErrorCode - expected bool + name string + err error + expectedDruidErr bool }{ { - name: "error has code ERR_REQUEUE_AFTER", - code: ErrRequeueAfter, - expected: true, + name: "error is of type DruidError", + err: &DruidError{ + Code: druidv1alpha1.ErrorCode("ERR_TEST"), + Cause: fmt.Errorf("testError"), + Operation: "testOp", + Message: "testMsg", + }, + expectedDruidErr: true, }, { - name: "error has code ERR_TEST", - code: druidv1alpha1.ErrorCode("ERR_TEST"), - expected: false, + name: "error is not of type DruidError", + err: fmt.Errorf("testError"), + expectedDruidErr: false, }, } @@ -48,12 +53,12 @@ func TestIsRequeueAfterError(t *testing.T) { for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { t.Parallel() - err := &DruidError{ - Code: tc.code, - ObservedAt: time.Now().UTC(), + druidErr := AsDruidError(tc.err) + if tc.expectedDruidErr { + g.Expect(druidErr).NotTo(BeNil()) + } else { + g.Expect(druidErr).To(BeNil()) } - g.Expect(IsRequeueAfterError(err)).To(Equal(tc.expected)) - }) } } diff --git a/internal/health/condition/check_ready_test.go b/internal/health/condition/check_ready_test.go index 116ad7eee..39a67cc3f 100644 --- a/internal/health/condition/check_ready_test.go +++ b/internal/health/condition/check_ready_test.go @@ -33,6 +33,7 @@ var _ = Describe("ReadyCheck", func() { Context("when members in status", func() { It("should return that the cluster has a quorum (all members ready)", func() { etcd := druidv1alpha1.Etcd{ + Spec: druidv1alpha1.EtcdSpec{Replicas: 3}, Status: druidv1alpha1.EtcdStatus{ Members: []druidv1alpha1.EtcdMemberStatus{ readyMember, @@ -51,6 +52,7 @@ var _ = Describe("ReadyCheck", func() { It("should return that the cluster has a quorum (members are partly unknown)", func() { etcd := druidv1alpha1.Etcd{ + Spec: druidv1alpha1.EtcdSpec{Replicas: 3}, Status: druidv1alpha1.EtcdStatus{ Members: []druidv1alpha1.EtcdMemberStatus{ readyMember, @@ -69,6 +71,7 @@ var _ = Describe("ReadyCheck", func() { It("should return that the cluster has a quorum (one member not ready)", func() { etcd := druidv1alpha1.Etcd{ + Spec: druidv1alpha1.EtcdSpec{Replicas: 3}, Status: druidv1alpha1.EtcdStatus{ Members: []druidv1alpha1.EtcdMemberStatus{ readyMember, @@ -87,6 +90,7 @@ var _ = Describe("ReadyCheck", func() { It("should return that the cluster has lost its quorum", func() { etcd := druidv1alpha1.Etcd{ + Spec: druidv1alpha1.EtcdSpec{Replicas: 3}, Status: druidv1alpha1.EtcdStatus{ Members: []druidv1alpha1.EtcdMemberStatus{ readyMember, @@ -108,6 +112,7 @@ var _ = Describe("ReadyCheck", func() { Context("when no members in status", func() { It("should return that quorum is unknown", func() { etcd := druidv1alpha1.Etcd{ + Spec: druidv1alpha1.EtcdSpec{Replicas: 3}, Status: druidv1alpha1.EtcdStatus{ Members: []druidv1alpha1.EtcdMemberStatus{}, }, diff --git a/internal/health/etcdmember/check_ready.go b/internal/health/etcdmember/check_ready.go index 11a19045a..c636496a0 100644 --- 
a/internal/health/etcdmember/check_ready.go +++ b/internal/health/etcdmember/check_ready.go @@ -70,7 +70,7 @@ func (r *readyCheck) Check(ctx context.Context, etcd druidv1alpha1.Etcd) []Resul // This behavior is expected by the `Ready` condition and it will become imprecise if members are added here too early. renew := lease.Spec.RenewTime if renew == nil { - r.logger.Info("Member hasn't acquired lease yet, still in bootstrapping phase", "name", lease.Name) + r.logger.V(4).Info("Member hasn't acquired lease yet, still in bootstrapping phase", "name", lease.Name) continue } diff --git a/internal/images/images.yaml b/internal/images/images.yaml index 250fc90de..8d1645cac 100644 --- a/internal/images/images.yaml +++ b/internal/images/images.yaml @@ -14,11 +14,11 @@ images: name: 'etcdbrctl' sourceRepository: github.com/gardener/etcd-backup-restore repository: europe-docker.pkg.dev/gardener-project/public/gardener/etcdbrctl - tag: "v0.30.1" + tag: "v0.30.2" - name: etcd-wrapper sourceRepository: github.com/gardener/etcd-wrapper repository: europe-docker.pkg.dev/gardener-project/public/gardener/etcd-wrapper - tag: "v0.1.1" + tag: "v0.2.0" - name: alpine repository: europe-docker.pkg.dev/gardener-project/public/3rd/alpine tag: "3.18.4" diff --git a/internal/utils/labels.go b/internal/utils/labels.go index 697875822..7bd2e563a 100644 --- a/internal/utils/labels.go +++ b/internal/utils/labels.go @@ -4,6 +4,11 @@ package utils +import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" +) + // ContainsAllDesiredLabels checks if the actual map contains all the desired labels. func ContainsAllDesiredLabels(actual, desired map[string]string) bool { for key, desiredValue := range desired { @@ -20,3 +25,15 @@ func ContainsLabel(actual map[string]string, key, value string) bool { actualValue, ok := actual[key] return ok && actualValue == value } + +// DoesLabelSelectorMatchLabels checks if the given label selector matches the given labels. +func DoesLabelSelectorMatchLabels(labelSelector *metav1.LabelSelector, resourceLabels map[string]string) (bool, error) { + if labelSelector == nil { + return true, nil + } + selector, err := metav1.LabelSelectorAsSelector(labelSelector) + if err != nil { + return false, err + } + return selector.Matches(labels.Set(resourceLabels)), nil +} diff --git a/internal/utils/labels_test.go b/internal/utils/labels_test.go index 58aaa3968..54c05545d 100644 --- a/internal/utils/labels_test.go +++ b/internal/utils/labels_test.go @@ -7,6 +7,8 @@ package utils import ( "testing" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + . 
"github.com/onsi/gomega" ) @@ -158,3 +160,132 @@ func TestContainsLabel(t *testing.T) { }) } } + +func TestDoesLabelSelectorMatchLabels(t *testing.T) { + testCases := []struct { + name string + labelSelector *metav1.LabelSelector + resourceLabels map[string]string + expectedResult bool + expectedError bool + }{ + { + name: "label selector matches resource labels", + labelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "key1": "value1", + "key2": "value2", + }, + }, + resourceLabels: map[string]string{ + "key1": "value1", + "key2": "value2", + }, + expectedResult: true, + expectedError: false, + }, + { + name: "label selector does not match resource labels", + labelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "key1": "value1", + "key2": "value2", + }, + }, + resourceLabels: map[string]string{ + "key1": "value1", + "key2": "value3", + }, + expectedResult: false, + expectedError: false, + }, + { + name: "label selector matches resource labels, where there are more resource labels than label selector MatchLabels", + labelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "key1": "value1", + "key2": "value2", + }, + }, + resourceLabels: map[string]string{ + "key1": "value1", + "key2": "value2", + "key3": "value3", + }, + expectedResult: true, + expectedError: false, + }, + { + name: "label selector is nil", + labelSelector: nil, + resourceLabels: map[string]string{ + "key1": "value1", + "key2": "value2", + }, + expectedResult: true, + expectedError: false, + }, + { + name: "resource labels are nil", + labelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "key1": "value1", + "key2": "value2", + }, + }, + resourceLabels: nil, + expectedResult: false, + expectedError: false, + }, + { + name: "label selector is empty", + labelSelector: &metav1.LabelSelector{MatchLabels: map[string]string{}}, + resourceLabels: map[string]string{ + "key1": "value1", + "key2": "value2", + }, + expectedResult: true, + expectedError: false, + }, + { + name: "resource labels are nil", + labelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "key1": "value1", + "key2": "value2", + }, + }, + resourceLabels: map[string]string{}, + expectedResult: false, + expectedError: false, + }, + { + name: "both label selector and resource labels are nil", + labelSelector: nil, + resourceLabels: nil, + expectedResult: true, + expectedError: false, + }, + { + name: "both label selector and resource labels are empty", + labelSelector: &metav1.LabelSelector{MatchLabels: map[string]string{}}, + resourceLabels: map[string]string{}, + expectedResult: true, + expectedError: false, + }, + } + g := NewWithT(t) + t.Parallel() + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + result, err := DoesLabelSelectorMatchLabels(tc.labelSelector, tc.resourceLabels) + if tc.expectedError { + g.Expect(err).To(HaveOccurred()) + } else { + g.Expect(err).NotTo(HaveOccurred()) + } + g.Expect(result).To(Equal(tc.expectedResult)) + }) + } +} diff --git a/internal/utils/lease.go b/internal/utils/lease.go index a4a08f323..1c15d22f5 100644 --- a/internal/utils/lease.go +++ b/internal/utils/lease.go @@ -8,13 +8,12 @@ import ( "context" "slices" "strconv" - "strings" druidv1alpha1 "github.com/gardener/etcd-druid/api/v1alpha1" - "github.com/gardener/etcd-druid/internal/common" "github.com/go-logr/logr" coordinationv1 "k8s.io/api/coordination/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 
"sigs.k8s.io/controller-runtime/pkg/client" ) @@ -23,23 +22,25 @@ import ( // If the annotation is not present or its value is `false` then it indicates that the member is not TLS enabled. const LeaseAnnotationKeyPeerURLTLSEnabled = "member.etcd.gardener.cloud/tls-enabled" -// IsPeerURLTLSEnabledForMembers checks if TLS has been enabled for all existing members of an etcd cluster identified by etcdName and in the provided namespace. -func IsPeerURLTLSEnabledForMembers(ctx context.Context, cl client.Client, logger logr.Logger, namespace, etcdName string, numReplicas int32) (bool, error) { - leaseList := &coordinationv1.LeaseList{} - if err := cl.List(ctx, leaseList, client.InNamespace(namespace), client.MatchingLabels(map[string]string{ - druidv1alpha1.LabelComponentKey: common.ComponentNameMemberLease, - druidv1alpha1.LabelPartOfKey: etcdName, - druidv1alpha1.LabelManagedByKey: druidv1alpha1.LabelManagedByValue, - })); err != nil { - return false, err +// IsPeerURLInSyncForAllMembers checks if the peer URL is in sync for all existing members of an etcd cluster identified by etcdName and in the provided namespace. +func IsPeerURLInSyncForAllMembers(ctx context.Context, cl client.Client, logger logr.Logger, etcd *druidv1alpha1.Etcd, replicas int32) (bool, error) { + peerURLTLSEnabled := etcd.Spec.Etcd.PeerUrlTLS != nil + if peerURLTLSEnabled { + return isPeerURLTLSEnabledForMembers(ctx, cl, logger, etcd, replicas) } + return isPeerURLTLSDisabledForMembers(ctx, cl, logger, etcd, replicas) +} + +// isPeerURLTLSEnabledForMembers checks if TLS has been enabled for all existing members of an etcd cluster identified by etcdName and in the provided namespace. +func isPeerURLTLSEnabledForMembers(ctx context.Context, cl client.Client, logger logr.Logger, etcd *druidv1alpha1.Etcd, replicas int32) (bool, error) { tlsEnabledForAllMembers := true - leases := leaseList.DeepCopy().Items - slices.SortFunc(leases, func(a, b coordinationv1.Lease) int { - return strings.Compare(a.Name, b.Name) - }) - for _, lease := range leases[:numReplicas] { - tlsEnabled, err := parseAndGetTLSEnabledValue(lease, logger) + leaseObjMetaSlice, err := ListAllMemberLeaseObjectMeta(ctx, cl, etcd) + if err != nil { + return false, err + } + targetMembers := leaseObjMetaSlice[:replicas] + for _, leaseObjMeta := range targetMembers { + tlsEnabled, err := parseAndGetTLSEnabledValue(leaseObjMeta, logger) if err != nil { return false, err } @@ -48,17 +49,56 @@ func IsPeerURLTLSEnabledForMembers(ctx context.Context, cl client.Client, logger return tlsEnabledForAllMembers, nil } -func parseAndGetTLSEnabledValue(lease coordinationv1.Lease, logger logr.Logger) (bool, error) { - if lease.Annotations != nil { - if tlsEnabledStr, ok := lease.Annotations[LeaseAnnotationKeyPeerURLTLSEnabled]; ok { +// isPeerURLTLSDisabledForMembers checks if TLS has been disabled for all existing members of an etcd cluster identified by etcdName and in the provided namespace. 
+func isPeerURLTLSDisabledForMembers(ctx context.Context, cl client.Client, logger logr.Logger, etcd *druidv1alpha1.Etcd, replicas int32) (bool, error) {
+	tlsDisabledForAllMembers := true
+	leaseObjMetaSlice, err := ListAllMemberLeaseObjectMeta(ctx, cl, etcd)
+	if err != nil {
+		return false, err
+	}
+	for _, leaseObjMeta := range leaseObjMetaSlice[:replicas] {
+		tlsEnabled, err := parseAndGetTLSEnabledValue(leaseObjMeta, logger)
+		if err != nil {
+			return false, err
+		}
+		tlsDisabledForAllMembers = tlsDisabledForAllMembers && !tlsEnabled
+	}
+	return tlsDisabledForAllMembers, nil
+}
+
+// ListAllMemberLeaseObjectMeta returns the list of all member leases for the given etcd cluster.
+func ListAllMemberLeaseObjectMeta(ctx context.Context, cl client.Client, etcd *druidv1alpha1.Etcd) ([]metav1.PartialObjectMetadata, error) {
+	objMetaList := &metav1.PartialObjectMetadataList{}
+	objMetaList.SetGroupVersionKind(coordinationv1.SchemeGroupVersion.WithKind("Lease"))
+	if err := cl.List(ctx,
+		objMetaList,
+		client.InNamespace(etcd.Namespace),
+	); err != nil {
+		return nil, err
+	}
+	// This is OK to do, as we do not support downscaling an etcd cluster.
+	// If and when we do, the labels should have been stabilised by then, and this code will no longer be needed.
+	allPossibleMemberNames := druidv1alpha1.GetMemberLeaseNames(etcd.ObjectMeta, etcd.Spec.Replicas)
+	leasesObjMeta := make([]metav1.PartialObjectMetadata, 0, len(objMetaList.Items))
+	for _, lease := range objMetaList.Items {
+		if metav1.IsControlledBy(&lease, &etcd.ObjectMeta) && slices.Contains(allPossibleMemberNames, lease.Name) {
+			leasesObjMeta = append(leasesObjMeta, lease)
+		}
+	}
+	return leasesObjMeta, nil
+}
+
+func parseAndGetTLSEnabledValue(leaseObjMeta metav1.PartialObjectMetadata, logger logr.Logger) (bool, error) {
+	if leaseObjMeta.Annotations != nil {
+		if tlsEnabledStr, ok := leaseObjMeta.Annotations[LeaseAnnotationKeyPeerURLTLSEnabled]; ok {
 			tlsEnabled, err := strconv.ParseBool(tlsEnabledStr)
 			if err != nil {
-				logger.Error(err, "tls-enabled value is not a valid boolean", "namespace", lease.Namespace, "leaseName", lease.Name)
+				logger.Error(err, "tls-enabled value is not a valid boolean", "namespace", leaseObjMeta.Namespace, "leaseName", leaseObjMeta.Name)
 				return false, err
 			}
 			return tlsEnabled, nil
 		}
-		logger.V(4).Info("tls-enabled annotation not present for lease.", "namespace", lease.Namespace, "leaseName", lease.Name)
+		logger.V(4).Info("tls-enabled annotation not present for lease.", "namespace", leaseObjMeta.Namespace, "leaseName", leaseObjMeta.Name)
 	}
 	return false, nil
 }
diff --git a/internal/utils/lease_test.go b/internal/utils/lease_test.go
index 583648a07..949f71b74 100644
--- a/internal/utils/lease_test.go
+++ b/internal/utils/lease_test.go
@@ -28,11 +28,11 @@ import (
 func TestIsPeerURLTLSEnabledForAllMembers(t *testing.T) {
 	internalErr := errors.New("fake get internal error")
 	apiInternalErr := apierrors.NewInternalError(internalErr)
-	const etcdReplicas = 3
+	etcd := testutils.EtcdBuilderWithDefaults(testutils.TestEtcdName, testutils.TestNamespace).WithReplicas(3).WithPeerTLS().Build()
 	testCases := []struct {
 		name                         string
 		numETCDMembersWithTLSEnabled int
-		listErr                      *apierrors.StatusError
+		errors                       []testutils.ErrorsForGVK
 		expectedErr                  *apierrors.StatusError
 		expectedResult               bool
 	}{
 		{
@@ -52,8 +52,73 @@ func TestIsPeerURLTLSEnabledForAllMembers(t *testing.T) {
 			expectedResult: true,
 		},
 		{
-			name:    "should return error
when client list call fails", + errors: []testutils.ErrorsForGVK{ + { + GVK: coordinationv1.SchemeGroupVersion.WithKind("Lease"), + ListErr: apiInternalErr, + }, + }, + expectedErr: apiInternalErr, + expectedResult: false, + }, + } + + g := NewWithT(t) + t.Parallel() + logger := logr.Discard() + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + var existingObjects []client.Object + for _, l := range createLeases(etcd, tc.numETCDMembersWithTLSEnabled) { + existingObjects = append(existingObjects, l) + } + cl := testutils.CreateTestFakeClientForObjectsInNamespaceWithGVK(tc.errors, testutils.TestNamespace, existingObjects...) + tlsEnabled, err := IsPeerURLInSyncForAllMembers(context.Background(), cl, logger, etcd, etcd.Spec.Replicas) + if tc.expectedErr != nil { + g.Expect(err).To(Equal(tc.expectedErr)) + } else { + g.Expect(tlsEnabled).To(Equal(tc.expectedResult)) + } + }) + } +} + +func TestIsPeerURLTLSDisabledForAllMembers(t *testing.T) { + internalErr := errors.New("fake get internal error") + apiInternalErr := apierrors.NewInternalError(internalErr) + etcd := testutils.EtcdBuilderWithDefaults(testutils.TestEtcdName, testutils.TestNamespace).WithReplicas(3).Build() + testCases := []struct { + name string + numETCDMembersWithTLSEnabled int + errors []testutils.ErrorsForGVK + expectedErr *apierrors.StatusError + expectedResult bool + }{ + { + name: "should return true when all of the members have peer TLS disabled", + numETCDMembersWithTLSEnabled: 0, + expectedResult: true, + }, + { + name: "should return false when one of three still have peer TLS enabled", + numETCDMembersWithTLSEnabled: 1, + expectedResult: false, + }, + { + name: "should return false when none of the members have peer TLS disabled", + numETCDMembersWithTLSEnabled: 3, + expectedResult: false, + }, + { + name: "should return error when client list call fails", + errors: []testutils.ErrorsForGVK{ + { + GVK: coordinationv1.SchemeGroupVersion.WithKind("Lease"), + ListErr: apiInternalErr, + }, + }, expectedErr: apiInternalErr, expectedResult: false, }, @@ -66,15 +131,11 @@ func TestIsPeerURLTLSEnabledForAllMembers(t *testing.T) { t.Run(tc.name, func(t *testing.T) { t.Parallel() var existingObjects []client.Object - for _, l := range createLeases(testutils.TestNamespace, testutils.TestEtcdName, etcdReplicas, tc.numETCDMembersWithTLSEnabled) { + for _, l := range createLeases(etcd, tc.numETCDMembersWithTLSEnabled) { existingObjects = append(existingObjects, l) } - cl := testutils.CreateTestFakeClientForAllObjectsInNamespace(nil, tc.listErr, testutils.TestNamespace, map[string]string{ - druidv1alpha1.LabelComponentKey: common.ComponentNameMemberLease, - druidv1alpha1.LabelPartOfKey: testutils.TestEtcdName, - druidv1alpha1.LabelManagedByKey: druidv1alpha1.LabelManagedByValue, - }, existingObjects...) - tlsEnabled, err := IsPeerURLTLSEnabledForMembers(context.Background(), cl, logger, testutils.TestNamespace, testutils.TestEtcdName, etcdReplicas) + cl := testutils.CreateTestFakeClientForObjectsInNamespaceWithGVK(tc.errors, testutils.TestNamespace, existingObjects...) 
+			tlsEnabled, err := IsPeerURLInSyncForAllMembers(context.Background(), cl, logger, etcd, etcd.Spec.Replicas)
 			if tc.expectedErr != nil {
 				g.Expect(err).To(Equal(tc.expectedErr))
 			} else {
@@ -84,15 +145,16 @@
 	}
 }
 
-func createLeases(namespace, etcdName string, numLease, withTLSEnabled int) []*coordinationv1.Lease {
-	leases := make([]*coordinationv1.Lease, 0, numLease)
+func createLeases(etcd *druidv1alpha1.Etcd, withTLSEnabled int) []*coordinationv1.Lease {
+	numLeases := int(etcd.Spec.Replicas)
+	leases := make([]*coordinationv1.Lease, 0, numLeases)
 	labels := map[string]string{
 		druidv1alpha1.LabelComponentKey: common.ComponentNameMemberLease,
-		druidv1alpha1.LabelPartOfKey:    etcdName,
+		druidv1alpha1.LabelPartOfKey:    etcd.Name,
 		druidv1alpha1.LabelManagedByKey: druidv1alpha1.LabelManagedByValue,
 	}
 	tlsEnabledCount := 0
-	for i := 0; i < numLease; i++ {
+	for i := 0; i < numLeases; i++ {
 		var annotations map[string]string
 		if tlsEnabledCount < withTLSEnabled {
 			annotations = map[string]string{
@@ -104,10 +166,11 @@
 		}
 		lease := &coordinationv1.Lease{
 			ObjectMeta: metav1.ObjectMeta{
-				Name:        fmt.Sprintf("%s-%d", etcdName, i),
-				Namespace:   namespace,
-				Annotations: annotations,
-				Labels:      labels,
+				Name:            fmt.Sprintf("%s-%d", etcd.Name, i),
+				Namespace:       etcd.Namespace,
+				Annotations:     annotations,
+				Labels:          labels,
+				OwnerReferences: []metav1.OwnerReference{druidv1alpha1.GetAsOwnerReference(etcd.ObjectMeta)},
 			},
 		}
 		leases = append(leases, lease)
diff --git a/internal/utils/statefulset.go b/internal/utils/statefulset.go
index 7cdcc7a56..b87f3234f 100644
--- a/internal/utils/statefulset.go
+++ b/internal/utils/statefulset.go
@@ -11,11 +11,13 @@ import (
 	"strings"
 
 	druidv1alpha1 "github.com/gardener/etcd-druid/api/v1alpha1"
+	"github.com/gardener/etcd-druid/internal/common"
 
 	appsv1 "k8s.io/api/apps/v1"
 	corev1 "k8s.io/api/core/v1"
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/util/sets"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 )
 
@@ -88,3 +90,33 @@ func FetchPVCWarningMessagesForStatefulSet(ctx context.Context, cl client.Client
 	}
 	return strings.TrimSpace(strings.Join(events, "; ")), pvcErr
 }
+
+var (
+	etcdTLSVolumeMountNames   = sets.New[string](common.VolumeNameEtcdCA, common.VolumeNameEtcdServerTLS, common.VolumeNameEtcdClientTLS, common.VolumeNameEtcdPeerCA, common.VolumeNameEtcdPeerServerTLS, common.VolumeNameBackupRestoreCA)
+	etcdbrTLSVolumeMountNames = sets.New[string](common.VolumeNameBackupRestoreServerTLS, common.VolumeNameEtcdCA, common.VolumeNameEtcdClientTLS)
+)
+
+// GetStatefulSetContainerTLSVolumeMounts returns a map of container name to TLS volume mounts for the given StatefulSet.
+func GetStatefulSetContainerTLSVolumeMounts(sts *appsv1.StatefulSet) map[string][]corev1.VolumeMount {
+	containerVolMounts := make(map[string][]corev1.VolumeMount, 2) // Each pod has two containers; init containers are not counted here.
+	for _, container := range sts.Spec.Template.Spec.Containers {
+		if _, ok := containerVolMounts[container.Name]; !ok {
+			// Assume 6 TLS volume mounts per container; if there are more in future, the slice will be grown by the Go runtime.
+			// A capacity is set up front to minimize the possibility of a resize, which is usually not cheap.
+ containerVolMounts[container.Name] = make([]corev1.VolumeMount, 0, 6) + } + containerVolMounts[container.Name] = append(containerVolMounts[container.Name], filterTLSVolumeMounts(container.Name, container.VolumeMounts)...) + } + return containerVolMounts +} + +func filterTLSVolumeMounts(containerName string, allVolumeMounts []corev1.VolumeMount) []corev1.VolumeMount { + filteredVolMounts := make([]corev1.VolumeMount, 0, len(allVolumeMounts)) + knownTLSVolMountNames := IfConditionOr(containerName == common.ContainerNameEtcd, etcdTLSVolumeMountNames, etcdbrTLSVolumeMountNames) + for _, volMount := range allVolumeMounts { + if knownTLSVolMountNames.Has(volMount.Name) { + filteredVolMounts = append(filteredVolMounts, volMount) + } + } + return filteredVolMounts +} diff --git a/internal/webhook/etcdcomponents/handler.go b/internal/webhook/etcdcomponents/handler.go index f57d9dcbc..641f3cff1 100644 --- a/internal/webhook/etcdcomponents/handler.go +++ b/internal/webhook/etcdcomponents/handler.go @@ -87,10 +87,10 @@ func (h *Handler) Handle(ctx context.Context, req admission.Request) admission.R return h.handleDelete(req, etcd) } - return h.handleUpdate(req, etcd) + return h.handleUpdate(req, etcd, partialObjMeta.ObjectMeta) } -func (h *Handler) handleUpdate(req admission.Request, etcd *druidv1alpha1.Etcd) admission.Response { +func (h *Handler) handleUpdate(req admission.Request, etcd *druidv1alpha1.Etcd, objMeta metav1.ObjectMeta) admission.Response { requestGK := util.GetGroupKindFromRequest(req) // Leases (member and snapshot) will be periodically updated by etcd members. @@ -106,7 +106,9 @@ func (h *Handler) handleUpdate(req admission.Request, etcd *druidv1alpha1.Etcd) if req.UserInfo.Username == h.config.ReconcilerServiceAccount { return admission.Allowed(fmt.Sprintf("ongoing reconciliation of Etcd %s by etcd-druid requires changes to resources", etcd.Name)) } - return admission.Denied(fmt.Sprintf("no external intervention allowed during ongoing reconciliation of Etcd %s by etcd-druid", etcd.Name)) + if objMeta.DeletionTimestamp == nil { + return admission.Denied(fmt.Sprintf("no external intervention allowed during ongoing reconciliation of Etcd %s by etcd-druid", etcd.Name)) + } } // allow exempt service accounts to make changes to resources, but only if the Etcd is not currently being reconciled. 
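
[Editor's note — illustration, not part of the patch: a self-contained sketch of how the new GetStatefulSetContainerTLSVolumeMounts helper behaves, assuming it lands as shown in the statefulset.go hunks above. The StatefulSet literal and mount paths below are assumptions for the example; only the names taken from the common package are significant:

    package utils_test

    import (
        "fmt"

        "github.com/gardener/etcd-druid/internal/common"
        "github.com/gardener/etcd-druid/internal/utils"
        appsv1 "k8s.io/api/apps/v1"
        corev1 "k8s.io/api/core/v1"
    )

    func ExampleGetStatefulSetContainerTLSVolumeMounts() {
        sts := &appsv1.StatefulSet{
            Spec: appsv1.StatefulSetSpec{
                Template: corev1.PodTemplateSpec{
                    Spec: corev1.PodSpec{
                        Containers: []corev1.Container{{
                            Name: common.ContainerNameEtcd,
                            VolumeMounts: []corev1.VolumeMount{
                                {Name: common.VolumeNameEtcdCA, MountPath: "/var/etcd/ssl/ca"}, // known TLS mount, kept
                                {Name: "etcd-data", MountPath: "/var/etcd/data"},               // not a TLS mount, filtered out
                            },
                        }},
                    },
                },
            },
        }
        // Only the etcd-ca mount survives filtering for the etcd container.
        tlsMounts := utils.GetStatefulSetContainerTLSVolumeMounts(sts)
        fmt.Println(len(tlsMounts[common.ContainerNameEtcd]))
        // Output: 1
    }

isStatefulSetTLSConfigInSync compares exactly this snapshot against the desired mounts computed from the Etcd spec, which is what lets the operator detect both enabling and disabling of peer or client TLS.]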
diff --git a/main.go b/main.go
index 12e8f748e..004c9f710 100644
--- a/main.go
+++ b/main.go
@@ -5,18 +5,14 @@
 package main
 
 import (
-	"github.com/gardener/etcd-druid/internal/utils"
-	"os"
-	"runtime"
-
 	druidmgr "github.com/gardener/etcd-druid/internal/manager"
+	"github.com/gardener/etcd-druid/internal/utils"
 	"github.com/gardener/etcd-druid/internal/version"
 	"github.com/go-logr/logr"
-	"go.uber.org/zap/zapcore"
-	"sigs.k8s.io/controller-runtime/pkg/log/zap"
-
 	flag "github.com/spf13/pflag"
 	_ "k8s.io/client-go/plugin/pkg/client/auth/gcp"
+	"os"
+	"runtime"
 	ctrl "sigs.k8s.io/controller-runtime"
 	// +kubebuilder:scaffold:imports
 )
@@ -25,26 +21,18 @@ var logger = ctrl.Log.WithName("druid")
 
 func main() {
 	ctx := ctrl.SetupSignalHandler()
-
 	ctrl.SetLogger(utils.MustNewLogger(false, utils.LogFormatJSON))
-
 	printVersionInfo()
-
-	mgrConfig := druidmgr.Config{}
-	if err := mgrConfig.InitFromFlags(flag.CommandLine); err != nil {
-		logger.Error(err, "failed to initialize from flags")
+	mgrConfig, err := initializeManagerConfig()
+	if err != nil {
+		logger.Error(err, "failed to initialize manager config")
 		os.Exit(1)
 	}
-
-	flag.Parse()
-
-	printFlags(logger)
-
-	if err := mgrConfig.Validate(); err != nil {
+	if err = mgrConfig.Validate(); err != nil {
 		logger.Error(err, "validation of manager config failed")
 		os.Exit(1)
 	}
-
 	mgr, err := druidmgr.InitializeManager(&mgrConfig)
 	if err != nil {
 		logger.Error(err, "failed to create druid controller manager")
@@ -60,26 +48,36 @@ func main() {
 	}
 }
 
+func initializeManagerConfig() (druidmgr.Config, error) {
+	mgrConfig := druidmgr.Config{}
+	cmdLineFlagSet := flag.CommandLine
+	// This is required when moving from v0.22 to v0.23, since some command line flags have been renamed in v0.23.
+	// If the entity that deploys druid needs to handle both versions, it can set both the old and the new flags.
+	// Older versions of druid will only act upon the older flags, ignoring the newer ones as unknown; newer versions of druid behave analogously with the older flags.
+	// Once the command line arguments have stabilised, this will no longer be needed.
+	cmdLineFlagSet.ParseErrorsWhitelist.UnknownFlags = true
+	if err := mgrConfig.InitFromFlags(cmdLineFlagSet); err != nil {
+		logger.Error(err, "failed to initialize from flags")
+		return druidmgr.Config{}, err
+	}
+
+	if err := cmdLineFlagSet.Parse(os.Args[1:]); err != nil {
+		logger.Error(err, "failed to parse command line flags")
+		return druidmgr.Config{}, err
+	}
+	printFlags(cmdLineFlagSet, logger)
+	return mgrConfig, nil
+}
+
 func printVersionInfo() {
 	logger.Info("Etcd-druid build information", "Etcd-druid Version", version.Version, "Git SHA", version.GitSHA)
 	logger.Info("Golang runtime information", "Version", runtime.Version(), "OS", runtime.GOOS, "Arch", runtime.GOARCH)
 }
 
-func printFlags(logger logr.Logger) {
+func printFlags(fs *flag.FlagSet, logger logr.Logger) {
 	var flagKVs []interface{}
-	flag.VisitAll(func(f *flag.Flag) {
+	fs.VisitAll(func(f *flag.Flag) {
 		flagKVs = append(flagKVs, f.Name, f.Value.String())
 	})
-
 	logger.Info("Running with flags", flagKVs...)
 }
-
-func buildDefaultLoggerOpts() []zap.Opts {
-	var opts []zap.Opts
-	opts = append(opts, zap.UseDevMode(false))
-	opts = append(opts, zap.JSONEncoder(func(encoderConfig *zapcore.EncoderConfig) {
-		encoderConfig.EncodeTime = zapcore.ISO8601TimeEncoder
-		encoderConfig.EncodeDuration = zapcore.StringDurationEncoder
-	}))
-	return opts
-}
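
[Editor's note — illustration, not part of the patch: a standalone sketch of the pflag behavior that initializeManagerConfig relies on. With ParseErrorsWhitelist.UnknownFlags set, flags that the binary does not register (for example, flags renamed between v0.22 and v0.23) are skipped instead of aborting startup. The flag names below are illustrative, not actual druid flags:

    package main

    import (
        "fmt"
        "time"

        flag "github.com/spf13/pflag"
    )

    func main() {
        fs := flag.NewFlagSet("druid", flag.ContinueOnError)
        fs.ParseErrorsWhitelist.UnknownFlags = true
        threshold := fs.Duration("member-notready-threshold", 5*time.Minute, "a flag known to this version")

        // "--old-renamed-flag" is unknown to this flag set; with the whitelist
        // enabled, Parse skips it instead of returning an error.
        err := fs.Parse([]string{"--member-notready-threshold=10m", "--old-renamed-flag=true"})
        fmt.Println(err, *threshold) // <nil> 10m0s
    }

Note that unknown flags are handled most predictably in the --flag=value form; a deployer passing both flag generations should prefer that form, so a bare unknown flag is not mistaken for taking the next argument as its value.]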