diff --git a/api/v1alpha1/etcd.go b/api/v1alpha1/etcd.go index b079f16a4..8d311e95e 100644 --- a/api/v1alpha1/etcd.go +++ b/api/v1alpha1/etcd.go @@ -194,6 +194,10 @@ type EtcdConfig struct { // Quota defines the etcd DB quota. // +optional Quota *resource.Quantity `json:"quota,omitempty"` + // SnapshotCount defines the number of applied Raft entries to hold in-memory before compaction. + // More info: https://etcd.io/docs/v3.4/op-guide/maintenance/#raft-log-retention + // +optional + SnapshotCount *int64 `json:"snapshotCount,omitempty"` // DefragmentationSchedule defines the cron standard schedule for defragmentation of etcd. // +optional DefragmentationSchedule *string `json:"defragmentationSchedule,omitempty"` diff --git a/api/v1alpha1/etcd_test.go b/api/v1alpha1/etcd_test.go index 3d3651b12..adcb2cd3d 100644 --- a/api/v1alpha1/etcd_test.go +++ b/api/v1alpha1/etcd_test.go @@ -118,10 +118,11 @@ func TestIsReconciliationInProgress(t *testing.T) { func createEtcd(name, namespace string) *Etcd { var ( - clientPort int32 = 2379 - serverPort int32 = 2380 - backupPort int32 = 8080 - metricLevel = Basic + clientPort int32 = 2379 + serverPort int32 = 2380 + backupPort int32 = 8080 + metricLevel = Basic + snapshotCount int64 = 75000 ) garbageCollectionPeriod := metav1.Duration{ @@ -238,10 +239,11 @@ func createEtcd(name, namespace string) *Etcd { "memory": resource.MustParse("1000Mi"), }, }, - ClientPort: &clientPort, - ServerPort: &serverPort, - ClientUrlTLS: clientTlsConfig, - PeerUrlTLS: peerTlsConfig, + ClientPort: &clientPort, + ServerPort: &serverPort, + SnapshotCount: &snapshotCount, + ClientUrlTLS: clientTlsConfig, + PeerUrlTLS: peerTlsConfig, }, }, } diff --git a/api/v1alpha1/helper.go b/api/v1alpha1/helper.go index a6bb27a3e..0bdaed5bd 100644 --- a/api/v1alpha1/helper.go +++ b/api/v1alpha1/helper.go @@ -31,7 +31,7 @@ func GetServiceAccountName(etcdObjMeta metav1.ObjectMeta) string { // GetConfigMapName returns the name of the configmap for the Etcd. 
func GetConfigMapName(etcdObjMeta metav1.ObjectMeta) string { - return fmt.Sprintf("etcd-bootstrap-%s", string(etcdObjMeta.UID[:6])) + return fmt.Sprintf("%s-config", etcdObjMeta.Name) } // GetCompactionJobName returns the compaction job name for the Etcd. diff --git a/api/v1alpha1/helper_test.go b/api/v1alpha1/helper_test.go index cc9776279..015b6bb74 100644 --- a/api/v1alpha1/helper_test.go +++ b/api/v1alpha1/helper_test.go @@ -54,7 +54,7 @@ func TestGetConfigMapName(t *testing.T) { uid := uuid.NewUUID() etcdObjMeta := createEtcdObjectMetadata(uid, nil, nil, false) configMapName := GetConfigMapName(etcdObjMeta) - g.Expect(configMapName).To(Equal("etcd-bootstrap-" + string(uid[:6]))) + g.Expect(configMapName).To(Equal(etcdObjMeta.Name + "-config")) } func TestGetCompactionJobName(t *testing.T) { diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index a94f34e03..72bc071d8 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -235,6 +235,11 @@ func (in *EtcdConfig) DeepCopyInto(out *EtcdConfig) { x := (*in).DeepCopy() *out = &x } + if in.SnapshotCount != nil { + in, out := &in.SnapshotCount, &out.SnapshotCount + *out = new(int64) + **out = **in + } if in.DefragmentationSchedule != nil { in, out := &in.DefragmentationSchedule, &out.DefragmentationSchedule *out = new(string) diff --git a/charts/druid/charts/crds/templates/crd-druid.gardener.cloud_etcds.yaml b/charts/druid/charts/crds/templates/crd-druid.gardener.cloud_etcds.yaml index 49b507783..d0c7d23cc 100644 --- a/charts/druid/charts/crds/templates/crd-druid.gardener.cloud_etcds.yaml +++ b/charts/druid/charts/crds/templates/crd-druid.gardener.cloud_etcds.yaml @@ -597,6 +597,12 @@ spec: serverPort: format: int32 type: integer + snapshotCount: + description: |- + SnapshotCount defines the number of applied Raft entries to hold in-memory before compaction. 
+ More info: https://etcd.io/docs/v3.4/op-guide/maintenance/#raft-log-retention + format: int64 + type: integer type: object labels: additionalProperties: diff --git a/config/crd/bases/crd-druid.gardener.cloud_etcds.yaml b/config/crd/bases/crd-druid.gardener.cloud_etcds.yaml index 49b507783..d0c7d23cc 100644 --- a/config/crd/bases/crd-druid.gardener.cloud_etcds.yaml +++ b/config/crd/bases/crd-druid.gardener.cloud_etcds.yaml @@ -597,6 +597,12 @@ spec: serverPort: format: int32 type: integer + snapshotCount: + description: |- + SnapshotCount defines the number of applied Raft entries to hold in-memory before compaction. + More info: https://etcd.io/docs/v3.4/op-guide/maintenance/#raft-log-retention + format: int64 + type: integer type: object labels: additionalProperties: diff --git a/docs/api-reference/etcd-druid-api.md b/docs/api-reference/etcd-druid-api.md index 14824674a..8bded3a8d 100644 --- a/docs/api-reference/etcd-druid-api.md +++ b/docs/api-reference/etcd-druid-api.md @@ -245,6 +245,7 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | | `quota` _[Quantity](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.29/#quantity-resource-api)_ | Quota defines the etcd DB quota. | | | +| `snapshotCount` _integer_ | SnapshotCount defines the number of applied Raft entries to hold in-memory before compaction.
More info: https://etcd.io/docs/v3.4/op-guide/maintenance/#raft-log-retention | | | | `defragmentationSchedule` _string_ | DefragmentationSchedule defines the cron standard schedule for defragmentation of etcd. | | | | `serverPort` _integer_ | | | | | `clientPort` _integer_ | | | | diff --git a/docs/proposals/03-scaling-up-an-etcd-cluster.md b/docs/proposals/03-scaling-up-an-etcd-cluster.md index 51af42f94..3de353360 100644 --- a/docs/proposals/03-scaling-up-an-etcd-cluster.md +++ b/docs/proposals/03-scaling-up-an-etcd-cluster.md @@ -24,7 +24,7 @@ Now, it is detected whether peer URL was TLS enabled or not for single node etcd - If peer URL was not TLS enabled then etcd-druid has to intervene and make sure peer URL should be TLS enabled first for the single node before marking the cluster for scale-up. ## Action taken by etcd-druid to enable the peerURL TLS -1. Etcd-druid will update the `etcd-bootstrap` config-map with new config like initial-cluster,initial-advertise-peer-urls etc. Backup-restore will detect this change and update the member lease annotation to `member.etcd.gardener.cloud/tls-enabled: "true"`. +1. Etcd-druid will update the `{etcd.Name}-config` config-map with new config like `initial-cluster`, `initial-advertise-peer-urls`, etc. Backup-restore will detect this change and update the member lease annotation to `member.etcd.gardener.cloud/tls-enabled: "true"`. 2. In case the peer URL TLS has been changed to `enabled`: Etcd-druid will add tasks to the deployment flow: - Check if peer TLS has been enabled for existing StatefulSet pods, by checking the member leases for the annotation `member.etcd.gardener.cloud/tls-enabled`. - If peer TLS enablement is pending for any of the members, then check and patch the StatefulSet with the peer TLS volume mounts, if not already patched. This will cause a rolling update of the existing StatefulSet pods, which allows etcd-backup-restore to update the member peer URL in the etcd cluster. 
diff --git a/docs/usage/recovering-etcd-clusters.md b/docs/usage/recovering-etcd-clusters.md index 27ffd285d..9d8f4a995 100644 --- a/docs/usage/recovering-etcd-clusters.md +++ b/docs/usage/recovering-etcd-clusters.md @@ -6,9 +6,9 @@ For a multi-node `Etcd` cluster quorum loss can either be `Transient` or `Perman ## Transient quorum loss -If quorum is lost through transient network failures (e.g. n/w partitions), spike in resource usage which results in OOM, `etcd` automatically and safely resumes (once the network recovers or the resource consumption has come down) and restores quorum. In other cases like transient power loss, etcd persists the Raft log to disk and replays the log to the point of failure and resumes cluster operation. +If quorum is lost through transient network failures (e.g. n/w partitions) or there is a spike in resource usage which results in OOM, `etcd` automatically and safely resumes (once the network recovers or the resource consumption has come down) and restores quorum. In other cases like transient power loss, etcd persists the Raft log to disk and replays the log to the point of failure and resumes cluster operation. -## Permanent quorum loss +## Permanent quorum loss In case the quorum is lost due to hardware failures or disk corruption etc, automatic recovery is no longer possible and it is categorized as a permanent quorum loss. @@ -43,6 +43,7 @@ Identify the etcd-cluster which has a permanent quorum loss. Most of the resourc To ensure that only one actor (in this case an operator) makes changes to the `Etcd` resource and also to the `Etcd` cluster resources, following must be done: Add the annotation to the `Etcd` resource: + ```bash kubectl annotate etcd -n druid.gardener.cloud/suspend-etcd-spec-reconcile= ``` @@ -74,6 +75,7 @@ kubectl delete pvc -l instance= -n For a `n` member `Etcd` cluster there should be `n` member `Lease` objects. The lease names should start with the `Etcd` name. 
Example leases for a 3 node `Etcd` cluster: + ```b NAME HOLDER AGE -0 4c37667312a3912b:Member 1m @@ -82,6 +84,7 @@ Example leases for a 3 node `Etcd` cluster: ``` Delete all the member leases. + ```bash kubectl delete lease # Alternatively you can use label selector. From v0.23.0 onwards leases will have common set of labels @@ -90,18 +93,66 @@ kubectl delete lease -l app.kubernetes.io.component=etcd-member-lease, app.kuber #### 05-Modify ConfigMap -Prerequisite to scale up etcd-cluster from 0->1 is to change `initial-cluster` in the ConfigMap. Assuming that prior to scale-down to 0, there were 3 members, the `initial-cluster` field would look like the following (assuming that the name of the etcd resource is `etcd-main`): +Prerequisite to scale up etcd-cluster from 0->1 is to change the fields `initial-cluster`, `initial-advertise-peer-urls`, and `advertise-client-urls` in the ConfigMap. + +Assuming that prior to scale-down to 0, there were 3 members: + +The `initial-cluster` field would look like the following (assuming that the name of the etcd resource is `etcd-main`): + ```yaml # Initial cluster initial-cluster: etcd-main-0=https://etcd-main-0.etcd-main-peer.default.svc:2380,etcd-main-1=https://etcd-main-1.etcd-main-peer.default.svc:2380,etcd-main-2=https://etcd-main-2.etcd-main-peer.default.svc:2380 ``` -Change the `initial-cluster` field to have only one member (in this case `etc-main-0`). After the change it should look like: -```bash +Change the `initial-cluster` field to have only one member (in this case `etcd-main-0`). 
After the change it should look like: + +```yaml # Initial cluster initial-cluster: etcd-main-0=https://etcd-main-0.etcd-main-peer.default.svc:2380 ``` +The `initial-advertise-peer-urls` field would look like the following: + +```yaml +# Initial advertise peer urls +initial-advertise-peer-urls: + etcd-main-0: + - http://etcd-main-0.etcd-main-peer.default.svc:2380 + etcd-main-1: + - http://etcd-main-1.etcd-main-peer.default.svc:2380 + etcd-main-2: + - http://etcd-main-2.etcd-main-peer.default.svc:2380 +``` + +Change the `initial-advertise-peer-urls` field to have only one member (in this case `etcd-main-0`). After the change it should look like: + +```yaml +# Initial advertise peer urls +initial-advertise-peer-urls: + etcd-main-0: + - http://etcd-main-0.etcd-main-peer.default.svc:2380 +``` + +The `advertise-client-urls` field would look like the following: + +```yaml +advertise-client-urls: + etcd-main-0: + - http://etcd-main-0.etcd-main-peer.default.svc:2379 + etcd-main-1: + - http://etcd-main-1.etcd-main-peer.default.svc:2379 + etcd-main-2: + - http://etcd-main-2.etcd-main-peer.default.svc:2379 +``` + +Change the `advertise-client-urls` field to have only one member (in this case `etcd-main-0`). After the change it should look like: + +```yaml +advertise-client-urls: + etcd-main-0: + - http://etcd-main-0.etcd-main-peer.default.svc:2379 +``` + #### 06-Scale up Etcd cluster to size 1 ```bash @@ -111,6 +162,7 @@ kubectl scale sts -n --replicas=1 #### 07-Wait for Single-Member etcd cluster to be completely ready To check if the `single-member` etcd cluster is ready check the status of the pod. + ```bash kubectl get pods -n NAME READY STATUS RESTARTS AGE @@ -122,6 +174,7 @@ If both containers report readiness (as seen above), then the etcd-cluster is co #### 08-Enable Etcd reconciliation and resource protection All manual changes are now done. 
We must now re-enable etcd-cluster resource protection and also enable reconciliation by etcd-druid by doing the following: + ```bash kubectl annotate etcd -n druid.gardener.cloud/suspend-etcd-spec-reconcile- kubectl annotate etcd -n druid.gardener.cloud/disable-etcd-component-protection- @@ -136,8 +189,9 @@ kubectl scale sts -n namespace --replicas=3 ``` If etcd-druid has been set up with `--enable-etcd-spec-auto-reconcile` switched-off then to ensure reconciliation one must annotate `Etcd` resource with the following command: + ```bash -# Annotate etcd-test CR to reconcile +# Annotate etcd CR to reconcile kubectl annotate etcd -n gardener.cloud/operation="reconcile" ``` @@ -154,6 +208,7 @@ NAME READY STATUS RESTARTS AGE ``` Additionally, check if the `Etcd` CR is ready: + ```bash kubectl get etcd -n NAME READY AGE @@ -161,14 +216,10 @@ NAME READY AGE ``` Check member leases, whose `holderIdentity` should reflect the member role. Check if all members are voting members (their role should either be `Member` or `Leader`). Monitor the leases for some time and check if the leases are getting updated. You can monitor the `AGE` field. 
+ ```bash NAME HOLDER AGE -0 4c37667312a3912b:Member 1m -1 75a9b74cfd3077cc:Member 1m -2 c62ee6af755e890d:Leader 1m ``` - - - - - diff --git a/internal/component/configmap/configmap_test.go b/internal/component/configmap/configmap_test.go index 3fa558a65..be1408203 100644 --- a/internal/component/configmap/configmap_test.go +++ b/internal/component/configmap/configmap_test.go @@ -7,7 +7,6 @@ package configmap import ( "context" "fmt" - "strconv" "testing" druidv1alpha1 "github.com/gardener/etcd-druid/api/v1alpha1" @@ -306,12 +305,6 @@ func newConfigMap(g *WithT, etcd *druidv1alpha1.Etcd) *corev1.ConfigMap { return cm } -func ensureConfigMapExists(g *WithT, cl client.WithWatch, etcd *druidv1alpha1.Etcd) { - cm, err := getLatestConfigMap(cl, etcd) - g.Expect(err).ToNot(HaveOccurred()) - g.Expect(cm).ToNot(BeNil()) -} - func getLatestConfigMap(cl client.Client, etcd *druidv1alpha1.Etcd) (*corev1.ConfigMap, error) { cm := &corev1.ConfigMap{} err := cl.Get(context.Background(), client.ObjectKey{Name: druidv1alpha1.GetConfigMapName(etcd.ObjectMeta), Namespace: etcd.Namespace}, cm) @@ -341,10 +334,10 @@ func matchConfigMap(g *WithT, etcd *druidv1alpha1.Etcd, actualConfigMap corev1.C err := yaml.Unmarshal([]byte(actualETCDConfigYAML), &actualETCDConfig) g.Expect(err).ToNot(HaveOccurred()) g.Expect(actualETCDConfig).To(MatchKeys(IgnoreExtras|IgnoreMissing, Keys{ - "name": Equal(fmt.Sprintf("etcd-%s", etcd.UID[:6])), + "name": Equal("etcd-config"), "data-dir": Equal(fmt.Sprintf("%s/new.etcd", common.VolumeMountPathEtcdData)), "metrics": Equal(string(druidv1alpha1.Basic)), - "snapshot-count": Equal(int64(75000)), + "snapshot-count": Equal(ptr.Deref(etcd.Spec.Etcd.SnapshotCount, defaultSnapshotCount)), "enable-v2": Equal(false), "quota-backend-bytes": Equal(etcd.Spec.Etcd.Quota.Value()), "initial-cluster-token": Equal("etcd-cluster"), @@ -360,7 +353,7 @@ func matchClientTLSRelatedConfiguration(g *WithT, etcd *druidv1alpha1.Etcd, actu if etcd.Spec.Etcd.ClientUrlTLS != nil { 
g.Expect(actualETCDConfig).To(MatchKeys(IgnoreExtras|IgnoreMissing, Keys{ "listen-client-urls": Equal(fmt.Sprintf("https://0.0.0.0:%d", ptr.Deref(etcd.Spec.Etcd.ClientPort, common.DefaultPortEtcdClient))), - "advertise-client-urls": Equal(fmt.Sprintf("https@%s@%s@%d", druidv1alpha1.GetPeerServiceName(etcd.ObjectMeta), etcd.Namespace, ptr.Deref(etcd.Spec.Etcd.ClientPort, common.DefaultPortEtcdClient))), + "advertise-client-urls": Equal(expectedAdvertiseURLsAsInterface(etcd, advertiseURLTypeClient, "https")), "client-transport-security": MatchKeys(IgnoreExtras, Keys{ "cert-file": Equal("/var/etcd/ssl/server/tls.crt"), "key-file": Equal("/var/etcd/ssl/server/tls.key"), @@ -377,8 +370,38 @@ func matchClientTLSRelatedConfiguration(g *WithT, etcd *druidv1alpha1.Etcd, actu } } +func expectedAdvertiseURLs(etcd *druidv1alpha1.Etcd, advertiseURLType, scheme string) map[string][]string { + var port int32 + switch advertiseURLType { + case advertiseURLTypePeer: + port = ptr.Deref(etcd.Spec.Etcd.ServerPort, common.DefaultPortEtcdPeer) + case advertiseURLTypeClient: + port = ptr.Deref(etcd.Spec.Etcd.ClientPort, common.DefaultPortEtcdClient) + default: + return nil + } + advUrlsMap := make(map[string][]string) + for i := 0; i < int(etcd.Spec.Replicas); i++ { + podName := druidv1alpha1.GetOrdinalPodName(etcd.ObjectMeta, i) + advUrlsMap[podName] = []string{fmt.Sprintf("%s://%s.%s.%s.svc:%d", scheme, podName, druidv1alpha1.GetPeerServiceName(etcd.ObjectMeta), etcd.Namespace, port)} + } + return advUrlsMap +} + +func expectedAdvertiseURLsAsInterface(etcd *druidv1alpha1.Etcd, advertiseURLType, scheme string) map[string]interface{} { + advertiseUrlsMap := expectedAdvertiseURLs(etcd, advertiseURLType, scheme) + advertiseUrlsInterface := make(map[string]interface{}, len(advertiseUrlsMap)) + for podName, urlList := range advertiseUrlsMap { + urlsListInterface := make([]interface{}, len(urlList)) + for i, url := range urlList { + urlsListInterface[i] = url + } + 
advertiseUrlsInterface[podName] = urlsListInterface + } + return advertiseUrlsInterface +} + func matchPeerTLSRelatedConfiguration(g *WithT, etcd *druidv1alpha1.Etcd, actualETCDConfig map[string]interface{}) { - peerSvcName := druidv1alpha1.GetPeerServiceName(etcd.ObjectMeta) if etcd.Spec.Etcd.PeerUrlTLS != nil { g.Expect(actualETCDConfig).To(MatchKeys(IgnoreExtras|IgnoreMissing, Keys{ "peer-transport-security": MatchKeys(IgnoreExtras, Keys{ @@ -389,12 +412,12 @@ func matchPeerTLSRelatedConfiguration(g *WithT, etcd *druidv1alpha1.Etcd, actual "auto-tls": Equal(false), }), "listen-peer-urls": Equal(fmt.Sprintf("https://0.0.0.0:%d", ptr.Deref(etcd.Spec.Etcd.ServerPort, common.DefaultPortEtcdPeer))), - "initial-advertise-peer-urls": Equal(fmt.Sprintf("https@%s@%s@%s", peerSvcName, etcd.Namespace, strconv.Itoa(int(ptr.Deref(etcd.Spec.Etcd.ServerPort, common.DefaultPortEtcdPeer))))), + "initial-advertise-peer-urls": Equal(expectedAdvertiseURLsAsInterface(etcd, advertiseURLTypePeer, "https")), })) } else { g.Expect(actualETCDConfig).To(MatchKeys(IgnoreExtras|IgnoreMissing, Keys{ "listen-peer-urls": Equal(fmt.Sprintf("http://0.0.0.0:%d", ptr.Deref(etcd.Spec.Etcd.ServerPort, common.DefaultPortEtcdPeer))), - "initial-advertise-peer-urls": Equal(fmt.Sprintf("http@%s@%s@%s", peerSvcName, etcd.Namespace, strconv.Itoa(int(ptr.Deref(etcd.Spec.Etcd.ServerPort, common.DefaultPortEtcdPeer))))), + "initial-advertise-peer-urls": Equal(expectedAdvertiseURLsAsInterface(etcd, advertiseURLTypePeer, "http")), })) g.Expect(actualETCDConfig).ToNot(HaveKey("peer-transport-security")) } diff --git a/internal/component/configmap/etcdconfig.go b/internal/component/configmap/etcdconfig.go index de9c8409a..d15caa3a7 100644 --- a/internal/component/configmap/etcdconfig.go +++ b/internal/component/configmap/etcdconfig.go @@ -22,27 +22,20 @@ const ( defaultInitialClusterToken = "etcd-cluster" defaultInitialClusterState = "new" // For more information refer to 
https://etcd.io/docs/v3.4/op-guide/maintenance/#raft-log-retention - // TODO: Ideally this should be made configurable via Etcd resource as this has a direct impact on the memory requirements for etcd container. - // which in turn is influenced by the size of objects that are getting stored in etcd. - defaultSnapshotCount = 75000 + defaultSnapshotCount = int64(75000) + advertiseURLTypePeer = "peer" + advertiseURLTypeClient = "client" ) var ( defaultDataDir = fmt.Sprintf("%s/new.etcd", common.VolumeMountPathEtcdData) ) -type tlsTarget string - -const ( - clientTLS tlsTarget = "client" - peerTLS tlsTarget = "peer" -) - type etcdConfig struct { Name string `yaml:"name"` DataDir string `yaml:"data-dir"` Metrics druidv1alpha1.MetricsLevel `yaml:"metrics"` - SnapshotCount int `yaml:"snapshot-count"` + SnapshotCount int64 `yaml:"snapshot-count"` EnableV2 bool `yaml:"enable-v2"` QuotaBackendBytes int64 `yaml:"quota-backend-bytes"` InitialClusterToken string `yaml:"initial-cluster-token"` @@ -52,8 +45,8 @@ type etcdConfig struct { AutoCompactionRetention string `yaml:"auto-compaction-retention"` ListenPeerUrls string `yaml:"listen-peer-urls"` ListenClientUrls string `yaml:"listen-client-urls"` - AdvertisePeerUrls string `yaml:"initial-advertise-peer-urls"` - AdvertiseClientUrls string `yaml:"advertise-client-urls"` + AdvertisePeerUrls map[string][]string `yaml:"initial-advertise-peer-urls"` + AdvertiseClientUrls map[string][]string `yaml:"advertise-client-urls"` ClientSecurity securityConfig `yaml:"client-transport-security,omitempty"` PeerSecurity securityConfig `yaml:"peer-transport-security,omitempty"` } @@ -71,10 +64,10 @@ func createEtcdConfig(etcd *druidv1alpha1.Etcd) *etcdConfig { peerScheme, peerSecurityConfig := getSchemeAndSecurityConfig(etcd.Spec.Etcd.PeerUrlTLS, common.VolumeMountPathEtcdPeerCA, common.VolumeMountPathEtcdPeerServerTLS) peerSvcName := druidv1alpha1.GetPeerServiceName(etcd.ObjectMeta) cfg := &etcdConfig{ - Name: fmt.Sprintf("etcd-%s", 
etcd.UID[:6]), + Name: "etcd-config", DataDir: defaultDataDir, Metrics: ptr.Deref(etcd.Spec.Etcd.Metrics, druidv1alpha1.Basic), - SnapshotCount: defaultSnapshotCount, + SnapshotCount: getSnapshotCount(etcd), EnableV2: false, QuotaBackendBytes: getDBQuotaBytes(etcd), InitialClusterToken: defaultInitialClusterToken, @@ -84,8 +77,8 @@ func createEtcdConfig(etcd *druidv1alpha1.Etcd) *etcdConfig { AutoCompactionRetention: ptr.Deref(etcd.Spec.Common.AutoCompactionRetention, defaultAutoCompactionRetention), ListenPeerUrls: fmt.Sprintf("%s://0.0.0.0:%d", peerScheme, ptr.Deref(etcd.Spec.Etcd.ServerPort, common.DefaultPortEtcdPeer)), ListenClientUrls: fmt.Sprintf("%s://0.0.0.0:%d", clientScheme, ptr.Deref(etcd.Spec.Etcd.ClientPort, common.DefaultPortEtcdClient)), - AdvertisePeerUrls: fmt.Sprintf("%s@%s@%s@%d", peerScheme, peerSvcName, etcd.Namespace, ptr.Deref(etcd.Spec.Etcd.ServerPort, common.DefaultPortEtcdPeer)), - AdvertiseClientUrls: fmt.Sprintf("%s@%s@%s@%d", clientScheme, peerSvcName, etcd.Namespace, ptr.Deref(etcd.Spec.Etcd.ClientPort, common.DefaultPortEtcdClient)), + AdvertisePeerUrls: getAdvertiseURLs(etcd, advertiseURLTypePeer, peerScheme, peerSvcName), + AdvertiseClientUrls: getAdvertiseURLs(etcd, advertiseURLTypeClient, clientScheme, peerSvcName), } if peerSecurityConfig != nil { cfg.PeerSecurity = *peerSecurityConfig @@ -97,12 +90,18 @@ func createEtcdConfig(etcd *druidv1alpha1.Etcd) *etcdConfig { return cfg } +func getSnapshotCount(etcd *druidv1alpha1.Etcd) int64 { + if etcd.Spec.Etcd.SnapshotCount != nil { + return *etcd.Spec.Etcd.SnapshotCount + } + return defaultSnapshotCount +} + func getDBQuotaBytes(etcd *druidv1alpha1.Etcd) int64 { - dbQuotaBytes := defaultDBQuotaBytes if etcd.Spec.Etcd.Quota != nil { - dbQuotaBytes = etcd.Spec.Etcd.Quota.Value() + return etcd.Spec.Etcd.Quota.Value() } - return dbQuotaBytes + return defaultDBQuotaBytes } func getSchemeAndSecurityConfig(tlsConfig *druidv1alpha1.TLSConfig, caPath, serverTLSPath string) (string, 
*securityConfig) { @@ -129,3 +128,21 @@ func prepareInitialCluster(etcd *druidv1alpha1.Etcd, peerScheme string) string { } return strings.Trim(builder.String(), ",") } + +func getAdvertiseURLs(etcd *druidv1alpha1.Etcd, advertiseURLType, scheme, peerSvcName string) map[string][]string { + var port int32 + switch advertiseURLType { + case advertiseURLTypePeer: + port = ptr.Deref(etcd.Spec.Etcd.ServerPort, common.DefaultPortEtcdPeer) + case advertiseURLTypeClient: + port = ptr.Deref(etcd.Spec.Etcd.ClientPort, common.DefaultPortEtcdClient) + default: + return nil + } + advUrlsMap := make(map[string][]string) + for i := 0; i < int(etcd.Spec.Replicas); i++ { + podName := druidv1alpha1.GetOrdinalPodName(etcd.ObjectMeta, i) + advUrlsMap[podName] = []string{fmt.Sprintf("%s://%s.%s.%s.svc:%d", scheme, podName, peerSvcName, etcd.Namespace, port)} + } + return advUrlsMap +} diff --git a/test/e2e/etcd_backup_test.go b/test/e2e/etcd_backup_test.go index a4a2727bf..6e4956da5 100644 --- a/test/e2e/etcd_backup_test.go +++ b/test/e2e/etcd_backup_test.go @@ -247,7 +247,7 @@ func checkEtcdReady(ctx context.Context, cl client.Client, logger logr.Logger, e logger.Info("Checking configmap") cm := &corev1.ConfigMap{} - ExpectWithOffset(2, cl.Get(ctx, client.ObjectKey{Name: "etcd-bootstrap-" + string(etcd.UID[:6]), Namespace: etcd.Namespace}, cm)).To(Succeed()) + ExpectWithOffset(2, cl.Get(ctx, client.ObjectKey{Name: etcd.Name + "-config", Namespace: etcd.Namespace}, cm)).To(Succeed()) logger.Info("Checking client service") svc := &corev1.Service{} @@ -280,7 +280,7 @@ func deleteAndCheckEtcd(ctx context.Context, cl client.Client, logger logr.Logge ExpectWithOffset(1, cl.Get( ctx, - client.ObjectKey{Name: "etcd-bootstrap-" + string(etcd.UID[:6]), Namespace: etcd.Namespace}, + client.ObjectKey{Name: etcd.Name + "-config", Namespace: etcd.Namespace}, &corev1.ConfigMap{}, ), ).Should(matchers.BeNotFoundError()) diff --git a/test/e2e/utils.go b/test/e2e/utils.go index f24aafb02..56841b207 
100644 --- a/test/e2e/utils.go +++ b/test/e2e/utils.go @@ -105,8 +105,9 @@ var ( "memory": resource.MustParse("256Mi"), }, } - etcdClientPort = int32(2379) - etcdServerPort = int32(2380) + etcdClientPort = int32(2379) + etcdServerPort = int32(2380) + etcdSnapshotCount = int64(75000) backupPort = int32(8080) backupFullSnapshotSchedule = "0 */1 * * *" @@ -182,6 +183,7 @@ func getDefaultEtcd(name, namespace, container, prefix string, provider TestProv Resources: &etcdResources, ClientPort: &etcdClientPort, ServerPort: &etcdServerPort, + SnapshotCount: &etcdSnapshotCount, ClientUrlTLS: &etcdTLS, } diff --git a/test/it/setup/setup.go b/test/it/setup/setup.go index b190b6802..b94d94d3b 100644 --- a/test/it/setup/setup.go +++ b/test/it/setup/setup.go @@ -20,7 +20,7 @@ import ( k8sruntime "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/rest" - "k8s.io/utils/pointer" + "k8s.io/utils/ptr" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/envtest" @@ -168,7 +168,7 @@ func (t *itTestEnv) startTestEnvironment(crdDirectoryPaths []string) error { CRDDirectoryPaths: crdDirectoryPaths, } if useExistingK8SCluster() { - testEnv.UseExistingCluster = pointer.Bool(true) + testEnv.UseExistingCluster = ptr.To(true) } cfg, err := testEnv.Start() diff --git a/test/utils/etcd.go b/test/utils/etcd.go index fbfd2a752..b2c2658f6 100644 --- a/test/utils/etcd.go +++ b/test/utils/etcd.go @@ -40,6 +40,7 @@ var ( deltaSnapShotMemLimit = resource.MustParse("100Mi") autoCompactionMode = druidv1alpha1.Periodic autoCompactionRetention = "2m" + snapshotCount = int64(75000) quota = resource.MustParse("8Gi") localProvider = druidv1alpha1.StorageProvider("Local") prefix = "/tmp" @@ -387,6 +388,7 @@ func getDefaultEtcd(name, namespace string) *druidv1alpha1.Etcd { Backup: getBackupSpec(), Etcd: druidv1alpha1.EtcdConfig{ Quota: "a, + SnapshotCount: &snapshotCount, Metrics: &metricsBasic, Image: 
&imageEtcd, DefragmentationSchedule: &defragSchedule,