From 95dd05c3361f530b9c357cbd7d2b0f63197b6f8c Mon Sep 17 00:00:00 2001 From: Ewout Prangsma Date: Sat, 25 Aug 2018 12:08:44 +0200 Subject: [PATCH 01/17] Resilience improvements --- pkg/deployment/deployment_inspector.go | 5 ++- pkg/deployment/resources/pod_finalizers.go | 24 +++++++++--- pkg/deployment/resources/pod_inspector.go | 22 +++++++---- pkg/deployment/resources/pvc_finalizers.go | 4 +- pkg/util/duration.go | 43 ++++++++++++++++++++++ tests/resilience_test.go | 31 ++++++++++++---- 6 files changed, 105 insertions(+), 24 deletions(-) create mode 100644 pkg/util/duration.go diff --git a/pkg/deployment/deployment_inspector.go b/pkg/deployment/deployment_inspector.go index 13557009f..441ae26e1 100644 --- a/pkg/deployment/deployment_inspector.go +++ b/pkg/deployment/deployment_inspector.go @@ -27,6 +27,7 @@ import ( "time" api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha" + "github.com/arangodb/kube-arangodb/pkg/util" "github.com/arangodb/kube-arangodb/pkg/util/k8sutil" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) @@ -87,9 +88,11 @@ func (d *Deployment) inspectDeployment(lastInterval time.Duration) time.Duration } // Inspection of generated resources needed - if err := d.resources.InspectPods(ctx); err != nil { + if x, err := d.resources.InspectPods(ctx); err != nil { hasError = true d.CreateEvent(k8sutil.NewErrorEvent("Pod inspection failed", err, d.apiObject)) + } else { + nextInterval = util.MinDuration(nextInterval, x) } if err := d.resources.InspectPVCs(ctx); err != nil { hasError = true diff --git a/pkg/deployment/resources/pod_finalizers.go b/pkg/deployment/resources/pod_finalizers.go index 4cc8f90e5..f94b281c3 100644 --- a/pkg/deployment/resources/pod_finalizers.go +++ b/pkg/deployment/resources/pod_finalizers.go @@ -37,8 +37,13 @@ import ( "github.com/arangodb/kube-arangodb/pkg/util/k8sutil" ) +const ( + recheckPodFinalizerInterval = time.Second * 10 +) + // runPodFinalizers goes through the list of pod finalizers to see if they can be removed. 
-func (r *Resources) runPodFinalizers(ctx context.Context, p *v1.Pod, memberStatus api.MemberStatus, updateMember func(api.MemberStatus) error) error { +// Returns: Interval_till_next_inspection, error +func (r *Resources) runPodFinalizers(ctx context.Context, p *v1.Pod, memberStatus api.MemberStatus, updateMember func(api.MemberStatus) error) (time.Duration, error) { log := r.log.With().Str("pod-name", p.GetName()).Logger() var removalList []string for _, f := range p.ObjectMeta.GetFinalizers() { @@ -55,7 +60,7 @@ func (r *Resources) runPodFinalizers(ctx context.Context, p *v1.Pod, memberStatu if err := r.inspectFinalizerPodDrainDBServer(ctx, log, p, memberStatus, updateMember); err == nil { removalList = append(removalList, f) } else { - log.Debug().Err(err).Str("finalizer", f).Msg("Cannot remove finalizer yet") + log.Debug().Err(err).Str("finalizer", f).Msg("Cannot remove Pod finalizer yet") } } } @@ -65,10 +70,15 @@ func (r *Resources) runPodFinalizers(ctx context.Context, p *v1.Pod, memberStatu ignoreNotFound := false if err := k8sutil.RemovePodFinalizers(log, kubecli, p, removalList, ignoreNotFound); err != nil { log.Debug().Err(err).Msg("Failed to update pod (to remove finalizers)") - return maskAny(err) + return 0, maskAny(err) + } else { + log.Debug().Strs("finalizers", removalList).Msg("Removed finalizer(s) from Pod") } + } else { + // Check again at given interval + return recheckPodFinalizerInterval, nil } - return nil + return maxPodInspectorInterval, nil } // inspectFinalizerPodAgencyServing checks the finalizer condition for agency-serving. @@ -131,14 +141,16 @@ func (r *Resources) inspectFinalizerPodAgencyServing(ctx context.Context, log ze return maskAny(fmt.Errorf("No more remaining agents")) } if err := agency.AreAgentsHealthy(ctx, agencyConns); err != nil { - log.Debug().Err(err).Msg("Remaining agents are not health") + log.Debug().Err(err).Msg("Remaining agents are not healthy") return maskAny(err) } // Remaining agents are healthy, we can remove this one and trigger a delete of the PVC - if err := pvcs.Delete(memberStatus.PersistentVolumeClaimName, &metav1.DeleteOptions{}); err != nil { + if err := pvcs.Delete(memberStatus.PersistentVolumeClaimName, &metav1.DeleteOptions{}); err != nil && !k8sutil.IsNotFound(err) { log.Warn().Err(err).Msg("Failed to delete PVC for member") return maskAny(err) + } else { + log.Debug().Str("pvc-name", memberStatus.PersistentVolumeClaimName).Msg("Removed PVC of member so agency can be completely replaced") } return nil diff --git a/pkg/deployment/resources/pod_inspector.go b/pkg/deployment/resources/pod_inspector.go index d55235c73..972bc85c5 100644 --- a/pkg/deployment/resources/pod_inspector.go +++ b/pkg/deployment/resources/pod_inspector.go @@ -31,6 +31,7 @@ import ( api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha" "github.com/arangodb/kube-arangodb/pkg/metrics" + "github.com/arangodb/kube-arangodb/pkg/util" "github.com/arangodb/kube-arangodb/pkg/util/k8sutil" ) @@ -39,19 +40,22 @@ var ( ) const ( - podScheduleTimeout = time.Minute // How long we allow the schedule to take scheduling a pod. + podScheduleTimeout = time.Minute // How long we allow the schedule to take scheduling a pod. + maxPodInspectorInterval = time.Hour ) // InspectPods lists all pods that belong to the given deployment and updates // the member status of the deployment accordingly. 
-func (r *Resources) InspectPods(ctx context.Context) error { +// Returns: Interval_till_next_inspection, error +func (r *Resources) InspectPods(ctx context.Context) (time.Duration, error) { log := r.log var events []*k8sutil.Event + nextInterval := maxPodInspectorInterval // Large by default, will be made smaller if needed in the rest of the function pods, err := r.context.GetOwnedPods() if err != nil { log.Debug().Err(err).Msg("Failed to get owned pods") - return maskAny(err) + return 0, maskAny(err) } // Update member status from all pods found @@ -80,7 +84,7 @@ func (r *Resources) InspectPods(ctx context.Context) error { ignoreNotFound := false if err := k8sutil.RemovePodFinalizers(log, kubecli, &p, p.GetFinalizers(), ignoreNotFound); err != nil { log.Debug().Err(err).Msg("Failed to update pod (to remove all finalizers)") - return maskAny(err) + return 0, maskAny(err) } } continue @@ -136,18 +140,20 @@ func (r *Resources) InspectPods(ctx context.Context) error { } if k8sutil.IsPodMarkedForDeletion(&p) { // Process finalizers - if err := r.runPodFinalizers(ctx, &p, memberStatus, func(m api.MemberStatus) error { + if x, err := r.runPodFinalizers(ctx, &p, memberStatus, func(m api.MemberStatus) error { updateMemberStatusNeeded = true memberStatus = m return nil }); err != nil { // Only log here, since we'll be called to try again. log.Warn().Err(err).Msg("Failed to run pod finalizers") + } else { + nextInterval = util.MinDuration(nextInterval, x) } } if updateMemberStatusNeeded { if err := status.Members.Update(memberStatus, group); err != nil { - return maskAny(err) + return 0, maskAny(err) } } } @@ -238,14 +244,14 @@ func (r *Resources) InspectPods(ctx context.Context) error { // Save status if err := r.context.UpdateStatus(status, lastVersion); err != nil { - return maskAny(err) + return 0, maskAny(err) } // Create events for _, evt := range events { r.context.CreateEvent(evt) } - return nil + return nextInterval, nil } // GetExpectedPodArguments creates command line arguments for a server in the given group with given ID. diff --git a/pkg/deployment/resources/pvc_finalizers.go b/pkg/deployment/resources/pvc_finalizers.go index fd01bbe6a..33aa238ed 100644 --- a/pkg/deployment/resources/pvc_finalizers.go +++ b/pkg/deployment/resources/pvc_finalizers.go @@ -46,7 +46,7 @@ func (r *Resources) runPVCFinalizers(ctx context.Context, p *v1.PersistentVolume if err := r.inspectFinalizerPVCMemberExists(ctx, log, p, group, memberStatus); err == nil { removalList = append(removalList, f) } else { - log.Debug().Err(err).Str("finalizer", f).Msg("Cannot remove finalizer yet") + log.Debug().Err(err).Str("finalizer", f).Msg("Cannot remove PVC finalizer yet") } } } @@ -57,6 +57,8 @@ func (r *Resources) runPVCFinalizers(ctx context.Context, p *v1.PersistentVolume if err := k8sutil.RemovePVCFinalizers(log, kubecli, p, removalList, ignoreNotFound); err != nil { log.Debug().Err(err).Msg("Failed to update PVC (to remove finalizers)") return maskAny(err) + } else { + log.Debug().Strs("finalizers", removalList).Msg("Removed finalizer(s) from PVC") } } return nil diff --git a/pkg/util/duration.go b/pkg/util/duration.go new file mode 100644 index 000000000..38e0442b9 --- /dev/null +++ b/pkg/util/duration.go @@ -0,0 +1,43 @@ +// +// DISCLAIMER +// +// Copyright 2018 ArangoDB GmbH, Cologne, Germany +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright holder is ArangoDB GmbH, Cologne, Germany +// +// Author Ewout Prangsma +// + +package util + +import ( + "time" +) + +// MaxDuration returns the largest of the given durations +func MaxDuration(a, b time.Duration) time.Duration { + if a > b { + return a + } + return b +} + +// MinDuration returns the smallest of the given durations +func MinDuration(a, b time.Duration) time.Duration { + if a < b { + return a + } + return b +} diff --git a/tests/resilience_test.go b/tests/resilience_test.go index ea00f34d5..dc04cab94 100644 --- a/tests/resilience_test.go +++ b/tests/resilience_test.go @@ -122,16 +122,28 @@ func TestResiliencePod(t *testing.T) { removeDeployment(c, depl.GetName(), ns) } -// TestResiliencePVC -// Tests handling of individual pod deletions -func TestResiliencePVC(t *testing.T) { +// TestResiliencePVCAgents +// Tests handling of individual PVCs of agents being deleted +func TestResiliencePVCAgents(t *testing.T) { + testResiliencePVC(api.ServerGroupAgents, t) +} + +// TestResiliencePVCDBServers +// Tests handling of individual PVCs of dbservers being deleted +func TestResiliencePVCDBServers(t *testing.T) { + testResiliencePVC(api.ServerGroupDBServers, t) +} + +// testResiliencePVC +// Tests handling of individual PVCs of given group being deleted +func testResiliencePVC(testGroup api.ServerGroup, t *testing.T) { longOrSkip(t) c := client.MustNewInCluster() kubecli := mustNewKubeClient(t) ns := getNamespace(t) // Prepare deployment config - depl := newDeployment("test-pvc-resilience-" + uniuri.NewLen(4)) + depl := newDeployment(fmt.Sprintf("test-pvc-resilience-%s-%s", testGroup.AsRoleAbbreviated(), uniuri.NewLen(4))) depl.Spec.Mode = api.NewMode(api.DeploymentModeCluster) depl.Spec.SetDefaults(depl.GetName()) // this must be last @@ -166,9 +178,8 @@ func TestResiliencePVC(t *testing.T) { // Delete one pvc after the other apiObject.ForeachServerGroup(func(group api.ServerGroup, spec api.ServerGroupSpec, status *api.MemberStatusList) error { - if group != api.ServerGroupAgents { - // Coordinators have no PVC - // DBServers will be cleaned out and create a new member + if group != testGroup { + // We only test a specific group here return nil } for _, m := range *status { @@ -195,9 +206,13 @@ func TestResiliencePVC(t *testing.T) { } return nil } - if err := retry.Retry(op, time.Minute); err != nil { + if err := retry.Retry(op, time.Minute*2); err != nil { t.Fatalf("PVC did not restart: %v", err) } + // Wait for deployment to be ready + if _, err = waitUntilDeployment(c, depl.GetName(), ns, deploymentIsReady()); err != nil { + t.Fatalf("Deployment not running in time: %v", err) + } // Wait for cluster to be completely ready if err := waitUntilClusterHealth(client, func(h driver.ClusterHealth) error { return clusterHealthEqualsSpec(h, apiObject.Spec) From 85918bdfc17ed106b60545c4da6b9f7d11d860d4 Mon Sep 17 00:00:00 2001 From: Ewout Prangsma Date: Mon, 27 Aug 2018 09:52:43 +0200 Subject: [PATCH 02/17] Preparing pods for termination when PVC is deleted --- pkg/deployment/reconcile/plan_builder.go | 8 + pkg/deployment/resources/pod_finalizers.go | 62 
+----- pkg/deployment/resources/pod_termination.go | 213 ++++++++++++++++++++ pkg/deployment/resources/pvc_finalizers.go | 22 +- pkg/deployment/resources/pvc_inspector.go | 13 +- tests/resilience_test.go | 13 +- 6 files changed, 260 insertions(+), 71 deletions(-) create mode 100644 pkg/deployment/resources/pod_termination.go diff --git a/pkg/deployment/reconcile/plan_builder.go b/pkg/deployment/reconcile/plan_builder.go index 99087440a..6827a381c 100644 --- a/pkg/deployment/reconcile/plan_builder.go +++ b/pkg/deployment/reconcile/plan_builder.go @@ -101,6 +101,10 @@ func createPlan(log zerolog.Logger, apiObject k8sutil.APIObject, status.Members.ForeachServerGroup(func(group api.ServerGroup, members api.MemberStatusList) error { for _, m := range members { if m.Phase == api.MemberPhaseFailed && len(plan) == 0 { + log.Debug(). + Str("id", m.ID). + Str("role", group.AsRole()). + Msg("Creating member replacement plan because member has failed") newID := "" if group == api.ServerGroupAgents { newID = m.ID // Agents cannot (yet) be replaced with new IDs @@ -117,6 +121,10 @@ func createPlan(log zerolog.Logger, apiObject k8sutil.APIObject, // Check for cleaned out dbserver in created state for _, m := range status.Members.DBServers { if len(plan) == 0 && m.Phase == api.MemberPhaseCreated && m.Conditions.IsTrue(api.ConditionTypeCleanedOut) { + log.Debug(). + Str("id", m.ID). + Str("role", api.ServerGroupDBServers.AsRole()). + Msg("Creating dbserver replacement plan because server is cleanout in created phase") plan = append(plan, api.NewAction(api.ActionTypeRemoveMember, api.ServerGroupDBServers, m.ID), api.NewAction(api.ActionTypeAddMember, api.ServerGroupDBServers, ""), diff --git a/pkg/deployment/resources/pod_finalizers.go b/pkg/deployment/resources/pod_finalizers.go index f94b281c3..7e3b3e5b4 100644 --- a/pkg/deployment/resources/pod_finalizers.go +++ b/pkg/deployment/resources/pod_finalizers.go @@ -31,7 +31,6 @@ import ( "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "github.com/arangodb/go-driver/agency" api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha" "github.com/arangodb/kube-arangodb/pkg/util/constants" "github.com/arangodb/kube-arangodb/pkg/util/k8sutil" @@ -84,68 +83,13 @@ func (r *Resources) runPodFinalizers(ctx context.Context, p *v1.Pod, memberStatu // inspectFinalizerPodAgencyServing checks the finalizer condition for agency-serving. // It returns nil if the finalizer can be removed. 
func (r *Resources) inspectFinalizerPodAgencyServing(ctx context.Context, log zerolog.Logger, p *v1.Pod, memberStatus api.MemberStatus) error { - // Inspect member phase - if memberStatus.Phase.IsFailed() { - log.Debug().Msg("Pod is already failed, safe to remove agency serving finalizer") - return nil - } - // Inspect deployment deletion state - apiObject := r.context.GetAPIObject() - if apiObject.GetDeletionTimestamp() != nil { - log.Debug().Msg("Entire deployment is being deleted, safe to remove agency serving finalizer") - return nil - } - - // Check node the pod is scheduled on - agentDataWillBeGone := false - if p.Spec.NodeName != "" { - node, err := r.context.GetKubeCli().CoreV1().Nodes().Get(p.Spec.NodeName, metav1.GetOptions{}) - if err != nil { - log.Warn().Err(err).Msg("Failed to get node for member") - return maskAny(err) - } - if node.Spec.Unschedulable { - agentDataWillBeGone = true - } - } - - // Check PVC - pvcs := r.context.GetKubeCli().CoreV1().PersistentVolumeClaims(apiObject.GetNamespace()) - pvc, err := pvcs.Get(memberStatus.PersistentVolumeClaimName, metav1.GetOptions{}) - if err != nil { - log.Warn().Err(err).Msg("Failed to get PVC for member") - return maskAny(err) - } - if k8sutil.IsPersistentVolumeClaimMarkedForDeletion(pvc) { - agentDataWillBeGone = true - } - - // Is this a simple pod restart? - if !agentDataWillBeGone { - log.Debug().Msg("Pod is just being restarted, safe to remove agency serving finalizer") - return nil - } - - // Inspect agency state - log.Debug().Msg("Agent data will be gone, so we will check agency serving status first") - ctx = agency.WithAllowNoLeader(ctx) // The ID we're checking may be the leader, so ignore situations where all other agents are followers - ctx, cancel := context.WithTimeout(ctx, time.Second*15) // Force a quick check - defer cancel() - agencyConns, err := r.context.GetAgencyClients(ctx, func(id string) bool { return id != memberStatus.ID }) - if err != nil { - log.Debug().Err(err).Msg("Failed to create member client") - return maskAny(err) - } - if len(agencyConns) == 0 { - log.Debug().Err(err).Msg("No more remaining agents, we cannot delete this one") - return maskAny(fmt.Errorf("No more remaining agents")) - } - if err := agency.AreAgentsHealthy(ctx, agencyConns); err != nil { - log.Debug().Err(err).Msg("Remaining agents are not healthy") + if err := r.prepareAgencyPodTermination(ctx, log, p, memberStatus); err != nil { + // Pod cannot be terminated yet return maskAny(err) } // Remaining agents are healthy, we can remove this one and trigger a delete of the PVC + pvcs := r.context.GetKubeCli().CoreV1().PersistentVolumeClaims(r.context.GetNamespace()) if err := pvcs.Delete(memberStatus.PersistentVolumeClaimName, &metav1.DeleteOptions{}); err != nil && !k8sutil.IsNotFound(err) { log.Warn().Err(err).Msg("Failed to delete PVC for member") return maskAny(err) diff --git a/pkg/deployment/resources/pod_termination.go b/pkg/deployment/resources/pod_termination.go new file mode 100644 index 000000000..776e6b793 --- /dev/null +++ b/pkg/deployment/resources/pod_termination.go @@ -0,0 +1,213 @@ +// +// DISCLAIMER +// +// Copyright 2018 ArangoDB GmbH, Cologne, Germany +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright holder is ArangoDB GmbH, Cologne, Germany +// +// Author Ewout Prangsma +// + +package resources + +import ( + "context" + "fmt" + "time" + + "github.com/rs/zerolog" + "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + "github.com/arangodb/go-driver/agency" + api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha" + "github.com/arangodb/kube-arangodb/pkg/util/k8sutil" +) + +// preparePodTermination checks if the given pod is allowed to terminate and if so, +// prepares it for termination. +// It returns nil if the pod is allowed to terminate yet, an error otherwise. +func (r *Resources) preparePodTermination(ctx context.Context, log zerolog.Logger, p *v1.Pod, group api.ServerGroup, memberStatus api.MemberStatus, updateMember func(api.MemberStatus) error) error { + var err error + switch group { + case api.ServerGroupAgents: + err = r.prepareAgencyPodTermination(ctx, log, p, memberStatus) + case api.ServerGroupDBServers: + err = r.prepareDBServerPodTermination(ctx, log, p, memberStatus, updateMember) + default: + err = nil + } + return maskAny(err) +} + +// prepareAgencyPodTermination checks if the given agency pod is allowed to terminate +// and if so, prepares it for termination. +// It returns nil if the pod is allowed to terminate, an error otherwise. +func (r *Resources) prepareAgencyPodTermination(ctx context.Context, log zerolog.Logger, p *v1.Pod, memberStatus api.MemberStatus) error { + // Inspect member phase + if memberStatus.Phase.IsFailed() { + log.Debug().Msg("Pod is already failed, safe to remove agency serving finalizer") + return nil + } + // Inspect deployment deletion state + apiObject := r.context.GetAPIObject() + if apiObject.GetDeletionTimestamp() != nil { + log.Debug().Msg("Entire deployment is being deleted, safe to remove agency serving finalizer") + return nil + } + + // Check node the pod is scheduled on + agentDataWillBeGone := false + if p.Spec.NodeName != "" { + node, err := r.context.GetKubeCli().CoreV1().Nodes().Get(p.Spec.NodeName, metav1.GetOptions{}) + if err != nil { + log.Warn().Err(err).Msg("Failed to get node for member") + return maskAny(err) + } + if node.Spec.Unschedulable { + agentDataWillBeGone = true + } + } + + // Check PVC + pvcs := r.context.GetKubeCli().CoreV1().PersistentVolumeClaims(apiObject.GetNamespace()) + pvc, err := pvcs.Get(memberStatus.PersistentVolumeClaimName, metav1.GetOptions{}) + if err != nil { + log.Warn().Err(err).Msg("Failed to get PVC for member") + return maskAny(err) + } + if k8sutil.IsPersistentVolumeClaimMarkedForDeletion(pvc) { + agentDataWillBeGone = true + } + + // Is this a simple pod restart? 
+ if !agentDataWillBeGone { + log.Debug().Msg("Pod is just being restarted, safe to terminate agency pod") + return nil + } + + // Inspect agency state + log.Debug().Msg("Agent data will be gone, so we will check agency serving status first") + ctx = agency.WithAllowNoLeader(ctx) // The ID we're checking may be the leader, so ignore situations where all other agents are followers + ctx, cancel := context.WithTimeout(ctx, time.Second*15) // Force a quick check + defer cancel() + agencyConns, err := r.context.GetAgencyClients(ctx, func(id string) bool { return id != memberStatus.ID }) + if err != nil { + log.Debug().Err(err).Msg("Failed to create member client") + return maskAny(err) + } + if len(agencyConns) == 0 { + log.Debug().Err(err).Msg("No more remaining agents, we cannot delete this one") + return maskAny(fmt.Errorf("No more remaining agents")) + } + if err := agency.AreAgentsHealthy(ctx, agencyConns); err != nil { + log.Debug().Err(err).Msg("Remaining agents are not healthy") + return maskAny(err) + } + + return nil +} + +// prepareDBServerPodTermination checks if the given dbserver pod is allowed to terminate +// and if so, prepares it for termination. +// It returns nil if the pod is allowed to terminate, an error otherwise. +func (r *Resources) prepareDBServerPodTermination(ctx context.Context, log zerolog.Logger, p *v1.Pod, memberStatus api.MemberStatus, updateMember func(api.MemberStatus) error) error { + // Inspect member phase + if memberStatus.Phase.IsFailed() { + log.Debug().Msg("Pod is already failed, safe to remove dbserver pod") + return nil + } + // Inspect deployment deletion state + apiObject := r.context.GetAPIObject() + if apiObject.GetDeletionTimestamp() != nil { + log.Debug().Msg("Entire deployment is being deleted, safe to remove dbserver pod") + return nil + } + + // Check node the pod is scheduled on + dbserverDataWillBeGone := false + if p.Spec.NodeName != "" { + node, err := r.context.GetKubeCli().CoreV1().Nodes().Get(p.Spec.NodeName, metav1.GetOptions{}) + if err != nil { + log.Warn().Err(err).Msg("Failed to get node for member") + return maskAny(err) + } + if node.Spec.Unschedulable { + dbserverDataWillBeGone = true + } + } + + // Check PVC + pvcs := r.context.GetKubeCli().CoreV1().PersistentVolumeClaims(apiObject.GetNamespace()) + pvc, err := pvcs.Get(memberStatus.PersistentVolumeClaimName, metav1.GetOptions{}) + if err != nil { + log.Warn().Err(err).Msg("Failed to get PVC for member") + return maskAny(err) + } + if k8sutil.IsPersistentVolumeClaimMarkedForDeletion(pvc) { + dbserverDataWillBeGone = true + } + + // Is this a simple pod restart? 
+ if !dbserverDataWillBeGone { + log.Debug().Msg("Pod is just being restarted, safe to remove dbserver pod") + return nil + } + + // Inspect cleaned out state + log.Debug().Msg("DBServer data is being deleted, so we will cleanout the dbserver first") + c, err := r.context.GetDatabaseClient(ctx) + if err != nil { + log.Debug().Err(err).Msg("Failed to create member client") + return maskAny(err) + } + cluster, err := c.Cluster(ctx) + if err != nil { + log.Debug().Err(err).Msg("Failed to access cluster") + return maskAny(err) + } + cleanedOut, err := cluster.IsCleanedOut(ctx, memberStatus.ID) + if err != nil { + return maskAny(err) + } + if cleanedOut { + // Cleanout completed + if memberStatus.Conditions.Update(api.ConditionTypeCleanedOut, true, "CleanedOut", "") { + if err := updateMember(memberStatus); err != nil { + return maskAny(err) + } + } + // Trigger PVC removal + if err := pvcs.Delete(memberStatus.PersistentVolumeClaimName, &metav1.DeleteOptions{}); err != nil { + log.Warn().Err(err).Msg("Failed to delete PVC for member") + return maskAny(err) + } + + log.Debug().Msg("Server is cleaned out. Save to remove drain dbserver finalizer") + return nil + } + // Not cleaned out yet, check member status + if memberStatus.Conditions.IsTrue(api.ConditionTypeTerminated) { + log.Warn().Msg("Member is already terminated before it could be cleaned out. Not good, but removing dbserver pod because we cannot do anything further") + return nil + } + // Ensure the cleanout is triggered + log.Debug().Msg("Server is not yet clean out. Triggering a clean out now") + if err := cluster.CleanOutServer(ctx, memberStatus.ID); err != nil { + log.Debug().Err(err).Msg("Failed to clean out server") + return maskAny(err) + } + return maskAny(fmt.Errorf("Server is not yet cleaned out")) +} diff --git a/pkg/deployment/resources/pvc_finalizers.go b/pkg/deployment/resources/pvc_finalizers.go index 33aa238ed..0bb18d58e 100644 --- a/pkg/deployment/resources/pvc_finalizers.go +++ b/pkg/deployment/resources/pvc_finalizers.go @@ -36,14 +36,14 @@ import ( ) // runPVCFinalizers goes through the list of PVC finalizers to see if they can be removed. -func (r *Resources) runPVCFinalizers(ctx context.Context, p *v1.PersistentVolumeClaim, group api.ServerGroup, memberStatus api.MemberStatus) error { +func (r *Resources) runPVCFinalizers(ctx context.Context, p *v1.PersistentVolumeClaim, group api.ServerGroup, memberStatus api.MemberStatus, updateMember func(api.MemberStatus) error) error { log := r.log.With().Str("pvc-name", p.GetName()).Logger() var removalList []string for _, f := range p.ObjectMeta.GetFinalizers() { switch f { case constants.FinalizerPVCMemberExists: log.Debug().Msg("Inspecting member exists finalizer") - if err := r.inspectFinalizerPVCMemberExists(ctx, log, p, group, memberStatus); err == nil { + if err := r.inspectFinalizerPVCMemberExists(ctx, log, p, group, memberStatus, updateMember); err == nil { removalList = append(removalList, f) } else { log.Debug().Err(err).Str("finalizer", f).Msg("Cannot remove PVC finalizer yet") @@ -66,7 +66,7 @@ func (r *Resources) runPVCFinalizers(ctx context.Context, p *v1.PersistentVolume // inspectFinalizerPVCMemberExists checks the finalizer condition for member-exists. // It returns nil if the finalizer can be removed. 
-func (r *Resources) inspectFinalizerPVCMemberExists(ctx context.Context, log zerolog.Logger, p *v1.PersistentVolumeClaim, group api.ServerGroup, memberStatus api.MemberStatus) error { +func (r *Resources) inspectFinalizerPVCMemberExists(ctx context.Context, log zerolog.Logger, p *v1.PersistentVolumeClaim, group api.ServerGroup, memberStatus api.MemberStatus, updateMember func(api.MemberStatus) error) error { // Inspect member phase if memberStatus.Phase.IsFailed() { log.Debug().Msg("Member is already failed, safe to remove member-exists finalizer") @@ -93,10 +93,22 @@ func (r *Resources) inspectFinalizerPVCMemberExists(ctx context.Context, log zer } } - // Member still exists, let's trigger a delete of it + // Member still exists, let's trigger a delete of it, if we're allowed to do so if memberStatus.PodName != "" { - log.Info().Msg("Removing Pod of member, because PVC is being removed") pods := r.context.GetKubeCli().CoreV1().Pods(apiObject.GetNamespace()) + log.Info().Msg("Checking in Pod of member can be removed, because PVC is being removed") + if pod, err := pods.Get(memberStatus.PodName, metav1.GetOptions{}); err != nil && !k8sutil.IsNotFound(err) { + log.Debug().Err(err).Msg("Failed to get pod for PVC") + return maskAny(err) + } else if err == nil { + // We've got the pod, check & prepare its termination + if err := r.preparePodTermination(ctx, log, pod, group, memberStatus, updateMember); err != nil { + log.Debug().Err(err).Msg("Not allowed to remove pod yet") + return maskAny(err) + } + } + + log.Info().Msg("Removing Pod of member, because PVC is being removed") if err := pods.Delete(memberStatus.PodName, &metav1.DeleteOptions{}); err != nil && !k8sutil.IsNotFound(err) { log.Debug().Err(err).Msg("Failed to delete pod") return maskAny(err) diff --git a/pkg/deployment/resources/pvc_inspector.go b/pkg/deployment/resources/pvc_inspector.go index b7c2d0e8c..e525b1003 100644 --- a/pkg/deployment/resources/pvc_inspector.go +++ b/pkg/deployment/resources/pvc_inspector.go @@ -25,6 +25,7 @@ package resources import ( "context" + api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha" "github.com/arangodb/kube-arangodb/pkg/metrics" "github.com/arangodb/kube-arangodb/pkg/util/k8sutil" ) @@ -68,13 +69,23 @@ func (r *Resources) InspectPVCs(ctx context.Context) error { continue } + updateMemberStatusNeeded := false if k8sutil.IsPersistentVolumeClaimMarkedForDeletion(&p) { // Process finalizers - if err := r.runPVCFinalizers(ctx, &p, group, memberStatus); err != nil { + if err := r.runPVCFinalizers(ctx, &p, group, memberStatus, func(m api.MemberStatus) error { + updateMemberStatusNeeded = true + memberStatus = m + return nil + }); err != nil { // Only log here, since we'll be called to try again. 
log.Warn().Err(err).Msg("Failed to run PVC finalizers") } } + if updateMemberStatusNeeded { + if err := status.Members.Update(memberStatus, group); err != nil { + return maskAny(err) + } + } } return nil diff --git a/tests/resilience_test.go b/tests/resilience_test.go index dc04cab94..fb5c341ce 100644 --- a/tests/resilience_test.go +++ b/tests/resilience_test.go @@ -36,6 +36,7 @@ import ( driver "github.com/arangodb/go-driver" api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha" "github.com/arangodb/kube-arangodb/pkg/client" + "github.com/arangodb/kube-arangodb/pkg/util/k8sutil" "github.com/arangodb/kube-arangodb/pkg/util/retry" ) @@ -191,14 +192,14 @@ func testResiliencePVC(testGroup api.ServerGroup, t *testing.T) { if err := kubecli.CoreV1().PersistentVolumeClaims(ns).Delete(m.PersistentVolumeClaimName, &metav1.DeleteOptions{}); err != nil { t.Fatalf("Failed to delete pvc %s: %v", m.PersistentVolumeClaimName, err) } - // Now delete the pod as well, otherwise the PVC will only have a deletion timestamp but its finalizers will stay on. - if err := kubecli.CoreV1().Pods(ns).Delete(m.PodName, &metav1.DeleteOptions{}); err != nil { - t.Fatalf("Failed to delete pod %s: %v", m.PodName, err) - } // Wait for pvc to return with different UID op := func() error { pvc, err := kubecli.CoreV1().PersistentVolumeClaims(ns).Get(m.PersistentVolumeClaimName, metav1.GetOptions{}) if err != nil { + if k8sutil.IsNotFound(err) && group == api.ServerGroupDBServers { + // DBServer member is completely replaced when cleaned out, so the PVC will have a different name also + return nil + } return maskAny(err) } if pvc.GetUID() == originalPVC.GetUID() { @@ -298,9 +299,9 @@ func TestResiliencePVDBServer(t *testing.T) { t.Fatalf("Failed to delete pvc %s: %v", m.PersistentVolumeClaimName, err) } // Delete Pod - if err := kubecli.CoreV1().Pods(ns).Delete(m.PodName, &metav1.DeleteOptions{}); err != nil { + /*if err := kubecli.CoreV1().Pods(ns).Delete(m.PodName, &metav1.DeleteOptions{}); err != nil { t.Fatalf("Failed to delete pod %s: %v", m.PodName, err) - } + }*/ // Wait for cluster to be healthy again with the same number of // dbservers, but the current dbserver being replaced. expectedDBServerCount := apiObject.Spec.DBServers.GetCount() From 3eefc38c5709189aa2c59efe825205ab35183c78 Mon Sep 17 00:00:00 2001 From: Ewout Prangsma Date: Mon, 27 Aug 2018 10:24:19 +0200 Subject: [PATCH 03/17] Various code cleanup --- pkg/deployment/resources/pod_finalizers.go | 90 +++------------------ pkg/deployment/resources/pod_termination.go | 8 +- 2 files changed, 10 insertions(+), 88 deletions(-) diff --git a/pkg/deployment/resources/pod_finalizers.go b/pkg/deployment/resources/pod_finalizers.go index 7e3b3e5b4..9c0df9596 100644 --- a/pkg/deployment/resources/pod_finalizers.go +++ b/pkg/deployment/resources/pod_finalizers.go @@ -24,7 +24,6 @@ package resources import ( "context" - "fmt" "time" "github.com/rs/zerolog" @@ -103,90 +102,19 @@ func (r *Resources) inspectFinalizerPodAgencyServing(ctx context.Context, log ze // inspectFinalizerPodDrainDBServer checks the finalizer condition for drain-dbserver. // It returns nil if the finalizer can be removed. 
func (r *Resources) inspectFinalizerPodDrainDBServer(ctx context.Context, log zerolog.Logger, p *v1.Pod, memberStatus api.MemberStatus, updateMember func(api.MemberStatus) error) error { - // Inspect member phase - if memberStatus.Phase.IsFailed() { - log.Debug().Msg("Pod is already failed, safe to remove drain dbserver finalizer") - return nil - } - // Inspect deployment deletion state - apiObject := r.context.GetAPIObject() - if apiObject.GetDeletionTimestamp() != nil { - log.Debug().Msg("Entire deployment is being deleted, safe to remove drain dbserver finalizer") - return nil - } - - // Check node the pod is scheduled on - dbserverDataWillBeGone := false - if p.Spec.NodeName != "" { - node, err := r.context.GetKubeCli().CoreV1().Nodes().Get(p.Spec.NodeName, metav1.GetOptions{}) - if err != nil { - log.Warn().Err(err).Msg("Failed to get node for member") - return maskAny(err) - } - if node.Spec.Unschedulable { - dbserverDataWillBeGone = true - } - } - - // Check PVC - pvcs := r.context.GetKubeCli().CoreV1().PersistentVolumeClaims(apiObject.GetNamespace()) - pvc, err := pvcs.Get(memberStatus.PersistentVolumeClaimName, metav1.GetOptions{}) - if err != nil { - log.Warn().Err(err).Msg("Failed to get PVC for member") + if err := r.prepareDBServerPodTermination(ctx, log, p, memberStatus, updateMember); err != nil { + // Pod cannot be terminated yet return maskAny(err) } - if k8sutil.IsPersistentVolumeClaimMarkedForDeletion(pvc) { - dbserverDataWillBeGone = true - } - - // Is this a simple pod restart? - if !dbserverDataWillBeGone { - log.Debug().Msg("Pod is just being restarted, safe to remove drain dbserver finalizer") - return nil - } - // Inspect cleaned out state - log.Debug().Msg("DBServer data is being deleted, so we will cleanout the dbserver first") - c, err := r.context.GetDatabaseClient(ctx) - if err != nil { - log.Debug().Err(err).Msg("Failed to create member client") - return maskAny(err) - } - cluster, err := c.Cluster(ctx) - if err != nil { - log.Debug().Err(err).Msg("Failed to access cluster") - return maskAny(err) - } - cleanedOut, err := cluster.IsCleanedOut(ctx, memberStatus.ID) - if err != nil { + // Remaining agents are healthy, we can remove this one and trigger a delete of the PVC + pvcs := r.context.GetKubeCli().CoreV1().PersistentVolumeClaims(r.context.GetNamespace()) + if err := pvcs.Delete(memberStatus.PersistentVolumeClaimName, &metav1.DeleteOptions{}); err != nil && !k8sutil.IsNotFound(err) { + log.Warn().Err(err).Msg("Failed to delete PVC for member") return maskAny(err) + } else { + log.Debug().Str("pvc-name", memberStatus.PersistentVolumeClaimName).Msg("Removed PVC of member") } - if cleanedOut { - // Cleanout completed - if memberStatus.Conditions.Update(api.ConditionTypeCleanedOut, true, "CleanedOut", "") { - if err := updateMember(memberStatus); err != nil { - return maskAny(err) - } - } - // Trigger PVC removal - if err := pvcs.Delete(memberStatus.PersistentVolumeClaimName, &metav1.DeleteOptions{}); err != nil { - log.Warn().Err(err).Msg("Failed to delete PVC for member") - return maskAny(err) - } - log.Debug().Msg("Server is cleaned out. Save to remove drain dbserver finalizer") - return nil - } - // Not cleaned out yet, check member status - if memberStatus.Conditions.IsTrue(api.ConditionTypeTerminated) { - log.Warn().Msg("Member is already terminated before it could be cleaned out. 
Not good, but removing drain dbserver finalizer because we cannot do anything further") - return nil - } - // Ensure the cleanout is triggered - log.Debug().Msg("Server is not yet clean out. Triggering a clean out now") - if err := cluster.CleanOutServer(ctx, memberStatus.ID); err != nil { - log.Debug().Err(err).Msg("Failed to clean out server") - return maskAny(err) - } - return maskAny(fmt.Errorf("Server is not yet cleaned out")) + return nil } diff --git a/pkg/deployment/resources/pod_termination.go b/pkg/deployment/resources/pod_termination.go index 776e6b793..c05ebaaa7 100644 --- a/pkg/deployment/resources/pod_termination.go +++ b/pkg/deployment/resources/pod_termination.go @@ -189,13 +189,7 @@ func (r *Resources) prepareDBServerPodTermination(ctx context.Context, log zerol return maskAny(err) } } - // Trigger PVC removal - if err := pvcs.Delete(memberStatus.PersistentVolumeClaimName, &metav1.DeleteOptions{}); err != nil { - log.Warn().Err(err).Msg("Failed to delete PVC for member") - return maskAny(err) - } - - log.Debug().Msg("Server is cleaned out. Save to remove drain dbserver finalizer") + log.Debug().Msg("DBServer is cleaned out.") return nil } // Not cleaned out yet, check member status From c874af44f79e37ec303f7a172e9b2f494bf53a2d Mon Sep 17 00:00:00 2001 From: Ewout Prangsma Date: Mon, 27 Aug 2018 14:47:41 +0200 Subject: [PATCH 04/17] Inspect PVCs again quickly when finalizers cannot (yet) be removed --- pkg/deployment/deployment_inspector.go | 4 +++- pkg/deployment/resources/pod_finalizers.go | 5 ++--- pkg/deployment/resources/pod_inspector.go | 2 +- pkg/deployment/resources/pvc_finalizers.go | 15 ++++++++++++--- pkg/deployment/resources/pvc_inspector.go | 20 +++++++++++++------- 5 files changed, 31 insertions(+), 15 deletions(-) diff --git a/pkg/deployment/deployment_inspector.go b/pkg/deployment/deployment_inspector.go index 441ae26e1..65481daa2 100644 --- a/pkg/deployment/deployment_inspector.go +++ b/pkg/deployment/deployment_inspector.go @@ -94,9 +94,11 @@ func (d *Deployment) inspectDeployment(lastInterval time.Duration) time.Duration } else { nextInterval = util.MinDuration(nextInterval, x) } - if err := d.resources.InspectPVCs(ctx); err != nil { + if x, err := d.resources.InspectPVCs(ctx); err != nil { hasError = true d.CreateEvent(k8sutil.NewErrorEvent("PVC inspection failed", err, d.apiObject)) + } else { + nextInterval = util.MinDuration(nextInterval, x) } // Check members for resilience diff --git a/pkg/deployment/resources/pod_finalizers.go b/pkg/deployment/resources/pod_finalizers.go index 9c0df9596..83c2d8d51 100644 --- a/pkg/deployment/resources/pod_finalizers.go +++ b/pkg/deployment/resources/pod_finalizers.go @@ -36,7 +36,7 @@ import ( ) const ( - recheckPodFinalizerInterval = time.Second * 10 + recheckPodFinalizerInterval = time.Second * 10 // Interval used when Pod finalizers need to be rechecked soon ) // runPodFinalizers goes through the list of pod finalizers to see if they can be removed. 
@@ -69,9 +69,8 @@ func (r *Resources) runPodFinalizers(ctx context.Context, p *v1.Pod, memberStatu if err := k8sutil.RemovePodFinalizers(log, kubecli, p, removalList, ignoreNotFound); err != nil { log.Debug().Err(err).Msg("Failed to update pod (to remove finalizers)") return 0, maskAny(err) - } else { - log.Debug().Strs("finalizers", removalList).Msg("Removed finalizer(s) from Pod") } + log.Debug().Strs("finalizers", removalList).Msg("Removed finalizer(s) from Pod") } else { // Check again at given interval return recheckPodFinalizerInterval, nil diff --git a/pkg/deployment/resources/pod_inspector.go b/pkg/deployment/resources/pod_inspector.go index 972bc85c5..34209f184 100644 --- a/pkg/deployment/resources/pod_inspector.go +++ b/pkg/deployment/resources/pod_inspector.go @@ -41,7 +41,7 @@ var ( const ( podScheduleTimeout = time.Minute // How long we allow the schedule to take scheduling a pod. - maxPodInspectorInterval = time.Hour + maxPodInspectorInterval = time.Hour // Maximum time between Pod inspection (if nothing else happens) ) // InspectPods lists all pods that belong to the given deployment and updates diff --git a/pkg/deployment/resources/pvc_finalizers.go b/pkg/deployment/resources/pvc_finalizers.go index 0bb18d58e..b5d6f6e55 100644 --- a/pkg/deployment/resources/pvc_finalizers.go +++ b/pkg/deployment/resources/pvc_finalizers.go @@ -25,6 +25,7 @@ package resources import ( "context" "fmt" + "time" "github.com/rs/zerolog" "k8s.io/api/core/v1" @@ -35,8 +36,13 @@ import ( "github.com/arangodb/kube-arangodb/pkg/util/k8sutil" ) +const ( + recheckPVCFinalizerInterval = time.Second * 10 // Interval used when PVC finalizers need to be rechecked soon +) + // runPVCFinalizers goes through the list of PVC finalizers to see if they can be removed. -func (r *Resources) runPVCFinalizers(ctx context.Context, p *v1.PersistentVolumeClaim, group api.ServerGroup, memberStatus api.MemberStatus, updateMember func(api.MemberStatus) error) error { +// Returns: Interval_till_next_inspection, error +func (r *Resources) runPVCFinalizers(ctx context.Context, p *v1.PersistentVolumeClaim, group api.ServerGroup, memberStatus api.MemberStatus, updateMember func(api.MemberStatus) error) (time.Duration, error) { log := r.log.With().Str("pvc-name", p.GetName()).Logger() var removalList []string for _, f := range p.ObjectMeta.GetFinalizers() { @@ -56,12 +62,15 @@ func (r *Resources) runPVCFinalizers(ctx context.Context, p *v1.PersistentVolume ignoreNotFound := false if err := k8sutil.RemovePVCFinalizers(log, kubecli, p, removalList, ignoreNotFound); err != nil { log.Debug().Err(err).Msg("Failed to update PVC (to remove finalizers)") - return maskAny(err) + return 0, maskAny(err) } else { log.Debug().Strs("finalizers", removalList).Msg("Removed finalizer(s) from PVC") } + } else { + // Check again at given interval + return recheckPVCFinalizerInterval, nil } - return nil + return maxPVCInspectorInterval, nil } // inspectFinalizerPVCMemberExists checks the finalizer condition for member-exists. 
diff --git a/pkg/deployment/resources/pvc_inspector.go b/pkg/deployment/resources/pvc_inspector.go index e525b1003..33c827a17 100644 --- a/pkg/deployment/resources/pvc_inspector.go +++ b/pkg/deployment/resources/pvc_inspector.go @@ -24,25 +24,29 @@ package resources import ( "context" + "time" api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha" "github.com/arangodb/kube-arangodb/pkg/metrics" + "github.com/arangodb/kube-arangodb/pkg/util" "github.com/arangodb/kube-arangodb/pkg/util/k8sutil" ) var ( - inspectedPVCCounter = metrics.MustRegisterCounter("deployment", "inspected_ppvcs", "Number of PVCs inspections") + inspectedPVCCounter = metrics.MustRegisterCounter("deployment", "inspected_ppvcs", "Number of PVCs inspections") + maxPVCInspectorInterval = time.Hour // Maximum time between PVC inspection (if nothing else happens) ) // InspectPVCs lists all PVCs that belong to the given deployment and updates // the member status of the deployment accordingly. -func (r *Resources) InspectPVCs(ctx context.Context) error { +func (r *Resources) InspectPVCs(ctx context.Context) (time.Duration, error) { log := r.log + nextInterval := maxPVCInspectorInterval pvcs, err := r.context.GetOwnedPVCs() if err != nil { log.Debug().Err(err).Msg("Failed to get owned PVCs") - return maskAny(err) + return 0, maskAny(err) } // Update member status from all pods found @@ -63,7 +67,7 @@ func (r *Resources) InspectPVCs(ctx context.Context) error { ignoreNotFound := false if err := k8sutil.RemovePVCFinalizers(log, kubecli, &p, p.GetFinalizers(), ignoreNotFound); err != nil { log.Debug().Err(err).Msg("Failed to update PVC (to remove all finalizers)") - return maskAny(err) + return 0, maskAny(err) } } continue @@ -72,21 +76,23 @@ func (r *Resources) InspectPVCs(ctx context.Context) error { updateMemberStatusNeeded := false if k8sutil.IsPersistentVolumeClaimMarkedForDeletion(&p) { // Process finalizers - if err := r.runPVCFinalizers(ctx, &p, group, memberStatus, func(m api.MemberStatus) error { + if x, err := r.runPVCFinalizers(ctx, &p, group, memberStatus, func(m api.MemberStatus) error { updateMemberStatusNeeded = true memberStatus = m return nil }); err != nil { // Only log here, since we'll be called to try again. 
log.Warn().Err(err).Msg("Failed to run PVC finalizers") + } else { + nextInterval = util.MinDuration(nextInterval, x) } } if updateMemberStatusNeeded { if err := status.Members.Update(memberStatus, group); err != nil { - return maskAny(err) + return 0, maskAny(err) } } } - return nil + return nextInterval, nil } From adc361675cbf76ce42f870ced684a013a9058904 Mon Sep 17 00:00:00 2001 From: Ewout Prangsma Date: Thu, 30 Aug 2018 08:23:13 +0200 Subject: [PATCH 05/17] Minor --- lifecycle.go | 2 ++ tests/resilience_test.go | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/lifecycle.go b/lifecycle.go index 7b5561388..2fb29efe4 100644 --- a/lifecycle.go +++ b/lifecycle.go @@ -147,4 +147,6 @@ func cmdLifecycleCopyRun(cmd *cobra.Command, args []string) { if err := os.Chmod(targetPath, 0755); err != nil { cliLog.Fatal().Err(err).Msg("Failed to chmod") } + + cliLog.Info().Msgf("Executable copied to %s", targetPath) } diff --git a/tests/resilience_test.go b/tests/resilience_test.go index fb5c341ce..c7e90da86 100644 --- a/tests/resilience_test.go +++ b/tests/resilience_test.go @@ -109,6 +109,10 @@ func TestResiliencePod(t *testing.T) { if err := retry.Retry(op, time.Minute); err != nil { t.Fatalf("Pod did not restart: %v", err) } + // Wait for deployment to be ready + if _, err = waitUntilDeployment(c, depl.GetName(), ns, deploymentIsReady()); err != nil { + t.Fatalf("Deployment not running in time: %v", err) + } // Wait for cluster to be completely ready if err := waitUntilClusterHealth(client, func(h driver.ClusterHealth) error { return clusterHealthEqualsSpec(h, apiObject.Spec) From a47d506884f5269e899a5f135603f356a310e3cb Mon Sep 17 00:00:00 2001 From: Ewout Prangsma Date: Thu, 30 Aug 2018 10:15:42 +0200 Subject: [PATCH 06/17] Various resilience fixes --- pkg/apis/deployment/v1alpha/conditions.go | 4 ++ pkg/deployment/resources/pod_creator.go | 1 + pkg/deployment/resources/pod_finalizers.go | 47 ++++++++++++++------- pkg/deployment/resources/pod_termination.go | 26 ++++-------- pkg/deployment/resources/pvc_finalizers.go | 27 +++--------- pkg/deployment/resources/pvc_inspector.go | 7 +-- tests/resilience_test.go | 21 +++++++++ 7 files changed, 73 insertions(+), 60 deletions(-) diff --git a/pkg/apis/deployment/v1alpha/conditions.go b/pkg/apis/deployment/v1alpha/conditions.go index bbdf74969..4e8da01be 100644 --- a/pkg/apis/deployment/v1alpha/conditions.go +++ b/pkg/apis/deployment/v1alpha/conditions.go @@ -40,6 +40,10 @@ const ( // ConditionTypeCleanedOut indicates that the member (dbserver) has been cleaned out. // Always check in combination with ConditionTypeTerminated. ConditionTypeCleanedOut ConditionType = "CleanedOut" + // ConditionTypeAgentRecoveryNeeded indicates that the member (agent) will no + // longer recover from its current volume and there has to be rebuild + // using the recovery procedure. + ConditionTypeAgentRecoveryNeeded ConditionType = "AgentRecoveryNeeded" // ConditionTypePodSchedulingFailure indicates that one or more pods belonging to the deployment cannot be schedule. 
ConditionTypePodSchedulingFailure ConditionType = "PodSchedulingFailure" // ConditionTypeSecretsChanged indicates that the value of one of more secrets used by diff --git a/pkg/deployment/resources/pod_creator.go b/pkg/deployment/resources/pod_creator.go index 3f41cdefb..329f84c32 100644 --- a/pkg/deployment/resources/pod_creator.go +++ b/pkg/deployment/resources/pod_creator.go @@ -606,6 +606,7 @@ func (r *Resources) createPodForMember(spec api.DeploymentSpec, memberID string, m.Phase = newPhase m.Conditions.Remove(api.ConditionTypeReady) m.Conditions.Remove(api.ConditionTypeTerminated) + m.Conditions.Remove(api.ConditionTypeAgentRecoveryNeeded) m.Conditions.Remove(api.ConditionTypeAutoUpgrade) if err := status.Members.Update(m, group); err != nil { return maskAny(err) diff --git a/pkg/deployment/resources/pod_finalizers.go b/pkg/deployment/resources/pod_finalizers.go index 83c2d8d51..9310ca16d 100644 --- a/pkg/deployment/resources/pod_finalizers.go +++ b/pkg/deployment/resources/pod_finalizers.go @@ -48,7 +48,7 @@ func (r *Resources) runPodFinalizers(ctx context.Context, p *v1.Pod, memberStatu switch f { case constants.FinalizerPodAgencyServing: log.Debug().Msg("Inspecting agency-serving finalizer") - if err := r.inspectFinalizerPodAgencyServing(ctx, log, p, memberStatus); err == nil { + if err := r.inspectFinalizerPodAgencyServing(ctx, log, p, memberStatus, updateMember); err == nil { removalList = append(removalList, f) } else { log.Debug().Err(err).Str("finalizer", f).Msg("Cannot remove finalizer yet") @@ -80,18 +80,26 @@ func (r *Resources) runPodFinalizers(ctx context.Context, p *v1.Pod, memberStatu // inspectFinalizerPodAgencyServing checks the finalizer condition for agency-serving. // It returns nil if the finalizer can be removed. -func (r *Resources) inspectFinalizerPodAgencyServing(ctx context.Context, log zerolog.Logger, p *v1.Pod, memberStatus api.MemberStatus) error { - if err := r.prepareAgencyPodTermination(ctx, log, p, memberStatus); err != nil { +func (r *Resources) inspectFinalizerPodAgencyServing(ctx context.Context, log zerolog.Logger, p *v1.Pod, memberStatus api.MemberStatus, updateMember func(api.MemberStatus) error) error { + if err := r.prepareAgencyPodTermination(ctx, log, p, memberStatus, func(update api.MemberStatus) error { + if err := updateMember(update); err != nil { + return maskAny(err) + } + memberStatus = update + return nil + }); err != nil { // Pod cannot be terminated yet return maskAny(err) } - // Remaining agents are healthy, we can remove this one and trigger a delete of the PVC - pvcs := r.context.GetKubeCli().CoreV1().PersistentVolumeClaims(r.context.GetNamespace()) - if err := pvcs.Delete(memberStatus.PersistentVolumeClaimName, &metav1.DeleteOptions{}); err != nil && !k8sutil.IsNotFound(err) { - log.Warn().Err(err).Msg("Failed to delete PVC for member") - return maskAny(err) - } else { + // Remaining agents are healthy, if we need to perform complete recovery + // of the agent, also remove the PVC + if memberStatus.Conditions.IsTrue(api.ConditionTypeAgentRecoveryNeeded) { + pvcs := r.context.GetKubeCli().CoreV1().PersistentVolumeClaims(r.context.GetNamespace()) + if err := pvcs.Delete(memberStatus.PersistentVolumeClaimName, &metav1.DeleteOptions{}); err != nil && !k8sutil.IsNotFound(err) { + log.Warn().Err(err).Msg("Failed to delete PVC for member") + return maskAny(err) + } log.Debug().Str("pvc-name", memberStatus.PersistentVolumeClaimName).Msg("Removed PVC of member so agency can be completely replaced") } @@ -101,17 +109,24 @@ func (r 
*Resources) inspectFinalizerPodAgencyServing(ctx context.Context, log ze // inspectFinalizerPodDrainDBServer checks the finalizer condition for drain-dbserver. // It returns nil if the finalizer can be removed. func (r *Resources) inspectFinalizerPodDrainDBServer(ctx context.Context, log zerolog.Logger, p *v1.Pod, memberStatus api.MemberStatus, updateMember func(api.MemberStatus) error) error { - if err := r.prepareDBServerPodTermination(ctx, log, p, memberStatus, updateMember); err != nil { + if err := r.prepareDBServerPodTermination(ctx, log, p, memberStatus, func(update api.MemberStatus) error { + if err := updateMember(update); err != nil { + return maskAny(err) + } + memberStatus = update + return nil + }); err != nil { // Pod cannot be terminated yet return maskAny(err) } - // Remaining agents are healthy, we can remove this one and trigger a delete of the PVC - pvcs := r.context.GetKubeCli().CoreV1().PersistentVolumeClaims(r.context.GetNamespace()) - if err := pvcs.Delete(memberStatus.PersistentVolumeClaimName, &metav1.DeleteOptions{}); err != nil && !k8sutil.IsNotFound(err) { - log.Warn().Err(err).Msg("Failed to delete PVC for member") - return maskAny(err) - } else { + // If this DBServer is cleaned out, we need to remove the PVC. + if memberStatus.Conditions.IsTrue(api.ConditionTypeCleanedOut) { + pvcs := r.context.GetKubeCli().CoreV1().PersistentVolumeClaims(r.context.GetNamespace()) + if err := pvcs.Delete(memberStatus.PersistentVolumeClaimName, &metav1.DeleteOptions{}); err != nil && !k8sutil.IsNotFound(err) { + log.Warn().Err(err).Msg("Failed to delete PVC for member") + return maskAny(err) + } log.Debug().Str("pvc-name", memberStatus.PersistentVolumeClaimName).Msg("Removed PVC of member") } diff --git a/pkg/deployment/resources/pod_termination.go b/pkg/deployment/resources/pod_termination.go index c05ebaaa7..112709c74 100644 --- a/pkg/deployment/resources/pod_termination.go +++ b/pkg/deployment/resources/pod_termination.go @@ -36,26 +36,10 @@ import ( "github.com/arangodb/kube-arangodb/pkg/util/k8sutil" ) -// preparePodTermination checks if the given pod is allowed to terminate and if so, -// prepares it for termination. -// It returns nil if the pod is allowed to terminate yet, an error otherwise. -func (r *Resources) preparePodTermination(ctx context.Context, log zerolog.Logger, p *v1.Pod, group api.ServerGroup, memberStatus api.MemberStatus, updateMember func(api.MemberStatus) error) error { - var err error - switch group { - case api.ServerGroupAgents: - err = r.prepareAgencyPodTermination(ctx, log, p, memberStatus) - case api.ServerGroupDBServers: - err = r.prepareDBServerPodTermination(ctx, log, p, memberStatus, updateMember) - default: - err = nil - } - return maskAny(err) -} - // prepareAgencyPodTermination checks if the given agency pod is allowed to terminate // and if so, prepares it for termination. // It returns nil if the pod is allowed to terminate, an error otherwise. 
-func (r *Resources) prepareAgencyPodTermination(ctx context.Context, log zerolog.Logger, p *v1.Pod, memberStatus api.MemberStatus) error { +func (r *Resources) prepareAgencyPodTermination(ctx context.Context, log zerolog.Logger, p *v1.Pod, memberStatus api.MemberStatus, updateMember func(api.MemberStatus) error) error { // Inspect member phase if memberStatus.Phase.IsFailed() { log.Debug().Msg("Pod is already failed, safe to remove agency serving finalizer") @@ -117,6 +101,14 @@ func (r *Resources) prepareAgencyPodTermination(ctx context.Context, log zerolog return maskAny(err) } + // Complete agent recovery is needed, since data is already gone or not accessible + if memberStatus.Conditions.Update(api.ConditionTypeAgentRecoveryNeeded, true, "Data Gone", "") { + if err := updateMember(memberStatus); err != nil { + return maskAny(err) + } + } + log.Debug().Msg("Agent is ready to be completely recovered.") + return nil } diff --git a/pkg/deployment/resources/pvc_finalizers.go b/pkg/deployment/resources/pvc_finalizers.go index b5d6f6e55..2c43a6c64 100644 --- a/pkg/deployment/resources/pvc_finalizers.go +++ b/pkg/deployment/resources/pvc_finalizers.go @@ -41,18 +41,17 @@ const ( ) // runPVCFinalizers goes through the list of PVC finalizers to see if they can be removed. -// Returns: Interval_till_next_inspection, error -func (r *Resources) runPVCFinalizers(ctx context.Context, p *v1.PersistentVolumeClaim, group api.ServerGroup, memberStatus api.MemberStatus, updateMember func(api.MemberStatus) error) (time.Duration, error) { +func (r *Resources) runPVCFinalizers(ctx context.Context, p *v1.PersistentVolumeClaim, group api.ServerGroup, memberStatus api.MemberStatus) (time.Duration, error) { log := r.log.With().Str("pvc-name", p.GetName()).Logger() var removalList []string for _, f := range p.ObjectMeta.GetFinalizers() { switch f { case constants.FinalizerPVCMemberExists: log.Debug().Msg("Inspecting member exists finalizer") - if err := r.inspectFinalizerPVCMemberExists(ctx, log, p, group, memberStatus, updateMember); err == nil { + if err := r.inspectFinalizerPVCMemberExists(ctx, log, p, group, memberStatus); err == nil { removalList = append(removalList, f) } else { - log.Debug().Err(err).Str("finalizer", f).Msg("Cannot remove PVC finalizer yet") + log.Debug().Err(err).Str("finalizer", f).Msg("Cannot remove finalizer yet") } } } @@ -63,8 +62,6 @@ func (r *Resources) runPVCFinalizers(ctx context.Context, p *v1.PersistentVolume if err := k8sutil.RemovePVCFinalizers(log, kubecli, p, removalList, ignoreNotFound); err != nil { log.Debug().Err(err).Msg("Failed to update PVC (to remove finalizers)") return 0, maskAny(err) - } else { - log.Debug().Strs("finalizers", removalList).Msg("Removed finalizer(s) from PVC") } } else { // Check again at given interval @@ -75,7 +72,7 @@ func (r *Resources) runPVCFinalizers(ctx context.Context, p *v1.PersistentVolume // inspectFinalizerPVCMemberExists checks the finalizer condition for member-exists. // It returns nil if the finalizer can be removed. 
-func (r *Resources) inspectFinalizerPVCMemberExists(ctx context.Context, log zerolog.Logger, p *v1.PersistentVolumeClaim, group api.ServerGroup, memberStatus api.MemberStatus, updateMember func(api.MemberStatus) error) error { +func (r *Resources) inspectFinalizerPVCMemberExists(ctx context.Context, log zerolog.Logger, p *v1.PersistentVolumeClaim, group api.ServerGroup, memberStatus api.MemberStatus) error { // Inspect member phase if memberStatus.Phase.IsFailed() { log.Debug().Msg("Member is already failed, safe to remove member-exists finalizer") @@ -102,22 +99,10 @@ func (r *Resources) inspectFinalizerPVCMemberExists(ctx context.Context, log zer } } - // Member still exists, let's trigger a delete of it, if we're allowed to do so + // Member still exists, let's trigger a delete of it if memberStatus.PodName != "" { - pods := r.context.GetKubeCli().CoreV1().Pods(apiObject.GetNamespace()) - log.Info().Msg("Checking in Pod of member can be removed, because PVC is being removed") - if pod, err := pods.Get(memberStatus.PodName, metav1.GetOptions{}); err != nil && !k8sutil.IsNotFound(err) { - log.Debug().Err(err).Msg("Failed to get pod for PVC") - return maskAny(err) - } else if err == nil { - // We've got the pod, check & prepare its termination - if err := r.preparePodTermination(ctx, log, pod, group, memberStatus, updateMember); err != nil { - log.Debug().Err(err).Msg("Not allowed to remove pod yet") - return maskAny(err) - } - } - log.Info().Msg("Removing Pod of member, because PVC is being removed") + pods := r.context.GetKubeCli().CoreV1().Pods(apiObject.GetNamespace()) if err := pods.Delete(memberStatus.PodName, &metav1.DeleteOptions{}); err != nil && !k8sutil.IsNotFound(err) { log.Debug().Err(err).Msg("Failed to delete pod") return maskAny(err) diff --git a/pkg/deployment/resources/pvc_inspector.go b/pkg/deployment/resources/pvc_inspector.go index 33c827a17..16e8b2eab 100644 --- a/pkg/deployment/resources/pvc_inspector.go +++ b/pkg/deployment/resources/pvc_inspector.go @@ -26,7 +26,6 @@ import ( "context" "time" - api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha" "github.com/arangodb/kube-arangodb/pkg/metrics" "github.com/arangodb/kube-arangodb/pkg/util" "github.com/arangodb/kube-arangodb/pkg/util/k8sutil" @@ -76,11 +75,7 @@ func (r *Resources) InspectPVCs(ctx context.Context) (time.Duration, error) { updateMemberStatusNeeded := false if k8sutil.IsPersistentVolumeClaimMarkedForDeletion(&p) { // Process finalizers - if x, err := r.runPVCFinalizers(ctx, &p, group, memberStatus, func(m api.MemberStatus) error { - updateMemberStatusNeeded = true - memberStatus = m - return nil - }); err != nil { + if x, err := r.runPVCFinalizers(ctx, &p, group, memberStatus); err != nil { // Only log here, since we'll be called to try again. 
log.Warn().Err(err).Msg("Failed to run PVC finalizers") } else { diff --git a/tests/resilience_test.go b/tests/resilience_test.go index c7e90da86..6a414bac4 100644 --- a/tests/resilience_test.go +++ b/tests/resilience_test.go @@ -32,6 +32,7 @@ import ( "github.com/dchest/uniuri" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" driver "github.com/arangodb/go-driver" api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha" @@ -92,6 +93,17 @@ func TestResiliencePod(t *testing.T) { if err != nil { t.Fatalf("Failed to get pod %s: %v", m.PodName, err) } + // Get current PVC so we can compare UID later + var originalPVCUID types.UID + if m.PersistentVolumeClaimName != "" { + originalPVC, err := kubecli.CoreV1().PersistentVolumeClaims(ns).Get(m.PersistentVolumeClaimName, metav1.GetOptions{}) + if err != nil { + t.Fatalf("Failed to get PVC %s: %v", m.PersistentVolumeClaimName, err) + } else { + originalPVCUID = originalPVC.GetUID() + } + } + // Now delete the pod if err := kubecli.CoreV1().Pods(ns).Delete(m.PodName, &metav1.DeleteOptions{}); err != nil { t.Fatalf("Failed to delete pod %s: %v", m.PodName, err) } @@ -109,6 +121,15 @@ func TestResiliencePod(t *testing.T) { if err := retry.Retry(op, time.Minute); err != nil { t.Fatalf("Pod did not restart: %v", err) } + // Now that the Pod has been replaced, check that the PVC has NOT been replaced (if any) + if m.PersistentVolumeClaimName != "" { + pvc, err := kubecli.CoreV1().PersistentVolumeClaims(ns).Get(m.PersistentVolumeClaimName, metav1.GetOptions{}) + if err != nil { + t.Fatalf("Failed to get PVC %s: %v", m.PersistentVolumeClaimName, err) + } else if originalPVCUID != pvc.GetUID() { + t.Errorf("PVC for member %s has been replaced", m.ID) + } + } // Wait for deployment to be ready if _, err = waitUntilDeployment(c, depl.GetName(), ns, deploymentIsReady()); err != nil { t.Fatalf("Deployment not running in time: %v", err) From fe2bbafab2071c82140e3fb8989ebf99d5c7b225 Mon Sep 17 00:00:00 2001 From: Ewout Prangsma Date: Thu, 30 Aug 2018 11:03:33 +0200 Subject: [PATCH 07/17] Prevent database-autoupgrade on stateless members --- pkg/deployment/reconcile/plan_builder.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pkg/deployment/reconcile/plan_builder.go b/pkg/deployment/reconcile/plan_builder.go index 6827a381c..d92822884 100644 --- a/pkg/deployment/reconcile/plan_builder.go +++ b/pkg/deployment/reconcile/plan_builder.go @@ -406,13 +406,18 @@ func createRotateMemberPlan(log zerolog.Logger, member api.MemberStatus, // member. func createUpgradeMemberPlan(log zerolog.Logger, member api.MemberStatus, group api.ServerGroup, reason string, imageName string, status api.DeploymentStatus) api.Plan { + upgradeAction := api.ActionTypeUpgradeMember + if group.IsStateless() { + upgradeAction = api.ActionTypeRotateMember + } log.Debug(). Str("id", member.ID). Str("role", group.AsRole()). Str("reason", reason). + Str("action", string(upgradeAction)). 
Msg("Creating upgrade plan") plan := api.Plan{ - api.NewAction(api.ActionTypeUpgradeMember, group, member.ID, reason), + api.NewAction(upgradeAction, group, member.ID, reason), api.NewAction(api.ActionTypeWaitForMemberUp, group, member.ID), } if status.CurrentImage == nil || status.CurrentImage.Image != imageName { From 01cbcae5a828ff2b61da4753a40e5e4401e19204 Mon Sep 17 00:00:00 2001 From: Ewout Prangsma Date: Thu, 30 Aug 2018 11:16:34 +0200 Subject: [PATCH 08/17] Sort members by ID --- pkg/apis/deployment/v1alpha/member_status_list.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pkg/apis/deployment/v1alpha/member_status_list.go b/pkg/apis/deployment/v1alpha/member_status_list.go index bc8ca9f21..e47a9bb87 100644 --- a/pkg/apis/deployment/v1alpha/member_status_list.go +++ b/pkg/apis/deployment/v1alpha/member_status_list.go @@ -24,6 +24,7 @@ package v1alpha import ( "math/rand" + "sort" "github.com/pkg/errors" ) @@ -83,7 +84,9 @@ func (l *MemberStatusList) add(m MemberStatus) error { return maskAny(errors.Wrapf(AlreadyExistsError, "Member '%s' already exists", m.ID)) } } - *l = append(src, m) + newList := append(src, m) + sort.Slice(newList, func(i, j int) bool { return newList[i].ID < newList[j].ID }) + *l = newList return nil } From ef03a72667775910d15ca4cf04dcc9ca4033d469 Mon Sep 17 00:00:00 2001 From: Ewout Prangsma Date: Thu, 30 Aug 2018 13:02:01 +0200 Subject: [PATCH 09/17] Wait a bit longer --- tests/resilience_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/resilience_test.go b/tests/resilience_test.go index 6a414bac4..91016fa50 100644 --- a/tests/resilience_test.go +++ b/tests/resilience_test.go @@ -118,7 +118,7 @@ func TestResiliencePod(t *testing.T) { } return nil } - if err := retry.Retry(op, time.Minute); err != nil { + if err := retry.Retry(op, time.Minute*2); err != nil { t.Fatalf("Pod did not restart: %v", err) } // Now that the Pod has been replaced, check that the PVC has NOT been replaced (if any) From bd64ca74ed236ee5c50306b0b2a827647312e9b8 Mon Sep 17 00:00:00 2001 From: Ewout Prangsma Date: Thu, 30 Aug 2018 16:57:08 +0200 Subject: [PATCH 10/17] Speeding up inspection loop --- pkg/deployment/access_package.go | 4 +- pkg/deployment/context_impl.go | 9 +- pkg/deployment/deployment.go | 15 +-- pkg/deployment/deployment_inspector.go | 102 +++++++++++------- .../resources/certificates_client_auth.go | 10 +- pkg/deployment/resources/certificates_tls.go | 10 +- pkg/deployment/resources/pod_cleanup.go | 15 ++- pkg/deployment/resources/pod_creator.go | 15 +-- pkg/deployment/resources/pod_finalizers.go | 14 +-- pkg/deployment/resources/pod_inspector.go | 14 ++- pkg/deployment/resources/pvc_finalizers.go | 5 +- pkg/deployment/resources/pvc_inspector.go | 6 +- pkg/deployment/resources/pvcs.go | 28 +++-- pkg/deployment/resources/secrets.go | 48 ++++----- pkg/deployment/resources/services.go | 71 +++++++----- pkg/replication/server_endpoint_api.go | 3 +- pkg/replication/sync_client.go | 14 +-- pkg/util/arangod/client.go | 3 +- pkg/util/interval.go | 66 ++++++++++++ pkg/util/k8sutil/pvc.go | 6 +- pkg/util/k8sutil/secrets.go | 55 +++++----- pkg/util/k8sutil/secrets_cache.go | 75 +++++++++++++ pkg/util/k8sutil/secrets_test.go | 28 ++--- pkg/util/k8sutil/services.go | 18 ++-- .../{duration.go => profiler/profiler.go} | 32 +++--- tests/auth_test.go | 6 +- tests/rocksdb_encryption_test.go | 3 +- tests/test_util.go | 3 +- 28 files changed, 463 insertions(+), 215 deletions(-) create mode 100644 pkg/util/interval.go create mode 100644 
pkg/util/k8sutil/secrets_cache.go rename pkg/util/{duration.go => profiler/profiler.go} (52%) diff --git a/pkg/deployment/access_package.go b/pkg/deployment/access_package.go index b9b12af77..560d3f894 100644 --- a/pkg/deployment/access_package.go +++ b/pkg/deployment/access_package.go @@ -108,7 +108,7 @@ func (d *Deployment) ensureAccessPackage(apSecretName string) error { // Fetch client authentication CA clientAuthSecretName := spec.Sync.Authentication.GetClientCASecretName() - clientAuthCert, clientAuthKey, _, err := k8sutil.GetCASecret(d.deps.KubeCli.CoreV1(), clientAuthSecretName, ns, nil) + clientAuthCert, clientAuthKey, _, err := k8sutil.GetCASecret(secrets, clientAuthSecretName, nil) if err != nil { log.Debug().Err(err).Msg("Failed to get client-auth CA secret") return maskAny(err) @@ -116,7 +116,7 @@ func (d *Deployment) ensureAccessPackage(apSecretName string) error { // Fetch TLS CA public key tlsCASecretName := spec.Sync.TLS.GetCASecretName() - tlsCACert, err := k8sutil.GetCACertficateSecret(d.deps.KubeCli.CoreV1(), tlsCASecretName, ns) + tlsCACert, err := k8sutil.GetCACertficateSecret(secrets, tlsCASecretName) if err != nil { log.Debug().Err(err).Msg("Failed to get TLS CA secret") return maskAny(err) diff --git a/pkg/deployment/context_impl.go b/pkg/deployment/context_impl.go index 4749350f9..173869d40 100644 --- a/pkg/deployment/context_impl.go +++ b/pkg/deployment/context_impl.go @@ -171,8 +171,9 @@ func (d *Deployment) GetSyncServerClient(ctx context.Context, group api.ServerGr log := d.deps.Log kubecli := d.deps.KubeCli ns := d.apiObject.GetNamespace() + secrets := kubecli.CoreV1().Secrets(ns) secretName := d.apiObject.Spec.Sync.Monitoring.GetTokenSecretName() - monitoringToken, err := k8sutil.GetTokenSecret(kubecli.CoreV1(), secretName, ns) + monitoringToken, err := k8sutil.GetTokenSecret(secrets, secretName) if err != nil { log.Debug().Err(err).Str("secret-name", secretName).Msg("Failed to get sync monitoring secret") return nil, maskAny(err) @@ -331,7 +332,8 @@ func (d *Deployment) GetPvc(pvcName string) (*v1.PersistentVolumeClaim, error) { func (d *Deployment) GetTLSKeyfile(group api.ServerGroup, member api.MemberStatus) (string, error) { secretName := k8sutil.CreateTLSKeyfileSecretName(d.apiObject.GetName(), group.AsRole(), member.ID) ns := d.apiObject.GetNamespace() - result, err := k8sutil.GetTLSKeyfileSecret(d.deps.KubeCli.CoreV1(), secretName, ns) + secrets := d.deps.KubeCli.CoreV1().Secrets(ns) + result, err := k8sutil.GetTLSKeyfileSecret(secrets, secretName) if err != nil { return "", maskAny(err) } @@ -353,8 +355,9 @@ func (d *Deployment) DeleteTLSKeyfile(group api.ServerGroup, member api.MemberSt // Returns: publicKey, privateKey, ownerByDeployment, error func (d *Deployment) GetTLSCA(secretName string) (string, string, bool, error) { ns := d.apiObject.GetNamespace() + secrets := d.deps.KubeCli.CoreV1().Secrets(ns) owner := d.apiObject.AsOwner() - cert, priv, isOwned, err := k8sutil.GetCASecret(d.deps.KubeCli.CoreV1(), secretName, ns, &owner) + cert, priv, isOwned, err := k8sutil.GetCASecret(secrets, secretName, &owner) if err != nil { return "", "", false, maskAny(err) } diff --git a/pkg/deployment/deployment.go b/pkg/deployment/deployment.go index b56df311c..5b6da6d04 100644 --- a/pkg/deployment/deployment.go +++ b/pkg/deployment/deployment.go @@ -42,6 +42,7 @@ import ( "github.com/arangodb/kube-arangodb/pkg/deployment/resilience" "github.com/arangodb/kube-arangodb/pkg/deployment/resources" 
"github.com/arangodb/kube-arangodb/pkg/generated/clientset/versioned" + "github.com/arangodb/kube-arangodb/pkg/util" "github.com/arangodb/kube-arangodb/pkg/util/k8sutil" "github.com/arangodb/kube-arangodb/pkg/util/retry" "github.com/arangodb/kube-arangodb/pkg/util/trigger" @@ -78,8 +79,8 @@ type deploymentEvent struct { const ( deploymentEventQueueSize = 256 - minInspectionInterval = time.Second // Ensure we inspect the generated resources no less than with this interval - maxInspectionInterval = time.Minute // Ensure we inspect the generated resources no less than with this interval + minInspectionInterval = util.Interval(time.Second) // Ensure we inspect the generated resources no less than with this interval + maxInspectionInterval = util.Interval(time.Minute) // Ensure we inspect the generated resources no less than with this interval ) // Deployment is the in process state of an ArangoDeployment. @@ -247,21 +248,21 @@ func (d *Deployment) run() { } case <-d.inspectTrigger.Done(): + log.Debug().Msg("Inspect deployment...") inspectionInterval = d.inspectDeployment(inspectionInterval) + log.Debug().Str("interval", inspectionInterval.String()).Msg("...inspected deployment") case <-d.updateDeploymentTrigger.Done(): + inspectionInterval = minInspectionInterval if err := d.handleArangoDeploymentUpdatedEvent(); err != nil { d.CreateEvent(k8sutil.NewErrorEvent("Failed to handle deployment update", err, d.GetAPIObject())) } - case <-time.After(inspectionInterval): + case <-inspectionInterval.After(): // Trigger inspection d.inspectTrigger.Trigger() // Backoff with next interval - inspectionInterval = time.Duration(float64(inspectionInterval) * 1.5) - if inspectionInterval > maxInspectionInterval { - inspectionInterval = maxInspectionInterval - } + inspectionInterval = inspectionInterval.Backoff(1.5, maxInspectionInterval) } } } diff --git a/pkg/deployment/deployment_inspector.go b/pkg/deployment/deployment_inspector.go index 65481daa2..19a8f5b4f 100644 --- a/pkg/deployment/deployment_inspector.go +++ b/pkg/deployment/deployment_inspector.go @@ -29,6 +29,7 @@ import ( api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha" "github.com/arangodb/kube-arangodb/pkg/util" "github.com/arangodb/kube-arangodb/pkg/util/k8sutil" + "github.com/arangodb/kube-arangodb/pkg/util/profiler" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) @@ -39,7 +40,7 @@ import ( // - any of the underlying resources has changed // - once in a while // Returns the delay until this function should be called again. 
-func (d *Deployment) inspectDeployment(lastInterval time.Duration) time.Duration { +func (d *Deployment) inspectDeployment(lastInterval util.Interval) util.Interval { log := d.deps.Log nextInterval := lastInterval @@ -92,13 +93,13 @@ func (d *Deployment) inspectDeployment(lastInterval time.Duration) time.Duration hasError = true d.CreateEvent(k8sutil.NewErrorEvent("Pod inspection failed", err, d.apiObject)) } else { - nextInterval = util.MinDuration(nextInterval, x) + nextInterval = nextInterval.ReduceTo(x) } if x, err := d.resources.InspectPVCs(ctx); err != nil { hasError = true d.CreateEvent(k8sutil.NewErrorEvent("PVC inspection failed", err, d.apiObject)) } else { - nextInterval = util.MinDuration(nextInterval, x) + nextInterval = nextInterval.ReduceTo(x) } // Check members for resilience @@ -108,43 +109,67 @@ func (d *Deployment) inspectDeployment(lastInterval time.Duration) time.Duration } // Create scale/update plan - if err := d.reconciler.CreatePlan(); err != nil { - hasError = true - d.CreateEvent(k8sutil.NewErrorEvent("Plan creation failed", err, d.apiObject)) - } - - // Execute current step of scale/update plan - retrySoon, err := d.reconciler.ExecutePlan(ctx) - if err != nil { - hasError = true - d.CreateEvent(k8sutil.NewErrorEvent("Plan execution failed", err, d.apiObject)) - } - if retrySoon { - nextInterval = minInspectionInterval + { + ps := profiler.Start() + if err := d.reconciler.CreatePlan(); err != nil { + hasError = true + d.CreateEvent(k8sutil.NewErrorEvent("Plan creation failed", err, d.apiObject)) + } + + // Execute current step of scale/update plan + retrySoon, err := d.reconciler.ExecutePlan(ctx) + if err != nil { + hasError = true + d.CreateEvent(k8sutil.NewErrorEvent("Plan execution failed", err, d.apiObject)) + } + if retrySoon { + nextInterval = minInspectionInterval + } + ps.Done(log, "plan") } // Ensure all resources are created - if err := d.resources.EnsureSecrets(); err != nil { - hasError = true - d.CreateEvent(k8sutil.NewErrorEvent("Secret creation failed", err, d.apiObject)) - } - if err := d.resources.EnsureServices(); err != nil { - hasError = true - d.CreateEvent(k8sutil.NewErrorEvent("Service creation failed", err, d.apiObject)) - } - if err := d.resources.EnsurePVCs(); err != nil { - hasError = true - d.CreateEvent(k8sutil.NewErrorEvent("PVC creation failed", err, d.apiObject)) - } - if err := d.resources.EnsurePods(); err != nil { - hasError = true - d.CreateEvent(k8sutil.NewErrorEvent("Pod creation failed", err, d.apiObject)) + { + ps := profiler.Start() + { + ps := profiler.Start() + if err := d.resources.EnsureSecrets(); err != nil { + hasError = true + d.CreateEvent(k8sutil.NewErrorEvent("Secret creation failed", err, d.apiObject)) + } + ps.LogIf(log, time.Millisecond*10, "EnsureSecrets") + } + { + ps := profiler.Start() + if err := d.resources.EnsureServices(); err != nil { + hasError = true + d.CreateEvent(k8sutil.NewErrorEvent("Service creation failed", err, d.apiObject)) + } + ps.LogIf(log, time.Millisecond*10, "EnsureServices") + } + if err := d.resources.EnsurePVCs(); err != nil { + hasError = true + d.CreateEvent(k8sutil.NewErrorEvent("PVC creation failed", err, d.apiObject)) + } + { + ps := profiler.Start() + if err := d.resources.EnsurePods(); err != nil { + hasError = true + d.CreateEvent(k8sutil.NewErrorEvent("Pod creation failed", err, d.apiObject)) + } + ps.LogIf(log, time.Millisecond*10, "EnsurePods") + } + ps.Done(log, "ensure resources") } // Create access packages - if err := d.createAccessPackages(); err != nil { - 
hasError = true - d.CreateEvent(k8sutil.NewErrorEvent("AccessPackage creation failed", err, d.apiObject)) + { + ps := profiler.Start() + if err := d.createAccessPackages(); err != nil { + hasError = true + d.CreateEvent(k8sutil.NewErrorEvent("AccessPackage creation failed", err, d.apiObject)) + } + ps.Done(log, "createAccessPackages") } // Inspect deployment for obsolete members @@ -154,9 +179,11 @@ func (d *Deployment) inspectDeployment(lastInterval time.Duration) time.Duration } // At the end of the inspect, we cleanup terminated pods. - if err := d.resources.CleanupTerminatedPods(); err != nil { + if x, err := d.resources.CleanupTerminatedPods(); err != nil { hasError = true d.CreateEvent(k8sutil.NewErrorEvent("Pod cleanup failed", err, d.apiObject)) + } else { + nextInterval = nextInterval.ReduceTo(x) } } @@ -169,10 +196,7 @@ func (d *Deployment) inspectDeployment(lastInterval time.Duration) time.Duration } else { d.recentInspectionErrors = 0 } - if nextInterval > maxInspectionInterval { - nextInterval = maxInspectionInterval - } - return nextInterval + return nextInterval.ReduceTo(maxInspectionInterval) } // triggerInspection ensures that an inspection is run soon. diff --git a/pkg/deployment/resources/certificates_client_auth.go b/pkg/deployment/resources/certificates_client_auth.go index d3f3be143..51c2424d2 100644 --- a/pkg/deployment/resources/certificates_client_auth.go +++ b/pkg/deployment/resources/certificates_client_auth.go @@ -42,7 +42,7 @@ const ( // createClientAuthCACertificate creates a client authentication CA certificate and stores it in a secret with name // specified in the given spec. -func createClientAuthCACertificate(log zerolog.Logger, cli v1.CoreV1Interface, spec api.SyncAuthenticationSpec, deploymentName, namespace string, ownerRef *metav1.OwnerReference) error { +func createClientAuthCACertificate(log zerolog.Logger, secrets k8sutil.SecretInterface, spec api.SyncAuthenticationSpec, deploymentName string, ownerRef *metav1.OwnerReference) error { log = log.With().Str("secret", spec.GetClientCASecretName()).Logger() options := certificates.CreateCertificateOptions{ CommonName: fmt.Sprintf("%s Client Authentication Root Certificate", deploymentName), @@ -57,7 +57,7 @@ func createClientAuthCACertificate(log zerolog.Logger, cli v1.CoreV1Interface, s log.Debug().Err(err).Msg("Failed to create CA certificate") return maskAny(err) } - if err := k8sutil.CreateCASecret(cli, spec.GetClientCASecretName(), namespace, cert, priv, ownerRef); err != nil { + if err := k8sutil.CreateCASecret(secrets, spec.GetClientCASecretName(), cert, priv, ownerRef); err != nil { if k8sutil.IsAlreadyExists(err) { log.Debug().Msg("CA Secret already exists") } else { @@ -71,10 +71,10 @@ func createClientAuthCACertificate(log zerolog.Logger, cli v1.CoreV1Interface, s // createClientAuthCertificateKeyfile creates a client authentication certificate for a specific user and stores // it in a secret with the given name. 
-func createClientAuthCertificateKeyfile(log zerolog.Logger, cli v1.CoreV1Interface, commonName string, ttl time.Duration, spec api.SyncAuthenticationSpec, secretName, namespace string, ownerRef *metav1.OwnerReference) error { +func createClientAuthCertificateKeyfile(log zerolog.Logger, secrets v1.SecretInterface, commonName string, ttl time.Duration, spec api.SyncAuthenticationSpec, secretName string, ownerRef *metav1.OwnerReference) error { log = log.With().Str("secret", secretName).Logger() // Load CA certificate - caCert, caKey, _, err := k8sutil.GetCASecret(cli, spec.GetClientCASecretName(), namespace, nil) + caCert, caKey, _, err := k8sutil.GetCASecret(secrets, spec.GetClientCASecretName(), nil) if err != nil { log.Debug().Err(err).Msg("Failed to load CA certificate") return maskAny(err) @@ -100,7 +100,7 @@ func createClientAuthCertificateKeyfile(log zerolog.Logger, cli v1.CoreV1Interfa } keyfile := strings.TrimSpace(cert) + "\n" + strings.TrimSpace(priv) - if err := k8sutil.CreateTLSKeyfileSecret(cli, secretName, namespace, keyfile, ownerRef); err != nil { + if err := k8sutil.CreateTLSKeyfileSecret(secrets, secretName, keyfile, ownerRef); err != nil { if k8sutil.IsAlreadyExists(err) { log.Debug().Msg("Server Secret already exists") } else { diff --git a/pkg/deployment/resources/certificates_tls.go b/pkg/deployment/resources/certificates_tls.go index d2b295c08..541c486c2 100644 --- a/pkg/deployment/resources/certificates_tls.go +++ b/pkg/deployment/resources/certificates_tls.go @@ -43,7 +43,7 @@ const ( // createTLSCACertificate creates a CA certificate and stores it in a secret with name // specified in the given spec. -func createTLSCACertificate(log zerolog.Logger, cli v1.CoreV1Interface, spec api.TLSSpec, deploymentName, namespace string, ownerRef *metav1.OwnerReference) error { +func createTLSCACertificate(log zerolog.Logger, secrets k8sutil.SecretInterface, spec api.TLSSpec, deploymentName string, ownerRef *metav1.OwnerReference) error { log = log.With().Str("secret", spec.GetCASecretName()).Logger() options := certificates.CreateCertificateOptions{ @@ -58,7 +58,7 @@ func createTLSCACertificate(log zerolog.Logger, cli v1.CoreV1Interface, spec api log.Debug().Err(err).Msg("Failed to create CA certificate") return maskAny(err) } - if err := k8sutil.CreateCASecret(cli, spec.GetCASecretName(), namespace, cert, priv, ownerRef); err != nil { + if err := k8sutil.CreateCASecret(secrets, spec.GetCASecretName(), cert, priv, ownerRef); err != nil { if k8sutil.IsAlreadyExists(err) { log.Debug().Msg("CA Secret already exists") } else { @@ -72,7 +72,7 @@ func createTLSCACertificate(log zerolog.Logger, cli v1.CoreV1Interface, spec api // createTLSServerCertificate creates a TLS certificate for a specific server and stores // it in a secret with the given name. 
-func createTLSServerCertificate(log zerolog.Logger, cli v1.CoreV1Interface, serverNames []string, spec api.TLSSpec, secretName, namespace string, ownerRef *metav1.OwnerReference) error { +func createTLSServerCertificate(log zerolog.Logger, secrets v1.SecretInterface, serverNames []string, spec api.TLSSpec, secretName string, ownerRef *metav1.OwnerReference) error { log = log.With().Str("secret", secretName).Logger() // Load alt names dnsNames, ipAddresses, emailAddress, err := spec.GetParsedAltNames() @@ -82,7 +82,7 @@ func createTLSServerCertificate(log zerolog.Logger, cli v1.CoreV1Interface, serv } // Load CA certificate - caCert, caKey, _, err := k8sutil.GetCASecret(cli, spec.GetCASecretName(), namespace, nil) + caCert, caKey, _, err := k8sutil.GetCASecret(secrets, spec.GetCASecretName(), nil) if err != nil { log.Debug().Err(err).Msg("Failed to load CA certificate") return maskAny(err) @@ -109,7 +109,7 @@ func createTLSServerCertificate(log zerolog.Logger, cli v1.CoreV1Interface, serv } keyfile := strings.TrimSpace(cert) + "\n" + strings.TrimSpace(priv) - if err := k8sutil.CreateTLSKeyfileSecret(cli, secretName, namespace, keyfile, ownerRef); err != nil { + if err := k8sutil.CreateTLSKeyfileSecret(secrets, secretName, keyfile, ownerRef); err != nil { if k8sutil.IsAlreadyExists(err) { log.Debug().Msg("Server Secret already exists") } else { diff --git a/pkg/deployment/resources/pod_cleanup.go b/pkg/deployment/resources/pod_cleanup.go index e507c4a7c..a45862616 100644 --- a/pkg/deployment/resources/pod_cleanup.go +++ b/pkg/deployment/resources/pod_cleanup.go @@ -25,23 +25,27 @@ package resources import ( "time" + "github.com/arangodb/kube-arangodb/pkg/util" "github.com/arangodb/kube-arangodb/pkg/util/k8sutil" api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha" ) const ( - statelessTerminationPeriod = time.Minute // We wait this long for a stateless server to terminate on it's own. Afterwards we kill it. + statelessTerminationPeriod = time.Minute // We wait this long for a stateless server to terminate on it's own. Afterwards we kill it. + recheckStatefullPodCleanupInterval = util.Interval(time.Second * 2) // Interval used when Pod finalizers need to be rechecked soon ) // CleanupTerminatedPods removes all pods in Terminated state that belong to a member in Created state. -func (r *Resources) CleanupTerminatedPods() error { +// Returns: Interval_till_next_inspection, error +func (r *Resources) CleanupTerminatedPods() (util.Interval, error) { log := r.log + nextInterval := maxPodInspectorInterval // Large by default, will be made smaller if needed in the rest of the function pods, err := r.context.GetOwnedPods() if err != nil { log.Debug().Err(err).Msg("Failed to get owned pods") - return maskAny(err) + return 0, maskAny(err) } // Update member status from all pods found @@ -66,12 +70,15 @@ func (r *Resources) CleanupTerminatedPods() error { if !memberStatus.Conditions.IsTrue(api.ConditionTypeTerminated) { if !group.IsStateless() { // For statefull members, we have to wait for confirmed termination + log.Debug().Str("pod", p.GetName()).Msg("Cannot cleanup pod yet, waiting for it to reach terminated state") + nextInterval = nextInterval.ReduceTo(recheckStatefullPodCleanupInterval) continue } else { // If a stateless server does not terminate within a reasonable amount or time, we kill it. 
t := p.GetDeletionTimestamp() if t == nil || t.Add(statelessTerminationPeriod).After(time.Now()) { // Either delete timestamp is not set, or not yet waiting long enough + nextInterval = nextInterval.ReduceTo(util.Interval(statelessTerminationPeriod)) continue } } @@ -84,5 +91,5 @@ func (r *Resources) CleanupTerminatedPods() error { log.Warn().Err(err).Str("pod-name", p.GetName()).Msg("Failed to cleanup pod") } } - return nil + return nextInterval, nil } diff --git a/pkg/deployment/resources/pod_creator.go b/pkg/deployment/resources/pod_creator.go index 329f84c32..58fa7833a 100644 --- a/pkg/deployment/resources/pod_creator.go +++ b/pkg/deployment/resources/pod_creator.go @@ -441,6 +441,7 @@ func (r *Resources) createPodForMember(spec api.DeploymentSpec, memberID string, log := r.log apiObject := r.context.GetAPIObject() ns := r.context.GetNamespace() + secrets := kubecli.CoreV1().Secrets(ns) status, lastVersion := r.context.GetStatus() m, group, found := status.Members.ElementByID(memberID) if !found { @@ -505,14 +506,14 @@ func (r *Resources) createPodForMember(spec api.DeploymentSpec, memberID string, serverNames = append(serverNames, ip) } owner := apiObject.AsOwner() - if err := createTLSServerCertificate(log, kubecli.CoreV1(), serverNames, spec.TLS, tlsKeyfileSecretName, ns, &owner); err != nil && !k8sutil.IsAlreadyExists(err) { + if err := createTLSServerCertificate(log, secrets, serverNames, spec.TLS, tlsKeyfileSecretName, &owner); err != nil && !k8sutil.IsAlreadyExists(err) { return maskAny(errors.Wrapf(err, "Failed to create TLS keyfile secret")) } } rocksdbEncryptionSecretName := "" if spec.RocksDB.IsEncrypted() { rocksdbEncryptionSecretName = spec.RocksDB.Encryption.GetKeySecretName() - if err := k8sutil.ValidateEncryptionKeySecret(kubecli.CoreV1(), rocksdbEncryptionSecretName, ns); err != nil { + if err := k8sutil.ValidateEncryptionKeySecret(secrets, rocksdbEncryptionSecretName); err != nil { return maskAny(errors.Wrapf(err, "RocksDB encryption key secret validation failed")) } } @@ -539,12 +540,12 @@ func (r *Resources) createPodForMember(spec api.DeploymentSpec, memberID string, var tlsKeyfileSecretName, clientAuthCASecretName, masterJWTSecretName, clusterJWTSecretName string // Check master JWT secret masterJWTSecretName = spec.Sync.Authentication.GetJWTSecretName() - if err := k8sutil.ValidateTokenSecret(kubecli.CoreV1(), masterJWTSecretName, ns); err != nil { + if err := k8sutil.ValidateTokenSecret(secrets, masterJWTSecretName); err != nil { return maskAny(errors.Wrapf(err, "Master JWT secret validation failed")) } // Check monitoring token secret monitoringTokenSecretName := spec.Sync.Monitoring.GetTokenSecretName() - if err := k8sutil.ValidateTokenSecret(kubecli.CoreV1(), monitoringTokenSecretName, ns); err != nil { + if err := k8sutil.ValidateTokenSecret(secrets, monitoringTokenSecretName); err != nil { return maskAny(errors.Wrapf(err, "Monitoring token secret validation failed")) } if group == api.ServerGroupSyncMasters { @@ -562,19 +563,19 @@ func (r *Resources) createPodForMember(spec api.DeploymentSpec, memberID string, } } owner := apiObject.AsOwner() - if err := createTLSServerCertificate(log, kubecli.CoreV1(), serverNames, spec.Sync.TLS, tlsKeyfileSecretName, ns, &owner); err != nil && !k8sutil.IsAlreadyExists(err) { + if err := createTLSServerCertificate(log, secrets, serverNames, spec.Sync.TLS, tlsKeyfileSecretName, &owner); err != nil && !k8sutil.IsAlreadyExists(err) { return maskAny(errors.Wrapf(err, "Failed to create TLS keyfile secret")) } // Check cluster 
JWT secret if spec.IsAuthenticated() { clusterJWTSecretName = spec.Authentication.GetJWTSecretName() - if err := k8sutil.ValidateTokenSecret(kubecli.CoreV1(), clusterJWTSecretName, ns); err != nil { + if err := k8sutil.ValidateTokenSecret(secrets, clusterJWTSecretName); err != nil { return maskAny(errors.Wrapf(err, "Cluster JWT secret validation failed")) } } // Check client-auth CA certificate secret clientAuthCASecretName = spec.Sync.Authentication.GetClientCASecretName() - if err := k8sutil.ValidateCACertificateSecret(kubecli.CoreV1(), clientAuthCASecretName, ns); err != nil { + if err := k8sutil.ValidateCACertificateSecret(secrets, clientAuthCASecretName); err != nil { return maskAny(errors.Wrapf(err, "Client authentication CA certificate secret validation failed")) } } diff --git a/pkg/deployment/resources/pod_finalizers.go b/pkg/deployment/resources/pod_finalizers.go index 9310ca16d..901c8ea71 100644 --- a/pkg/deployment/resources/pod_finalizers.go +++ b/pkg/deployment/resources/pod_finalizers.go @@ -31,17 +31,19 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha" + "github.com/arangodb/kube-arangodb/pkg/util" "github.com/arangodb/kube-arangodb/pkg/util/constants" "github.com/arangodb/kube-arangodb/pkg/util/k8sutil" ) const ( - recheckPodFinalizerInterval = time.Second * 10 // Interval used when Pod finalizers need to be rechecked soon + podFinalizerRemovedInterval = util.Interval(time.Second / 2) // Interval used (until new inspection) when Pod finalizers have been removed + recheckPodFinalizerInterval = util.Interval(time.Second * 10) // Interval used when Pod finalizers need to be rechecked soon ) // runPodFinalizers goes through the list of pod finalizers to see if they can be removed. // Returns: Interval_till_next_inspection, error -func (r *Resources) runPodFinalizers(ctx context.Context, p *v1.Pod, memberStatus api.MemberStatus, updateMember func(api.MemberStatus) error) (time.Duration, error) { +func (r *Resources) runPodFinalizers(ctx context.Context, p *v1.Pod, memberStatus api.MemberStatus, updateMember func(api.MemberStatus) error) (util.Interval, error) { log := r.log.With().Str("pod-name", p.GetName()).Logger() var removalList []string for _, f := range p.ObjectMeta.GetFinalizers() { @@ -71,11 +73,11 @@ func (r *Resources) runPodFinalizers(ctx context.Context, p *v1.Pod, memberStatu return 0, maskAny(err) } log.Debug().Strs("finalizers", removalList).Msg("Removed finalizer(s) from Pod") - } else { - // Check again at given interval - return recheckPodFinalizerInterval, nil + // Let's do the next inspection quickly, since things may have changed now. + return podFinalizerRemovedInterval, nil } - return maxPodInspectorInterval, nil + // Check again at given interval + return recheckPodFinalizerInterval, nil } // inspectFinalizerPodAgencyServing checks the finalizer condition for agency-serving. diff --git a/pkg/deployment/resources/pod_inspector.go b/pkg/deployment/resources/pod_inspector.go index 34209f184..092abe9dc 100644 --- a/pkg/deployment/resources/pod_inspector.go +++ b/pkg/deployment/resources/pod_inspector.go @@ -40,14 +40,15 @@ var ( ) const ( - podScheduleTimeout = time.Minute // How long we allow the schedule to take scheduling a pod. - maxPodInspectorInterval = time.Hour // Maximum time between Pod inspection (if nothing else happens) + podScheduleTimeout = time.Minute // How long we allow the schedule to take scheduling a pod. 
+ recheckSoonPodInspectorInterval = util.Interval(time.Second) // Time between Pod inspection if we think something will change soon + maxPodInspectorInterval = util.Interval(time.Hour) // Maximum time between Pod inspection (if nothing else happens) ) // InspectPods lists all pods that belong to the given deployment and updates // the member status of the deployment accordingly. // Returns: Interval_till_next_inspection, error -func (r *Resources) InspectPods(ctx context.Context) (time.Duration, error) { +func (r *Resources) InspectPods(ctx context.Context) (util.Interval, error) { log := r.log var events []*k8sutil.Event nextInterval := maxPodInspectorInterval // Large by default, will be made smaller if needed in the rest of the function @@ -98,6 +99,7 @@ func (r *Resources) InspectPods(ctx context.Context) (time.Duration, error) { if memberStatus.Conditions.Update(api.ConditionTypeTerminated, true, "Pod Succeeded", "") { log.Debug().Str("pod-name", p.GetName()).Msg("Updating member condition Terminated to true: Pod Succeeded") updateMemberStatusNeeded = true + nextInterval = nextInterval.ReduceTo(recheckSoonPodInspectorInterval) if !wasTerminated { // Record termination time now := metav1.Now() @@ -110,6 +112,7 @@ func (r *Resources) InspectPods(ctx context.Context) (time.Duration, error) { if memberStatus.Conditions.Update(api.ConditionTypeTerminated, true, "Pod Failed", "") { log.Debug().Str("pod-name", p.GetName()).Msg("Updating member condition Terminated to true: Pod Failed") updateMemberStatusNeeded = true + nextInterval = nextInterval.ReduceTo(recheckSoonPodInspectorInterval) if !wasTerminated { // Record termination time now := metav1.Now() @@ -123,12 +126,14 @@ func (r *Resources) InspectPods(ctx context.Context) (time.Duration, error) { log.Debug().Str("pod-name", p.GetName()).Msg("Updating member condition Ready to true") memberStatus.IsInitialized = true // Require future pods for this member to have an existing UUID (in case of dbserver). updateMemberStatusNeeded = true + nextInterval = nextInterval.ReduceTo(recheckSoonPodInspectorInterval) } } else { // Pod is not ready if memberStatus.Conditions.Update(api.ConditionTypeReady, false, "Pod Not Ready", "") { log.Debug().Str("pod-name", p.GetName()).Msg("Updating member condition Ready to false") updateMemberStatusNeeded = true + nextInterval = nextInterval.ReduceTo(recheckSoonPodInspectorInterval) } } if k8sutil.IsPodNotScheduledFor(&p, podScheduleTimeout) { @@ -148,7 +153,7 @@ func (r *Resources) InspectPods(ctx context.Context) (time.Duration, error) { // Only log here, since we'll be called to try again. log.Warn().Err(err).Msg("Failed to run pod finalizers") } else { - nextInterval = util.MinDuration(nextInterval, x) + nextInterval = nextInterval.ReduceTo(x) } } if updateMemberStatusNeeded { @@ -194,6 +199,7 @@ func (r *Resources) InspectPods(ctx context.Context) (time.Duration, error) { log.Debug().Str("pod-name", podName).Msg("Pod is gone") m.Phase = api.MemberPhaseNone // This is trigger a recreate of the pod. 
// Create event + nextInterval = nextInterval.ReduceTo(recheckSoonPodInspectorInterval) events = append(events, k8sutil.NewPodGoneEvent(podName, group.AsRole(), apiObject)) updateMemberNeeded := false if m.Conditions.Update(api.ConditionTypeReady, false, "Pod Does Not Exist", "") { diff --git a/pkg/deployment/resources/pvc_finalizers.go b/pkg/deployment/resources/pvc_finalizers.go index 2c43a6c64..aa0882d49 100644 --- a/pkg/deployment/resources/pvc_finalizers.go +++ b/pkg/deployment/resources/pvc_finalizers.go @@ -32,16 +32,17 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha" + "github.com/arangodb/kube-arangodb/pkg/util" "github.com/arangodb/kube-arangodb/pkg/util/constants" "github.com/arangodb/kube-arangodb/pkg/util/k8sutil" ) const ( - recheckPVCFinalizerInterval = time.Second * 10 // Interval used when PVC finalizers need to be rechecked soon + recheckPVCFinalizerInterval = util.Interval(time.Second * 5) // Interval used when PVC finalizers need to be rechecked soon ) // runPVCFinalizers goes through the list of PVC finalizers to see if they can be removed. -func (r *Resources) runPVCFinalizers(ctx context.Context, p *v1.PersistentVolumeClaim, group api.ServerGroup, memberStatus api.MemberStatus) (time.Duration, error) { +func (r *Resources) runPVCFinalizers(ctx context.Context, p *v1.PersistentVolumeClaim, group api.ServerGroup, memberStatus api.MemberStatus) (util.Interval, error) { log := r.log.With().Str("pvc-name", p.GetName()).Logger() var removalList []string for _, f := range p.ObjectMeta.GetFinalizers() { diff --git a/pkg/deployment/resources/pvc_inspector.go b/pkg/deployment/resources/pvc_inspector.go index 16e8b2eab..3a9947490 100644 --- a/pkg/deployment/resources/pvc_inspector.go +++ b/pkg/deployment/resources/pvc_inspector.go @@ -33,12 +33,12 @@ import ( var ( inspectedPVCCounter = metrics.MustRegisterCounter("deployment", "inspected_ppvcs", "Number of PVCs inspections") - maxPVCInspectorInterval = time.Hour // Maximum time between PVC inspection (if nothing else happens) + maxPVCInspectorInterval = util.Interval(time.Hour) // Maximum time between PVC inspection (if nothing else happens) ) // InspectPVCs lists all PVCs that belong to the given deployment and updates // the member status of the deployment accordingly. -func (r *Resources) InspectPVCs(ctx context.Context) (time.Duration, error) { +func (r *Resources) InspectPVCs(ctx context.Context) (util.Interval, error) { log := r.log nextInterval := maxPVCInspectorInterval @@ -79,7 +79,7 @@ func (r *Resources) InspectPVCs(ctx context.Context) (time.Duration, error) { // Only log here, since we'll be called to try again. 
log.Warn().Err(err).Msg("Failed to run PVC finalizers") } else { - nextInterval = util.MinDuration(nextInterval, x) + nextInterval = nextInterval.ReduceTo(x) } } if updateMemberStatusNeeded { diff --git a/pkg/deployment/resources/pvcs.go b/pkg/deployment/resources/pvcs.go index 86ac90374..b6173bbfb 100644 --- a/pkg/deployment/resources/pvcs.go +++ b/pkg/deployment/resources/pvcs.go @@ -24,6 +24,7 @@ package resources import ( api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "github.com/arangodb/kube-arangodb/pkg/util/constants" "github.com/arangodb/kube-arangodb/pkg/util/k8sutil" @@ -45,15 +46,30 @@ func (r *Resources) EnsurePVCs() error { status, _ := r.context.GetStatus() enforceAntiAffinity := r.context.GetSpec().GetEnvironment().IsProduction() + pvcs := kubecli.CoreV1().PersistentVolumeClaims(ns) + list, err := pvcs.List(metav1.ListOptions{}) + if err != nil { + return maskAny(err) + } + pvcExists := func(name string) bool { + for _, pvc := range list.Items { + if pvc.GetName() == name { + return true + } + } + return false + } if err := iterator.ForeachServerGroup(func(group api.ServerGroup, spec api.ServerGroupSpec, status *api.MemberStatusList) error { for _, m := range *status { if m.PersistentVolumeClaimName != "" { - storageClassName := spec.GetStorageClassName() - role := group.AsRole() - resources := spec.Resources - finalizers := r.createPVCFinalizers(group) - if err := k8sutil.CreatePersistentVolumeClaim(kubecli, m.PersistentVolumeClaimName, deploymentName, ns, storageClassName, role, enforceAntiAffinity, resources, finalizers, owner); err != nil { - return maskAny(err) + if !pvcExists(m.PersistentVolumeClaimName) { + storageClassName := spec.GetStorageClassName() + role := group.AsRole() + resources := spec.Resources + finalizers := r.createPVCFinalizers(group) + if err := k8sutil.CreatePersistentVolumeClaim(pvcs, m.PersistentVolumeClaimName, deploymentName, ns, storageClassName, role, enforceAntiAffinity, resources, finalizers, owner); err != nil { + return maskAny(err) + } } } } diff --git a/pkg/deployment/resources/secrets.go b/pkg/deployment/resources/secrets.go index 205736a43..5d35ddb8a 100644 --- a/pkg/deployment/resources/secrets.go +++ b/pkg/deployment/resources/secrets.go @@ -34,28 +34,31 @@ import ( // EnsureSecrets creates all secrets needed to run the given deployment func (r *Resources) EnsureSecrets() error { + kubecli := r.context.GetKubeCli() + ns := r.context.GetNamespace() + secrets := k8sutil.NewSecretCache(kubecli.CoreV1().Secrets(ns)) spec := r.context.GetSpec() if spec.IsAuthenticated() { - if err := r.ensureTokenSecret(spec.Authentication.GetJWTSecretName()); err != nil { + if err := r.ensureTokenSecret(secrets, spec.Authentication.GetJWTSecretName()); err != nil { return maskAny(err) } } if spec.IsSecure() { - if err := r.ensureTLSCACertificateSecret(spec.TLS); err != nil { + if err := r.ensureTLSCACertificateSecret(secrets, spec.TLS); err != nil { return maskAny(err) } } if spec.Sync.IsEnabled() { - if err := r.ensureTokenSecret(spec.Sync.Authentication.GetJWTSecretName()); err != nil { + if err := r.ensureTokenSecret(secrets, spec.Sync.Authentication.GetJWTSecretName()); err != nil { return maskAny(err) } - if err := r.ensureTokenSecret(spec.Sync.Monitoring.GetTokenSecretName()); err != nil { + if err := r.ensureTokenSecret(secrets, spec.Sync.Monitoring.GetTokenSecretName()); err != nil { return maskAny(err) } - if err := r.ensureTLSCACertificateSecret(spec.Sync.TLS); err != nil { + 
if err := r.ensureTLSCACertificateSecret(secrets, spec.Sync.TLS); err != nil { return maskAny(err) } - if err := r.ensureClientAuthCACertificateSecret(spec.Sync.Authentication); err != nil { + if err := r.ensureClientAuthCACertificateSecret(secrets, spec.Sync.Authentication); err != nil { return maskAny(err) } } @@ -65,10 +68,8 @@ func (r *Resources) EnsureSecrets() error { // ensureTokenSecret checks if a secret with given name exists in the namespace // of the deployment. If not, it will add such a secret with a random // token. -func (r *Resources) ensureTokenSecret(secretName string) error { - kubecli := r.context.GetKubeCli() - ns := r.context.GetNamespace() - if _, err := kubecli.CoreV1().Secrets(ns).Get(secretName, metav1.GetOptions{}); k8sutil.IsNotFound(err) { +func (r *Resources) ensureTokenSecret(secrets k8sutil.SecretInterface, secretName string) error { + if _, err := secrets.Get(secretName, metav1.GetOptions{}); k8sutil.IsNotFound(err) { // Secret not found, create it // Create token tokenData := make([]byte, 32) @@ -77,7 +78,7 @@ func (r *Resources) ensureTokenSecret(secretName string) error { // Create secret owner := r.context.GetAPIObject().AsOwner() - if err := k8sutil.CreateTokenSecret(kubecli.CoreV1(), secretName, ns, token, &owner); k8sutil.IsAlreadyExists(err) { + if err := k8sutil.CreateTokenSecret(secrets, secretName, token, &owner); k8sutil.IsAlreadyExists(err) { // Secret added while we tried it also return nil } else if err != nil { @@ -93,15 +94,13 @@ func (r *Resources) ensureTokenSecret(secretName string) error { // ensureTLSCACertificateSecret checks if a secret with given name exists in the namespace // of the deployment. If not, it will add such a secret with a generated CA certificate. -func (r *Resources) ensureTLSCACertificateSecret(spec api.TLSSpec) error { - kubecli := r.context.GetKubeCli() - ns := r.context.GetNamespace() - if _, err := kubecli.CoreV1().Secrets(ns).Get(spec.GetCASecretName(), metav1.GetOptions{}); k8sutil.IsNotFound(err) { +func (r *Resources) ensureTLSCACertificateSecret(secrets k8sutil.SecretInterface, spec api.TLSSpec) error { + if _, err := secrets.Get(spec.GetCASecretName(), metav1.GetOptions{}); k8sutil.IsNotFound(err) { // Secret not found, create it apiObject := r.context.GetAPIObject() owner := apiObject.AsOwner() deploymentName := apiObject.GetName() - if err := createTLSCACertificate(r.log, kubecli.CoreV1(), spec, deploymentName, ns, &owner); k8sutil.IsAlreadyExists(err) { + if err := createTLSCACertificate(r.log, secrets, spec, deploymentName, &owner); k8sutil.IsAlreadyExists(err) { // Secret added while we tried it also return nil } else if err != nil { @@ -117,15 +116,13 @@ func (r *Resources) ensureTLSCACertificateSecret(spec api.TLSSpec) error { // ensureClientAuthCACertificateSecret checks if a secret with given name exists in the namespace // of the deployment. If not, it will add such a secret with a generated CA certificate. 
-func (r *Resources) ensureClientAuthCACertificateSecret(spec api.SyncAuthenticationSpec) error { - kubecli := r.context.GetKubeCli() - ns := r.context.GetNamespace() - if _, err := kubecli.CoreV1().Secrets(ns).Get(spec.GetClientCASecretName(), metav1.GetOptions{}); k8sutil.IsNotFound(err) { +func (r *Resources) ensureClientAuthCACertificateSecret(secrets k8sutil.SecretInterface, spec api.SyncAuthenticationSpec) error { + if _, err := secrets.Get(spec.GetClientCASecretName(), metav1.GetOptions{}); k8sutil.IsNotFound(err) { // Secret not found, create it apiObject := r.context.GetAPIObject() owner := apiObject.AsOwner() deploymentName := apiObject.GetName() - if err := createClientAuthCACertificate(r.log, kubecli.CoreV1(), spec, deploymentName, ns, &owner); k8sutil.IsAlreadyExists(err) { + if err := createClientAuthCACertificate(r.log, secrets, spec, deploymentName, &owner); k8sutil.IsAlreadyExists(err) { // Secret added while we tried it also return nil } else if err != nil { @@ -146,8 +143,9 @@ func (r *Resources) getJWTSecret(spec api.DeploymentSpec) (string, error) { } kubecli := r.context.GetKubeCli() ns := r.context.GetNamespace() + secrets := kubecli.CoreV1().Secrets(ns) secretName := spec.Authentication.GetJWTSecretName() - s, err := k8sutil.GetTokenSecret(kubecli.CoreV1(), secretName, ns) + s, err := k8sutil.GetTokenSecret(secrets, secretName) if err != nil { r.log.Debug().Err(err).Str("secret-name", secretName).Msg("Failed to get JWT secret") return "", maskAny(err) @@ -159,8 +157,9 @@ func (r *Resources) getJWTSecret(spec api.DeploymentSpec) (string, error) { func (r *Resources) getSyncJWTSecret(spec api.DeploymentSpec) (string, error) { kubecli := r.context.GetKubeCli() ns := r.context.GetNamespace() + secrets := kubecli.CoreV1().Secrets(ns) secretName := spec.Sync.Authentication.GetJWTSecretName() - s, err := k8sutil.GetTokenSecret(kubecli.CoreV1(), secretName, ns) + s, err := k8sutil.GetTokenSecret(secrets, secretName) if err != nil { r.log.Debug().Err(err).Str("secret-name", secretName).Msg("Failed to get sync JWT secret") return "", maskAny(err) @@ -172,8 +171,9 @@ func (r *Resources) getSyncJWTSecret(spec api.DeploymentSpec) (string, error) { func (r *Resources) getSyncMonitoringToken(spec api.DeploymentSpec) (string, error) { kubecli := r.context.GetKubeCli() ns := r.context.GetNamespace() + secrets := kubecli.CoreV1().Secrets(ns) secretName := spec.Sync.Monitoring.GetTokenSecretName() - s, err := k8sutil.GetTokenSecret(kubecli.CoreV1(), secretName, ns) + s, err := k8sutil.GetTokenSecret(secrets, secretName) if err != nil { r.log.Debug().Err(err).Str("secret-name", secretName).Msg("Failed to get sync monitoring secret") return "", maskAny(err) diff --git a/pkg/deployment/resources/services.go b/pkg/deployment/resources/services.go index 009b99b5c..9ed910141 100644 --- a/pkg/deployment/resources/services.go +++ b/pkg/deployment/resources/services.go @@ -40,42 +40,63 @@ func (r *Resources) EnsureServices() error { log := r.log kubecli := r.context.GetKubeCli() apiObject := r.context.GetAPIObject() + deploymentName := apiObject.GetName() ns := apiObject.GetNamespace() owner := apiObject.AsOwner() spec := r.context.GetSpec() - // Headless service - svcName, newlyCreated, err := k8sutil.CreateHeadlessService(kubecli, apiObject, owner) + // Fetch existing services + svcs := kubecli.CoreV1().Services(ns) + list, err := svcs.List(metav1.ListOptions{}) if err != nil { - log.Debug().Err(err).Msg("Failed to create headless service") + log.Debug().Err(err).Msg("Failed to list existing 
services") return maskAny(err) } - if newlyCreated { - log.Debug().Str("service", svcName).Msg("Created headless service") + svcExists := func(name string) bool { + for _, svc := range list.Items { + if svc.GetName() == name { + return true + } + } + return false + } + + // Headless service + if !svcExists(k8sutil.CreateHeadlessServiceName(deploymentName)) { + svcName, newlyCreated, err := k8sutil.CreateHeadlessService(svcs, apiObject, owner) + if err != nil { + log.Debug().Err(err).Msg("Failed to create headless service") + return maskAny(err) + } + if newlyCreated { + log.Debug().Str("service", svcName).Msg("Created headless service") + } } // Internal database client service single := spec.GetMode().HasSingleServers() - svcName, newlyCreated, err = k8sutil.CreateDatabaseClientService(kubecli, apiObject, single, owner) - if err != nil { - log.Debug().Err(err).Msg("Failed to create database client service") - return maskAny(err) - } - if newlyCreated { - log.Debug().Str("service", svcName).Msg("Created database client service") - } - { - status, lastVersion := r.context.GetStatus() - if status.ServiceName != svcName { - status.ServiceName = svcName - if err := r.context.UpdateStatus(status, lastVersion); err != nil { - return maskAny(err) + if !svcExists(k8sutil.CreateDatabaseClientServiceName(deploymentName)) { + svcName, newlyCreated, err := k8sutil.CreateDatabaseClientService(svcs, apiObject, single, owner) + if err != nil { + log.Debug().Err(err).Msg("Failed to create database client service") + return maskAny(err) + } + if newlyCreated { + log.Debug().Str("service", svcName).Msg("Created database client service") + } + { + status, lastVersion := r.context.GetStatus() + if status.ServiceName != svcName { + status.ServiceName = svcName + if err := r.context.UpdateStatus(status, lastVersion); err != nil { + return maskAny(err) + } } } } // Database external access service - eaServiceName := k8sutil.CreateDatabaseExternalAccessServiceName(apiObject.GetName()) + eaServiceName := k8sutil.CreateDatabaseExternalAccessServiceName(deploymentName) role := "coordinator" if single { role = "single" @@ -86,7 +107,7 @@ func (r *Resources) EnsureServices() error { if spec.Sync.IsEnabled() { // External (and internal) Sync master service - eaServiceName := k8sutil.CreateSyncMasterClientServiceName(apiObject.GetName()) + eaServiceName := k8sutil.CreateSyncMasterClientServiceName(deploymentName) role := "syncmaster" if err := r.ensureExternalAccessServices(eaServiceName, ns, role, "sync", k8sutil.ArangoSyncMasterPort, true, spec.Sync.ExternalAccess.ExternalAccessSpec, apiObject, log, kubecli); err != nil { return maskAny(err) @@ -108,8 +129,8 @@ func (r *Resources) ensureExternalAccessServices(eaServiceName, ns, svcRole, tit createExternalAccessService := false deleteExternalAccessService := false eaServiceType := spec.GetType().AsServiceType() // Note: Type auto defaults to ServiceTypeLoadBalancer - svcCli := kubecli.CoreV1().Services(ns) - if existing, err := svcCli.Get(eaServiceName, metav1.GetOptions{}); err == nil { + svcs := kubecli.CoreV1().Services(ns) + if existing, err := svcs.Get(eaServiceName, metav1.GetOptions{}); err == nil { // External access service exists loadBalancerIP := spec.GetLoadBalancerIP() nodePort := spec.GetNodePort() @@ -161,7 +182,7 @@ func (r *Resources) ensureExternalAccessServices(eaServiceName, ns, svcRole, tit } if deleteExternalAccessService { log.Info().Str("service", eaServiceName).Msgf("Removing obsolete %s external access service", title) - if err := 
svcCli.Delete(eaServiceName, &metav1.DeleteOptions{}); err != nil { + if err := svcs.Delete(eaServiceName, &metav1.DeleteOptions{}); err != nil { log.Debug().Err(err).Msgf("Failed to remove %s external access service", title) return maskAny(err) } @@ -170,7 +191,7 @@ func (r *Resources) ensureExternalAccessServices(eaServiceName, ns, svcRole, tit // Let's create or update the database external access service nodePort := spec.GetNodePort() loadBalancerIP := spec.GetLoadBalancerIP() - _, newlyCreated, err := k8sutil.CreateExternalAccessService(kubecli, eaServiceName, svcRole, apiObject, eaServiceType, port, nodePort, loadBalancerIP, apiObject.AsOwner()) + _, newlyCreated, err := k8sutil.CreateExternalAccessService(svcs, eaServiceName, svcRole, apiObject, eaServiceType, port, nodePort, loadBalancerIP, apiObject.AsOwner()) if err != nil { log.Debug().Err(err).Msgf("Failed to create %s external access service", title) return maskAny(err) diff --git a/pkg/replication/server_endpoint_api.go b/pkg/replication/server_endpoint_api.go index ab8f4a806..b6b026eaf 100644 --- a/pkg/replication/server_endpoint_api.go +++ b/pkg/replication/server_endpoint_api.go @@ -57,7 +57,8 @@ func (ep serverEndpoint) AuthUserSecretName() string { // TLSCACert returns a PEM encoded TLS CA certificate of the syncmaster at this endpoint func (ep serverEndpoint) TLSCACert() string { tlsCASecretName := ep.getSpec().TLS.GetCASecretName() - caCert, err := k8sutil.GetCACertficateSecret(ep.dr.deps.KubeCli.CoreV1(), tlsCASecretName, ep.dr.apiObject.GetNamespace()) + secrets := ep.dr.deps.KubeCli.CoreV1().Secrets(ep.dr.apiObject.GetNamespace()) + caCert, err := k8sutil.GetCACertficateSecret(secrets, tlsCASecretName) if err != nil { return "" } diff --git a/pkg/replication/sync_client.go b/pkg/replication/sync_client.go index df787266e..cbea9005f 100644 --- a/pkg/replication/sync_client.go +++ b/pkg/replication/sync_client.go @@ -46,6 +46,7 @@ func (dr *DeploymentReplication) createSyncMasterClient(epSpec api.EndpointSpec) } // Authentication + secrets := dr.deps.KubeCli.CoreV1().Secrets(dr.apiObject.GetNamespace()) insecureSkipVerify := true tlsAuth := tasks.TLSAuthentication{} clientAuthKeyfileSecretName, userSecretName, authJWTSecretName, tlsCASecretName, err := dr.getEndpointSecretNames(epSpec) @@ -57,18 +58,18 @@ func (dr *DeploymentReplication) createSyncMasterClient(epSpec api.EndpointSpec) jwtSecret := "" if userSecretName != "" { var err error - username, password, err = k8sutil.GetBasicAuthSecret(dr.deps.KubeCli.CoreV1(), userSecretName, dr.apiObject.GetNamespace()) + username, password, err = k8sutil.GetBasicAuthSecret(secrets, userSecretName) if err != nil { return nil, maskAny(err) } } else if authJWTSecretName != "" { var err error - jwtSecret, err = k8sutil.GetTokenSecret(dr.deps.KubeCli.CoreV1(), authJWTSecretName, dr.apiObject.GetNamespace()) + jwtSecret, err = k8sutil.GetTokenSecret(secrets, authJWTSecretName) if err != nil { return nil, maskAny(err) } } else if clientAuthKeyfileSecretName != "" { - keyFileContent, err := k8sutil.GetTLSKeyfileSecret(dr.deps.KubeCli.CoreV1(), clientAuthKeyfileSecretName, dr.apiObject.GetNamespace()) + keyFileContent, err := k8sutil.GetTLSKeyfileSecret(secrets, clientAuthKeyfileSecretName) if err != nil { return nil, maskAny(err) } @@ -82,7 +83,7 @@ func (dr *DeploymentReplication) createSyncMasterClient(epSpec api.EndpointSpec) } } if tlsCASecretName != "" { - caCert, err := k8sutil.GetCACertficateSecret(dr.deps.KubeCli.CoreV1(), tlsCASecretName, dr.apiObject.GetNamespace()) + 
caCert, err := k8sutil.GetCACertficateSecret(secrets, tlsCASecretName) if err != nil { return nil, maskAny(err) } @@ -126,7 +127,8 @@ func (dr *DeploymentReplication) createArangoSyncTLSAuthentication(spec api.Depl } // Fetch keyfile - keyFileContent, err := k8sutil.GetTLSKeyfileSecret(dr.deps.KubeCli.CoreV1(), clientAuthKeyfileSecretName, dr.apiObject.GetNamespace()) + secrets := dr.deps.KubeCli.CoreV1().Secrets(dr.apiObject.GetNamespace()) + keyFileContent, err := k8sutil.GetTLSKeyfileSecret(secrets, clientAuthKeyfileSecretName) if err != nil { return client.TLSAuthentication{}, maskAny(err) } @@ -136,7 +138,7 @@ func (dr *DeploymentReplication) createArangoSyncTLSAuthentication(spec api.Depl } // Fetch TLS CA certificate for source - caCert, err := k8sutil.GetCACertficateSecret(dr.deps.KubeCli.CoreV1(), tlsCASecretName, dr.apiObject.GetNamespace()) + caCert, err := k8sutil.GetCACertficateSecret(secrets, tlsCASecretName) if err != nil { return client.TLSAuthentication{}, maskAny(err) } diff --git a/pkg/util/arangod/client.go b/pkg/util/arangod/client.go index b7cb50600..406a14cdf 100644 --- a/pkg/util/arangod/client.go +++ b/pkg/util/arangod/client.go @@ -238,7 +238,8 @@ func createArangodClientAuthentication(ctx context.Context, cli corev1.CoreV1Int // Authentication is enabled. // Should we skip using it? if ctx.Value(skipAuthenticationKey{}) == nil { - s, err := k8sutil.GetTokenSecret(cli, apiObject.Spec.Authentication.GetJWTSecretName(), apiObject.GetNamespace()) + secrets := cli.Secrets(apiObject.GetNamespace()) + s, err := k8sutil.GetTokenSecret(secrets, apiObject.Spec.Authentication.GetJWTSecretName()) if err != nil { return nil, maskAny(err) } diff --git a/pkg/util/interval.go b/pkg/util/interval.go new file mode 100644 index 000000000..de549de10 --- /dev/null +++ b/pkg/util/interval.go @@ -0,0 +1,66 @@ +// +// DISCLAIMER +// +// Copyright 2018 ArangoDB GmbH, Cologne, Germany +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright holder is ArangoDB GmbH, Cologne, Germany +// +// Author Ewout Prangsma +// + +package util + +import ( + "time" +) + +// Interval is a specialization of Duration so we can add some +// helper functions to that. +type Interval time.Duration + +func (i Interval) String() string { + return time.Duration(i).String() +} + +// ReduceTo returns an interval that is equal to min(x, i). +func (i Interval) ReduceTo(x Interval) Interval { + if i < x { + return i + } + return x +} + +// IncreaseTo returns an interval that is equal to max(x, i). +func (i Interval) IncreaseTo(x Interval) Interval { + if i > x { + return i + } + return x +} + +// Backoff returns an interval that is equal to min(i*factor, maxInt). +func (i Interval) Backoff(factor float64, maxInt Interval) Interval { + i = Interval(float64(i) * factor) + if i < maxInt { + return i + } + return maxInt +} + +// After waits for the interval to elapse and then sends the current time +// on the returned channel. 
+func (i Interval) After() <-chan time.Time { + return time.After(time.Duration(i)) +} diff --git a/pkg/util/k8sutil/pvc.go b/pkg/util/k8sutil/pvc.go index 2bfc36d5a..d0981bbd4 100644 --- a/pkg/util/k8sutil/pvc.go +++ b/pkg/util/k8sutil/pvc.go @@ -27,7 +27,7 @@ import ( "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/client-go/kubernetes" + corev1 "k8s.io/client-go/kubernetes/typed/core/v1" "github.com/arangodb/kube-arangodb/pkg/util/constants" ) @@ -46,7 +46,7 @@ func CreatePersistentVolumeClaimName(deploymentName, role, id string) string { // CreatePersistentVolumeClaim creates a persistent volume claim with given name and configuration. // If the pvc already exists, nil is returned. // If another error occurs, that error is returned. -func CreatePersistentVolumeClaim(kubecli kubernetes.Interface, pvcName, deploymentName, ns, storageClassName, role string, enforceAntiAffinity bool, resources v1.ResourceRequirements, finalizers []string, owner metav1.OwnerReference) error { +func CreatePersistentVolumeClaim(pvcs corev1.PersistentVolumeClaimInterface, pvcName, deploymentName, ns, storageClassName, role string, enforceAntiAffinity bool, resources v1.ResourceRequirements, finalizers []string, owner metav1.OwnerReference) error { labels := LabelsForDeployment(deploymentName, role) volumeMode := v1.PersistentVolumeFilesystem pvc := &v1.PersistentVolumeClaim{ @@ -70,7 +70,7 @@ func CreatePersistentVolumeClaim(kubecli kubernetes.Interface, pvcName, deployme pvc.Spec.StorageClassName = &storageClassName } addOwnerRefToObject(pvc.GetObjectMeta(), &owner) - if _, err := kubecli.CoreV1().PersistentVolumeClaims(ns).Create(pvc); err != nil && !IsAlreadyExists(err) { + if _, err := pvcs.Create(pvc); err != nil && !IsAlreadyExists(err) { return maskAny(err) } return nil diff --git a/pkg/util/k8sutil/secrets.go b/pkg/util/k8sutil/secrets.go index a2db083b7..6c32f837a 100644 --- a/pkg/util/k8sutil/secrets.go +++ b/pkg/util/k8sutil/secrets.go @@ -27,15 +27,20 @@ import ( "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - corev1 "k8s.io/client-go/kubernetes/typed/core/v1" "github.com/arangodb/kube-arangodb/pkg/util/constants" ) +// SecretInterface has methods to work with Secret resources. +type SecretInterface interface { + Create(*v1.Secret) (*v1.Secret, error) + Get(name string, options metav1.GetOptions) (*v1.Secret, error) +} + // ValidateEncryptionKeySecret checks that a secret with given name in given namespace // exists and it contains a 'key' data field of exactly 32 bytes. -func ValidateEncryptionKeySecret(cli corev1.CoreV1Interface, secretName, namespace string) error { - s, err := cli.Secrets(namespace).Get(secretName, metav1.GetOptions{}) +func ValidateEncryptionKeySecret(secrets SecretInterface, secretName string) error { + s, err := secrets.Get(secretName, metav1.GetOptions{}) if err != nil { return maskAny(err) } @@ -51,7 +56,7 @@ func ValidateEncryptionKeySecret(cli corev1.CoreV1Interface, secretName, namespa } // CreateEncryptionKeySecret creates a secret used to store a RocksDB encryption key. 
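Usage note for the Interval type added in pkg/util/interval.go above: it is meant to let a reconciliation loop slow down while nothing changes and speed up again on activity. The sketch below is illustrative only; the caller, the inspectOnce helper and the chosen bounds are assumptions and not part of this patch.

    package example

    import (
        "time"

        "github.com/arangodb/kube-arangodb/pkg/util"
    )

    // inspectOnce stands in for a real inspection pass; it reports whether
    // anything changed. It is an assumption of this sketch, not operator code.
    func inspectOnce() bool { return false }

    // runLoop backs off while nothing changes and resets to a short interval
    // as soon as an inspection reports a change.
    func runLoop(stopCh <-chan struct{}) {
        interval := util.Interval(time.Second)
        maxInterval := util.Interval(time.Minute)
        for {
            if inspectOnce() {
                interval = util.Interval(time.Second)
            } else {
                interval = interval.Backoff(1.5, maxInterval)
            }
            select {
            case <-interval.After():
                // run the next inspection
            case <-stopCh:
                return
            }
        }
    }
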
-func CreateEncryptionKeySecret(cli corev1.CoreV1Interface, secretName, namespace string, key []byte) error { +func CreateEncryptionKeySecret(secrets SecretInterface, secretName string, key []byte) error { if len(key) != 32 { return maskAny(fmt.Errorf("Key in secret '%s' is expected to be 32 bytes long, got %d", secretName, len(key))) } @@ -64,7 +69,7 @@ func CreateEncryptionKeySecret(cli corev1.CoreV1Interface, secretName, namespace constants.SecretEncryptionKey: key, }, } - if _, err := cli.Secrets(namespace).Create(secret); err != nil { + if _, err := secrets.Create(secret); err != nil { // Failed to create secret return maskAny(err) } @@ -73,8 +78,8 @@ func CreateEncryptionKeySecret(cli corev1.CoreV1Interface, secretName, namespace // ValidateCACertificateSecret checks that a secret with given name in given namespace // exists and it contains a 'ca.crt' data field. -func ValidateCACertificateSecret(cli corev1.CoreV1Interface, secretName, namespace string) error { - s, err := cli.Secrets(namespace).Get(secretName, metav1.GetOptions{}) +func ValidateCACertificateSecret(secrets SecretInterface, secretName string) error { + s, err := secrets.Get(secretName, metav1.GetOptions{}) if err != nil { return maskAny(err) } @@ -91,8 +96,8 @@ func ValidateCACertificateSecret(cli corev1.CoreV1Interface, secretName, namespa // If the secret does not exists the field is missing, // an error is returned. // Returns: certificate, error -func GetCACertficateSecret(cli corev1.CoreV1Interface, secretName, namespace string) (string, error) { - s, err := cli.Secrets(namespace).Get(secretName, metav1.GetOptions{}) +func GetCACertficateSecret(secrets SecretInterface, secretName string) (string, error) { + s, err := secrets.Get(secretName, metav1.GetOptions{}) if err != nil { return "", maskAny(err) } @@ -109,8 +114,8 @@ func GetCACertficateSecret(cli corev1.CoreV1Interface, secretName, namespace str // If the secret does not exists or one of the fields is missing, // an error is returned. // Returns: certificate, private-key, isOwnedByDeployment, error -func GetCASecret(cli corev1.CoreV1Interface, secretName, namespace string, ownerRef *metav1.OwnerReference) (string, string, bool, error) { - s, err := cli.Secrets(namespace).Get(secretName, metav1.GetOptions{}) +func GetCASecret(secrets SecretInterface, secretName string, ownerRef *metav1.OwnerReference) (string, string, bool, error) { + s, err := secrets.Get(secretName, metav1.GetOptions{}) if err != nil { return "", "", false, maskAny(err) } @@ -136,7 +141,7 @@ func GetCASecret(cli corev1.CoreV1Interface, secretName, namespace string, owner } // CreateCASecret creates a secret used to store a PEM encoded CA certificate & private key. 
-func CreateCASecret(cli corev1.CoreV1Interface, secretName, namespace string, certificate, key string, ownerRef *metav1.OwnerReference) error { +func CreateCASecret(secrets SecretInterface, secretName string, certificate, key string, ownerRef *metav1.OwnerReference) error { // Create secret secret := &v1.Secret{ ObjectMeta: metav1.ObjectMeta{ @@ -149,7 +154,7 @@ func CreateCASecret(cli corev1.CoreV1Interface, secretName, namespace string, ce } // Attach secret to owner addOwnerRefToObject(secret, ownerRef) - if _, err := cli.Secrets(namespace).Create(secret); err != nil { + if _, err := secrets.Create(secret); err != nil { // Failed to create secret return maskAny(err) } @@ -159,8 +164,8 @@ func CreateCASecret(cli corev1.CoreV1Interface, secretName, namespace string, ce // GetTLSKeyfileSecret loads a secret used to store a PEM encoded keyfile // in the format ArangoDB accepts it for its `--ssl.keyfile` option. // Returns: keyfile (pem encoded), error -func GetTLSKeyfileSecret(cli corev1.CoreV1Interface, secretName, namespace string) (string, error) { - s, err := cli.Secrets(namespace).Get(secretName, metav1.GetOptions{}) +func GetTLSKeyfileSecret(secrets SecretInterface, secretName string) (string, error) { + s, err := secrets.Get(secretName, metav1.GetOptions{}) if err != nil { return "", maskAny(err) } @@ -174,7 +179,7 @@ func GetTLSKeyfileSecret(cli corev1.CoreV1Interface, secretName, namespace strin // CreateTLSKeyfileSecret creates a secret used to store a PEM encoded keyfile // in the format ArangoDB accepts it for its `--ssl.keyfile` option. -func CreateTLSKeyfileSecret(cli corev1.CoreV1Interface, secretName, namespace string, keyfile string, ownerRef *metav1.OwnerReference) error { +func CreateTLSKeyfileSecret(secrets SecretInterface, secretName string, keyfile string, ownerRef *metav1.OwnerReference) error { // Create secret secret := &v1.Secret{ ObjectMeta: metav1.ObjectMeta{ @@ -186,7 +191,7 @@ func CreateTLSKeyfileSecret(cli corev1.CoreV1Interface, secretName, namespace st } // Attach secret to owner addOwnerRefToObject(secret, ownerRef) - if _, err := cli.Secrets(namespace).Create(secret); err != nil { + if _, err := secrets.Create(secret); err != nil { // Failed to create secret return maskAny(err) } @@ -195,8 +200,8 @@ func CreateTLSKeyfileSecret(cli corev1.CoreV1Interface, secretName, namespace st // ValidateTokenSecret checks that a secret with given name in given namespace // exists and it contains a 'token' data field. -func ValidateTokenSecret(cli corev1.CoreV1Interface, secretName, namespace string) error { - s, err := cli.Secrets(namespace).Get(secretName, metav1.GetOptions{}) +func ValidateTokenSecret(secrets SecretInterface, secretName string) error { + s, err := secrets.Get(secretName, metav1.GetOptions{}) if err != nil { return maskAny(err) } @@ -209,8 +214,8 @@ func ValidateTokenSecret(cli corev1.CoreV1Interface, secretName, namespace strin } // GetTokenSecret loads the token secret from a Secret with given name. -func GetTokenSecret(cli corev1.CoreV1Interface, secretName, namespace string) (string, error) { - s, err := cli.Secrets(namespace).Get(secretName, metav1.GetOptions{}) +func GetTokenSecret(secrets SecretInterface, secretName string) (string, error) { + s, err := secrets.Get(secretName, metav1.GetOptions{}) if err != nil { return "", maskAny(err) } @@ -224,7 +229,7 @@ func GetTokenSecret(cli corev1.CoreV1Interface, secretName, namespace string) (s // CreateTokenSecret creates a secret with given name in given namespace // with a given token as value. 
-func CreateTokenSecret(cli corev1.CoreV1Interface, secretName, namespace, token string, ownerRef *metav1.OwnerReference) error { +func CreateTokenSecret(secrets SecretInterface, secretName, token string, ownerRef *metav1.OwnerReference) error { // Create secret secret := &v1.Secret{ ObjectMeta: metav1.ObjectMeta{ @@ -236,7 +241,7 @@ func CreateTokenSecret(cli corev1.CoreV1Interface, secretName, namespace, token } // Attach secret to owner addOwnerRefToObject(secret, ownerRef) - if _, err := cli.Secrets(namespace).Create(secret); err != nil { + if _, err := secrets.Create(secret); err != nil { // Failed to create secret return maskAny(err) } @@ -248,8 +253,8 @@ func CreateTokenSecret(cli corev1.CoreV1Interface, secretName, namespace, token // If the secret does not exists or one of the fields is missing, // an error is returned. // Returns: username, password, error -func GetBasicAuthSecret(cli corev1.CoreV1Interface, secretName, namespace string) (string, string, error) { - s, err := cli.Secrets(namespace).Get(secretName, metav1.GetOptions{}) +func GetBasicAuthSecret(secrets SecretInterface, secretName string) (string, string, error) { + s, err := secrets.Get(secretName, metav1.GetOptions{}) if err != nil { return "", "", maskAny(err) } diff --git a/pkg/util/k8sutil/secrets_cache.go b/pkg/util/k8sutil/secrets_cache.go new file mode 100644 index 000000000..0cf4ac909 --- /dev/null +++ b/pkg/util/k8sutil/secrets_cache.go @@ -0,0 +1,75 @@ +// +// DISCLAIMER +// +// Copyright 2018 ArangoDB GmbH, Cologne, Germany +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright holder is ArangoDB GmbH, Cologne, Germany +// +// Author Ewout Prangsma +// + +package k8sutil + +import ( + "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime/schema" + corev1 "k8s.io/client-go/kubernetes/typed/core/v1" +) + +// secretsCache implements a cached version of a SecretInterface. +// It is NOT go-routine safe. +type secretsCache struct { + cli corev1.SecretInterface + cache []v1.Secret +} + +// NewSecretCache creates a cached version of the given SecretInterface. 
+func NewSecretCache(cli corev1.SecretInterface) SecretInterface { + return &secretsCache{cli: cli} +} + +var ( + secretGroupResource = schema.GroupResource{ + Group: v1.GroupName, + Resource: "Secret", + } +) + +func (sc *secretsCache) Create(s *v1.Secret) (*v1.Secret, error) { + sc.cache = nil + result, err := sc.cli.Create(s) + if err != nil { + return nil, maskAny(err) + } + return result, nil +} + +func (sc *secretsCache) Get(name string, options metav1.GetOptions) (*v1.Secret, error) { + if sc.cache == nil { + list, err := sc.cli.List(metav1.ListOptions{}) + if err != nil { + return nil, maskAny(err) + } + sc.cache = list.Items + } + for _, s := range sc.cache { + if s.GetName() == name { + return &s, nil + } + } + return nil, maskAny(apierrors.NewNotFound(secretGroupResource, name)) +} diff --git a/pkg/util/k8sutil/secrets_test.go b/pkg/util/k8sutil/secrets_test.go index 476974ab3..febca7c09 100644 --- a/pkg/util/k8sutil/secrets_test.go +++ b/pkg/util/k8sutil/secrets_test.go @@ -39,9 +39,10 @@ import ( // TestValidateEncryptionKeySecret tests ValidateEncryptionKeySecret. func TestValidateEncryptionKeySecret(t *testing.T) { cli := mocks.NewCore() + secrets := cli.Secrets("ns") // Prepare mock - m := mocks.AsMock(cli.Secrets("ns")) + m := mocks.AsMock(secrets) m.On("Get", "good", mock.Anything).Return(&v1.Secret{ Data: map[string][]byte{ "key": make([]byte, 32), @@ -59,15 +60,16 @@ func TestValidateEncryptionKeySecret(t *testing.T) { }, nil) m.On("Get", "notfound", mock.Anything).Return(nil, apierrors.NewNotFound(schema.GroupResource{}, "notfound")) - assert.NoError(t, ValidateEncryptionKeySecret(cli, "good", "ns")) - assert.Error(t, ValidateEncryptionKeySecret(cli, "no-key", "ns")) - assert.Error(t, ValidateEncryptionKeySecret(cli, "short-key", "ns")) - assert.True(t, IsNotFound(ValidateEncryptionKeySecret(cli, "notfound", "ns"))) + assert.NoError(t, ValidateEncryptionKeySecret(secrets, "good")) + assert.Error(t, ValidateEncryptionKeySecret(secrets, "no-key")) + assert.Error(t, ValidateEncryptionKeySecret(secrets, "short-key")) + assert.True(t, IsNotFound(ValidateEncryptionKeySecret(secrets, "notfound"))) } // TestCreateEncryptionKeySecret tests CreateEncryptionKeySecret func TestCreateEncryptionKeySecret(t *testing.T) { cli := mocks.NewCore() + secrets := cli.Secrets("ns") // Prepare mock m := mocks.AsMock(cli.Secrets("ns")) @@ -81,14 +83,15 @@ func TestCreateEncryptionKeySecret(t *testing.T) { }).Return(nil, nil) key := make([]byte, 32) - assert.NoError(t, CreateEncryptionKeySecret(cli, "good", "ns", key)) + assert.NoError(t, CreateEncryptionKeySecret(secrets, "good", key)) key = make([]byte, 31) - assert.Error(t, CreateEncryptionKeySecret(cli, "short-key", "ns", key)) + assert.Error(t, CreateEncryptionKeySecret(secrets, "short-key", key)) } // TestGetTokenSecret tests GetTokenSecret. 
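Usage note for pkg/util/k8sutil/secrets_cache.go above: the cache lists secrets once and serves later Get calls from memory, and it is not go-routine safe, so it should not outlive a single inspection pass. A hedged sketch of a consumer; the namespace, secret name and surrounding function are placeholders.

    package example

    import (
        "fmt"

        "github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
        "k8s.io/client-go/kubernetes"
    )

    // loadJWT reads a JWT token secret through the cache, so repeated secret
    // lookups within the same inspection pass are served from one List call.
    func loadJWT(kubecli kubernetes.Interface, ns, secretName string) (string, error) {
        // Not go-routine safe: keep the cache scoped to a single pass.
        secrets := k8sutil.NewSecretCache(kubecli.CoreV1().Secrets(ns))
        token, err := k8sutil.GetTokenSecret(secrets, secretName)
        if err != nil {
            return "", fmt.Errorf("reading token secret %s: %v", secretName, err)
        }
        return token, nil
    }
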
func TestGetTokenSecret(t *testing.T) { cli := mocks.NewCore() + secrets := cli.Secrets("ns") // Prepare mock m := mocks.AsMock(cli.Secrets("ns")) @@ -104,18 +107,19 @@ func TestGetTokenSecret(t *testing.T) { }, nil) m.On("Get", "notfound", mock.Anything).Return(nil, apierrors.NewNotFound(schema.GroupResource{}, "notfound")) - token, err := GetTokenSecret(cli, "good", "ns") + token, err := GetTokenSecret(secrets, "good") assert.NoError(t, err) assert.Equal(t, token, "foo") - _, err = GetTokenSecret(cli, "no-token", "ns") + _, err = GetTokenSecret(secrets, "no-token") assert.Error(t, err) - _, err = GetTokenSecret(cli, "notfound", "ns") + _, err = GetTokenSecret(secrets, "notfound") assert.True(t, IsNotFound(err)) } // TestCreateTokenSecret tests CreateTokenSecret func TestCreateTokenSecret(t *testing.T) { cli := mocks.NewCore() + secrets := cli.Secrets("ns") // Prepare mock m := mocks.AsMock(cli.Secrets("ns")) @@ -130,6 +134,6 @@ func TestCreateTokenSecret(t *testing.T) { } }).Return(nil, nil) - assert.NoError(t, CreateTokenSecret(cli, "good", "ns", "token", nil)) - assert.NoError(t, CreateTokenSecret(cli, "with-owner", "ns", "token", &metav1.OwnerReference{})) + assert.NoError(t, CreateTokenSecret(secrets, "good", "token", nil)) + assert.NoError(t, CreateTokenSecret(secrets, "with-owner", "token", &metav1.OwnerReference{})) } diff --git a/pkg/util/k8sutil/services.go b/pkg/util/k8sutil/services.go index 8d603ae50..4b4cc7795 100644 --- a/pkg/util/k8sutil/services.go +++ b/pkg/util/k8sutil/services.go @@ -31,7 +31,7 @@ import ( "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/client-go/kubernetes" + corev1 "k8s.io/client-go/kubernetes/typed/core/v1" ) // CreateHeadlessServiceName returns the name of the headless service for the given @@ -63,7 +63,7 @@ func CreateSyncMasterClientServiceName(deploymentName string) string { // If the service already exists, nil is returned. // If another error occurs, that error is returned. // The returned bool is true if the service is created, or false when the service already existed. -func CreateHeadlessService(kubecli kubernetes.Interface, deployment metav1.Object, owner metav1.OwnerReference) (string, bool, error) { +func CreateHeadlessService(svcs corev1.ServiceInterface, deployment metav1.Object, owner metav1.OwnerReference) (string, bool, error) { deploymentName := deployment.GetName() svcName := CreateHeadlessServiceName(deploymentName) ports := []v1.ServicePort{ @@ -75,7 +75,7 @@ func CreateHeadlessService(kubecli kubernetes.Interface, deployment metav1.Objec } publishNotReadyAddresses := true serviceType := v1.ServiceTypeClusterIP - newlyCreated, err := createService(kubecli, svcName, deploymentName, deployment.GetNamespace(), ClusterIPNone, "", serviceType, ports, "", publishNotReadyAddresses, owner) + newlyCreated, err := createService(svcs, svcName, deploymentName, deployment.GetNamespace(), ClusterIPNone, "", serviceType, ports, "", publishNotReadyAddresses, owner) if err != nil { return "", false, maskAny(err) } @@ -86,7 +86,7 @@ func CreateHeadlessService(kubecli kubernetes.Interface, deployment metav1.Objec // If the service already exists, nil is returned. // If another error occurs, that error is returned. // The returned bool is true if the service is created, or false when the service already existed. 
-func CreateDatabaseClientService(kubecli kubernetes.Interface, deployment metav1.Object, single bool, owner metav1.OwnerReference) (string, bool, error) { +func CreateDatabaseClientService(svcs corev1.ServiceInterface, deployment metav1.Object, single bool, owner metav1.OwnerReference) (string, bool, error) { deploymentName := deployment.GetName() svcName := CreateDatabaseClientServiceName(deploymentName) ports := []v1.ServicePort{ @@ -104,7 +104,7 @@ func CreateDatabaseClientService(kubecli kubernetes.Interface, deployment metav1 } serviceType := v1.ServiceTypeClusterIP publishNotReadyAddresses := false - newlyCreated, err := createService(kubecli, svcName, deploymentName, deployment.GetNamespace(), "", role, serviceType, ports, "", publishNotReadyAddresses, owner) + newlyCreated, err := createService(svcs, svcName, deploymentName, deployment.GetNamespace(), "", role, serviceType, ports, "", publishNotReadyAddresses, owner) if err != nil { return "", false, maskAny(err) } @@ -115,7 +115,7 @@ func CreateDatabaseClientService(kubecli kubernetes.Interface, deployment metav1 // If the service already exists, nil is returned. // If another error occurs, that error is returned. // The returned bool is true if the service is created, or false when the service already existed. -func CreateExternalAccessService(kubecli kubernetes.Interface, svcName, role string, deployment metav1.Object, serviceType v1.ServiceType, port, nodePort int, loadBalancerIP string, owner metav1.OwnerReference) (string, bool, error) { +func CreateExternalAccessService(svcs corev1.ServiceInterface, svcName, role string, deployment metav1.Object, serviceType v1.ServiceType, port, nodePort int, loadBalancerIP string, owner metav1.OwnerReference) (string, bool, error) { deploymentName := deployment.GetName() ports := []v1.ServicePort{ v1.ServicePort{ @@ -126,7 +126,7 @@ func CreateExternalAccessService(kubecli kubernetes.Interface, svcName, role str }, } publishNotReadyAddresses := false - newlyCreated, err := createService(kubecli, svcName, deploymentName, deployment.GetNamespace(), "", role, serviceType, ports, loadBalancerIP, publishNotReadyAddresses, owner) + newlyCreated, err := createService(svcs, svcName, deploymentName, deployment.GetNamespace(), "", role, serviceType, ports, loadBalancerIP, publishNotReadyAddresses, owner) if err != nil { return "", false, maskAny(err) } @@ -137,7 +137,7 @@ func CreateExternalAccessService(kubecli kubernetes.Interface, svcName, role str // If the service already exists, nil is returned. // If another error occurs, that error is returned. // The returned bool is true if the service is created, or false when the service already existed. 
-func createService(kubecli kubernetes.Interface, svcName, deploymentName, ns, clusterIP, role string, serviceType v1.ServiceType, +func createService(svcs corev1.ServiceInterface, svcName, deploymentName, ns, clusterIP, role string, serviceType v1.ServiceType, ports []v1.ServicePort, loadBalancerIP string, publishNotReadyAddresses bool, owner metav1.OwnerReference) (bool, error) { labels := LabelsForDeployment(deploymentName, role) svc := &v1.Service{ @@ -161,7 +161,7 @@ func createService(kubecli kubernetes.Interface, svcName, deploymentName, ns, cl }, } addOwnerRefToObject(svc.GetObjectMeta(), &owner) - if _, err := kubecli.CoreV1().Services(ns).Create(svc); IsAlreadyExists(err) { + if _, err := svcs.Create(svc); IsAlreadyExists(err) { return false, nil } else if err != nil { return false, maskAny(err) diff --git a/pkg/util/duration.go b/pkg/util/profiler/profiler.go similarity index 52% rename from pkg/util/duration.go rename to pkg/util/profiler/profiler.go index 38e0442b9..5053c02a3 100644 --- a/pkg/util/duration.go +++ b/pkg/util/profiler/profiler.go @@ -20,24 +20,32 @@ // Author Ewout Prangsma // -package util +package profiler import ( "time" + + "github.com/rs/zerolog" ) -// MaxDuration returns the largest of the given durations -func MaxDuration(a, b time.Duration) time.Duration { - if a > b { - return a - } - return b +// Session is a single timed action +type Session time.Time + +// Start a profiling session +func Start() Session { + return Session(time.Now()) +} + +// Done with a profiling session, log when time is "long" +func (t Session) Done(log zerolog.Logger, msg string) { + t.LogIf(log, time.Second/4, msg) } -// MinDuration returns the smallest of the given durations -func MinDuration(a, b time.Duration) time.Duration { - if a < b { - return a +// LogIf logs the time taken since the start of the session, if that is longer +// than the given minimum duration. 
+func (t Session) LogIf(log zerolog.Logger, minLen time.Duration, msg string) { + interval := time.Since(time.Time(t)) + if interval > minLen { + log.Debug().Str("time-taken", interval.String()).Msg("profiler: " + msg) } - return b } diff --git a/tests/auth_test.go b/tests/auth_test.go index 3003f4bc6..1df3db37b 100644 --- a/tests/auth_test.go +++ b/tests/auth_test.go @@ -92,6 +92,7 @@ func TestAuthenticationSingleCustomSecret(t *testing.T) { c := client.MustNewInCluster() kubecli := mustNewKubeClient(t) ns := getNamespace(t) + secrets := kubecli.CoreV1().Secrets(ns) // Prepare deployment config depl := newDeployment("test-auth-sng-cst-" + uniuri.NewLen(4)) @@ -100,7 +101,7 @@ func TestAuthenticationSingleCustomSecret(t *testing.T) { depl.Spec.SetDefaults(depl.GetName()) // Create secret - if err := k8sutil.CreateTokenSecret(kubecli.CoreV1(), depl.Spec.Authentication.GetJWTSecretName(), ns, "foo", nil); err != nil { + if err := k8sutil.CreateTokenSecret(secrets, depl.Spec.Authentication.GetJWTSecretName(), "foo", nil); err != nil { t.Fatalf("Create JWT secret failed: %v", err) } @@ -231,6 +232,7 @@ func TestAuthenticationClusterCustomSecret(t *testing.T) { c := client.MustNewInCluster() kubecli := mustNewKubeClient(t) ns := getNamespace(t) + secrets := kubecli.CoreV1().Secrets(ns) // Prepare deployment config depl := newDeployment("test-auth-cls-cst-" + uniuri.NewLen(4)) @@ -239,7 +241,7 @@ func TestAuthenticationClusterCustomSecret(t *testing.T) { depl.Spec.SetDefaults(depl.GetName()) // Create secret - if err := k8sutil.CreateTokenSecret(kubecli.CoreV1(), depl.Spec.Authentication.GetJWTSecretName(), ns, "foo", nil); err != nil { + if err := k8sutil.CreateTokenSecret(secrets, depl.Spec.Authentication.GetJWTSecretName(), "foo", nil); err != nil { t.Fatalf("Create JWT secret failed: %v", err) } diff --git a/tests/rocksdb_encryption_test.go b/tests/rocksdb_encryption_test.go index 5dc482211..7d40597cb 100644 --- a/tests/rocksdb_encryption_test.go +++ b/tests/rocksdb_encryption_test.go @@ -46,6 +46,7 @@ func TestRocksDBEncryptionSingle(t *testing.T) { c := client.MustNewInCluster() kubecli := mustNewKubeClient(t) ns := getNamespace(t) + secrets := kubecli.CoreV1().Secrets(ns) // Prepull enterprise images assert.NoError(t, prepullArangoImage(kubecli, image, ns)) @@ -60,7 +61,7 @@ func TestRocksDBEncryptionSingle(t *testing.T) { // Create encryption key secret key := make([]byte, 32) rand.Read(key) - if err := k8sutil.CreateEncryptionKeySecret(kubecli.CoreV1(), depl.Spec.RocksDB.Encryption.GetKeySecretName(), ns, key); err != nil { + if err := k8sutil.CreateEncryptionKeySecret(secrets, depl.Spec.RocksDB.Encryption.GetKeySecretName(), key); err != nil { t.Fatalf("Create encryption key secret failed: %v", err) } diff --git a/tests/test_util.go b/tests/test_util.go index 37dbb3ac9..efbb26e93 100644 --- a/tests/test_util.go +++ b/tests/test_util.go @@ -198,8 +198,9 @@ func mustNewArangodDatabaseClient(ctx context.Context, kubecli kubernetes.Interf // as endpoint. It is failing the test on errors. 
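Usage note for pkg/util/profiler/profiler.go above: a Session is started before a potentially slow step and only logged when that step exceeds a threshold. A small sketch of the intended call pattern; the function body and threshold are assumptions of this example.

    package example

    import (
        "time"

        "github.com/arangodb/kube-arangodb/pkg/util/profiler"
        "github.com/rs/zerolog"
    )

    // inspectSomething shows the intended call pattern: start a session up
    // front and log only when the step turned out to be slow.
    func inspectSomething(log zerolog.Logger) {
        ps := profiler.Start()
        defer ps.LogIf(log, time.Second/2, "inspectSomething")

        time.Sleep(10 * time.Millisecond) // placeholder for real work
    }
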
func mustNewArangoSyncClient(ctx context.Context, kubecli kubernetes.Interface, apiObject *api.ArangoDeployment, t *testing.T) client.API { ns := apiObject.GetNamespace() + secrets := kubecli.CoreV1().Secrets(ns) secretName := apiObject.Spec.Sync.Authentication.GetJWTSecretName() - jwtToken, err := k8sutil.GetTokenSecret(kubecli.CoreV1(), secretName, ns) + jwtToken, err := k8sutil.GetTokenSecret(secrets, secretName) if err != nil { t.Fatalf("Failed to get sync jwt secret '%s': %s", secretName, err) } From 38b91d3b03c8638942cbc0300daac543650d8f4f Mon Sep 17 00:00:00 2001 From: Ewout Prangsma Date: Fri, 31 Aug 2018 08:01:49 +0200 Subject: [PATCH 11/17] Added Service & PVC cache for faster inspection loops --- pkg/deployment/resources/pvcs.go | 16 +----- pkg/deployment/resources/services.go | 27 ++------- pkg/util/k8sutil/pvc.go | 9 ++- pkg/util/k8sutil/pvc_cache.go | 75 +++++++++++++++++++++++++ pkg/util/k8sutil/services.go | 16 ++++-- pkg/util/k8sutil/services_cache.go | 83 ++++++++++++++++++++++++++++ 6 files changed, 184 insertions(+), 42 deletions(-) create mode 100644 pkg/util/k8sutil/pvc_cache.go create mode 100644 pkg/util/k8sutil/services_cache.go diff --git a/pkg/deployment/resources/pvcs.go b/pkg/deployment/resources/pvcs.go index b6173bbfb..6483b03a2 100644 --- a/pkg/deployment/resources/pvcs.go +++ b/pkg/deployment/resources/pvcs.go @@ -46,23 +46,11 @@ func (r *Resources) EnsurePVCs() error { status, _ := r.context.GetStatus() enforceAntiAffinity := r.context.GetSpec().GetEnvironment().IsProduction() - pvcs := kubecli.CoreV1().PersistentVolumeClaims(ns) - list, err := pvcs.List(metav1.ListOptions{}) - if err != nil { - return maskAny(err) - } - pvcExists := func(name string) bool { - for _, pvc := range list.Items { - if pvc.GetName() == name { - return true - } - } - return false - } + pvcs := k8sutil.NewPersistentVolumeClaimCache(kubecli.CoreV1().PersistentVolumeClaims(ns)) if err := iterator.ForeachServerGroup(func(group api.ServerGroup, spec api.ServerGroupSpec, status *api.MemberStatusList) error { for _, m := range *status { if m.PersistentVolumeClaimName != "" { - if !pvcExists(m.PersistentVolumeClaimName) { + if _, err := pvcs.Get(m.PersistentVolumeClaimName, metav1.GetOptions{}); err != nil { storageClassName := spec.GetStorageClassName() role := group.AsRole() resources := spec.Resources diff --git a/pkg/deployment/resources/services.go b/pkg/deployment/resources/services.go index 9ed910141..f9de217f8 100644 --- a/pkg/deployment/resources/services.go +++ b/pkg/deployment/resources/services.go @@ -46,23 +46,9 @@ func (r *Resources) EnsureServices() error { spec := r.context.GetSpec() // Fetch existing services - svcs := kubecli.CoreV1().Services(ns) - list, err := svcs.List(metav1.ListOptions{}) - if err != nil { - log.Debug().Err(err).Msg("Failed to list existing services") - return maskAny(err) - } - svcExists := func(name string) bool { - for _, svc := range list.Items { - if svc.GetName() == name { - return true - } - } - return false - } - + svcs := k8sutil.NewServiceCache(kubecli.CoreV1().Services(ns)) // Headless service - if !svcExists(k8sutil.CreateHeadlessServiceName(deploymentName)) { + if _, err := svcs.Get(k8sutil.CreateHeadlessServiceName(deploymentName), metav1.GetOptions{}); err != nil { svcName, newlyCreated, err := k8sutil.CreateHeadlessService(svcs, apiObject, owner) if err != nil { log.Debug().Err(err).Msg("Failed to create headless service") @@ -75,7 +61,7 @@ func (r *Resources) EnsureServices() error { // Internal database client service single := 
spec.GetMode().HasSingleServers() - if !svcExists(k8sutil.CreateDatabaseClientServiceName(deploymentName)) { + if _, err := svcs.Get(k8sutil.CreateDatabaseClientServiceName(deploymentName), metav1.GetOptions{}); err != nil { svcName, newlyCreated, err := k8sutil.CreateDatabaseClientService(svcs, apiObject, single, owner) if err != nil { log.Debug().Err(err).Msg("Failed to create database client service") @@ -101,7 +87,7 @@ func (r *Resources) EnsureServices() error { if single { role = "single" } - if err := r.ensureExternalAccessServices(eaServiceName, ns, role, "database", k8sutil.ArangoPort, false, spec.ExternalAccess, apiObject, log, kubecli); err != nil { + if err := r.ensureExternalAccessServices(svcs, eaServiceName, ns, role, "database", k8sutil.ArangoPort, false, spec.ExternalAccess, apiObject, log, kubecli); err != nil { return maskAny(err) } @@ -109,7 +95,7 @@ func (r *Resources) EnsureServices() error { // External (and internal) Sync master service eaServiceName := k8sutil.CreateSyncMasterClientServiceName(deploymentName) role := "syncmaster" - if err := r.ensureExternalAccessServices(eaServiceName, ns, role, "sync", k8sutil.ArangoSyncMasterPort, true, spec.Sync.ExternalAccess.ExternalAccessSpec, apiObject, log, kubecli); err != nil { + if err := r.ensureExternalAccessServices(svcs, eaServiceName, ns, role, "sync", k8sutil.ArangoSyncMasterPort, true, spec.Sync.ExternalAccess.ExternalAccessSpec, apiObject, log, kubecli); err != nil { return maskAny(err) } status, lastVersion := r.context.GetStatus() @@ -124,12 +110,11 @@ func (r *Resources) EnsureServices() error { } // EnsureServices creates all services needed to service the deployment -func (r *Resources) ensureExternalAccessServices(eaServiceName, ns, svcRole, title string, port int, noneIsClusterIP bool, spec api.ExternalAccessSpec, apiObject k8sutil.APIObject, log zerolog.Logger, kubecli kubernetes.Interface) error { +func (r *Resources) ensureExternalAccessServices(svcs k8sutil.ServiceInterface, eaServiceName, ns, svcRole, title string, port int, noneIsClusterIP bool, spec api.ExternalAccessSpec, apiObject k8sutil.APIObject, log zerolog.Logger, kubecli kubernetes.Interface) error { // Database external access service createExternalAccessService := false deleteExternalAccessService := false eaServiceType := spec.GetType().AsServiceType() // Note: Type auto defaults to ServiceTypeLoadBalancer - svcs := kubecli.CoreV1().Services(ns) if existing, err := svcs.Get(eaServiceName, metav1.GetOptions{}); err == nil { // External access service exists loadBalancerIP := spec.GetLoadBalancerIP() diff --git a/pkg/util/k8sutil/pvc.go b/pkg/util/k8sutil/pvc.go index d0981bbd4..b175e83d8 100644 --- a/pkg/util/k8sutil/pvc.go +++ b/pkg/util/k8sutil/pvc.go @@ -27,11 +27,16 @@ import ( "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - corev1 "k8s.io/client-go/kubernetes/typed/core/v1" "github.com/arangodb/kube-arangodb/pkg/util/constants" ) +// PersistentVolumeClaimInterface has methods to work with PersistentVolumeClaim resources. +type PersistentVolumeClaimInterface interface { + Create(*v1.PersistentVolumeClaim) (*v1.PersistentVolumeClaim, error) + Get(name string, options metav1.GetOptions) (*v1.PersistentVolumeClaim, error) +} + // IsPersistentVolumeClaimMarkedForDeletion returns true if the pod has been marked for deletion. 
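Usage note for the EnsureServices changes above: existence is now decided by a Get against the ServiceInterface introduced in this patch (which may be backed by the list cache), and creation only happens on a miss. The helper below isolates that check-then-create step; the helper itself is hypothetical and only mirrors the pattern used in EnsureServices.

    package example

    import (
        metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

        "github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
    )

    // ensureHeadlessService checks for the service first; CreateHeadlessService
    // already tolerates AlreadyExists, so the Get mainly saves an API round trip.
    func ensureHeadlessService(svcs k8sutil.ServiceInterface, deployment metav1.Object, owner metav1.OwnerReference) error {
        name := k8sutil.CreateHeadlessServiceName(deployment.GetName())
        if _, err := svcs.Get(name, metav1.GetOptions{}); err == nil {
            return nil // service already exists
        }
        if _, _, err := k8sutil.CreateHeadlessService(svcs, deployment, owner); err != nil {
            return err
        }
        return nil
    }
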
func IsPersistentVolumeClaimMarkedForDeletion(pvc *v1.PersistentVolumeClaim) bool { return pvc.DeletionTimestamp != nil @@ -46,7 +51,7 @@ func CreatePersistentVolumeClaimName(deploymentName, role, id string) string { // CreatePersistentVolumeClaim creates a persistent volume claim with given name and configuration. // If the pvc already exists, nil is returned. // If another error occurs, that error is returned. -func CreatePersistentVolumeClaim(pvcs corev1.PersistentVolumeClaimInterface, pvcName, deploymentName, ns, storageClassName, role string, enforceAntiAffinity bool, resources v1.ResourceRequirements, finalizers []string, owner metav1.OwnerReference) error { +func CreatePersistentVolumeClaim(pvcs PersistentVolumeClaimInterface, pvcName, deploymentName, ns, storageClassName, role string, enforceAntiAffinity bool, resources v1.ResourceRequirements, finalizers []string, owner metav1.OwnerReference) error { labels := LabelsForDeployment(deploymentName, role) volumeMode := v1.PersistentVolumeFilesystem pvc := &v1.PersistentVolumeClaim{ diff --git a/pkg/util/k8sutil/pvc_cache.go b/pkg/util/k8sutil/pvc_cache.go new file mode 100644 index 000000000..6ef772b48 --- /dev/null +++ b/pkg/util/k8sutil/pvc_cache.go @@ -0,0 +1,75 @@ +// +// DISCLAIMER +// +// Copyright 2018 ArangoDB GmbH, Cologne, Germany +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright holder is ArangoDB GmbH, Cologne, Germany +// +// Author Ewout Prangsma +// + +package k8sutil + +import ( + "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime/schema" + corev1 "k8s.io/client-go/kubernetes/typed/core/v1" +) + +// pvcsCache implements a cached version of a PersistentVolumeClaimInterface. +// It is NOT go-routine safe. +type pvcsCache struct { + cli corev1.PersistentVolumeClaimInterface + cache []v1.PersistentVolumeClaim +} + +// NewPersistentVolumeClaimCache creates a cached version of the given PersistentVolumeClaimInterface. 
+func NewPersistentVolumeClaimCache(cli corev1.PersistentVolumeClaimInterface) PersistentVolumeClaimInterface { + return &pvcsCache{cli: cli} +} + +var ( + pvcGroupResource = schema.GroupResource{ + Group: v1.GroupName, + Resource: "PersistentVolumeClaim", + } +) + +func (sc *pvcsCache) Create(s *v1.PersistentVolumeClaim) (*v1.PersistentVolumeClaim, error) { + sc.cache = nil + result, err := sc.cli.Create(s) + if err != nil { + return nil, maskAny(err) + } + return result, nil +} + +func (sc *pvcsCache) Get(name string, options metav1.GetOptions) (*v1.PersistentVolumeClaim, error) { + if sc.cache == nil { + list, err := sc.cli.List(metav1.ListOptions{}) + if err != nil { + return nil, maskAny(err) + } + sc.cache = list.Items + } + for _, s := range sc.cache { + if s.GetName() == name { + return &s, nil + } + } + return nil, maskAny(apierrors.NewNotFound(pvcGroupResource, name)) +} diff --git a/pkg/util/k8sutil/services.go b/pkg/util/k8sutil/services.go index 4b4cc7795..bc07ef086 100644 --- a/pkg/util/k8sutil/services.go +++ b/pkg/util/k8sutil/services.go @@ -31,9 +31,15 @@ import ( "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - corev1 "k8s.io/client-go/kubernetes/typed/core/v1" ) +// ServiceInterface has methods to work with Service resources. +type ServiceInterface interface { + Create(*v1.Service) (*v1.Service, error) + Delete(name string, options *metav1.DeleteOptions) error + Get(name string, options metav1.GetOptions) (*v1.Service, error) +} + // CreateHeadlessServiceName returns the name of the headless service for the given // deployment name. func CreateHeadlessServiceName(deploymentName string) string { @@ -63,7 +69,7 @@ func CreateSyncMasterClientServiceName(deploymentName string) string { // If the service already exists, nil is returned. // If another error occurs, that error is returned. // The returned bool is true if the service is created, or false when the service already existed. -func CreateHeadlessService(svcs corev1.ServiceInterface, deployment metav1.Object, owner metav1.OwnerReference) (string, bool, error) { +func CreateHeadlessService(svcs ServiceInterface, deployment metav1.Object, owner metav1.OwnerReference) (string, bool, error) { deploymentName := deployment.GetName() svcName := CreateHeadlessServiceName(deploymentName) ports := []v1.ServicePort{ @@ -86,7 +92,7 @@ func CreateHeadlessService(svcs corev1.ServiceInterface, deployment metav1.Objec // If the service already exists, nil is returned. // If another error occurs, that error is returned. // The returned bool is true if the service is created, or false when the service already existed. -func CreateDatabaseClientService(svcs corev1.ServiceInterface, deployment metav1.Object, single bool, owner metav1.OwnerReference) (string, bool, error) { +func CreateDatabaseClientService(svcs ServiceInterface, deployment metav1.Object, single bool, owner metav1.OwnerReference) (string, bool, error) { deploymentName := deployment.GetName() svcName := CreateDatabaseClientServiceName(deploymentName) ports := []v1.ServicePort{ @@ -115,7 +121,7 @@ func CreateDatabaseClientService(svcs corev1.ServiceInterface, deployment metav1 // If the service already exists, nil is returned. // If another error occurs, that error is returned. // The returned bool is true if the service is created, or false when the service already existed. 
-func CreateExternalAccessService(svcs corev1.ServiceInterface, svcName, role string, deployment metav1.Object, serviceType v1.ServiceType, port, nodePort int, loadBalancerIP string, owner metav1.OwnerReference) (string, bool, error) { +func CreateExternalAccessService(svcs ServiceInterface, svcName, role string, deployment metav1.Object, serviceType v1.ServiceType, port, nodePort int, loadBalancerIP string, owner metav1.OwnerReference) (string, bool, error) { deploymentName := deployment.GetName() ports := []v1.ServicePort{ v1.ServicePort{ @@ -137,7 +143,7 @@ func CreateExternalAccessService(svcs corev1.ServiceInterface, svcName, role str // If the service already exists, nil is returned. // If another error occurs, that error is returned. // The returned bool is true if the service is created, or false when the service already existed. -func createService(svcs corev1.ServiceInterface, svcName, deploymentName, ns, clusterIP, role string, serviceType v1.ServiceType, +func createService(svcs ServiceInterface, svcName, deploymentName, ns, clusterIP, role string, serviceType v1.ServiceType, ports []v1.ServicePort, loadBalancerIP string, publishNotReadyAddresses bool, owner metav1.OwnerReference) (bool, error) { labels := LabelsForDeployment(deploymentName, role) svc := &v1.Service{ diff --git a/pkg/util/k8sutil/services_cache.go b/pkg/util/k8sutil/services_cache.go new file mode 100644 index 000000000..c7993a460 --- /dev/null +++ b/pkg/util/k8sutil/services_cache.go @@ -0,0 +1,83 @@ +// +// DISCLAIMER +// +// Copyright 2018 ArangoDB GmbH, Cologne, Germany +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright holder is ArangoDB GmbH, Cologne, Germany +// +// Author Ewout Prangsma +// + +package k8sutil + +import ( + "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime/schema" + corev1 "k8s.io/client-go/kubernetes/typed/core/v1" +) + +// servicesCache implements a cached version of a ServiceInterface. +// It is NOT go-routine safe. +type servicesCache struct { + cli corev1.ServiceInterface + cache []v1.Service +} + +// NewServiceCache creates a cached version of the given ServiceInterface. 
+func NewServiceCache(cli corev1.ServiceInterface) ServiceInterface { + return &servicesCache{cli: cli} +} + +var ( + serviceGroupResource = schema.GroupResource{ + Group: v1.GroupName, + Resource: "Service", + } +) + +func (sc *servicesCache) Create(s *v1.Service) (*v1.Service, error) { + sc.cache = nil + result, err := sc.cli.Create(s) + if err != nil { + return nil, maskAny(err) + } + return result, nil +} + +func (sc *servicesCache) Delete(name string, options *metav1.DeleteOptions) error { + sc.cache = nil + if err := sc.cli.Delete(name, options); err != nil { + return maskAny(err) + } + return nil +} + +func (sc *servicesCache) Get(name string, options metav1.GetOptions) (*v1.Service, error) { + if sc.cache == nil { + list, err := sc.cli.List(metav1.ListOptions{}) + if err != nil { + return nil, maskAny(err) + } + sc.cache = list.Items + } + for _, s := range sc.cache { + if s.GetName() == name { + return &s, nil + } + } + return nil, maskAny(apierrors.NewNotFound(serviceGroupResource, name)) +} From 56466c46317986461d1b0408a407f40b5a35f1c5 Mon Sep 17 00:00:00 2001 From: Ewout Prangsma Date: Fri, 31 Aug 2018 12:17:36 +0200 Subject: [PATCH 12/17] Fetch cluster health in go-routine --- pkg/deployment/deployment.go | 1 + pkg/deployment/resources/deployment_health.go | 89 ++++++++ pkg/deployment/resources/member_cleanup.go | 33 +-- pkg/deployment/resources/resources.go | 13 +- pkg/util/errors/errors.go | 203 ++++++++++++++++++ 5 files changed, 323 insertions(+), 16 deletions(-) create mode 100644 pkg/deployment/resources/deployment_health.go create mode 100644 pkg/util/errors/errors.go diff --git a/pkg/deployment/deployment.go b/pkg/deployment/deployment.go index 5b6da6d04..21e1e8c15 100644 --- a/pkg/deployment/deployment.go +++ b/pkg/deployment/deployment.go @@ -141,6 +141,7 @@ func New(config Config, deps Dependencies, apiObject *api.ArangoDeployment) (*De ci := newClusterScalingIntegration(d) d.clusterScalingIntegration = ci go ci.ListenForClusterEvents(d.stopCh) + go d.resources.RunDeploymentHealthLoop(d.stopCh) } if config.AllowChaos { d.chaosMonkey = chaos.NewMonkey(deps.Log, d) diff --git a/pkg/deployment/resources/deployment_health.go b/pkg/deployment/resources/deployment_health.go new file mode 100644 index 000000000..577db83b1 --- /dev/null +++ b/pkg/deployment/resources/deployment_health.go @@ -0,0 +1,89 @@ +// +// DISCLAIMER +// +// Copyright 2018 ArangoDB GmbH, Cologne, Germany +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +// Copyright holder is ArangoDB GmbH, Cologne, Germany +// +// Author Ewout Prangsma +// + +package resources + +import ( + "context" + "time" + + api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha" + "github.com/arangodb/kube-arangodb/pkg/metrics" +) + +var ( + fetchDeploymentHealthCounters = metrics.MustRegisterCounterVec("deployment_resources", "fetchDeploymentHealth", "Number of times the health of the deployment was fetched", "deployment", "result") +) + +// RunDeploymentHealthLoop creates a loop to fetch the health of the deployment. +// The loop ends when the given channel is closed. +func (r *Resources) RunDeploymentHealthLoop(stopCh <-chan struct{}) { + log := r.log + deploymentName := r.context.GetAPIObject().GetName() + + if r.context.GetSpec().GetMode() != api.DeploymentModeCluster { + // Deployment health is currently only applicable for clusters + return + } + + for { + if err := r.fetchDeploymentHealth(); err != nil { + log.Debug().Err(err).Msg("Failed to fetch deployment health") + fetchDeploymentHealthCounters.WithLabelValues(deploymentName, "failed").Inc() + } else { + fetchDeploymentHealthCounters.WithLabelValues(deploymentName, "success").Inc() + } + select { + case <-time.After(time.Second * 5): + // Continue + case <-stopCh: + // We're done + return + } + } +} + +// cleanupRemovedClusterMembers removes all arangod members that are no longer part of the cluster. +func (r *Resources) fetchDeploymentHealth() error { + // Ask cluster for its health + ctx, cancel := context.WithTimeout(context.Background(), time.Second*15) + defer cancel() + client, err := r.context.GetDatabaseClient(ctx) + if err != nil { + return maskAny(err) + } + c, err := client.Cluster(ctx) + if err != nil { + return maskAny(err) + } + h, err := c.Health(ctx) + if err != nil { + return maskAny(err) + } + + // Save cluster health + r.health.mutex.Lock() + defer r.health.mutex.Unlock() + r.health.clusterHealth = h + r.health.timestamp = time.Now() + return nil +} diff --git a/pkg/deployment/resources/member_cleanup.go b/pkg/deployment/resources/member_cleanup.go index 87b4ef34a..4cbb859d3 100644 --- a/pkg/deployment/resources/member_cleanup.go +++ b/pkg/deployment/resources/member_cleanup.go @@ -23,18 +23,23 @@ package resources import ( - "context" "time" driver "github.com/arangodb/go-driver" api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha" + "github.com/arangodb/kube-arangodb/pkg/metrics" "github.com/arangodb/kube-arangodb/pkg/util/k8sutil" ) const ( // minMemberAge is the minimum duration we expect a member to be created before we remove it because // it is not part of a deployment. - minMemberAge = time.Minute * 10 + minMemberAge = time.Minute * 10 + maxClusterHealthAge = time.Second * 20 +) + +var ( + cleanupRemovedMembersCounters = metrics.MustRegisterCounterVec("deployment_resources", "cleanupRemovedMembers", "Number of cleanup-removed-members actions", "deployment", "result") ) // CleanupRemovedMembers removes all arangod members that are no longer part of ArangoDB deployment. 
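Usage note for pkg/deployment/resources/deployment_health.go above: cluster health is fetched in its own goroutine and stored with a timestamp, and consumers such as cleanupRemovedClusterMembers only trust a value that is recent enough. A generic sketch of that pattern follows; the string value, fetch function and the 5s/20s intervals are stand-ins for driver.ClusterHealth and the real constants.

    package example

    import (
        "sync"
        "time"
    )

    // healthCache illustrates the pattern: one goroutine refreshes a value on a
    // fixed interval, consumers only trust it while it is fresh.
    type healthCache struct {
        mu        sync.Mutex
        value     string
        timestamp time.Time
    }

    // run keeps the cache up to date until stopCh is closed.
    func (c *healthCache) run(stopCh <-chan struct{}, fetch func() (string, error)) {
        for {
            if v, err := fetch(); err == nil {
                c.mu.Lock()
                c.value, c.timestamp = v, time.Now()
                c.mu.Unlock()
            }
            select {
            case <-time.After(5 * time.Second):
                // refresh again
            case <-stopCh:
                return
            }
        }
    }

    // recent returns the cached value only when it is younger than maxAge,
    // mirroring the maxClusterHealthAge check in cleanupRemovedClusterMembers.
    func (c *healthCache) recent(maxAge time.Duration) (string, bool) {
        c.mu.Lock()
        defer c.mu.Unlock()
        if time.Since(c.timestamp) > maxAge {
            return "", false
        }
        return c.value, true
    }
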
@@ -43,8 +48,10 @@ func (r *Resources) CleanupRemovedMembers() error { switch r.context.GetSpec().GetMode() { case api.DeploymentModeCluster: if err := r.cleanupRemovedClusterMembers(); err != nil { + cleanupRemovedMembersCounters.WithLabelValues(r.context.GetAPIObject().GetName(), "failed").Inc() return maskAny(err) } + cleanupRemovedMembersCounters.WithLabelValues(r.context.GetAPIObject().GetName(), "success").Inc() return nil default: // Other mode have no concept of cluster in which members can be removed @@ -55,20 +62,16 @@ func (r *Resources) CleanupRemovedMembers() error { // cleanupRemovedClusterMembers removes all arangod members that are no longer part of the cluster. func (r *Resources) cleanupRemovedClusterMembers() error { log := r.log - ctx := context.Background() - // Ask cluster for its health - client, err := r.context.GetDatabaseClient(ctx) - if err != nil { - return maskAny(err) - } - c, err := client.Cluster(ctx) - if err != nil { - return maskAny(err) - } - h, err := c.Health(ctx) - if err != nil { - return maskAny(err) + // Fetch recent cluster health + r.health.mutex.Lock() + h := r.health.clusterHealth + ts := r.health.timestamp + r.health.mutex.Unlock() + + // Only accept recent cluster health values + if time.Since(ts) > maxClusterHealthAge { + return nil } serverFound := func(id string) bool { diff --git a/pkg/deployment/resources/resources.go b/pkg/deployment/resources/resources.go index a77ad9559..7a4aee2b4 100644 --- a/pkg/deployment/resources/resources.go +++ b/pkg/deployment/resources/resources.go @@ -22,13 +22,24 @@ package resources -import "github.com/rs/zerolog" +import ( + "sync" + "time" + + driver "github.com/arangodb/go-driver" + "github.com/rs/zerolog" +) // Resources is a service that creates low level resources for members // and inspects low level resources, put the inspection result in members. type Resources struct { log zerolog.Logger context Context + health struct { + clusterHealth driver.ClusterHealth // Last fetched cluster health + timestamp time.Time // Timestamp of last fetch of cluster health + mutex sync.Mutex // Mutex guarding fields in this struct + } } // NewResources creates a new Resources service, used to diff --git a/pkg/util/errors/errors.go b/pkg/util/errors/errors.go new file mode 100644 index 000000000..6ec185bcc --- /dev/null +++ b/pkg/util/errors/errors.go @@ -0,0 +1,203 @@ +// +// DISCLAIMER +// +// Copyright 2018 ArangoDB GmbH, Cologne, Germany +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright holder is ArangoDB GmbH, Cologne, Germany +// +// Author Ewout Prangsma +// + +package errors + +import ( + "context" + "fmt" + "io" + "net" + "net/url" + "os" + "syscall" + + driver "github.com/arangodb/go-driver" + errs "github.com/pkg/errors" +) + +var ( + Cause = errs.Cause + New = errs.New + WithStack = errs.WithStack + Wrap = errs.Wrap + Wrapf = errs.Wrapf +) + +// WithMessage annotates err with a new message. +// The messages of given error is hidden. +// If err is nil, WithMessage returns nil. 
+func WithMessage(err error, message string) error { + if err == nil { + return nil + } + return &withMessage{ + cause: err, + msg: message, + } +} + +type withMessage struct { + cause error + msg string +} + +func (w *withMessage) Error() string { return w.msg } +func (w *withMessage) Cause() error { return w.cause } + +func (w *withMessage) Format(s fmt.State, verb rune) { + switch verb { + case 'v': + if s.Flag('+') { + fmt.Fprintf(s, "%+v\n", w.Cause()) + io.WriteString(s, w.msg) + return + } + fallthrough + case 's', 'q': + io.WriteString(s, w.Error()) + } +} + +type timeout interface { + Timeout() bool +} + +// IsTimeout returns true if the given error is caused by a timeout error. +func IsTimeout(err error) bool { + if err == nil { + return false + } + if t, ok := errs.Cause(err).(timeout); ok { + return t.Timeout() + } + return false +} + +type temporary interface { + Temporary() bool +} + +// IsTemporary returns true if the given error is caused by a temporary error. +func IsTemporary(err error) bool { + if err == nil { + return false + } + if t, ok := errs.Cause(err).(temporary); ok { + return t.Temporary() + } + return false +} + +// IsEOF returns true if the given error is caused by an EOF error. +func IsEOF(err error) bool { + err = errs.Cause(err) + if err == io.EOF { + return true + } + if ok, err := libCause(err); ok { + return IsEOF(err) + } + return false +} + +// IsConnectionRefused returns true if the given error is caused by an "connection refused" error. +func IsConnectionRefused(err error) bool { + err = errs.Cause(err) + if err, ok := err.(syscall.Errno); ok { + return err == syscall.ECONNREFUSED + } + if ok, err := libCause(err); ok { + return IsConnectionRefused(err) + } + return false +} + +// IsConnectionReset returns true if the given error is caused by an "connection reset by peer" error. +func IsConnectionReset(err error) bool { + err = errs.Cause(err) + if err, ok := err.(syscall.Errno); ok { + return err == syscall.ECONNRESET + } + if ok, err := libCause(err); ok { + return IsConnectionReset(err) + } + return false +} + +// IsContextCanceled returns true if the given error is caused by a context cancelation. +func IsContextCanceled(err error) bool { + err = errs.Cause(err) + if err == context.Canceled { + return true + } + if ok, err := libCause(err); ok { + return IsContextCanceled(err) + } + return false +} + +// IsContextDeadlineExpired returns true if the given error is caused by a context deadline expiration. +func IsContextDeadlineExpired(err error) bool { + err = errs.Cause(err) + if err == context.DeadlineExceeded { + return true + } + if ok, err := libCause(err); ok { + return IsContextDeadlineExpired(err) + } + return false +} + +// IsContextCanceledOrExpired returns true if the given error is caused by a context cancelation +// or deadline expiration. +func IsContextCanceledOrExpired(err error) bool { + err = errs.Cause(err) + if err == context.Canceled || err == context.DeadlineExceeded { + return true + } + if ok, err := libCause(err); ok { + return IsContextCanceledOrExpired(err) + } + return false +} + +// libCause returns the Cause of well known go library errors. 
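Usage note for pkg/util/errors/errors.go above: the Is* helpers unwrap well-known library errors so callers can decide whether a failure is worth retrying. A hedged sketch of a typical consumer; the retry budget and sleep are assumptions of this example, not operator behavior.

    package example

    import (
        "time"

        "github.com/arangodb/kube-arangodb/pkg/util/errors"
    )

    // callWithRetry retries an operation while the failure looks transient and
    // gives up otherwise.
    func callWithRetry(op func() error) error {
        for attempt := 0; attempt < 5; attempt++ {
            err := op()
            if err == nil {
                return nil
            }
            if errors.IsConnectionRefused(err) || errors.IsConnectionReset(err) || errors.IsTimeout(err) || errors.IsTemporary(err) {
                time.Sleep(time.Second) // transient; try again
                continue
            }
            return errors.Wrap(err, "non-transient failure")
        }
        return errors.New("operation kept failing with transient errors")
    }
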
+func libCause(err error) (bool, error) { + original := err + for { + switch e := err.(type) { + case *driver.ResponseError: + err = e.Err + case *net.DNSConfigError: + err = e.Err + case *net.OpError: + err = e.Err + case *os.SyscallError: + err = e.Err + case *url.Error: + err = e.Err + default: + return err != original, err + } + } +} From 0cc2d5dbba9c78eff90c71af2dc4a9b730d032ac Mon Sep 17 00:00:00 2001 From: Ewout Prangsma Date: Fri, 31 Aug 2018 16:08:21 +0200 Subject: [PATCH 13/17] Adding various metrics --- examples/metrics/dashboard.json | 558 ++++++++++++++++++ .../deployment-operator-servicemonitor.yaml | 24 +- pkg/deployment/deployment_inspector.go | 66 +-- pkg/deployment/metrics.go | 28 + pkg/deployment/resources/deployment_health.go | 6 +- pkg/deployment/resources/member_cleanup.go | 7 +- pkg/deployment/resources/metrics.go | 28 + pkg/deployment/resources/pod_inspector.go | 10 +- pkg/deployment/resources/pvc_inspector.go | 11 +- pkg/deployment/resources/secrets.go | 18 + pkg/deployment/resources/services.go | 21 +- pkg/metrics/metrics.go | 21 +- 12 files changed, 724 insertions(+), 74 deletions(-) create mode 100644 examples/metrics/dashboard.json create mode 100644 pkg/deployment/metrics.go create mode 100644 pkg/deployment/resources/metrics.go diff --git a/examples/metrics/dashboard.json b/examples/metrics/dashboard.json new file mode 100644 index 000000000..c0e8c9c48 --- /dev/null +++ b/examples/metrics/dashboard.json @@ -0,0 +1,558 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": 10, + "links": [], + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "$$hashKey": "object:1274", + "expr": "sum(arangodb_operator_controller_deployments)", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "count", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "#Deployments", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:2219", + "decimals": 0, + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:2220", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + 
"lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "$$hashKey": "object:1274", + "expr": "arangodb_operator_deployment_inspect_deployment_duration", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{deployment}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Deployment Inspection Duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:2154", + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:2155", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 9 + }, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "$$hashKey": "object:1441", + "expr": "sum(increase(arangodb_operator_deployment_resources_deployment_health_fetches[5m])) by (result)", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{result}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Fetch deployment health [5m]", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 9 + }, + "id": 5, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "$$hashKey": "object:1441", + "expr": "sum(increase(arangodb_operator_deployment_resources_cleanupRemovedMembers[5m])) by (result)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{result}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Cleanup remove members [5m]", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": 
null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 16 + }, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "$$hashKey": "object:1274", + "expr": "sum(increase(arangodb_operator_deployment_resources_inspected_pods[5m]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Pod inspections [5m]", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 16 + }, + "id": 7, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "$$hashKey": "object:1274", + "expr": "arangodb_operator_deployment_resources_inspect_pods_duration", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{pod}}, {{deployment}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Pod inspection Duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1930", + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:1931", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "refresh": false, + "schemaVersion": 16, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Kube-ArangoDB", + "uid": 
"pJUJQd2ik", + "version": 10 + } \ No newline at end of file diff --git a/examples/metrics/deployment-operator-servicemonitor.yaml b/examples/metrics/deployment-operator-servicemonitor.yaml index 128bc3f35..e13f55394 100644 --- a/examples/metrics/deployment-operator-servicemonitor.yaml +++ b/examples/metrics/deployment-operator-servicemonitor.yaml @@ -1,34 +1,22 @@ # This example shows how to integrate with the Prometheus Operator # to bring metrics from kube-arangodb to Prometheus. -apiVersion: v1 -kind: Service -metadata: - name: arango-deployment-operator - labels: - app: arango-deployment-operator -spec: - selector: - app: arango-deployment-operator - ports: - - name: metrics - port: 8528 - ---- - apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: name: arango-deployment-operator + namespace: monitoring labels: - team: frontend + prometheus: kube-prometheus spec: selector: matchLabels: app: arango-deployment-operator + namespaceSelector: + matchNames: + - default endpoints: - - port: metrics + - port: server scheme: https tlsConfig: insecureSkipVerify: true - diff --git a/pkg/deployment/deployment_inspector.go b/pkg/deployment/deployment_inspector.go index 19a8f5b4f..3d323cb85 100644 --- a/pkg/deployment/deployment_inspector.go +++ b/pkg/deployment/deployment_inspector.go @@ -27,12 +27,17 @@ import ( "time" api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha" + "github.com/arangodb/kube-arangodb/pkg/metrics" "github.com/arangodb/kube-arangodb/pkg/util" "github.com/arangodb/kube-arangodb/pkg/util/k8sutil" "github.com/arangodb/kube-arangodb/pkg/util/profiler" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) +var ( + inspectDeploymentDurationGauges = metrics.MustRegisterGaugeVec(metricsComponent, "inspect_deployment_duration", "Amount of time taken by a single inspection of a deployment (in sec)", metrics.DeploymentName) +) + // inspectDeployment inspects the entire deployment, creates // a plan to update if needed and inspects underlying resources. // This function should be called when: @@ -42,13 +47,16 @@ import ( // Returns the delay until this function should be called again. 
func (d *Deployment) inspectDeployment(lastInterval util.Interval) util.Interval { log := d.deps.Log + start := time.Now() nextInterval := lastInterval hasError := false ctx := context.Background() + deploymentName := d.apiObject.GetName() + defer metrics.SetDuration(inspectDeploymentDurationGauges.WithLabelValues(deploymentName), start) // Check deployment still exists - updated, err := d.deps.DatabaseCRCli.DatabaseV1alpha().ArangoDeployments(d.apiObject.GetNamespace()).Get(d.apiObject.GetName(), metav1.GetOptions{}) + updated, err := d.deps.DatabaseCRCli.DatabaseV1alpha().ArangoDeployments(d.apiObject.GetNamespace()).Get(deploymentName, metav1.GetOptions{}) if k8sutil.IsNotFound(err) { // Deployment is gone log.Info().Msg("Deployment is gone") @@ -129,47 +137,27 @@ func (d *Deployment) inspectDeployment(lastInterval util.Interval) util.Interval } // Ensure all resources are created - { - ps := profiler.Start() - { - ps := profiler.Start() - if err := d.resources.EnsureSecrets(); err != nil { - hasError = true - d.CreateEvent(k8sutil.NewErrorEvent("Secret creation failed", err, d.apiObject)) - } - ps.LogIf(log, time.Millisecond*10, "EnsureSecrets") - } - { - ps := profiler.Start() - if err := d.resources.EnsureServices(); err != nil { - hasError = true - d.CreateEvent(k8sutil.NewErrorEvent("Service creation failed", err, d.apiObject)) - } - ps.LogIf(log, time.Millisecond*10, "EnsureServices") - } - if err := d.resources.EnsurePVCs(); err != nil { - hasError = true - d.CreateEvent(k8sutil.NewErrorEvent("PVC creation failed", err, d.apiObject)) - } - { - ps := profiler.Start() - if err := d.resources.EnsurePods(); err != nil { - hasError = true - d.CreateEvent(k8sutil.NewErrorEvent("Pod creation failed", err, d.apiObject)) - } - ps.LogIf(log, time.Millisecond*10, "EnsurePods") - } - ps.Done(log, "ensure resources") + if err := d.resources.EnsureSecrets(); err != nil { + hasError = true + d.CreateEvent(k8sutil.NewErrorEvent("Secret creation failed", err, d.apiObject)) + } + if err := d.resources.EnsureServices(); err != nil { + hasError = true + d.CreateEvent(k8sutil.NewErrorEvent("Service creation failed", err, d.apiObject)) + } + if err := d.resources.EnsurePVCs(); err != nil { + hasError = true + d.CreateEvent(k8sutil.NewErrorEvent("PVC creation failed", err, d.apiObject)) + } + if err := d.resources.EnsurePods(); err != nil { + hasError = true + d.CreateEvent(k8sutil.NewErrorEvent("Pod creation failed", err, d.apiObject)) } // Create access packages - { - ps := profiler.Start() - if err := d.createAccessPackages(); err != nil { - hasError = true - d.CreateEvent(k8sutil.NewErrorEvent("AccessPackage creation failed", err, d.apiObject)) - } - ps.Done(log, "createAccessPackages") + if err := d.createAccessPackages(); err != nil { + hasError = true + d.CreateEvent(k8sutil.NewErrorEvent("AccessPackage creation failed", err, d.apiObject)) } // Inspect deployment for obsolete members diff --git a/pkg/deployment/metrics.go b/pkg/deployment/metrics.go new file mode 100644 index 000000000..fa8eedc87 --- /dev/null +++ b/pkg/deployment/metrics.go @@ -0,0 +1,28 @@ +// +// DISCLAIMER +// +// Copyright 2018 ArangoDB GmbH, Cologne, Germany +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright holder is ArangoDB GmbH, Cologne, Germany +// +// Author Ewout Prangsma +// + +package deployment + +const ( + // Component name for metrics of this package + metricsComponent = "deployment" +) diff --git a/pkg/deployment/resources/deployment_health.go b/pkg/deployment/resources/deployment_health.go index 577db83b1..338e988b8 100644 --- a/pkg/deployment/resources/deployment_health.go +++ b/pkg/deployment/resources/deployment_health.go @@ -31,7 +31,7 @@ import ( ) var ( - fetchDeploymentHealthCounters = metrics.MustRegisterCounterVec("deployment_resources", "fetchDeploymentHealth", "Number of times the health of the deployment was fetched", "deployment", "result") + deploymentHealthFetchesCounters = metrics.MustRegisterCounterVec(metricsComponent, "deployment_health_fetches", "Number of times the health of the deployment was fetched", metrics.DeploymentName, metrics.Result) ) // RunDeploymentHealthLoop creates a loop to fetch the health of the deployment. @@ -48,9 +48,9 @@ func (r *Resources) RunDeploymentHealthLoop(stopCh <-chan struct{}) { for { if err := r.fetchDeploymentHealth(); err != nil { log.Debug().Err(err).Msg("Failed to fetch deployment health") - fetchDeploymentHealthCounters.WithLabelValues(deploymentName, "failed").Inc() + deploymentHealthFetchesCounters.WithLabelValues(deploymentName, metrics.Failed).Inc() } else { - fetchDeploymentHealthCounters.WithLabelValues(deploymentName, "success").Inc() + deploymentHealthFetchesCounters.WithLabelValues(deploymentName, metrics.Success).Inc() } select { case <-time.After(time.Second * 5): diff --git a/pkg/deployment/resources/member_cleanup.go b/pkg/deployment/resources/member_cleanup.go index 4cbb859d3..7a08242de 100644 --- a/pkg/deployment/resources/member_cleanup.go +++ b/pkg/deployment/resources/member_cleanup.go @@ -39,7 +39,7 @@ const ( ) var ( - cleanupRemovedMembersCounters = metrics.MustRegisterCounterVec("deployment_resources", "cleanupRemovedMembers", "Number of cleanup-removed-members actions", "deployment", "result") + cleanupRemovedMembersCounters = metrics.MustRegisterCounterVec(metricsComponent, "cleanup_removed_members", "Number of cleanup-removed-members actions", metrics.DeploymentName, metrics.Result) ) // CleanupRemovedMembers removes all arangod members that are no longer part of ArangoDB deployment. 
@@ -47,11 +47,12 @@ func (r *Resources) CleanupRemovedMembers() error { // Decide what to do depending on cluster mode switch r.context.GetSpec().GetMode() { case api.DeploymentModeCluster: + deploymentName := r.context.GetAPIObject().GetName() if err := r.cleanupRemovedClusterMembers(); err != nil { - cleanupRemovedMembersCounters.WithLabelValues(r.context.GetAPIObject().GetName(), "failed").Inc() + cleanupRemovedMembersCounters.WithLabelValues(deploymentName, metrics.Failed).Inc() return maskAny(err) } - cleanupRemovedMembersCounters.WithLabelValues(r.context.GetAPIObject().GetName(), "success").Inc() + cleanupRemovedMembersCounters.WithLabelValues(deploymentName, metrics.Success).Inc() return nil default: // Other mode have no concept of cluster in which members can be removed diff --git a/pkg/deployment/resources/metrics.go b/pkg/deployment/resources/metrics.go new file mode 100644 index 000000000..ec92c8db4 --- /dev/null +++ b/pkg/deployment/resources/metrics.go @@ -0,0 +1,28 @@ +// +// DISCLAIMER +// +// Copyright 2018 ArangoDB GmbH, Cologne, Germany +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright holder is ArangoDB GmbH, Cologne, Germany +// +// Author Ewout Prangsma +// + +package resources + +const ( + // Component name for metrics of this package + metricsComponent = "deployment_resources" +) diff --git a/pkg/deployment/resources/pod_inspector.go b/pkg/deployment/resources/pod_inspector.go index 092abe9dc..8abcf4d07 100644 --- a/pkg/deployment/resources/pod_inspector.go +++ b/pkg/deployment/resources/pod_inspector.go @@ -36,7 +36,8 @@ import ( ) var ( - inspectedPodCounter = metrics.MustRegisterCounter("deployment", "inspected_pods", "Number of pod inspections") + inspectedPodsCounters = metrics.MustRegisterCounterVec(metricsComponent, "inspected_pods", "Number of pod inspections per deployment", metrics.DeploymentName) + inspectPodsDurationGauges = metrics.MustRegisterGaugeVec(metricsComponent, "inspect_pods_duration", "Amount of time taken by a single inspection of all pods for a deployment (in sec)", metrics.DeploymentName) ) const ( @@ -50,8 +51,12 @@ const ( // Returns: Interval_till_next_inspection, error func (r *Resources) InspectPods(ctx context.Context) (util.Interval, error) { log := r.log + start := time.Now() + apiObject := r.context.GetAPIObject() + deploymentName := apiObject.GetName() var events []*k8sutil.Event nextInterval := maxPodInspectorInterval // Large by default, will be made smaller if needed in the rest of the function + defer metrics.SetDuration(inspectPodsDurationGauges.WithLabelValues(deploymentName), start) pods, err := r.context.GetOwnedPods() if err != nil { @@ -61,7 +66,6 @@ func (r *Resources) InspectPods(ctx context.Context) (util.Interval, error) { // Update member status from all pods found status, lastVersion := r.context.GetStatus() - apiObject := r.context.GetAPIObject() var podNamesWithScheduleTimeout []string var unscheduledPodNames []string for _, p := range pods { @@ -71,7 +75,7 @@ func (r *Resources) 
InspectPods(ctx context.Context) (util.Interval, error) { } // Pod belongs to this deployment, update metric - inspectedPodCounter.Inc() + inspectedPodsCounters.WithLabelValues(deploymentName).Inc() // Find member status memberStatus, group, found := status.Members.MemberStatusByPodName(p.GetName()) diff --git a/pkg/deployment/resources/pvc_inspector.go b/pkg/deployment/resources/pvc_inspector.go index 3a9947490..385e17241 100644 --- a/pkg/deployment/resources/pvc_inspector.go +++ b/pkg/deployment/resources/pvc_inspector.go @@ -32,7 +32,11 @@ import ( ) var ( - inspectedPVCCounter = metrics.MustRegisterCounter("deployment", "inspected_ppvcs", "Number of PVCs inspections") + inspectedPVCsCounters = metrics.MustRegisterCounterVec(metricsComponent, "inspected_pvcs", "Number of PVC inspections per deployment", metrics.DeploymentName) + inspectPVCsDurationGauges = metrics.MustRegisterGaugeVec(metricsComponent, "inspect_pvcs_duration", "Amount of time taken by a single inspection of all PVCs for a deployment (in sec)", metrics.DeploymentName) +) + +const ( maxPVCInspectorInterval = util.Interval(time.Hour) // Maximum time between PVC inspection (if nothing else happens) ) @@ -40,7 +44,10 @@ var ( // the member status of the deployment accordingly. func (r *Resources) InspectPVCs(ctx context.Context) (util.Interval, error) { log := r.log + start := time.Now() nextInterval := maxPVCInspectorInterval + deploymentName := r.context.GetAPIObject().GetName() + defer metrics.SetDuration(inspectPVCsDurationGauges.WithLabelValues(deploymentName), start) pvcs, err := r.context.GetOwnedPVCs() if err != nil { @@ -52,7 +59,7 @@ func (r *Resources) InspectPVCs(ctx context.Context) (util.Interval, error) { status, _ := r.context.GetStatus() for _, p := range pvcs { // PVC belongs to this deployment, update metric - inspectedPVCCounter.Inc() + inspectedPVCsCounters.WithLabelValues(deploymentName).Inc() // Find member status memberStatus, group, found := status.Members.MemberStatusByPVCName(p.GetName()) diff --git a/pkg/deployment/resources/secrets.go b/pkg/deployment/resources/secrets.go index 5d35ddb8a..001669130 100644 --- a/pkg/deployment/resources/secrets.go +++ b/pkg/deployment/resources/secrets.go @@ -25,39 +25,57 @@ package resources import ( "crypto/rand" "encoding/hex" + "time" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha" + "github.com/arangodb/kube-arangodb/pkg/metrics" "github.com/arangodb/kube-arangodb/pkg/util/k8sutil" ) +var ( + inspectedSecretsCounters = metrics.MustRegisterCounterVec(metricsComponent, "inspected_secrets", "Number of Secret inspections per deployment", metrics.DeploymentName) + inspectSecretsDurationGauges = metrics.MustRegisterGaugeVec(metricsComponent, "inspect_secrets_duration", "Amount of time taken by a single inspection of all Secrets for a deployment (in sec)", metrics.DeploymentName) +) + // EnsureSecrets creates all secrets needed to run the given deployment func (r *Resources) EnsureSecrets() error { + start := time.Now() kubecli := r.context.GetKubeCli() ns := r.context.GetNamespace() secrets := k8sutil.NewSecretCache(kubecli.CoreV1().Secrets(ns)) spec := r.context.GetSpec() + deploymentName := r.context.GetAPIObject().GetName() + defer metrics.SetDuration(inspectSecretsDurationGauges.WithLabelValues(deploymentName), start) + counterMetric := inspectedSecretsCounters.WithLabelValues(deploymentName) + if spec.IsAuthenticated() { + counterMetric.Inc() if err := r.ensureTokenSecret(secrets, 
spec.Authentication.GetJWTSecretName()); err != nil { return maskAny(err) } } if spec.IsSecure() { + counterMetric.Inc() if err := r.ensureTLSCACertificateSecret(secrets, spec.TLS); err != nil { return maskAny(err) } } if spec.Sync.IsEnabled() { + counterMetric.Inc() if err := r.ensureTokenSecret(secrets, spec.Sync.Authentication.GetJWTSecretName()); err != nil { return maskAny(err) } + counterMetric.Inc() if err := r.ensureTokenSecret(secrets, spec.Sync.Monitoring.GetTokenSecretName()); err != nil { return maskAny(err) } + counterMetric.Inc() if err := r.ensureTLSCACertificateSecret(secrets, spec.Sync.TLS); err != nil { return maskAny(err) } + counterMetric.Inc() if err := r.ensureClientAuthCACertificateSecret(secrets, spec.Sync.Authentication); err != nil { return maskAny(err) } diff --git a/pkg/deployment/resources/services.go b/pkg/deployment/resources/services.go index f9de217f8..26842a05a 100644 --- a/pkg/deployment/resources/services.go +++ b/pkg/deployment/resources/services.go @@ -25,29 +25,38 @@ package resources import ( "time" - "k8s.io/client-go/kubernetes" - "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha" + "github.com/arangodb/kube-arangodb/pkg/metrics" "github.com/arangodb/kube-arangodb/pkg/util/k8sutil" + "github.com/prometheus/client_golang/prometheus" "github.com/rs/zerolog" ) +var ( + inspectedServicesCounters = metrics.MustRegisterCounterVec(metricsComponent, "inspected_services", "Number of Service inspections per deployment", metrics.DeploymentName) + inspectServicesDurationGauges = metrics.MustRegisterGaugeVec(metricsComponent, "inspect_services_duration", "Amount of time taken by a single inspection of all Services for a deployment (in sec)", metrics.DeploymentName) +) + // EnsureServices creates all services needed to service the deployment func (r *Resources) EnsureServices() error { log := r.log + start := time.Now() kubecli := r.context.GetKubeCli() apiObject := r.context.GetAPIObject() deploymentName := apiObject.GetName() ns := apiObject.GetNamespace() owner := apiObject.AsOwner() spec := r.context.GetSpec() + defer metrics.SetDuration(inspectServicesDurationGauges.WithLabelValues(deploymentName), start) + counterMetric := inspectedServicesCounters.WithLabelValues(deploymentName) // Fetch existing services svcs := k8sutil.NewServiceCache(kubecli.CoreV1().Services(ns)) // Headless service + counterMetric.Inc() if _, err := svcs.Get(k8sutil.CreateHeadlessServiceName(deploymentName), metav1.GetOptions{}); err != nil { svcName, newlyCreated, err := k8sutil.CreateHeadlessService(svcs, apiObject, owner) if err != nil { @@ -61,6 +70,7 @@ func (r *Resources) EnsureServices() error { // Internal database client service single := spec.GetMode().HasSingleServers() + counterMetric.Inc() if _, err := svcs.Get(k8sutil.CreateDatabaseClientServiceName(deploymentName), metav1.GetOptions{}); err != nil { svcName, newlyCreated, err := k8sutil.CreateDatabaseClientService(svcs, apiObject, single, owner) if err != nil { @@ -87,15 +97,16 @@ func (r *Resources) EnsureServices() error { if single { role = "single" } - if err := r.ensureExternalAccessServices(svcs, eaServiceName, ns, role, "database", k8sutil.ArangoPort, false, spec.ExternalAccess, apiObject, log, kubecli); err != nil { + if err := r.ensureExternalAccessServices(svcs, eaServiceName, ns, role, "database", k8sutil.ArangoPort, false, spec.ExternalAccess, apiObject, log, counterMetric); err != nil { return maskAny(err) } if 
spec.Sync.IsEnabled() { // External (and internal) Sync master service + counterMetric.Inc() eaServiceName := k8sutil.CreateSyncMasterClientServiceName(deploymentName) role := "syncmaster" - if err := r.ensureExternalAccessServices(svcs, eaServiceName, ns, role, "sync", k8sutil.ArangoSyncMasterPort, true, spec.Sync.ExternalAccess.ExternalAccessSpec, apiObject, log, kubecli); err != nil { + if err := r.ensureExternalAccessServices(svcs, eaServiceName, ns, role, "sync", k8sutil.ArangoSyncMasterPort, true, spec.Sync.ExternalAccess.ExternalAccessSpec, apiObject, log, counterMetric); err != nil { return maskAny(err) } status, lastVersion := r.context.GetStatus() @@ -110,7 +121,7 @@ func (r *Resources) EnsureServices() error { } // EnsureServices creates all services needed to service the deployment -func (r *Resources) ensureExternalAccessServices(svcs k8sutil.ServiceInterface, eaServiceName, ns, svcRole, title string, port int, noneIsClusterIP bool, spec api.ExternalAccessSpec, apiObject k8sutil.APIObject, log zerolog.Logger, kubecli kubernetes.Interface) error { +func (r *Resources) ensureExternalAccessServices(svcs k8sutil.ServiceInterface, eaServiceName, ns, svcRole, title string, port int, noneIsClusterIP bool, spec api.ExternalAccessSpec, apiObject k8sutil.APIObject, log zerolog.Logger, counterMetric prometheus.Counter) error { // Database external access service createExternalAccessService := false deleteExternalAccessService := false diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index 376798f31..3b16df306 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -22,10 +22,23 @@ package metrics -import "github.com/prometheus/client_golang/prometheus" +import ( + "time" + + "github.com/prometheus/client_golang/prometheus" +) const ( namespace = "arangodb_operator" + + // DeploymentName is a label key used for the name of a deployment + DeploymentName = "deployment" + // Result is a label key used for the result of an action (Success|Failed) + Result = "result" + // Success is a label value used for successful actions + Success = "success" + // Failed is a label value used for failed actions + Failed = "failed" ) // MustRegisterCounter creates and registers a counter. @@ -96,3 +109,9 @@ func MustRegisterSummary(component, name, help string, objectives map[float64]fl prometheus.MustRegister(m) return m } + +// SetDuration sets a gauge value for the duration since the given start time +// in seconds. 
+func SetDuration(g prometheus.Gauge, startTime time.Time) { + g.Set(time.Since(startTime).Seconds()) +} From 6ccd76a97ead8ca3f0d1c7f880ea0e7d2e6ce41a Mon Sep 17 00:00:00 2001 From: Ewout Prangsma Date: Fri, 31 Aug 2018 16:31:06 +0200 Subject: [PATCH 14/17] Dashboard updates --- examples/metrics/dashboard.json | 1570 ++++++++++++++++++++----------- 1 file changed, 1034 insertions(+), 536 deletions(-) diff --git a/examples/metrics/dashboard.json b/examples/metrics/dashboard.json index c0e8c9c48..4a8fd9170 100644 --- a/examples/metrics/dashboard.json +++ b/examples/metrics/dashboard.json @@ -1,558 +1,1056 @@ { - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": "-- Grafana --", - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "type": "dashboard" + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": 10, + "links": [], + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "$$hashKey": "object:1274", + "expr": "sum(arangodb_operator_controller_deployments)", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "count", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "#Deployments", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:2219", + "decimals": 0, + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:2220", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true } ] }, - "editable": true, - "gnetId": null, - "graphTooltip": 0, - "id": 10, - "links": [], - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "fill": 1, - "gridPos": { - "h": 9, - "w": 12, - "x": 0, - "y": 0 - }, - "id": 2, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "$$hashKey": "object:1274", - "expr": "sum(arangodb_operator_controller_deployments)", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "count", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": 
"#Deployments", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:2219", - "decimals": 0, - "format": "short", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "$$hashKey": "object:2220", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 0 }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "fill": 1, - "gridPos": { - "h": 9, - "w": 12, - "x": 12, - "y": 0 - }, - "id": 8, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "$$hashKey": "object:1274", - "expr": "arangodb_operator_deployment_inspect_deployment_duration", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{deployment}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Deployment Inspection Duration", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "$$hashKey": "object:2154", - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "$$hashKey": "object:2155", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "fill": 1, - "gridPos": { - "h": 7, - "w": 8, - "x": 0, - "y": 9 - }, - "id": 4, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "$$hashKey": "object:1441", - "expr": "sum(increase(arangodb_operator_deployment_resources_deployment_health_fetches[5m])) by (result)", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "{{result}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Fetch deployment health [5m]", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - 
{ - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "$$hashKey": "object:1274", + "expr": "arangodb_operator_deployment_inspect_deployment_duration", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{deployment}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Deployment Inspection Duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "fill": 1, - "gridPos": { - "h": 7, - "w": 8, - "x": 8, - "y": 9 - }, - "id": 5, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "$$hashKey": "object:1441", - "expr": "sum(increase(arangodb_operator_deployment_resources_cleanupRemovedMembers[5m])) by (result)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{result}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Cleanup remove members [5m]", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "fill": 1, - "gridPos": { - "h": 7, - "w": 8, - "x": 0, - "y": 16 + "yaxes": [ + { + "$$hashKey": "object:2154", + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true }, - "id": 6, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false + { + "$$hashKey": "object:2155", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 7, + "w": 6, + "x": 0, + "y": 9 + }, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "$$hashKey": "object:1274", + "expr": 
"sum(increase(arangodb_operator_deployment_resources_inspected_pods[5m])) by (deployment)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{deployment}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Pod inspections [5m]", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "$$hashKey": "object:1274", - "expr": "sum(increase(arangodb_operator_deployment_resources_inspected_pods[5m]))", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Pod inspections [5m]", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 7, + "w": 6, + "x": 6, + "y": 9 + }, + "id": 9, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "$$hashKey": "object:1274", + "expr": "sum(increase(arangodb_operator_deployment_resources_inspected_pvcs[5m])) by (deployment)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{deployment}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "PVC inspections [5m]", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 7, + "w": 6, + "x": 12, + "y": 9 + }, + "id": 11, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "$$hashKey": "object:1274", + "expr": 
"sum(increase(arangodb_operator_deployment_resources_inspected_services[5m])) by (deployment)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{deployment}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Service inspections [5m]", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 7, + "w": 6, + "x": 18, + "y": 9 }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "prometheus", - "fill": 1, - "gridPos": { - "h": 7, - "w": 8, - "x": 8, - "y": 16 + "id": 13, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "$$hashKey": "object:1274", + "expr": "sum(increase(arangodb_operator_deployment_resources_inspected_secrets[5m])) by (deployment)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{deployment}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Secret inspections [5m]", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true }, - "id": 7, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 7, + "w": 6, + "x": 0, + "y": 16 + }, + "id": 7, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "$$hashKey": "object:1274", + "expr": "arangodb_operator_deployment_resources_inspect_pods_duration", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{pod}}, {{deployment}}", + "refId": "A" + } + ], + "thresholds": [], + 
"timeFrom": null, + "timeShift": null, + "title": "Pod inspection Duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1930", + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "$$hashKey": "object:1274", - "expr": "arangodb_operator_deployment_resources_inspect_pods_duration", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "{{pod}}, {{deployment}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Pod inspection Duration", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" + { + "$$hashKey": "object:1931", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 7, + "w": 6, + "x": 6, + "y": 16 + }, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "$$hashKey": "object:1274", + "expr": "arangodb_operator_deployment_resources_inspect_pvcs_duration", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{pod}}, {{deployment}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "PVC inspection Duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1930", + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] + { + "$$hashKey": "object:1931", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 7, + "w": 6, + "x": 12, + "y": 16 + }, + "id": 12, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "$$hashKey": "object:1274", + "expr": "arangodb_operator_deployment_resources_inspect_services_duration", + "format": "time_series", + 
"interval": "", + "intervalFactor": 1, + "legendFormat": "{{pod}}, {{deployment}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Service inspection Duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1930", + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true }, - "yaxes": [ - { - "$$hashKey": "object:1930", - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "$$hashKey": "object:1931", - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "refresh": false, - "schemaVersion": 16, - "style": "dark", - "tags": [], - "templating": { - "list": [] + { + "$$hashKey": "object:1931", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] }, - "time": { - "from": "now-15m", - "to": "now" + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 7, + "w": 6, + "x": 18, + "y": 16 + }, + "id": 14, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "$$hashKey": "object:1274", + "expr": "arangodb_operator_deployment_resources_inspect_secrets_duration", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{pod}}, {{deployment}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Secret inspection Duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1930", + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:1931", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 23 + }, + "id": 4, + "legend": { + "avg": false, + "current": false, + "hideZero": true, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "$$hashKey": "object:1441", + "expr": "sum(increase(arangodb_operator_deployment_resources_deployment_health_fetches[5m])) by (deployment, result)", + "format": "time_series", + "interval": "", + 
"intervalFactor": 1, + "legendFormat": "{{deployment}}, {{result}}", + "refId": "A" + } ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Deployment health fetches [5m]", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:2500", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:2501", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } ] }, - "timezone": "", - "title": "Kube-ArangoDB", - "uid": "pJUJQd2ik", - "version": 10 - } \ No newline at end of file + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 23 + }, + "id": 5, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "$$hashKey": "object:1441", + "expr": "sum(increase(arangodb_operator_deployment_resources_cleanup_removed_members[5m])) by (deployment, result)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{deployment}}, {{result}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Cleanup remove members inspections [5m]", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "refresh": false, + "schemaVersion": 16, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Kube-ArangoDB", + "uid": "pJUJQd2ik", + "version": 15 +} \ No newline at end of file From c22139a5932b217295e326aa4e46f757aa115ae1 Mon Sep 17 00:00:00 2001 From: Ewout Prangsma Date: Fri, 7 Sep 2018 13:18:39 +0200 Subject: [PATCH 15/17] Removed debug-only profiler code --- pkg/deployment/deployment_inspector.go | 31 +++++++++++--------------- 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/pkg/deployment/deployment_inspector.go b/pkg/deployment/deployment_inspector.go index 3d323cb85..ca5ed56eb 100644 --- a/pkg/deployment/deployment_inspector.go +++ b/pkg/deployment/deployment_inspector.go @@ -30,7 +30,6 @@ import ( "github.com/arangodb/kube-arangodb/pkg/metrics" "github.com/arangodb/kube-arangodb/pkg/util" "github.com/arangodb/kube-arangodb/pkg/util/k8sutil" - 
"github.com/arangodb/kube-arangodb/pkg/util/profiler" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) @@ -117,23 +116,19 @@ func (d *Deployment) inspectDeployment(lastInterval util.Interval) util.Interval } // Create scale/update plan - { - ps := profiler.Start() - if err := d.reconciler.CreatePlan(); err != nil { - hasError = true - d.CreateEvent(k8sutil.NewErrorEvent("Plan creation failed", err, d.apiObject)) - } - - // Execute current step of scale/update plan - retrySoon, err := d.reconciler.ExecutePlan(ctx) - if err != nil { - hasError = true - d.CreateEvent(k8sutil.NewErrorEvent("Plan execution failed", err, d.apiObject)) - } - if retrySoon { - nextInterval = minInspectionInterval - } - ps.Done(log, "plan") + if err := d.reconciler.CreatePlan(); err != nil { + hasError = true + d.CreateEvent(k8sutil.NewErrorEvent("Plan creation failed", err, d.apiObject)) + } + + // Execute current step of scale/update plan + retrySoon, err := d.reconciler.ExecutePlan(ctx) + if err != nil { + hasError = true + d.CreateEvent(k8sutil.NewErrorEvent("Plan execution failed", err, d.apiObject)) + } + if retrySoon { + nextInterval = minInspectionInterval } // Ensure all resources are created From db19a9fc008789f399a607b2a40eb22ac37c0060 Mon Sep 17 00:00:00 2001 From: Ewout Prangsma Date: Fri, 7 Sep 2018 14:13:49 +0200 Subject: [PATCH 16/17] Comment fix --- pkg/deployment/resources/deployment_health.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pkg/deployment/resources/deployment_health.go b/pkg/deployment/resources/deployment_health.go index 338e988b8..71b1fb1ba 100644 --- a/pkg/deployment/resources/deployment_health.go +++ b/pkg/deployment/resources/deployment_health.go @@ -62,7 +62,8 @@ func (r *Resources) RunDeploymentHealthLoop(stopCh <-chan struct{}) { } } -// cleanupRemovedClusterMembers removes all arangod members that are no longer part of the cluster. +// fetchDeploymentHealth performs a single fetch of cluster-health +// and stores it in-memory. func (r *Resources) fetchDeploymentHealth() error { // Ask cluster for its health ctx, cancel := context.WithTimeout(context.Background(), time.Second*15) From cb8451413725e2c2d34d797776452e7e4d5ddbc7 Mon Sep 17 00:00:00 2001 From: Ewout Prangsma Date: Fri, 7 Sep 2018 14:38:37 +0200 Subject: [PATCH 17/17] Removed unused code --- pkg/deployment/resources/pvc_inspector.go | 6 ------ 1 file changed, 6 deletions(-) diff --git a/pkg/deployment/resources/pvc_inspector.go b/pkg/deployment/resources/pvc_inspector.go index 385e17241..2c24edfbb 100644 --- a/pkg/deployment/resources/pvc_inspector.go +++ b/pkg/deployment/resources/pvc_inspector.go @@ -79,7 +79,6 @@ func (r *Resources) InspectPVCs(ctx context.Context) (util.Interval, error) { continue } - updateMemberStatusNeeded := false if k8sutil.IsPersistentVolumeClaimMarkedForDeletion(&p) { // Process finalizers if x, err := r.runPVCFinalizers(ctx, &p, group, memberStatus); err != nil { @@ -89,11 +88,6 @@ func (r *Resources) InspectPVCs(ctx context.Context) (util.Interval, error) { nextInterval = nextInterval.ReduceTo(x) } } - if updateMemberStatusNeeded { - if err := status.Members.Update(memberStatus, group); err != nil { - return 0, maskAny(err) - } - } } return nextInterval, nil