diff --git a/hack/olm-registry/olm-artifacts-template.yaml b/hack/olm-registry/olm-artifacts-template.yaml index 4b6d53c3..b0ac2da9 100644 --- a/hack/olm-registry/olm-artifacts-template.yaml +++ b/hack/olm-registry/olm-artifacts-template.yaml @@ -28,7 +28,7 @@ objects: sourceType: grpc image: ${REGISTRY_IMG}:${CHANNEL}-${IMAGE_TAG} displayName: pagerduty-operator Registry - publisher: SRE + publisher: SRE - apiVersion: operators.coreos.com/v1alpha2 kind: OperatorGroup @@ -67,3 +67,20 @@ objects: targetSecretRef: name: pd-secret namespace: openshift-monitoring + +- apiVersion: monitoring.coreos.com/v1 + kind: PrometheusRule + metadata: + name: pagerduty-integration-api-secret + namespace: pagerduty-operator + spec: + groups: + - name: pagerduty-integration-api-secret + rules: + - alert: PagerDutyIntegrationAPISecretError + annotations: + message: PagerDuty Operator is failing to load PAGERDUTY_API_KEY from Secret specified in PagerDutyIntegration {{ $labels.pagerdutyintegration_name }}. Either the Secret might be missing, or the key might be missing from within the Secret. + expr: pagerdutyintegration_secret_loaded < 1 + for: 15m + labels: + severity: warning diff --git a/pkg/controller/pagerdutyintegration/clusterdeployment_created.go b/pkg/controller/pagerdutyintegration/clusterdeployment_created.go index 82b8a099..94021bb1 100644 --- a/pkg/controller/pagerdutyintegration/clusterdeployment_created.go +++ b/pkg/controller/pagerdutyintegration/clusterdeployment_created.go @@ -32,7 +32,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" ) -func (r *ReconcilePagerDutyIntegration) handleCreate(pdi *pagerdutyv1alpha1.PagerDutyIntegration, cd *hivev1.ClusterDeployment) error { +func (r *ReconcilePagerDutyIntegration) handleCreate(pdclient pd.Client, pdi *pagerdutyv1alpha1.PagerDutyIntegration, cd *hivev1.ClusterDeployment) error { var ( // secretName is the name of the Secret deployed to the target // cluster, and also the name of the SyncSet that causes it to @@ -96,15 +96,15 @@ func (r *ReconcilePagerDutyIntegration) handleCreate(pdi *pagerdutyv1alpha1.Page err = pdData.ParseClusterConfig(r.client, cd.Namespace, configMapName) if err != nil { var createErr error - _, createErr = r.pdclient.CreateService(pdData) + _, createErr = pdclient.CreateService(pdData) if createErr != nil { - localmetrics.UpdateMetricPagerDutyCreateFailure(1, ClusterID) + localmetrics.UpdateMetricPagerDutyCreateFailure(1, ClusterID, pdi.Name) return createErr } } - localmetrics.UpdateMetricPagerDutyCreateFailure(0, ClusterID) + localmetrics.UpdateMetricPagerDutyCreateFailure(0, ClusterID, pdi.Name) - pdIntegrationKey, err = r.pdclient.GetIntegrationKey(pdData) + pdIntegrationKey, err = pdclient.GetIntegrationKey(pdData) if err != nil { return err } diff --git a/pkg/controller/pagerdutyintegration/clusterdeployment_deleted.go b/pkg/controller/pagerdutyintegration/clusterdeployment_deleted.go index f846748c..0db109ab 100644 --- a/pkg/controller/pagerdutyintegration/clusterdeployment_deleted.go +++ b/pkg/controller/pagerdutyintegration/clusterdeployment_deleted.go @@ -28,7 +28,7 @@ import ( "k8s.io/apimachinery/pkg/types" ) -func (r *ReconcilePagerDutyIntegration) handleDelete(pdi *pagerdutyv1alpha1.PagerDutyIntegration, cd *hivev1.ClusterDeployment) error { +func (r *ReconcilePagerDutyIntegration) handleDelete(pdclient pd.Client, pdi *pagerdutyv1alpha1.PagerDutyIntegration, cd *hivev1.ClusterDeployment) error { var ( // secretName is the name of the Secret deployed to the target // cluster, and also the name of the SyncSet that causes it to @@ -119,7 +119,7 @@ func (r *ReconcilePagerDutyIntegration) handleDelete(pdi *pagerdutyv1alpha1.Page if deletePDService { // we have everything necessary to attempt deletion of the PD service - err = r.pdclient.DeleteService(pdData) + err = pdclient.DeleteService(pdData) if err != nil { r.reqLogger.Error(err, "Failed cleaning up pagerduty.") } else { @@ -155,11 +155,22 @@ func (r *ReconcilePagerDutyIntegration) handleDelete(pdi *pagerdutyv1alpha1.Page utils.DeleteFinalizer(cd, finalizer) err = r.client.Update(context.TODO(), cd) if err != nil { - metrics.UpdateMetricPagerDutyDeleteFailure(1, ClusterID) + metrics.UpdateMetricPagerDutyDeleteFailure(1, ClusterID, pdi.Name) return err } } - metrics.UpdateMetricPagerDutyDeleteFailure(0, ClusterID) + + if utils.HasFinalizer(cd, config.OperatorFinalizer) { + r.reqLogger.Info("Deleting old PD finalizer from ClusterDeployment", "Namespace", cd.Namespace, "Name", cd.Name) + utils.DeleteFinalizer(cd, config.OperatorFinalizer) + err = r.client.Update(context.TODO(), cd) + if err != nil { + metrics.UpdateMetricPagerDutyDeleteFailure(1, ClusterID, pdi.Name) + return err + } + } + + metrics.UpdateMetricPagerDutyDeleteFailure(0, ClusterID, pdi.Name) return nil } diff --git a/pkg/controller/pagerdutyintegration/pagerdutyintegration_controller.go b/pkg/controller/pagerdutyintegration/pagerdutyintegration_controller.go index 642a1b53..f6dd548d 100644 --- a/pkg/controller/pagerdutyintegration/pagerdutyintegration_controller.go +++ b/pkg/controller/pagerdutyintegration/pagerdutyintegration_controller.go @@ -52,29 +52,16 @@ var log = logf.Log.WithName("controller_pagerdutyintegration") // Add creates a new PagerDutyIntegration Controller and adds it to the Manager. The Manager will set fields on the Controller // and Start it when the Manager is Started. func Add(mgr manager.Manager) error { - newRec, err := newReconciler(mgr) - if err != nil { - return err - } - - return add(mgr, newRec) + return add(mgr, newReconciler(mgr)) } // newReconciler returns a new reconcile.Reconciler -func newReconciler(mgr manager.Manager) (reconcile.Reconciler, error) { - tempClient, err := client.New(mgr.GetConfig(), client.Options{Scheme: mgr.GetScheme()}) - if err != nil { - return nil, err - } - - // get PD API key from secret - pdAPIKey, err := utils.LoadSecretData(tempClient, config.PagerDutyAPISecretName, config.OperatorNamespace, config.PagerDutyAPISecretKey) - +func newReconciler(mgr manager.Manager) reconcile.Reconciler { return &ReconcilePagerDutyIntegration{ client: utils.NewClientWithMetricsOrDie(log, mgr, controllerName), scheme: mgr.GetScheme(), - pdclient: pd.NewClient(pdAPIKey), - }, nil + pdclient: pd.NewClient, + } } // add adds a new Controller to mgr with r as the reconcile.Reconciler @@ -159,7 +146,7 @@ type ReconcilePagerDutyIntegration struct { client client.Client scheme *runtime.Scheme reqLogger logr.Logger - pdclient pd.Client + pdclient func(APIKey string) pd.Client } // Reconcile reads that state of the cluster for a PagerDutyIntegration object and makes changes based on the state read @@ -198,15 +185,31 @@ func (r *ReconcilePagerDutyIntegration) Reconcile(request reconcile.Request) (re return r.requeueOnErr(err) } + pdApiKey, err := utils.LoadSecretData( + r.client, + pdi.Spec.PagerdutyApiKeySecretRef.Name, + pdi.Spec.PagerdutyApiKeySecretRef.Namespace, + config.PagerDutyAPISecretKey, + ) + if err != nil { + r.reqLogger.Error(err, "Failed to load PagerDuty API key from Secret listed in PagerDutyIntegration CR") + localmetrics.UpdateMetricPagerDutyIntegrationSecretLoaded(0, pdi.Name) + return r.requeueAfter(10 * time.Minute) + } + localmetrics.UpdateMetricPagerDutyIntegrationSecretLoaded(1, pdi.Name) + pdClient := r.pdclient(pdApiKey) + if pdi.DeletionTimestamp != nil { if utils.HasFinalizer(pdi, config.OperatorFinalizer) { for _, cd := range matchingClusterDeployments.Items { - err := r.handleDelete(pdi, &cd) + err := r.handleDelete(pdClient, pdi, &cd) if err != nil { return r.requeueOnErr(err) } } + localmetrics.DeleteMetricPagerDutyIntegrationSecretLoaded(pdi.Name) + utils.DeleteFinalizer(pdi, config.OperatorFinalizer) err = r.client.Update(context.TODO(), pdi) if err != nil { @@ -226,12 +229,12 @@ func (r *ReconcilePagerDutyIntegration) Reconcile(request reconcile.Request) (re for _, cd := range matchingClusterDeployments.Items { if cd.DeletionTimestamp != nil || cd.Labels[config.ClusterDeploymentNoalertsLabel] == "true" { - err := r.handleDelete(pdi, &cd) + err := r.handleDelete(pdClient, pdi, &cd) if err != nil { return r.requeueOnErr(err) } } else { - err := r.handleCreate(pdi, &cd) + err := r.handleCreate(pdClient, pdi, &cd) if err != nil { return r.requeueOnErr(err) } @@ -259,3 +262,7 @@ func (r *ReconcilePagerDutyIntegration) doNotRequeue() (reconcile.Result, error) func (r *ReconcilePagerDutyIntegration) requeueOnErr(err error) (reconcile.Result, error) { return reconcile.Result{}, err } + +func (r *ReconcilePagerDutyIntegration) requeueAfter(t time.Duration) (reconcile.Result, error) { + return reconcile.Result{RequeueAfter: t}, nil +} diff --git a/pkg/controller/pagerdutyintegration/pagerdutyintegration_controller_test.go b/pkg/controller/pagerdutyintegration/pagerdutyintegration_controller_test.go index ed5b7868..25619f3c 100644 --- a/pkg/controller/pagerdutyintegration/pagerdutyintegration_controller_test.go +++ b/pkg/controller/pagerdutyintegration/pagerdutyintegration_controller_test.go @@ -26,6 +26,7 @@ import ( pagerdutyapis "github.com/openshift/pagerduty-operator/pkg/apis" pagerdutyv1alpha1 "github.com/openshift/pagerduty-operator/pkg/apis/pagerduty/v1alpha1" "github.com/openshift/pagerduty-operator/pkg/kube" + pd "github.com/openshift/pagerduty-operator/pkg/pagerduty" mockpd "github.com/openshift/pagerduty-operator/pkg/pagerduty/mock" "github.com/stretchr/testify/assert" corev1 "k8s.io/api/core/v1" @@ -359,6 +360,7 @@ func TestReconcilePagerDutyIntegration(t *testing.T) { localObjects: []runtime.Object{ uninstalledClusterDeployment(), testPagerDutyIntegration(), + testPDConfigSecret(), }, expectedSyncSets: &SyncSetEntry{}, expectedSecrets: &SecretEntry{}, @@ -426,6 +428,7 @@ func TestReconcilePagerDutyIntegration(t *testing.T) { localObjects: []runtime.Object{ unlabelledClusterDeployment(), testPagerDutyIntegration(), + testPDConfigSecret(), }, expectedSyncSets: &SyncSetEntry{}, expectedSecrets: &SecretEntry{}, @@ -439,6 +442,7 @@ func TestReconcilePagerDutyIntegration(t *testing.T) { localObjects: []runtime.Object{ testNoalertsClusterDeployment(), testPagerDutyIntegration(), + testPDConfigSecret(), }, expectedSyncSets: &SyncSetEntry{}, expectedSecrets: &SecretEntry{}, @@ -502,7 +506,7 @@ func TestReconcilePagerDutyIntegration(t *testing.T) { rpdi := &ReconcilePagerDutyIntegration{ client: mocks.fakeKubeClient, scheme: scheme.Scheme, - pdclient: mocks.mockPDClient, + pdclient: func(s string) pd.Client { return mocks.mockPDClient }, } // Act [2x as first exits early after setting finalizer] @@ -556,7 +560,7 @@ func TestRemoveAlertsAfterCreate(t *testing.T) { rpdi := &ReconcilePagerDutyIntegration{ client: mocks.fakeKubeClient, scheme: scheme.Scheme, - pdclient: mocks.mockPDClient, + pdclient: func(s string) pd.Client { return mocks.mockPDClient }, } // Act (create) [2x as first exits early after setting finalizer] @@ -636,7 +640,7 @@ func TestDeleteSecret(t *testing.T) { rpdi := &ReconcilePagerDutyIntegration{ client: mocks.fakeKubeClient, scheme: scheme.Scheme, - pdclient: mocks.mockPDClient, + pdclient: func(s string) pd.Client { return mocks.mockPDClient }, } // Act (create) [2x as first exits early after setting finalizer] diff --git a/pkg/localmetrics/localmetrics.go b/pkg/localmetrics/localmetrics.go index 64113885..d76b6d4f 100644 --- a/pkg/localmetrics/localmetrics.go +++ b/pkg/localmetrics/localmetrics.go @@ -38,13 +38,13 @@ var ( Name: "pagerduty_create_failure", Help: "Metric for the failure of creating a cluster deployment.", ConstLabels: prometheus.Labels{"name": "pagerduty-operator"}, - }, []string{"clusterdeployment_name"}) + }, []string{"clusterdeployment_name", "pagerdutyintegration_name"}) MetricPagerDutyDeleteFailure = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "pagerduty_delete_failure", Help: "Metric for the failure of deleting a cluster deployment.", ConstLabels: prometheus.Labels{"name": "pagerduty-operator"}, - }, []string{"clusterdeployment_name"}) + }, []string{"clusterdeployment_name", "pagerdutyintegration_name"}) MetricPagerDutyHeartbeat = prometheus.NewSummary(prometheus.SummaryOpts{ Name: "pagerduty_heartbeat", @@ -69,16 +69,23 @@ var ( Buckets: []float64{1}, }, []string{"controller", "method", "resource", "status"}) + MetricPagerDutyIntegrationSecretLoaded = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "pagerdutyintegration_secret_loaded", + Help: "Metric to track the ability to load the PagerDuty API key from the Secret specified in the PagerDutyIntegration", + ConstLabels: prometheus.Labels{"name": "pagerduty-operator"}, + }, []string{"pagerdutyintegration_name"}) + MetricsList = []prometheus.Collector{ MetricPagerDutyCreateFailure, MetricPagerDutyDeleteFailure, MetricPagerDutyHeartbeat, ApiCallDuration, ReconcileDuration, + MetricPagerDutyIntegrationSecretLoaded, } ) -// UpdateAPIMetrics updates all API endpoint metrics ever 5 minutes +// UpdateAPIMetrics updates all API endpoint metrics every 5 minutes func UpdateAPIMetrics(APIKey string, timer *prometheus.Timer) { d := time.Tick(5 * time.Minute) for range d { @@ -87,18 +94,39 @@ func UpdateAPIMetrics(APIKey string, timer *prometheus.Timer) { } -// UpdateMetricPagerDutyCreateFailure updates guage to 1 when creation fails -func UpdateMetricPagerDutyCreateFailure(x int, cd string) { +// UpdateMetricPagerDutyIntegrationSecretLoaded updates gauge to 1 +// when the PagerDuty API key can be loaded from the Secret specified +// in the PagerDutyIntegration, or to 0 if it fails +func UpdateMetricPagerDutyIntegrationSecretLoaded(x int, pdiName string) { + MetricPagerDutyIntegrationSecretLoaded.With( + prometheus.Labels{"pagerdutyintegration_name": pdiName}, + ).Set(float64(x)) +} + +// DeleteMetricPagerDutyIntegrationSecretLoaded deletes the metric for +// the PagerDutyIntegration name provided. This should be called when +// e.g. the PagerDutyIntegration is being deleted, so that there are +// no irrelevant metrics (which alerts could be firing on). +func DeleteMetricPagerDutyIntegrationSecretLoaded(pdiName string) bool { + return MetricPagerDutyIntegrationSecretLoaded.Delete( + prometheus.Labels{"pagerdutyintegration_name": pdiName}, + ) +} + +// UpdateMetricPagerDutyCreateFailure updates gauge to 1 when creation fails +func UpdateMetricPagerDutyCreateFailure(x int, cd string, pdiName string) { MetricPagerDutyCreateFailure.With(prometheus.Labels{ - "clusterdeployment_name": cd}).Set( - float64(x)) + "clusterdeployment_name": cd, + "pagerdutyintegration_name": pdiName, + }).Set(float64(x)) } -// UpdateMetricPagerDutyDeleteFailure updates guage to 1 when deletion fails -func UpdateMetricPagerDutyDeleteFailure(x int, cd string) { +// UpdateMetricPagerDutyDeleteFailure updates gauge to 1 when deletion fails +func UpdateMetricPagerDutyDeleteFailure(x int, cd string, pdiName string) { MetricPagerDutyDeleteFailure.With(prometheus.Labels{ - "clusterdeployment_name": cd}).Set( - float64(x)) + "clusterdeployment_name": cd, + "pagerdutyintegration_name": pdiName, + }).Set(float64(x)) } // SetReconcileDuration Push the duration of the reconcile iteration