diff --git a/Makefile b/Makefile index 70a26ebf..bafd7ad2 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,14 @@ # Include boilerplate's generated Makefile libraries include boilerplate/generated-includes.mk +# ===> TODO: Remove this override once the boilerplate backing image has go-bindata +.PHONY: go-generate +go-generate: + go get github.com/go-bindata/go-bindata/...@v3.1.2 + ${GOENV} go generate $(TESTTARGETS) + # Don't forget to commit generated files +# <=== TODO: Remove this override once the boilerplate backing image has go-bindata + .PHONY: boilerplate-update boilerplate-update: @boilerplate/update diff --git a/README.md b/README.md index 697f41d8..ea1b82ff 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,17 @@ [![codecov](https://codecov.io/gh/openshift/configure-alertmanager-operator/branch/master/graph/badge.svg)](https://codecov.io/gh/openshift/configure-alertmanager-operator) [![License](https://img.shields.io/:license-apache-blue.svg)](http://www.apache.org/licenses/LICENSE-2.0.html) +- [configure-alertmanager-operator](#configure-alertmanager-operator) + - [Summary](#summary) + - [Cluster Readiness](#cluster-readiness) + - [Metrics](#metrics) + - [Alerts](#alerts) + - [Testing](#testing) + - [Building](#building) + - [Deploying](#deploying) + - [Prevent Overwrites](#prevent-overwrites) + - [Replace the Image](#replace-the-image) + ## Summary The Configure Alertmanager Operator was created for the OpenShift Dedicated platform to dynamically manage Alertmanager configurations based on the presence or absence of secrets containing a Pager Duty RoutingKey and [Dead Man's Snitch](https://deadmanssnitch.com) URL. When the secret is created/updated/deleted, the associated Receiver and Route will be created/updated/deleted within the Alertmanager config. @@ -13,6 +24,9 @@ The operator contains the following components: * Secret controller: watches the `openshift-monitoring` namespace for any changes to Secrets named `alertmanager-main`, `pd-secret` or `dms-secret`. * Types library: these types are imported from the Alertmanager [Config](https://github.com/prometheus/alertmanager/blob/master/config/config.go) library and pared down to suit our config needs. (Since their library is [intended for internal use only](https://github.com/prometheus/alertmanager/pull/1804#issuecomment-482038079)). +## Cluster Readiness +To avoid alert noise while a cluster is in the early stages of being installed and configured, this operator waits to configure Pager Duty -- effectively silencing alerts -- until a predetermined set of health checks has succeeded. +The operator uses [osd-cluster-ready](https://github.com/openshift/osd-cluster-ready/) to perform these health checks. ## Metrics The Configure Alertmanager Operator exposes the following Prometheus metrics: @@ -27,3 +41,73 @@ The following alerts are added to Prometheus as part of configure-alertmanager-o * Mismatch between DMS secret and DMS Alertmanager config. * Mismatch between PD secret and PD Alertmanager config. * Alertmanager config secret does not exist. + +## Testing +Tips for testing on a personal cluster: + +### Building +You may build (`make docker-build`) and push (`make docker-push`) the operator image to a personal repository by overriding components of the image URI: +- `IMAGE_REGISTRY` overrides the *registry* (default: `quay.io`) +- `IMAGE_REPOSITORY` overrides the *organization* (default: `app-sre`) +- `IMAGE_NAME` overrides the *repository name* (default: `managed-cluster-validating-webhooks`) +- `OPERATOR_IMAGE_TAG` overrides the *image tag*. (By default this is generated based on the current commit of your local clone of the git repository; but `make docker-build` will also always tag `latest`) + +For example, to build, tag, and push `quay.io/my-user/configure-alertmanager-operator:latest`, you can run: + +``` +make IMAGE_REPOSITORY=my-user docker-build docker-push +``` + +### Deploying + +#### Prevent Overwrites + +Note: This step requires elevated permissions + +This operator is managed by OLM, so you must switch that off, or your changes to the operator's Deployment will be overwritten: + +``` +oc scale deploy/cluster-version-operator --replicas=0 -n openshift-cluster-version +oc scale deploy/olm-operator --replicas=0 -n openshift-operator-lifecycle-manager +``` + +**NOTE: Don't forget to revert these changes when you have finished testing:** + +``` +oc scale deploy/olm-operator --replicas=1 -n openshift-operator-lifecycle-manager +oc scale deploy/cluster-version-operator --replicas=1 -n openshift-cluster-version +``` + +#### Replace the Image +Edit the operator's deployment (`oc edit deployment configure-alertmanager-operator -n openshift-monitoring`), replacing the `image:` with the URI of the image you built [above](#building). The deployment will automatically delete and replace the running pod. + +**NOTE:** If you are testing the osd-cluster-ready job, you may need to set the `MAX_CLUSTER_AGE_MINUTES` environment variable in the deployment's `configure-alertmanager-operator` container definition. +For example, to ensure osd-cluster-ready runs in a cluster less than 1048576 minutes (~two years) old: + +```yaml + containers: + - command: + - configure-alertmanager-operator + env: + - name: WATCH_NAMESPACE + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: metadata.namespace + - name: POD_NAME + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: metadata.name + - name: OPERATOR_NAME + value: configure-alertmanager-operator + ### Add this entry ### + - name: MAX_CLUSTER_AGE_MINUTES + value: "1048576" + image: quay.io/2uasimojo/configure-alertmanager-operator:latest + imagePullPolicy: Always + name: configure-alertmanager-operator + resources: {} + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File +``` diff --git a/go.mod b/go.mod index a3f31366..31538ca5 100644 --- a/go.mod +++ b/go.mod @@ -5,14 +5,16 @@ go 1.13 require ( cloud.google.com/go v0.47.0 // indirect github.com/coreos/prometheus-operator v0.34.0 + github.com/go-bindata/go-bindata v3.1.2+incompatible // indirect github.com/go-openapi/spec v0.19.5-0.20191022081736-744796356cda // indirect + github.com/golang/mock v1.3.1 github.com/json-iterator/go v1.1.8 // indirect github.com/onsi/ginkgo v1.12.0 // indirect github.com/onsi/gomega v1.9.0 // indirect github.com/openshift/api v3.9.1-0.20190924102528-32369d4db2ad+incompatible - github.com/openshift/cluster-operator v0.0.0-20190529110107-668db5da8c20 github.com/operator-framework/operator-sdk v0.16.0 github.com/prometheus/client_golang v1.2.1 + github.com/prometheus/common v0.7.0 github.com/spf13/pflag v1.0.5 go.uber.org/multierr v1.2.0 // indirect go.uber.org/zap v1.11.0 // indirect diff --git a/go.sum b/go.sum index f4cbfca2..cf969694 100644 --- a/go.sum +++ b/go.sum @@ -207,6 +207,8 @@ github.com/globalsign/mgo v0.0.0-20180905125535-1ca0a4f7cbcb/go.mod h1:xkRDCp4j0 github.com/globalsign/mgo v0.0.0-20181015135952-eeefdecb41b8/go.mod h1:xkRDCp4j0OGD1HRkm4kmhM+pmpv3AKq5SU7GMg4oO/Q= github.com/go-acme/lego v2.5.0+incompatible/go.mod h1:yzMNe9CasVUhkquNvti5nAtPmG94USbYxYrZfTkIn0M= github.com/go-bindata/go-bindata v3.1.1+incompatible/go.mod h1:xK8Dsgwmeed+BBsSy2XTopBn/8uK2HWuGSnA11C3Joo= +github.com/go-bindata/go-bindata v3.1.2+incompatible h1:5vjJMVhowQdPzjE1LdxyFF7YFTXg5IgGVW4gBr5IbvE= +github.com/go-bindata/go-bindata v3.1.2+incompatible/go.mod h1:xK8Dsgwmeed+BBsSy2XTopBn/8uK2HWuGSnA11C3Joo= github.com/go-gl/glfw v0.0.0-20190409004039-e6da0acd62b1/go.mod h1:vR7hzQXu2zJy9AVAgeJqvqgH9Q5CA+iKCZ2gyEVpxRU= github.com/go-kit/kit v0.8.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as= github.com/go-kit/kit v0.9.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as= @@ -295,6 +297,7 @@ github.com/golang/groupcache v0.0.0-20191027212112-611e8accdfc9/go.mod h1:cIg4er github.com/golang/lint v0.0.0-20180702182130-06c8688daad7/go.mod h1:tluoj9z5200jBnyusfRPU2LqT6J+DAorxEvtC7LHB+E= github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= github.com/golang/mock v1.2.0/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= +github.com/golang/mock v1.3.1 h1:qGJ6qTW+x6xX/my+8YUVl4WNpX9B7+/l2tRsHGZ7f2s= github.com/golang/mock v1.3.1/go.mod h1:sBzyDLLjw3U8JLTeZvSv8jJB+tU5PVekmnlKIyFUx0Y= github.com/golang/protobuf v0.0.0-20161109072736-4bd1920723d7/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.0.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= @@ -517,8 +520,6 @@ github.com/opencontainers/selinux v1.2.2/go.mod h1:+BLncwf63G4dgOzykXAxcmnFlUaOl github.com/openshift/api v3.9.1-0.20190924102528-32369d4db2ad+incompatible h1:6il8W875Oq9vycPkRV5TteLP9IfMEX3lyOl5yN+CtdI= github.com/openshift/api v3.9.1-0.20190924102528-32369d4db2ad+incompatible/go.mod h1:dh9o4Fs58gpFXGSYfnVxGR9PnV53I8TW84pQaJDdGiY= github.com/openshift/client-go v0.0.0-20190923180330-3b6373338c9b/go.mod h1:6rzn+JTr7+WYS2E1TExP4gByoABxMznR6y2SnUIkmxk= -github.com/openshift/cluster-operator v0.0.0-20190529110107-668db5da8c20 h1:FXE0nwGK3/MC0zUGqa2Wj+xhhawWo0U7q9s5vom+csA= -github.com/openshift/cluster-operator v0.0.0-20190529110107-668db5da8c20/go.mod h1:TOaKmt2XSw3ccak2GoVYvj8UtApTyLBf1vGl3u3+cV4= github.com/openshift/origin v0.0.0-20160503220234-8f127d736703/go.mod h1:0Rox5r9C8aQn6j1oAOQ0c1uC86mYbUFObzjBRvUKHII= github.com/openshift/prom-label-proxy v0.1.1-0.20191016113035-b8153a7f39f1/go.mod h1:p5MuxzsYP1JPsNGwtjtcgRHHlGziCJJfztff91nNixw= github.com/opentracing/opentracing-go v1.1.0/go.mod h1:UkNAQd3GIcIGf0SeVgPpRdFStlNbqXla1AfSYxPUl2o= @@ -974,7 +975,6 @@ modernc.org/strutil v1.0.0/go.mod h1:lstksw84oURvj9y3tn8lGvRxyRC1S2+g5uuIzNfIOBs modernc.org/xc v1.0.0/go.mod h1:mRNCo0bvLjGhHO9WsyuKVU4q0ceiDDDoEeWDJHrNx8I= rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8= rsc.io/letsencrypt v0.0.1/go.mod h1:buyQKZ6IXrRnB7TdkHP0RyEybLx18HHyOSoTyoOLqNY= -sigs.k8s.io/cluster-api v0.3.9 h1:WongQFeW+vbII9Karc3nIarxMfuUuTr33QU9aSyiKfs= sigs.k8s.io/controller-runtime v0.4.0 h1:wATM6/m+3w8lj8FXNaO6Fs/rq/vqoOjO1Q116Z9NPsg= sigs.k8s.io/controller-runtime v0.4.0/go.mod h1:ApC79lpY3PHW9xj/w9pj+lYkLgwAAUZwfXkME1Lajns= sigs.k8s.io/controller-tools v0.2.4/go.mod h1:m/ztfQNocGYBgTTCmFdnK94uVvgxeZeE3LtJvd/jIzA= diff --git a/manifests/01_role.yaml b/manifests/01_role.yaml index 1a98feac..ac2d4978 100644 --- a/manifests/01_role.yaml +++ b/manifests/01_role.yaml @@ -80,4 +80,9 @@ rules: - watch - patch - update - +- apiGroups: + - batch + resources: + - jobs + verbs: + - "*" diff --git a/manifests/03_role_binding.yaml b/manifests/03_role_binding.yaml index 5cfd0f8f..69ffed6e 100644 --- a/manifests/03_role_binding.yaml +++ b/manifests/03_role_binding.yaml @@ -38,3 +38,17 @@ subjects: - kind: ServiceAccount name: configure-alertmanager-operator namespace: openshift-monitoring +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: configure-alertmanager-operator.prom + namespace: openshift-monitoring +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: cluster-monitoring-view +subjects: +- kind: ServiceAccount + name: configure-alertmanager-operator + namespace: openshift-monitoring diff --git a/pkg/controller/secret/secret_controller.go b/pkg/controller/secret/secret_controller.go index f7d92cb1..b044ac58 100644 --- a/pkg/controller/secret/secret_controller.go +++ b/pkg/controller/secret/secret_controller.go @@ -19,6 +19,7 @@ import ( "github.com/openshift/configure-alertmanager-operator/config" "github.com/openshift/configure-alertmanager-operator/pkg/metrics" + "github.com/openshift/configure-alertmanager-operator/pkg/readiness" alertmanager "github.com/openshift/configure-alertmanager-operator/pkg/types" configv1 "github.com/openshift/api/config/v1" @@ -66,8 +67,9 @@ var _ reconcile.Reconciler = &ReconcileSecret{} type ReconcileSecret struct { // This client, initialized using mgr.Client() above, is a split client // that reads objects from the cache and writes to the apiserver - client client.Client - scheme *runtime.Scheme + client client.Client + scheme *runtime.Scheme + readiness readiness.Interface } // Add creates a new Secret Controller and adds it to the Manager. The Manager will set fields on the Controller @@ -78,7 +80,12 @@ func Add(mgr manager.Manager) error { // newReconciler returns a new reconcile.Reconciler func newReconciler(mgr manager.Manager) reconcile.Reconciler { - return &ReconcileSecret{client: mgr.GetClient(), scheme: mgr.GetScheme()} + client := mgr.GetClient() + return &ReconcileSecret{ + client: client, + scheme: mgr.GetScheme(), + readiness: &readiness.Impl{Client: client}, + } } // add adds a new Controller to mgr with r as the reconcile.Reconciler @@ -424,6 +431,7 @@ func (r *ReconcileSecret) Reconcile(request reconcile.Request) (reconcile.Result reqLogger.Info("Reconciling Secret") // This operator is only interested in the 3 secrets listed below. Skip reconciling for all other secrets. + // TODO: Filter these with a predicate instead switch request.Name { case secretNamePD: case secretNameDMS: @@ -434,6 +442,12 @@ func (r *ReconcileSecret) Reconcile(request reconcile.Request) (reconcile.Result } log.Info("DEBUG: Started reconcile loop") + clusterReady, err := r.readiness.IsReady() + if err != nil { + log.Error(err, "Error determining cluster readiness.") + return r.readiness.Result(), err + } + // Get a list of all Secrets in the `openshift-monitoring` namespace. // This is used for determining which secrets are present so that the necessary // Alertmanager config changes can happen later. @@ -450,7 +464,7 @@ func (r *ReconcileSecret) Reconcile(request reconcile.Request) (reconcile.Result // Get the secret from the request. If it's a secret we monitor, flag for reconcile. instance := &corev1.Secret{} - err := r.client.Get(context.TODO(), request.NamespacedName, instance) + err = r.client.Get(context.TODO(), request.NamespacedName, instance) // if there was an error other than "not found" requeue if err != nil { @@ -469,9 +483,16 @@ func (r *ReconcileSecret) Reconcile(request reconcile.Request) (reconcile.Result pagerdutyRoutingKey := "" watchdogURL := "" // If a secret exists, add the necessary configs to Alertmanager. + // But don't activate PagerDuty unless the cluster is "ready". + // This is to avoid alert noise while the cluster is still being installed and configured. if pagerDutySecretExists { log.Info("INFO: Pager Duty secret exists") - pagerdutyRoutingKey = readSecretKey(r, &request, secretNamePD, secretKeyPD) + if clusterReady { + log.Info("INFO: Cluster is ready; configuring Pager Duty") + pagerdutyRoutingKey = readSecretKey(r, &request, secretNamePD, secretKeyPD) + } else { + log.Info("INFO: Cluster is not ready; skipping Pager Duty configuration") + } } if snitchSecretExists { log.Info("INFO: Dead Man's Snitch secret exists") @@ -496,7 +517,9 @@ func (r *ReconcileSecret) Reconcile(request reconcile.Request) (reconcile.Result // Update metrics after all reconcile operations are complete. metrics.UpdateSecretsMetrics(secretList, alertmanagerconfig) reqLogger.Info("Finished reconcile for secret.") - return reconcile.Result{}, nil + + // The readiness Result decides whether we should requeue, effectively "polling" the readiness logic. + return r.readiness.Result(), nil } func (r *ReconcileSecret) getClusterID() (string, error) { diff --git a/pkg/controller/secret/secret_controller_test.go b/pkg/controller/secret/secret_controller_test.go index d45a9392..12c07234 100644 --- a/pkg/controller/secret/secret_controller_test.go +++ b/pkg/controller/secret/secret_controller_test.go @@ -7,8 +7,10 @@ import ( "reflect" "testing" + "github.com/golang/mock/gomock" configv1 "github.com/openshift/api/config/v1" "github.com/openshift/configure-alertmanager-operator/config" + "github.com/openshift/configure-alertmanager-operator/pkg/readiness" alertmanager "github.com/openshift/configure-alertmanager-operator/pkg/types" yaml "gopkg.in/yaml.v2" corev1 "k8s.io/api/core/v1" @@ -514,15 +516,21 @@ func createSecret(reconciler *ReconcileSecret, secretname string, secretkey stri } // createReconciler creates a fake ReconcileSecret for testing. -func createReconciler(t *testing.T) *ReconcileSecret { +// If ready is nil, a real readiness Impl is constructed. +func createReconciler(t *testing.T, ready readiness.Interface) *ReconcileSecret { scheme := scheme.Scheme if err := configv1.AddToScheme(scheme); err != nil { t.Fatalf("Unable to add route scheme: (%v)", err) } + if ready == nil { + ready = &readiness.Impl{} + } + return &ReconcileSecret{ - client: fake.NewFakeClientWithScheme(scheme), - scheme: scheme, + client: fake.NewFakeClientWithScheme(scheme), + scheme: scheme, + readiness: ready, } } @@ -556,7 +564,12 @@ func Test_createPagerdutySecret_Create(t *testing.T) { verifyInhibitRules(t, configExpected.InhibitRules) // prepare environment - reconciler := createReconciler(t) + ctrl := gomock.NewController(t) + defer ctrl.Finish() + mockReadiness := readiness.NewMockInterface(ctrl) + mockReadiness.EXPECT().IsReady().Times(1).Return(true, nil) + mockReadiness.EXPECT().Result().Times(1).Return(reconcile.Result{}) + reconciler := createReconciler(t, mockReadiness) createNamespace(reconciler, t) createConsolePublicConfigMap(reconciler, t) createSecret(reconciler, secretNamePD, secretKeyPD, pdKey) @@ -588,7 +601,12 @@ func Test_createPagerdutySecret_Update(t *testing.T) { verifyInhibitRules(t, configExpected.InhibitRules) // prepare environment - reconciler := createReconciler(t) + ctrl := gomock.NewController(t) + defer ctrl.Finish() + mockReadiness := readiness.NewMockInterface(ctrl) + mockReadiness.EXPECT().IsReady().Times(2).Return(true, nil) + mockReadiness.EXPECT().Result().Times(2).Return(reconcile.Result{}) + reconciler := createReconciler(t, mockReadiness) createNamespace(reconciler, t) createConsolePublicConfigMap(reconciler, t) createSecret(reconciler, secretNamePD, secretKeyPD, pdKey) @@ -632,6 +650,9 @@ func createClusterVersion(reconciler *ReconcileSecret) { } func Test_ReconcileSecrets(t *testing.T) { + ctrl := gomock.NewController(t) + defer ctrl.Finish() + tests := []struct { name string dmsExists bool @@ -704,7 +725,10 @@ func Test_ReconcileSecrets(t *testing.T) { }, } for _, tt := range tests { - reconciler := createReconciler(t) + mockReadiness := readiness.NewMockInterface(ctrl) + mockReadiness.EXPECT().IsReady().Times(1).Return(true, nil) + mockReadiness.EXPECT().Result().Times(1).Return(reconcile.Result{}) + reconciler := createReconciler(t, mockReadiness) createNamespace(reconciler, t) createConsolePublicConfigMap(reconciler, t) createClusterVersion(reconciler) @@ -744,3 +768,88 @@ func Test_ReconcileSecrets(t *testing.T) { assertEquals(t, configExpected.String(), configActual.String(), tt.name) } } + +// Test_ReconcileSecrets_Readiness tests the Reconcile loop for different results of the +// cluster readiness check. +func Test_ReconcileSecrets_Readiness(t *testing.T) { + ctrl := gomock.NewController(t) + defer ctrl.Finish() + tests := []struct { + name string + ready bool + readyErr bool + expectDMS bool + expectPD bool + }{ + { + name: "Cluster not ready: don't configure PD.", + ready: false, + readyErr: false, + expectDMS: true, + expectPD: false, + }, + { + // This is covered by other test cases, but for completeness... + name: "Cluster ready: configure everything.", + ready: true, + readyErr: false, + expectDMS: true, + expectPD: true, + }, + { + name: "Readiness check errors: don't configure anything.", + ready: false, + readyErr: true, + expectDMS: false, + expectPD: false, + }, + } + for _, tt := range tests { + mockReadiness := readiness.NewMockInterface(ctrl) + var expectErr error = nil + if tt.readyErr { + expectErr = fmt.Errorf("An error occurred") + } + mockReadiness.EXPECT().IsReady().Times(1).Return(tt.ready, expectErr) + // Use a weird Result() to validate that all the code paths are using it. + expectResult := reconcile.Result{RequeueAfter: 12345} + mockReadiness.EXPECT().Result().Times(1).Return(expectResult) + reconciler := createReconciler(t, mockReadiness) + createNamespace(reconciler, t) + createConsolePublicConfigMap(reconciler, t) + createClusterVersion(reconciler) + + writeAlertManagerConfig(reconciler, createAlertManagerConfig("", "", "", "")) + + pdKey := "asdfjkl123" + dmsURL := "https://hjklasdf09876" + + // Create the secrets for this specific test. + // We're testing that Reconcile parlays the PD/DMS secrets into the AM config as + // appropriate. So we always start with those two secrets + createSecret(reconciler, secretNameDMS, secretKeyDMS, dmsURL) + createSecret(reconciler, secretNamePD, secretKeyPD, pdKey) + + // However, we expect the AM config to be updated only according to the test spec + if !tt.expectDMS { + dmsURL = "" + } + if !tt.expectPD { + pdKey = "" + } + configExpected := createAlertManagerConfig(pdKey, dmsURL, exampleConsoleUrl, exampleClusterId) + + verifyInhibitRules(t, configExpected.InhibitRules) + + req := createReconcileRequest(reconciler, secretNameAlertmanager) + ret, err := reconciler.Reconcile(*req) + assertEquals(t, expectResult, ret, "Unexpected result") + assertEquals(t, expectErr, err, "Unexpected err") + + // load the config and check it + configActual := readAlertManagerConfig(reconciler, req) + + // NOTE compare of the objects will fail when no secrets are created for some reason, so using .String() + assertEquals(t, configExpected.String(), configActual.String(), tt.name) + } +} diff --git a/pkg/readiness/cluster_ready.go b/pkg/readiness/cluster_ready.go new file mode 100644 index 00000000..62c683f7 --- /dev/null +++ b/pkg/readiness/cluster_ready.go @@ -0,0 +1,272 @@ +package readiness + +//go:generate go-bindata -nocompress -nometadata -pkg readiness -o zz_generated_defs.go defs/ +//go:generate mockgen -destination zz_generated_mocks.go -package readiness -source=cluster_ready.go + +import ( + "context" + "crypto/tls" + "fmt" + "io/ioutil" + "net" + "net/http" + "net/url" + "os" + "path/filepath" + "strconv" + "time" + + "github.com/openshift/configure-alertmanager-operator/config" + "github.com/prometheus/client_golang/api" + promv1 "github.com/prometheus/client_golang/api/prometheus/v1" + "github.com/prometheus/common/model" + "gopkg.in/yaml.v2" + batchv1 "k8s.io/api/batch/v1" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" + logf "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/reconcile" +) + +var log = logf.Log.WithName("readiness") + +// Impl is a concrete instance of the readiness engine. +type Impl struct { + // Client is a controller-runtime client capable of querying k8s. + Client client.Client + // result is what the calling Reconcile should return if it is otherwise successful. + result reconcile.Result + // ready indicates whether the cluster is considered ready. Once this is true, + // Check() is a no-op. + ready bool + // clusterCreationTime caches the birth time of the cluster so we only have to + // query prometheus once. + clusterCreationTime time.Time + // promAPI is a handle to the prometheus API client + promAPI promv1.API +} + +// Interface is the interface for the readiness engine. +type Interface interface { + IsReady() (bool, error) + Result() reconcile.Result + setClusterCreationTime() error + clusterTooOld(int) bool + setPromAPI() error +} + +var _ Interface = &Impl{} + +const ( + // Maximum cluster age, in minutes, after whiche we'll assume we don't need to run health checks. + maxClusterAgeKey = "MAX_CLUSTER_AGE_MINUTES" + // By default, ignore clusters older than two hours + maxClusterAgeDefault = 2 * 60 + + jobName = "osd-cluster-ready" +) + +// IsReady deals with the osd-cluster-ready Job. +// Sets: +// - impl.Ready: +// true if: +// - a previous check has already succeeded (a cluster can't become un-ready once it's ready); +// - an osd-cluster-ready Job has completed successfully; or +// - the cluster is older than maxClusterAgeMinutes +// false otherwise. +// - impl.Result: If the caller's reconcile is otherwise successful, it +// should return the given Result. +// - impl.clusterCreationTime: If it is necessary to check the age of the cluster, this is set so +// we only have to query prometheus once. +func (impl *Impl) IsReady() (bool, error) { + if impl.ready { + log.Info("DEBUG: Using cached positive cluster readiness.") + return impl.ready, nil + } + + // Default Result + impl.result = reconcile.Result{} + + // Readiness job part 1: Grab it, and short out if it was successful. + job := &batchv1.Job{} + found := true + if err := impl.Client.Get(context.TODO(), types.NamespacedName{Namespace: config.OperatorNamespace, Name: jobName}, job); err != nil { + if !errors.IsNotFound(err) { + // If we couldn't query k8s, it is fatal for this iteration of the reconcile + return false, fmt.Errorf("Failed to retrieve %s Job: %v", jobName, err) + } + found = false + } + // If the job completed successfully, we're done. + if found && job.Status.Succeeded > 0 { + log.Info(fmt.Sprintf("INFO: Found a succeeded %s Job.", jobName)) + impl.ready = true + return impl.ready, nil + } + + // Cluster age: short out if the cluster is older than the configured value + if err := impl.setClusterCreationTime(); err != nil { + log.Error(err, "Failed to determine cluster creation time") + // If we failed to query prometheus, the cluster isn't ready. + // We want the main Reconcile loop to proceed, so don't return an error; but + // we want to requeue rapidly so we can keep checking for cluster birth. + impl.result = reconcile.Result{Requeue: true, RequeueAfter: time.Second} + return false, nil + } + maxClusterAge, err := getEnvInt(maxClusterAgeKey, maxClusterAgeDefault) + if err != nil { + // This is likely to result in a hot loop :( + return false, err + } + if impl.clusterTooOld(maxClusterAge) { + log.Info(fmt.Sprintf("INFO: Cluster is older than %d minutes. Ignoring health check.", maxClusterAge)) + impl.ready = true + return impl.ready, nil + } + + // Readiness job part 2: existing but not (yet) successul + if found { + // If the Job is still running, requeue with a little pause. + if job.Status.Active > 0 { + log.Info(fmt.Sprintf("INFO: Found an Active %s Job. Will requeue.", jobName)) + impl.result = reconcile.Result{Requeue: true, RequeueAfter: 10 * time.Second} + return false, nil + } + + // If we get here, the Job finished unsuccessfully. Delete it so we can recreate it. + log.Info(fmt.Sprintf("INFO: Deleting failed %s Job", jobName)) + err = impl.Client.Delete(context.TODO(), job) + // Let the rest of the reconcile proceed, but requeue so we'll come back and reassess the readiness job next time. + // We requeue whether there's an error (which might be "already gone") or not. + impl.result = reconcile.Result{Requeue: true} + return false, err + } + + // Readiness job part 3: the Job doesn't exist -- create it. + loadDefTemplate(job, fmt.Sprintf("%s.Job.yaml", jobName)) + log.Info(fmt.Sprintf("INFO: Creating %s Job", jobName)) + log.Info(fmt.Sprintf("DEBUG: Job def: %v", *job)) + err = impl.Client.Create(context.TODO(), job) + // Let the rest of the reconcile proceed, but requeue so we'll come back and reassess the readiness job next time. + // We requeue whether there's an error or not. + impl.result = reconcile.Result{Requeue: true} + return false, err +} + +func (impl *Impl) Result() reconcile.Result { + return impl.result +} + +func (impl *Impl) setPromAPI() error { + rawToken, err := ioutil.ReadFile("/var/run/secrets/kubernetes.io/serviceaccount/token") + if err != nil { + return fmt.Errorf("Couldn't read token file: %v", err) + } + + client, err := api.NewClient(api.Config{ + Address: "https://prometheus-k8s.openshift-monitoring.svc:9091", + RoundTripper: &http.Transport{ + Proxy: func(request *http.Request) (*url.URL, error) { + request.Header.Add("Authorization", "Bearer "+string(rawToken)) + return http.ProxyFromEnvironment(request) + }, + DialContext: (&net.Dialer{ + Timeout: 30 * time.Second, + KeepAlive: 30 * time.Second, + }).DialContext, + TLSClientConfig: &tls.Config{ + InsecureSkipVerify: true, + }, + TLSHandshakeTimeout: 10 * time.Second, + }, + }) + if err != nil { + return fmt.Errorf("Couldn't configure prometheus client: %v", err) + } + + impl.promAPI = promv1.NewAPI(client) + return nil +} + +func (impl *Impl) setClusterCreationTime() error { + // Is it cached? + if !impl.clusterCreationTime.IsZero() { + return nil + } + + if err := impl.setPromAPI(); err != nil { + return fmt.Errorf("Couldn't get prometheus API: %v", err) + } + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + when := time.Now() + // For testing, do something like this, subtracting the number of hours + // since you disabled CVO: + // when := time.Now().Add(-32*time.Hour) + result, warnings, err := impl.promAPI.Query(ctx, "cluster_version{type=\"initial\"}", when) + if err != nil { + return fmt.Errorf("Error querying Prometheus: %v", err) + } + if len(warnings) > 0 { + log.Info(fmt.Sprintf("Warnings: %v\n", warnings)) + } + + log.Info(fmt.Sprintf("DEBUG: Result of type %s:\n%s\n", result.Type().String(), result.String())) + resultVec := result.(model.Vector) + earliest := time.Time{} + for i := 0; i < resultVec.Len(); i++ { + thisTime := time.Unix(int64(resultVec[i].Value), 0) + if earliest.IsZero() || thisTime.Before(earliest) { + earliest = thisTime + } + } + if earliest.IsZero() { + return fmt.Errorf("Failed to determine cluster birth time from prometheus %s result %v", result.Type().String(), result.String()) + } + impl.clusterCreationTime = earliest + log.Info(fmt.Sprintf("INFO: Cluster created %v", earliest.UTC())) + return nil +} + +func (impl *Impl) clusterTooOld(maxAgeMinutes int) bool { + maxAge := time.Now().Add(time.Duration(-maxAgeMinutes) * time.Minute) + return impl.clusterCreationTime.Before(maxAge) +} + +// getEnvInt returns the integer value of the environment variable with the specified `key`. +// If the env var is unspecified/empty, the `def` value is returned. +// The error is non-nil if the env var is nonempty but cannot be parsed as an int. +func getEnvInt(key string, def int) (int, error) { + var intVal int + var err error + + strVal := os.Getenv(key) + + if strVal == "" { + // Env var unset; use the default + return def, nil + } + + if intVal, err = strconv.Atoi(strVal); err != nil { + return 0, fmt.Errorf("Invalid value for env var: %s=%s (expected int): %v", key, strVal, err) + } + + return intVal, nil +} + +func loadDefTemplate(receiver runtime.Object, defFile string) { + if err := yaml.Unmarshal(MustAsset(filepath.Join("defs", defFile)), receiver); err != nil { + panic(fmt.Sprintf("Couldn't load %s: %s", defFile, err.Error())) + } + // TODO: Why aren't these coming in from the def? + job := receiver.(*batchv1.Job) + job.TypeMeta = v1.TypeMeta{Kind: "Job", APIVersion: "batch/v1"} + job.ObjectMeta = v1.ObjectMeta{Name: jobName, Namespace: config.OperatorNamespace} + job.Spec.Template.Spec.Containers[0].ImagePullPolicy = corev1.PullAlways + job.Spec.Template.Spec.RestartPolicy = corev1.RestartPolicyOnFailure + job.Spec.Template.Spec.ServiceAccountName = jobName +} diff --git a/pkg/readiness/defs/osd-cluster-ready.Job.yaml b/pkg/readiness/defs/osd-cluster-ready.Job.yaml new file mode 100644 index 00000000..3e5bf221 --- /dev/null +++ b/pkg/readiness/defs/osd-cluster-ready.Job.yaml @@ -0,0 +1,17 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: osd-cluster-ready + namespace: openshift-monitoring +spec: + template: + metadata: + name: osd-cluster-ready + spec: + containers: + - name: osd-cluster-ready + image: quay.io/openshift-sre/osd-cluster-ready + imagePullPolicy: Always + command: ["/root/main"] + restartPolicy: OnFailure + serviceAccountName: osd-cluster-ready diff --git a/pkg/readiness/zz_generated_defs.go b/pkg/readiness/zz_generated_defs.go new file mode 100644 index 00000000..db8cc31c --- /dev/null +++ b/pkg/readiness/zz_generated_defs.go @@ -0,0 +1,236 @@ +// Code generated for package readiness by go-bindata DO NOT EDIT. (@generated) +// sources: +// defs/osd-cluster-ready.Job.yaml +package readiness + +import ( + "fmt" + "io/ioutil" + "os" + "path/filepath" + "strings" + "time" +) +type asset struct { + bytes []byte + info os.FileInfo +} + +type bindataFileInfo struct { + name string + size int64 + mode os.FileMode + modTime time.Time +} + +// Name return file name +func (fi bindataFileInfo) Name() string { + return fi.name +} + +// Size return file size +func (fi bindataFileInfo) Size() int64 { + return fi.size +} + +// Mode return file mode +func (fi bindataFileInfo) Mode() os.FileMode { + return fi.mode +} + +// Mode return file modify time +func (fi bindataFileInfo) ModTime() time.Time { + return fi.modTime +} + +// IsDir return file whether a directory +func (fi bindataFileInfo) IsDir() bool { + return fi.mode&os.ModeDir != 0 +} + +// Sys return file is sys mode +func (fi bindataFileInfo) Sys() interface{} { + return nil +} + +var _defsOsdClusterReadyJobYaml = []byte(`apiVersion: batch/v1 +kind: Job +metadata: + name: osd-cluster-ready + namespace: openshift-monitoring +spec: + template: + metadata: + name: osd-cluster-ready + spec: + containers: + - name: osd-cluster-ready + image: quay.io/openshift-sre/osd-cluster-ready + imagePullPolicy: Always + command: ["/root/main"] + restartPolicy: OnFailure + serviceAccountName: osd-cluster-ready +`) + +func defsOsdClusterReadyJobYamlBytes() ([]byte, error) { + return _defsOsdClusterReadyJobYaml, nil +} + +func defsOsdClusterReadyJobYaml() (*asset, error) { + bytes, err := defsOsdClusterReadyJobYamlBytes() + if err != nil { + return nil, err + } + + info := bindataFileInfo{name: "defs/osd-cluster-ready.Job.yaml", size: 0, mode: os.FileMode(0), modTime: time.Unix(0, 0)} + a := &asset{bytes: bytes, info: info} + return a, nil +} + +// Asset loads and returns the asset for the given name. +// It returns an error if the asset could not be found or +// could not be loaded. +func Asset(name string) ([]byte, error) { + cannonicalName := strings.Replace(name, "\\", "/", -1) + if f, ok := _bindata[cannonicalName]; ok { + a, err := f() + if err != nil { + return nil, fmt.Errorf("Asset %s can't read by error: %v", name, err) + } + return a.bytes, nil + } + return nil, fmt.Errorf("Asset %s not found", name) +} + +// MustAsset is like Asset but panics when Asset would return an error. +// It simplifies safe initialization of global variables. +func MustAsset(name string) []byte { + a, err := Asset(name) + if err != nil { + panic("asset: Asset(" + name + "): " + err.Error()) + } + + return a +} + +// AssetInfo loads and returns the asset info for the given name. +// It returns an error if the asset could not be found or +// could not be loaded. +func AssetInfo(name string) (os.FileInfo, error) { + cannonicalName := strings.Replace(name, "\\", "/", -1) + if f, ok := _bindata[cannonicalName]; ok { + a, err := f() + if err != nil { + return nil, fmt.Errorf("AssetInfo %s can't read by error: %v", name, err) + } + return a.info, nil + } + return nil, fmt.Errorf("AssetInfo %s not found", name) +} + +// AssetNames returns the names of the assets. +func AssetNames() []string { + names := make([]string, 0, len(_bindata)) + for name := range _bindata { + names = append(names, name) + } + return names +} + +// _bindata is a table, holding each asset generator, mapped to its name. +var _bindata = map[string]func() (*asset, error){ + "defs/osd-cluster-ready.Job.yaml": defsOsdClusterReadyJobYaml, +} + +// AssetDir returns the file names below a certain +// directory embedded in the file by go-bindata. +// For example if you run go-bindata on data/... and data contains the +// following hierarchy: +// data/ +// foo.txt +// img/ +// a.png +// b.png +// then AssetDir("data") would return []string{"foo.txt", "img"} +// AssetDir("data/img") would return []string{"a.png", "b.png"} +// AssetDir("foo.txt") and AssetDir("notexist") would return an error +// AssetDir("") will return []string{"data"}. +func AssetDir(name string) ([]string, error) { + node := _bintree + if len(name) != 0 { + cannonicalName := strings.Replace(name, "\\", "/", -1) + pathList := strings.Split(cannonicalName, "/") + for _, p := range pathList { + node = node.Children[p] + if node == nil { + return nil, fmt.Errorf("Asset %s not found", name) + } + } + } + if node.Func != nil { + return nil, fmt.Errorf("Asset %s not found", name) + } + rv := make([]string, 0, len(node.Children)) + for childName := range node.Children { + rv = append(rv, childName) + } + return rv, nil +} + +type bintree struct { + Func func() (*asset, error) + Children map[string]*bintree +} + +var _bintree = &bintree{nil, map[string]*bintree{ + "defs": &bintree{nil, map[string]*bintree{ + "osd-cluster-ready.Job.yaml": &bintree{defsOsdClusterReadyJobYaml, map[string]*bintree{}}, + }}, +}} + +// RestoreAsset restores an asset under the given directory +func RestoreAsset(dir, name string) error { + data, err := Asset(name) + if err != nil { + return err + } + info, err := AssetInfo(name) + if err != nil { + return err + } + err = os.MkdirAll(_filePath(dir, filepath.Dir(name)), os.FileMode(0755)) + if err != nil { + return err + } + err = ioutil.WriteFile(_filePath(dir, name), data, info.Mode()) + if err != nil { + return err + } + err = os.Chtimes(_filePath(dir, name), info.ModTime(), info.ModTime()) + if err != nil { + return err + } + return nil +} + +// RestoreAssets restores an asset under the given directory recursively +func RestoreAssets(dir, name string) error { + children, err := AssetDir(name) + // File + if err != nil { + return RestoreAsset(dir, name) + } + // Dir + for _, child := range children { + err = RestoreAssets(dir, filepath.Join(name, child)) + if err != nil { + return err + } + } + return nil +} + +func _filePath(dir, name string) string { + cannonicalName := strings.Replace(name, "\\", "/", -1) + return filepath.Join(append([]string{dir}, strings.Split(cannonicalName, "/")...)...) +} diff --git a/pkg/readiness/zz_generated_mocks.go b/pkg/readiness/zz_generated_mocks.go new file mode 100644 index 00000000..9bd3bdf4 --- /dev/null +++ b/pkg/readiness/zz_generated_mocks.go @@ -0,0 +1,105 @@ +// Code generated by MockGen. DO NOT EDIT. +// Source: cluster_ready.go + +// Package readiness is a generated GoMock package. +package readiness + +import ( + gomock "github.com/golang/mock/gomock" + reflect "reflect" + reconcile "sigs.k8s.io/controller-runtime/pkg/reconcile" +) + +// MockInterface is a mock of Interface interface +type MockInterface struct { + ctrl *gomock.Controller + recorder *MockInterfaceMockRecorder +} + +// MockInterfaceMockRecorder is the mock recorder for MockInterface +type MockInterfaceMockRecorder struct { + mock *MockInterface +} + +// NewMockInterface creates a new mock instance +func NewMockInterface(ctrl *gomock.Controller) *MockInterface { + mock := &MockInterface{ctrl: ctrl} + mock.recorder = &MockInterfaceMockRecorder{mock} + return mock +} + +// EXPECT returns an object that allows the caller to indicate expected use +func (m *MockInterface) EXPECT() *MockInterfaceMockRecorder { + return m.recorder +} + +// IsReady mocks base method +func (m *MockInterface) IsReady() (bool, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "IsReady") + ret0, _ := ret[0].(bool) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// IsReady indicates an expected call of IsReady +func (mr *MockInterfaceMockRecorder) IsReady() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "IsReady", reflect.TypeOf((*MockInterface)(nil).IsReady)) +} + +// Result mocks base method +func (m *MockInterface) Result() reconcile.Result { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Result") + ret0, _ := ret[0].(reconcile.Result) + return ret0 +} + +// Result indicates an expected call of Result +func (mr *MockInterfaceMockRecorder) Result() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Result", reflect.TypeOf((*MockInterface)(nil).Result)) +} + +// setClusterCreationTime mocks base method +func (m *MockInterface) setClusterCreationTime() error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "setClusterCreationTime") + ret0, _ := ret[0].(error) + return ret0 +} + +// setClusterCreationTime indicates an expected call of setClusterCreationTime +func (mr *MockInterfaceMockRecorder) setClusterCreationTime() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "setClusterCreationTime", reflect.TypeOf((*MockInterface)(nil).setClusterCreationTime)) +} + +// clusterTooOld mocks base method +func (m *MockInterface) clusterTooOld(arg0 int) bool { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "clusterTooOld", arg0) + ret0, _ := ret[0].(bool) + return ret0 +} + +// clusterTooOld indicates an expected call of clusterTooOld +func (mr *MockInterfaceMockRecorder) clusterTooOld(arg0 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "clusterTooOld", reflect.TypeOf((*MockInterface)(nil).clusterTooOld), arg0) +} + +// setPromAPI mocks base method +func (m *MockInterface) setPromAPI() error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "setPromAPI") + ret0, _ := ret[0].(error) + return ret0 +} + +// setPromAPI indicates an expected call of setPromAPI +func (mr *MockInterfaceMockRecorder) setPromAPI() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "setPromAPI", reflect.TypeOf((*MockInterface)(nil).setPromAPI)) +}