Skip to content

Commit

Permalink
clusteroperator: Report when OLM reaches "level" and check syncs
Browse files Browse the repository at this point in the history
Cluster operators are expected to report the version of the payload
they are included in once they are "deployed", and also to keep the
cluster operator object created. Have the OLM operator keep CO up
to date, report the payload version once it hits available, and use
the count of successful syncs from the queueInformers as a probabilistic
measurement of "available" (i.e. is the operator able to retire syncs).
A future change should add a "health over time" metric or a "has
successfully synced all InstallPlans at least once" metric to replace
the current estimation.
  • Loading branch information
smarterclayton authored and ecordell committed Mar 15, 2019
1 parent 176bf33 commit 1c10730
Show file tree
Hide file tree
Showing 9 changed files with 259 additions and 77 deletions.
2 changes: 1 addition & 1 deletion cmd/catalog/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,6 @@ func main() {
http.Handle("/metrics", promhttp.Handler())
go http.ListenAndServe(":8081", nil)

_, done := catalogOperator.Run(stopCh)
_, done, _ := catalogOperator.Run(stopCh)
<-done
}
242 changes: 171 additions & 71 deletions cmd/olm/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"fmt"
"net/http"
"os"
"reflect"
"strings"
"time"

Expand All @@ -13,21 +14,20 @@ import (
v1 "k8s.io/api/core/v1"
k8serrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/discovery"
"k8s.io/client-go/tools/clientcmd"

configv1 "github.com/openshift/api/config/v1"
configv1client "github.com/openshift/client-go/config/clientset/versioned/typed/config/v1"
clusteroperatorv1helpers "github.com/openshift/library-go/pkg/config/clusteroperator/v1helpers"
operatorv1helpers "github.com/openshift/library-go/pkg/operator/v1helpers"
"github.com/operator-framework/operator-lifecycle-manager/pkg/api/client"
"github.com/operator-framework/operator-lifecycle-manager/pkg/controller/install"
"github.com/operator-framework/operator-lifecycle-manager/pkg/controller/operators/olm"
"github.com/operator-framework/operator-lifecycle-manager/pkg/lib/operatorclient"
"github.com/operator-framework/operator-lifecycle-manager/pkg/lib/signals"
"github.com/operator-framework/operator-lifecycle-manager/pkg/metrics"
olmversion "github.com/operator-framework/operator-lifecycle-manager/pkg/version"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/client-go/discovery"
"k8s.io/client-go/tools/clientcmd"
)

const (
Expand Down Expand Up @@ -128,98 +128,198 @@ func main() {
http.Handle("/metrics", promhttp.Handler())
go http.ListenAndServe(":8081", nil)

ready, done := operator.Run(stopCh)
ready, done, sync := operator.Run(stopCh)
<-ready

if *writeStatusName != "" {
opStatusGV := schema.GroupVersion{
Group: "config.openshift.io",
Version: "v1",
monitorClusterStatus(sync, stopCh, opClient, configClient)
}

<-done
}

func monitorClusterStatus(syncCh chan error, stopCh <-chan struct{}, opClient operatorclient.ClientInterface, configClient configv1client.ConfigV1Interface) {
var (
syncs int
successfulSyncs int
hasClusterOperator bool
)
go wait.Until(func() {
// slow poll until we see a cluster operator API, which could be never
if !hasClusterOperator {
opStatusGV := schema.GroupVersion{
Group: "config.openshift.io",
Version: "v1",
}
err := discovery.ServerSupportsVersion(opClient.KubernetesInterface().Discovery(), opStatusGV)
if err != nil {
log.Infof("ClusterOperator api not present, skipping update (%v)", err)
time.Sleep(time.Minute)
return
}
hasClusterOperator = true
}
err := discovery.ServerSupportsVersion(opClient.KubernetesInterface().Discovery(), opStatusGV)
if err != nil {
log.Infof("ClusterOperator api not present, skipping update (%v)", err)
} else {
existing, err := configClient.ClusterOperators().Get(*writeStatusName, metav1.GetOptions{})
if k8serrors.IsNotFound(err) {
log.Info("Existing operator status not found, creating")
created, err := configClient.ClusterOperators().Create(&configv1.ClusterOperator{
ObjectMeta: metav1.ObjectMeta{
Name: *writeStatusName,
},
})
if err != nil {
log.Fatalf("ClusterOperator create failed: %v\n", err)

// Sample the sync channel and see whether we're successfully retiring syncs as a
// proxy for "working" (we can't know when we hit level, but we can at least verify
// we are seeing some syncs succeeding). Once we observe at least one successful
// sync we can begin reporting available and level.
select {
case err, ok := <-syncCh:
if !ok {
// syncCh should only close if the Run() loop exits
time.Sleep(5 * time.Second)
log.Fatalf("Status sync channel closed but process did not exit in time")
}
syncs++
if err == nil {
successfulSyncs++
}
// grab any other sync events that have accumulated
for len(syncCh) > 0 {
if err := <-syncCh; err == nil {
successfulSyncs++
}
syncs++
}
// if we haven't yet accumulated enough syncs, wait longer
// TODO: replace these magic numbers with a better measure of syncs across all queueInformers
if successfulSyncs < 5 || syncs < 10 {
log.Printf("Waiting to observe more successful syncs")
return
}
}

created.Status = configv1.ClusterOperatorStatus{
// create the cluster operator in an initial state if it does not exist
existing, err := configClient.ClusterOperators().Get(*writeStatusName, metav1.GetOptions{})
if k8serrors.IsNotFound(err) {
log.Info("Existing operator status not found, creating")
created, createErr := configClient.ClusterOperators().Create(&configv1.ClusterOperator{
ObjectMeta: metav1.ObjectMeta{
Name: *writeStatusName,
},
Status: configv1.ClusterOperatorStatus{
Conditions: []configv1.ClusterOperatorStatusCondition{
configv1.ClusterOperatorStatusCondition{
Type: configv1.OperatorProgressing,
Status: configv1.ConditionFalse,
Message: fmt.Sprintf("Done deploying %s.", olmversion.OLMVersion),
Status: configv1.ConditionTrue,
Message: fmt.Sprintf("Installing %s", olmversion.OLMVersion),
LastTransitionTime: metav1.Now(),
},
configv1.ClusterOperatorStatusCondition{
Type: configv1.OperatorFailing,
Status: configv1.ConditionFalse,
Message: fmt.Sprintf("Done deploying %s.", olmversion.OLMVersion),
LastTransitionTime: metav1.Now(),
},
configv1.ClusterOperatorStatusCondition{
Type: configv1.OperatorAvailable,
Status: configv1.ConditionTrue,
Message: fmt.Sprintf("Done deploying %s.", olmversion.OLMVersion),
Status: configv1.ConditionFalse,
LastTransitionTime: metav1.Now(),
},
},
Versions: []configv1.OperandVersion{{
},
})
if createErr != nil {
log.Errorf("Failed to create cluster operator: %v\n", createErr)
return
}
existing = created
err = nil
}
if err != nil {
log.Errorf("Unable to retrieve cluster operator: %v", err)
return
}

// update the status with the appropriate state
previousStatus := existing.Status.DeepCopy()
switch {
case successfulSyncs > 0:
setOperatorStatusCondition(&existing.Status.Conditions, configv1.ClusterOperatorStatusCondition{
Type: configv1.OperatorFailing,
Status: configv1.ConditionFalse,
})
setOperatorStatusCondition(&existing.Status.Conditions, configv1.ClusterOperatorStatusCondition{
Type: configv1.OperatorProgressing,
Status: configv1.ConditionFalse,
Message: fmt.Sprintf("Deployed %s", olmversion.OLMVersion),
})
setOperatorStatusCondition(&existing.Status.Conditions, configv1.ClusterOperatorStatusCondition{
Type: configv1.OperatorAvailable,
Status: configv1.ConditionTrue,
})
// we set the versions array when all the latest code is deployed and running - in this case,
// the sync method is responsible for guaranteeing that happens before it returns nil
if version := os.Getenv("RELEASE_VERSION"); len(version) > 0 {
existing.Status.Versions = []configv1.OperandVersion{
{
Name: "operator",
Version: olmversion.Full(),
}},
}
_, err = configClient.ClusterOperators().UpdateStatus(created)
if err != nil {
log.Fatalf("ClusterOperator update status failed: %v", err)
Version: version,
},
{
Name: "operator-lifecycle-manager",
Version: olmversion.OLMVersion,
},
}
} else if err != nil {
log.Fatalf("ClusterOperators get failed: %v", err)
} else {
clusteroperatorv1helpers.SetStatusCondition(&existing.Status.Conditions, configv1.ClusterOperatorStatusCondition{
Type: configv1.OperatorProgressing,
Status: configv1.ConditionFalse,
Message: fmt.Sprintf("Done deploying %s.", olmversion.OLMVersion),
LastTransitionTime: metav1.Now(),
})
clusteroperatorv1helpers.SetStatusCondition(&existing.Status.Conditions, configv1.ClusterOperatorStatusCondition{
Type: configv1.OperatorFailing,
Status: configv1.ConditionFalse,
Message: fmt.Sprintf("Done deploying %s.", olmversion.OLMVersion),
LastTransitionTime: metav1.Now(),
})
clusteroperatorv1helpers.SetStatusCondition(&existing.Status.Conditions, configv1.ClusterOperatorStatusCondition{
Type: configv1.OperatorAvailable,
Status: configv1.ConditionTrue,
Message: fmt.Sprintf("Done deploying %s.", olmversion.OLMVersion),
LastTransitionTime: metav1.Now(),
})

olmOperandVersion := configv1.OperandVersion{Name: "operator", Version: olmversion.Full()}
// look for operator version, even though in OLM's case should only be one
for _, item := range existing.Status.Versions {
if item.Name == "operator" && item != olmOperandVersion {
// if a cluster wide upgrade has occurred, hopefully any existing operator statuses have been deleted
log.Infof("Updating version from %v to %v\n", item.Version, olmversion.Full())
}
}
operatorv1helpers.SetOperandVersion(&existing.Status.Versions, olmOperandVersion)
_, err = configClient.ClusterOperators().UpdateStatus(existing)
if err != nil {
log.Fatalf("ClusterOperator update status failed: %v", err)
}
existing.Status.Versions = nil
}
default:
setOperatorStatusCondition(&existing.Status.Conditions, configv1.ClusterOperatorStatusCondition{
Type: configv1.OperatorFailing,
Status: configv1.ConditionTrue,
Message: "Waiting for updates to take effect",
})
setOperatorStatusCondition(&existing.Status.Conditions, configv1.ClusterOperatorStatusCondition{
Type: configv1.OperatorProgressing,
Status: configv1.ConditionFalse,
Message: fmt.Sprintf("Waiting to see update %s succeed", olmversion.OLMVersion),
})
// TODO: use % errors within a window to report available
}

// update the status
if !reflect.DeepEqual(previousStatus, &existing.Status) {
if _, err := configClient.ClusterOperators().UpdateStatus(existing); err != nil {
log.Errorf("Unable to update cluster operator status: %v", err)
}
}

// if we've reported success, we can sleep longer, otherwise we want to keep watching for
// successful
if successfulSyncs > 0 {
time.Sleep(5 * time.Minute)
}

}, 5*time.Second, stopCh)
}

// setOperatorStatusCondition inserts newCondition into *conditions, or, when a
// condition with the same Type is already present, updates that entry in place.
//
// A newly inserted condition is stamped with the current time. For an existing
// condition, Status and LastTransitionTime are only touched when the Status
// value actually changes; Reason and Message are always overwritten with the
// values from newCondition. Any LastTransitionTime supplied on newCondition is
// ignored: the transition time is always taken from the clock so that callers
// that leave the field unset do not reset it to the zero time.
//
// A nil conditions pointer is a no-op, because there is no slice the caller
// could observe the result through.
func setOperatorStatusCondition(conditions *[]configv1.ClusterOperatorStatusCondition, newCondition configv1.ClusterOperatorStatusCondition) {
	if conditions == nil {
		return
	}

	now := metav1.NewTime(time.Now())

	existingCondition := findOperatorStatusCondition(*conditions, newCondition.Type)
	if existingCondition == nil {
		newCondition.LastTransitionTime = now
		*conditions = append(*conditions, newCondition)
		return
	}

	if existingCondition.Status != newCondition.Status {
		existingCondition.Status = newCondition.Status
		// Record when the status flipped; copying the caller's (usually unset)
		// LastTransitionTime here would wipe the timestamp.
		existingCondition.LastTransitionTime = now
	}

	existingCondition.Reason = newCondition.Reason
	existingCondition.Message = newCondition.Message
}

// findOperatorStatusCondition returns a pointer to the first element of
// conditions whose Type equals conditionType, or nil when no element matches.
// The pointer refers to the element inside the given slice, so writes through
// it modify that slice's backing array.
func findOperatorStatusCondition(conditions []configv1.ClusterOperatorStatusCondition, conditionType configv1.ClusterStatusConditionType) *configv1.ClusterOperatorStatusCondition {
	for idx := 0; idx < len(conditions); idx++ {
		candidate := &conditions[idx]
		if candidate.Type != conditionType {
			continue
		}
		return candidate
	}
	return nil
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
-----BEGIN CERTIFICATE-----
MIIDGDCCAgCgAwIBAgIBAjANBgkqhkiG9w0BAQsFADAiMSAwHgYDVQQDDBdsb2Nh
bGhvc3QtY2FAMTU1MjQyMzAyMDAeFw0xOTAzMTIyMDM3MDFaFw0yMDAzMTEyMDM3
MDFaMB8xHTAbBgNVBAMMFGxvY2FsaG9zdEAxNTUyNDIzMDIxMIIBIjANBgkqhkiG
9w0BAQEFAAOCAQ8AMIIBCgKCAQEAubkXRqN2xYxJiVhMjHnOtPCkU44QcLosVpIj
tbUgzjJt0BDv/XNCMhbpD3dfKjMKZiKXt1dKDK2Tl52AceWqipVQlCf7kiX+CjuO
gTAIEbVC7FWdu/sDI8BWbhs5knT+8Y7a5uGVexclZifvcbASuVtedLH47XI25Ak4
s103Usy5Z2WXOLd79w/tsAr1kvQzveIdbn+upMu4to2wmfXhiLaU2qMhGoz+2hzm
z+SXkB7uCgFbGuLIUj99/faSZ3CAH6EwPIerAKtY+1hdVmsjqpIrSs4jD7YyfmVN
3+/MLTSMyHrghHYKt/SiRdCuVrbMhCylU8NFry+iuBIsOA202QIDAQABo1wwWjAO
BgNVHQ8BAf8EBAMCBaAwEwYDVR0lBAwwCgYIKwYBBQUHAwEwDAYDVR0TAQH/BAIw
ADAlBgNVHREEHjAcgglsb2NhbGhvc3SCCWxvY2FsaG9zdIcEfwAAATANBgkqhkiG
9w0BAQsFAAOCAQEAacr9G8nNsHQpLCW+0meGmDz9deTfLYldFCbCjsPiUDWs9tUn
O+04ykac2tEqZt2Ovkp6gntRPBCOKpgwHYvo0CJtCaL4yh6wYMvlbjHmHR/y+Ioy
HymMmaQ06iVIhb2KoKFJvFtFUVNg6QE9w7dm9/C73eHcv3JhqYhGw3qBfUI6lmIc
lWGj6WGVNfslofTYMkshbRGNZ3gFGkvcQvPOhKb/K4A3X9ZTGy9XyydVAOpdk/5n
FBD4gOJJVSq2jJ5SOTJd5Z/YrY2tbCfZeuuPuxBK4XG3hnLN2fk9URwfCDc9EUQg
aYagxskTB6jaDkFD5lfXxEc3W+/mP62i7mH/fQ==
-----END CERTIFICATE-----
-----BEGIN CERTIFICATE-----
MIIC4jCCAcqgAwIBAgIBATANBgkqhkiG9w0BAQsFADAiMSAwHgYDVQQDDBdsb2Nh
bGhvc3QtY2FAMTU1MjQyMzAyMDAeFw0xOTAzMTIyMDM3MDBaFw0yMDAzMTEyMDM3
MDBaMCIxIDAeBgNVBAMMF2xvY2FsaG9zdC1jYUAxNTUyNDIzMDIwMIIBIjANBgkq
hkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAnQb0E1iZ/R1J8bdzDP/EFx73JpU6fw6T
aTY9QTWWgt4EcamLpJK5Z+dOLhj/i6rQbe/vKpI6BbBo+S6MuBemyUbc4VpoTde6
Hn26uWSlkQA72GLHYWvD+ahdRpLxOFddog9xcfEoYN/rlpwMp030y6clQhrb4WML
x1uQzqyOvzRHAN4NqxmLXbepTyWqiM3tLe2f4mPfcg/vhwQ5TSqR/Rm3FPh3rDdA
zvk9bGkvyX8iAUoLw/0aHe2dzTfnvBvkTJFEaLq61FLQ/zfMVRhPI2Fwljxq+jSq
FoYju/vr1sWxKc+AFxDdAZdRey2Afi1bVf8JHiDU8FSe9UcfqBUoyQIDAQABoyMw
ITAOBgNVHQ8BAf8EBAMCAqQwDwYDVR0TAQH/BAUwAwEB/zANBgkqhkiG9w0BAQsF
AAOCAQEAmrIS4kJNVjKj4vSj0lNWzOjk31CI26rKwPo+cFhvnPh+eg6wI+3I/gLC
yf9X5KIPaNS5MGzNEmpr7Ml7IviqUn8rSoVryoQwKtqnMhsGr3/Y/Rrd27OIYEW+
6/phRyI2rM8Vzo0RVdqcQT+6qvknbZ4fr/3Or3YbjycyfqNeL0SzXff+c8s9skDw
r9OV5uMvmVJv3VNBhAEX83I4zJsfrH9XtAmz255aw24vBGMUHYEdH15K/IBxh4LZ
Y5AXZhVazjlzwWwnUpu8k88vesCUay8c4VtXfXHQTk/oS/ZDn7eQ7hTvzqYfEH2k
znJYRthnuUZo6M/rtMWzXK6QuunRtg==
-----END CERTIFICATE-----
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
-----BEGIN RSA PRIVATE KEY-----
MIIEowIBAAKCAQEAubkXRqN2xYxJiVhMjHnOtPCkU44QcLosVpIjtbUgzjJt0BDv
/XNCMhbpD3dfKjMKZiKXt1dKDK2Tl52AceWqipVQlCf7kiX+CjuOgTAIEbVC7FWd
u/sDI8BWbhs5knT+8Y7a5uGVexclZifvcbASuVtedLH47XI25Ak4s103Usy5Z2WX
OLd79w/tsAr1kvQzveIdbn+upMu4to2wmfXhiLaU2qMhGoz+2hzmz+SXkB7uCgFb
GuLIUj99/faSZ3CAH6EwPIerAKtY+1hdVmsjqpIrSs4jD7YyfmVN3+/MLTSMyHrg
hHYKt/SiRdCuVrbMhCylU8NFry+iuBIsOA202QIDAQABAoIBAEqc4o39c+TvdEea
Ur6I3RNyLgJna5FuKgvpkDEbAH/2YImblF7VZD2tWJpfEbtpX/8iXKNKjTREs6vQ
md6oLviX/hRXb8kKPGIuBRU/j65VjPpXdxQjRuKhDdgUVe/R0u6GvsjMzfnylZLR
7m9VFmCjJXJqYaA7J3Q7hC0DAQvhBiWk0lZHR7cjGeG37fIT2yzH7gf8M4VeYjCn
asatNUuAOORVfGudtKLCgFk/bmO1Nb5UwCYcz4OXVEpDBWrcg1SsvYwKxyUDxO8a
8A7TAWWEXjWK+sPmaJkUzRfnd/1chvlzcaawXfgfXRHcAaLWRaBu4fdYS7fwMYy6
+/0Pa5ECgYEA0GCkaAl7qicfHTY6xTkBvJwkXu/rDIfzJCRVtdlXOhPmJ+F3+0Rj
0d+O6LMNSyJpYdOYeWOJbjHMJ92XIRJVxqF+K2O6dToEMTG2XbqMm2gtyn16BoTt
ngzcWqeo+zqwvHxLcM6L/tjivnbsI7mVDpdcBJZwVd6VwrR2NgRh0tUCgYEA5CsF
rJUlOR3JJ1CUTrT1G4smBES00lL3QFlhkiF4zWOW6NwhswZlYPkzqe6tgxmtGAuQ
mJINMcqWUkU18BWLh8RRTH+oKcUbmZkTqP9k/bqe6foIm8UyxVsSF80S4tRtMcWm
87Nd2h+FbYY2MP9RFscdDDd5FHf+weSCbnn0s/UCgYEAz05WQeqtTSp+meFJtsxw
HeR5irnFbkIScvJzEueXEACcCTEW3LO9Wx6+XmND5mvly51nI90S7L4+Das2n4BO
Nb6UdzZQWi/N2+NJOxZMrI+Ifts2eyXkAElrMAV85/QLwHkn1KKoRHIhortNUn1e
/ZU3xpikScmX1I0UzciuScECgYAbWrEOdL8GrvR7uyRcn0M3byI6psYK5RlxZIXX
EB48eXERL7r2jJDA5H92IwA4VG61EEXglLnyOzh0WonR47NbroSUqEVP5KqfaoO5
4gyIgsQkhu5bRnQExxtPMS3Pdeo1al3On7Vjvh2v+MQscZ+WHH72BPyGILCxLCUa
+5IDtQKBgGE1Wl2dmdAyedzCX93oOjnVQ2xdH4s+4k7yHBYEt9AIzbuZCSZLMsf+
hDoU/TokDRXkrHnRvZvhpljgjRJULktnmZxRWW8e/YXrp+gTvSq7/bZCob8Dgs80
w21YuIgo6sXV2uvqGUbZ3YvJQU0GnoFB/GztGlmuVyU0jpsJKq5P
-----END RSA PRIVATE KEY-----
2 changes: 2 additions & 0 deletions manifests/0000_50_olm_06-olm-operator.deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ spec:
path: /healthz
port: 8080
env:
- name: RELEASE_VERSION
value: "0.0.1-snapshot"
- name: OPERATOR_NAMESPACE
valueFrom:
fieldRef:
Expand Down
4 changes: 4 additions & 0 deletions manifests/0000_50_olm_14-operatorstatus.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,7 @@ apiVersion: config.openshift.io/v1
kind: ClusterOperator
metadata:
name: operator-lifecycle-manager
status:
versions:
- name: operator
version: "0.0.1-snapshot"
Loading

0 comments on commit 1c10730

Please sign in to comment.