Skip to content

Commit

Permalink
ROX-22549: Add an addon metric to track version mismatch (#1868)
Browse files Browse the repository at this point in the history
  • Loading branch information
kovayur authored Jun 7, 2024
1 parent 87fc497 commit 3ba01f2
Show file tree
Hide file tree
Showing 2 changed files with 76 additions and 1 deletion.
17 changes: 17 additions & 0 deletions internal/dinosaur/pkg/services/addon.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (
"github.com/stackrox/acs-fleet-manager/pkg/client/ocm"
ocmImpl "github.com/stackrox/acs-fleet-manager/pkg/client/ocm/impl"
"github.com/stackrox/acs-fleet-manager/pkg/features"
"github.com/stackrox/acs-fleet-manager/pkg/metrics"
"github.com/stackrox/acs-fleet-manager/pkg/shared"
"golang.org/x/exp/maps"
)
Expand Down Expand Up @@ -115,9 +116,11 @@ func (p *AddonProvisioner) Provision(cluster api.Cluster, expectedConfigs []gito
}
if clusterInstallationDifferent(installedOnCluster, versionInstalledInOCM) {
multiErr = multierror.Append(multiErr, p.updateAddon(clusterID, expectedConfig))
updateAddonStatusMetric(clusterID, installedOnCluster, versionInstalledInOCM, metrics.AddonUpgrade)
} else {
glog.V(10).Infof("Addon %s is already up-to-date", installedOnCluster.ID)
multiErr = validateUpToDateAddon(multiErr, installedInOCM, installedOnCluster)
updateAddonStatusMetric(clusterID, installedOnCluster, versionInstalledInOCM, metrics.AddonUp)
}
}

Expand All @@ -129,6 +132,20 @@ func (p *AddonProvisioner) Provision(cluster api.Cluster, expectedConfigs []gito
return errorOrNil(multiErr)
}

// updateAddonStatusMetric publishes the given status for one addon on the
// cluster identified by clusterID. The sample is labelled with the version,
// source image and package image read from versionInstalledInOCM alongside the
// same three attributes read from installedOnCluster.
func updateAddonStatusMetric(clusterID string, installedOnCluster dbapi.AddonInstallation, versionInstalledInOCM *clustersmgmtv1.AddOnVersion, status metrics.AddonStatus) {
	addonID := installedOnCluster.ID

	// Attributes of the addon version as recorded in OCM.
	ocmVersion := versionInstalledInOCM.ID()
	ocmSourceImage := versionInstalledInOCM.SourceImage()
	ocmPackageImage := versionInstalledInOCM.PackageImage()

	// Attributes carried by the cluster-side installation record.
	clusterVersion := installedOnCluster.Version
	clusterSourceImage := installedOnCluster.SourceImage
	clusterPackageImage := installedOnCluster.PackageImage

	metrics.UpdateClusterAddonStatusMetric(
		addonID, clusterID,
		ocmVersion, ocmSourceImage, ocmPackageImage,
		clusterVersion, clusterSourceImage, clusterPackageImage,
		status,
	)
}

func validateUpToDateAddon(multiErr *multierror.Error, ocmInstallation *clustersmgmtv1.AddOnInstallation, dataPlaneInstallation dbapi.AddonInstallation) *multierror.Error {
if ocmInstallation.State() == clustersmgmtv1.AddOnInstallationStateFailed {
// addon is already up-to-date with gitops config and still failed
Expand Down
60 changes: 59 additions & 1 deletion pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ import (
"time"

constants2 "github.com/stackrox/acs-fleet-manager/internal/dinosaur/constants"

"github.com/stackrox/acs-fleet-manager/pkg/api"

"github.com/prometheus/client_golang/prometheus"
Expand Down Expand Up @@ -74,6 +73,9 @@ const (
// ClusterStatusCapacityUsed - metric name for the current number of instances
ClusterStatusCapacityUsed = "cluster_status_capacity_used"

// ClusterAddonStatusMetric - metric name for the cluster addon status represented by AddonStatus
ClusterAddonStatusMetric = "cluster_addon_status"

// GitopsConfigProviderErrorCount - metric name for the number of errors encountered while fetching GitOps config
GitopsConfigProviderErrorCount = "gitops_config_provider_error_count"

Expand All @@ -85,6 +87,13 @@ const (
LabelDatabaseQueryType = "query"
LabelRegion = "region"
LabelInstanceType = "instance_type"

labelAddonOCMVersion = "ocm_version"
labelAddonOCMSourceImage = "ocm_source_image"
labelAddonOCMPackageImage = "ocm_package_image"
labelAddonClusterVersion = "cluster_version"
labelAddonClusterSourceImage = "cluster_source_image"
labelAddonClusterPackageImage = "cluster_package_image"
)

// JobType metric to capture
Expand Down Expand Up @@ -171,6 +180,17 @@ var clusterStatusCapacityLabels = []string{
LabelClusterID,
}

// clusterAddonStatusLabels lists the label names of the cluster addon status
// gauge: the addon and cluster identifiers, followed by the version, source
// image and package image labels for the OCM side and for the cluster side.
var clusterAddonStatusLabels = []string{
LabelID,
LabelClusterID,
labelAddonOCMVersion,
labelAddonOCMSourceImage,
labelAddonOCMPackageImage,
labelAddonClusterVersion,
labelAddonClusterSourceImage,
labelAddonClusterPackageImage,
}

// #### Metrics for Dataplane clusters - Start ####
// create a new histogramVec for cluster creation duration
var requestClusterCreationDurationMetric = prometheus.NewHistogramVec(
Expand Down Expand Up @@ -707,6 +727,42 @@ func init() {
GitopsConfigProviderErrorCounter.WithLabelValues().Add(0)
}

// AddonStatus represents the status of the addon installation on a Data Plane cluster.
// Its integer value is what UpdateClusterAddonStatusMetric stores in the gauge.
type AddonStatus int

const (
// AddonUp (value 0) means the addon is up and running.
AddonUp AddonStatus = iota
// AddonUpgrade (value 1) means the addon is upgrading.
AddonUpgrade
)

// clusterAddonStatusMetric is the GaugeVec that exposes the addon status per
// addon and cluster. The sample value is the numeric AddonStatus written by
// UpdateClusterAddonStatusMetric (0 for AddonUp, 1 for AddonUpgrade); it is not
// a timestamp or a duration, which is why the help text spells out the encoding.
var clusterAddonStatusMetric = prometheus.NewGaugeVec(
	prometheus.GaugeOpts{
		Subsystem: FleetManager,
		Name:      ClusterAddonStatusMetric,
		Help:      "status of the addon installation on a data plane cluster: 0 - the addon is up and running, 1 - the addon is upgrading",
	},
	clusterAddonStatusLabels,
)

// UpdateClusterAddonStatusMetric sets the cluster addon status gauge for the
// addon addonID on the cluster clusterID to the numeric value of status.
//
// The ocm* and cluster* arguments become label values of the sample. Because
// these values change whenever a version or image changes, every such change
// yields a different label set. Series recorded earlier for the same addon and
// cluster are therefore dropped before the current one is written; otherwise a
// series with outdated labels would keep exporting its last value (for example
// AddonUpgrade) long after the upgrade has finished.
func UpdateClusterAddonStatusMetric(addonID, clusterID, ocmVersion, ocmSourceImage, ocmPackageImage,
	clusterVersion, clusterSourceImage, clusterPackageImage string, status AddonStatus) {
	// Remove every series of this addon on this cluster, whatever its
	// version/image labels were.
	clusterAddonStatusMetric.DeletePartialMatch(prometheus.Labels{
		LabelID:        addonID,
		LabelClusterID: clusterID,
	})

	labels := prometheus.Labels{
		LabelID:                       addonID,
		LabelClusterID:                clusterID,
		labelAddonOCMVersion:          ocmVersion,
		labelAddonOCMSourceImage:      ocmSourceImage,
		labelAddonOCMPackageImage:     ocmPackageImage,
		labelAddonClusterVersion:      clusterVersion,
		labelAddonClusterSourceImage:  clusterSourceImage,
		labelAddonClusterPackageImage: clusterPackageImage,
	}
	clusterAddonStatusMetric.With(labels).Set(float64(status))
}

// UpdateDatabaseQueryDurationMetric Update the observatorium request duration metric with the following labels:
// - status: (i.e. "success" or "failure")
// - queryType: (i.e. "SELECT", "UPDATE", "INSERT", "DELETE")
Expand All @@ -731,6 +787,7 @@ func init() {
prometheus.MustRegister(centralPerClusterCountMetric)
prometheus.MustRegister(clusterStatusCapacityMaxMetric)
prometheus.MustRegister(clusterStatusCapacityUsedMetric)
prometheus.MustRegister(clusterAddonStatusMetric)
prometheus.MustRegister(GitopsConfigProviderErrorCounter)

// metrics for Centrals
Expand Down Expand Up @@ -801,6 +858,7 @@ func Reset() {
centralPerClusterCountMetric.Reset()
clusterStatusCapacityMaxMetric.Reset()
clusterStatusCapacityUsedMetric.Reset()
clusterAddonStatusMetric.Reset()
GitopsConfigProviderErrorCounter.Reset()

requestCentralCreationDurationMetric.Reset()
Expand Down

0 comments on commit 3ba01f2

Please sign in to comment.