From 756f500ba441f0d2e32927ca372c9a6387dddf38 Mon Sep 17 00:00:00 2001 From: Suhani Date: Fri, 13 Jan 2023 18:28:09 -0500 Subject: [PATCH] Addressing PR comments Most of the changes are as per the PR comments. Additional changes: - Rename files and package names as per Golang naming conventions - Add Hive Operator metrics in documentation --- ...durationMetrics.go => duration_metrics.go} | 0 .../{metricsConfig.go => metrics_config.go} | 3 +- apis/hive/v1/zz_generated.deepcopy.go | 32 +++++++++ .../crds/hive.openshift.io_hiveconfigs.yaml | 3 +- docs/{hive-metrics.md => hive_metrics.md} | 34 ++++++---- hack/app-sre/saas-template.yaml | 4 +- .../clusterdeployment_controller.go | 13 +--- .../clusterdeployment_controller_test.go | 6 +- .../clusterdeployment/clusterinstalls.go | 4 +- .../clusterdeployment/clusterprovisions.go | 4 +- pkg/controller/clusterdeployment/metrics.go | 47 +++++++------ .../clusterprovision_controller.go | 17 ++--- .../clusterprovision_controller_test.go | 6 +- pkg/controller/clusterprovision/metrics.go | 68 +++++++++++-------- ...bels.go => metrics_with_dynamic_labels.go} | 43 +++++++----- ...durationMetrics.go => duration_metrics.go} | 0 .../{metricsConfig.go => metrics_config.go} | 3 +- .../apis/hive/v1/zz_generated.deepcopy.go | 32 +++++++++ 18 files changed, 203 insertions(+), 116 deletions(-) rename apis/hive/v1/metricsconfig/{durationMetrics.go => duration_metrics.go} (100%) rename apis/hive/v1/metricsconfig/{metricsConfig.go => metrics_config.go} (88%) rename docs/{hive-metrics.md => hive_metrics.md} (81%) rename pkg/controller/metrics/{metricsWithDynamicLabels.go => metrics_with_dynamic_labels.go} (63%) rename vendor/github.com/openshift/hive/apis/hive/v1/metricsconfig/{durationMetrics.go => duration_metrics.go} (100%) rename vendor/github.com/openshift/hive/apis/hive/v1/metricsconfig/{metricsConfig.go => metrics_config.go} (88%) diff --git a/apis/hive/v1/metricsconfig/durationMetrics.go b/apis/hive/v1/metricsconfig/duration_metrics.go similarity index 100% rename from apis/hive/v1/metricsconfig/durationMetrics.go rename to apis/hive/v1/metricsconfig/duration_metrics.go diff --git a/apis/hive/v1/metricsconfig/metricsConfig.go b/apis/hive/v1/metricsconfig/metrics_config.go similarity index 88% rename from apis/hive/v1/metricsconfig/metricsConfig.go rename to apis/hive/v1/metricsconfig/metrics_config.go index 05e0aea12b5..2cbb68a4eee 100644 --- a/apis/hive/v1/metricsconfig/metricsConfig.go +++ b/apis/hive/v1/metricsconfig/metrics_config.go @@ -13,7 +13,8 @@ type MetricsConfig struct { // label -- e.g. "4". // NOTE: Avoid ClusterDeployment labels whose values are unbounded, such as those representing cluster names or IDs, // as these will cause your prometheus database to grow indefinitely. - // Check docs\hive-metrics.md for a list of affected metrics + // Affected metrics are those whose type implements the metricsWithDyanamicLabels interface found in + // pkg/controller/metrics/metrics_with_dynamic_labels.go // +optional AdditionalClusterDeploymentLabels map[string]string `json:"additionalClusterDeploymentLabels"` } diff --git a/apis/hive/v1/zz_generated.deepcopy.go b/apis/hive/v1/zz_generated.deepcopy.go index cae09367f72..384184f1153 100644 --- a/apis/hive/v1/zz_generated.deepcopy.go +++ b/apis/hive/v1/zz_generated.deepcopy.go @@ -2254,6 +2254,27 @@ func (in *DNSZoneStatus) DeepCopy() *DNSZoneStatus { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *DeploymentConfig) DeepCopyInto(out *DeploymentConfig) { + *out = *in + if in.Resources != nil { + in, out := &in.Resources, &out.Resources + *out = new(corev1.ResourceRequirements) + (*in).DeepCopyInto(*out) + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DeploymentConfig. +func (in *DeploymentConfig) DeepCopy() *DeploymentConfig { + if in == nil { + return nil + } + out := new(DeploymentConfig) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *FailedProvisionAWSConfig) DeepCopyInto(out *FailedProvisionAWSConfig) { *out = *in @@ -2541,6 +2562,17 @@ func (in *HiveConfigSpec) DeepCopyInto(out *HiveConfigSpec) { *out = new(ControllersConfig) (*in).DeepCopyInto(*out) } + if in.DeploymentConfig != nil { + in, out := &in.DeploymentConfig, &out.DeploymentConfig + *out = new([]DeploymentConfig) + if **in != nil { + in, out := *in, *out + *out = make([]DeploymentConfig, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + } if in.AWSPrivateLink != nil { in, out := &in.AWSPrivateLink, &out.AWSPrivateLink *out = new(AWSPrivateLinkConfig) diff --git a/config/crds/hive.openshift.io_hiveconfigs.yaml b/config/crds/hive.openshift.io_hiveconfigs.yaml index ac295c967aa..3f78b102c47 100644 --- a/config/crds/hive.openshift.io_hiveconfigs.yaml +++ b/config/crds/hive.openshift.io_hiveconfigs.yaml @@ -627,7 +627,8 @@ spec: -- e.g. "4". NOTE: Avoid ClusterDeployment labels whose values are unbounded, such as those representing cluster names or IDs, as these will cause your prometheus database to grow indefinitely. - Check docs\hive-metrics.md for a list of affected metrics' + Affected metrics are those whose type implements the metricsWithDyanamicLabels + interface found in pkg/controller/metrics/metrics_with_dynamic_labels.go' type: object metricsWithDuration: description: Optional metrics and their configurations diff --git a/docs/hive-metrics.md b/docs/hive_metrics.md similarity index 81% rename from docs/hive-metrics.md rename to docs/hive_metrics.md index b398b5561f6..349a0e79827 100644 --- a/docs/hive-metrics.md +++ b/docs/hive_metrics.md @@ -4,8 +4,10 @@ - [Hive Metrics](#hive-metrics) - [Optional Metrics](#optional-metrics) - - [Metrics with Optional Cluster Deployment labels](#metrics-with-optional-cluster-deployment-labels) + - [Duration-based Metrics](#duration-based-metrics) + - [Metrics with Optional Cluster Deployment labels](#metrics-with-optional-cluster-deployment-labels) - [List of all Hive metrics](#list-of-all-hive-metrics) + - [Hive Operator metrics](#hive-operator-metrics) - [ClusterDeployment controller metrics](#clusterdeployment-controller-metrics) - [ClusterProvision controller metrics](#clusterprovision-controller-metrics) - [ClusterPool controller metrics](#clusterpool-controller-metrics) @@ -15,19 +17,19 @@ # Hive Metrics -Hive publishes metrics, which can help admins to monitor the Hive operations. Most of these metrics are always published, few are [optional](#metrics-with-optional-cluster-deployment-labels). - -### Exporting Hive Metrics -`HiveConfig.Spec.ExportMetrics` specifies whether the operator should enable metrics for hive controllers to be extracted for the prometheus instance running on the cluster. When set to true, `openshift.io/cluster-monitoring` label is applied and the operator deploys ServiceMonitors so that the prometheus instances can extract metrics. 
The operator also sets up RBAC in the TargetNamespace so that openshift prometheus in the cluster can list/access objects required to pull metrics.
+Hive publishes metrics, which can help admins monitor Hive operations. Most of these metrics are always published; a few are [optional](#optional-metrics).
 
 ### Optional Metrics
-Some metrics pertaining to Hibernation (or PowerState) controller, are not logged by default, since that feature is not supported across all clouds.
-Admins can opt for logging such metrics via `HiveConfig.Spec.MetricsConfig`
+#### Duration-based Metrics
+
+Some metrics that log duration are not logged by default, since that feature is not supported across all clouds.
+Admins can opt into logging such metrics via `HiveConfig.Spec.MetricsConfig.MetricsWithDuration`.
+Check the metrics labelled as `Optional` in the [list below](#list-of-all-hive-metrics) to see affected metrics.
 
-### Metrics with Optional Cluster Deployment labels
+#### Metrics with Optional Cluster Deployment labels
 
-Not all metrics are logged per cluster deployment name or namespace. Many metrics are logged per cluster-type, and hive allows for admins to define the labels to look for when reporting these metrics.
+Most metrics are not observed per cluster deployment name or namespace, so Hive allows admins to define the labels to look for when reporting these metrics.
 Opt into them via `HiveConfig.Spec.MetricsConfig.AdditionalClusterDeploymentLabels`, which accepts a map of the label you would want on your metrics, to the label found on ClusterDeployment.
 For example, including `{"ocp_major_version": "hive.openshift.io/version-major"}` will cause affected metrics to include a label key ocp_major_version with the value from the `hive.openshift.io/version-major` ClusterDeployment label
 -- e.g. "4".
@@ -36,8 +38,16 @@ Note: It is upto the cluster admins to be mindful of cardinality and ensure thes
 
 ### List of all Hive metrics
 
+#### Hive Operator metrics
+These metrics are observed by the Hive Operator. None of these are optional.
+
+| Metric Name | Optional Label Support |
+|:-------------------------------:|:----------------------:|
+| hive_hiveconfig_conditions | N |
+| hive_operator_reconcile_seconds | N |
+
 #### ClusterDeployment controller metrics
-These metrics are logged per ClusterDeployment instance. None of these are optional.
+These metrics are observed while processing ClusterDeployments. None of these are optional.
 
 | Metric Name | Optional Label Support |
 |:--------------------------------------------------------:|:----------------------:|
@@ -52,7 +62,7 @@ These metrics are logged per ClusterDeployment instance. None of these are optio
 | hive_cluster_deployments_provision_failed_terminal_total | Y |
 
 #### ClusterProvision controller metrics
-These metrics are logged per ClusterProvision instance. If they have optional label support, ClusterDeployment will apply those labels to ClusterProvision. None of these are optional.
+These metrics are observed while processing ClusterProvisions. None of these are optional.
 
 | Metric Name | Optional Label Support |
 |:---------------------------------------------:|:----------------------:|
@@ -62,7 +72,7 @@ These metrics are logged per ClusterProvision instance. If they have optional la
 | hive_cluster_deployment_install_success_total | Y |
 
 #### ClusterPool controller metrics
-These metrics are logged per ClusterPool instance. None of these are optional.
+These metrics are observed while processing ClusterPools. None of these are optional.
| Metric Name | Optional Label Support | |:-------------------------------------------------:|:----------------------:| diff --git a/hack/app-sre/saas-template.yaml b/hack/app-sre/saas-template.yaml index 9cfdc4a043b..fc6e72221cc 100644 --- a/hack/app-sre/saas-template.yaml +++ b/hack/app-sre/saas-template.yaml @@ -4464,8 +4464,8 @@ objects: -- e.g. "4". NOTE: Avoid ClusterDeployment labels whose values are unbounded, such as those representing cluster names or IDs, as these will cause your prometheus database to grow - indefinitely. Check docs\hive-metrics.md for a list of affected - metrics' + indefinitely. Affected metrics are those whose type implements + the metricsWithDyanamicLabels interface found in pkg/controller/metrics/metrics_with_dynamic_labels.go' type: object metricsWithDuration: description: Optional metrics and their configurations diff --git a/pkg/controller/clusterdeployment/clusterdeployment_controller.go b/pkg/controller/clusterdeployment/clusterdeployment_controller.go index 1009ad281ce..b81f015b8d5 100644 --- a/pkg/controller/clusterdeployment/clusterdeployment_controller.go +++ b/pkg/controller/clusterdeployment/clusterdeployment_controller.go @@ -77,11 +77,6 @@ var ( // ImageSet condition initialized by cluster deployment hivev1.InstallerImageResolutionFailedCondition, } - - // mapClusterTypeLabelToValue is a map that corresponds to MetricsWithClusterTypeLabels provided in metrics config. - // The key in the map is the name of the label used in defining the metric, and the value would be the cluster - // deployment label to look for while reporting the metric - mapClusterTypeLabelToValue map[string]string ) const ( @@ -128,11 +123,9 @@ func Add(mgr manager.Manager) error { log.WithError(err).Error("error reading metrics config") return err } - mapClusterTypeLabelToValue = make(map[string]string) - mapClusterTypeLabelToValue = hivemetrics.GetOptionalClusterTypeLabels(mConfig) // Register the metrics. This is done here to ensure we define the metrics with optional label support after we have // read the hiveconfig, and we register them only once. 
- registerMetrics(mapClusterTypeLabelToValue) + registerMetrics(mConfig) return AddToManager(mgr, NewReconciler(mgr, logger, clientRateLimiter), concurrentReconciles, queueRateLimiter) } @@ -588,7 +581,7 @@ func (r *ReconcileClusterDeployment) reconcile(request reconcile.Request, cd *hi cdLog.WithError(err).Log(controllerutils.LogLevel(err), "error adding finalizer") return reconcile.Result{}, err } - metricClustersCreated.LogMetricWithDynamicLabels(cd, cdLog, 1) + metricClustersCreated.ObserveMetricWithDynamicLabels(cd, cdLog, 1) return reconcile.Result{}, nil } @@ -1444,7 +1437,7 @@ func (r *ReconcileClusterDeployment) removeClusterDeploymentFinalizer(cd *hivev1 hivemetrics.ClearClusterSyncFailingSecondsMetric(cd.Namespace, cd.Name, cdLog) // Increment the clusters deleted counter: - metricClustersDeleted.LogMetricWithDynamicLabels(cd, cdLog, 1) + metricClustersDeleted.ObserveMetricWithDynamicLabels(cd, cdLog, 1) return nil } diff --git a/pkg/controller/clusterdeployment/clusterdeployment_controller_test.go b/pkg/controller/clusterdeployment/clusterdeployment_controller_test.go index b0f97bc5f91..23fd3b62747 100644 --- a/pkg/controller/clusterdeployment/clusterdeployment_controller_test.go +++ b/pkg/controller/clusterdeployment/clusterdeployment_controller_test.go @@ -44,6 +44,7 @@ import ( hivev1aws "github.com/openshift/hive/apis/hive/v1/aws" "github.com/openshift/hive/apis/hive/v1/azure" "github.com/openshift/hive/apis/hive/v1/baremetal" + "github.com/openshift/hive/apis/hive/v1/metricsconfig" hiveintv1alpha1 "github.com/openshift/hive/apis/hiveinternal/v1alpha1" "github.com/openshift/hive/pkg/constants" controllerutils "github.com/openshift/hive/pkg/controller/utils" @@ -94,8 +95,9 @@ current-context: admin func init() { log.SetLevel(log.DebugLevel) - // Register the metrics to avoid panics during testing - registerMetrics(mapClusterTypeLabelToValue) + // While the metrics need not be registered for this test suite, they still need to be defined to avoid panics + // during the tests + registerMetrics(&metricsconfig.MetricsConfig{}) } func fakeReadFile(content string) func(string) ([]byte, error) { diff --git a/pkg/controller/clusterdeployment/clusterinstalls.go b/pkg/controller/clusterdeployment/clusterinstalls.go index 0536d6c73fb..f2a0d98753c 100644 --- a/pkg/controller/clusterdeployment/clusterinstalls.go +++ b/pkg/controller/clusterdeployment/clusterinstalls.go @@ -211,9 +211,9 @@ func (r *ReconcileClusterDeployment) reconcileExistingInstallingClusterInstall(c logger.WithField("duration", installDuration.Seconds()).Debug("install job completed") metricInstallJobDuration.Observe(float64(installDuration.Seconds())) - metricCompletedInstallJobRestarts.LogMetricWithDynamicLabels(cd, logger, float64(cd.Status.InstallRestarts)) + metricCompletedInstallJobRestarts.ObserveMetricWithDynamicLabels(cd, logger, float64(cd.Status.InstallRestarts)) - metricClustersInstalled.LogMetricWithDynamicLabels(cd, logger, 1) + metricClustersInstalled.ObserveMetricWithDynamicLabels(cd, logger, 1) if r.protectedDelete { // Set protected delete on for the ClusterDeployment. 
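[Illustrative note, not part of the patch.] The hunks above switch the call sites from `LogMetricWithDynamicLabels` to `ObserveMetricWithDynamicLabels` and pass the whole `*metricsconfig.MetricsConfig` into `registerMetrics`. Below is a minimal sketch of what a controller-side definition and observation could look like after this refactor; it assumes the `CounterVecWithDynamicLabels`/`DynamicLabels` types and the `GetOptionalClusterTypeLabels`/`GetSortedKeys` helpers introduced later in this patch, and the metric name `hive_example_clusters_created_total` plus the `registerExampleMetrics`/`observeExampleClusterCreated` functions are hypothetical.

```go
// Sketch only: illustrates the intended usage of the dynamic-label metric types
// introduced by this patch. Names marked "example" are hypothetical.
package example

import (
	"github.com/prometheus/client_golang/prometheus"
	log "github.com/sirupsen/logrus"
	"sigs.k8s.io/controller-runtime/pkg/metrics"

	hivev1 "github.com/openshift/hive/apis/hive/v1"
	"github.com/openshift/hive/apis/hive/v1/metricsconfig"
	hivemetrics "github.com/openshift/hive/pkg/controller/metrics"
)

var metricExampleClustersCreated hivemetrics.CounterVecWithDynamicLabels

// registerExampleMetrics mirrors the registerMetrics(mConfig) pattern above:
// derive the optional label names from the metrics config read out of HiveConfig,
// define the vector with those names, and keep the label-to-ClusterDeployment-label
// mapping next to the vector so values can be resolved per observation.
func registerExampleMetrics(mConfig *metricsconfig.MetricsConfig) {
	optionalLabels := hivemetrics.GetOptionalClusterTypeLabels(mConfig)
	metricExampleClustersCreated = hivemetrics.CounterVecWithDynamicLabels{
		CounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{
			Name: "hive_example_clusters_created_total", // hypothetical metric name
			Help: "Example counter carrying the admin-configured ClusterDeployment labels.",
		}, hivemetrics.GetSortedKeys(optionalLabels)),
		DynamicLabels: &hivemetrics.DynamicLabels{
			FixedLabels:    map[string]string{},
			OptionalLabels: optionalLabels,
		},
	}
	metrics.Registry.MustRegister(metricExampleClustersCreated.CounterVec)
}

// observeExampleClusterCreated matches the call sites above: the ClusterDeployment
// supplies the optional label values, and the final argument is ignored for counters.
func observeExampleClusterCreated(cd *hivev1.ClusterDeployment, logger log.FieldLogger) {
	metricExampleClustersCreated.ObserveMetricWithDynamicLabels(cd, logger, 1)
}
```

With `HiveConfig.Spec.MetricsConfig.AdditionalClusterDeploymentLabels` set to `{"ocp_major_version": "hive.openshift.io/version-major"}`, the counter above would carry an `ocp_major_version` label whose value is resolved from each ClusterDeployment's labels.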
diff --git a/pkg/controller/clusterdeployment/clusterprovisions.go b/pkg/controller/clusterdeployment/clusterprovisions.go index 20d6cb1147f..9d826b4e018 100644 --- a/pkg/controller/clusterdeployment/clusterprovisions.go +++ b/pkg/controller/clusterdeployment/clusterprovisions.go @@ -512,9 +512,9 @@ func (r *ReconcileClusterDeployment) reconcileCompletedProvision(cd *hivev1.Clus metricInstallJobDuration.Observe(float64(jobDuration.Seconds())) // Report a metric for the total number of install restarts: - metricCompletedInstallJobRestarts.LogMetricWithDynamicLabels(cd, cdLog, float64(cd.Status.InstallRestarts)) + metricCompletedInstallJobRestarts.ObserveMetricWithDynamicLabels(cd, cdLog, float64(cd.Status.InstallRestarts)) - metricClustersInstalled.LogMetricWithDynamicLabels(cd, cdLog, 1) + metricClustersInstalled.ObserveMetricWithDynamicLabels(cd, cdLog, 1) return reconcile.Result{}, nil } diff --git a/pkg/controller/clusterdeployment/metrics.go b/pkg/controller/clusterdeployment/metrics.go index 8fb820fb709..5939c3c36bd 100644 --- a/pkg/controller/clusterdeployment/metrics.go +++ b/pkg/controller/clusterdeployment/metrics.go @@ -1,6 +1,7 @@ package clusterdeployment import ( + "github.com/openshift/hive/apis/hive/v1/metricsconfig" "github.com/prometheus/client_golang/prometheus" log "github.com/sirupsen/logrus" @@ -44,12 +45,12 @@ var ( // Declare the metrics which allow optional labels to be added. // They are defined later once the hive config has been read. - metricCompletedInstallJobRestarts hivemetrics.HistogramWithDynamicLabels + metricCompletedInstallJobRestarts hivemetrics.HistogramVecWithDynamicLabels - metricClustersCreated hivemetrics.CounterWithDynamicLabels - metricClustersInstalled hivemetrics.CounterWithDynamicLabels - metricClustersDeleted hivemetrics.CounterWithDynamicLabels - metricProvisionFailedTerminal hivemetrics.CounterWithDynamicLabels + metricClustersCreated hivemetrics.CounterVecWithDynamicLabels + metricClustersInstalled hivemetrics.CounterVecWithDynamicLabels + metricClustersDeleted hivemetrics.CounterVecWithDynamicLabels + metricProvisionFailedTerminal hivemetrics.CounterVecWithDynamicLabels ) func incProvisionFailedTerminal(cd *hivev1.ClusterDeployment, log log.FieldLogger) { @@ -64,13 +65,18 @@ func incProvisionFailedTerminal(cd *hivev1.ClusterDeployment, log log.FieldLogge } metricProvisionFailedTerminal.FixedLabels["clusterpool_namespacedname"] = poolNSName metricProvisionFailedTerminal.FixedLabels["failure_reason"] = stoppedReason - metricProvisionFailedTerminal.LogMetricWithDynamicLabels(cd, log, 1) + metricProvisionFailedTerminal.ObserveMetricWithDynamicLabels(cd, log, 1) } -func registerMetrics(mapClusterTypeLabelToValue map[string]string) { +func registerMetrics(mConfig *metricsconfig.MetricsConfig) { + mapClusterTypeLabelToValue := hivemetrics.GetOptionalClusterTypeLabels(mConfig) optionalClusterTypeLabels := hivemetrics.GetSortedKeys(mapClusterTypeLabelToValue) + defaultLabels := &hivemetrics.DynamicLabels{ + FixedLabels: map[string]string{}, + OptionalLabels: mapClusterTypeLabelToValue, + } - metricCompletedInstallJobRestarts = hivemetrics.HistogramWithDynamicLabels{ + metricCompletedInstallJobRestarts = hivemetrics.HistogramVecWithDynamicLabels{ HistogramVec: prometheus.NewHistogramVec( prometheus.HistogramOpts{ Name: "hive_cluster_deployment_completed_install_restart", @@ -78,49 +84,46 @@ func registerMetrics(mapClusterTypeLabelToValue map[string]string) { Buckets: []float64{0, 2, 10, 20, 50}, }, optionalClusterTypeLabels), - FixedLabels: 
map[string]string{}, - OptionalLabels: mapClusterTypeLabelToValue, + DynamicLabels: defaultLabels, } - metricClustersCreated = hivemetrics.CounterWithDynamicLabels{ + metricClustersCreated = hivemetrics.CounterVecWithDynamicLabels{ CounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{ Name: "hive_cluster_deployments_created_total", Help: "Counter incremented every time we observe a new cluster.", }, optionalClusterTypeLabels), - FixedLabels: map[string]string{}, - OptionalLabels: mapClusterTypeLabelToValue, + DynamicLabels: defaultLabels, } - metricClustersInstalled = hivemetrics.CounterWithDynamicLabels{ + metricClustersInstalled = hivemetrics.CounterVecWithDynamicLabels{ CounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{ Name: "hive_cluster_deployments_installed_total", Help: "Counter incremented every time we observe a successful installation.", }, optionalClusterTypeLabels), - FixedLabels: map[string]string{}, - OptionalLabels: mapClusterTypeLabelToValue, + DynamicLabels: defaultLabels, } - metricClustersDeleted = hivemetrics.CounterWithDynamicLabels{ + metricClustersDeleted = hivemetrics.CounterVecWithDynamicLabels{ CounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{ Name: "hive_cluster_deployments_deleted_total", Help: "Counter incremented every time we observe a deleted cluster.", }, optionalClusterTypeLabels), - FixedLabels: map[string]string{}, - OptionalLabels: mapClusterTypeLabelToValue, + DynamicLabels: defaultLabels, } - metricProvisionFailedTerminal = hivemetrics.CounterWithDynamicLabels{ + metricProvisionFailedTerminal = hivemetrics.CounterVecWithDynamicLabels{ CounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{ Name: "hive_cluster_deployments_provision_failed_terminal_total", Help: "Counter incremented when a cluster provision has failed and won't be retried.", }, append([]string{"clusterpool_namespacedname", "failure_reason"}, optionalClusterTypeLabels...)), - FixedLabels: map[string]string{ + DynamicLabels: &hivemetrics.DynamicLabels{FixedLabels: map[string]string{ // These values must be filled before logging the metric "clusterpool_namespacedname": constants.MetricLabelDefaultValue, "failure_reason": constants.MetricLabelDefaultValue, }, - OptionalLabels: mapClusterTypeLabelToValue, + OptionalLabels: mapClusterTypeLabelToValue, + }, } metrics.Registry.MustRegister(metricInstallJobDuration) diff --git a/pkg/controller/clusterprovision/clusterprovision_controller.go b/pkg/controller/clusterprovision/clusterprovision_controller.go index b1080463222..05a62e4ae99 100644 --- a/pkg/controller/clusterprovision/clusterprovision_controller.go +++ b/pkg/controller/clusterprovision/clusterprovision_controller.go @@ -51,11 +51,6 @@ const ( var ( // controllerKind contains the schema.GroupVersionKind for this controller type. controllerKind = hivev1.SchemeGroupVersion.WithKind("ClusterProvision") - - // mapClusterTypeLabelToValue is a map that corresponds to MetricsWithClusterTypeLabels provided in metrics config. - // The key in the map is the name of the label used in defining the metric, and the value would be the cluster - // deployment label to look for while reporting the metric - mapClusterTypeLabelToValue map[string]string ) // Add creates a new ClusterProvision Controller and adds it to the Manager with default RBAC. 
The Manager will set fields on the Controller @@ -73,11 +68,9 @@ func Add(mgr manager.Manager) error { log.WithError(err).Error("error reading metrics config") return err } - mapClusterTypeLabelToValue = make(map[string]string) - mapClusterTypeLabelToValue = hivemetrics.GetOptionalClusterTypeLabels(mConfig) // Register the metrics. This is done here to ensure we define the metrics with optional label support after we have // read the hiveconfig, and we register them only once. - registerMetrics(mapClusterTypeLabelToValue) + registerMetrics(mConfig) return add(mgr, newReconciler(mgr, clientRateLimiter), concurrentReconciles, queueRateLimiter) } @@ -377,7 +370,7 @@ func (r *ReconcileClusterProvision) reconcileSuccessfulJob(instance *hivev1.Clus result, err := r.transitionStage(instance, hivev1.ClusterProvisionStageComplete, "InstallComplete", "Install job has completed successfully", pLog) if err == nil { metricClusterProvisionsTotal.FixedLabels["result"] = resultSuccess - metricClusterProvisionsTotal.LogMetricWithDynamicLabels(instance, pLog, 1) + metricClusterProvisionsTotal.ObserveMetricWithDynamicLabels(instance, pLog, 1) } return result, err } @@ -392,9 +385,9 @@ func (r *ReconcileClusterProvision) reconcileFailedJob(instance *hivev1.ClusterP if err == nil { // Increment a counter metric for this cluster type and error reason: metricInstallErrors.FixedLabels["reason"] = reason - metricInstallErrors.LogMetricWithDynamicLabels(instance, pLog, 1) + metricInstallErrors.ObserveMetricWithDynamicLabels(instance, pLog, 1) metricClusterProvisionsTotal.FixedLabels["result"] = resultFailure - metricClusterProvisionsTotal.LogMetricWithDynamicLabels(instance, pLog, 1) + metricClusterProvisionsTotal.ObserveMetricWithDynamicLabels(instance, pLog, 1) } return result, err } @@ -628,5 +621,5 @@ func (r *ReconcileClusterProvision) logProvisionSuccessFailureMetric( timeMetric.FixedLabels["cluster_version"] = *cd.Status.InstallVersion timeMetric.FixedLabels["workers"] = r.getWorkers(*cd) timeMetric.FixedLabels["install_attempt"] = strconv.Itoa(instance.Spec.Attempt) - timeMetric.LogMetricWithDynamicLabels(cd, r.logger, time.Since(instance.CreationTimestamp.Time).Seconds()) + timeMetric.ObserveMetricWithDynamicLabels(cd, r.logger, time.Since(instance.CreationTimestamp.Time).Seconds()) } diff --git a/pkg/controller/clusterprovision/clusterprovision_controller_test.go b/pkg/controller/clusterprovision/clusterprovision_controller_test.go index a3e9bd31954..38c2a9c9b72 100644 --- a/pkg/controller/clusterprovision/clusterprovision_controller_test.go +++ b/pkg/controller/clusterprovision/clusterprovision_controller_test.go @@ -26,6 +26,7 @@ import ( routev1 "github.com/openshift/api/route/v1" "github.com/openshift/hive/apis" hivev1 "github.com/openshift/hive/apis/hive/v1" + "github.com/openshift/hive/apis/hive/v1/metricsconfig" "github.com/openshift/hive/pkg/constants" controllerutils "github.com/openshift/hive/pkg/controller/utils" "github.com/openshift/hive/pkg/install" @@ -45,8 +46,9 @@ const ( func init() { log.SetLevel(log.DebugLevel) - // Register the metrics to avoid panics during testing - registerMetrics(mapClusterTypeLabelToValue) + // While the metrics need not be registered for this test suite, they still need to be defined to avoid panics + // during the tests + registerMetrics(&metricsconfig.MetricsConfig{}) } func TestClusterProvisionReconcile(t *testing.T) { diff --git a/pkg/controller/clusterprovision/metrics.go b/pkg/controller/clusterprovision/metrics.go index 9713b8c8ef2..b62f984528c 100644 --- 
a/pkg/controller/clusterprovision/metrics.go +++ b/pkg/controller/clusterprovision/metrics.go @@ -1,6 +1,7 @@ package clusterprovision import ( + "github.com/openshift/hive/apis/hive/v1/metricsconfig" "github.com/prometheus/client_golang/prometheus" "sigs.k8s.io/controller-runtime/pkg/metrics" @@ -11,70 +12,79 @@ import ( var ( // Declare the metrics which allow optional labels to be added. // They are defined later once the hive config has been read. - metricClusterProvisionsTotal hivemetrics.CounterWithDynamicLabels - metricInstallErrors hivemetrics.CounterWithDynamicLabels + metricClusterProvisionsTotal hivemetrics.CounterVecWithDynamicLabels + metricInstallErrors hivemetrics.CounterVecWithDynamicLabels - metricInstallFailureSeconds hivemetrics.HistogramWithDynamicLabels - metricInstallSuccessSeconds hivemetrics.HistogramWithDynamicLabels + metricInstallFailureSeconds hivemetrics.HistogramVecWithDynamicLabels + metricInstallSuccessSeconds hivemetrics.HistogramVecWithDynamicLabels ) -func registerMetrics(mapClusterTypeLabelToValue map[string]string) { +func registerMetrics(mConfig *metricsconfig.MetricsConfig) { + mapClusterTypeLabelToValue := hivemetrics.GetOptionalClusterTypeLabels(mConfig) optionalClusterTypeLabels := hivemetrics.GetSortedKeys(mapClusterTypeLabelToValue) - metricClusterProvisionsTotal = hivemetrics.CounterWithDynamicLabels{ + metricClusterProvisionsTotal = hivemetrics.CounterVecWithDynamicLabels{ CounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{ Name: "hive_cluster_provision_results_total", Help: "Counter incremented every time we observe a completed cluster provision.", }, append([]string{"result"}, optionalClusterTypeLabels...)), - FixedLabels: map[string]string{ - "result": constants.MetricLabelDefaultValue, + DynamicLabels: &hivemetrics.DynamicLabels{ + FixedLabels: map[string]string{ + "result": constants.MetricLabelDefaultValue, + }, + OptionalLabels: mapClusterTypeLabelToValue, }, - OptionalLabels: mapClusterTypeLabelToValue, } - metricInstallErrors = hivemetrics.CounterWithDynamicLabels{ + metricInstallErrors = hivemetrics.CounterVecWithDynamicLabels{ CounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{ Name: "hive_install_errors", Help: "Counter incremented every time we observe certain errors strings in install logs.", }, append([]string{"reason"}, optionalClusterTypeLabels...)), - FixedLabels: map[string]string{ - "reason": constants.MetricLabelDefaultValue, + DynamicLabels: &hivemetrics.DynamicLabels{ + FixedLabels: map[string]string{ + "reason": constants.MetricLabelDefaultValue, + }, + OptionalLabels: mapClusterTypeLabelToValue, }, - OptionalLabels: mapClusterTypeLabelToValue, } - metricInstallFailureSeconds = hivemetrics.HistogramWithDynamicLabels{ + metricInstallFailureSeconds = hivemetrics.HistogramVecWithDynamicLabels{ HistogramVec: prometheus.NewHistogramVec(prometheus.HistogramOpts{ Name: "hive_cluster_deployment_install_failure_total", Help: "Time taken before a cluster provision failed to install", Buckets: []float64{30, 120, 300, 600, 1800}, }, append([]string{"platform", "region", "cluster_version", "workers", "install_attempt"}, optionalClusterTypeLabels...)), - FixedLabels: map[string]string{ - "platform": constants.MetricLabelDefaultValue, - "region": constants.MetricLabelDefaultValue, - "cluster_version": constants.MetricLabelDefaultValue, - "workers": constants.MetricLabelDefaultValue, - "install_attempt": constants.MetricLabelDefaultValue, + DynamicLabels: &hivemetrics.DynamicLabels{ + FixedLabels: map[string]string{ + 
"platform": constants.MetricLabelDefaultValue, + "region": constants.MetricLabelDefaultValue, + "cluster_version": constants.MetricLabelDefaultValue, + "workers": constants.MetricLabelDefaultValue, + "install_attempt": constants.MetricLabelDefaultValue, + }, + OptionalLabels: mapClusterTypeLabelToValue, }, - OptionalLabels: mapClusterTypeLabelToValue, } - metricInstallSuccessSeconds = hivemetrics.HistogramWithDynamicLabels{ + metricInstallSuccessSeconds = hivemetrics.HistogramVecWithDynamicLabels{ HistogramVec: prometheus.NewHistogramVec(prometheus.HistogramOpts{ Name: "hive_cluster_deployment_install_success_total", Help: "Time taken before a cluster provision succeeded to install", Buckets: []float64{1800, 2400, 3000, 3600}, }, append([]string{"platform", "region", "cluster_version", "workers", "install_attempt"}, optionalClusterTypeLabels...)), - FixedLabels: map[string]string{ - "platform": constants.MetricLabelDefaultValue, - "region": constants.MetricLabelDefaultValue, - "cluster_version": constants.MetricLabelDefaultValue, - "workers": constants.MetricLabelDefaultValue, - "install_attempt": constants.MetricLabelDefaultValue, + DynamicLabels: &hivemetrics.DynamicLabels{ + FixedLabels: map[string]string{ + "platform": constants.MetricLabelDefaultValue, + "region": constants.MetricLabelDefaultValue, + "cluster_version": constants.MetricLabelDefaultValue, + "workers": constants.MetricLabelDefaultValue, + "install_attempt": constants.MetricLabelDefaultValue, + }, + OptionalLabels: mapClusterTypeLabelToValue, }, - OptionalLabels: mapClusterTypeLabelToValue, } metrics.Registry.MustRegister(metricInstallErrors) diff --git a/pkg/controller/metrics/metricsWithDynamicLabels.go b/pkg/controller/metrics/metrics_with_dynamic_labels.go similarity index 63% rename from pkg/controller/metrics/metricsWithDynamicLabels.go rename to pkg/controller/metrics/metrics_with_dynamic_labels.go index 0b6eef8c5ba..129fd3e7c8b 100644 --- a/pkg/controller/metrics/metricsWithDynamicLabels.go +++ b/pkg/controller/metrics/metrics_with_dynamic_labels.go @@ -11,59 +11,66 @@ import ( "github.com/openshift/hive/apis/hive/v1/metricsconfig" ) -// metricsWithDyanamicLabels encompasses metrics that can have optional labels. -type metricsWithDyanamicLabels interface { - LogMetricWithDynamicLabels(metav1.Object, log.FieldLogger, float64) +// metricsWithDynamicLabels encompasses metrics that can have optional labels. +type metricsWithDynamicLabels interface { + ObserveMetricWithDynamicLabels(metav1.Object, log.FieldLogger, float64) } -type CounterWithDynamicLabels struct { - *prometheus.CounterVec +type DynamicLabels struct { FixedLabels map[string]string OptionalLabels map[string]string } -// LogMetricWithDynamicLabels logs the counter metric. It simply increments the value, so last parameter isn't used. -func (c CounterWithDynamicLabels) LogMetricWithDynamicLabels(obj metav1.Object, log log.FieldLogger, _ float64) { +type CounterVecWithDynamicLabels struct { + *prometheus.CounterVec + *DynamicLabels +} + +// ObserveMetricWithDynamicLabels logs the counter metric. It simply increments the value, so last parameter isn't used. +func (c CounterVecWithDynamicLabels) ObserveMetricWithDynamicLabels(obj metav1.Object, log log.FieldLogger, _ float64) { curriedVec, err := c.CounterVec.CurryWith(c.FixedLabels) // Maps cannot be sorted, however we would want to loop through them in a sorted fashion to ensure all the metric // labels are added in an expected order. 
Get the slice of map keys first, sort them and then use it to loop over // the map + labels := make(prometheus.Labels, len(c.OptionalLabels)) for _, v := range GetSortedKeys(c.OptionalLabels) { - curriedVec, err = curriedVec.CurryWith(prometheus.Labels{v: GetLabelValue(obj, c.OptionalLabels[v])}) + labels[v] = GetLabelValue(obj, c.OptionalLabels[v]) } + curriedVec, err = curriedVec.CurryWith(labels) if err != nil { // The possibility of hitting an error here is low, but report it and don't observe the metric in case it happens - log.WithError(err).Error("error observing metric") + log.WithField("metric", c.CounterVec).WithError(err).Error("error observing metric") } else { curriedVec.WithLabelValues().Inc() } } -type HistogramWithDynamicLabels struct { +type HistogramVecWithDynamicLabels struct { *prometheus.HistogramVec - FixedLabels map[string]string - OptionalLabels map[string]string + *DynamicLabels } -// LogMetricWithDynamicLabels observes the histogram metric with val value. -func (h HistogramWithDynamicLabels) LogMetricWithDynamicLabels(obj metav1.Object, log log.FieldLogger, val float64) { +// ObserveMetricWithDynamicLabels observes the histogram metric with val value. +func (h HistogramVecWithDynamicLabels) ObserveMetricWithDynamicLabels(obj metav1.Object, log log.FieldLogger, val float64) { curriedVec, err := h.HistogramVec.CurryWith(h.FixedLabels) // Maps cannot be sorted, however we would want to loop through them in a sorted fashion to ensure all the metric // labels are added in an expected order. Get the slice of map keys first, sort them and then use it to loop over // the map + labels := make(prometheus.Labels, len(h.OptionalLabels)) for _, v := range GetSortedKeys(h.OptionalLabels) { - curriedVec, err = curriedVec.CurryWith(prometheus.Labels{v: GetLabelValue(obj, h.OptionalLabels[v])}) + labels[v] = GetLabelValue(obj, h.OptionalLabels[v]) } + curriedVec, err = curriedVec.CurryWith(labels) if err != nil { // The possibility of hitting an error here is low, but report it and don't observe the metric in case it happens - log.WithError(err).Error("error observing metric") + log.WithField("metric", h.HistogramVec).WithError(err).Error("error observing metric") } else { curriedVec.WithLabelValues().Observe(val) } } -var _ metricsWithDyanamicLabels = CounterWithDynamicLabels{} -var _ metricsWithDyanamicLabels = HistogramWithDynamicLabels{} +var _ metricsWithDynamicLabels = CounterVecWithDynamicLabels{} +var _ metricsWithDynamicLabels = HistogramVecWithDynamicLabels{} // GetOptionalClusterTypeLabels reads the AdditionalClusterDeploymentLabels from the metrics config and returns the same // as a map diff --git a/vendor/github.com/openshift/hive/apis/hive/v1/metricsconfig/durationMetrics.go b/vendor/github.com/openshift/hive/apis/hive/v1/metricsconfig/duration_metrics.go similarity index 100% rename from vendor/github.com/openshift/hive/apis/hive/v1/metricsconfig/durationMetrics.go rename to vendor/github.com/openshift/hive/apis/hive/v1/metricsconfig/duration_metrics.go diff --git a/vendor/github.com/openshift/hive/apis/hive/v1/metricsconfig/metricsConfig.go b/vendor/github.com/openshift/hive/apis/hive/v1/metricsconfig/metrics_config.go similarity index 88% rename from vendor/github.com/openshift/hive/apis/hive/v1/metricsconfig/metricsConfig.go rename to vendor/github.com/openshift/hive/apis/hive/v1/metricsconfig/metrics_config.go index 05e0aea12b5..2cbb68a4eee 100644 --- a/vendor/github.com/openshift/hive/apis/hive/v1/metricsconfig/metricsConfig.go +++ 
b/vendor/github.com/openshift/hive/apis/hive/v1/metricsconfig/metrics_config.go @@ -13,7 +13,8 @@ type MetricsConfig struct { // label -- e.g. "4". // NOTE: Avoid ClusterDeployment labels whose values are unbounded, such as those representing cluster names or IDs, // as these will cause your prometheus database to grow indefinitely. - // Check docs\hive-metrics.md for a list of affected metrics + // Affected metrics are those whose type implements the metricsWithDyanamicLabels interface found in + // pkg/controller/metrics/metrics_with_dynamic_labels.go // +optional AdditionalClusterDeploymentLabels map[string]string `json:"additionalClusterDeploymentLabels"` } diff --git a/vendor/github.com/openshift/hive/apis/hive/v1/zz_generated.deepcopy.go b/vendor/github.com/openshift/hive/apis/hive/v1/zz_generated.deepcopy.go index cae09367f72..384184f1153 100644 --- a/vendor/github.com/openshift/hive/apis/hive/v1/zz_generated.deepcopy.go +++ b/vendor/github.com/openshift/hive/apis/hive/v1/zz_generated.deepcopy.go @@ -2254,6 +2254,27 @@ func (in *DNSZoneStatus) DeepCopy() *DNSZoneStatus { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DeploymentConfig) DeepCopyInto(out *DeploymentConfig) { + *out = *in + if in.Resources != nil { + in, out := &in.Resources, &out.Resources + *out = new(corev1.ResourceRequirements) + (*in).DeepCopyInto(*out) + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DeploymentConfig. +func (in *DeploymentConfig) DeepCopy() *DeploymentConfig { + if in == nil { + return nil + } + out := new(DeploymentConfig) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *FailedProvisionAWSConfig) DeepCopyInto(out *FailedProvisionAWSConfig) { *out = *in @@ -2541,6 +2562,17 @@ func (in *HiveConfigSpec) DeepCopyInto(out *HiveConfigSpec) { *out = new(ControllersConfig) (*in).DeepCopyInto(*out) } + if in.DeploymentConfig != nil { + in, out := &in.DeploymentConfig, &out.DeploymentConfig + *out = new([]DeploymentConfig) + if **in != nil { + in, out := *in, *out + *out = make([]DeploymentConfig, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + } if in.AWSPrivateLink != nil { in, out := &in.AWSPrivateLink, &out.AWSPrivateLink *out = new(AWSPrivateLinkConfig)
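[Illustrative note, not part of the patch.] The refactored `ObserveMetricWithDynamicLabels` methods now gather every optional label value into a single `prometheus.Labels` map and curry it with one `CurryWith` call instead of re-currying the vector once per label. Below is a standalone sketch of that currying pattern using plain client_golang; the metric name, label values, and `main` wrapper are made up for illustration.

```go
// Standalone sketch of the "curry fixed labels, then curry all optional labels at
// once" pattern used in metrics_with_dynamic_labels.go. Names here are made up.
package main

import (
	"fmt"
	"sort"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	// Admin-supplied mapping: metric label name -> ClusterDeployment label key.
	optional := map[string]string{"ocp_major_version": "hive.openshift.io/version-major"}

	// Label names must be fixed when the vector is defined; sort them for a stable order.
	names := make([]string, 0, len(optional))
	for name := range optional {
		names = append(names, name)
	}
	sort.Strings(names)

	vec := prometheus.NewCounterVec(prometheus.CounterOpts{
		Name: "example_clusters_created_total",
		Help: "Example counter with one fixed and one optional label.",
	}, append([]string{"result"}, names...))
	prometheus.MustRegister(vec)

	// Curry the fixed labels first, then every optional label value in a single
	// CurryWith call, as the refactored Observe* methods now do.
	curried, err := vec.CurryWith(prometheus.Labels{"result": "success"})
	if err == nil {
		values := prometheus.Labels{}
		for _, name := range names {
			values[name] = "4" // in Hive this would come from the ClusterDeployment's labels
		}
		curried, err = curried.CurryWith(values)
	}
	if err != nil {
		fmt.Println("error currying metric labels:", err)
		return
	}
	curried.WithLabelValues().Inc()
	fmt.Println("observed example metric")
}
```

Because `CurryWith` only fails when a label name is unknown or already curried, the error branch mirrors the log-and-skip behaviour of the `Observe*` methods in the patch.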