feat(analysis): Add Measurements Retention Limit Option for Metrics

Signed-off-by: Rohit Agrawal <[email protected]>
argoproj · Dec 23, 2021 · ee91d9c · ee91d9c
1 parent a3477cf
commit ee91d9c
Show file tree

Hide file tree

Showing 13 changed files with 428 additions and 11 deletions.
diff --git a/analysis/analysis.go b/analysis/analysis.go
@@ -80,6 +80,16 @@ func (c *Controller) reconcileAnalysisRun(origRun *v1alpha1.AnalysisRun) *v1alph
 		return run
 	}
 
+	measurementRetentionMetricsMap, err := analysisutil.GetMeasurementRetentionMetrics(run.Spec.MeasurementRetention, resolvedMetrics)
+	if err != nil {
+		message := fmt.Sprintf("Analysis spec invalid: %v", err)
+		logger.Warn(message)
+		run.Status.Phase = v1alpha1.AnalysisPhaseError
+		run.Status.Message = message
+		c.recordAnalysisRunCompletionEvent(run)
+		return run
+	}
+
 	tasks := generateMetricTasks(run, resolvedMetrics)
 	logger.Infof("Taking %d Measurement(s)...", len(tasks))
 	err = c.runMeasurements(run, tasks, dryRunMetricsMap)
@@ -101,7 +111,7 @@ func (c *Controller) reconcileAnalysisRun(origRun *v1alpha1.AnalysisRun) *v1alph
 		}
 	}
 
-	err = c.garbageCollectMeasurements(run, DefaultMeasurementHistoryLimit)
+	err = c.garbageCollectMeasurements(run, measurementRetentionMetricsMap, DefaultMeasurementHistoryLimit)
 	if err != nil {
 		// TODO(jessesuen): surface errors to controller so they can be retried
 		logger.Warnf("Failed to garbage collect measurements: %v", err)
@@ -693,7 +703,7 @@ func calculateNextReconcileTime(run *v1alpha1.AnalysisRun, metrics []v1alpha1.Me
 }
 
 // garbageCollectMeasurements trims the measurement history to the specified limit and GCs old measurements
-func (c *Controller) garbageCollectMeasurements(run *v1alpha1.AnalysisRun, limit int) error {
+func (c *Controller) garbageCollectMeasurements(run *v1alpha1.AnalysisRun, measurementRetentionMetricNamesMap map[string]*v1alpha1.MeasurementRetention, limit int) error {
 	var errors []error
 
 	metricsByName := make(map[string]v1alpha1.Metric)
@@ -703,7 +713,12 @@ func (c *Controller) garbageCollectMeasurements(run *v1alpha1.AnalysisRun, limit
 
 	for i, result := range run.Status.MetricResults {
 		length := len(result.Measurements)
-		if length > limit {
+		measurementRetentionObject := measurementRetentionMetricNamesMap[result.Name]
+		measurementsLimit := limit
+		if measurementRetentionObject != nil && measurementRetentionObject.Limit > 0 {
+			measurementsLimit = measurementRetentionObject.Limit
+		}
+		if length > measurementsLimit {
 			metric, ok := metricsByName[result.Name]
 			if !ok {
 				continue
@@ -714,11 +729,11 @@ func (c *Controller) garbageCollectMeasurements(run *v1alpha1.AnalysisRun, limit
 				errors = append(errors, err)
 				continue
 			}
-			err = provider.GarbageCollect(run, metric, limit)
+			err = provider.GarbageCollect(run, metric, measurementsLimit)
 			if err != nil {
 				return err
 			}
-			result.Measurements = result.Measurements[length-limit : length]
+			result.Measurements = result.Measurements[length-measurementsLimit : length]
 		}
 		run.Status.MetricResults[i] = result
 	}

diff --git a/analysis/analysis_test.go b/analysis/analysis_test.go
@@ -1055,7 +1055,8 @@ func TestTrimMeasurementHistory(t *testing.T) {
 
 	{
 		run := newRun()
-		c.garbageCollectMeasurements(run, 2)
+		err := c.garbageCollectMeasurements(run, map[string]*v1alpha1.MeasurementRetention{}, 2)
+		assert.Nil(t, err)
 		assert.Len(t, run.Status.MetricResults[0].Measurements, 1)
 		assert.Equal(t, "1", run.Status.MetricResults[0].Measurements[0].Value)
 		assert.Len(t, run.Status.MetricResults[1].Measurements, 2)
@@ -1064,12 +1065,37 @@ func TestTrimMeasurementHistory(t *testing.T) {
 	}
 	{
 		run := newRun()
-		c.garbageCollectMeasurements(run, 1)
+		err := c.garbageCollectMeasurements(run, map[string]*v1alpha1.MeasurementRetention{}, 1)
+		assert.Nil(t, err)
 		assert.Len(t, run.Status.MetricResults[0].Measurements, 1)
 		assert.Equal(t, "1", run.Status.MetricResults[0].Measurements[0].Value)
 		assert.Len(t, run.Status.MetricResults[1].Measurements, 1)
 		assert.Equal(t, "3", run.Status.MetricResults[1].Measurements[0].Value)
 	}
+	{
+		run := newRun()
+		var measurementRetentionMetricsMap = map[string]*v1alpha1.MeasurementRetention{}
+		measurementRetentionMetricsMap["metric2"] = &v1alpha1.MeasurementRetention{MetricName: "*", Limit: 2}
+		err := c.garbageCollectMeasurements(run, measurementRetentionMetricsMap, 1)
+		assert.Nil(t, err)
+		assert.Len(t, run.Status.MetricResults[0].Measurements, 1)
+		assert.Equal(t, "1", run.Status.MetricResults[0].Measurements[0].Value)
+		assert.Len(t, run.Status.MetricResults[1].Measurements, 2)
+		assert.Equal(t, "2", run.Status.MetricResults[1].Measurements[0].Value)
+		assert.Equal(t, "3", run.Status.MetricResults[1].Measurements[1].Value)
+	}
+	{
+		run := newRun()
+		var measurementRetentionMetricsMap = map[string]*v1alpha1.MeasurementRetention{}
+		measurementRetentionMetricsMap["metric2"] = &v1alpha1.MeasurementRetention{MetricName: "metric2", Limit: 2}
+		err := c.garbageCollectMeasurements(run, measurementRetentionMetricsMap, 1)
+		assert.Nil(t, err)
+		assert.Len(t, run.Status.MetricResults[0].Measurements, 1)
+		assert.Equal(t, "1", run.Status.MetricResults[0].Measurements[0].Value)
+		assert.Len(t, run.Status.MetricResults[1].Measurements, 2)
+		assert.Equal(t, "2", run.Status.MetricResults[1].Measurements[0].Value)
+		assert.Equal(t, "3", run.Status.MetricResults[1].Measurements[1].Value)
+	}
 }
 
 func TestResolveMetricArgsUnableToSubstitute(t *testing.T) {
@@ -1675,3 +1701,44 @@ func TestInvalidDryRunConfigThrowsError(t *testing.T) {
 	assert.Equal(t, v1alpha1.AnalysisPhaseError, newRun.Status.Phase)
 	assert.Equal(t, "Analysis spec invalid: dryRun[0]: Rule didn't match any metric name(s)", newRun.Status.Message)
 }
+
+func TestInvalidMeasurementsRetentionConfigThrowsError(t *testing.T) {
+	f := newFixture(t)
+	defer f.Close()
+	c, _, _ := f.newController(noResyncPeriodFunc)
+
+	// Mocks terminate to cancel the in-progress measurement
+	f.provider.On("Terminate", mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return(newMeasurement(v1alpha1.AnalysisPhaseSuccessful), nil)
+
+	var measurementsRetentionArray []v1alpha1.MeasurementRetention
+	measurementsRetentionArray = append(measurementsRetentionArray, v1alpha1.MeasurementRetention{MetricName: "error-rate"})
+	now := metav1.Now()
+	run := &v1alpha1.AnalysisRun{
+		Spec: v1alpha1.AnalysisRunSpec{
+			Terminate: true,
+			Args: []v1alpha1.Argument{
+				{
+					Name:  "service",
+					Value: pointer.StringPtr("rollouts-demo-canary.default.svc.cluster.local"),
+				},
+			},
+			Metrics: []v1alpha1.Metric{{
+				Name:             "success-rate",
+				InitialDelay:     "20s",
+				Interval:         "20s",
+				SuccessCondition: "result[0] > 0.90",
+				Provider: v1alpha1.MetricProvider{
+					Web: &v1alpha1.WebMetric{},
+				},
+			}},
+			MeasurementRetention: measurementsRetentionArray,
+		},
+		Status: v1alpha1.AnalysisRunStatus{
+			StartedAt: &now,
+			Phase:     v1alpha1.AnalysisPhaseRunning,
+		},
+	}
+	newRun := c.reconcileAnalysisRun(run)
+	assert.Equal(t, v1alpha1.AnalysisPhaseError, newRun.Status.Phase)
+	assert.Equal(t, "Analysis spec invalid: measurementRetention[0]: Rule didn't match any metric name(s)", newRun.Status.Message)
+}
diff --git a/docs/features/analysis.md b/docs/features/analysis.md
@@ -679,6 +679,79 @@ spec:
   - metricName: test.*
 ```
 
+## Measurements Retention
+
+!!! important
+Available since v1.2
+
+`measurementRetention` can be used to retain other than the latest ten results for the metrics running in any mode 
+(dry/non-dry). Setting this option to `0` would disable it and, the controller will revert to the existing behavior of 
+retaining the latest ten measurements.
+
+The following example queries Prometheus every 5 minutes to get the total number of 4XX and 5XX errors and retains the 
+latest twenty measurements for the 5XX metric run results instead of the default ten.
+
+```yaml hl_lines="1 2 3"
+  measurementRetention:
+  - metricName: total-5xx-errors
+    limit: 20
+  metrics:
+  - name: total-5xx-errors
+    interval: 5m
+    failureCondition: result[0] >= 10
+    failureLimit: 3
+    provider:
+      prometheus:
+        address: http://prometheus.example.com:9090
+        query: |
+          sum(irate(
+            istio_requests_total{reporter="source",destination_service=~"{{args.service-name}}",response_code~"5.*"}[5m]
+          ))
+  - name: total-4xx-errors
+    interval: 5m
+    failureCondition: result[0] >= 10
+    failureLimit: 3
+    provider:
+      prometheus:
+        address: http://prometheus.example.com:9090
+        query: |
+          sum(irate(
+            istio_requests_total{reporter="source",destination_service=~"{{args.service-name}}",response_code~"4.*"}[5m]
+          ))
+```
+
+RegEx matches are also supported. `.*` can be used to apply the same retention rule to all the metrics. In the following 
+example, the controller will retain the latest twenty run results for all the metrics instead of the default ten results.
+
+```yaml hl_lines="1 2 3"
+  measurementRetention:
+  - metricName: .*
+    limit: 20
+  metrics:
+  - name: total-5xx-errors
+    interval: 5m
+    failureCondition: result[0] >= 10
+    failureLimit: 3
+    provider:
+      prometheus:
+        address: http://prometheus.example.com:9090
+        query: |
+          sum(irate(
+            istio_requests_total{reporter="source",destination_service=~"{{args.service-name}}",response_code~"5.*"}[5m]
+          ))
+  - name: total-4xx-errors
+    interval: 5m
+    failureCondition: result[0] >= 10
+    failureLimit: 3
+    provider:
+      prometheus:
+        address: http://prometheus.example.com:9090
+        query: |
+          sum(irate(
+            istio_requests_total{reporter="source",destination_service=~"{{args.service-name}}",response_code~"4.*"}[5m]
+          ))
+```
+
 ## Inconclusive Runs
 
 Analysis runs can also be considered `Inconclusive`, which indicates the run was neither successful,

diff --git a/manifests/crds/analysis-run-crd.yaml b/manifests/crds/analysis-run-crd.yaml
@@ -77,6 +77,18 @@ spec:
                   - metricName
                   type: object
                 type: array
+              measurementRetention:
+                items:
+                  properties:
+                    limit:
+                      type: integer
+                    metricName:
+                      type: string
+                  required:
+                  - limit
+                  - metricName
+                  type: object
+                type: array
               metrics:
                 items:
                   properties:

diff --git a/manifests/crds/analysis-template-crd.yaml b/manifests/crds/analysis-template-crd.yaml
@@ -73,6 +73,18 @@ spec:
                   - metricName
                   type: object
                 type: array
+              measurementRetention:
+                items:
+                  properties:
+                    limit:
+                      type: integer
+                    metricName:
+                      type: string
+                  required:
+                  - limit
+                  - metricName
+                  type: object
+                type: array
               metrics:
                 items:
                   properties:

diff --git a/manifests/crds/cluster-analysis-template-crd.yaml b/manifests/crds/cluster-analysis-template-crd.yaml
@@ -73,6 +73,18 @@ spec:
                   - metricName
                   type: object
                 type: array
+              measurementRetention:
+                items:
+                  properties:
+                    limit:
+                      type: integer
+                    metricName:
+                      type: string
+                  required:
+                  - limit
+                  - metricName
+                  type: object
+                type: array
               metrics:
                 items:
                   properties:

diff --git a/manifests/install.yaml b/manifests/install.yaml
@@ -78,6 +78,18 @@ spec:
                   - metricName
                   type: object
                 type: array
+              measurementRetention:
+                items:
+                  properties:
+                    limit:
+                      type: integer
+                    metricName:
+                      type: string
+                  required:
+                  - limit
+                  - metricName
+                  type: object
+                type: array
               metrics:
                 items:
                   properties:
@@ -2815,6 +2827,18 @@ spec:
                   - metricName
                   type: object
                 type: array
+              measurementRetention:
+                items:
+                  properties:
+                    limit:
+                      type: integer
+                    metricName:
+                      type: string
+                  required:
+                  - limit
+                  - metricName
+                  type: object
+                type: array
               metrics:
                 items:
                   properties:
@@ -5442,6 +5466,18 @@ spec:
                   - metricName
                   type: object
                 type: array
+              measurementRetention:
+                items:
+                  properties:
+                    limit:
+                      type: integer
+                    metricName:
+                      type: string
+                  required:
+                  - limit
+                  - metricName
+                  type: object
+                type: array
               metrics:
                 items:
                   properties: