Skip to content

Commit

Permalink
feat(analysis): Add Measurements Retention Limit Option for Metrics
Browse files Browse the repository at this point in the history
Signed-off-by: Rohit Agrawal <[email protected]>
  • Loading branch information
agrawroh committed Dec 23, 2021
1 parent a3477cf commit ee91d9c
Show file tree
Hide file tree
Showing 13 changed files with 428 additions and 11 deletions.
25 changes: 20 additions & 5 deletions analysis/analysis.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,16 @@ func (c *Controller) reconcileAnalysisRun(origRun *v1alpha1.AnalysisRun) *v1alph
return run
}

measurementRetentionMetricsMap, err := analysisutil.GetMeasurementRetentionMetrics(run.Spec.MeasurementRetention, resolvedMetrics)
if err != nil {
message := fmt.Sprintf("Analysis spec invalid: %v", err)
logger.Warn(message)
run.Status.Phase = v1alpha1.AnalysisPhaseError
run.Status.Message = message
c.recordAnalysisRunCompletionEvent(run)
return run
}

tasks := generateMetricTasks(run, resolvedMetrics)
logger.Infof("Taking %d Measurement(s)...", len(tasks))
err = c.runMeasurements(run, tasks, dryRunMetricsMap)
Expand All @@ -101,7 +111,7 @@ func (c *Controller) reconcileAnalysisRun(origRun *v1alpha1.AnalysisRun) *v1alph
}
}

err = c.garbageCollectMeasurements(run, DefaultMeasurementHistoryLimit)
err = c.garbageCollectMeasurements(run, measurementRetentionMetricsMap, DefaultMeasurementHistoryLimit)
if err != nil {
// TODO(jessesuen): surface errors to controller so they can be retried
logger.Warnf("Failed to garbage collect measurements: %v", err)
Expand Down Expand Up @@ -693,7 +703,7 @@ func calculateNextReconcileTime(run *v1alpha1.AnalysisRun, metrics []v1alpha1.Me
}

// garbageCollectMeasurements trims the measurement history to the specified limit and GCs old measurements
func (c *Controller) garbageCollectMeasurements(run *v1alpha1.AnalysisRun, limit int) error {
func (c *Controller) garbageCollectMeasurements(run *v1alpha1.AnalysisRun, measurementRetentionMetricNamesMap map[string]*v1alpha1.MeasurementRetention, limit int) error {
var errors []error

metricsByName := make(map[string]v1alpha1.Metric)
Expand All @@ -703,7 +713,12 @@ func (c *Controller) garbageCollectMeasurements(run *v1alpha1.AnalysisRun, limit

for i, result := range run.Status.MetricResults {
length := len(result.Measurements)
if length > limit {
measurementRetentionObject := measurementRetentionMetricNamesMap[result.Name]
measurementsLimit := limit
if measurementRetentionObject != nil && measurementRetentionObject.Limit > 0 {
measurementsLimit = measurementRetentionObject.Limit
}
if length > measurementsLimit {
metric, ok := metricsByName[result.Name]
if !ok {
continue
Expand All @@ -714,11 +729,11 @@ func (c *Controller) garbageCollectMeasurements(run *v1alpha1.AnalysisRun, limit
errors = append(errors, err)
continue
}
err = provider.GarbageCollect(run, metric, limit)
err = provider.GarbageCollect(run, metric, measurementsLimit)
if err != nil {
return err
}
result.Measurements = result.Measurements[length-limit : length]
result.Measurements = result.Measurements[length-measurementsLimit : length]
}
run.Status.MetricResults[i] = result
}
Expand Down
71 changes: 69 additions & 2 deletions analysis/analysis_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1055,7 +1055,8 @@ func TestTrimMeasurementHistory(t *testing.T) {

{
run := newRun()
c.garbageCollectMeasurements(run, 2)
err := c.garbageCollectMeasurements(run, map[string]*v1alpha1.MeasurementRetention{}, 2)
assert.Nil(t, err)
assert.Len(t, run.Status.MetricResults[0].Measurements, 1)
assert.Equal(t, "1", run.Status.MetricResults[0].Measurements[0].Value)
assert.Len(t, run.Status.MetricResults[1].Measurements, 2)
Expand All @@ -1064,12 +1065,37 @@ func TestTrimMeasurementHistory(t *testing.T) {
}
{
run := newRun()
c.garbageCollectMeasurements(run, 1)
err := c.garbageCollectMeasurements(run, map[string]*v1alpha1.MeasurementRetention{}, 1)
assert.Nil(t, err)
assert.Len(t, run.Status.MetricResults[0].Measurements, 1)
assert.Equal(t, "1", run.Status.MetricResults[0].Measurements[0].Value)
assert.Len(t, run.Status.MetricResults[1].Measurements, 1)
assert.Equal(t, "3", run.Status.MetricResults[1].Measurements[0].Value)
}
{
run := newRun()
var measurementRetentionMetricsMap = map[string]*v1alpha1.MeasurementRetention{}
measurementRetentionMetricsMap["metric2"] = &v1alpha1.MeasurementRetention{MetricName: "*", Limit: 2}
err := c.garbageCollectMeasurements(run, measurementRetentionMetricsMap, 1)
assert.Nil(t, err)
assert.Len(t, run.Status.MetricResults[0].Measurements, 1)
assert.Equal(t, "1", run.Status.MetricResults[0].Measurements[0].Value)
assert.Len(t, run.Status.MetricResults[1].Measurements, 2)
assert.Equal(t, "2", run.Status.MetricResults[1].Measurements[0].Value)
assert.Equal(t, "3", run.Status.MetricResults[1].Measurements[1].Value)
}
{
run := newRun()
var measurementRetentionMetricsMap = map[string]*v1alpha1.MeasurementRetention{}
measurementRetentionMetricsMap["metric2"] = &v1alpha1.MeasurementRetention{MetricName: "metric2", Limit: 2}
err := c.garbageCollectMeasurements(run, measurementRetentionMetricsMap, 1)
assert.Nil(t, err)
assert.Len(t, run.Status.MetricResults[0].Measurements, 1)
assert.Equal(t, "1", run.Status.MetricResults[0].Measurements[0].Value)
assert.Len(t, run.Status.MetricResults[1].Measurements, 2)
assert.Equal(t, "2", run.Status.MetricResults[1].Measurements[0].Value)
assert.Equal(t, "3", run.Status.MetricResults[1].Measurements[1].Value)
}
}

func TestResolveMetricArgsUnableToSubstitute(t *testing.T) {
Expand Down Expand Up @@ -1675,3 +1701,44 @@ func TestInvalidDryRunConfigThrowsError(t *testing.T) {
assert.Equal(t, v1alpha1.AnalysisPhaseError, newRun.Status.Phase)
assert.Equal(t, "Analysis spec invalid: dryRun[0]: Rule didn't match any metric name(s)", newRun.Status.Message)
}

func TestInvalidMeasurementsRetentionConfigThrowsError(t *testing.T) {
f := newFixture(t)
defer f.Close()
c, _, _ := f.newController(noResyncPeriodFunc)

// Mocks terminate to cancel the in-progress measurement
f.provider.On("Terminate", mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return(newMeasurement(v1alpha1.AnalysisPhaseSuccessful), nil)

var measurementsRetentionArray []v1alpha1.MeasurementRetention
measurementsRetentionArray = append(measurementsRetentionArray, v1alpha1.MeasurementRetention{MetricName: "error-rate"})
now := metav1.Now()
run := &v1alpha1.AnalysisRun{
Spec: v1alpha1.AnalysisRunSpec{
Terminate: true,
Args: []v1alpha1.Argument{
{
Name: "service",
Value: pointer.StringPtr("rollouts-demo-canary.default.svc.cluster.local"),
},
},
Metrics: []v1alpha1.Metric{{
Name: "success-rate",
InitialDelay: "20s",
Interval: "20s",
SuccessCondition: "result[0] > 0.90",
Provider: v1alpha1.MetricProvider{
Web: &v1alpha1.WebMetric{},
},
}},
MeasurementRetention: measurementsRetentionArray,
},
Status: v1alpha1.AnalysisRunStatus{
StartedAt: &now,
Phase: v1alpha1.AnalysisPhaseRunning,
},
}
newRun := c.reconcileAnalysisRun(run)
assert.Equal(t, v1alpha1.AnalysisPhaseError, newRun.Status.Phase)
assert.Equal(t, "Analysis spec invalid: measurementRetention[0]: Rule didn't match any metric name(s)", newRun.Status.Message)
}
73 changes: 73 additions & 0 deletions docs/features/analysis.md
Original file line number Diff line number Diff line change
Expand Up @@ -679,6 +679,79 @@ spec:
- metricName: test.*
```

## Measurements Retention

!!! important
Available since v1.2

`measurementRetention` can be used to retain other than the latest ten results for the metrics running in any mode
(dry/non-dry). Setting this option to `0` would disable it and, the controller will revert to the existing behavior of
retaining the latest ten measurements.

The following example queries Prometheus every 5 minutes to get the total number of 4XX and 5XX errors and retains the
latest twenty measurements for the 5XX metric run results instead of the default ten.

```yaml hl_lines="1 2 3"
measurementRetention:
- metricName: total-5xx-errors
limit: 20
metrics:
- name: total-5xx-errors
interval: 5m
failureCondition: result[0] >= 10
failureLimit: 3
provider:
prometheus:
address: http://prometheus.example.com:9090
query: |
sum(irate(
istio_requests_total{reporter="source",destination_service=~"{{args.service-name}}",response_code~"5.*"}[5m]
))
- name: total-4xx-errors
interval: 5m
failureCondition: result[0] >= 10
failureLimit: 3
provider:
prometheus:
address: http://prometheus.example.com:9090
query: |
sum(irate(
istio_requests_total{reporter="source",destination_service=~"{{args.service-name}}",response_code~"4.*"}[5m]
))
```

RegEx matches are also supported. `.*` can be used to apply the same retention rule to all the metrics. In the following
example, the controller will retain the latest twenty run results for all the metrics instead of the default ten results.

```yaml hl_lines="1 2 3"
measurementRetention:
- metricName: .*
limit: 20
metrics:
- name: total-5xx-errors
interval: 5m
failureCondition: result[0] >= 10
failureLimit: 3
provider:
prometheus:
address: http://prometheus.example.com:9090
query: |
sum(irate(
istio_requests_total{reporter="source",destination_service=~"{{args.service-name}}",response_code~"5.*"}[5m]
))
- name: total-4xx-errors
interval: 5m
failureCondition: result[0] >= 10
failureLimit: 3
provider:
prometheus:
address: http://prometheus.example.com:9090
query: |
sum(irate(
istio_requests_total{reporter="source",destination_service=~"{{args.service-name}}",response_code~"4.*"}[5m]
))
```

## Inconclusive Runs

Analysis runs can also be considered `Inconclusive`, which indicates the run was neither successful,
Expand Down
12 changes: 12 additions & 0 deletions manifests/crds/analysis-run-crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,18 @@ spec:
- metricName
type: object
type: array
measurementRetention:
items:
properties:
limit:
type: integer
metricName:
type: string
required:
- limit
- metricName
type: object
type: array
metrics:
items:
properties:
Expand Down
12 changes: 12 additions & 0 deletions manifests/crds/analysis-template-crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,18 @@ spec:
- metricName
type: object
type: array
measurementRetention:
items:
properties:
limit:
type: integer
metricName:
type: string
required:
- limit
- metricName
type: object
type: array
metrics:
items:
properties:
Expand Down
12 changes: 12 additions & 0 deletions manifests/crds/cluster-analysis-template-crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,18 @@ spec:
- metricName
type: object
type: array
measurementRetention:
items:
properties:
limit:
type: integer
metricName:
type: string
required:
- limit
- metricName
type: object
type: array
metrics:
items:
properties:
Expand Down
36 changes: 36 additions & 0 deletions manifests/install.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,18 @@ spec:
- metricName
type: object
type: array
measurementRetention:
items:
properties:
limit:
type: integer
metricName:
type: string
required:
- limit
- metricName
type: object
type: array
metrics:
items:
properties:
Expand Down Expand Up @@ -2815,6 +2827,18 @@ spec:
- metricName
type: object
type: array
measurementRetention:
items:
properties:
limit:
type: integer
metricName:
type: string
required:
- limit
- metricName
type: object
type: array
metrics:
items:
properties:
Expand Down Expand Up @@ -5442,6 +5466,18 @@ spec:
- metricName
type: object
type: array
measurementRetention:
items:
properties:
limit:
type: integer
metricName:
type: string
required:
- limit
- metricName
type: object
type: array
metrics:
items:
properties:
Expand Down
Loading

0 comments on commit ee91d9c

Please sign in to comment.