Skip to content

Commit

Permalink
feat(analysis): Add Measurements Retention Limit Option for Metrics
Browse files Browse the repository at this point in the history
Signed-off-by: Rohit Agrawal <[email protected]>
  • Loading branch information
agrawroh committed Dec 23, 2021
1 parent a3477cf commit cb59bd4
Show file tree
Hide file tree
Showing 13 changed files with 387 additions and 11 deletions.
25 changes: 20 additions & 5 deletions analysis/analysis.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,16 @@ func (c *Controller) reconcileAnalysisRun(origRun *v1alpha1.AnalysisRun) *v1alph
return run
}

measurementRetentionMetricsMap, err := analysisutil.GetMeasurementRetentionMetrics(run.Spec.MeasurementRetention, resolvedMetrics)
if err != nil {
message := fmt.Sprintf("Analysis spec invalid: %v", err)
logger.Warn(message)
run.Status.Phase = v1alpha1.AnalysisPhaseError
run.Status.Message = message
c.recordAnalysisRunCompletionEvent(run)
return run
}

tasks := generateMetricTasks(run, resolvedMetrics)
logger.Infof("Taking %d Measurement(s)...", len(tasks))
err = c.runMeasurements(run, tasks, dryRunMetricsMap)
Expand All @@ -101,7 +111,7 @@ func (c *Controller) reconcileAnalysisRun(origRun *v1alpha1.AnalysisRun) *v1alph
}
}

err = c.garbageCollectMeasurements(run, DefaultMeasurementHistoryLimit)
err = c.garbageCollectMeasurements(run, measurementRetentionMetricsMap, DefaultMeasurementHistoryLimit)
if err != nil {
// TODO(jessesuen): surface errors to controller so they can be retried
logger.Warnf("Failed to garbage collect measurements: %v", err)
Expand Down Expand Up @@ -693,7 +703,7 @@ func calculateNextReconcileTime(run *v1alpha1.AnalysisRun, metrics []v1alpha1.Me
}

// garbageCollectMeasurements trims the measurement history to the specified limit and GCs old measurements
func (c *Controller) garbageCollectMeasurements(run *v1alpha1.AnalysisRun, limit int) error {
func (c *Controller) garbageCollectMeasurements(run *v1alpha1.AnalysisRun, measurementRetentionMetricNamesMap map[string]*v1alpha1.MeasurementRetention, limit int) error {
var errors []error

metricsByName := make(map[string]v1alpha1.Metric)
Expand All @@ -703,7 +713,12 @@ func (c *Controller) garbageCollectMeasurements(run *v1alpha1.AnalysisRun, limit

for i, result := range run.Status.MetricResults {
length := len(result.Measurements)
if length > limit {
measurementRetentionObject := measurementRetentionMetricNamesMap[result.Name]
measurementsLimit := limit
if measurementRetentionObject != nil && measurementRetentionObject.Limit > 0 {
measurementsLimit = measurementRetentionObject.Limit
}
if length > measurementsLimit {
metric, ok := metricsByName[result.Name]
if !ok {
continue
Expand All @@ -714,11 +729,11 @@ func (c *Controller) garbageCollectMeasurements(run *v1alpha1.AnalysisRun, limit
errors = append(errors, err)
continue
}
err = provider.GarbageCollect(run, metric, limit)
err = provider.GarbageCollect(run, metric, measurementsLimit)
if err != nil {
return err
}
result.Measurements = result.Measurements[length-limit : length]
result.Measurements = result.Measurements[length-measurementsLimit : length]
}
run.Status.MetricResults[i] = result
}
Expand Down
30 changes: 28 additions & 2 deletions analysis/analysis_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1055,7 +1055,8 @@ func TestTrimMeasurementHistory(t *testing.T) {

{
run := newRun()
c.garbageCollectMeasurements(run, 2)
err := c.garbageCollectMeasurements(run, map[string]*v1alpha1.MeasurementRetention{}, 2)
assert.Nil(t, err)
assert.Len(t, run.Status.MetricResults[0].Measurements, 1)
assert.Equal(t, "1", run.Status.MetricResults[0].Measurements[0].Value)
assert.Len(t, run.Status.MetricResults[1].Measurements, 2)
Expand All @@ -1064,12 +1065,37 @@ func TestTrimMeasurementHistory(t *testing.T) {
}
{
run := newRun()
c.garbageCollectMeasurements(run, 1)
err := c.garbageCollectMeasurements(run, map[string]*v1alpha1.MeasurementRetention{}, 1)
assert.Nil(t, err)
assert.Len(t, run.Status.MetricResults[0].Measurements, 1)
assert.Equal(t, "1", run.Status.MetricResults[0].Measurements[0].Value)
assert.Len(t, run.Status.MetricResults[1].Measurements, 1)
assert.Equal(t, "3", run.Status.MetricResults[1].Measurements[0].Value)
}
{
run := newRun()
var measurementRetentionMetricsMap = map[string]*v1alpha1.MeasurementRetention{}
measurementRetentionMetricsMap["metric2"] = &v1alpha1.MeasurementRetention{MetricName: "*", Limit: 2}
err := c.garbageCollectMeasurements(run, measurementRetentionMetricsMap, 1)
assert.Nil(t, err)
assert.Len(t, run.Status.MetricResults[0].Measurements, 1)
assert.Equal(t, "1", run.Status.MetricResults[0].Measurements[0].Value)
assert.Len(t, run.Status.MetricResults[1].Measurements, 2)
assert.Equal(t, "2", run.Status.MetricResults[1].Measurements[0].Value)
assert.Equal(t, "3", run.Status.MetricResults[1].Measurements[1].Value)
}
{
run := newRun()
var measurementRetentionMetricsMap = map[string]*v1alpha1.MeasurementRetention{}
measurementRetentionMetricsMap["metric2"] = &v1alpha1.MeasurementRetention{MetricName: "metric2", Limit: 2}
err := c.garbageCollectMeasurements(run, measurementRetentionMetricsMap, 1)
assert.Nil(t, err)
assert.Len(t, run.Status.MetricResults[0].Measurements, 1)
assert.Equal(t, "1", run.Status.MetricResults[0].Measurements[0].Value)
assert.Len(t, run.Status.MetricResults[1].Measurements, 2)
assert.Equal(t, "2", run.Status.MetricResults[1].Measurements[0].Value)
assert.Equal(t, "3", run.Status.MetricResults[1].Measurements[1].Value)
}
}

func TestResolveMetricArgsUnableToSubstitute(t *testing.T) {
Expand Down
73 changes: 73 additions & 0 deletions docs/features/analysis.md
Original file line number Diff line number Diff line change
Expand Up @@ -679,6 +679,79 @@ spec:
- metricName: test.*
```

## Measurements Retention

!!! important
Available since v1.2

`measurementRetention` can be used to retain other than the latest ten results for the metrics running in any mode
(dry/non-dry). Setting this option to `0` would disable it and, the controller will revert to the existing behavior of
retaining the latest ten measurements.

The following example queries Prometheus every 5 minutes to get the total number of 4XX and 5XX errors and retains the
latest twenty measurements for the 5XX metric run results instead of the default ten.

```yaml hl_lines="1 2 3"
measurementRetention:
- metricName: total-5xx-errors
limit: 20
metrics:
- name: total-5xx-errors
interval: 5m
failureCondition: result[0] >= 10
failureLimit: 3
provider:
prometheus:
address: http://prometheus.example.com:9090
query: |
sum(irate(
istio_requests_total{reporter="source",destination_service=~"{{args.service-name}}",response_code~"5.*"}[5m]
))
- name: total-4xx-errors
interval: 5m
failureCondition: result[0] >= 10
failureLimit: 3
provider:
prometheus:
address: http://prometheus.example.com:9090
query: |
sum(irate(
istio_requests_total{reporter="source",destination_service=~"{{args.service-name}}",response_code~"4.*"}[5m]
))
```

RegEx matches are also supported. `.*` can be used to apply the same retention rule to all the metrics. In the following
example, the controller will retain the latest twenty run results for all the metrics instead of the default ten results.

```yaml hl_lines="1 2 3"
measurementRetention:
- metricName: .*
limit: 20
metrics:
- name: total-5xx-errors
interval: 5m
failureCondition: result[0] >= 10
failureLimit: 3
provider:
prometheus:
address: http://prometheus.example.com:9090
query: |
sum(irate(
istio_requests_total{reporter="source",destination_service=~"{{args.service-name}}",response_code~"5.*"}[5m]
))
- name: total-4xx-errors
interval: 5m
failureCondition: result[0] >= 10
failureLimit: 3
provider:
prometheus:
address: http://prometheus.example.com:9090
query: |
sum(irate(
istio_requests_total{reporter="source",destination_service=~"{{args.service-name}}",response_code~"4.*"}[5m]
))
```

## Inconclusive Runs

Analysis runs can also be considered `Inconclusive`, which indicates the run was neither successful,
Expand Down
12 changes: 12 additions & 0 deletions manifests/crds/analysis-run-crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,18 @@ spec:
- metricName
type: object
type: array
measurementRetention:
items:
properties:
limit:
type: integer
metricName:
type: string
required:
- limit
- metricName
type: object
type: array
metrics:
items:
properties:
Expand Down
12 changes: 12 additions & 0 deletions manifests/crds/analysis-template-crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,18 @@ spec:
- metricName
type: object
type: array
measurementRetention:
items:
properties:
limit:
type: integer
metricName:
type: string
required:
- limit
- metricName
type: object
type: array
metrics:
items:
properties:
Expand Down
12 changes: 12 additions & 0 deletions manifests/crds/cluster-analysis-template-crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,18 @@ spec:
- metricName
type: object
type: array
measurementRetention:
items:
properties:
limit:
type: integer
metricName:
type: string
required:
- limit
- metricName
type: object
type: array
metrics:
items:
properties:
Expand Down
36 changes: 36 additions & 0 deletions manifests/install.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,18 @@ spec:
- metricName
type: object
type: array
measurementRetention:
items:
properties:
limit:
type: integer
metricName:
type: string
required:
- limit
- metricName
type: object
type: array
metrics:
items:
properties:
Expand Down Expand Up @@ -2815,6 +2827,18 @@ spec:
- metricName
type: object
type: array
measurementRetention:
items:
properties:
limit:
type: integer
metricName:
type: string
required:
- limit
- metricName
type: object
type: array
metrics:
items:
properties:
Expand Down Expand Up @@ -5442,6 +5466,18 @@ spec:
- metricName
type: object
type: array
measurementRetention:
items:
properties:
limit:
type: integer
metricName:
type: string
required:
- limit
- metricName
type: object
type: array
metrics:
items:
properties:
Expand Down
36 changes: 36 additions & 0 deletions manifests/namespace-install.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,18 @@ spec:
- metricName
type: object
type: array
measurementRetention:
items:
properties:
limit:
type: integer
metricName:
type: string
required:
- limit
- metricName
type: object
type: array
metrics:
items:
properties:
Expand Down Expand Up @@ -2815,6 +2827,18 @@ spec:
- metricName
type: object
type: array
measurementRetention:
items:
properties:
limit:
type: integer
metricName:
type: string
required:
- limit
- metricName
type: object
type: array
metrics:
items:
properties:
Expand Down Expand Up @@ -5442,6 +5466,18 @@ spec:
- metricName
type: object
type: array
measurementRetention:
items:
properties:
limit:
type: integer
metricName:
type: string
required:
- limit
- metricName
type: object
type: array
metrics:
items:
properties:
Expand Down
2 changes: 2 additions & 0 deletions pkg/apis/api-rules/violation_exceptions.list
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
API rule violation: list_type_missing,github.com/argoproj/argo-rollouts/pkg/apis/rollouts/v1alpha1,AmbassadorTrafficRouting,Mappings
API rule violation: list_type_missing,github.com/argoproj/argo-rollouts/pkg/apis/rollouts/v1alpha1,AnalysisRunSpec,Args
API rule violation: list_type_missing,github.com/argoproj/argo-rollouts/pkg/apis/rollouts/v1alpha1,AnalysisRunSpec,DryRun
API rule violation: list_type_missing,github.com/argoproj/argo-rollouts/pkg/apis/rollouts/v1alpha1,AnalysisRunSpec,MeasurementRetention
API rule violation: list_type_missing,github.com/argoproj/argo-rollouts/pkg/apis/rollouts/v1alpha1,AnalysisRunSpec,Metrics
API rule violation: list_type_missing,github.com/argoproj/argo-rollouts/pkg/apis/rollouts/v1alpha1,AnalysisRunStatus,MetricResults
API rule violation: list_type_missing,github.com/argoproj/argo-rollouts/pkg/apis/rollouts/v1alpha1,AnalysisTemplateSpec,Args
API rule violation: list_type_missing,github.com/argoproj/argo-rollouts/pkg/apis/rollouts/v1alpha1,AnalysisTemplateSpec,DryRun
API rule violation: list_type_missing,github.com/argoproj/argo-rollouts/pkg/apis/rollouts/v1alpha1,AnalysisTemplateSpec,MeasurementRetention
API rule violation: list_type_missing,github.com/argoproj/argo-rollouts/pkg/apis/rollouts/v1alpha1,AnalysisTemplateSpec,Metrics
API rule violation: list_type_missing,github.com/argoproj/argo-rollouts/pkg/apis/rollouts/v1alpha1,CanaryStrategy,Steps
API rule violation: list_type_missing,github.com/argoproj/argo-rollouts/pkg/apis/rollouts/v1alpha1,CloudWatchMetric,MetricDataQueries
Expand Down
Loading

0 comments on commit cb59bd4

Please sign in to comment.