
feat(analysis): Add Measurements Length Option for Dry-Run Metrics
Signed-off-by: Rohit Agrawal <[email protected]>
agrawroh committed Dec 15, 2021
1 parent 061efd4 commit 22e5ba4
Showing 16 changed files with 173 additions and 41 deletions.
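Before the per-file diffs, a brief orientation: this commit threads `dryRunMetricsMap` — now a `map[string]*v1alpha1.DryRun` instead of a `map[string]bool` — through `runMeasurements`, `assessRunStatus`, and `garbageCollectMeasurements`, so that a metric running in dry-run mode can retain a custom number of measurements via the new `measurementsLength` field. The sketch below is not part of the commit; `DryRun` and the default limit of ten are simplified stand-ins for the `v1alpha1` definitions, but it illustrates the intended lookup behavior:

```go
package main

import "fmt"

// DryRun approximates v1alpha1.DryRun as used in this commit: MetricName
// names the dry-run metric, and MeasurementsLength (when > 0) overrides
// the default measurement-history limit for that metric.
type DryRun struct {
	MetricName         string
	MeasurementsLength int
}

// Stand-in for DefaultMeasurementHistoryLimit, which the docs change below
// describes as retaining the latest ten measurements.
const defaultMeasurementHistoryLimit = 10

// effectiveLimit mirrors the selection logic added to garbageCollectMeasurements.
func effectiveLimit(metric string, dryRunMetricsMap map[string]*DryRun) int {
	if dr, ok := dryRunMetricsMap[metric]; ok && dr.MeasurementsLength > 0 {
		return dr.MeasurementsLength
	}
	return defaultMeasurementHistoryLimit
}

func main() {
	dryRunMetricsMap := map[string]*DryRun{
		"total-5xx-errors": {MetricName: "total-5xx-errors", MeasurementsLength: 25},
	}
	fmt.Println(effectiveLimit("total-5xx-errors", dryRunMetricsMap)) // 25 (dry-run override)
	fmt.Println(effectiveLimit("total-4xx-errors", dryRunMetricsMap)) // 10 (default)
}
```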
25 changes: 15 additions & 10 deletions analysis/analysis.go
@@ -101,7 +101,7 @@ func (c *Controller) reconcileAnalysisRun(origRun *v1alpha1.AnalysisRun) *v1alph
}
}

err = c.garbageCollectMeasurements(run, DefaultMeasurementHistoryLimit)
err = c.garbageCollectMeasurements(run, DefaultMeasurementHistoryLimit, dryRunMetricsMap)
if err != nil {
// TODO(jessesuen): surface errors to controller so they can be retried
logger.Warnf("Failed to garbage collect measurements: %v", err)
@@ -289,7 +289,7 @@ func (c *Controller) resolveArgs(tasks []metricTask, args []v1alpha1.Argument, n
}

// runMeasurements iterates a list of metric tasks, and runs, resumes, or terminates measurements
func (c *Controller) runMeasurements(run *v1alpha1.AnalysisRun, tasks []metricTask, dryRunMetricsMap map[string]bool) error {
func (c *Controller) runMeasurements(run *v1alpha1.AnalysisRun, tasks []metricTask, dryRunMetricsMap map[string]*v1alpha1.DryRun) error {
var wg sync.WaitGroup
// resultsLock should be held whenever we are accessing or setting status.metricResults since
// we are performing queries in parallel
@@ -319,7 +319,7 @@ func (c *Controller) runMeasurements(run *v1alpha1.AnalysisRun, tasks []metricTa
metricResult = &v1alpha1.MetricResult{
Name: t.metric.Name,
Phase: v1alpha1.AnalysisPhaseRunning,
DryRun: dryRunMetricsMap[t.metric.Name],
DryRun: dryRunMetricsMap[t.metric.Name] != nil,
}
}

@@ -404,7 +404,7 @@ func (c *Controller) runMeasurements(run *v1alpha1.AnalysisRun, tasks []metricTa
// assessRunStatus assesses the overall status of this AnalysisRun
// If any metric is not yet completed, the AnalysisRun is still considered Running
// Once all metrics are complete, the worst status is used as the overall AnalysisRun status
func (c *Controller) assessRunStatus(run *v1alpha1.AnalysisRun, metrics []v1alpha1.Metric, dryRunMetricsMap map[string]bool) (v1alpha1.AnalysisPhase, string) {
func (c *Controller) assessRunStatus(run *v1alpha1.AnalysisRun, metrics []v1alpha1.Metric, dryRunMetricsMap map[string]*v1alpha1.DryRun) (v1alpha1.AnalysisPhase, string) {
var worstStatus v1alpha1.AnalysisPhase
var worstMessage string
terminating := analysisutil.IsTerminating(run)
@@ -436,7 +436,7 @@ func (c *Controller) assessRunStatus(run *v1alpha1.AnalysisRun, metrics []v1alph

// Iterate all metrics and update `MetricResult.Phase` fields based on latest measurement(s)
for _, metric := range metrics {
if dryRunMetricsMap[metric.Name] {
if dryRunMetricsMap[metric.Name] != nil {
log.Infof("Metric '%s' is running in the Dry-Run mode.", metric.Name)
dryRunSummary.Count++
} else {
@@ -468,7 +468,7 @@ func (c *Controller) assessRunStatus(run *v1alpha1.AnalysisRun, metrics []v1alph
phase, message := assessMetricFailureInconclusiveOrError(metric, *result)
// NOTE: We don't care about the status if the metric is marked as a Dry-Run
// otherwise, remember the worst status of all completed metric results
if !dryRunMetricsMap[metric.Name] {
if dryRunMetricsMap[metric.Name] == nil {
if worstStatus == "" || analysisutil.IsWorse(worstStatus, metricStatus) {
worstStatus = metricStatus
if message != "" {
@@ -693,7 +693,7 @@ func calculateNextReconcileTime(run *v1alpha1.AnalysisRun, metrics []v1alpha1.Me
}

// garbageCollectMeasurements trims the measurement history to the specified limit and GCs old measurements
func (c *Controller) garbageCollectMeasurements(run *v1alpha1.AnalysisRun, limit int) error {
func (c *Controller) garbageCollectMeasurements(run *v1alpha1.AnalysisRun, limit int, dryRunMetricsMap map[string]*v1alpha1.DryRun) error {
var errors []error

metricsByName := make(map[string]v1alpha1.Metric)
@@ -703,7 +703,12 @@ func (c *Controller) garbageCollectMeasurements(run *v1alpha1.AnalysisRun, limit

for i, result := range run.Status.MetricResults {
length := len(result.Measurements)
if length > limit {
dryRunObject, metricRunningInDryRun := dryRunMetricsMap[result.Name]
measurementsLimit := limit
if metricRunningInDryRun && dryRunObject.MeasurementsLength > 0 {
measurementsLimit = dryRunObject.MeasurementsLength
}
if length > measurementsLimit {
metric, ok := metricsByName[result.Name]
if !ok {
continue
@@ -714,11 +719,11 @@
errors = append(errors, err)
continue
}
err = provider.GarbageCollect(run, metric, limit)
err = provider.GarbageCollect(run, metric, measurementsLimit)
if err != nil {
return err
}
result.Measurements = result.Measurements[length-limit : length]
result.Measurements = result.Measurements[length-measurementsLimit : length]
}
run.Status.MetricResults[i] = result
}
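To make the trimming semantics above concrete, here is a minimal sketch (not part of the commit): `result.Measurements[length-measurementsLimit : length]` drops the oldest entries and keeps the newest, which is exactly what the updated `TestTrimMeasurementHistory` cases below assert.

```go
package main

import "fmt"

func main() {
	// Values mirror TestTrimMeasurementHistory below: oldest ("1") to newest ("3").
	measurements := []string{"1", "2", "3"}
	limit := 2
	if length := len(measurements); length > limit {
		// Same slice expression as garbageCollectMeasurements:
		// keep only the newest `limit` measurements.
		measurements = measurements[length-limit : length]
	}
	fmt.Println(measurements) // [2 3]
}
```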
52 changes: 44 additions & 8 deletions analysis/analysis_test.go
@@ -446,7 +446,7 @@ func TestAssessRunStatus(t *testing.T) {
},
},
}
status, message := c.assessRunStatus(run, run.Spec.Metrics, map[string]bool{})
status, message := c.assessRunStatus(run, run.Spec.Metrics, map[string]*v1alpha1.DryRun{})
assert.Equal(t, v1alpha1.AnalysisPhaseRunning, status)
assert.Equal(t, "", message)
}
@@ -465,7 +465,7 @@ func TestAssessRunStatus(t *testing.T) {
},
},
}
status, message := c.assessRunStatus(run, run.Spec.Metrics, map[string]bool{})
status, message := c.assessRunStatus(run, run.Spec.Metrics, map[string]*v1alpha1.DryRun{})
assert.Equal(t, v1alpha1.AnalysisPhaseFailed, status)
assert.Equal(t, "", message)
}
@@ -519,7 +519,7 @@ func TestAssessRunStatusUpdateResult(t *testing.T) {
},
},
}
status, message := c.assessRunStatus(run, run.Spec.Metrics, map[string]bool{})
status, message := c.assessRunStatus(run, run.Spec.Metrics, map[string]*v1alpha1.DryRun{})
assert.Equal(t, v1alpha1.AnalysisPhaseRunning, status)
assert.Equal(t, "", message)
assert.Equal(t, v1alpha1.AnalysisPhaseFailed, run.Status.MetricResults[1].Phase)
@@ -1055,7 +1055,7 @@ func TestTrimMeasurementHistory(t *testing.T) {

{
run := newRun()
c.garbageCollectMeasurements(run, 2)
c.garbageCollectMeasurements(run, 2, map[string]*v1alpha1.DryRun{})
assert.Len(t, run.Status.MetricResults[0].Measurements, 1)
assert.Equal(t, "1", run.Status.MetricResults[0].Measurements[0].Value)
assert.Len(t, run.Status.MetricResults[1].Measurements, 2)
@@ -1064,12 +1064,36 @@
}
{
run := newRun()
c.garbageCollectMeasurements(run, 1)
c.garbageCollectMeasurements(run, 1, map[string]*v1alpha1.DryRun{})
assert.Len(t, run.Status.MetricResults[0].Measurements, 1)
assert.Equal(t, "1", run.Status.MetricResults[0].Measurements[0].Value)
assert.Len(t, run.Status.MetricResults[1].Measurements, 1)
assert.Equal(t, "3", run.Status.MetricResults[1].Measurements[0].Value)
}
{
run := newRun()
var dryRunMetricsMap = map[string]*v1alpha1.DryRun{}
dryRunMetricsMap["metric2"] = &v1alpha1.DryRun{MetricName: "*", MeasurementsLength: 2}
err := c.garbageCollectMeasurements(run, 1, dryRunMetricsMap)
assert.Nil(t, err)
assert.Len(t, run.Status.MetricResults[0].Measurements, 1)
assert.Equal(t, "1", run.Status.MetricResults[0].Measurements[0].Value)
assert.Len(t, run.Status.MetricResults[1].Measurements, 2)
assert.Equal(t, "2", run.Status.MetricResults[1].Measurements[0].Value)
assert.Equal(t, "3", run.Status.MetricResults[1].Measurements[1].Value)
}
{
run := newRun()
var dryRunMetricsMap = map[string]*v1alpha1.DryRun{}
dryRunMetricsMap["metric2"] = &v1alpha1.DryRun{MetricName: "metric2", MeasurementsLength: 2}
err := c.garbageCollectMeasurements(run, 1, dryRunMetricsMap)
assert.Nil(t, err)
assert.Len(t, run.Status.MetricResults[0].Measurements, 1)
assert.Equal(t, "1", run.Status.MetricResults[0].Measurements[0].Value)
assert.Len(t, run.Status.MetricResults[1].Measurements, 2)
assert.Equal(t, "2", run.Status.MetricResults[1].Measurements[0].Value)
assert.Equal(t, "3", run.Status.MetricResults[1].Measurements[1].Value)
}
}

func TestResolveMetricArgsUnableToSubstitute(t *testing.T) {
@@ -1418,7 +1442,11 @@ func StartAssessRunStatusErrorMessageAnalysisPhaseFail(t *testing.T, isDryRun bo

run := newTerminatingRun(v1alpha1.AnalysisPhaseFailed, isDryRun)
run.Status.MetricResults[0].Phase = v1alpha1.AnalysisPhaseSuccessful
status, message := c.assessRunStatus(run, run.Spec.Metrics, map[string]bool{"run-forever": isDryRun, "failed-metric": isDryRun})
dryRunMetricsMap := map[string]*v1alpha1.DryRun{}
if isDryRun {
dryRunMetricsMap = map[string]*v1alpha1.DryRun{"run-forever": {MetricName: "run-forever"}, "failed-metric": {MetricName: "failed-metric"}}
}
status, message := c.assessRunStatus(run, run.Spec.Metrics, dryRunMetricsMap)
return status, message, run.Status.DryRunSummary
}

@@ -1458,7 +1486,11 @@ func StartAssessRunStatusErrorMessageFromProvider(t *testing.T, providerMessage
run := newTerminatingRun(v1alpha1.AnalysisPhaseFailed, isDryRun)
run.Status.MetricResults[0].Phase = v1alpha1.AnalysisPhaseSuccessful // All metrics must complete, or assessRunStatus will not return message
run.Status.MetricResults[1].Message = providerMessage
status, message := c.assessRunStatus(run, run.Spec.Metrics, map[string]bool{"run-forever": isDryRun, "failed-metric": isDryRun})
dryRunMetricsMap := map[string]*v1alpha1.DryRun{}
if isDryRun {
dryRunMetricsMap = map[string]*v1alpha1.DryRun{"run-forever": {MetricName: "run-forever"}, "failed-metric": {MetricName: "failed-metric"}}
}
status, message := c.assessRunStatus(run, run.Spec.Metrics, dryRunMetricsMap)
return status, message, run.Status.DryRunSummary
}

@@ -1503,7 +1535,11 @@ func StartAssessRunStatusMultipleFailures(t *testing.T, isDryRun bool) (v1alpha1
run := newTerminatingRun(v1alpha1.AnalysisPhaseFailed, isDryRun)
run.Status.MetricResults[0].Phase = v1alpha1.AnalysisPhaseFailed
run.Status.MetricResults[0].Failed = 1
status, message := c.assessRunStatus(run, run.Spec.Metrics, map[string]bool{"run-forever": isDryRun, "failed-metric": isDryRun})
dryRunMetricsMap := map[string]*v1alpha1.DryRun{}
if isDryRun {
dryRunMetricsMap = map[string]*v1alpha1.DryRun{"run-forever": {MetricName: "run-forever"}, "failed-metric": {MetricName: "failed-metric"}}
}
status, message := c.assessRunStatus(run, run.Spec.Metrics, dryRunMetricsMap)
return status, message, run.Status.DryRunSummary
}

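One detail worth noting in the two new `TestTrimMeasurementHistory` cases above: they differ only in the `MetricName` they set (`"*"` versus `"metric2"`) yet expect identical results, because `garbageCollectMeasurements` looks up the override by map key (`result.Name`), not by the `MetricName` field.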
16 changes: 10 additions & 6 deletions controller/metrics/analysis.go
@@ -91,12 +91,16 @@ func collectAnalysisRuns(ch chan<- prometheus.Metric, ar *v1alpha1.AnalysisRun)
if metricResult != nil {
calculatedPhase = metricResult.Phase
}
addGauge(MetricAnalysisRunMetricPhase, boolFloat64(calculatedPhase == v1alpha1.AnalysisPhasePending || calculatedPhase == ""), metric.Name, metricType, fmt.Sprint(dryRunMetricsMap[metric.Name]), string(v1alpha1.AnalysisPhasePending))
addGauge(MetricAnalysisRunMetricPhase, boolFloat64(calculatedPhase == v1alpha1.AnalysisPhaseError), metric.Name, metricType, fmt.Sprint(dryRunMetricsMap[metric.Name]), string(v1alpha1.AnalysisPhaseError))
addGauge(MetricAnalysisRunMetricPhase, boolFloat64(calculatedPhase == v1alpha1.AnalysisPhaseFailed), metric.Name, metricType, fmt.Sprint(dryRunMetricsMap[metric.Name]), string(v1alpha1.AnalysisPhaseFailed))
addGauge(MetricAnalysisRunMetricPhase, boolFloat64(calculatedPhase == v1alpha1.AnalysisPhaseSuccessful), metric.Name, metricType, fmt.Sprint(dryRunMetricsMap[metric.Name]), string(v1alpha1.AnalysisPhaseSuccessful))
addGauge(MetricAnalysisRunMetricPhase, boolFloat64(calculatedPhase == v1alpha1.AnalysisPhaseRunning), metric.Name, metricType, fmt.Sprint(dryRunMetricsMap[metric.Name]), string(v1alpha1.AnalysisPhaseRunning))
addGauge(MetricAnalysisRunMetricPhase, boolFloat64(calculatedPhase == v1alpha1.AnalysisPhaseInconclusive), metric.Name, metricType, fmt.Sprint(dryRunMetricsMap[metric.Name]), string(v1alpha1.AnalysisPhaseInconclusive))
isDryRunMetric := "No"
if dryRunMetricsMap[metric.Name] != nil {
isDryRunMetric = "Yes"
}
addGauge(MetricAnalysisRunMetricPhase, boolFloat64(calculatedPhase == v1alpha1.AnalysisPhasePending || calculatedPhase == ""), metric.Name, metricType, fmt.Sprint(isDryRunMetric), string(v1alpha1.AnalysisPhasePending))
addGauge(MetricAnalysisRunMetricPhase, boolFloat64(calculatedPhase == v1alpha1.AnalysisPhaseError), metric.Name, metricType, fmt.Sprint(isDryRunMetric), string(v1alpha1.AnalysisPhaseError))
addGauge(MetricAnalysisRunMetricPhase, boolFloat64(calculatedPhase == v1alpha1.AnalysisPhaseFailed), metric.Name, metricType, fmt.Sprint(isDryRunMetric), string(v1alpha1.AnalysisPhaseFailed))
addGauge(MetricAnalysisRunMetricPhase, boolFloat64(calculatedPhase == v1alpha1.AnalysisPhaseSuccessful), metric.Name, metricType, fmt.Sprint(isDryRunMetric), string(v1alpha1.AnalysisPhaseSuccessful))
addGauge(MetricAnalysisRunMetricPhase, boolFloat64(calculatedPhase == v1alpha1.AnalysisPhaseRunning), metric.Name, metricType, fmt.Sprint(isDryRunMetric), string(v1alpha1.AnalysisPhaseRunning))
addGauge(MetricAnalysisRunMetricPhase, boolFloat64(calculatedPhase == v1alpha1.AnalysisPhaseInconclusive), metric.Name, metricType, fmt.Sprint(isDryRunMetric), string(v1alpha1.AnalysisPhaseInconclusive))
}
}

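The change above also switches the `dry_run` metric label from the stringified booleans `"false"`/`"true"` to `"No"`/`"Yes"`, which the updated expected output in the test file below reflects. (Since `isDryRunMetric` is already a string, the surrounding `fmt.Sprint` calls are now no-ops.) A minimal sketch of the label derivation, with a plain string set standing in for `map[string]*v1alpha1.DryRun`:

```go
package main

import "fmt"

// dryRunLabel mirrors the nil-check added to collectAnalysisRuns: a metric
// present in the dry-run map is labeled "Yes", every other metric "No".
func dryRunLabel(metric string, dryRunMetrics map[string]bool) string {
	if dryRunMetrics[metric] {
		return "Yes"
	}
	return "No"
}

func main() {
	dryRunMetrics := map[string]bool{"web-metric-2": true}
	fmt.Println(dryRunLabel("web-metric-1", dryRunMetrics)) // dry_run="No"
	fmt.Println(dryRunLabel("web-metric-2", dryRunMetrics)) // dry_run="Yes"
}
```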
49 changes: 38 additions & 11 deletions controller/metrics/analysis_test.go
@@ -22,13 +22,21 @@ metadata:
name: http-benchmark-test-tr8rn
namespace: jesse-test
spec:
dryRun:
- metricName: web-metric-2
metrics:
- name: webmetric
- name: web-metric-1
provider:
web:
jsonPath: .
url: https://www.google.com
successCondition: "true"
- name: web-metric-2
provider:
web:
jsonPath: .
url: https://www.msn.com
successCondition: "false"
status:
metricResults:
- consecutiveError: 5
@@ -39,7 +47,17 @@
of value'
phase: Error
startedAt: "2020-03-16T20:02:14Z"
name: webmetric
name: web-metric-1
phase: Error
- consecutiveError: 5
error: 5
measurements:
- finishedAt: "2020-03-16T20:02:15Z"
message: 'Could not parse JSON body: invalid character ''<'' looking for beginning
of value'
phase: Error
startedAt: "2020-03-16T20:02:14Z"
name: web-metric-2
phase: Error
phase: Error
startedAt: "2020-03-16T20:02:15Z"
@@ -53,6 +71,8 @@ metadata:
name: http-benchmark-test
namespace: jesse-test
spec:
dryRun:
- metricName: web-metric-2
metrics:
- name: web-metric-1
provider:
@@ -61,7 +81,6 @@ spec:
url: https://www.google.com
successCondition: "true"
- name: web-metric-2
dryRun: true
provider:
web:
jsonPath: .
@@ -76,6 +95,8 @@
creationTimestamp: "2020-03-16T20:01:13Z"
name: http-benchmark-cluster-test
spec:
dryRun:
- metricName: web-metric-2
metrics:
- name: web-metric-1
provider:
@@ -84,7 +105,6 @@ spec:
url: https://www.google.com
successCondition: "true"
- name: web-metric-2
dryRun: true
provider:
web:
jsonPath: .
@@ -97,15 +117,22 @@ const expectedAnalysisRunResponse = `# HELP analysis_run_info Information about
analysis_run_info{name="http-benchmark-test-tr8rn",namespace="jesse-test",phase="Error"} 1
# HELP analysis_run_metric_phase Information on the duration of a specific metric in the Analysis Run
# TYPE analysis_run_metric_phase gauge
analysis_run_metric_phase{dry_run="false",metric="webmetric",name="http-benchmark-test-tr8rn",namespace="jesse-test",phase="Error",type="Web"} 1
analysis_run_metric_phase{dry_run="false",metric="webmetric",name="http-benchmark-test-tr8rn",namespace="jesse-test",phase="Failed",type="Web"} 0
analysis_run_metric_phase{dry_run="false",metric="webmetric",name="http-benchmark-test-tr8rn",namespace="jesse-test",phase="Inconclusive",type="Web"} 0
analysis_run_metric_phase{dry_run="false",metric="webmetric",name="http-benchmark-test-tr8rn",namespace="jesse-test",phase="Pending",type="Web"} 0
analysis_run_metric_phase{dry_run="false",metric="webmetric",name="http-benchmark-test-tr8rn",namespace="jesse-test",phase="Running",type="Web"} 0
analysis_run_metric_phase{dry_run="false",metric="webmetric",name="http-benchmark-test-tr8rn",namespace="jesse-test",phase="Successful",type="Web"} 0
analysis_run_metric_phase{dry_run="No",metric="web-metric-1",name="http-benchmark-test-tr8rn",namespace="jesse-test",phase="Error",type="Web"} 1
analysis_run_metric_phase{dry_run="No",metric="web-metric-1",name="http-benchmark-test-tr8rn",namespace="jesse-test",phase="Failed",type="Web"} 0
analysis_run_metric_phase{dry_run="No",metric="web-metric-1",name="http-benchmark-test-tr8rn",namespace="jesse-test",phase="Inconclusive",type="Web"} 0
analysis_run_metric_phase{dry_run="No",metric="web-metric-1",name="http-benchmark-test-tr8rn",namespace="jesse-test",phase="Pending",type="Web"} 0
analysis_run_metric_phase{dry_run="No",metric="web-metric-1",name="http-benchmark-test-tr8rn",namespace="jesse-test",phase="Running",type="Web"} 0
analysis_run_metric_phase{dry_run="No",metric="web-metric-1",name="http-benchmark-test-tr8rn",namespace="jesse-test",phase="Successful",type="Web"} 0
analysis_run_metric_phase{dry_run="Yes",metric="web-metric-2",name="http-benchmark-test-tr8rn",namespace="jesse-test",phase="Error",type="Web"} 1
analysis_run_metric_phase{dry_run="Yes",metric="web-metric-2",name="http-benchmark-test-tr8rn",namespace="jesse-test",phase="Failed",type="Web"} 0
analysis_run_metric_phase{dry_run="Yes",metric="web-metric-2",name="http-benchmark-test-tr8rn",namespace="jesse-test",phase="Inconclusive",type="Web"} 0
analysis_run_metric_phase{dry_run="Yes",metric="web-metric-2",name="http-benchmark-test-tr8rn",namespace="jesse-test",phase="Pending",type="Web"} 0
analysis_run_metric_phase{dry_run="Yes",metric="web-metric-2",name="http-benchmark-test-tr8rn",namespace="jesse-test",phase="Running",type="Web"} 0
analysis_run_metric_phase{dry_run="Yes",metric="web-metric-2",name="http-benchmark-test-tr8rn",namespace="jesse-test",phase="Successful",type="Web"} 0
# HELP analysis_run_metric_type Information on the type of a specific metric in the Analysis Runs
# TYPE analysis_run_metric_type gauge
analysis_run_metric_type{metric="webmetric",name="http-benchmark-test-tr8rn",namespace="jesse-test",type="Web"} 1
analysis_run_metric_type{metric="web-metric-1",name="http-benchmark-test-tr8rn",namespace="jesse-test",type="Web"} 1
analysis_run_metric_type{metric="web-metric-2",name="http-benchmark-test-tr8rn",namespace="jesse-test",type="Web"} 1
# HELP analysis_run_phase Information on the state of the Analysis Run (DEPRECATED - use analysis_run_info)
# TYPE analysis_run_phase gauge
analysis_run_phase{name="http-benchmark-test-tr8rn",namespace="jesse-test",phase="Error"} 1
8 changes: 6 additions & 2 deletions docs/features/analysis.md
@@ -554,9 +554,13 @@ out as inconclusive.
The following example queries Prometheus every 5 minutes to get the total number of 4XX and 5XX errors, and even if the
evaluation of the metric monitoring the 5XX error-rate fails, the analysis run will pass.

```yaml hl_lines="1 2"
```yaml hl_lines="1 2 3 4 5 6"
dryRun:
- metricName: total-5xx-errors
# `measurementsLength` can be used to retain more than ten results for the metrics running in the dry-run mode.
# Setting it to `0` will disable this option, and the controller will revert to the existing behavior of retaining
# the latest ten measurements.
measurementsLength: 25
metrics:
- name: total-5xx-errors
interval: 5m
@@ -713,7 +717,7 @@ A use case for having `Inconclusive` analysis runs are to enable Argo Rollouts t
whether or not measurement value is acceptable and decide to proceed or abort.

## Delay Analysis Runs
If the analysis run does not need to start immediately (i.e give the metric provider time to collect
If the analysis run does not need to start immediately (i.e. give the metric provider time to collect
metrics on the canary version), Analysis Runs can delay the specific metric analysis. Each metric
can be configured to have a different delay. In addition to the metric-specific delays, the rollouts
with background analysis can delay creating an analysis run until a certain step is reached