Add Metrics for Running PipelineRuns at Pipeline and Namespace level #8280

Merged · 1 commit · Oct 17, 2024
1 change: 1 addition & 0 deletions config/config-observability.yaml
@@ -59,3 +59,4 @@ data:
metrics.pipelinerun.level: "pipeline"
metrics.pipelinerun.duration-type: "histogram"
metrics.count.enable-reason: "false"
metrics.running-pipelinerun.level: ""
31 changes: 18 additions & 13 deletions docs/metrics.md
@@ -41,26 +41,31 @@ A sample config-map has been provided as [config-observability](./../config/conf
metrics.taskrun.level: "task"
metrics.taskrun.duration-type: "histogram"
metrics.pipelinerun.level: "pipeline"
metrics.running-pipelinerun.level: ""
metrics.pipelinerun.duration-type: "histogram"
metrics.count.enable-reason: "false"
```
The following values are available in the configmap:
| configmap data | value | description |
| -- | ----------- | ----------- |
| metrics.taskrun.level | `taskrun` | Level of metrics is taskrun |
| metrics.taskrun.level | `task` | Level of metrics is task and taskrun label isn't present in the metrics |
| metrics.taskrun.level | `namespace` | Level of metrics is namespace, and task and taskrun label isn't present in the metrics
| metrics.pipelinerun.level | `pipelinerun` | Level of metrics is pipelinerun |
| metrics.pipelinerun.level | `pipeline` | Level of metrics is pipeline and pipelinerun label isn't present in the metrics |
| metrics.pipelinerun.level | `namespace` | Level of metrics is namespace, pipeline and pipelinerun label isn't present in the metrics |
| metrics.taskrun.duration-type | `histogram` | `tekton_pipelines_controller_pipelinerun_taskrun_duration_seconds` and `tekton_pipelines_controller_taskrun_duration_seconds` is of type histogram |
| configmap data | value | description |
| -- | ----------- |--------------------------------------------------------------------------------------------------------------------------------------------------------------|
| metrics.taskrun.level | `taskrun` | Level of metrics is taskrun |
| metrics.taskrun.level | `task` | Level of metrics is task and taskrun label isn't present in the metrics |
| metrics.taskrun.level | `namespace` | Level of metrics is namespace, and task and taskrun label isn't present in the metrics |
| metrics.pipelinerun.level | `pipelinerun` | Level of metrics is pipelinerun |
| metrics.pipelinerun.level | `pipeline` | Level of metrics is pipeline and pipelinerun label isn't present in the metrics |
| metrics.pipelinerun.level | `namespace` | Level of metrics is namespace, pipeline and pipelinerun label isn't present in the metrics |
| metrics.running-pipelinerun.level | `pipelinerun` | Level of running-pipelinerun metrics is pipelinerun |
| metrics.running-pipelinerun.level | `pipeline` | Level of running-pipelinerun metrics is pipeline and pipelinerun label isn't present in the metrics |
| metrics.running-pipelinerun.level | `namespace` | Level of running-pipelinerun metrics is namespace, pipeline and pipelinerun label isn't present in the metrics |
| metrics.running-pipelinerun.level | `` | Level of running-pipelinerun metrics is cluster; namespace, pipeline and pipelinerun labels aren't present in the metrics |
| metrics.taskrun.duration-type | `histogram` | `tekton_pipelines_controller_pipelinerun_taskrun_duration_seconds` and `tekton_pipelines_controller_taskrun_duration_seconds` is of type histogram |
| metrics.taskrun.duration-type | `lastvalue` | `tekton_pipelines_controller_pipelinerun_taskrun_duration_seconds` and `tekton_pipelines_controller_taskrun_duration_seconds` is of type gauge or lastvalue |
| metrics.pipelinerun.duration-type | `histogram` | `tekton_pipelines_controller_pipelinerun_duration_seconds` is of type histogram |
| metrics.pipelinerun.duration-type | `lastvalue` | `tekton_pipelines_controller_pipelinerun_duration_seconds` is of type gauge or lastvalue |
| metrics.count.enable-reason | `false` | Sets if the `reason` label should be included on count metrics |
| metrics.taskrun.throttle.enable-namespace | `false` | Sets if the `namespace` label should be included on the `tekton_pipelines_controller_running_taskruns_throttled_by_quota` metric |
| metrics.pipelinerun.duration-type | `histogram` | `tekton_pipelines_controller_pipelinerun_duration_seconds` is of type histogram |
| metrics.pipelinerun.duration-type | `lastvalue` | `tekton_pipelines_controller_pipelinerun_duration_seconds` is of type gauge or lastvalue |
| metrics.count.enable-reason | `false` | Sets if the `reason` label should be included on count metrics |
| metrics.taskrun.throttle.enable-namespace | `false` | Sets if the `namespace` label should be included on the `tekton_pipelines_controller_running_taskruns_throttled_by_quota` metric |

Histogram value isn't available when pipelinerun or taskrun labels are selected; the Lastvalue or Gauge type is provided instead. A histogram would serve no purpose because it would generate a single bar. TaskRun and PipelineRun level metrics aren't recommended because they lead to unbounded cardinality, which degrades the observability database.
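To make the effect of `metrics.running-pipelinerun.level` concrete, here is a minimal sketch in plain Go of how each level maps to the tag keys attached to the running PipelineRuns gauge. It mirrors the switch added to `viewRegister` in this PR, but the helper name `runningPRTagKeys` is illustrative and not part of the Tekton API:

```go
package main

import "fmt"

// runningPRTagKeys is an illustrative helper (not part of Tekton) that mirrors
// the switch added to viewRegister in this PR: it returns the tag keys carried
// by the running PipelineRuns gauge for a given configured level.
func runningPRTagKeys(level string) []string {
	switch level {
	case "pipelinerun":
		return []string{"pipelinerun", "pipeline", "namespace"}
	case "pipeline":
		return []string{"pipeline", "namespace"}
	case "namespace":
		return []string{"namespace"}
	default: // "" means cluster level: a single series with no extra labels
		return nil
	}
}

func main() {
	for _, level := range []string{"pipelinerun", "pipeline", "namespace", ""} {
		fmt.Printf("level=%q -> tags=%v\n", level, runningPRTagKeys(level))
	}
}
```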

11 changes: 11 additions & 0 deletions pkg/apis/config/metrics.go
@@ -29,6 +29,9 @@ const (
// metricsPipelinerunLevel determines to what level to aggregate metrics
// for pipelinerun
metricsPipelinerunLevelKey = "metrics.pipelinerun.level"
// metricsRunningPipelinerunLevelKey determines to what level to aggregate metrics
// for running pipelineruns
metricsRunningPipelinerunLevelKey = "metrics.running-pipelinerun.level"
// metricsDurationTaskrunType determines what type of
// metrics to use for aggregating duration for taskrun
metricsDurationTaskrunType = "metrics.taskrun.duration-type"
@@ -55,6 +58,9 @@ const (
// DefaultPipelinerunLevel determines to what level to aggregate metrics
// when it isn't specified in configmap
DefaultPipelinerunLevel = PipelinerunLevelAtPipeline
// DefaultRunningPipelinerunLevel determines to what level to aggregate metrics
// when it isn't specified in configmap
DefaultRunningPipelinerunLevel = ""
// PipelinerunLevelAtPipelinerun specify that aggregation will be done at
// pipelinerun level
PipelinerunLevelAtPipelinerun = "pipelinerun"
@@ -96,6 +102,7 @@ var DefaultMetrics, _ = newMetricsFromMap(map[string]string{})
type Metrics struct {
TaskrunLevel string
PipelinerunLevel string
RunningPipelinerunLevel string
DurationTaskrunType string
DurationPipelinerunType string
CountWithReason bool
@@ -130,6 +137,7 @@ func newMetricsFromMap(cfgMap map[string]string) (*Metrics, error) {
tc := Metrics{
TaskrunLevel: DefaultTaskrunLevel,
PipelinerunLevel: DefaultPipelinerunLevel,
RunningPipelinerunLevel: DefaultRunningPipelinerunLevel,
DurationTaskrunType: DefaultDurationTaskrunType,
DurationPipelinerunType: DefaultDurationPipelinerunType,
CountWithReason: false,
@@ -143,6 +151,9 @@ func newMetricsFromMap(cfgMap map[string]string) (*Metrics, error) {
if pipelinerunLevel, ok := cfgMap[metricsPipelinerunLevelKey]; ok {
tc.PipelinerunLevel = pipelinerunLevel
}
if runningPipelinerunLevel, ok := cfgMap[metricsRunningPipelinerunLevelKey]; ok {
tc.RunningPipelinerunLevel = runningPipelinerunLevel
}
if durationTaskrun, ok := cfgMap[metricsDurationTaskrunType]; ok {
tc.DurationTaskrunType = durationTaskrun
}
5 changes: 5 additions & 0 deletions pkg/apis/config/metrics_test.go
@@ -36,6 +36,7 @@ func TestNewMetricsFromConfigMap(t *testing.T) {
expectedConfig: &config.Metrics{
TaskrunLevel: config.TaskrunLevelAtTaskrun,
PipelinerunLevel: config.PipelinerunLevelAtPipelinerun,
RunningPipelinerunLevel: config.DefaultRunningPipelinerunLevel,
DurationTaskrunType: config.DurationPipelinerunTypeHistogram,
DurationPipelinerunType: config.DurationPipelinerunTypeHistogram,
CountWithReason: false,
@@ -47,6 +48,7 @@
expectedConfig: &config.Metrics{
TaskrunLevel: config.TaskrunLevelAtNS,
PipelinerunLevel: config.PipelinerunLevelAtNS,
RunningPipelinerunLevel: config.PipelinerunLevelAtNS,
DurationTaskrunType: config.DurationTaskrunTypeHistogram,
DurationPipelinerunType: config.DurationPipelinerunTypeLastValue,
CountWithReason: false,
@@ -58,6 +60,7 @@
expectedConfig: &config.Metrics{
TaskrunLevel: config.TaskrunLevelAtNS,
PipelinerunLevel: config.PipelinerunLevelAtNS,
RunningPipelinerunLevel: config.DefaultRunningPipelinerunLevel,
DurationTaskrunType: config.DurationTaskrunTypeHistogram,
DurationPipelinerunType: config.DurationPipelinerunTypeLastValue,
CountWithReason: true,
@@ -69,6 +72,7 @@
expectedConfig: &config.Metrics{
TaskrunLevel: config.TaskrunLevelAtNS,
PipelinerunLevel: config.PipelinerunLevelAtNS,
RunningPipelinerunLevel: config.PipelinerunLevelAtPipeline,
DurationTaskrunType: config.DurationTaskrunTypeHistogram,
DurationPipelinerunType: config.DurationPipelinerunTypeLastValue,
CountWithReason: true,
@@ -88,6 +92,7 @@ func TestNewMetricsFromEmptyConfigMap(t *testing.T) {
expectedConfig := &config.Metrics{
TaskrunLevel: config.TaskrunLevelAtTask,
PipelinerunLevel: config.PipelinerunLevelAtPipeline,
RunningPipelinerunLevel: config.DefaultRunningPipelinerunLevel,
DurationTaskrunType: config.DurationPipelinerunTypeHistogram,
DurationPipelinerunType: config.DurationPipelinerunTypeHistogram,
CountWithReason: false,
@@ -27,4 +27,5 @@ data:
metrics.taskrun.level: "namespace"
metrics.taskrun.duration-type: "histogram"
metrics.pipelinerun.level: "namespace"
metrics.running-pipelinerun.level: "namespace"
metrics.pipelinerun.duration-type: "lastvalue"
@@ -27,6 +27,7 @@ data:
metrics.taskrun.level: "namespace"
metrics.taskrun.duration-type: "histogram"
metrics.pipelinerun.level: "namespace"
metrics.running-pipelinerun.level: "pipeline"
metrics.pipelinerun.duration-type: "lastvalue"
metrics.count.enable-reason: "true"
metrics.taskrun.throttle.enable-namespace: "true"
64 changes: 59 additions & 5 deletions pkg/pipelinerunmetrics/metrics.go
@@ -39,6 +39,13 @@ import (
"knative.dev/pkg/metrics"
)

const (
runningPRLevelPipelinerun = "pipelinerun"
runningPRLevelPipeline = "pipeline"
runningPRLevelNamespace = "namespace"
runningPRLevelCluster = ""
)

var (
pipelinerunTag = tag.MustNewKey("pipelinerun")
pipelineTag = tag.MustNewKey("pipeline")
@@ -134,6 +141,7 @@ func NewRecorder(ctx context.Context) (*Recorder, error) {
}

cfg := config.FromContextOrDefaults(ctx)
r.cfg = cfg.Metrics
errRegistering = viewRegister(cfg.Metrics)
if errRegistering != nil {
r.initialized = false
@@ -149,7 +157,6 @@ func viewRegister(cfg *config.Metrics) error {
defer r.mutex.Unlock()

var prunTag []tag.Key

switch cfg.PipelinerunLevel {
case config.PipelinerunLevelAtPipelinerun:
prunTag = []tag.Key{pipelinerunTag, pipelineTag}
@@ -164,6 +171,18 @@
return errors.New("invalid config for PipelinerunLevel: " + cfg.PipelinerunLevel)
}

var runningPRTag []tag.Key
switch cfg.RunningPipelinerunLevel {
case config.PipelinerunLevelAtPipelinerun:
runningPRTag = []tag.Key{pipelinerunTag, pipelineTag, namespaceTag}
case config.PipelinerunLevelAtPipeline:
runningPRTag = []tag.Key{pipelineTag, namespaceTag}
case config.PipelinerunLevelAtNS:
runningPRTag = []tag.Key{namespaceTag}
default:
runningPRTag = []tag.Key{}
}

distribution := view.Distribution(10, 30, 60, 300, 900, 1800, 3600, 5400, 10800, 21600, 43200, 86400)

if cfg.PipelinerunLevel == config.PipelinerunLevelAtPipelinerun {
@@ -213,6 +232,7 @@ func viewRegister(cfg *config.Metrics) error {
Description: runningPRs.Description(),
Measure: runningPRs,
Aggregation: view.LastValue(),
TagKeys: runningPRTag,
}

runningPRsWaitingOnPipelineResolutionCountView = &view.View{
@@ -326,7 +346,7 @@ func (r *Recorder) updateConfig(cfg *config.Metrics) {

// DurationAndCount logs the duration of PipelineRun execution and
// count for number of PipelineRuns succeed or failed
// returns an error if its failed to log the metrics
// returns an error if it fails to log the metrics
func (r *Recorder) DurationAndCount(pr *v1.PipelineRun, beforeCondition *apis.Condition) error {
if !r.initialized {
return fmt.Errorf("ignoring the metrics recording for %s , failed to initialize the metrics recorder", pr.Name)
@@ -379,11 +399,10 @@ func (r *Recorder) DurationAndCount(pr *v1.PipelineRun, beforeCondition *apis.Co
}

// RunningPipelineRuns logs the number of PipelineRuns running right now
// returns an error if its failed to log the metrics
// returns an error if it fails to log the metrics
func (r *Recorder) RunningPipelineRuns(lister listers.PipelineRunLister) error {
r.mutex.Lock()
defer r.mutex.Unlock()

if !r.initialized {
return errors.New("ignoring the metrics recording, failed to initialize the metrics recorder")
}
@@ -396,9 +415,38 @@ func (r *Recorder) RunningPipelineRuns(lister listers.PipelineRunLister) error {
var runningPipelineRuns int
var trsWaitResolvingTaskRef int
var prsWaitResolvingPipelineRef int
countMap := map[string]int{}

for _, pr := range prs {
pipelineName := getPipelineTagName(pr)
pipelineRunKey := ""
mutators := []tag.Mutator{
tag.Insert(namespaceTag, pr.Namespace),
tag.Insert(pipelineTag, pipelineName),
tag.Insert(pipelinerunTag, pr.Name),
}
if r.cfg != nil {
switch r.cfg.RunningPipelinerunLevel {
case runningPRLevelPipelinerun:
pipelineRunKey = pipelineRunKey + "#" + pr.Name
fallthrough
case runningPRLevelPipeline:
pipelineRunKey = pipelineRunKey + "#" + pipelineName
fallthrough
case runningPRLevelNamespace:
pipelineRunKey = pipelineRunKey + "#" + pr.Namespace
case runningPRLevelCluster:
default:
return fmt.Errorf("RunningPipelineRunLevel value \"%s\" is not valid ", r.cfg.RunningPipelinerunLevel)
}
}
ctx_, err_ := tag.New(context.Background(), mutators...)
if err_ != nil {
return err
}
if !pr.IsDone() {
countMap[pipelineRunKey]++
metrics.Record(ctx_, runningPRs.M(float64(countMap[pipelineRunKey])))
Review thread:

Member: The countMap is only complete at the end of the loop, so why is the metric reported here?

Member: The metric might have different tags. For instance, if we configure the metric to be at "namespace" level, it will have one counter for each namespace. How does the metric API know which tag value we're counting here? Shouldn't we pass the pipelineRunKey somehow as well?

Contributor Author (re: tags): pipelineRunKey is generated based on the configuration; if the config is at namespace level, then the namespace is the key.

Contributor Author (re: reporting inside the loop): There is no way of knowing whether there are more PipelineRuns for the same key. If more PipelineRuns turn up for the same key, the metric is reported again with the updated value on the next iteration, and since only the last value of the metric is kept, it is updated accordingly.

Member: As discussed, we'll keep this as it is for now. I still think that recording the correct value once would be better than recording incremental values, but we can address that separately.
runningPipelineRuns++
succeedCondition := pr.Status.GetCondition(apis.ConditionSucceeded)
if succeedCondition != nil && succeedCondition.Status == corev1.ConditionUnknown {
Expand All @@ -409,6 +457,13 @@ func (r *Recorder) RunningPipelineRuns(lister listers.PipelineRunLister) error {
prsWaitResolvingPipelineRef++
}
}
} else {
// In case there are no running PipelineRuns for the pipelineRunKey, set the metric value to 0 to ensure
// the metric is set for the key.
if _, exists := countMap[pipelineRunKey]; !exists {
countMap[pipelineRunKey] = 0
Review thread:

Member: The countMap is only complete at the end of the loop, so why is the metric reported here?

Contributor Author: Same reason as above; this code resets a metric that was already reported. Let's say we have 4 PipelineRuns running for a key at a given point in time, so the metric is reported as 4. In the next call to the function all of those PipelineRuns have completed, so they won't be reported in the if block, which means the graph would still show 4, which is wrong data. Resetting the value to 0 here solves that: if there is any running PipelineRun for the key, it is handled by the if block; if not, we set it to 0 anyway.
metrics.Record(ctx_, runningPRs.M(0))
}
}
}

Expand All @@ -421,7 +476,6 @@ func (r *Recorder) RunningPipelineRuns(lister listers.PipelineRunLister) error {
metrics.Record(ctx, runningPRsWaitingOnTaskResolutionCount.M(float64(trsWaitResolvingTaskRef)))
metrics.Record(ctx, runningPRsWaitingOnTaskResolution.M(float64(trsWaitResolvingTaskRef)))
metrics.Record(ctx, runningPRsCount.M(float64(runningPipelineRuns)))
metrics.Record(ctx, runningPRs.M(float64(runningPipelineRuns)))

return nil
}
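As a companion to the review threads above, here is a minimal, self-contained sketch of the aggregation logic in `RunningPipelineRuns`: deriving the key from the configured level via fallthrough, counting running PipelineRuns per key, and resetting keys with only completed runs to zero. The `pr` struct, `keyFor`, and `countRunning` are simplified stand-ins for illustration only, not the PR code or the Tekton API.

```go
package main

import "fmt"

// pr is a simplified stand-in for a v1.PipelineRun.
type pr struct {
	name, pipeline, namespace string
	done                      bool
}

// keyFor mirrors the fallthrough switch in RunningPipelineRuns: the key grows
// from the most specific component down to the namespace, depending on level.
func keyFor(level string, p pr) string {
	key := ""
	switch level {
	case "pipelinerun":
		key += "#" + p.name
		fallthrough
	case "pipeline":
		key += "#" + p.pipeline
		fallthrough
	case "namespace":
		key += "#" + p.namespace
	}
	return key // "" for cluster level
}

// countRunning counts running PipelineRuns per key; a key seen only for
// completed runs is reset to 0 so a previously reported gauge does not go stale.
func countRunning(level string, prs []pr) map[string]int {
	counts := map[string]int{}
	for _, p := range prs {
		k := keyFor(level, p)
		if !p.done {
			counts[k]++ // in the PR, the gauge is recorded with this value
		} else if _, ok := counts[k]; !ok {
			counts[k] = 0 // the reset-to-zero discussed in the review thread
		}
	}
	return counts
}

func main() {
	prs := []pr{
		{name: "build-1", pipeline: "build", namespace: "dev"},
		{name: "build-2", pipeline: "build", namespace: "dev"},
		{name: "deploy-1", pipeline: "deploy", namespace: "prod", done: true},
	}
	fmt.Println(countRunning("namespace", prs)) // map[#dev:2 #prod:0]
}
```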