# test/promql/tests/data_plane/pipeline_deadlock_crashloop_test.yaml
#
# promtool unit tests for the Tekton pipeline restart and deadlock alerts.
# (The path above used to be a bare scalar, which makes the document invalid
# YAML in front of the top-level mapping; it is kept here as a comment.)

# Interval at which the rules under test are evaluated.
evaluation_interval: 1m

# Rule file(s) whose alerting rules are exercised by the tests below.
rule_files:
  - prometheus.pipeline_alerts.yaml

tests:
  # ----- Tekton Controller Rapid Restart Tests ----
  - interval: 1m
    input_series:

      # see https://prometheus.io/docs/prometheus/latest/configuration/unit_testing_rules/ for explanations of the expanding notation used for the values
      # The tekton pipelines controller pod is restarting in rapid succession (restart counter grows by 1 every minute),
      # so an alert is expected. Other restarting pods in the same namespace (here the triggers controller, whose
      # counter grows by 2 every minute) are currently not checked.
      # NOTE(review): confirm against the rule how the "15 times" in the expected description is derived from these series.
      - series: 'kube_pod_container_status_restarts_total{namespace="openshift-pipelines", pod="tekton-pipelines-controller-a", source_cluster="cluster01"}'
        values: '1+1x780'
      - series: 'kube_pod_container_status_restarts_total{namespace="openshift-pipelines", pod="tekton-triggers-controller-a", source_cluster="cluster01"}'
        values: '2+2x780'

    alert_rule_test:
      # exactly one firing alert is expected for cluster01, with the labels and annotations listed below
      - eval_time: 10m
        alertname: PipelinePodsRepeatedRestarts
        exp_alerts:
          - exp_labels:
              severity: critical
              slo: "true"
              source_cluster: cluster01
            exp_annotations:
              summary: >-
                Tekton controller is rapidly restarting.
              description: >-
                Tekton controllers on cluster cluster01 have restarted 15 times recently.
              alert_team_handle: <!subteam^S04PYECHCCU>
              team: pipelines
              runbook_url: https://gitlab.cee.redhat.com/konflux/docs/sop/-/blob/main/pipeline-service/sre/core-pipeline-controller-repeated-restarts.md

  - interval: 1m
    input_series:

      # see https://prometheus.io/docs/prometheus/latest/configuration/unit_testing_rules/ for explanations of the expanding notation used for the values
      # The tekton pipelines controller has restarted, but its restart counter stays constant at 1 for every sample,
      # i.e. it is not restarting fast enough, so it should not alert.
      - series: 'kube_pod_container_status_restarts_total{namespace="openshift-pipelines", pod="tekton-pipelines-controller-a", source_cluster="cluster01"}'
        values: '1x780'

    alert_rule_test:
      # exp_alerts is omitted on purpose: the alert must not be firing at eval_time
      - eval_time: 10m
        alertname: PipelinePodsRepeatedRestarts

  - interval: 1m
    input_series:

      # see https://prometheus.io/docs/prometheus/latest/configuration/unit_testing_rules/ for explanations of the expanding notation used for the values
      # The tekton pipelines controller is not experiencing any restarts (counter stays at 0), so it should not alert.
      - series: 'kube_pod_container_status_restarts_total{namespace="openshift-pipelines", pod="tekton-pipelines-controller-a", source_cluster="cluster01"}'
        values: '0x780'

    alert_rule_test:
      # exp_alerts is omitted on purpose: the alert must not be firing at eval_time
      - eval_time: 10m
        alertname: PipelinePodsRepeatedRestarts

  # ----- PipelineRun Controller Potential Deadlock Kicking Off PipelineRuns ----
  - interval: 1m
    input_series:

      # see https://prometheus.io/docs/prometheus/latest/configuration/unit_testing_rules/ for explanations of the expanding notation used for the values
      # The tekton pipelinerun controller is experiencing delays getting pipelineruns in tenant-1 started (the
      # not-attempted count grows by 1 every minute) and should alert. Also make sure the lack of kickoff issues
      # in tenant-2 (count stays at 0) does not affect the value of the query.
      - series: 'pipelinerun_kickoff_not_attempted_count{namespace="tenant-1", source_cluster="cluster01"}'
        values: '1+1x780'
      - series: 'pipelinerun_kickoff_not_attempted_count{namespace="tenant-2", source_cluster="cluster01"}'
        values: '0x780'

    alert_rule_test:
      # exactly one firing alert is expected for cluster01, with the labels and annotations listed below
      - eval_time: 85m
        alertname: PipelineControllerDeadlock
        exp_alerts:
          - exp_labels:
              severity: critical
              slo: "true"
              source_cluster: cluster01
            exp_annotations:
              summary: >-
                Tekton pipeline controller appears to have stopped processing active pipelineruns which have not been started yet.
              description: >-
                Tekton pipeline controller on cluster cluster01 has appeared deadlocked on 2 pipelineruns.
              alert_team_handle: <!subteam^S04PYECHCCU>
              team: pipelines
              runbook_url: https://gitlab.cee.redhat.com/konflux/docs/sop/-/blob/main/pipeline-service/sre/tekton-pipeline-related-deadlocks.md

  - interval: 1m
    input_series:

      # see https://prometheus.io/docs/prometheus/latest/configuration/unit_testing_rules/ for explanations of the expanding notation used for the values
      # The tekton pipelinerun controller is experiencing some delays getting pipelineruns kicked off (the
      # not-attempted count stays constant at 1), but not fast enough, so it should not alert.
      - series: 'pipelinerun_kickoff_not_attempted_count{namespace="tenant-1", source_cluster="cluster01"}'
        values: '1x780'

    alert_rule_test:
      # exp_alerts is omitted on purpose: the alert must not be firing at eval_time
      # NOTE(review): the firing test for this alert evaluates at 85m while this one evaluates at 10m - confirm
      # that 10m is late enough for this negative check to be meaningful.
      - eval_time: 10m
        alertname: PipelineControllerDeadlock

  - interval: 1m
    input_series:

      # see https://prometheus.io/docs/prometheus/latest/configuration/unit_testing_rules/ for explanations of the expanding notation used for the values
      # The tekton pipelinerun controller is not experiencing any delays getting pipelineruns kicked off
      # (the not-attempted count stays at 0), so it should not alert.
      - series: 'pipelinerun_kickoff_not_attempted_count{namespace="tenant-1", source_cluster="cluster01"}'
        values: '0x780'

    alert_rule_test:
      # exp_alerts is omitted on purpose: the alert must not be firing at eval_time
      - eval_time: 10m
        alertname: PipelineControllerDeadlock

  # ----- TaskRun Controller Potential Deadlock Starting Pods that pass K8s checks ----
  - interval: 1m
    input_series:

      # see https://prometheus.io/docs/prometheus/latest/configuration/unit_testing_rules/ for explanations of the expanding notation used for the values
      # The tekton taskrun controller is experiencing delays getting the k8s approved pods for taskruns in tenant-1
      # started (the not-attempted/pending count grows by 1 every minute) and should alert. Neither throttling
      # series (by quota, by node) reports any throttled taskruns. Also make sure the lack of pod issues in
      # tenant-2 does not affect the value of the query.
      - series: 'taskrun_pod_create_not_attempted_or_pending_count{namespace="tenant-1", source_cluster="cluster01"}'
        values: '1+1x780'
      - series: 'taskrun_pod_create_not_attempted_or_pending_count{namespace="tenant-2", source_cluster="cluster01"}'
        values: '0x780'
      - series: 'tekton_pipelines_controller_running_taskruns_throttled_by_quota{namespace="tenant-1", source_cluster="cluster01"}'
        values: '0x780'
      - series: 'tekton_pipelines_controller_running_taskruns_throttled_by_node{namespace="tenant-1", source_cluster="cluster01"}'
        values: '0x780'

    alert_rule_test:
      # exactly one firing alert is expected for cluster01, with the labels and annotations listed below
      - eval_time: 85m
        alertname: TaskControllerDeadlock
        exp_alerts:
          - exp_labels:
              severity: critical
              slo: "true"
              source_cluster: cluster01
            exp_annotations:
              summary: >-
                Tekton taskrun controller appears to have stopped processing active taskruns whose underlying Pod have not failed Kubernetes screening.
              description: >-
                Tekton taskrun controller on cluster cluster01 has appeared deadlocked on 2 taskruns.
              alert_team_handle: <!subteam^S04PYECHCCU>
              team: pipelines
              runbook_url: https://gitlab.cee.redhat.com/konflux/docs/sop/-/blob/main/pipeline-service/sre/tekton-pipeline-related-deadlocks.md
  - interval: 1m
    input_series:

      # see https://prometheus.io/docs/prometheus/latest/configuration/unit_testing_rules/ for explanations of the expanding notation used for the values
      # The tekton taskrun controller is experiencing delays getting the k8s approved pods for taskruns in tenant-1
      # started: the not-attempted/pending count grows by 2 every minute, while the count of taskruns throttled by
      # quota grows by only 1 every minute and none are throttled by node. The alert is still expected to fire.
      # NOTE(review): presumably the rule discounts throttled taskruns from the pending count - confirm against the rule.
      # Also make sure the lack of pod issues in tenant-2 does not affect the value of the query.
      - series: 'taskrun_pod_create_not_attempted_or_pending_count{namespace="tenant-1", source_cluster="cluster01"}'
        values: '1+2x780'
      - series: 'taskrun_pod_create_not_attempted_or_pending_count{namespace="tenant-2", source_cluster="cluster01"}'
        values: '0x780'
      - series: 'tekton_pipelines_controller_running_taskruns_throttled_by_quota{namespace="tenant-1", source_cluster="cluster01"}'
        values: '1+1x780'
      - series: 'tekton_pipelines_controller_running_taskruns_throttled_by_node{namespace="tenant-1", source_cluster="cluster01"}'
        values: '0x780'

    alert_rule_test:
      # exactly one firing alert is expected for cluster01, with the labels and annotations listed below
      - eval_time: 85m
        alertname: TaskControllerDeadlock
        exp_alerts:
          - exp_labels:
              severity: critical
              slo: "true"
              source_cluster: cluster01
            exp_annotations:
              summary: >-
                Tekton taskrun controller appears to have stopped processing active taskruns whose underlying Pod have not failed Kubernetes screening.
              description: >-
                Tekton taskrun controller on cluster cluster01 has appeared deadlocked on 2 taskruns.
              alert_team_handle: <!subteam^S04PYECHCCU>
              team: pipelines
              runbook_url: https://gitlab.cee.redhat.com/konflux/docs/sop/-/blob/main/pipeline-service/sre/tekton-pipeline-related-deadlocks.md


  - interval: 1m
    input_series:

      # see https://prometheus.io/docs/prometheus/latest/configuration/unit_testing_rules/ for explanations of the expanding notation used for the values
      # The tekton taskrun controller is experiencing delays getting the k8s approved pods for taskruns in tenant-1
      # started, but only because of k8s throttling (the throttled-by-quota and throttled-by-node counts grow at the
      # same rate as the not-attempted/pending count), so no alert fires.
      - series: 'taskrun_pod_create_not_attempted_or_pending_count{namespace="tenant-1", source_cluster="cluster01"}'
        values: '1+1x780'
      - series: 'taskrun_pod_create_not_attempted_or_pending_count{namespace="tenant-2", source_cluster="cluster01"}'
        values: '0x780'
      - series: 'tekton_pipelines_controller_running_taskruns_throttled_by_quota{namespace="tenant-1", source_cluster="cluster01"}'
        values: '1+1x780'
      - series: 'tekton_pipelines_controller_running_taskruns_throttled_by_node{namespace="tenant-1", source_cluster="cluster01"}'
        values: '1+1x780'

    alert_rule_test:
      # exp_alerts is omitted on purpose: the alert must not be firing at eval_time
      - eval_time: 85m
        alertname: TaskControllerDeadlock

  - interval: 1m
    input_series:

      # see https://prometheus.io/docs/prometheus/latest/configuration/unit_testing_rules/ for explanations of the expanding notation used for the values
      # The tekton taskrun controller is experiencing some delays getting taskrun pods started (the
      # not-attempted/pending count stays constant at 1), but not fast enough, so it should not alert.
      - series: 'taskrun_pod_create_not_attempted_or_pending_count{namespace="tenant-1", source_cluster="cluster01"}'
        values: '1x780'

    alert_rule_test:
      # exp_alerts is omitted on purpose: the alert must not be firing at eval_time
      - eval_time: 10m
        alertname: TaskControllerDeadlock

  - interval: 1m
    input_series:

      # see https://prometheus.io/docs/prometheus/latest/configuration/unit_testing_rules/ for explanations of the expanding notation used for the values
      # The tekton taskrun controller is not experiencing any delays getting taskrun pods started
      # (the not-attempted/pending count stays at 0), so it should not alert.
      - series: 'taskrun_pod_create_not_attempted_or_pending_count{namespace="tenant-1", source_cluster="cluster01"}'
        values: '0x780'

    alert_rule_test:
      # exp_alerts is omitted on purpose: the alert must not be firing at eval_time
      - eval_time: 10m
        alertname: TaskControllerDeadlock

  # ----- Resolver Controller Potential Deadlock Kicking Off ResolutionRequests ----
  - interval: 1m
    input_series:

      # see https://prometheus.io/docs/prometheus/latest/configuration/unit_testing_rules/ for explanations of the expanding notation used for the values
      # The tekton resolver controller is experiencing delays getting resolutionrequests in tenant-1 started (the
      # pending count grows by 1 every minute) and should alert. Also make sure the lack of issues in tenant-2
      # (count stays at 0) does not affect the value of the query.
      - series: 'pending_resolutionrequest_count{namespace="tenant-1", source_cluster="cluster01"}'
        values: '1+1x780'
      - series: 'pending_resolutionrequest_count{namespace="tenant-2", source_cluster="cluster01"}'
        values: '0x780'

    alert_rule_test:
      # exactly one firing alert is expected for cluster01, with the labels and annotations listed below
      - eval_time: 85m
        alertname: ResolverControllerDeadlock
        exp_alerts:
          - exp_labels:
              severity: critical
              slo: "true"
              source_cluster: cluster01
            exp_annotations:
              summary: >-
                Tekton resolver controller appears to have stopped processing active resolutionrequests which have not been started yet.
              description: >-
                Tekton resolver controller on cluster cluster01 has appeared deadlocked on 2 resolutionrequests.
              alert_team_handle: <!subteam^S04PYECHCCU>
              team: pipelines
              runbook_url: https://gitlab.cee.redhat.com/konflux/docs/sop/-/blob/main/pipeline-service/sre/tekton-pipeline-related-deadlocks.md

  - interval: 1m
    input_series:

      # see https://prometheus.io/docs/prometheus/latest/configuration/unit_testing_rules/ for explanations of the expanding notation used for the values
      # The tekton resolver controller is experiencing some delays getting resolutionrequests kicked off (the
      # pending count stays constant at 1), but not fast enough, so it should not alert.
      - series: 'pending_resolutionrequest_count{namespace="tenant-1", source_cluster="cluster01"}'
        values: '1x780'

    alert_rule_test:
      # exp_alerts is omitted on purpose: the alert must not be firing at eval_time
      - eval_time: 10m
        alertname: ResolverControllerDeadlock

  # ----- Chains Controller Potential Deadlock ----
  - interval: 1m
    input_series:

      # see https://prometheus.io/docs/prometheus/latest/configuration/unit_testing_rules/ for explanations of the expanding notation used for the values
      # The tekton chains controller is experiencing delays processing pipelineruns (its watcher work queue depth
      # grows by 300 every minute) and should alert. Also make sure the lack of issues in another app (the
      # tekton-pipelines-controller queue stays at 0) does not affect the value of the query.
      - series: 'watcher_workqueue_depth{container="tekton-chains-controller", app="tekton-chains-controller", source_cluster="cluster01"}'
        values: '300+300x780'
      - series: 'watcher_workqueue_depth{container="tekton-pipelines-controller", app="tekton-pipelines-controller", source_cluster="cluster01"}'
        values: '0x780'

    alert_rule_test:
      # exactly one firing alert is expected for cluster01, with the labels and annotations listed below
      - eval_time: 85m
        alertname: ChainsControllerDeadlock
        exp_alerts:
          - exp_labels:
              severity: critical
              slo: "true"
              source_cluster: cluster01
            exp_annotations:
              summary: >-
                Tekton chains controller appears to have stopped processing active pipelineruns.
              description: >-
                Tekton chains controller on cluster cluster01 has appeared deadlocked on 600 pipelinerun events.
              alert_team_handle: <!subteam^S04PYECHCCU>
              team: pipelines
              runbook_url: https://gitlab.cee.redhat.com/konflux/docs/sop/-/blob/main/pipeline-service/sre/tekton-pipeline-related-deadlocks.md

  - interval: 1m
    input_series:

      # see https://prometheus.io/docs/prometheus/latest/configuration/unit_testing_rules/ for explanations of the expanding notation used for the values
      # The tekton chains controller is experiencing some delays processing pipelineruns (its watcher work queue
      # depth stays constant at 1), but not fast enough, so it should not alert.
      - series: 'watcher_workqueue_depth{container="tekton-chains-controller", app="tekton-chains-controller", source_cluster="cluster01"}'
        values: '1x780'

    alert_rule_test:
      # exp_alerts is omitted on purpose: the alert must not be firing at eval_time
      - eval_time: 10m
        alertname: ChainsControllerDeadlock

  # ----- Results Potential Deadlock ----
  - interval: 1m
    input_series:

      # see https://prometheus.io/docs/prometheus/latest/configuration/unit_testing_rules/ for explanations of the expanding notation used for the values
      # Tekton results appears deadlocked: the pipelinerun count grows by 1 every minute while the number of
      # CreateResult calls handled by tekton-results-api stays at 0. An unrelated grpc series (job="foo") that
      # keeps growing is included as well; the alert is still expected to fire.
      - series: 'tekton_pipelines_controller_pipelinerun_count{source_cluster="cluster01"}'
        values: '1+1x780'
      - series: 'grpc_server_handled_total{job="tekton-results-api", grpc_service="tekton.results.v1alpha2.CreateResult", source_cluster="cluster01"}'
        values: '0x780'
      - series: 'grpc_server_handled_total{job="foo", grpc_service="foo", source_cluster="cluster01"}'
        values: '1+1x780'

    alert_rule_test:
      # exactly one firing alert is expected for cluster01, with the labels and annotations listed below
      - eval_time: 85m
        alertname: ResultsDeadlock
        exp_alerts:
          - exp_labels:
              severity: critical
              slo: "true"
              source_cluster: cluster01
            exp_annotations:
              summary: >-
                Tekton results appears to have stopped processing active pipelineruns.
              description: >-
                Tekton results on cluster cluster01 appears deadlocked on pipelinerun events.
              alert_team_handle: <!subteam^S04PYECHCCU>
              team: pipelines
              runbook_url: https://gitlab.cee.redhat.com/konflux/docs/sop/-/blob/main/pipeline-service/sre/tekton-pipeline-related-deadlocks.md

  - interval: 1m
    input_series:

      # see https://prometheus.io/docs/prometheus/latest/configuration/unit_testing_rules/ for explanations of the expanding notation used for the values
      # Tekton results is idle rather than deadlocked: the pipelinerun count stays at 0 and the number of
      # CreateResult calls handled by tekton-results-api stays at 0 as well, so it should not alert.
      - series: 'tekton_pipelines_controller_pipelinerun_count{source_cluster="cluster01"}'
        values: '0x780'
      - series: 'grpc_server_handled_total{job="tekton-results-api", grpc_service="tekton.results.v1alpha2.CreateResult", source_cluster="cluster01"}'
        values: '0x780'

    alert_rule_test:
      # exp_alerts is omitted on purpose: the alert must not be firing at eval_time
      - eval_time: 10m
        alertname: ResultsDeadlock

  - interval: 1m
    input_series:

      # see https://prometheus.io/docs/prometheus/latest/configuration/unit_testing_rules/ for explanations of the expanding notation used for the values
      # Tekton results is not deadlocked: the pipelinerun count grows by 1 every minute and the number of
      # CreateResult calls handled by tekton-results-api grows at the same pace, so it should not alert.
      - series: 'tekton_pipelines_controller_pipelinerun_count{source_cluster="cluster01"}'
        values: '1+1x780'
      - series: 'grpc_server_handled_total{job="tekton-results-api", grpc_service="tekton.results.v1alpha2.CreateResult", source_cluster="cluster01"}'
        values: '1+1x780'

    alert_rule_test:
      # exp_alerts is omitted on purpose: the alert must not be firing at eval_time
      - eval_time: 85m
        alertname: ResultsDeadlock

  - interval: 1m
    input_series:

      # see https://prometheus.io/docs/prometheus/latest/configuration/unit_testing_rules/ for explanations of the expanding notation used for the values
      # Tekton results is not reported as deadlocked when there is no tekton-results-api series at all: the
      # pipelinerun count grows by 1 every minute, but the only grpc series present belongs to an unrelated job
      # (job="foo", staying at 0), so it should not alert.
      - series: 'tekton_pipelines_controller_pipelinerun_count{source_cluster="cluster01"}'
        values: '1+1x780'
      - series: 'grpc_server_handled_total{job="foo", grpc_service="foo", source_cluster="cluster01"}'
        values: '0x780'

    alert_rule_test:
      # exp_alerts is omitted on purpose: the alert must not be firing at eval_time
      - eval_time: 85m
        alertname: ResultsDeadlock


  # ----- PAC Controller Potential Deadlock ----
  - interval: 1m
    input_series:

      # see https://prometheus.io/docs/prometheus/latest/configuration/unit_testing_rules/ for explanations of the expanding notation used for the values
      # The PAC watcher work queue depth on cluster01 grows by 300 every minute and should alert. The queue on
      # cluster02 stays at 0; only an alert for cluster01 is expected.
      - series: 'pac_watcher_work_queue_depth{source_cluster="cluster01"}'
        values: '300+300x780'
      - series: 'pac_watcher_work_queue_depth{source_cluster="cluster02"}'
        values: '0x780'

    alert_rule_test:
      # exactly one firing alert is expected for cluster01, with the labels and annotations listed below
      - eval_time: 85m
        alertname: PACControllerDeadlock
        exp_alerts:
          - exp_labels:
              severity: critical
              slo: "true"
              source_cluster: cluster01
            exp_annotations:
              summary: >-
                Tekton PAC controller appears to have stopped processing active pipelineruns.
              description: >-
                Tekton PAC controller on cluster cluster01 has appeared deadlocked on 600 pipelinerun events.
              alert_team_handle: <!subteam^S04PYECHCCU>
              team: pipelines
              runbook_url: https://gitlab.cee.redhat.com/konflux/docs/sop/-/blob/main/pipeline-service/sre/tekton-pipeline-related-deadlocks.md

  - interval: 1m
    input_series:

      # see https://prometheus.io/docs/prometheus/latest/configuration/unit_testing_rules/ for explanations of the expanding notation used for the values
      # The PAC watcher work queue depth on cluster02 stays constant at 1, i.e. the queue is not growing,
      # so the PAC controller should not alert.
      - series: 'pac_watcher_work_queue_depth{source_cluster="cluster02"}'
        values: '1x780'

    alert_rule_test:
      # exp_alerts is omitted on purpose: the alert must not be firing at eval_time.
      # The alert under test is the PAC one; this previously named ChainsControllerDeadlock (copy/paste),
      # which could never fire from a pac_watcher_work_queue_depth series and so verified nothing.
      - eval_time: 10m
        alertname: PACControllerDeadlock