Finalize tutorial steps
Signed-off-by: Giovanni Liva <[email protected]>
thisthat committed Oct 12, 2022
1 parent c91b3ca commit 3d257d7
Showing 95 changed files with 62,379 additions and 9 deletions.
16 changes: 8 additions & 8 deletions examples/observability/README.md
@@ -22,13 +22,12 @@ kubectl create namespace observability
kubectl create -f https://github.com/jaegertracing/jaeger-operator/releases/download/v1.38.0/jaeger-operator.yaml -n observability

# Install Prometheus
kubectl apply --server-side -f manifests/setup
kubectl apply -f manifests/
kubectl apply --server-side -f config/prometheus/setup
kubectl apply -f config/prometheus/

```

In this tutorial, we will assume that Jaeger will be installed in the `keptn-lifecycle-controller-system` namespace, and that the Jaeger collector endpoint is reachable at `http://jaeger-collector:14250`.

With these commands, the Jaeger and Prometheus Operators will be installed in the `observability` and `monitoring` namespaces, respectively.
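Before moving on, it can be useful to confirm that both operators came up correctly. A minimal check, assuming the namespaces used above:

```shell
# Each namespace should show the respective operator pod in the Running state
kubectl get pods -n observability
kubectl get pods -n monitoring
```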

## Configuring the OpenTelemetry Collector and Prometheus ServiceMonitor

@@ -48,13 +47,14 @@ Eventually, there should be a pod for the `otel-collector` deployment up and running
$ kubectl get pods -lapp=opentelemetry -n keptn-lifecycle-controller-system

NAME READY STATUS RESTARTS AGE
otel-collector-6fc4cc84d6-7hnvp 1/1 Running 6 (51m ago) 92m
otel-collector-6fc4cc84d6-7hnvp 1/1 Running 0 92m
```

## Install master
When the `otel-collector` pod is up and running, restart the `keptn-scheduler` and `klc-controller-manager` so they can
pick up the new configuration.

```sh
make build-deploy-dev-environment RELEASE_REGISTRY=<your-registry>
```shell
kubectl rollout restart deployment -n keptn-lifecycle-controller-system keptn-scheduler klc-controller-manager
```
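To verify that the restart has gone through, wait for both rollouts to finish:

```shell
# Both deployments should report a successful rollout with the new configuration
kubectl rollout status deployment/keptn-scheduler -n keptn-lifecycle-controller-system
kubectl rollout status deployment/klc-controller-manager -n keptn-lifecycle-controller-system
```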

## Seeing the OpenTelemetry Collector in action
2 changes: 1 addition & 1 deletion examples/observability/config/otel-collector.yaml
@@ -27,7 +27,7 @@ data:
health_check: {}
exporters:
jaeger:
endpoint: "jaeger-collector.observability.svc:14250"
endpoint: "jaeger-collector:14250"
tls:
insecure: true
prometheus:
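With the exporter pointing at the Jaeger collector, incoming traces can be inspected in the Jaeger UI. One way to reach it is a port-forward, a sketch assuming the Jaeger instance is named `jaeger`, so its query Service is `jaeger-query` in the `observability` namespace:

```shell
# Assumes the operator created a `jaeger-query` Service; forward the UI and browse to http://localhost:16686
kubectl port-forward -n observability svc/jaeger-query 16686:16686
```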
@@ -0,0 +1,36 @@
apiVersion: monitoring.coreos.com/v1
kind: Alertmanager
metadata:
labels:
app.kubernetes.io/component: alert-router
app.kubernetes.io/instance: main
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.24.0
name: main
namespace: monitoring
spec:
image: quay.io/prometheus/alertmanager:v0.24.0
nodeSelector:
kubernetes.io/os: linux
podMetadata:
labels:
app.kubernetes.io/component: alert-router
app.kubernetes.io/instance: main
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.24.0
replicas: 3
resources:
limits:
cpu: 100m
memory: 100Mi
requests:
cpu: 4m
memory: 100Mi
securityContext:
fsGroup: 2000
runAsNonRoot: true
runAsUser: 1000
serviceAccountName: alertmanager-main
version: 0.24.0
@@ -0,0 +1,42 @@
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
labels:
app.kubernetes.io/component: alert-router
app.kubernetes.io/instance: main
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.24.0
name: alertmanager-main
namespace: monitoring
spec:
egress:
- {}
ingress:
- from:
- podSelector:
matchLabels:
app.kubernetes.io/name: prometheus
ports:
- port: 9093
protocol: TCP
- port: 8080
protocol: TCP
- from:
- podSelector:
matchLabels:
app.kubernetes.io/name: alertmanager
ports:
- port: 9094
protocol: TCP
- port: 9094
protocol: UDP
podSelector:
matchLabels:
app.kubernetes.io/component: alert-router
app.kubernetes.io/instance: main
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus
policyTypes:
- Egress
- Ingress
@@ -0,0 +1,19 @@
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
labels:
app.kubernetes.io/component: alert-router
app.kubernetes.io/instance: main
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.24.0
name: alertmanager-main
namespace: monitoring
spec:
maxUnavailable: 1
selector:
matchLabels:
app.kubernetes.io/component: alert-router
app.kubernetes.io/instance: main
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus
@@ -0,0 +1,139 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
app.kubernetes.io/component: alert-router
app.kubernetes.io/instance: main
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.24.0
prometheus: k8s
role: alert-rules
name: alertmanager-main-rules
namespace: monitoring
spec:
groups:
- name: alertmanager.rules
rules:
- alert: AlertmanagerFailedReload
annotations:
description: Configuration has failed to load for {{ $labels.namespace }}/{{ $labels.pod}}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedreload
summary: Reloading an Alertmanager configuration has failed.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
max_over_time(alertmanager_config_last_reload_successful{job="alertmanager-main",namespace="monitoring"}[5m]) == 0
for: 10m
labels:
severity: critical
- alert: AlertmanagerMembersInconsistent
annotations:
description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} has only found {{ $value }} members of the {{$labels.job}} cluster.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagermembersinconsistent
summary: A member of an Alertmanager cluster has not found all other cluster members.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
max_over_time(alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"}[5m])
< on (namespace,service) group_left
count by (namespace,service) (max_over_time(alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"}[5m]))
for: 15m
labels:
severity: critical
- alert: AlertmanagerFailedToSendAlerts
annotations:
description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} failed to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedtosendalerts
summary: An Alertmanager instance failed to send notifications.
expr: |
(
rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring"}[5m])
/
rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring"}[5m])
)
> 0.01
for: 5m
labels:
severity: warning
- alert: AlertmanagerClusterFailedToSendAlerts
annotations:
description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts
summary: All Alertmanager instances in a cluster failed to send notifications to a critical integration.
expr: |
min by (namespace,service, integration) (
rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring", integration=~`.*`}[5m])
/
rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring", integration=~`.*`}[5m])
)
> 0.01
for: 5m
labels:
severity: critical
- alert: AlertmanagerClusterFailedToSendAlerts
annotations:
description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts
summary: All Alertmanager instances in a cluster failed to send notifications to a non-critical integration.
expr: |
min by (namespace,service, integration) (
rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring", integration!~`.*`}[5m])
/
rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring", integration!~`.*`}[5m])
)
> 0.01
for: 5m
labels:
severity: warning
- alert: AlertmanagerConfigInconsistent
annotations:
description: Alertmanager instances within the {{$labels.job}} cluster have different configurations.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerconfiginconsistent
summary: Alertmanager instances within the same cluster have different configurations.
expr: |
count by (namespace,service) (
count_values by (namespace,service) ("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="monitoring"})
)
!= 1
for: 20m
labels:
severity: critical
- alert: AlertmanagerClusterDown
annotations:
description: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have been up for less than half of the last 5m.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterdown
summary: Half or more of the Alertmanager instances within the same cluster are down.
expr: |
(
count by (namespace,service) (
avg_over_time(up{job="alertmanager-main",namespace="monitoring"}[5m]) < 0.5
)
/
count by (namespace,service) (
up{job="alertmanager-main",namespace="monitoring"}
)
)
>= 0.5
for: 5m
labels:
severity: critical
- alert: AlertmanagerClusterCrashlooping
annotations:
description: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have restarted at least 5 times in the last 10m.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclustercrashlooping
summary: Half or more of the Alertmanager instances within the same cluster are crashlooping.
expr: |
(
count by (namespace,service) (
changes(process_start_time_seconds{job="alertmanager-main",namespace="monitoring"}[10m]) > 4
)
/
count by (namespace,service) (
up{job="alertmanager-main",namespace="monitoring"}
)
)
>= 0.5
for: 5m
labels:
severity: critical
59 changes: 59 additions & 0 deletions examples/observability/config/prometheus/alertmanager-secret.yaml
@@ -0,0 +1,59 @@
apiVersion: v1
kind: Secret
metadata:
labels:
app.kubernetes.io/component: alert-router
app.kubernetes.io/instance: main
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.24.0
name: alertmanager-main
namespace: monitoring
stringData:
alertmanager.yaml: |-
"global":
"resolve_timeout": "5m"
"inhibit_rules":
- "equal":
- "namespace"
- "alertname"
"source_matchers":
- "severity = critical"
"target_matchers":
- "severity =~ warning|info"
- "equal":
- "namespace"
- "alertname"
"source_matchers":
- "severity = warning"
"target_matchers":
- "severity = info"
- "equal":
- "namespace"
"source_matchers":
- "alertname = InfoInhibitor"
"target_matchers":
- "severity = info"
"receivers":
- "name": "Default"
- "name": "Watchdog"
- "name": "Critical"
- "name": "null"
"route":
"group_by":
- "namespace"
"group_interval": "5m"
"group_wait": "30s"
"receiver": "Default"
"repeat_interval": "12h"
"routes":
- "matchers":
- "alertname = Watchdog"
"receiver": "Watchdog"
- "matchers":
- "alertname = InfoInhibitor"
"receiver": "null"
- "matchers":
- "severity = critical"
"receiver": "Critical"
type: Opaque
25 changes: 25 additions & 0 deletions examples/observability/config/prometheus/alertmanager-service.yaml
@@ -0,0 +1,25 @@
apiVersion: v1
kind: Service
metadata:
labels:
app.kubernetes.io/component: alert-router
app.kubernetes.io/instance: main
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.24.0
name: alertmanager-main
namespace: monitoring
spec:
ports:
- name: web
port: 9093
targetPort: web
- name: reloader-web
port: 8080
targetPort: reloader-web
selector:
app.kubernetes.io/component: alert-router
app.kubernetes.io/instance: main
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus
sessionAffinity: ClientIP
@@ -0,0 +1,12 @@
apiVersion: v1
automountServiceAccountToken: false
kind: ServiceAccount
metadata:
labels:
app.kubernetes.io/component: alert-router
app.kubernetes.io/instance: main
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 0.24.0
name: alertmanager-main
namespace: monitoring