diff --git a/install/0000_30_machine-api-operator_00_namespace.yaml b/install/0000_30_machine-api-operator_00_namespace.yaml
index 1051a2d79e..3131d3a0d7 100644
--- a/install/0000_30_machine-api-operator_00_namespace.yaml
+++ b/install/0000_30_machine-api-operator_00_namespace.yaml
@@ -7,3 +7,4 @@ metadata:
   labels:
     name: openshift-machine-api
     openshift.io/run-level: "1"
+    openshift.io/cluster-monitoring: "true"
diff --git a/install/0000_30_machine-api-operator_09_rbac.yaml b/install/0000_30_machine-api-operator_09_rbac.yaml
index b7d51959c0..c09398b986 100644
--- a/install/0000_30_machine-api-operator_09_rbac.yaml
+++ b/install/0000_30_machine-api-operator_09_rbac.yaml
@@ -269,3 +269,41 @@ subjects:
   - kind: ServiceAccount
     name: machine-api-operator
     namespace: openshift-machine-api
+
+---
+# Read access to the scrape targets in openshift-machine-api. The Role must
+# live in the namespace being scraped; a Role in openshift-monitoring grants
+# nothing here.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  name: prometheus-k8s
+  namespace: openshift-machine-api
+rules:
+  - apiGroups:
+      - ""
+    resources:
+      - services
+      - endpoints
+      - pods
+    verbs:
+      - get
+      - list
+      - watch
+
+---
+# NOTE(review): assumes the cluster Prometheus runs as ServiceAccount
+# prometheus-k8s in openshift-monitoring - confirm before merging.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: prometheus-k8s
+  namespace: openshift-machine-api
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: prometheus-k8s
+subjects:
+  - kind: ServiceAccount
+    name: prometheus-k8s
+    namespace: openshift-monitoring
diff --git a/install/0000_30_machine-api-operator_13_ironic-exporter-service.yaml b/install/0000_30_machine-api-operator_13_ironic-exporter-service.yaml
new file mode 100644
index 0000000000..54f98501ee
--- /dev/null
+++ b/install/0000_30_machine-api-operator_13_ironic-exporter-service.yaml
@@ -0,0 +1,21 @@
+---
+# Headless Service (clusterIP: None) selecting pods labelled
+# app=ironic-exporter and exposing port 9608 under the port name "http".
+apiVersion: v1
+kind: Service
+metadata:
+  name: metal3-baremetalhost-controller
+  namespace: openshift-machine-api
+  labels:
+    app: ironic-exporter
+spec:
+  ports:
+    - name: http
+      protocol: TCP
+      port: 9608
+      targetPort: 9608
+  selector:
+    app: ironic-exporter
+  clusterIP: None
+  type: ClusterIP
+  sessionAffinity: None
diff --git a/install/0000_30_machine-api-operator_14_ironic-exporter-servicemonitor.yaml b/install/0000_30_machine-api-operator_14_ironic-exporter-servicemonitor.yaml
new file mode 100644
index 0000000000..29dcf20454
--- /dev/null
+++ b/install/0000_30_machine-api-operator_14_ironic-exporter-servicemonitor.yaml
@@ -0,0 +1,24 @@
+---
+# Scrapes /metrics over plain HTTP from Services labelled
+# app=ironic-exporter in the openshift-machine-api namespace.
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  labels:
+    app: ironic-exporter
+  name: metal3-baremetalhost-controller
+  namespace: openshift-machine-api
+spec:
+  endpoints:
+    # "port" refers to a Service port by name and is mutually exclusive
+    # with targetPort; "http" is the name declared on the Service.
+    - port: http
+      scheme: http
+      path: /metrics
+  jobLabel: app
+  namespaceSelector:
+    matchNames:
+      - openshift-machine-api
+  selector:
+    matchLabels:
+      app: ironic-exporter
diff --git a/install/0000_30_machine-api-operator_15_ironic-exporter-prometheusrules.yaml b/install/0000_30_machine-api-operator_15_ironic-exporter-prometheusrules.yaml
new file mode 100644
index 0000000000..ed6c6b458e
--- /dev/null
+++ b/install/0000_30_machine-api-operator_15_ironic-exporter-prometheusrules.yaml
@@ -0,0 +1,33 @@
+---
+# Warning alerts that fire when baremetal_temp_celsius stays outside the
+# 3..96 range for 5 minutes.
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: metal3-baremetalhost-controller
+  namespace: openshift-machine-api
+spec:
+  groups:
+    - name: metal3-baremetalhost-controller
+      rules:
+        - alert: HighCPUTemperature
+          annotations:
+            summary: "The baremetal node {{ $labels.node_name }} CPU {{ $labels.entity_id }} temperature is too high"
+            description: >-
+              The baremetal node {{ $labels.node_name }} CPU
+              {{ $labels.entity_id }} temperature has been too high for the
+              past 5 minutes. Last measurement {{ $value }}
+          expr: baremetal_temp_celsius > 96
+          for: 5m
+          labels:
+            severity: warning
+        - alert: LowCPUTemperature
+          annotations:
+            summary: "The baremetal node {{ $labels.node_name }} CPU {{ $labels.entity_id }} temperature is too low"
+            description: >-
+              The baremetal node {{ $labels.node_name }} CPU
+              {{ $labels.entity_id }} temperature has been too low for the
+              past 5 minutes. Last measurement {{ $value }}
+          expr: baremetal_temp_celsius < 3
+          for: 5m
+          labels:
+            severity: warning