From c24cc26a452c7803368b9b4373e567b78c2e8224 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Brigitte?= Date: Fri, 7 Jun 2024 15:33:57 +0200 Subject: [PATCH 1/2] Update tools usage (#1225) --- CHANGELOG.md | 5 +++++ test/hack/bin/check-opsrecipes.sh | 6 +++++- test/hack/bin/fetch-tools.sh | 9 +++++++++ 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f3f3315a6..2741b195b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Fixed + +- Fixed usage of yq, and jq in check-opsrecipes.sh +- Fetch jq with make install-tools + ### Added - Added a new alerting rule to `falco.rules.yml` to fire an alert for XZ-backdoor. diff --git a/test/hack/bin/check-opsrecipes.sh b/test/hack/bin/check-opsrecipes.sh index e889fdaa6..828a2bf44 100755 --- a/test/hack/bin/check-opsrecipes.sh +++ b/test/hack/bin/check-opsrecipes.sh @@ -86,6 +86,10 @@ main() { local -a E_unexistingrecipe=() local returncode=0 + local -r GIT_WORKDIR="$(git rev-parse --show-toplevel)" + local -r YQ=test/hack/bin/yq + local -r JQ=test/hack/bin/jq + # Investigation section ######################## @@ -144,7 +148,7 @@ main() { fi # parse rules yaml files, and for each rule found output alertname, opsrecipe, and severity, space-separated, on one line. 
- done < <(yq -o json "$rulesFile" | jq -j '.spec.groups[].rules[] | .alert, " ", .annotations.opsrecipe, " ", .labels.severity, "\n"') + done < <("$GIT_WORKDIR/$YQ" -o json "$rulesFile" | "$GIT_WORKDIR/$JQ" -j '.spec.groups[]?.rules[] | .alert, " ", .annotations.opsrecipe, " ", .labels.severity, "\n"') checkedRules+=("$rulesFile") done < <(find $RULES_FILES -type f -print0) diff --git a/test/hack/bin/fetch-tools.sh b/test/hack/bin/fetch-tools.sh index 6644c8b5a..b10f21709 100755 --- a/test/hack/bin/fetch-tools.sh +++ b/test/hack/bin/fetch-tools.sh @@ -6,6 +6,7 @@ ARCHITECT_VERSION="6.8.0" PROMETHEUS_VERSION="2.41.0" HELM_VERSION="3.9.0" YQ_VERSION="4.26.1" +JQ_VERSION="1.7.1" PINT_VERSION="0.58.1" GIT_WORKDIR=$(git rev-parse --show-toplevel) @@ -19,6 +20,8 @@ Linux*) export ARCHITECT_SOURCE="https://github.com/giantswarm/architect/releases/download/v${ARCHITECT_VERSION}/architect-v${ARCHITECT_VERSION}-linux-amd64.tar.gz" export YQ_SOURCE="https://github.com/mikefarah/yq/releases/download/v${YQ_VERSION}/yq_linux_amd64.tar.gz" export YQ_BIN_FILE="yq_linux_amd64" + export JQ_SOURCE="https://github.com/jqlang/jq/releases/download/jq-${JQ_VERSION}/jq-linux-amd64" + export JQ_BIN_FILE="jq" export PINT_SOURCE="https://github.com/cloudflare/pint/releases/download/v${PINT_VERSION}/pint-${PINT_VERSION}-linux-amd64.tar.gz" export PINT_BIN_FILE="pint-linux-amd64" ;; @@ -29,6 +32,8 @@ Darwin*) export ARCHITECT_SOURCE="https://github.com/giantswarm/architect/releases/download/v${ARCHITECT_VERSION}/architect-v${ARCHITECT_VERSION}-darwin-amd64.tar.gz" export YQ_SOURCE="https://github.com/mikefarah/yq/releases/download/v${YQ_VERSION}/yq_darwin_amd64.tar.gz" export YQ_BIN_FILE="yq_darwin_amd64" + export JQ_SOURCE="https://github.com/jqlang/jq/releases/download/jq-${JQ_VERSION}/jq-macos-amd64" + export JQ_BIN_FILE="jq" export PINT_SOURCE="https://github.com/cloudflare/pint/releases/download/v${PINT_VERSION}/pint-${PINT_VERSION}-darwin-amd64.tar.gz" export 
PINT_BIN_FILE="pint-darwin-amd64" TAR_CMD="gtar" @@ -107,6 +112,10 @@ main() { "${GIT_WORKDIR}/test/hack/bin/yq-${YQ_VERSION}.tar.gz" \ "$YQ_SOURCE" \ "*/yq_*" + download \ + "${JQ_SOURCE}" \ + "${GIT_WORKDIR}/test/hack/bin/${JQ_BIN_FILE}" + chmod +x "${GIT_WORKDIR}/test/hack/bin/${JQ_BIN_FILE}" if [[ ! -f "${GIT_WORKDIR}/test/hack/bin/yq" ]]; then ln -s "${GIT_WORKDIR}/test/hack/bin/${YQ_BIN_FILE}" "${GIT_WORKDIR}/test/hack/bin/yq" fi From da92a86653c47361ebaf77e8f33404f9b0e62e53 Mon Sep 17 00:00:00 2001 From: Quentin Bisson Date: Sun, 9 Jun 2024 22:42:44 +0200 Subject: [PATCH 2/2] Reorganize the job rules and the management-cluster-certificate alerts (#1213) * reorganize-job-and-management-cluster-certificate * Split job alerts into 2 (aws specific and the rest) and move the management-cluster-certificate alerts Signed-off-by: QuentinBisson * Fix changelog --------- Signed-off-by: QuentinBisson --- CHANGELOG.md | 6 +++- .../phoenix/alerting-rules/aws.job.rules.yml | 29 +++++++++++++++++++ .../certificate.management-cluster.rules.yml | 6 ++-- .../alerting-rules/job.rules.yml | 13 --------- 4 files changed, 37 insertions(+), 17 deletions(-) create mode 100644 helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.job.rules.yml rename helm/prometheus-rules/templates/{kaas/phoenix => shared}/alerting-rules/certificate.management-cluster.rules.yml (88%) rename helm/prometheus-rules/templates/{kaas/phoenix => shared}/alerting-rules/job.rules.yml (58%) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2741b195b..261d04e76 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,7 +19,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed -- Review phoenix alerts towards Mimir. 
+- Split the phoenix job alert into 2: + - a new file named aws.job.rules.yml that contains the AWS-specific alerts + - move the rest of job.rules into the shared alerts because it is provider-independent +- Move the management cluster certificate alerts into the shared alerts because they are provider-independent +- Review and fix phoenix alerts towards Mimir and multi-provider MCs. - Moves cluster-autoscaler and vpa alerts to turtles. ### Fixed diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.job.rules.yml b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.job.rules.yml new file mode 100644 index 000000000..b906ad11e --- /dev/null +++ b/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.job.rules.yml @@ -0,0 +1,29 @@ +## TODO Remove with vintage +# This rule applies to vintage aws management clusters +{{- if eq .Values.managementCluster.provider.flavor "vintage" }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + creationTimestamp: null + labels: + {{- include "labels.common" . 
| nindent 4 }} + # No need for .Values.mimir.enabled condition - will be gone with Vintage + cluster_type: "management_cluster" + name: aws.job.rules + namespace: {{ .Values.namespace }} +spec: + groups: + - name: aws-jobs + rules: + - alert: JobHasNotBeenScheduledForTooLong + annotations: + description: '{{`CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} has not been scheduled for more than 2 hours.`}}' + opsrecipe: job-has-not-been-scheduled-for-too-long/ + expr: (time() - kube_cronjob_status_last_schedule_time{cronjob="route53-manager"}) > 7200 + for: 15m + labels: + area: kaas + severity: page + team: phoenix + topic: managementcluster +{{- end }} diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/certificate.management-cluster.rules.yml b/helm/prometheus-rules/templates/shared/alerting-rules/certificate.management-cluster.rules.yml similarity index 88% rename from helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/certificate.management-cluster.rules.yml rename to helm/prometheus-rules/templates/shared/alerting-rules/certificate.management-cluster.rules.yml index 877f4fe6e..7d423b508 100644 --- a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/certificate.management-cluster.rules.yml +++ b/helm/prometheus-rules/templates/shared/alerting-rules/certificate.management-cluster.rules.yml @@ -23,13 +23,13 @@ spec: area: kaas cancel_if_outside_working_hours: "true" severity: page - team: phoenix + team: {{ include "providerTeam" . 
}} topic: security - - alert: ManagementClusterAWSCertificateWillExpireInLessThanOneMonth + - alert: ManagementClusterCertificateWillExpireInLessThanOneMonth annotations: description: '{{`Certificate {{ $labels.path }} on {{ $labels.node }} will expire in less than one month.`}}' opsrecipe: renew-certificates/ - expr: (cert_exporter_not_after{cluster_type="management_cluster", provider="aws", path!="/etc/kubernetes/ssl/service-account-crt.pem"} - time()) < 4 * 7 * 24 * 60 * 60 + expr: (cert_exporter_not_after{cluster_type="management_cluster", path!="/etc/kubernetes/ssl/service-account-crt.pem"} - time()) < 4 * 7 * 24 * 60 * 60 for: 5m labels: area: kaas diff --git a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/job.rules.yml b/helm/prometheus-rules/templates/shared/alerting-rules/job.rules.yml similarity index 58% rename from helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/job.rules.yml rename to helm/prometheus-rules/templates/shared/alerting-rules/job.rules.yml index 533343618..249a1d75e 100644 --- a/helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/job.rules.yml +++ b/helm/prometheus-rules/templates/shared/alerting-rules/job.rules.yml @@ -21,16 +21,3 @@ spec: severity: notify team: {{ include "providerTeam" . }} topic: managementcluster -{{- if eq .Values.managementCluster.provider.kind "aws" }} - - alert: JobHasNotBeenScheduledForTooLong - annotations: - description: '{{`CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} has not been scheduled for more than 2 hours.`}}' - opsrecipe: job-has-not-been-scheduled-for-too-long/ - expr: (time() - kube_cronjob_status_last_schedule_time{cronjob="route53-manager"}) > 7200 - for: 15m - labels: - area: kaas - severity: page - team: phoenix - topic: managementcluster -{{- end }}