Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

sync: stage to production #286

Merged
merged 21 commits into from
Oct 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
2d8544c
Certificate Expiring Critical + Warning
aaa5kameric Aug 8, 2024
4d4bad8
Certificate Expiring Critical + Warning-updated
aaa5kameric Aug 9, 2024
c8e0a9d
Certificate Expiring Critical + Warning-updated
aaa5kameric Aug 9, 2024
dd9679d
Certificate Expiring Critical + Warning-updated
aaa5kameric Aug 9, 2024
bb1dc01
updated time check
aaa5kameric Aug 12, 2024
a2221dd
updated time check
aaa5kameric Aug 12, 2024
492ef38
Merge branch 'master' into ROX-21530-certificate-alerting
aaa5kameric Aug 12, 2024
9f9dde7
Merge branch 'master' into ROX-21530-certificate-alerting
aaa5kameric Aug 12, 2024
9e6d0e7
updated time check
aaa5kameric Aug 12, 2024
fc4284c
updating timestamp rule
aaa5kameric Aug 12, 2024
6a55e90
minor changes for consistency
aaa5kameric Aug 13, 2024
9c8990d
minor changes for consistency
aaa5kameric Aug 22, 2024
3d20adc
minor changes for consistency
aaa5kameric Aug 22, 2024
a97705f
Update from suggestions
aaa5kameric Aug 26, 2024
28d62a3
Merge pull request #276 from stackrox/ROX-21530-certificate-alerting
aaa5kameric Aug 26, 2024
f1e34af
chore(deps): bump pascalgn/automerge-action from 0.16.3 to 0.16.4 (#280)
dependabot[bot] Sep 24, 2024
eea3683
setup python for pre commit (#282)
kurlov Oct 11, 2024
e0167fe
Increase RHACSTenantWorkloadMemoryUtilizationHigh eval time to 30 min…
kurlov Oct 11, 2024
afe0e28
Increase tenant workload memory alert threshold to 90%
ludydoo Oct 29, 2024
b3ee599
Merge pull request #284 from stackrox/increase-tenant-workload-memory…
ludydoo Oct 29, 2024
456de45
Merge pull request #285 from stackrox/master
github-actions[bot] Oct 30, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/automerge.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ jobs:
steps:
- id: automerge
name: automerge
uses: "pascalgn/automerge-action@v0.16.3"
uses: "pascalgn/automerge-action@v0.16.4"
env:
GITHUB_TOKEN: "${{ secrets.GITHUB_TOKEN }}"
MERGE_METHOD: merge
Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ jobs:
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: '3.12'
- uses: actions/setup-go@v5
with:
go-version: ">=1.18.0"
Expand Down
20 changes: 18 additions & 2 deletions resources/prometheus/prometheus-rules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,22 @@ spec:
summary: "Fleetshard synchronizer container `{{ $labels.pod }}/{{ $labels.container }}` in namespace `{{ $labels.namespace }}` restarted more than 3 times."
description: "Fleetshard synchronizer container `{{ $labels.pod }}/{{ $labels.container }}` in namespace `{{ $labels.namespace }}` has restarted more than 3 times during the last 30 minutes."
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-005-fleetshard-sync-unavailable.md"
- alert: RHACSFleetshardCertificateExpiryCritical
expr: |
acs_fleetshard_certificate_expiration_timestamp <= 1 * 24 * 60 * 60 + time()
labels:
severity: critical
annotations:
summary: "Certificate expiring very soon: `{{ $labels.exported_namespace }}/{{ $labels.secret }}/{{ $labels.data_key }}`."
description: "Certificate `{{ $labels.exported_namespace }}/{{ $labels.secret }}/{{ $labels.data_key }}` expires on {{ humanizeTimestamp $value}}."
- alert: RHACSFleetshardCertificateExpiryWarning
expr: |
acs_fleetshard_certificate_expiration_timestamp <= 7* 24 * 60 * 60 + time()
labels:
severity: warning
annotations:
summary: "Certificate expiring soon: `{{ $labels.exported_namespace }}/{{ $labels.secret }}/{{ $labels.data_key }}`."
description: "Certificate `{{ $labels.exported_namespace }}/{{ $labels.secret }}/{{ $labels.data_key }}` expires on {{ humanizeTimestamp $value}}."
- alert: RHACSFleetshardSyncReconciliationErrors
expr: |
acs_fleetshard_central_errors_per_reconciliations:ratio_rate10m > 0.10
Expand Down Expand Up @@ -225,8 +241,8 @@ spec:
record: rhacs_tenants:namespace:pod:container:max_memory_usage_ratio
- alert: RHACSTenantWorkloadMemoryUtilizationHigh
expr: |
rhacs_tenants:namespace:pod:container:max_memory_usage_ratio{container="central"} >= 0.85
for: 10m
rhacs_tenants:namespace:pod:container:max_memory_usage_ratio{container="central"} >= 0.9
for: 30m
labels:
severity: warning
annotations:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
rule_files:
- /tmp/prometheus-rules-test.yaml

evaluation_interval: 1m

tests:
- interval: 1d
input_series:
- series: acs_fleetshard_certificate_expiration_timestamp{exported_namespace="rhacs-00000000000000000000", secret="secret", data_key="key"}
values: "691200+0x15" # 691200 seconds = 8 days

alert_rule_test:
- eval_time: 0
alertname: RHACSFleetshardCertificateExpiryWarning
exp_alerts: [ ]
- eval_time: 3d
alertname: RHACSFleetshardCertificateExpiryWarning
exp_alerts:
- exp_labels:
alertname: RHACSFleetshardCertificateExpiryWarning
exported_namespace: rhacs-00000000000000000000
secret: secret
data_key: key
severity: warning
exp_annotations:
summary: "Certificate expiring soon: `rhacs-00000000000000000000/secret/key`."
description: "Certificate `rhacs-00000000000000000000/secret/key` expires on 1970-01-09 00:00:00 +0000 UTC."
- eval_time: 7d
alertname: RHACSFleetshardCertificateExpiryCritical
exp_alerts:
- exp_labels:
alertname: RHACSFleetshardCertificateExpiryCritical
exported_namespace: rhacs-00000000000000000000
secret: secret
data_key: key
severity: critical
exp_annotations:
summary: "Certificate expiring very soon: `rhacs-00000000000000000000/secret/key`."
description: "Certificate `rhacs-00000000000000000000/secret/key` expires on 1970-01-09 00:00:00 +0000 UTC."
- eval_time: 10d
alertname: RHACSFleetshardCertificateExpiryCritical
exp_alerts:
- exp_labels:
alertname: RHACSFleetshardCertificateExpiryCritical
exported_namespace: rhacs-00000000000000000000
secret: secret
data_key: key
severity: critical
exp_annotations:
summary: "Certificate expiring very soon: `rhacs-00000000000000000000/secret/key`."
description: "Certificate `rhacs-00000000000000000000/secret/key` expires on 1970-01-09 00:00:00 +0000 UTC."
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,15 @@ tests:
- interval: 1m
input_series:
- series: container_memory_working_set_bytes{namespace="rhacs-aaaaaaaaaaaaaaaaaaaa", pod="mypod", container="central"}
values: "50+0x10 85+0x10"
# 50% memory usage for the first 10 minutes (no alert), then 90% memory usage for 40 minutes
values: "50+0x10 90+0x40"
- series: container_spec_memory_limit_bytes{namespace="rhacs-aaaaaaaaaaaaaaaaaaaa",pod="mypod", container="central"}
values: "100+0x20"
values: "100+0x40"
alert_rule_test:
- eval_time: 1m
alertname: RHACSTenantWorkloadMemoryUtilizationHigh
exp_alerts: []
- eval_time: 21m
- eval_time: 41m
alertname: RHACSTenantWorkloadMemoryUtilizationHigh
exp_alerts:
- exp_labels:
Expand All @@ -25,7 +26,7 @@ tests:
container: central
exp_annotations:
summary: tenant 'rhacs-aaaaaaaaaaaaaaaaaaaa' container 'central' in pod 'mypod' is reaching its memory limit.
description: tenant 'rhacs-aaaaaaaaaaaaaaaaaaaa' container 'central' in pod 'mypod' reached 85% of its memory limit and is at risk of being OOM killed.
description: tenant 'rhacs-aaaaaaaaaaaaaaaaaaaa' container 'central' in pod 'mypod' reached 90% of its memory limit and is at risk of being OOM killed.
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-cloud-service/runbooks/-/blob/master/sops/dp-039-tenant-workload-memory-utilization-high.md"
- interval: 1m
input_series:
Expand Down