diff --git a/example-config.yml b/example-config.yml index 7845a55..a8d3df0 100644 --- a/example-config.yml +++ b/example-config.yml @@ -10,6 +10,8 @@ listen_port: 8888 hide_logs: no # How long to wait before alerting that a node is down. node_down_alert_minutes: 3 +# Node Down alert Pagerduty Severity +node_down_alert_severity: critical # Should the prometheus exporter be enabled? prometheus_enabled: yes @@ -74,7 +76,7 @@ chains: consecutive_enabled: yes # How many missed blocks should trigger a notification? consecutive_missed: 5 - # NOT USED: future hint for pagerduty's routing + # Consecutive Missed alert Pagerduty Severity consecutive_priority: critical # For each chain there is a specific window of blocks and a percentage of missed blocks that will result in @@ -82,7 +84,7 @@ chains: percentage_enabled: no # What percentage should trigger the alert percentage_missed: 10 - # Not used yet, pagerduty routing hint + # Percentage Missed alert Pagerduty Severity percentage_priority: warning # Should an alert be sent if the validator is not in the active set ie, jailed, diff --git a/td2/alert.go b/td2/alert.go index 435aff4..0227fc7 100644 --- a/td2/alert.go +++ b/td2/alert.go @@ -5,13 +5,14 @@ import ( "context" "encoding/json" "fmt" - "github.com/PagerDuty/go-pagerduty" - tgbotapi "github.com/go-telegram-bot-api/telegram-bot-api/v5" "log" "net/http" "strings" "sync" "time" + + "github.com/PagerDuty/go-pagerduty" + tgbotapi "github.com/go-telegram-bot-api/telegram-bot-api/v5" ) type alertMsg struct { @@ -34,8 +35,8 @@ type alertMsg struct { discHook string discMentions string - slkHook string - slkMentions string + slkHook string + slkMentions string } type notifyDest uint8 @@ -206,9 +207,9 @@ func buildSlackMessage(msg *alertMsg) *SlackMessage { return &SlackMessage{ Text: msg.message, Attachments: []Attachment{ - Attachment{ - Title: fmt.Sprintf("TenderDuty %s %s %s", prefix, msg.chain, msg.slkMentions), - Color: color, + { + Title: fmt.Sprintf("TenderDuty %s %s 
%s", prefix, msg.chain, msg.slkMentions), + Color: color, }, }, } @@ -481,7 +482,7 @@ func (cc *ChainConfig) watch() { td.alert( cc.name, fmt.Sprintf("stalled: have not seen a new block on %s in %d minutes", cc.ChainId, cc.Alerts.Stalled), - "critical", + "info", true, &cc.valInfo.Valcons, ) @@ -525,7 +526,7 @@ func (cc *ChainConfig) watch() { td.alert( cc.name, fmt.Sprintf("%s has missed %d blocks on %s", cc.valInfo.Moniker, cc.Alerts.ConsecutiveMissed, cc.ChainId), - "critical", + cc.Alerts.ConsecutivePriority, false, &id, ) @@ -537,7 +538,7 @@ func (cc *ChainConfig) watch() { td.alert( cc.name, fmt.Sprintf("%s has missed %d blocks on %s", cc.valInfo.Moniker, cc.Alerts.ConsecutiveMissed, cc.ChainId), - "critical", + "info", true, &id, ) @@ -552,7 +553,7 @@ func (cc *ChainConfig) watch() { td.alert( cc.name, fmt.Sprintf("%s has missed > %d%% of the slashing window's blocks on %s", cc.valInfo.Moniker, cc.Alerts.Window, cc.ChainId), - "critical", + cc.Alerts.PercentagePriority, false, &id, ) @@ -564,7 +565,7 @@ func (cc *ChainConfig) watch() { td.alert( cc.name, fmt.Sprintf("%s has missed > %d%% of the slashing window's blocks on %s", cc.valInfo.Moniker, cc.Alerts.Window, cc.ChainId), - "critical", + "info", false, &id, ) @@ -585,8 +586,8 @@ func (cc *ChainConfig) watch() { nodeAlarms[node.Url] = true // used to keep active alert count correct td.alert( cc.name, - fmt.Sprintf("RPC node %s has been down for > %d minutes on %s", node.Url, td.NodeDownMin, cc.ChainId), - "critical", + fmt.Sprintf("Severity: %s\nRPC node %s has been down for > %d minutes on %s", td.NodeDownSeverity, node.Url, td.NodeDownMin, cc.ChainId), + td.NodeDownSeverity, false, &node.Url, ) @@ -596,7 +597,7 @@ func (cc *ChainConfig) watch() { node.wasDown = false td.alert( cc.name, - fmt.Sprintf("RPC node %s has been down for > %d minutes on %s", node.Url, td.NodeDownMin, cc.ChainId), + fmt.Sprintf("Severity: %s\nRPC node %s has been down for > %d minutes on %s", td.NodeDownSeverity, node.Url, 
td.NodeDownMin, cc.ChainId), "info", true, &node.Url, diff --git a/td2/types.go b/td2/types.go index 0e98275..de72abb 100644 --- a/td2/types.go +++ b/td2/types.go @@ -48,6 +48,8 @@ type Config struct { // NodeDownMin controls how long we wait before sending an alert that a node is not responding or has // fallen behind. NodeDownMin int `yaml:"node_down_alert_minutes"` + // NodeDownSeverity sets the PagerDuty severity used for node-down alerts; it is read from node_down_alert_severity. + NodeDownSeverity string `yaml:"node_down_alert_severity"` // Prom controls if the prometheus exporter is enabled. Prom bool `yaml:"prometheus_enabled"`