Skip to content

Commit

Permalink
Support multiple Alertmanagers explicitly in the Ruler (#2851)
Browse files Browse the repository at this point in the history
* Support Multiple Alertmanager URLs

At the moment, the Ruler only supports sending alerts to multiple
Alertmanagers via DNS based service discovery. However, sending to
multiple Alertmanager groups is something that Prometheus allows.

I believe the Ruler should support this too and this commit introduces
that.

To keep backward compatibility we're reusing the same flag but allowing
a list of 1+. With this, we can treat each URL as an Alertmanager group
where multiple Alertmanagers per group is only supported if DNS service
discovery is enabled.

Signed-off-by: gotjosh <[email protected]>

* Address review comments

Signed-off-by: gotjosh <[email protected]>

* Add the host check when service discovery is enabled

We need to ensure the addresses provided follow the SRV DNS specification
format.

Signed-off-by: gotjosh <[email protected]>

* It is actually space-separated

Signed-off-by: gotjosh <[email protected]>

* Update pkg/ruler/notifier.go

Signed-off-by: Marco Pracucci <[email protected]>

* Update pkg/ruler/notifier_test.go

Signed-off-by: Marco Pracucci <[email protected]>

Co-authored-by: Marco Pracucci <[email protected]>
  • Loading branch information
gotjosh and pracucci authored Jul 14, 2020
1 parent 192fc86 commit 6267964
Show file tree
Hide file tree
Showing 5 changed files with 268 additions and 53 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
* Optimized labels regex matchers for patterns containing literals (eg. `foo.*`, `.*foo`, `.*foo.*`)
* [ENHANCEMENT] Add metric `cortex_ruler_config_update_failures_total` to Ruler to track failures of loading rules files. #2857
* [ENHANCEMENT] Experimental Alertmanager: Alertmanager configuration persisted to object storage using an experimental API that accepts and returns YAML-based Alertmanager configuration. #2768
* [ENHANCEMENT] Ruler: `-ruler.alertmanager-url` now supports multiple URLs. Each URL is treated as a separate Alertmanager group. Support for multiple Alertmanagers in a group can be achieved by using DNS service discovery. #2851
* [BUGFIX] Fixed a bug in the index intersect code causing storage to return more chunks/series than required. #2796
* [BUGFIX] Fixed the number of reported keys in the background cache queue. #2764
* [BUGFIX] Fix race in processing of headers in sharded queries. #2762
Expand Down
13 changes: 8 additions & 5 deletions docs/configuration/config-file-reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -1000,19 +1000,22 @@ storage:
# CLI flag: -ruler.rule-path
[rule_path: <string> | default = "/rules"]

# URL of the Alertmanager to send notifications to.
# Space-separated list of URL(s) of the Alertmanager(s) to send notifications
# to. Each Alertmanager URL is treated as a separate group in the configuration.
# Multiple Alertmanagers in HA per group can be supported by using DNS
# resolution via -ruler.alertmanager-discovery.
# CLI flag: -ruler.alertmanager-url
[alertmanager_url: <url> | default = ]
[alertmanager_url: <list of string> | default = ]

# Use DNS SRV records to discover alertmanager hosts.
# Use DNS SRV records to discover Alertmanager hosts.
# CLI flag: -ruler.alertmanager-discovery
[enable_alertmanager_discovery: <boolean> | default = false]

# How long to wait between refreshing alertmanager hosts.
# How long to wait between refreshing DNS resolutions of Alertmanager hosts.
# CLI flag: -ruler.alertmanager-refresh-interval
[alertmanager_refresh_interval: <duration> | default = 1m]

# If enabled requests to alertmanager will utilize the V2 API.
# If enabled requests to Alertmanager will utilize the V2 API.
# CLI flag: -ruler.alertmanager-use-v2
[enable_alertmanager_v2: <boolean> | default = false]

Expand Down
99 changes: 59 additions & 40 deletions pkg/ruler/notifier.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@ package ruler
import (
"context"
"fmt"
"strings"
"net/url"
"regexp"
"sync"

gklog "github.com/go-kit/kit/log"
Expand Down Expand Up @@ -74,68 +75,86 @@ func (rn *rulerNotifier) stop() {
// Builds a Prometheus config.Config from a ruler.Config with just the required
// options to configure notifications to Alertmanager.
//
// Each non-empty entry of rulerConfig.AlertmanagerURL is parsed and turned into
// its own AlertmanagerConfig via amConfigFromURL. A parse failure, or a host
// that fails the SRV-name check while AlertmanagerDiscovery is enabled, is
// returned as an error. When no usable URL remains, an empty config.Config is
// returned.
func buildNotifierConfig(rulerConfig *Config) (*config.Config, error) {
if rulerConfig.AlertmanagerURL.URL == nil {
return &config.Config{}, nil
}
validURLs := make([]*url.URL, 0, len(rulerConfig.AlertmanagerURL))

u := rulerConfig.AlertmanagerURL
var sdConfig sd_config.ServiceDiscoveryConfig
if rulerConfig.AlertmanagerDiscovery {
if !strings.Contains(u.Host, "_tcp.") {
return nil, fmt.Errorf("When alertmanager-discovery is on, host name must be of the form _portname._tcp.service.fqdn (is %q)", u.Host)
// NOTE(review): the "." in this pattern is unescaped, so it matches any
// character rather than a literal dot. The pattern is also recompiled on
// every call because it is built inside the function.
srvDNSregexp := regexp.MustCompile(`^_.+._.+`)
for _, h := range rulerConfig.AlertmanagerURL {
url, err := url.Parse(h)
if err != nil {
return nil, err
}
dnsSDConfig := dns.SDConfig{
Names: []string{u.Host},
RefreshInterval: model.Duration(rulerConfig.AlertmanagerRefreshInterval),
Type: "SRV",
Port: 0, // Ignored, because of SRV.
}
sdConfig = sd_config.ServiceDiscoveryConfig{
DNSSDConfigs: []*dns.SDConfig{&dnsSDConfig},

// Entries that parse to an empty URL are skipped rather than rejected.
if url.String() == "" {
continue
}
} else {
sdConfig = sd_config.ServiceDiscoveryConfig{
StaticConfigs: []*targetgroup.Group{
{
Targets: []model.LabelSet{
{
model.AddressLabel: model.LabelValue(u.Host),
},
},
},
},

// Given we only support SRV lookups as part of service discovery, we need to ensure
// hosts provided follow this specification: _service._proto.name
// e.g. _http._tcp.alertmanager.com
if rulerConfig.AlertmanagerDiscovery && !srvDNSregexp.MatchString(url.Host) {
return nil, fmt.Errorf("when alertmanager-discovery is on, host name must be of the form _portname._tcp.service.fqdn (is %q)", url.Host)
}

validURLs = append(validURLs, url)
}

amConfig := &config.AlertmanagerConfig{
APIVersion: config.AlertmanagerAPIVersionV1,
Scheme: u.Scheme,
PathPrefix: u.Path,
Timeout: model.Duration(rulerConfig.NotificationTimeout),
ServiceDiscoveryConfig: sdConfig,
// Nothing usable was configured: hand back an empty Prometheus config.
if len(validURLs) == 0 {
return &config.Config{}, nil
}

// The V1 API is the default; the V2 API is selected only when the flag is set.
apiVersion := config.AlertmanagerAPIVersionV1
if rulerConfig.AlertmanangerEnableV2API {
amConfig.APIVersion = config.AlertmanagerAPIVersionV2
apiVersion = config.AlertmanagerAPIVersionV2
}

// One AlertmanagerConfig is built per validated URL.
amConfigs := make([]*config.AlertmanagerConfig, 0, len(validURLs))
for _, url := range validURLs {
amConfigs = append(amConfigs, amConfigFromURL(rulerConfig, url, apiVersion))
}

promConfig := &config.Config{
AlertingConfig: config.AlertingConfig{
AlertmanagerConfigs: []*config.AlertmanagerConfig{amConfig},
AlertmanagerConfigs: amConfigs,
},
}

if u.User != nil {
return promConfig, nil
}

// amConfigFromURL builds the Prometheus AlertmanagerConfig for a single
// Alertmanager URL. With rulerConfig.AlertmanagerDiscovery set, url.Host is
// used as a DNS SRV name refreshed every AlertmanagerRefreshInterval;
// otherwise url.Host becomes a single static target. The scheme and path
// prefix come from the URL, the timeout from rulerConfig.NotificationTimeout,
// and basic-auth credentials from the URL's userinfo when present.
func amConfigFromURL(rulerConfig *Config, url *url.URL, apiVersion config.AlertmanagerAPIVersion) *config.AlertmanagerConfig {
var sdConfig sd_config.ServiceDiscoveryConfig
if rulerConfig.AlertmanagerDiscovery {
sdConfig.DNSSDConfigs = []*dns.SDConfig{{
Names: []string{url.Host},
RefreshInterval: model.Duration(rulerConfig.AlertmanagerRefreshInterval),
Type: "SRV",
Port: 0, // Ignored, because of SRV.
}}
} else {
sdConfig.StaticConfigs = []*targetgroup.Group{{
Targets: []model.LabelSet{{model.AddressLabel: model.LabelValue(url.Host)}},
}}
}

amConfig := &config.AlertmanagerConfig{
APIVersion: apiVersion,
Scheme: url.Scheme,
PathPrefix: url.Path,
Timeout: model.Duration(rulerConfig.NotificationTimeout),
ServiceDiscoveryConfig: sdConfig,
}

// Credentials embedded in the URL become HTTP basic auth; the password is
// only set when the URL actually carries one.
if url.User != nil {
amConfig.HTTPClientConfig = config_util.HTTPClientConfig{
BasicAuth: &config_util.BasicAuth{
Username: u.User.Username(),
Username: url.User.Username(),
},
}

if password, isSet := u.User.Password(); isSet {
if password, isSet := url.User.Password(); isSet {
amConfig.HTTPClientConfig.BasicAuth.Password = config_util.Secret(password)
}
}

return promConfig, nil
return amConfig
}
190 changes: 190 additions & 0 deletions pkg/ruler/notifier_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,190 @@
package ruler

import (
"fmt"
"testing"
"time"

config_util "github.com/prometheus/common/config"
"github.com/prometheus/common/model"
"github.com/prometheus/prometheus/config"
sd_config "github.com/prometheus/prometheus/discovery/config"
"github.com/prometheus/prometheus/discovery/dns"
"github.com/prometheus/prometheus/discovery/targetgroup"
"github.com/stretchr/testify/require"
)

// TestBuildNotifierConfig checks that buildNotifierConfig turns a ruler Config
// into the expected Prometheus notifier configuration: one AlertmanagerConfig
// per configured URL, static targets or DNS SRV discovery depending on
// AlertmanagerDiscovery, basic-auth credentials taken from the URL, and an
// error when discovery is enabled with a host that is not an SRV name.
func TestBuildNotifierConfig(t *testing.T) {
	tests := []struct {
		name string
		cfg  *Config
		ncfg *config.Config
		err  error
	}{
		{
			name: "with no valid hosts, returns an empty config",
			cfg:  &Config{},
			ncfg: &config.Config{},
		},
		{
			name: "with a single URL and no service discovery",
			cfg: &Config{
				AlertmanagerURL: []string{"http://alertmanager.default.svc.cluster.local/alertmanager"},
			},
			ncfg: &config.Config{
				AlertingConfig: config.AlertingConfig{
					AlertmanagerConfigs: []*config.AlertmanagerConfig{
						{
							APIVersion: "v1",
							Scheme:     "http",
							PathPrefix: "/alertmanager",
							ServiceDiscoveryConfig: sd_config.ServiceDiscoveryConfig{StaticConfigs: []*targetgroup.Group{{
								Targets: []model.LabelSet{{"__address__": "alertmanager.default.svc.cluster.local"}},
							}}},
						},
					},
				},
			},
		},
		{
			name: "with a single URL and service discovery",
			cfg: &Config{
				AlertmanagerURL:             []string{"http://_http._tcp.alertmanager.default.svc.cluster.local/alertmanager"},
				AlertmanagerDiscovery:       true,
				AlertmanagerRefreshInterval: time.Duration(60),
			},
			ncfg: &config.Config{
				AlertingConfig: config.AlertingConfig{
					AlertmanagerConfigs: []*config.AlertmanagerConfig{
						{
							APIVersion: "v1",
							Scheme:     "http",
							PathPrefix: "/alertmanager",
							ServiceDiscoveryConfig: sd_config.ServiceDiscoveryConfig{DNSSDConfigs: []*dns.SDConfig{{
								Names:           []string{"_http._tcp.alertmanager.default.svc.cluster.local"},
								RefreshInterval: 60,
								Type:            "SRV",
								Port:            0,
							}}},
						},
					},
				},
			},
		},
		{
			name: "with service discovery and an invalid URL",
			cfg: &Config{
				AlertmanagerURL:       []string{"http://_http.default.svc.cluster.local/alertmanager"},
				AlertmanagerDiscovery: true,
			},
			// The quoted host must be the one from the configured URL above.
			err: fmt.Errorf("when alertmanager-discovery is on, host name must be of the form _portname._tcp.service.fqdn (is \"_http.default.svc.cluster.local\")"),
		},
		{
			name: "with multiple URLs and no service discovery",
			cfg: &Config{
				AlertmanagerURL: []string{
					"http://alertmanager-0.default.svc.cluster.local/alertmanager",
					"http://alertmanager-1.default.svc.cluster.local/alertmanager",
				},
			},
			ncfg: &config.Config{
				AlertingConfig: config.AlertingConfig{
					AlertmanagerConfigs: []*config.AlertmanagerConfig{
						{
							APIVersion: "v1",
							Scheme:     "http",
							PathPrefix: "/alertmanager",
							ServiceDiscoveryConfig: sd_config.ServiceDiscoveryConfig{StaticConfigs: []*targetgroup.Group{{
								Targets: []model.LabelSet{{"__address__": "alertmanager-0.default.svc.cluster.local"}},
							}}},
						},
						{
							APIVersion: "v1",
							Scheme:     "http",
							PathPrefix: "/alertmanager",
							ServiceDiscoveryConfig: sd_config.ServiceDiscoveryConfig{StaticConfigs: []*targetgroup.Group{{
								Targets: []model.LabelSet{{"__address__": "alertmanager-1.default.svc.cluster.local"}},
							}}},
						},
					},
				},
			},
		},
		{
			name: "with multiple URLs and service discovery",
			cfg: &Config{
				AlertmanagerURL: []string{
					"http://_http._tcp.alertmanager-0.default.svc.cluster.local/alertmanager",
					"http://_http._tcp.alertmanager-1.default.svc.cluster.local/alertmanager",
				},
				AlertmanagerDiscovery:       true,
				AlertmanagerRefreshInterval: time.Duration(60),
			},
			ncfg: &config.Config{
				AlertingConfig: config.AlertingConfig{
					AlertmanagerConfigs: []*config.AlertmanagerConfig{
						{
							APIVersion: "v1",
							Scheme:     "http",
							PathPrefix: "/alertmanager",
							ServiceDiscoveryConfig: sd_config.ServiceDiscoveryConfig{DNSSDConfigs: []*dns.SDConfig{{
								Names:           []string{"_http._tcp.alertmanager-0.default.svc.cluster.local"},
								RefreshInterval: 60,
								Type:            "SRV",
								Port:            0,
							}}},
						},
						{
							APIVersion: "v1",
							Scheme:     "http",
							PathPrefix: "/alertmanager",
							ServiceDiscoveryConfig: sd_config.ServiceDiscoveryConfig{DNSSDConfigs: []*dns.SDConfig{{
								Names:           []string{"_http._tcp.alertmanager-1.default.svc.cluster.local"},
								RefreshInterval: 60,
								Type:            "SRV",
								Port:            0,
							}}},
						},
					},
				},
			},
		},
		{
			name: "with Basic Authentication",
			cfg: &Config{
				AlertmanagerURL: []string{
					// Credentials and host match the BasicAuth and target expected below.
					"http://marco:hunter2@alertmanager-0.default.svc.cluster.local/alertmanager",
				},
			},
			ncfg: &config.Config{
				AlertingConfig: config.AlertingConfig{
					AlertmanagerConfigs: []*config.AlertmanagerConfig{
						{
							HTTPClientConfig: config_util.HTTPClientConfig{
								BasicAuth: &config_util.BasicAuth{Username: "marco", Password: "hunter2"},
							},
							APIVersion: "v1",
							Scheme:     "http",
							PathPrefix: "/alertmanager",
							ServiceDiscoveryConfig: sd_config.ServiceDiscoveryConfig{StaticConfigs: []*targetgroup.Group{{
								Targets: []model.LabelSet{{"__address__": "alertmanager-0.default.svc.cluster.local"}},
							}}},
						},
					},
				},
			},
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			ncfg, err := buildNotifierConfig(tt.cfg)
			if tt.err == nil {
				require.NoError(t, err)
				require.Equal(t, tt.ncfg, ncfg)
			} else {
				// Compare the returned error against the expectation;
				// require.Error(t, tt.err, err) would only assert that the
				// expected error itself is non-nil.
				require.EqualError(t, err, tt.err.Error())
			}
		})
	}
}
18 changes: 10 additions & 8 deletions pkg/ruler/ruler.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,12 +81,12 @@ type Config struct {
RulePath string `yaml:"rule_path"`

// URL of the Alertmanager to send notifications to.
AlertmanagerURL flagext.URLValue `yaml:"alertmanager_url"`
// Whether to use DNS SRV records to discover alertmanagers.
AlertmanagerURL flagext.StringSlice `yaml:"alertmanager_url"`
// Whether to use DNS SRV records to discover Alertmanager.
AlertmanagerDiscovery bool `yaml:"enable_alertmanager_discovery"`
// How long to wait between refreshing the list of alertmanagers based on DNS service discovery.
// How long to wait between refreshing the list of Alertmanagers based on DNS service discovery.
AlertmanagerRefreshInterval time.Duration `yaml:"alertmanager_refresh_interval"`
// Enables the ruler notifier to use the alertmananger V2 API.
// Enables the ruler notifier to use the Alertmanager V2 API.
AlertmanangerEnableV2API bool `yaml:"enable_alertmanager_v2"`
// Capacity of the queue for notifications to be sent to the Alertmanager.
NotificationQueueCapacity int `yaml:"notification_queue_capacity"`
Expand Down Expand Up @@ -133,12 +133,14 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
f.DurationVar(&cfg.EvaluationInterval, "ruler.evaluation-interval", 1*time.Minute, "How frequently to evaluate rules")
f.DurationVar(&cfg.EvaluationDelay, "ruler.evaluation-delay-duration", 0, "Duration to delay the evaluation of rules to ensure they underlying metrics have been pushed to cortex.")
f.DurationVar(&cfg.PollInterval, "ruler.poll-interval", 1*time.Minute, "How frequently to poll for rule changes")
f.Var(&cfg.AlertmanagerURL, "ruler.alertmanager-url", "URL of the Alertmanager to send notifications to.")
f.BoolVar(&cfg.AlertmanagerDiscovery, "ruler.alertmanager-discovery", false, "Use DNS SRV records to discover alertmanager hosts.")
f.DurationVar(&cfg.AlertmanagerRefreshInterval, "ruler.alertmanager-refresh-interval", 1*time.Minute, "How long to wait between refreshing alertmanager hosts.")
f.BoolVar(&cfg.AlertmanangerEnableV2API, "ruler.alertmanager-use-v2", false, "If enabled requests to alertmanager will utilize the V2 API.")

f.Var(&cfg.AlertmanagerURL, "ruler.alertmanager-url", "Space-separated list of URL(s) of the Alertmanager(s) to send notifications to. Each Alertmanager URL is treated as a separate group in the configuration. Multiple Alertmanagers in HA per group can be supported by using DNS resolution via -ruler.alertmanager-discovery.")
f.BoolVar(&cfg.AlertmanagerDiscovery, "ruler.alertmanager-discovery", false, "Use DNS SRV records to discover Alertmanager hosts.")
f.DurationVar(&cfg.AlertmanagerRefreshInterval, "ruler.alertmanager-refresh-interval", 1*time.Minute, "How long to wait between refreshing DNS resolutions of Alertmanager hosts.")
f.BoolVar(&cfg.AlertmanangerEnableV2API, "ruler.alertmanager-use-v2", false, "If enabled requests to Alertmanager will utilize the V2 API.")
f.IntVar(&cfg.NotificationQueueCapacity, "ruler.notification-queue-capacity", 10000, "Capacity of the queue for notifications to be sent to the Alertmanager.")
f.DurationVar(&cfg.NotificationTimeout, "ruler.notification-timeout", 10*time.Second, "HTTP timeout duration when sending notifications to the Alertmanager.")

f.DurationVar(&cfg.SearchPendingFor, "ruler.search-pending-for", 5*time.Minute, "Time to spend searching for a pending ruler when shutting down.")
f.BoolVar(&cfg.EnableSharding, "ruler.enable-sharding", false, "Distribute rule evaluation using ring backend")
f.DurationVar(&cfg.FlushCheckPeriod, "ruler.flush-period", 1*time.Minute, "Period with which to attempt to flush rule groups.")
Expand Down

0 comments on commit 6267964

Please sign in to comment.