diff --git a/CHANGELOG.md b/CHANGELOG.md index 779636edb5..5e6999a7f5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -54,6 +54,7 @@ * Optimized labels regex matchers for patterns containing literals (eg. `foo.*`, `.*foo`, `.*foo.*`) * [ENHANCEMENT] Add metric `cortex_ruler_config_update_failures_total` to Ruler to track failures of loading rules files. #2857 * [ENHANCEMENT] Experimental Alertmanager: Alertmanager configuration persisted to object storage using an experimental API that accepts and returns YAML-based Alertmanager configuration. #2768 +* [ENHANCEMENT] Ruler: `-ruler.alertmanager-url` now supports multiple URLs. Each URL is treated as a separate Alertmanager group. Support for multiple Alertmanagers in a group can be achieved by using DNS service discovery. #2851 * [BUGFIX] Fixed a bug in the index intersect code causing storage to return more chunks/series than required. #2796 * [BUGFIX] Fixed the number of reported keys in the background cache queue. #2764 * [BUGFIX] Fix race in processing of headers in sharded queries. #2762 diff --git a/docs/configuration/config-file-reference.md b/docs/configuration/config-file-reference.md index 8f12a6c881..37b1065e75 100644 --- a/docs/configuration/config-file-reference.md +++ b/docs/configuration/config-file-reference.md @@ -1000,19 +1000,22 @@ storage: # CLI flag: -ruler.rule-path [rule_path: | default = "/rules"] -# URL of the Alertmanager to send notifications to. +# Space-separated list of URL(s) of the Alertmanager(s) to send notifications +# to. Each Alertmanager URL is treated as a separate group in the configuration. +# Multiple Alertmanagers in HA per group can be supported by using DNS +# resolution via -ruler.alertmanager-discovery. # CLI flag: -ruler.alertmanager-url -[alertmanager_url: | default = ] +[alertmanager_url: | default = ] -# Use DNS SRV records to discover alertmanager hosts. +# Use DNS SRV records to discover Alertmanager hosts. 
# CLI flag: -ruler.alertmanager-discovery [enable_alertmanager_discovery: | default = false] -# How long to wait between refreshing alertmanager hosts. +# How long to wait between refreshing DNS resolutions of Alertmanager hosts. # CLI flag: -ruler.alertmanager-refresh-interval [alertmanager_refresh_interval: | default = 1m] -# If enabled requests to alertmanager will utilize the V2 API. +# If enabled requests to Alertmanager will utilize the V2 API. # CLI flag: -ruler.alertmanager-use-v2 [enable_alertmanager_v2: | default = false] diff --git a/pkg/ruler/notifier.go b/pkg/ruler/notifier.go index fb36d7e808..6407e52301 100644 --- a/pkg/ruler/notifier.go +++ b/pkg/ruler/notifier.go @@ -3,7 +3,8 @@ package ruler import ( "context" "fmt" - "strings" + "net/url" + "regexp" "sync" gklog "github.com/go-kit/kit/log" @@ -74,68 +75,86 @@ func (rn *rulerNotifier) stop() { // Builds a Prometheus config.Config from a ruler.Config with just the required // options to configure notifications to Alertmanager. func buildNotifierConfig(rulerConfig *Config) (*config.Config, error) { - if rulerConfig.AlertmanagerURL.URL == nil { - return &config.Config{}, nil - } + validURLs := make([]*url.URL, 0, len(rulerConfig.AlertmanagerURL)) - u := rulerConfig.AlertmanagerURL - var sdConfig sd_config.ServiceDiscoveryConfig - if rulerConfig.AlertmanagerDiscovery { - if !strings.Contains(u.Host, "_tcp.") { - return nil, fmt.Errorf("When alertmanager-discovery is on, host name must be of the form _portname._tcp.service.fqdn (is %q)", u.Host) + srvDNSregexp := regexp.MustCompile(`^_.+\._.+`) + for _, h := range rulerConfig.AlertmanagerURL { + url, err := url.Parse(h) + if err != nil { + return nil, err } - dnsSDConfig := dns.SDConfig{ - Names: []string{u.Host}, - RefreshInterval: model.Duration(rulerConfig.AlertmanagerRefreshInterval), - Type: "SRV", - Port: 0, // Ignored, because of SRV. 
- } - sdConfig = sd_config.ServiceDiscoveryConfig{ - DNSSDConfigs: []*dns.SDConfig{&dnsSDConfig}, + + if url.String() == "" { + continue } - } else { - sdConfig = sd_config.ServiceDiscoveryConfig{ - StaticConfigs: []*targetgroup.Group{ - { - Targets: []model.LabelSet{ - { - model.AddressLabel: model.LabelValue(u.Host), - }, - }, - }, - }, + + // Given we only support SRV lookups as part of service discovery, we need to ensure + // hosts provided follow this specification: _service._proto.name + // e.g. _http._tcp.alertmanager.com + if rulerConfig.AlertmanagerDiscovery && !srvDNSregexp.MatchString(url.Host) { + return nil, fmt.Errorf("when alertmanager-discovery is on, host name must be of the form _portname._tcp.service.fqdn (is %q)", url.Host) } + + validURLs = append(validURLs, url) } - amConfig := &config.AlertmanagerConfig{ - APIVersion: config.AlertmanagerAPIVersionV1, - Scheme: u.Scheme, - PathPrefix: u.Path, - Timeout: model.Duration(rulerConfig.NotificationTimeout), - ServiceDiscoveryConfig: sdConfig, + if len(validURLs) == 0 { + return &config.Config{}, nil } + apiVersion := config.AlertmanagerAPIVersionV1 if rulerConfig.AlertmanangerEnableV2API { - amConfig.APIVersion = config.AlertmanagerAPIVersionV2 + apiVersion = config.AlertmanagerAPIVersionV2 + } + + amConfigs := make([]*config.AlertmanagerConfig, 0, len(validURLs)) + for _, url := range validURLs { + amConfigs = append(amConfigs, amConfigFromURL(rulerConfig, url, apiVersion)) } promConfig := &config.Config{ AlertingConfig: config.AlertingConfig{ - AlertmanagerConfigs: []*config.AlertmanagerConfig{amConfig}, + AlertmanagerConfigs: amConfigs, }, } - if u.User != nil { + return promConfig, nil +} + +func amConfigFromURL(rulerConfig *Config, url *url.URL, apiVersion config.AlertmanagerAPIVersion) *config.AlertmanagerConfig { + var sdConfig sd_config.ServiceDiscoveryConfig + if rulerConfig.AlertmanagerDiscovery { + sdConfig.DNSSDConfigs = []*dns.SDConfig{{ + Names: []string{url.Host}, + RefreshInterval: 
model.Duration(rulerConfig.AlertmanagerRefreshInterval), + Type: "SRV", + Port: 0, // Ignored, because of SRV. + }} + } else { + sdConfig.StaticConfigs = []*targetgroup.Group{{ + Targets: []model.LabelSet{{model.AddressLabel: model.LabelValue(url.Host)}}, + }} + } + + amConfig := &config.AlertmanagerConfig{ + APIVersion: apiVersion, + Scheme: url.Scheme, + PathPrefix: url.Path, + Timeout: model.Duration(rulerConfig.NotificationTimeout), + ServiceDiscoveryConfig: sdConfig, + } + + if url.User != nil { amConfig.HTTPClientConfig = config_util.HTTPClientConfig{ BasicAuth: &config_util.BasicAuth{ - Username: u.User.Username(), + Username: url.User.Username(), }, } - if password, isSet := u.User.Password(); isSet { + if password, isSet := url.User.Password(); isSet { amConfig.HTTPClientConfig.BasicAuth.Password = config_util.Secret(password) } } - return promConfig, nil + return amConfig } diff --git a/pkg/ruler/notifier_test.go b/pkg/ruler/notifier_test.go new file mode 100644 index 0000000000..9125683f69 --- /dev/null +++ b/pkg/ruler/notifier_test.go @@ -0,0 +1,190 @@ +package ruler + +import ( + "fmt" + "testing" + "time" + + config_util "github.com/prometheus/common/config" + "github.com/prometheus/common/model" + "github.com/prometheus/prometheus/config" + sd_config "github.com/prometheus/prometheus/discovery/config" + "github.com/prometheus/prometheus/discovery/dns" + "github.com/prometheus/prometheus/discovery/targetgroup" + "github.com/stretchr/testify/require" +) + +func TestBuildNotifierConfig(t *testing.T) { + tests := []struct { + name string + cfg *Config + ncfg *config.Config + err error + }{ + { + name: "with no valid hosts, returns an empty config", + cfg: &Config{}, + ncfg: &config.Config{}, + }, + { + name: "with a single URL and no service discovery", + cfg: &Config{ + AlertmanagerURL: []string{"http://alertmanager.default.svc.cluster.local/alertmanager"}, + }, + ncfg: &config.Config{ + AlertingConfig: config.AlertingConfig{ + AlertmanagerConfigs: 
[]*config.AlertmanagerConfig{ + { + APIVersion: "v1", + Scheme: "http", + PathPrefix: "/alertmanager", + ServiceDiscoveryConfig: sd_config.ServiceDiscoveryConfig{StaticConfigs: []*targetgroup.Group{{ + Targets: []model.LabelSet{{"__address__": "alertmanager.default.svc.cluster.local"}}, + }}}, + }, + }, + }, + }, + }, + { + name: "with a single URL and service discovery", + cfg: &Config{ + AlertmanagerURL: []string{"http://_http._tcp.alertmanager.default.svc.cluster.local/alertmanager"}, + AlertmanagerDiscovery: true, + AlertmanagerRefreshInterval: time.Duration(60), + }, + ncfg: &config.Config{ + AlertingConfig: config.AlertingConfig{ + AlertmanagerConfigs: []*config.AlertmanagerConfig{ + { + APIVersion: "v1", + Scheme: "http", + PathPrefix: "/alertmanager", + ServiceDiscoveryConfig: sd_config.ServiceDiscoveryConfig{DNSSDConfigs: []*dns.SDConfig{{ + Names: []string{"_http._tcp.alertmanager.default.svc.cluster.local"}, + RefreshInterval: 60, + Type: "SRV", + Port: 0, + }}}, + }, + }, + }, + }, + }, + { + name: "with service discovery and an invalid URL", + cfg: &Config{ + AlertmanagerURL: []string{"http://_http.default.svc.cluster.local/alertmanager"}, + AlertmanagerDiscovery: true, + }, + err: fmt.Errorf("when alertmanager-discovery is on, host name must be of the form _portname._tcp.service.fqdn (is \"_http.default.svc.cluster.local\")"), + }, + { + name: "with multiple URLs and no service discovery", + cfg: &Config{ + AlertmanagerURL: []string{ + "http://alertmanager-0.default.svc.cluster.local/alertmanager", + "http://alertmanager-1.default.svc.cluster.local/alertmanager", + }, + }, + ncfg: &config.Config{ + AlertingConfig: config.AlertingConfig{ + AlertmanagerConfigs: []*config.AlertmanagerConfig{ + { + APIVersion: "v1", + Scheme: "http", + PathPrefix: "/alertmanager", + ServiceDiscoveryConfig: sd_config.ServiceDiscoveryConfig{StaticConfigs: []*targetgroup.Group{{ + Targets: []model.LabelSet{{"__address__": "alertmanager-0.default.svc.cluster.local"}}, 
+ }}}, + }, + { + APIVersion: "v1", + Scheme: "http", + PathPrefix: "/alertmanager", + ServiceDiscoveryConfig: sd_config.ServiceDiscoveryConfig{StaticConfigs: []*targetgroup.Group{{ + Targets: []model.LabelSet{{"__address__": "alertmanager-1.default.svc.cluster.local"}}, + }}}, + }, + }, + }, + }, + }, + { + name: "with multiple URLs and service discovery", + cfg: &Config{ + AlertmanagerURL: []string{ + "http://_http._tcp.alertmanager-0.default.svc.cluster.local/alertmanager", + "http://_http._tcp.alertmanager-1.default.svc.cluster.local/alertmanager", + }, + AlertmanagerDiscovery: true, + AlertmanagerRefreshInterval: time.Duration(60), + }, + ncfg: &config.Config{ + AlertingConfig: config.AlertingConfig{ + AlertmanagerConfigs: []*config.AlertmanagerConfig{ + { + APIVersion: "v1", + Scheme: "http", + PathPrefix: "/alertmanager", + ServiceDiscoveryConfig: sd_config.ServiceDiscoveryConfig{DNSSDConfigs: []*dns.SDConfig{{ + Names: []string{"_http._tcp.alertmanager-0.default.svc.cluster.local"}, + RefreshInterval: 60, + Type: "SRV", + Port: 0, + }}}, + }, + { + APIVersion: "v1", + Scheme: "http", + PathPrefix: "/alertmanager", + ServiceDiscoveryConfig: sd_config.ServiceDiscoveryConfig{DNSSDConfigs: []*dns.SDConfig{{ + Names: []string{"_http._tcp.alertmanager-1.default.svc.cluster.local"}, + RefreshInterval: 60, + Type: "SRV", + Port: 0, + }}}, + }, + }, + }, + }, + }, + { + name: "with Basic Authentication", + cfg: &Config{ + AlertmanagerURL: []string{ + "http://marco:hunter2@alertmanager-0.default.svc.cluster.local/alertmanager", + }, + }, + ncfg: &config.Config{ + AlertingConfig: config.AlertingConfig{ + AlertmanagerConfigs: []*config.AlertmanagerConfig{ + { + HTTPClientConfig: config_util.HTTPClientConfig{ + BasicAuth: &config_util.BasicAuth{Username: "marco", Password: "hunter2"}, + }, + APIVersion: "v1", + Scheme: "http", + PathPrefix: "/alertmanager", + ServiceDiscoveryConfig: sd_config.ServiceDiscoveryConfig{StaticConfigs: []*targetgroup.Group{{ + Targets: 
[]model.LabelSet{{"__address__": "alertmanager-0.default.svc.cluster.local"}}, + }}}, + }, + }, + }, + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + ncfg, err := buildNotifierConfig(tt.cfg) + if tt.err == nil { + require.NoError(t, err) + require.Equal(t, tt.ncfg, ncfg) + } else { + require.EqualError(t, err, tt.err.Error()) + } + }) + } +} diff --git a/pkg/ruler/ruler.go b/pkg/ruler/ruler.go index 7e3c5f3264..ad915b656d 100644 --- a/pkg/ruler/ruler.go +++ b/pkg/ruler/ruler.go @@ -81,12 +81,12 @@ type Config struct { RulePath string `yaml:"rule_path"` // URL of the Alertmanager to send notifications to. - AlertmanagerURL flagext.URLValue `yaml:"alertmanager_url"` - // Whether to use DNS SRV records to discover alertmanagers. + AlertmanagerURL flagext.StringSlice `yaml:"alertmanager_url"` + // Whether to use DNS SRV records to discover Alertmanagers. AlertmanagerDiscovery bool `yaml:"enable_alertmanager_discovery"` - // How long to wait between refreshing the list of alertmanagers based on DNS service discovery. + // How long to wait between refreshing the list of Alertmanagers based on DNS service discovery. AlertmanagerRefreshInterval time.Duration `yaml:"alertmanager_refresh_interval"` - // Enables the ruler notifier to use the alertmananger V2 API. + // Enables the ruler notifier to use the Alertmanager V2 API. AlertmanangerEnableV2API bool `yaml:"enable_alertmanager_v2"` // Capacity of the queue for notifications to be sent to the Alertmanager. 
NotificationQueueCapacity int `yaml:"notification_queue_capacity"` @@ -133,12 +133,14 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) { f.DurationVar(&cfg.EvaluationInterval, "ruler.evaluation-interval", 1*time.Minute, "How frequently to evaluate rules") f.DurationVar(&cfg.EvaluationDelay, "ruler.evaluation-delay-duration", 0, "Duration to delay the evaluation of rules to ensure they underlying metrics have been pushed to cortex.") f.DurationVar(&cfg.PollInterval, "ruler.poll-interval", 1*time.Minute, "How frequently to poll for rule changes") - f.Var(&cfg.AlertmanagerURL, "ruler.alertmanager-url", "URL of the Alertmanager to send notifications to.") - f.BoolVar(&cfg.AlertmanagerDiscovery, "ruler.alertmanager-discovery", false, "Use DNS SRV records to discover alertmanager hosts.") - f.DurationVar(&cfg.AlertmanagerRefreshInterval, "ruler.alertmanager-refresh-interval", 1*time.Minute, "How long to wait between refreshing alertmanager hosts.") - f.BoolVar(&cfg.AlertmanangerEnableV2API, "ruler.alertmanager-use-v2", false, "If enabled requests to alertmanager will utilize the V2 API.") + + f.Var(&cfg.AlertmanagerURL, "ruler.alertmanager-url", "Space-separated list of URL(s) of the Alertmanager(s) to send notifications to. Each Alertmanager URL is treated as a separate group in the configuration. 
Multiple Alertmanagers in HA per group can be supported by using DNS resolution via -ruler.alertmanager-discovery.") + f.BoolVar(&cfg.AlertmanagerDiscovery, "ruler.alertmanager-discovery", false, "Use DNS SRV records to discover Alertmanager hosts.") + f.DurationVar(&cfg.AlertmanagerRefreshInterval, "ruler.alertmanager-refresh-interval", 1*time.Minute, "How long to wait between refreshing DNS resolutions of Alertmanager hosts.") + f.BoolVar(&cfg.AlertmanangerEnableV2API, "ruler.alertmanager-use-v2", false, "If enabled requests to Alertmanager will utilize the V2 API.") f.IntVar(&cfg.NotificationQueueCapacity, "ruler.notification-queue-capacity", 10000, "Capacity of the queue for notifications to be sent to the Alertmanager.") f.DurationVar(&cfg.NotificationTimeout, "ruler.notification-timeout", 10*time.Second, "HTTP timeout duration when sending notifications to the Alertmanager.") + f.DurationVar(&cfg.SearchPendingFor, "ruler.search-pending-for", 5*time.Minute, "Time to spend searching for a pending ruler when shutting down.") f.BoolVar(&cfg.EnableSharding, "ruler.enable-sharding", false, "Distribute rule evaluation using ring backend") f.DurationVar(&cfg.FlushCheckPeriod, "ruler.flush-period", 1*time.Minute, "Period with which to attempt to flush rule groups.")