diff --git a/CHANGELOG.asciidoc b/CHANGELOG.asciidoc index 6eb5e5437b6..c3d19c960b5 100644 --- a/CHANGELOG.asciidoc +++ b/CHANGELOG.asciidoc @@ -40,6 +40,7 @@ https://github.com/elastic/beats/compare/v6.4.0...6.4[Check the HEAD diff] - Remove unix-like permission checks on Windows, so files can be opened. {issue}7849[7849] - Deregister pipeline loader callback when inputsRunner is stopped. {pull}7893[7893] - Replace index patterns in TSVB visualizations. {pull}7929[7929] +- Add backoff support to x-pack monitoring outputs. {issue}7966[7966] *Auditbeat* diff --git a/auditbeat/auditbeat.reference.yml b/auditbeat/auditbeat.reference.yml index c0d63317b2f..b31493eb02f 100644 --- a/auditbeat/auditbeat.reference.yml +++ b/auditbeat/auditbeat.reference.yml @@ -1143,6 +1143,17 @@ logging.files: # The default is 50. #bulk_max_size: 50 + # The number of seconds to wait before trying to reconnect to Elasticsearch + # after a network error. After waiting backoff.init seconds, the Beat + # tries to reconnect. If the attempt fails, the backoff timer is increased + # exponentially up to backoff.max. After a successful connection, the backoff + # timer is reset. The default is 1s. + #backoff.init: 1s + + # The maximum number of seconds to wait before attempting to connect to + # Elasticsearch after a network error. The default is 60s. + #backoff.max: 60s + # Configure http request timeout before failing an request to Elasticsearch. #timeout: 90 diff --git a/filebeat/filebeat.reference.yml b/filebeat/filebeat.reference.yml index f798e4ed301..c580a75598a 100644 --- a/filebeat/filebeat.reference.yml +++ b/filebeat/filebeat.reference.yml @@ -1803,6 +1803,17 @@ logging.files: # The default is 50. #bulk_max_size: 50 + # The number of seconds to wait before trying to reconnect to Elasticsearch + # after a network error. After waiting backoff.init seconds, the Beat + # tries to reconnect. If the attempt fails, the backoff timer is increased + # exponentially up to backoff.max. 
After a successful connection, the backoff + # timer is reset. The default is 1s. + #backoff.init: 1s + + # The maximum number of seconds to wait before attempting to connect to + # Elasticsearch after a network error. The default is 60s. + #backoff.max: 60s + # Configure http request timeout before failing an request to Elasticsearch. #timeout: 90 diff --git a/heartbeat/heartbeat.reference.yml b/heartbeat/heartbeat.reference.yml index ae75232c080..5247b15b81c 100644 --- a/heartbeat/heartbeat.reference.yml +++ b/heartbeat/heartbeat.reference.yml @@ -1250,6 +1250,17 @@ logging.files: # The default is 50. #bulk_max_size: 50 + # The number of seconds to wait before trying to reconnect to Elasticsearch + # after a network error. After waiting backoff.init seconds, the Beat + # tries to reconnect. If the attempt fails, the backoff timer is increased + # exponentially up to backoff.max. After a successful connection, the backoff + # timer is reset. The default is 1s. + #backoff.init: 1s + + # The maximum number of seconds to wait before attempting to connect to + # Elasticsearch after a network error. The default is 60s. + #backoff.max: 60s + # Configure http request timeout before failing an request to Elasticsearch. #timeout: 90 diff --git a/libbeat/_meta/config.reference.yml b/libbeat/_meta/config.reference.yml index 75caa03bc42..61066c4ac80 100644 --- a/libbeat/_meta/config.reference.yml +++ b/libbeat/_meta/config.reference.yml @@ -1036,6 +1036,17 @@ logging.files: # The default is 50. #bulk_max_size: 50 + # The number of seconds to wait before trying to reconnect to Elasticsearch + # after a network error. After waiting backoff.init seconds, the Beat + # tries to reconnect. If the attempt fails, the backoff timer is increased + # exponentially up to backoff.max. After a successful connection, the backoff + # timer is reset. The default is 1s. 
+ #backoff.init: 1s + + # The maximum number of seconds to wait before attempting to connect to + # Elasticsearch after a network error. The default is 60s. + #backoff.max: 60s + # Configure http request timeout before failing an request to Elasticsearch. #timeout: 90 diff --git a/libbeat/docs/monitoring/shared-monitor-config.asciidoc b/libbeat/docs/monitoring/shared-monitor-config.asciidoc index 2990d8ef7e3..2ea94649b45 100644 --- a/libbeat/docs/monitoring/shared-monitor-config.asciidoc +++ b/libbeat/docs/monitoring/shared-monitor-config.asciidoc @@ -39,6 +39,21 @@ configuration option contains the following fields: The maximum number of metrics to bulk in a single {es} bulk API index request. The default is `50`. For more information, see <>. +[float] +==== `backoff.init` + +The number of seconds to wait before trying to reconnect to Elasticsearch after +a network error. After waiting `backoff.init` seconds, {beatname_uc} tries to +reconnect. If the attempt fails, the backoff timer is increased exponentially up +to `backoff.max`. After a successful connection, the backoff timer is reset. The +default is 1s. + +[float] +==== `backoff.max` + +The maximum number of seconds to wait before attempting to connect to +Elasticsearch after a network error. The default is 60s. + [float] ==== `compression_level` @@ -79,10 +94,17 @@ The password that {beatname_uc} uses to authenticate with the {es} instances for shipping monitoring data. [float] -==== `period` +==== `metrics.period` The time interval (in seconds) when metrics are sent to the {es} cluster. A new snapshot of {beatname_uc} metrics is generated and scheduled for publishing each +period. The default value is 10 * time.Second. + +[float] +==== `state.period` + +The time interval (in seconds) when state information is sent to the {es} cluster. A new +snapshot of {beatname_uc} state is generated and scheduled for publishing each period. The default value is 60 * time.Second. 
[float] diff --git a/libbeat/monitoring/report/elasticsearch/config.go b/libbeat/monitoring/report/elasticsearch/config.go index 2856e6d88b8..8f59cf79bad 100644 --- a/libbeat/monitoring/report/elasticsearch/config.go +++ b/libbeat/monitoring/report/elasticsearch/config.go @@ -42,6 +42,12 @@ type config struct { BulkMaxSize int `config:"bulk_max_size" validate:"min=0"` BufferSize int `config:"buffer_size"` Tags []string `config:"tags"` + Backoff backoff `config:"backoff"` +} + +type backoff struct { + Init time.Duration + Max time.Duration } var defaultConfig = config{ @@ -61,4 +67,8 @@ var defaultConfig = config{ BulkMaxSize: 50, BufferSize: 50, Tags: nil, + Backoff: backoff{ + Init: 1 * time.Second, + Max: 60 * time.Second, + }, } diff --git a/libbeat/monitoring/report/elasticsearch/elasticsearch.go b/libbeat/monitoring/report/elasticsearch/elasticsearch.go index 7c9357abdd7..83115a9fe95 100644 --- a/libbeat/monitoring/report/elasticsearch/elasticsearch.go +++ b/libbeat/monitoring/report/elasticsearch/elasticsearch.go @@ -54,7 +54,8 @@ type reporter struct { // pipeline pipeline *pipeline.Pipeline client beat.Client - out outputs.Group + + out []outputs.NetworkClient } var debugf = logp.MakeDebug("monitoring") @@ -104,22 +105,21 @@ func makeReporter(beat beat.Info, cfg *common.Config) (report.Reporter, error) { params[k] = v } - out := outputs.Group{ - Clients: nil, - BatchSize: windowSize, - Retry: 0, // no retry. 
on error drop events - } - hosts, err := outputs.ReadHostList(cfg) if err != nil { return nil, err } + if len(hosts) == 0 { + return nil, errors.New("empty hosts list") + } + + var clients []outputs.NetworkClient for _, host := range hosts { client, err := makeClient(host, params, proxyURL, tlsConfig, &config) if err != nil { return nil, err } - out.Clients = append(out.Clients, client) + clients = append(clients, client) } queueFactory := func(e queue.Eventer) (queue.Queue, error) { @@ -131,10 +131,19 @@ func makeReporter(beat beat.Info, cfg *common.Config) (report.Reporter, error) { monitoring := monitoring.Default.GetRegistry("xpack.monitoring") + outClient := outputs.NewFailoverClient(clients) + outClient = outputs.WithBackoff(outClient, config.Backoff.Init, config.Backoff.Max) + pipeline, err := pipeline.New( beat, monitoring, - queueFactory, out, pipeline.Settings{ + queueFactory, + outputs.Group{ + Clients: []outputs.Client{outClient}, + BatchSize: windowSize, + Retry: 0, // no retry. Drop event on error. 
+ }, + pipeline.Settings{ WaitClose: 0, WaitCloseMode: pipeline.NoWaitOnClose, }) @@ -142,7 +151,7 @@ func makeReporter(beat beat.Info, cfg *common.Config) (report.Reporter, error) { return nil, err } - client, err := pipeline.Connect() + pipeConn, err := pipeline.Connect() if err != nil { pipeline.Close() return nil, err @@ -154,8 +163,8 @@ func makeReporter(beat beat.Info, cfg *common.Config) (report.Reporter, error) { tags: config.Tags, checkRetry: checkRetry, pipeline: pipeline, - client: client, - out: out, + client: pipeConn, + out: clients, } go r.initLoop(config) return r, nil @@ -175,7 +184,7 @@ func (r *reporter) initLoop(c config) { for { // Select one configured endpoint by random and check if xpack is available - client := r.out.Clients[rand.Intn(len(r.out.Clients))].(outputs.NetworkClient) + client := r.out[rand.Intn(len(r.out))] err := client.Connect() if err == nil { closing(client) diff --git a/metricbeat/metricbeat.reference.yml b/metricbeat/metricbeat.reference.yml index f9df4015a71..a5f55cb3210 100644 --- a/metricbeat/metricbeat.reference.yml +++ b/metricbeat/metricbeat.reference.yml @@ -1710,6 +1710,17 @@ logging.files: # The default is 50. #bulk_max_size: 50 + # The number of seconds to wait before trying to reconnect to Elasticsearch + # after a network error. After waiting backoff.init seconds, the Beat + # tries to reconnect. If the attempt fails, the backoff timer is increased + # exponentially up to backoff.max. After a successful connection, the backoff + # timer is reset. The default is 1s. + #backoff.init: 1s + + # The maximum number of seconds to wait before attempting to connect to + # Elasticsearch after a network error. The default is 60s. + #backoff.max: 60s + # Configure http request timeout before failing an request to Elasticsearch. 
#timeout: 90 diff --git a/packetbeat/packetbeat.reference.yml b/packetbeat/packetbeat.reference.yml index 76ba9223f16..fddd29ee53b 100644 --- a/packetbeat/packetbeat.reference.yml +++ b/packetbeat/packetbeat.reference.yml @@ -1513,6 +1513,17 @@ logging.files: # The default is 50. #bulk_max_size: 50 + # The number of seconds to wait before trying to reconnect to Elasticsearch + # after a network error. After waiting backoff.init seconds, the Beat + # tries to reconnect. If the attempt fails, the backoff timer is increased + # exponentially up to backoff.max. After a successful connection, the backoff + # timer is reset. The default is 1s. + #backoff.init: 1s + + # The maximum number of seconds to wait before attempting to connect to + # Elasticsearch after a network error. The default is 60s. + #backoff.max: 60s + # Configure http request timeout before failing an request to Elasticsearch. #timeout: 90 diff --git a/winlogbeat/winlogbeat.reference.yml b/winlogbeat/winlogbeat.reference.yml index f0ac22bdd65..b686b5746d6 100644 --- a/winlogbeat/winlogbeat.reference.yml +++ b/winlogbeat/winlogbeat.reference.yml @@ -1065,6 +1065,17 @@ logging.files: # The default is 50. #bulk_max_size: 50 + # The number of seconds to wait before trying to reconnect to Elasticsearch + # after a network error. After waiting backoff.init seconds, the Beat + # tries to reconnect. If the attempt fails, the backoff timer is increased + # exponentially up to backoff.max. After a successful connection, the backoff + # timer is reset. The default is 1s. + #backoff.init: 1s + + # The maximum number of seconds to wait before attempting to connect to + # Elasticsearch after a network error. The default is 60s. + #backoff.max: 60s + # Configure http request timeout before failing an request to Elasticsearch. #timeout: 90