From e90742796d69eef9317a576aad140b34554a27e1 Mon Sep 17 00:00:00 2001 From: Alex Bublichenko Date: Wed, 2 Sep 2020 22:48:21 -0400 Subject: [PATCH 1/3] Emit metric with dbnode health status Problem: In a large M3DB cluster, when a database node becomes non-functional (service fails to start or host is down), it may go unnoticed. If it goes unnoticed long enough, and one more node that owns the same shard(s) becomes non-functional, a quorum may be lost and block writes to the database. Solution: The connection pool in `src/dbnode/client/connection_pool.go` already does periodic health check from the client's node/process. Let that code emit a gauge with the result of health check. The metrics scope passed to `newConnectionPool` is already tagged with `hostID`. Since the health check is done from the client, it implies that node is in M3DB placement and expected to be functional. Thus, alerting can be set up based on this metric alone. Considered Alternatives: 1. Emit a heartbeat metric from `src/dbnode/network/server/tchannelthrift/node/service.go`. Alerting on lost heartbeat requires knoweldge about whether the node is in placement, i.e. expected to be functional. 2. Let independent monitoring/canary system actively probe healthcheck endpoint of every database node, determine whether the node is expected to be functional by comparing to M3DB placement data, and alert operator. Such solution would be ideal but has much higher cost. --- src/dbnode/client/connection_pool.go | 7 ++++++- src/dbnode/client/types.go | 7 +++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/src/dbnode/client/connection_pool.go b/src/dbnode/client/connection_pool.go index bcfc04973c..cd795eaec8 100644 --- a/src/dbnode/client/connection_pool.go +++ b/src/dbnode/client/connection_pool.go @@ -32,8 +32,9 @@ import ( "github.com/m3db/m3/src/dbnode/generated/thrift/rpc" "github.com/m3db/m3/src/dbnode/topology" xclose "github.com/m3db/m3/src/x/close" - "github.com/m3db/stackmurmur3/v2" + murmur3 "github.com/m3db/stackmurmur3/v2" + "github.com/uber-go/tally" "github.com/uber/tchannel-go/thrift" "go.uber.org/zap" ) @@ -63,6 +64,7 @@ type connPool struct { sleepHealth sleepFn sleepHealthRetry sleepFn status status + healthStatus tally.Gauge } type conn struct { @@ -94,6 +96,7 @@ func newConnectionPool(host topology.Host, opts Options) connectionPool { sleepConnect: time.Sleep, sleepHealth: time.Sleep, sleepHealthRetry: time.Sleep, + healthStatus: opts.InstrumentOptions().MetricsScope().Gauge("health-status"), } return p @@ -186,11 +189,13 @@ func (p *connPool) connectEvery(interval time.Duration, stutter time.Duration) { // Health check the connection if err := p.healthCheckNewConn(client, p.opts); err != nil { + p.healthStatus.Update(float64(healthStatusCheckFailed)) log.Debug("could not connect, failed health check", zap.String("host", address), zap.Error(err)) channel.Close() return } + p.healthStatus.Update(float64(healthStatusOK)) p.Lock() if p.status == statusOpen { p.pool = append(p.pool, conn{channel, client}) diff --git a/src/dbnode/client/types.go b/src/dbnode/client/types.go index f30a196671..6783078918 100644 --- a/src/dbnode/client/types.go +++ b/src/dbnode/client/types.go @@ -702,6 +702,13 @@ const ( statusClosed ) +type healthStatus int + +const ( + healthStatusCheckFailed healthStatus = iota + healthStatusOK +) + type op interface { // Size returns the effective size of inner operations. Size() int From 84bd2b2e10b43c6a29851be0c3a8632c792133b7 Mon Sep 17 00:00:00 2001 From: Alex Bublichenko Date: Thu, 3 Sep 2020 15:16:34 -0400 Subject: [PATCH 2/3] Make this behavior optional, disabled by default --- src/dbnode/client/connection_pool.go | 10 ++++++++-- src/dbnode/client/options.go | 15 +++++++++++++++ src/dbnode/client/types.go | 6 ++++++ 3 files changed, 29 insertions(+), 2 deletions(-) diff --git a/src/dbnode/client/connection_pool.go b/src/dbnode/client/connection_pool.go index cd795eaec8..937408673d 100644 --- a/src/dbnode/client/connection_pool.go +++ b/src/dbnode/client/connection_pool.go @@ -189,13 +189,13 @@ func (p *connPool) connectEvery(interval time.Duration, stutter time.Duration) { // Health check the connection if err := p.healthCheckNewConn(client, p.opts); err != nil { - p.healthStatus.Update(float64(healthStatusCheckFailed)) + p.maybeEmitHealthStatus(healthStatusCheckFailed) log.Debug("could not connect, failed health check", zap.String("host", address), zap.Error(err)) channel.Close() return } - p.healthStatus.Update(float64(healthStatusOK)) + p.maybeEmitHealthStatus(healthStatusOK) p.Lock() if p.status == statusOpen { p.pool = append(p.pool, conn{channel, client}) @@ -211,6 +211,12 @@ func (p *connPool) connectEvery(interval time.Duration, stutter time.Duration) { } } +func (p *connPool) maybeEmitHealthStatus(hs healthStatus) { + if p.opts.HostQueueEmitsHealthStatus() { + p.healthStatus.Update(float64(hs)) + } +} + func (p *connPool) healthCheckEvery(interval time.Duration, stutter time.Duration) { log := p.opts.InstrumentOptions().Logger() nowFn := p.opts.ClockOptions().NowFn() diff --git a/src/dbnode/client/options.go b/src/dbnode/client/options.go index 5ddfe73a35..b657f0f172 100644 --- a/src/dbnode/client/options.go +++ b/src/dbnode/client/options.go @@ -116,6 +116,9 @@ const ( // defaultHostQueueOpsArrayPoolSize is the default host queue ops array pool size defaultHostQueueOpsArrayPoolSize = 8 + // defaultHostQueueEmitsHealthStatus is false + defaultHostQueueEmitsHealthStatus = false + // defaultBackgroundConnectInterval is the default background connect interval defaultBackgroundConnectInterval = 4 * time.Second @@ -257,6 +260,7 @@ type options struct { hostQueueOpsFlushSize int hostQueueOpsFlushInterval time.Duration hostQueueOpsArrayPoolSize int + hostQueueEmitsHealthStatus bool seriesIteratorPoolSize int seriesIteratorArrayPoolBuckets []pool.Bucket checkedBytesWrapperPoolSize int @@ -376,6 +380,7 @@ func newOptions() *options { hostQueueOpsFlushSize: defaultHostQueueOpsFlushSize, hostQueueOpsFlushInterval: defaultHostQueueOpsFlushInterval, hostQueueOpsArrayPoolSize: defaultHostQueueOpsArrayPoolSize, + hostQueueEmitsHealthStatus: defaultHostQueueEmitsHealthStatus, seriesIteratorPoolSize: defaultSeriesIteratorPoolSize, seriesIteratorArrayPoolBuckets: defaultSeriesIteratorArrayPoolBuckets, checkedBytesWrapperPoolSize: defaultCheckedBytesWrapperPoolSize, @@ -869,6 +874,16 @@ func (o *options) HostQueueOpsArrayPoolSize() int { return o.hostQueueOpsArrayPoolSize } +func (o *options) SetHostQueueEmitsHealthStatus(value bool) Options { + opts := *o + opts.hostQueueEmitsHealthStatus = value + return &opts +} + +func (o *options) HostQueueEmitsHealthStatus() bool { + return o.hostQueueEmitsHealthStatus +} + func (o *options) SetSeriesIteratorPoolSize(value int) Options { opts := *o opts.seriesIteratorPoolSize = value diff --git a/src/dbnode/client/types.go b/src/dbnode/client/types.go index 6783078918..5d1b76b6f0 100644 --- a/src/dbnode/client/types.go +++ b/src/dbnode/client/types.go @@ -507,6 +507,12 @@ type Options interface { // HostQueueOpsArrayPoolSize returns the hostQueueOpsArrayPoolSize. HostQueueOpsArrayPoolSize() int + // SetHostQueueEmitsHealthStatus sets the hostQueueEmitHealthStatus. + SetHostQueueEmitsHealthStatus(value bool) Options + + // HostQueueEmitsHealthStatus returns the hostQueueEmitHealthStatus. + HostQueueEmitsHealthStatus() bool + // SetSeriesIteratorPoolSize sets the seriesIteratorPoolSize. SetSeriesIteratorPoolSize(value int) Options From 21b1f148ad7c99e6156f8813713ef851809f6374 Mon Sep 17 00:00:00 2001 From: Alex Bublichenko Date: Thu, 3 Sep 2020 16:53:02 -0400 Subject: [PATCH 3/3] Generate new client_mock.go --- src/dbnode/client/client_mock.go | 56 ++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/src/dbnode/client/client_mock.go b/src/dbnode/client/client_mock.go index 9da52ea71a..1cfc50a951 100644 --- a/src/dbnode/client/client_mock.go +++ b/src/dbnode/client/client_mock.go @@ -2196,6 +2196,34 @@ func (mr *MockOptionsMockRecorder) HostQueueOpsArrayPoolSize() *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "HostQueueOpsArrayPoolSize", reflect.TypeOf((*MockOptions)(nil).HostQueueOpsArrayPoolSize)) } +// SetHostQueueEmitsHealthStatus mocks base method +func (m *MockOptions) SetHostQueueEmitsHealthStatus(value bool) Options { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "SetHostQueueEmitsHealthStatus", value) + ret0, _ := ret[0].(Options) + return ret0 +} + +// SetHostQueueEmitsHealthStatus indicates an expected call of SetHostQueueEmitsHealthStatus +func (mr *MockOptionsMockRecorder) SetHostQueueEmitsHealthStatus(value interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SetHostQueueEmitsHealthStatus", reflect.TypeOf((*MockOptions)(nil).SetHostQueueEmitsHealthStatus), value) +} + +// HostQueueEmitsHealthStatus mocks base method +func (m *MockOptions) HostQueueEmitsHealthStatus() bool { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "HostQueueEmitsHealthStatus") + ret0, _ := ret[0].(bool) + return ret0 +} + +// HostQueueEmitsHealthStatus indicates an expected call of HostQueueEmitsHealthStatus +func (mr *MockOptionsMockRecorder) HostQueueEmitsHealthStatus() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "HostQueueEmitsHealthStatus", reflect.TypeOf((*MockOptions)(nil).HostQueueEmitsHealthStatus)) +} + // SetSeriesIteratorPoolSize mocks base method func (m *MockOptions) SetSeriesIteratorPoolSize(value int) Options { m.ctrl.T.Helper() @@ -3675,6 +3703,34 @@ func (mr *MockAdminOptionsMockRecorder) HostQueueOpsArrayPoolSize() *gomock.Call return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "HostQueueOpsArrayPoolSize", reflect.TypeOf((*MockAdminOptions)(nil).HostQueueOpsArrayPoolSize)) } +// SetHostQueueEmitsHealthStatus mocks base method +func (m *MockAdminOptions) SetHostQueueEmitsHealthStatus(value bool) Options { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "SetHostQueueEmitsHealthStatus", value) + ret0, _ := ret[0].(Options) + return ret0 +} + +// SetHostQueueEmitsHealthStatus indicates an expected call of SetHostQueueEmitsHealthStatus +func (mr *MockAdminOptionsMockRecorder) SetHostQueueEmitsHealthStatus(value interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SetHostQueueEmitsHealthStatus", reflect.TypeOf((*MockAdminOptions)(nil).SetHostQueueEmitsHealthStatus), value) +} + +// HostQueueEmitsHealthStatus mocks base method +func (m *MockAdminOptions) HostQueueEmitsHealthStatus() bool { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "HostQueueEmitsHealthStatus") + ret0, _ := ret[0].(bool) + return ret0 +} + +// HostQueueEmitsHealthStatus indicates an expected call of HostQueueEmitsHealthStatus +func (mr *MockAdminOptionsMockRecorder) HostQueueEmitsHealthStatus() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "HostQueueEmitsHealthStatus", reflect.TypeOf((*MockAdminOptions)(nil).HostQueueEmitsHealthStatus)) +} + // SetSeriesIteratorPoolSize mocks base method func (m *MockAdminOptions) SetSeriesIteratorPoolSize(value int) Options { m.ctrl.T.Helper()