From 5a9e48be300156423ee1b82d0bfee83a3b28083e Mon Sep 17 00:00:00 2001 From: Gyuho Lee Date: Wed, 23 May 2018 13:17:50 -0700 Subject: [PATCH 1/9] etcdserver/api/rafthttp: increase bucket upperbound up-to 3-sec From 0.8 sec to 3.2 sec for more detailed latency analysis Signed-off-by: Gyuho Lee --- etcdserver/api/rafthttp/metrics.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/etcdserver/api/rafthttp/metrics.go b/etcdserver/api/rafthttp/metrics.go index 320bfe72661..066cba9cc8a 100644 --- a/etcdserver/api/rafthttp/metrics.go +++ b/etcdserver/api/rafthttp/metrics.go @@ -58,7 +58,10 @@ var ( Subsystem: "network", Name: "peer_round_trip_time_seconds", Help: "Round-Trip-Time histogram between peers.", - Buckets: prometheus.ExponentialBuckets(0.0001, 2, 14), + + // lowest bucket start of upper bound 0.0001 sec (0.1 ms) with factor 2 + // highest bucket start of 0.0001 sec * 2^15 == 3.2768 sec + Buckets: prometheus.ExponentialBuckets(0.0001, 2, 16), }, []string{"To"}, ) From 640f5e64a93fe49b90e9bd7a75586449a16eaee6 Mon Sep 17 00:00:00 2001 From: Gyuho Lee Date: Wed, 23 May 2018 13:45:43 -0700 Subject: [PATCH 2/9] etcdserver/api/rafthttp: document round-trip metrics, clean up Signed-off-by: Gyuho Lee --- etcdserver/api/rafthttp/metrics.go | 4 ++-- etcdserver/api/rafthttp/probing_status.go | 8 +++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/etcdserver/api/rafthttp/metrics.go b/etcdserver/api/rafthttp/metrics.go index 066cba9cc8a..c63a082f6d5 100644 --- a/etcdserver/api/rafthttp/metrics.go +++ b/etcdserver/api/rafthttp/metrics.go @@ -53,7 +53,7 @@ var ( []string{"From"}, ) - rtts = prometheus.NewHistogramVec(prometheus.HistogramOpts{ + rttSec = prometheus.NewHistogramVec(prometheus.HistogramOpts{ Namespace: "etcd", Subsystem: "network", Name: "peer_round_trip_time_seconds", @@ -72,5 +72,5 @@ func init() { prometheus.MustRegister(receivedBytes) prometheus.MustRegister(sentFailures) prometheus.MustRegister(recvFailures) - prometheus.MustRegister(rtts) + prometheus.MustRegister(rttSec) } diff --git a/etcdserver/api/rafthttp/probing_status.go b/etcdserver/api/rafthttp/probing_status.go index 63a764884e0..a8199dfdfad 100644 --- a/etcdserver/api/rafthttp/probing_status.go +++ b/etcdserver/api/rafthttp/probing_status.go @@ -44,9 +44,10 @@ func addPeerToProber(lg *zap.Logger, p probing.Prober, id string, us []string) { } else { plog.Errorf("failed to add peer %s into prober", id) } - } else { - go monitorProbingStatus(lg, s, id) + return } + + go monitorProbingStatus(lg, s, id) } func monitorProbingStatus(lg *zap.Logger, s probing.Status, id string) { @@ -83,7 +84,8 @@ func monitorProbingStatus(lg *zap.Logger, s probing.Status, id string) { plog.Warningf("the clock difference against peer %s is too high [%v > %v]", id, s.ClockDiff(), time.Second) } } - rtts.WithLabelValues(id).Observe(s.SRTT().Seconds()) + rttSec.WithLabelValues(id).Observe(s.SRTT().Seconds()) + case <-s.StopNotify(): return } From 973fe43b834aa410f53de6304653efe8b7f760ed Mon Sep 17 00:00:00 2001 From: Gyuho Lee Date: Wed, 23 May 2018 13:46:15 -0700 Subject: [PATCH 3/9] etcdserver/api/snap: document histograms, add "etcd_snap_fsync_duration_seconds" Signed-off-by: Gyuho Lee --- etcdserver/api/snap/metrics.go | 37 ++++++++++++++++++++++-------- etcdserver/api/snap/snapshotter.go | 9 +++++--- 2 files changed, 33 insertions(+), 13 deletions(-) diff --git a/etcdserver/api/snap/metrics.go b/etcdserver/api/snap/metrics.go index 433ef09d4ba..98003f54c44 100644 --- a/etcdserver/api/snap/metrics.go +++ b/etcdserver/api/snap/metrics.go @@ -17,25 +17,42 @@ package snap import "github.com/prometheus/client_golang/prometheus" var ( - // TODO: save_fsync latency? - saveDurations = prometheus.NewHistogram(prometheus.HistogramOpts{ + snapMarshallingSec = prometheus.NewHistogram(prometheus.HistogramOpts{ + Namespace: "etcd_debugging", + Subsystem: "snap", + Name: "save_marshalling_duration_seconds", + Help: "The marshalling cost distributions of save called by snapshot.", + + // lowest bucket start of upper bound 0.001 sec (1 ms) with factor 2 + // highest bucket start of 0.001 sec * 2^13 == 8.192 sec + Buckets: prometheus.ExponentialBuckets(0.001, 2, 14), + }) + + snapSaveSec = prometheus.NewHistogram(prometheus.HistogramOpts{ Namespace: "etcd_debugging", Subsystem: "snap", Name: "save_total_duration_seconds", Help: "The total latency distributions of save called by snapshot.", - Buckets: prometheus.ExponentialBuckets(0.001, 2, 14), + + // lowest bucket start of upper bound 0.001 sec (1 ms) with factor 2 + // highest bucket start of 0.001 sec * 2^13 == 8.192 sec + Buckets: prometheus.ExponentialBuckets(0.001, 2, 14), }) - marshallingDurations = prometheus.NewHistogram(prometheus.HistogramOpts{ - Namespace: "etcd_debugging", + snapFsyncSec = prometheus.NewHistogram(prometheus.HistogramOpts{ + Namespace: "etcd", Subsystem: "snap", - Name: "save_marshalling_duration_seconds", - Help: "The marshalling cost distributions of save called by snapshot.", - Buckets: prometheus.ExponentialBuckets(0.001, 2, 14), + Name: "fsync_duration_seconds", + Help: "The latency distributions of fsync called by snap.", + + // lowest bucket start of upper bound 0.001 sec (1 ms) with factor 2 + // highest bucket start of 0.001 sec * 2^13 == 8.192 sec + Buckets: prometheus.ExponentialBuckets(0.001, 2, 14), }) ) func init() { - prometheus.MustRegister(saveDurations) - prometheus.MustRegister(marshallingDurations) + prometheus.MustRegister(snapMarshallingSec) + prometheus.MustRegister(snapSaveSec) + prometheus.MustRegister(snapFsyncSec) } diff --git a/etcdserver/api/snap/snapshotter.go b/etcdserver/api/snap/snapshotter.go index c129919a5b2..75edadbff58 100644 --- a/etcdserver/api/snap/snapshotter.go +++ b/etcdserver/api/snap/snapshotter.go @@ -81,11 +81,14 @@ func (s *Snapshotter) save(snapshot *raftpb.Snapshot) error { if err != nil { return err } - - marshallingDurations.Observe(float64(time.Since(start)) / float64(time.Second)) + snapMarshallingSec.Observe(time.Since(start).Seconds()) spath := filepath.Join(s.dir, fname) + + fsyncStart := time.Now() err = pioutil.WriteAndSyncFile(spath, d, 0666) + snapFsyncSec.Observe(time.Since(fsyncStart).Seconds()) + if err != nil { if s.lg != nil { s.lg.Warn("failed to write a snap file", zap.String("path", spath), zap.Error(err)) @@ -101,7 +104,7 @@ func (s *Snapshotter) save(snapshot *raftpb.Snapshot) error { return err } - saveDurations.Observe(float64(time.Since(start)) / float64(time.Second)) + snapSaveSec.Observe(time.Since(start).Seconds()) return nil } From afe78fbe69c22ad140371440f0021af58dabbaed Mon Sep 17 00:00:00 2001 From: Gyuho Lee Date: Wed, 23 May 2018 13:46:42 -0700 Subject: [PATCH 4/9] etcdserver/api/v2http: document histogram Signed-off-by: Gyuho Lee --- etcdserver/api/v2http/metrics.go | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/etcdserver/api/v2http/metrics.go b/etcdserver/api/v2http/metrics.go index 39d2d043ec3..88e32993456 100644 --- a/etcdserver/api/v2http/metrics.go +++ b/etcdserver/api/v2http/metrics.go @@ -23,6 +23,7 @@ import ( "github.com/coreos/etcd/etcdserver/api/v2error" "github.com/coreos/etcd/etcdserver/api/v2http/httptypes" "github.com/coreos/etcd/etcdserver/etcdserverpb" + "github.com/prometheus/client_golang/prometheus" ) @@ -43,20 +44,23 @@ var ( Help: "Counter of handle failures of requests (non-watches), by method (GET/PUT etc.) and code (400, 500 etc.).", }, []string{"method", "code"}) - successfulEventsHandlingTime = prometheus.NewHistogramVec( + successfulEventsHandlingSec = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Namespace: "etcd", Subsystem: "http", Name: "successful_duration_seconds", Help: "Bucketed histogram of processing time (s) of successfully handled requests (non-watches), by method (GET/PUT etc.).", - Buckets: prometheus.ExponentialBuckets(0.0005, 2, 13), + + // lowest bucket start of upper bound 0.0005 sec (0.5 ms) with factor 2 + // highest bucket start of 0.0005 sec * 2^12 == 2.048 sec + Buckets: prometheus.ExponentialBuckets(0.0005, 2, 13), }, []string{"method"}) ) func init() { prometheus.MustRegister(incomingEvents) prometheus.MustRegister(failedEvents) - prometheus.MustRegister(successfulEventsHandlingTime) + prometheus.MustRegister(successfulEventsHandlingSec) } func reportRequestReceived(request etcdserverpb.Request) { @@ -65,7 +69,7 @@ func reportRequestReceived(request etcdserverpb.Request) { func reportRequestCompleted(request etcdserverpb.Request, startTime time.Time) { method := methodFromRequest(request) - successfulEventsHandlingTime.WithLabelValues(method).Observe(time.Since(startTime).Seconds()) + successfulEventsHandlingSec.WithLabelValues(method).Observe(time.Since(startTime).Seconds()) } func reportRequestFailed(request etcdserverpb.Request, err error) { From e6a113cdcddc02bfcce248aeb4107edaef63b6f0 Mon Sep 17 00:00:00 2001 From: Gyuho Lee Date: Wed, 23 May 2018 13:47:16 -0700 Subject: [PATCH 5/9] mvcc/backend: clean up histogram variables Signed-off-by: Gyuho Lee --- mvcc/backend/backend.go | 5 +++-- mvcc/backend/batch_tx.go | 2 +- mvcc/backend/metrics.go | 12 ++++++------ 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/mvcc/backend/backend.go b/mvcc/backend/backend.go index d4f25da2bb7..4ba8ec5689b 100644 --- a/mvcc/backend/backend.go +++ b/mvcc/backend/backend.go @@ -230,8 +230,9 @@ func (b *backend) Snapshot() Snapshot { } else { plog.Warningf("snapshotting is taking more than %v seconds to finish transferring %v MB [started at %v]", time.Since(start).Seconds(), float64(dbBytes)/float64(1024*1014), start) } + case <-stopc: - snapshotDurations.Observe(time.Since(start).Seconds()) + snapshotTransferSec.Observe(time.Since(start).Seconds()) return } } @@ -416,7 +417,7 @@ func (b *backend) defrag() error { atomic.StoreInt64(&b.sizeInUse, size-(int64(db.Stats().FreePageN)*int64(db.Info().PageSize))) took := time.Since(now) - defragDurations.Observe(took.Seconds()) + defragSec.Observe(took.Seconds()) size2, sizeInUse2 := b.Size(), b.SizeInUse() if b.lg != nil { diff --git a/mvcc/backend/batch_tx.go b/mvcc/backend/batch_tx.go index dee9938dc7f..dad95b045a3 100644 --- a/mvcc/backend/batch_tx.go +++ b/mvcc/backend/batch_tx.go @@ -220,7 +220,7 @@ func (t *batchTx) commit(stop bool) { err := t.tx.Commit() // gofail: var afterCommit struct{} - commitDurations.Observe(time.Since(start).Seconds()) + commitSec.Observe(time.Since(start).Seconds()) atomic.AddInt64(&t.backend.commits, 1) t.pending = 0 diff --git a/mvcc/backend/metrics.go b/mvcc/backend/metrics.go index 34157080499..dda36a57a38 100644 --- a/mvcc/backend/metrics.go +++ b/mvcc/backend/metrics.go @@ -17,7 +17,7 @@ package backend import "github.com/prometheus/client_golang/prometheus" var ( - commitDurations = prometheus.NewHistogram(prometheus.HistogramOpts{ + commitSec = prometheus.NewHistogram(prometheus.HistogramOpts{ Namespace: "etcd", Subsystem: "disk", Name: "backend_commit_duration_seconds", @@ -28,7 +28,7 @@ var ( Buckets: prometheus.ExponentialBuckets(0.001, 2, 14), }) - defragDurations = prometheus.NewHistogram(prometheus.HistogramOpts{ + defragSec = prometheus.NewHistogram(prometheus.HistogramOpts{ Namespace: "etcd", Subsystem: "disk", Name: "backend_defrag_duration_seconds", @@ -40,7 +40,7 @@ var ( Buckets: prometheus.ExponentialBuckets(.1, 2, 13), }) - snapshotDurations = prometheus.NewHistogram(prometheus.HistogramOpts{ + snapshotTransferSec = prometheus.NewHistogram(prometheus.HistogramOpts{ Namespace: "etcd", Subsystem: "disk", Name: "backend_snapshot_duration_seconds", @@ -53,7 +53,7 @@ var ( ) func init() { - prometheus.MustRegister(commitDurations) - prometheus.MustRegister(defragDurations) - prometheus.MustRegister(snapshotDurations) + prometheus.MustRegister(commitSec) + prometheus.MustRegister(defragSec) + prometheus.MustRegister(snapshotTransferSec) } From 1d916982687a82319a8d48179f5476fb14eb2b6f Mon Sep 17 00:00:00 2001 From: Gyuho Lee Date: Wed, 23 May 2018 13:47:52 -0700 Subject: [PATCH 6/9] mvcc: document, clean up histogram variables Signed-off-by: Gyuho Lee --- mvcc/kvstore.go | 6 +++--- mvcc/kvstore_compaction.go | 4 ++-- mvcc/metrics.go | 36 +++++++++++++++++++++--------------- 3 files changed, 26 insertions(+), 20 deletions(-) diff --git a/mvcc/kvstore.go b/mvcc/kvstore.go index 97e04ad78d8..a445f6a4a1a 100644 --- a/mvcc/kvstore.go +++ b/mvcc/kvstore.go @@ -168,7 +168,7 @@ func (s *store) Hash() (hash uint32, revision int64, err error) { s.b.ForceCommit() h, err := s.b.Hash(DefaultIgnores) - hashDurations.Observe(time.Since(start).Seconds()) + hashSec.Observe(time.Since(start).Seconds()) return h, s.currentRev, err } @@ -221,7 +221,7 @@ func (s *store) HashByRev(rev int64) (hash uint32, currentRev int64, compactRev }) hash = h.Sum32() - hashRevDurations.Observe(time.Since(start).Seconds()) + hashRevSec.Observe(time.Since(start).Seconds()) return hash, currentRev, compactRev, err } @@ -274,7 +274,7 @@ func (s *store) Compact(rev int64) (<-chan struct{}, error) { s.fifoSched.Schedule(j) - indexCompactionPauseDurations.Observe(float64(time.Since(start) / time.Millisecond)) + indexCompactionPauseMs.Observe(float64(time.Since(start) / time.Millisecond)) return ch, nil } diff --git a/mvcc/kvstore_compaction.go b/mvcc/kvstore_compaction.go index e7cfec1a5a0..d92dacd2396 100644 --- a/mvcc/kvstore_compaction.go +++ b/mvcc/kvstore_compaction.go @@ -23,7 +23,7 @@ import ( func (s *store) scheduleCompaction(compactMainRev int64, keep map[revision]struct{}) bool { totalStart := time.Now() - defer dbCompactionTotalDurations.Observe(float64(time.Since(totalStart) / time.Millisecond)) + defer dbCompactionTotalMs.Observe(float64(time.Since(totalStart) / time.Millisecond)) keyCompactions := 0 defer func() { dbCompactionKeysCounter.Add(float64(keyCompactions)) }() @@ -68,7 +68,7 @@ func (s *store) scheduleCompaction(compactMainRev int64, keep map[revision]struc // update last revToBytes(revision{main: rev.main, sub: rev.sub + 1}, last) tx.Unlock() - dbCompactionPauseDurations.Observe(float64(time.Since(start) / time.Millisecond)) + dbCompactionPauseMs.Observe(float64(time.Since(start) / time.Millisecond)) select { case <-time.After(100 * time.Millisecond): diff --git a/mvcc/metrics.go b/mvcc/metrics.go index 419a12e9cc9..f80c70c8563 100644 --- a/mvcc/metrics.go +++ b/mvcc/metrics.go @@ -101,33 +101,39 @@ var ( Help: "Total number of pending events to be sent.", }) - indexCompactionPauseDurations = prometheus.NewHistogram( + indexCompactionPauseMs = prometheus.NewHistogram( prometheus.HistogramOpts{ Namespace: "etcd_debugging", Subsystem: "mvcc", Name: "index_compaction_pause_duration_milliseconds", Help: "Bucketed histogram of index compaction pause duration.", - // 0.5ms -> 1second - Buckets: prometheus.ExponentialBuckets(0.5, 2, 12), + + // lowest bucket start of upper bound 0.5 ms with factor 2 + // highest bucket start of 0.5 ms * 2^13 == 4.096 sec + Buckets: prometheus.ExponentialBuckets(0.5, 2, 14), }) - dbCompactionPauseDurations = prometheus.NewHistogram( + dbCompactionPauseMs = prometheus.NewHistogram( prometheus.HistogramOpts{ Namespace: "etcd_debugging", Subsystem: "mvcc", Name: "db_compaction_pause_duration_milliseconds", Help: "Bucketed histogram of db compaction pause duration.", - // 1ms -> 4second + + // lowest bucket start of upper bound 1 ms with factor 2 + // highest bucket start of 1 ms * 2^12 == 4.096 sec Buckets: prometheus.ExponentialBuckets(1, 2, 13), }) - dbCompactionTotalDurations = prometheus.NewHistogram( + dbCompactionTotalMs = prometheus.NewHistogram( prometheus.HistogramOpts{ Namespace: "etcd_debugging", Subsystem: "mvcc", Name: "db_compaction_total_duration_milliseconds", Help: "Bucketed histogram of db compaction total duration.", - // 100ms -> 800second + + // lowest bucket start of upper bound 100 ms with factor 2 + // highest bucket start of 100 ms * 2^13 == 8.192 sec Buckets: prometheus.ExponentialBuckets(100, 2, 14), }) @@ -169,9 +175,9 @@ var ( ) // overridden by mvcc initialization reportDbTotalSizeInUseInBytesMu sync.RWMutex - reportDbTotalSizeInUseInBytes func() float64 = func() float64 { return 0 } + reportDbTotalSizeInUseInBytes = func() float64 { return 0 } - hashDurations = prometheus.NewHistogram(prometheus.HistogramOpts{ + hashSec = prometheus.NewHistogram(prometheus.HistogramOpts{ Namespace: "etcd", Subsystem: "mvcc", Name: "hash_duration_seconds", @@ -183,7 +189,7 @@ var ( Buckets: prometheus.ExponentialBuckets(.01, 2, 15), }) - hashRevDurations = prometheus.NewHistogram(prometheus.HistogramOpts{ + hashRevSec = prometheus.NewHistogram(prometheus.HistogramOpts{ Namespace: "etcd", Subsystem: "mvcc", Name: "hash_rev_duration_seconds", @@ -207,14 +213,14 @@ func init() { prometheus.MustRegister(slowWatcherGauge) prometheus.MustRegister(totalEventsCounter) prometheus.MustRegister(pendingEventsGauge) - prometheus.MustRegister(indexCompactionPauseDurations) - prometheus.MustRegister(dbCompactionPauseDurations) - prometheus.MustRegister(dbCompactionTotalDurations) + prometheus.MustRegister(indexCompactionPauseMs) + prometheus.MustRegister(dbCompactionPauseMs) + prometheus.MustRegister(dbCompactionTotalMs) prometheus.MustRegister(dbCompactionKeysCounter) prometheus.MustRegister(dbTotalSize) prometheus.MustRegister(dbTotalSizeInUse) - prometheus.MustRegister(hashDurations) - prometheus.MustRegister(hashRevDurations) + prometheus.MustRegister(hashSec) + prometheus.MustRegister(hashRevSec) } // ReportEventReceived reports that an event is received. From b0b966c43c3a54c0eece63d315584f0fe876791b Mon Sep 17 00:00:00 2001 From: Gyuho Lee Date: Wed, 23 May 2018 13:48:29 -0700 Subject: [PATCH 7/9] wal: document, clean up fsync histogram Signed-off-by: Gyuho Lee --- wal/metrics.go | 11 +++++++---- wal/wal.go | 2 +- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/wal/metrics.go b/wal/metrics.go index 9e089d380f9..22cb8003c98 100644 --- a/wal/metrics.go +++ b/wal/metrics.go @@ -17,15 +17,18 @@ package wal import "github.com/prometheus/client_golang/prometheus" var ( - syncDurations = prometheus.NewHistogram(prometheus.HistogramOpts{ + walFsyncSec = prometheus.NewHistogram(prometheus.HistogramOpts{ Namespace: "etcd", Subsystem: "disk", Name: "wal_fsync_duration_seconds", - Help: "The latency distributions of fsync called by wal.", - Buckets: prometheus.ExponentialBuckets(0.001, 2, 14), + Help: "The latency distributions of fsync called by WAL.", + + // lowest bucket start of upper bound 0.001 sec (1 ms) with factor 2 + // highest bucket start of 0.001 sec * 2^13 == 8.192 sec + Buckets: prometheus.ExponentialBuckets(0.001, 2, 14), }) ) func init() { - prometheus.MustRegister(syncDurations) + prometheus.MustRegister(walFsyncSec) } diff --git a/wal/wal.go b/wal/wal.go index 49d4505845e..7f653fbc28a 100644 --- a/wal/wal.go +++ b/wal/wal.go @@ -590,7 +590,7 @@ func (w *WAL) sync() error { plog.Warningf("sync duration of %v, expected less than %v", took, warnSyncDuration) } } - syncDurations.Observe(took.Seconds()) + walFsyncSec.Observe(took.Seconds()) return err } From 1a102fb3f508f49a845e468510849d4ea8c6ab25 Mon Sep 17 00:00:00 2001 From: Gyuho Lee Date: Wed, 23 May 2018 13:48:51 -0700 Subject: [PATCH 8/9] proxy/httpproxy: document histogram Signed-off-by: Gyuho Lee --- proxy/httpproxy/metrics.go | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/proxy/httpproxy/metrics.go b/proxy/httpproxy/metrics.go index f71258cba57..fcbedc28a88 100644 --- a/proxy/httpproxy/metrics.go +++ b/proxy/httpproxy/metrics.go @@ -47,13 +47,15 @@ var ( Help: "Counter of requests dropped on the proxy.", }, []string{"method", "proxying_error"}) - requestsHandlingTime = prometheus.NewHistogramVec( + requestsHandlingSec = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Namespace: "etcd", Subsystem: "proxy", Name: "handling_duration_seconds", - Help: "Bucketed histogram of handling time of successful events (non-watches), by method " + - "(GET/PUT etc.).", + Help: "Bucketed histogram of handling time of successful events (non-watches), by method (GET/PUT etc.).", + + // lowest bucket start of upper bound 0.0005 sec (0.5 ms) with factor 2 + // highest bucket start of 0.0005 sec * 2^12 == 2.048 sec Buckets: prometheus.ExponentialBuckets(0.0005, 2, 13), }, []string{"method"}) ) @@ -70,7 +72,7 @@ func init() { prometheus.MustRegister(requestsIncoming) prometheus.MustRegister(requestsHandled) prometheus.MustRegister(requestsDropped) - prometheus.MustRegister(requestsHandlingTime) + prometheus.MustRegister(requestsHandlingSec) } func reportIncomingRequest(request *http.Request) { @@ -80,7 +82,7 @@ func reportIncomingRequest(request *http.Request) { func reportRequestHandled(request *http.Request, response *http.Response, startTime time.Time) { method := request.Method requestsHandled.WithLabelValues(method, strconv.Itoa(response.StatusCode)).Inc() - requestsHandlingTime.WithLabelValues(method).Observe(time.Since(startTime).Seconds()) + requestsHandlingSec.WithLabelValues(method).Observe(time.Since(startTime).Seconds()) } func reportRequestDropped(request *http.Request, err forwardingError) { From 3821f3364dd5783097701c8384147a582cfb0806 Mon Sep 17 00:00:00 2001 From: Gyuho Lee Date: Wed, 23 May 2018 14:13:26 -0700 Subject: [PATCH 9/9] etcdserver/api/rafthttp: add "etcd_network_active_peers/disconnected_peers_total" Signed-off-by: Gyuho Lee --- etcdserver/api/rafthttp/metrics.go | 20 ++++++++++++++++++++ etcdserver/api/rafthttp/peer.go | 2 +- etcdserver/api/rafthttp/peer_status.go | 11 +++++++++-- etcdserver/api/rafthttp/pipeline_test.go | 2 +- etcdserver/api/rafthttp/remote.go | 2 +- etcdserver/api/rafthttp/snapshot_test.go | 2 +- etcdserver/api/rafthttp/stream_test.go | 10 +++++----- 7 files changed, 38 insertions(+), 11 deletions(-) diff --git a/etcdserver/api/rafthttp/metrics.go b/etcdserver/api/rafthttp/metrics.go index c63a082f6d5..4df45d35e90 100644 --- a/etcdserver/api/rafthttp/metrics.go +++ b/etcdserver/api/rafthttp/metrics.go @@ -17,6 +17,24 @@ package rafthttp import "github.com/prometheus/client_golang/prometheus" var ( + activePeers = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "etcd", + Subsystem: "network", + Name: "active_peers", + Help: "The current number of active peer connections.", + }, + []string{"Local", "Remote"}, + ) + + disconnectedPeers = prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: "etcd", + Subsystem: "network", + Name: "disconnected_peers_total", + Help: "The total number of disconnected peers.", + }, + []string{"Local", "Remote"}, + ) + sentBytes = prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: "etcd", Subsystem: "network", @@ -68,6 +86,8 @@ var ( ) func init() { + prometheus.MustRegister(activePeers) + prometheus.MustRegister(disconnectedPeers) prometheus.MustRegister(sentBytes) prometheus.MustRegister(receivedBytes) prometheus.MustRegister(sentFailures) diff --git a/etcdserver/api/rafthttp/peer.go b/etcdserver/api/rafthttp/peer.go index 404461b4edc..d3342fd1a8e 100644 --- a/etcdserver/api/rafthttp/peer.go +++ b/etcdserver/api/rafthttp/peer.go @@ -137,7 +137,7 @@ func startPeer(t *Transport, urls types.URLs, peerID types.ID, fs *stats.Followe } }() - status := newPeerStatus(t.Logger, peerID) + status := newPeerStatus(t.Logger, t.ID, peerID) picker := newURLPicker(urls) errorc := t.ErrorC r := t.Raft diff --git a/etcdserver/api/rafthttp/peer_status.go b/etcdserver/api/rafthttp/peer_status.go index 68d9ac01b5b..80a0076d972 100644 --- a/etcdserver/api/rafthttp/peer_status.go +++ b/etcdserver/api/rafthttp/peer_status.go @@ -32,14 +32,15 @@ type failureType struct { type peerStatus struct { lg *zap.Logger + local types.ID id types.ID mu sync.Mutex // protect variables below active bool since time.Time } -func newPeerStatus(lg *zap.Logger, id types.ID) *peerStatus { - return &peerStatus{lg: lg, id: id} +func newPeerStatus(lg *zap.Logger, local, id types.ID) *peerStatus { + return &peerStatus{lg: lg, local: local, id: id} } func (s *peerStatus) activate() { @@ -53,6 +54,8 @@ func (s *peerStatus) activate() { } s.active = true s.since = time.Now() + + activePeers.WithLabelValues(s.local.String(), s.id.String()).Inc() } } @@ -69,8 +72,12 @@ func (s *peerStatus) deactivate(failure failureType, reason string) { } s.active = false s.since = time.Time{} + + activePeers.WithLabelValues(s.local.String(), s.id.String()).Dec() + disconnectedPeers.WithLabelValues(s.local.String(), s.id.String()).Inc() return } + if s.lg != nil { s.lg.Debug("peer deactivated again", zap.String("peer-id", s.id.String()), zap.Error(errors.New(msg))) } diff --git a/etcdserver/api/rafthttp/pipeline_test.go b/etcdserver/api/rafthttp/pipeline_test.go index 201427e216b..4370887f62a 100644 --- a/etcdserver/api/rafthttp/pipeline_test.go +++ b/etcdserver/api/rafthttp/pipeline_test.go @@ -303,7 +303,7 @@ func startTestPipeline(tr *Transport, picker *urlPicker) *pipeline { peerID: types.ID(1), tr: tr, picker: picker, - status: newPeerStatus(zap.NewExample(), types.ID(1)), + status: newPeerStatus(zap.NewExample(), tr.ID, types.ID(1)), raft: &fakeRaft{}, followerStats: &stats.FollowerStats{}, errorc: make(chan error, 1), diff --git a/etcdserver/api/rafthttp/remote.go b/etcdserver/api/rafthttp/remote.go index de3e1cc4a20..02acd282634 100644 --- a/etcdserver/api/rafthttp/remote.go +++ b/etcdserver/api/rafthttp/remote.go @@ -31,7 +31,7 @@ type remote struct { func startRemote(tr *Transport, urls types.URLs, id types.ID) *remote { picker := newURLPicker(urls) - status := newPeerStatus(tr.Logger, id) + status := newPeerStatus(tr.Logger, tr.ID, id) pipeline := &pipeline{ peerID: id, tr: tr, diff --git a/etcdserver/api/rafthttp/snapshot_test.go b/etcdserver/api/rafthttp/snapshot_test.go index ba3ce55eb35..902d17e98d3 100644 --- a/etcdserver/api/rafthttp/snapshot_test.go +++ b/etcdserver/api/rafthttp/snapshot_test.go @@ -109,7 +109,7 @@ func testSnapshotSend(t *testing.T, sm *snap.Message) (bool, []os.FileInfo) { defer srv.Close() picker := mustNewURLPicker(t, []string{srv.URL}) - snapsend := newSnapshotSender(tr, picker, types.ID(1), newPeerStatus(zap.NewExample(), types.ID(1))) + snapsend := newSnapshotSender(tr, picker, types.ID(1), newPeerStatus(zap.NewExample(), types.ID(0), types.ID(1))) defer snapsend.stop() snapsend.send(*sm) diff --git a/etcdserver/api/rafthttp/stream_test.go b/etcdserver/api/rafthttp/stream_test.go index 535dcfa1168..90986b8fd13 100644 --- a/etcdserver/api/rafthttp/stream_test.go +++ b/etcdserver/api/rafthttp/stream_test.go @@ -41,7 +41,7 @@ import ( // to streamWriter. After that, streamWriter can use it to send messages // continuously, and closes it when stopped. func TestStreamWriterAttachOutgoingConn(t *testing.T) { - sw := startStreamWriter(zap.NewExample(), types.ID(0), types.ID(1), newPeerStatus(zap.NewExample(), types.ID(1)), &stats.FollowerStats{}, &fakeRaft{}) + sw := startStreamWriter(zap.NewExample(), types.ID(0), types.ID(1), newPeerStatus(zap.NewExample(), types.ID(0), types.ID(1)), &stats.FollowerStats{}, &fakeRaft{}) // the expected initial state of streamWriter is not working if _, ok := sw.writec(); ok { t.Errorf("initial working status = %v, want false", ok) @@ -93,7 +93,7 @@ func TestStreamWriterAttachOutgoingConn(t *testing.T) { // TestStreamWriterAttachBadOutgoingConn tests that streamWriter with bad // outgoingConn will close the outgoingConn and fall back to non-working status. func TestStreamWriterAttachBadOutgoingConn(t *testing.T) { - sw := startStreamWriter(zap.NewExample(), types.ID(0), types.ID(1), newPeerStatus(zap.NewExample(), types.ID(1)), &stats.FollowerStats{}, &fakeRaft{}) + sw := startStreamWriter(zap.NewExample(), types.ID(0), types.ID(1), newPeerStatus(zap.NewExample(), types.ID(0), types.ID(1)), &stats.FollowerStats{}, &fakeRaft{}) defer sw.stop() wfc := newFakeWriteFlushCloser(errors.New("blah")) sw.attach(&outgoingConn{t: streamTypeMessage, Writer: wfc, Flusher: wfc, Closer: wfc}) @@ -197,7 +197,7 @@ func TestStreamReaderStopOnDial(t *testing.T) { picker: mustNewURLPicker(t, []string{"http://localhost:2380"}), errorc: make(chan error, 1), typ: streamTypeMessage, - status: newPeerStatus(zap.NewExample(), types.ID(2)), + status: newPeerStatus(zap.NewExample(), types.ID(1), types.ID(2)), rl: rate.NewLimiter(rate.Every(100*time.Millisecond), 1), } tr.onResp = func() { @@ -304,7 +304,7 @@ func TestStream(t *testing.T) { srv := httptest.NewServer(h) defer srv.Close() - sw := startStreamWriter(zap.NewExample(), types.ID(0), types.ID(1), newPeerStatus(zap.NewExample(), types.ID(1)), &stats.FollowerStats{}, &fakeRaft{}) + sw := startStreamWriter(zap.NewExample(), types.ID(0), types.ID(1), newPeerStatus(zap.NewExample(), types.ID(0), types.ID(1)), &stats.FollowerStats{}, &fakeRaft{}) defer sw.stop() h.sw = sw @@ -316,7 +316,7 @@ func TestStream(t *testing.T) { typ: tt.t, tr: tr, picker: picker, - status: newPeerStatus(zap.NewExample(), types.ID(2)), + status: newPeerStatus(zap.NewExample(), types.ID(0), types.ID(2)), recvc: recvc, propc: propc, rl: rate.NewLimiter(rate.Every(100*time.Millisecond), 1),