From c94dece12f181808fb472a9181d8f8ef2008f978 Mon Sep 17 00:00:00 2001 From: Shreyas Rao Date: Fri, 30 Aug 2019 10:48:44 +0530 Subject: [PATCH] Expose snapshot_required metric Signed-off-by: Shreyas Rao --- README.md | 5 +++-- cmd/server.go | 5 +++++ doc/usage/metrics.md | 9 +++++---- pkg/metrics/metrics.go | 25 ++++++++++++++++++++++++- pkg/snapshot/restorer/restorer.go | 2 +- pkg/snapshot/snapshotter/snapshotter.go | 19 +++++++++++++++++++ 6 files changed, 57 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index b7cf51f4d..6f73ea39c 100644 --- a/README.md +++ b/README.md @@ -14,8 +14,10 @@ Etcd-backup-restore is collection of components to backup and restore the [etcd] * [Monitoring](doc/usage/metrics.md) ### Design and Proposals + * [Core design](doc/proposals/design.md) -* [Etcd data validation ](doc/proposals/validation.md) +* [Etcd data validation](doc/proposals/validation.md) +* [Data restoration](doc/proposals/restoration.md) * [High watch events ingress rate issue](doc/proposals/high_watch_event_ingress_rate.md) ### Development @@ -24,5 +26,4 @@ Etcd-backup-restore is collection of components to backup and restore the [etcd] * [Testing and Dependency Management](doc/development/testing_and_dependencies.md) * [Adding support for a new cloud provider](doc/development/new_cp_support.md) - [etcd]: https://github.com/coreos/etcd diff --git a/cmd/server.go b/cmd/server.go index 339e26105..de395f9fb 100644 --- a/cmd/server.go +++ b/cmd/server.go @@ -26,10 +26,12 @@ import ( "github.com/gardener/etcd-backup-restore/pkg/errors" "github.com/gardener/etcd-backup-restore/pkg/etcdutil" "github.com/gardener/etcd-backup-restore/pkg/initializer" + "github.com/gardener/etcd-backup-restore/pkg/metrics" "github.com/gardener/etcd-backup-restore/pkg/server" "github.com/gardener/etcd-backup-restore/pkg/snapshot/restorer" "github.com/gardener/etcd-backup-restore/pkg/snapshot/snapshotter" "github.com/gardener/etcd-backup-restore/pkg/snapstore" + 
"github.com/prometheus/client_golang/prometheus" "github.com/spf13/cobra" ) @@ -245,6 +247,9 @@ func runEtcdProbeLoopWithSnapshotter(tlsConfig *etcdutil.TLSConfig, handler *ser } } if !initialDeltaSnapshotTaken { + // need to take a full snapshot here + metrics.SnapshotRequired.With(prometheus.Labels{metrics.LabelKind: snapstore.SnapshotKindDelta}).Set(0) + metrics.SnapshotRequired.With(prometheus.Labels{metrics.LabelKind: snapstore.SnapshotKindFull}).Set(1) if err := ssr.TakeFullSnapshotAndResetTimer(); err != nil { logger.Errorf("Failed to take substitute first full snapshot: %v", err) continue diff --git a/doc/usage/metrics.md b/doc/usage/metrics.md index b83c2b3f3..ac33636e3 100644 --- a/doc/usage/metrics.md +++ b/doc/usage/metrics.md @@ -8,11 +8,10 @@ Follow the [Prometheus getting started doc][prometheus-getting-started] to spin The naming of metrics follows the suggested [Prometheus best practices][prometheus-naming]. All etcd-backup-restore related metrics are put under namespace `etcdbr`. -### ETCD metrics +## ETCD metrics The metrics under the `etcd` prefix/namespace are carried forward from etcd library that we use. These metrics do not include details of the `etcd` deployment on which `etcd-backup-restore` utility operates. Instead, it helps in monitoring the `embedded etcd` we spawn during restoration process. - ### Snapshot These metrics describe the status of the snapshotter. In order to detect outages or problems for troubleshooting, these metrics should be closely monitored. The below mentioned metrics are listed as collection of series using prometheus labels `kind` and `succeeded`. `Kind` label indicates the snapshot kind i.e. full snapshot or incremental/delta snapshot in the context. And succeeded indicates whether the metrics is for successful operation or erroneous operation. @@ -23,6 +22,7 @@ These metrics describe the status of the snapshotter. 
In order to detect outages | etcdbr_snapshot_gc_total | Total number of garbage collected snapshots. | Counter | | etcdbr_snapshot_latest_revision | Revision number of latest snapshot taken. | Gauge | | etcdbr_snapshot_latest_timestamp | Timestamp of latest snapshot taken. | Gauge | +| etcdbr_snapshot_required | Indicates whether a new snapshot is required to be taken. | Gauge | Abnormally high snapshot duration (`etcdbr_snapshot_duration_seconds`) indicates disk issues and low network bandwidth. @@ -30,6 +30,8 @@ Abnormally high snapshot duration (`etcdbr_snapshot_duration_seconds`) indicates `etcdbr_snapshot_gc_total` gives the total number of snapshots garbage collected since bootstrap. You can use this in coordination with `etcdbr_snapshot_duration_seconds_count` to get number of snapshots in object store. +`etcdbr_snapshot_required` indicates whether a new snapshot is required to be taken. It acts as a boolean flag: a value of zero means 'false' and any non-zero value means 'true'. :warning: This metric does not work as expected when delta snapshots are disabled (by setting the etcdbrctl flag `delta-snapshot-period-seconds` to 0). + ### Defragmentation The metrics for defragmentation is of type histogram, which gives the number of times defragmentation was triggered. :warning: The defragmentation latency should be as low as possible, since @@ -59,7 +61,6 @@ All these metrics are under subsystem `network`. | etcdbr_network_transmitted_bytes | The total number of bytes received over network. | Counter | | etcdbr_network_received_bytes | The total number of bytes received over network. | Counter | - `etcdbr_network_transmitted_bytes` counts the total number of bytes transmitted. Usually this reflects the data uploaded to object store as part of snapshot uploads. `etcdbr_network_received_bytes` counts the total number of bytes received. Usually this reflects the data received as part of snapshots from actual `etcd`. 
There could be a sudden spike in this at the time of restoration as well. @@ -77,4 +78,4 @@ The Prometheus client library provides a number of metrics under the `go` and `p [prometheus-getting-started]: http://prometheus.io/docs/introduction/getting_started/ [prometheus-naming]: http://prometheus.io/docs/practices/naming/ [v2-http-metrics]: v2/metrics.md#http-requests -[go-grpc-prometheus]: https://github.com/grpc-ecosystem/go-grpc-prometheus \ No newline at end of file +[go-grpc-prometheus]: https://github.com/grpc-ecosystem/go-grpc-prometheus diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index 23a547a96..47569d23c 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -70,7 +70,8 @@ var ( }, []string{LabelKind}, ) - // LatestSnapshotTimestamp is metric expose latest snapshot timestamp. + + // LatestSnapshotTimestamp is metric to expose latest snapshot timestamp. LatestSnapshotTimestamp = prometheus.NewGaugeVec( prometheus.GaugeOpts{ Namespace: namespaceEtcdBR, @@ -81,6 +82,17 @@ var ( []string{LabelKind}, ) + // SnapshotRequired is metric to expose snapshot required flag. + SnapshotRequired = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespaceEtcdBR, + Subsystem: subsystemSnapshot, + Name: "required", + Help: "Indicates whether a snapshot is required to be taken.", + }, + []string{LabelKind}, + ) + // SnapshotDurationSeconds is metric to expose the duration required to save snapshot in seconds. SnapshotDurationSeconds = prometheus.NewHistogramVec( prometheus.HistogramOpts{ @@ -91,6 +103,7 @@ var ( }, []string{LabelKind, LabelSucceeded}, ) + // ValidationDurationSeconds is metric to expose the duration required to validate the etcd data directory in seconds. 
ValidationDurationSeconds = prometheus.NewHistogramVec( prometheus.HistogramOpts{ @@ -232,6 +245,15 @@ func init() { LatestSnapshotTimestamp.With(prometheus.Labels(combination)) } + // SnapshotRequired + snapshotRequiredLabelValues := map[string][]string{ + LabelKind: labels[LabelKind], + } + snapshotRequiredCombinations := generateLabelCombinations(snapshotRequiredLabelValues) + for _, combination := range snapshotRequiredCombinations { + SnapshotRequired.With(prometheus.Labels(combination)) + } + // SnapshotDurationSeconds snapshotDurationSecondsLabelValues := map[string][]string{ LabelKind: labels[LabelKind], @@ -274,6 +296,7 @@ func init() { prometheus.MustRegister(LatestSnapshotRevision) prometheus.MustRegister(LatestSnapshotTimestamp) + prometheus.MustRegister(SnapshotRequired) prometheus.MustRegister(SnapshotDurationSeconds) prometheus.MustRegister(RestorationDurationSeconds) diff --git a/pkg/snapshot/restorer/restorer.go b/pkg/snapshot/restorer/restorer.go index 333096807..58349ed40 100644 --- a/pkg/snapshot/restorer/restorer.go +++ b/pkg/snapshot/restorer/restorer.go @@ -379,7 +379,7 @@ func (r *Restorer) applyDeltaSnapshots(client *clientv3.Client, ro RestoreOption if err == nil { r.logger.Infof("Restoration complete.") } else { - r.logger.Warnf("Restoration failed.") + r.logger.Errorf("Restoration failed.") } return err diff --git a/pkg/snapshot/snapshotter/snapshotter.go b/pkg/snapshot/snapshotter/snapshotter.go index 928e29cc6..b78f260fb 100644 --- a/pkg/snapshot/snapshotter/snapshotter.go +++ b/pkg/snapshot/snapshotter/snapshotter.go @@ -249,6 +249,8 @@ func (ssr *Snapshotter) takeFullSnapshot() error { metrics.LatestSnapshotRevision.With(prometheus.Labels{metrics.LabelKind: ssr.prevSnapshot.Kind}).Set(float64(ssr.prevSnapshot.LastRevision)) metrics.LatestSnapshotTimestamp.With(prometheus.Labels{metrics.LabelKind: ssr.prevSnapshot.Kind}).Set(float64(ssr.prevSnapshot.CreatedOn.Unix())) + 
metrics.SnapshotRequired.With(prometheus.Labels{metrics.LabelKind: snapstore.SnapshotKindFull}).Set(0) + metrics.SnapshotRequired.With(prometheus.Labels{metrics.LabelKind: snapstore.SnapshotKindDelta}).Set(0) ssr.logger.Infof("Successfully saved full snapshot at: %s", path.Join(s.SnapDir, s.SnapName)) } @@ -325,6 +327,7 @@ func (ssr *Snapshotter) TakeDeltaSnapshot() error { ssr.prevSnapshot = snap metrics.LatestSnapshotRevision.With(prometheus.Labels{metrics.LabelKind: ssr.prevSnapshot.Kind}).Set(float64(ssr.prevSnapshot.LastRevision)) metrics.LatestSnapshotTimestamp.With(prometheus.Labels{metrics.LabelKind: ssr.prevSnapshot.Kind}).Set(float64(ssr.prevSnapshot.CreatedOn.Unix())) + metrics.SnapshotRequired.With(prometheus.Labels{metrics.LabelKind: snapstore.SnapshotKindDelta}).Set(0) ssr.logger.Infof("Successfully saved delta snapshot at: %s", path.Join(snap.SnapDir, snap.SnapName)) return nil } @@ -351,11 +354,25 @@ func (ssr *Snapshotter) CollectEventsSincePrevSnapshot(stopCh <-chan struct{}) ( } lastEtcdRevision := resp.Header.Revision + metrics.SnapshotRequired.With(prometheus.Labels{metrics.LabelKind: snapstore.SnapshotKindFull}).Set(0) + metrics.SnapshotRequired.With(prometheus.Labels{metrics.LabelKind: snapstore.SnapshotKindDelta}).Set(0) + + // if etcd revision newer than latest full snapshot revision, + // set `required` metric for full snapshot to 1 + if ssr.PrevFullSnapshot == nil || ssr.PrevFullSnapshot.LastRevision != lastEtcdRevision { + metrics.SnapshotRequired.With(prometheus.Labels{metrics.LabelKind: snapstore.SnapshotKindFull}).Set(1) + } if ssr.prevSnapshot.LastRevision == lastEtcdRevision { ssr.logger.Infof("No new events since last snapshot. Skipping initial delta snapshot.") return false, nil } + // need to take a delta snapshot here, because etcd revision is + // newer than latest snapshot revision. 
This also means a subsequent + // full snapshot will be required later + metrics.SnapshotRequired.With(prometheus.Labels{metrics.LabelKind: snapstore.SnapshotKindDelta}).Set(1) + metrics.SnapshotRequired.With(prometheus.Labels{metrics.LabelKind: snapstore.SnapshotKindFull}).Set(1) + watchCtx, cancelWatch := context.WithCancel(context.TODO()) ssr.cancelWatch = cancelWatch ssr.etcdClient = client @@ -401,6 +418,8 @@ func (ssr *Snapshotter) handleDeltaWatchEvents(wr clientv3.WatchResponse) error } ssr.events = append(ssr.events, jsonByte...) ssr.lastEventRevision = ev.Kv.ModRevision + metrics.SnapshotRequired.With(prometheus.Labels{metrics.LabelKind: snapstore.SnapshotKindFull}).Set(1) + metrics.SnapshotRequired.With(prometheus.Labels{metrics.LabelKind: snapstore.SnapshotKindDelta}).Set(1) } ssr.logger.Debugf("Added events till revision: %d", ssr.lastEventRevision) if len(ssr.events) >= ssr.config.deltaSnapshotMemoryLimit {