diff --git a/doc/usage/metrics.md b/doc/usage/metrics.md index e2fcc6616..a0982942c 100644 --- a/doc/usage/metrics.md +++ b/doc/usage/metrics.md @@ -50,6 +50,17 @@ Two major steps in initialization of etcd data directory are validation and rest | etcdbr_validation_duration_seconds | Total latency distribution of validating data directory. | Histogram | | etcdbr_restoration_duration_seconds | Total latency distribution of restoring from snapshot. | Histogram | +### Snapstore + +These bucket-related metrics provide information about the latest set of delta snapshots stored in the snapstore. They provide a rough estimation of the amount of time required to perform a restoration from the latest set of snapshots. + +| Name | Description | Type | +|------|-------------|------| +| etcdbr_snapstore_latest_deltas_total | Total number of delta snapshots taken since the latest full snapshot. | Gauge | +| etcdbr_snapstore_latest_deltas_revisions_total | Total number of revisions stored in delta snapshots taken since the latest full snapshot. | Gauge | + +`etcdbr_snapstore_latest_deltas_revisions_total` indicates the total number of etcd revisions (events) stored in the latest set of delta snapshots. The amount of time it would take to perform an etcd data restoration with the latest set of snapshots is directly proportional to this value. + ### Network These metrics describe the status of the network usage. We use `/proc/<etcdbr-pid>/net/dev` to get network usage details for the etcdbr process. Currently these metrics are only supported on linux-based distributions. diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index 47569d23c..d59152b9f 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -32,8 +32,9 @@ const ( // LabelKind is a metrics label indicates kind of snapshot associated with metric. 
LabelKind = "kind" - namespaceEtcdBR = "etcdbr" - subsystemSnapshot = "snapshot" + namespaceEtcdBR = "etcdbr" + subsystemSnapshot = "snapshot" + subsystemSnapstore = "snapstore" ) var ( @@ -131,6 +132,27 @@ var ( }, []string{LabelSucceeded}, ) + + // SnapstoreLatestDeltasTotal is metric to expose total number of delta snapshots taken since the latest full snapshot. + SnapstoreLatestDeltasTotal = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespaceEtcdBR, + Subsystem: subsystemSnapstore, + Name: "latest_deltas_total", + Help: "Total number of delta snapshots taken since the latest full snapshot.", + }, + []string{}, + ) + // SnapstoreLatestDeltasRevisionsTotal is metric to expose total number of revisions stored in delta snapshots taken since the latest full snapshot. + SnapstoreLatestDeltasRevisionsTotal = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespaceEtcdBR, + Subsystem: subsystemSnapstore, + Name: "latest_deltas_revisions_total", + Help: "Total number of revisions stored in delta snapshots taken since the latest full snapshot.", + }, + []string{}, + ) ) // generateLabelCombinations generates combinations of label values for metrics @@ -291,6 +313,12 @@ func init() { DefragmentationDurationSeconds.With(prometheus.Labels(combination)) } + // SnapstoreLatestDeltasTotal + SnapstoreLatestDeltasTotal.With(prometheus.Labels(map[string]string{})) + + // SnapstoreLatestDeltasRevisionsTotal + SnapstoreLatestDeltasRevisionsTotal.With(prometheus.Labels(map[string]string{})) + // Metrics have to be registered to be exposed: prometheus.MustRegister(GCSnapshotCounter) @@ -302,4 +330,7 @@ func init() { prometheus.MustRegister(RestorationDurationSeconds) prometheus.MustRegister(ValidationDurationSeconds) prometheus.MustRegister(DefragmentationDurationSeconds) + + prometheus.MustRegister(SnapstoreLatestDeltasTotal) + prometheus.MustRegister(SnapstoreLatestDeltasRevisionsTotal) } diff --git a/pkg/miscellaneous/miscellaneous.go 
b/pkg/miscellaneous/miscellaneous.go index 72a7288ff..6e2183cf5 100644 --- a/pkg/miscellaneous/miscellaneous.go +++ b/pkg/miscellaneous/miscellaneous.go @@ -17,12 +17,17 @@ package miscellaneous import ( "sort" + "github.com/gardener/etcd-backup-restore/pkg/metrics" "github.com/gardener/etcd-backup-restore/pkg/snapstore" + "github.com/prometheus/client_golang/prometheus" ) // GetLatestFullSnapshotAndDeltaSnapList returns the latest snapshot func GetLatestFullSnapshotAndDeltaSnapList(store snapstore.SnapStore) (*snapstore.Snapshot, snapstore.SnapList, error) { - var deltaSnapList snapstore.SnapList + var ( + fullSnapshot *snapstore.Snapshot + deltaSnapList snapstore.SnapList + ) snapList, err := store.List() if err != nil { return nil, nil, err @@ -33,11 +38,19 @@ func GetLatestFullSnapshotAndDeltaSnapList(store snapstore.SnapStore) (*snapstor continue } if snapList[index-1].Kind == snapstore.SnapshotKindFull { - sort.Sort(deltaSnapList) - return snapList[index-1], deltaSnapList, nil + fullSnapshot = snapList[index-1] + break } deltaSnapList = append(deltaSnapList, snapList[index-1]) } - sort.Sort(deltaSnapList) //added to ensure the list is well formed for only deltasnapshots scenarios as well - return nil, deltaSnapList, nil + + sort.Sort(deltaSnapList) // ensures that the delta snapshot list is well formed + metrics.SnapstoreLatestDeltasTotal.With(prometheus.Labels{}).Set(float64(len(deltaSnapList))) + if len(deltaSnapList) == 0 { + metrics.SnapstoreLatestDeltasRevisionsTotal.With(prometheus.Labels{}).Set(0) + } else { + revisionDiff := deltaSnapList[len(deltaSnapList)-1].LastRevision - deltaSnapList[0].StartRevision + metrics.SnapstoreLatestDeltasRevisionsTotal.With(prometheus.Labels{}).Set(float64(revisionDiff)) + } + return fullSnapshot, deltaSnapList, nil } diff --git a/pkg/snapshot/snapshotter/snapshotter.go b/pkg/snapshot/snapshotter/snapshotter.go index 5f51cd9bc..99dab74dc 100644 --- a/pkg/snapshot/snapshotter/snapshotter.go +++ 
b/pkg/snapshot/snapshotter/snapshotter.go @@ -255,6 +255,8 @@ func (ssr *Snapshotter) takeFullSnapshot() (*snapstore.Snapshot, error) { metrics.LatestSnapshotRevision.With(prometheus.Labels{metrics.LabelKind: ssr.prevSnapshot.Kind}).Set(float64(ssr.prevSnapshot.LastRevision)) metrics.LatestSnapshotTimestamp.With(prometheus.Labels{metrics.LabelKind: ssr.prevSnapshot.Kind}).Set(float64(ssr.prevSnapshot.CreatedOn.Unix())) + metrics.SnapstoreLatestDeltasTotal.With(prometheus.Labels{}).Set(0) + metrics.SnapstoreLatestDeltasRevisionsTotal.With(prometheus.Labels{}).Set(0) ssr.logger.Infof("Successfully saved full snapshot at: %s", path.Join(s.SnapDir, s.SnapName)) } @@ -340,6 +342,8 @@ func (ssr *Snapshotter) TakeDeltaSnapshot() (*snapstore.Snapshot, error) { metrics.LatestSnapshotRevision.With(prometheus.Labels{metrics.LabelKind: ssr.prevSnapshot.Kind}).Set(float64(ssr.prevSnapshot.LastRevision)) metrics.LatestSnapshotTimestamp.With(prometheus.Labels{metrics.LabelKind: ssr.prevSnapshot.Kind}).Set(float64(ssr.prevSnapshot.CreatedOn.Unix())) metrics.SnapshotRequired.With(prometheus.Labels{metrics.LabelKind: snapstore.SnapshotKindDelta}).Set(0) + metrics.SnapstoreLatestDeltasTotal.With(prometheus.Labels{}).Inc() + metrics.SnapstoreLatestDeltasRevisionsTotal.With(prometheus.Labels{}).Add(float64(snap.LastRevision - snap.StartRevision)) ssr.logger.Infof("Successfully saved delta snapshot at: %s", path.Join(snap.SnapDir, snap.SnapName)) return snap, nil }