Skip to content

Commit

Permalink
Export some backup stats as Prometheus metrics.
Browse files Browse the repository at this point in the history
Each of the following metrics will have a label called "policyName" that
is set to the name of the MetadataBackupPolicy resource.

kubedr_num_backups
kubedr_num_successful_backups
kubedr_num_failed_backups
    All of the above metrics are counters and are self-explanatory.

kubedr_backup_size_bytes
    A Gauge that captures the size of a backup in bytes.

kubedr_backup_duration_seconds
    A Histogram with the following buckets:

    15s, 30s, 1m, 5m, 10m, 15m, 30m, 1h, ...., 10h
  • Loading branch information
draghuram committed Feb 11, 2020
1 parent 06ffd96 commit 079a46a
Show file tree
Hide file tree
Showing 14 changed files with 522 additions and 152 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ DOCKER_KUBEDR_IMAGE_TAG ?= latest
DOCKER_KUBEDR_IMAGE_NAME_SHORT ?= kubedr
DOCKER_KUBEDR_IMAGE_NAME_LONG ?= ${DOCKER_PREFIX}${DOCKER_KUBEDR_IMAGE_NAME_SHORT}

DOCKER_KUBEDRUTIL_IMAGE_TAG ?= 0.2.7
DOCKER_KUBEDRUTIL_IMAGE_TAG ?= 0.2.8
DOCKER_KUBEDRUTIL_IMAGE_NAME_SHORT ?= kubedrutil
DOCKER_KUBEDRUTIL_IMAGE_NAME_LONG ?= ${DOCKER_PREFIX}${DOCKER_KUBEDRUTIL_IMAGE_NAME_SHORT}

Expand Down
1 change: 1 addition & 0 deletions kubedr/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ RUN go mod download
COPY main.go main.go
COPY api/ api/
COPY controllers/ controllers/
COPY metrics/ metrics/

# Build
RUN CGO_ENABLED=0 GOOS=linux GOARCH=amd64 GO111MODULE=on go build -a -o manager main.go
Expand Down
4 changes: 4 additions & 0 deletions kubedr/api/v1alpha1/metadatabackuppolicy_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,10 @@ type MetadataBackupPolicyStatus struct {

// +kubebuilder:validation:Optional
SnapshotID string `json:"snapshotId"`

// Name of the pod that performed the backup.
// +kubebuilder:validation:Optional
BackupPod string `json:"backupPod"`
}

// +kubebuilder:object:root=true
Expand Down
3 changes: 2 additions & 1 deletion kubedr/api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,12 @@ spec:
apiVersion:
description: 'APIVersion defines the versioned schema of this representation
of an object. Servers should convert recognized schemas to the latest
internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources'
internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources'
type: string
kind:
description: 'Kind is a string value representing the REST resource this
object represents. Servers may infer this from the endpoint the client
submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds'
submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds'
type: string
metadata:
type: object
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,12 @@ spec:
apiVersion:
description: 'APIVersion defines the versioned schema of this representation
of an object. Servers should convert recognized schemas to the latest
internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources'
internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources'
type: string
kind:
description: 'Kind is a string value representing the REST resource this
object represents. Servers may infer this from the endpoint the client
submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds'
submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds'
type: string
metadata:
type: object
Expand Down Expand Up @@ -77,6 +77,9 @@ spec:
properties:
backupErrorMessage:
type: string
backupPod:
description: Name of the pod that performed the backup.
type: string
backupStatus:
type: string
backupTime:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,12 @@ spec:
apiVersion:
description: 'APIVersion defines the versioned schema of this representation
of an object. Servers should convert recognized schemas to the latest
internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources'
internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources'
type: string
kind:
description: 'Kind is a string value representing the REST resource this
object represents. Servers may infer this from the endpoint the client
submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds'
submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds'
type: string
metadata:
type: object
Expand Down
2 changes: 1 addition & 1 deletion kubedr/config/manager/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,4 @@ kind: Kustomization
images:
- name: controller
newName: kubedr
newTag: "0.54"
newTag: "0.42"
86 changes: 76 additions & 10 deletions kubedr/controllers/metadatabackuppolicy_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,13 +34,15 @@ import (
"sigs.k8s.io/controller-runtime/pkg/client"

kubedrv1alpha1 "kubedr/api/v1alpha1"
"kubedr/metrics"
)

// MetadataBackupPolicyReconciler reconciles a MetadataBackupPolicy object
type MetadataBackupPolicyReconciler struct {
client.Client
Log logr.Logger
Scheme *runtime.Scheme
Log logr.Logger
Scheme *runtime.Scheme
MetricsInfo *metrics.MetricsInfo
}

// Implements logic to handle a new policy.
Expand Down Expand Up @@ -96,7 +98,7 @@ func (r *MetadataBackupPolicyReconciler) processUpdate(policy *kubedrv1alpha1.Me
updateCron = true
}

if cronJob.Spec.Suspend != policy.Spec.Suspend {
if *cronJob.Spec.Suspend != *policy.Spec.Suspend {
r.Log.V(1).Info("suspend status changed")
cronJob.Spec.Suspend = policy.Spec.Suspend
updateCron = true
Expand All @@ -111,8 +113,70 @@ func (r *MetadataBackupPolicyReconciler) processUpdate(policy *kubedrv1alpha1.Me
return ctrl.Result{}, nil
}

// processStatus exports Prometheus metrics for the backup described in the
// policy's "Status" field.
//
// Reconcile can be called multiple times for the same status, so each backup
// must be counted only once. The name of the pod that performed the backup
// (Status.BackupPod) identifies the backup; once a backup has been handled,
// that name is stored in an annotation on the policy, and later calls that see
// the same pod name return without touching the metrics.
//
// The annotation is persisted before the metrics are recorded. If persisting
// it fails, the error is returned so that the request is requeued and nothing
// is recorded for this attempt. Recording first and ignoring a failed update
// would leave the backup unmarked and count it again on the next reconcile.
//
// Any status other than "Completed" is counted as a failed backup.
func (r *MetadataBackupPolicyReconciler) processStatus(policy *kubedrv1alpha1.MetadataBackupPolicy) (ctrl.Result, error) {
	r.Log.Info("processStatus...")

	backupAnnotation := "processed-backup.annotations.kubedr.catalogicsoftware.com"
	currentBackupPod := policy.Status.BackupPod

	if currentBackupPod == "" {
		r.Log.Info("No backup pod so nothing to process...")
		return ctrl.Result{}, nil
	}

	processedBackupPod, exists := policy.ObjectMeta.Annotations[backupAnnotation]
	r.Log.Info(fmt.Sprintf("processStatus: annotation exists: %v, currentBackupPod: %s, processedBackupPod: %s",
		exists, currentBackupPod, processedBackupPod))

	if exists && (processedBackupPod == currentBackupPod) {
		r.Log.Info("Already processed the backup...")
		return ctrl.Result{}, nil
	}

	r.Log.Info("Processing the backup...")

	// Read everything the metrics need before calling Update, because the
	// client may refresh "policy" from the server's response.
	policyName := policy.Name
	completed := policy.Status.BackupStatus == "Completed"
	dataAdded := policy.Status.DataAdded

	var durationSecs float64
	if completed {
		// As there is no obvious way to get a float value out of
		// resource.Quantity, use its Value() method, which always returns
		// the value rounded up to the nearest integer (away from 0).
		durationSecs = float64(policy.Status.TotalDurationSecs.Value())
	}

	// Mark this backup as processed first, so that it can never be counted twice.
	if policy.ObjectMeta.Annotations == nil {
		policy.ObjectMeta.Annotations = make(map[string]string)
	}
	policy.ObjectMeta.Annotations[backupAnnotation] = currentBackupPod

	if err := r.Update(context.Background(), policy); err != nil {
		// Nothing has been recorded yet, so requeuing is safe: the next
		// reconcile will retry both the annotation and the metrics.
		r.Log.Error(err, "Error in updating the backup annotation")
		return ctrl.Result{}, err
	}

	// Update metrics
	r.MetricsInfo.RecordBackup(policyName)

	if completed {
		r.MetricsInfo.RecordSuccessfulBackup(policyName)
		r.MetricsInfo.SetBackupSizeBytes(policyName, dataAdded)
		r.MetricsInfo.RecordBackupDuration(policyName, durationSecs)
	} else {
		// backup failed
		r.MetricsInfo.RecordFailedBackup(policyName)
	}

	return ctrl.Result{}, nil
}

// Process spec and make sure it matches status of the world.
func (r *MetadataBackupPolicyReconciler) processSpec(policy *kubedrv1alpha1.MetadataBackupPolicy,
func (r *MetadataBackupPolicyReconciler) processSpecAndStatus(policy *kubedrv1alpha1.MetadataBackupPolicy,
namespace string) (ctrl.Result, error) {

var cronJob batchv1beta1.CronJob
Expand All @@ -131,7 +195,11 @@ func (r *MetadataBackupPolicyReconciler) processSpec(policy *kubedrv1alpha1.Meta
}

// The policy exists. We need to check and make any required changes to cronJob.
return r.processUpdate(policy, &cronJob)
if result, err := r.processUpdate(policy, &cronJob); err != nil {
return result, err
}

return r.processStatus(policy)
}

func (r *MetadataBackupPolicyReconciler) setStatus(policy *kubedrv1alpha1.MetadataBackupPolicy) {
Expand All @@ -152,19 +220,17 @@ func (r *MetadataBackupPolicyReconciler) setStatus(policy *kubedrv1alpha1.Metada
// Reconcile is the main entry point called by the framework.
func (r *MetadataBackupPolicyReconciler) Reconcile(req ctrl.Request) (ctrl.Result, error) {
ctx := context.Background()
log := r.Log.WithValues("metadatabackuppolicy", req.NamespacedName)
r.Log = log

var policy kubedrv1alpha1.MetadataBackupPolicy
if err := r.Get(ctx, req.NamespacedName, &policy); err != nil {
if apierrors.IsNotFound(err) {
// we'll ignore not-found errors, since they can't be fixed by an immediate
// requeue (we'll need to wait for a new notification).
log.Info("MetadataBackupPolicy (" + req.NamespacedName.Name + ") is not found")
r.Log.Info("MetadataBackupPolicy (" + req.NamespacedName.Name + ") is not found")
return ctrl.Result{}, nil
}

log.Error(err, "unable to fetch MetadataBackupPolicy")
r.Log.Error(err, "unable to fetch MetadataBackupPolicy")
return ctrl.Result{}, err
}

Expand Down Expand Up @@ -197,7 +263,7 @@ func (r *MetadataBackupPolicyReconciler) Reconcile(req ctrl.Request) (ctrl.Resul
return ctrl.Result{}, nil
}

return r.processSpec(&policy, req.Namespace)
return r.processSpecAndStatus(&policy, req.Namespace)
}

// SetupWithManager hooks up this controller with the manager.
Expand Down
19 changes: 0 additions & 19 deletions kubedr/controllers/metadatabackuprecord_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,18 +30,7 @@ import (
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"

"github.com/prometheus/client_golang/prometheus"
kubedrv1alpha1 "kubedr/api/v1alpha1"
"sigs.k8s.io/controller-runtime/pkg/metrics"
)

var (
numDeletedBackups = prometheus.NewCounter(
prometheus.CounterOpts{
Name: "kubedr_num_metadata_backups_deleted",
Help: "Number of metadata backups deleted",
},
)
)

// MetadataBackupRecordReconciler reconciles a MetadataBackupRecord object
Expand Down Expand Up @@ -169,9 +158,6 @@ func (r *MetadataBackupRecordReconciler) Reconcile(req ctrl.Request) (ctrl.Resul
log.Error(err, "Error in starting snap delete pod")
return ctrl.Result{}, err
}

// FIX: We really need to make sure that delete succeeded.
numDeletedBackups.Inc()
}

// Keep last 3 snap deletion pods and clean up the rest.
Expand Down Expand Up @@ -287,8 +273,3 @@ func createResticSnapDeletePod(backupLocation *kubedrv1alpha1.BackupLocation, lo
},
}, nil
}

func init() {
// Register custom metrics with the global prometheus registry
metrics.Registry.MustRegister(numDeletedBackups)
}
16 changes: 7 additions & 9 deletions kubedr/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,13 @@ module kubedr
go 1.12

require (
github.com/aws/aws-sdk-go v1.25.29
github.com/go-logr/logr v0.1.0
github.com/onsi/ginkgo v1.6.0
github.com/onsi/gomega v1.4.2
github.com/prometheus/client_golang v1.0.0
github.com/onsi/ginkgo v1.12.0
github.com/onsi/gomega v1.9.0
github.com/prometheus/client_golang v1.4.0
github.com/robfig/cron v1.2.0
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2 // indirect
k8s.io/api v0.0.0-20190918195907-bd6ac527cfd2
k8s.io/apimachinery v0.0.0-20190817020851-f2f3a405f61d
k8s.io/client-go v0.0.0-20190918200256-06eb1244587a
sigs.k8s.io/controller-runtime v0.3.0
k8s.io/api v0.17.2
k8s.io/apimachinery v0.17.2
k8s.io/client-go v0.17.2
sigs.k8s.io/controller-runtime v0.4.0
)
Loading

0 comments on commit 079a46a

Please sign in to comment.