From 0faf0927bc396db57b3c455615103014c7f57d00 Mon Sep 17 00:00:00 2001 From: Mario Constanti Date: Wed, 21 Feb 2024 11:49:52 +0100 Subject: [PATCH 1/5] feat: count external provider operations Signed-off-by: Mario Constanti --- metrics/instance.go | 14 +++++ metrics/metrics.go | 12 ++++ runner/providers/external/external.go | 87 ++++++++++++++++++++++++++- 3 files changed, 112 insertions(+), 1 deletion(-) diff --git a/metrics/instance.go b/metrics/instance.go index a4ac66bf..1dc61d16 100644 --- a/metrics/instance.go +++ b/metrics/instance.go @@ -11,4 +11,18 @@ var ( Name: "status", Help: "Status of the instance", }, []string{"name", "status", "runner_status", "pool_owner", "pool_type", "pool_id", "provider"}) + + InstanceOperationCount = prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: metricsNamespace, + Subsystem: metricsRunnerSubsystem, + Name: "operation_total", + Help: "Total number of instance operation attempts", + }, []string{"operation", "provider"}) + + InstanceOperationFailedCount = prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: metricsNamespace, + Subsystem: metricsRunnerSubsystem, + Name: "operation_failed_total", + Help: "Total number of failed instance operation attempts", + }, []string{"operation", "provider"}) ) diff --git a/metrics/metrics.go b/metrics/metrics.go index 7dec272c..43c0c294 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -12,12 +12,16 @@ const metricsOrganizationSubsystem = "organization" const metricsRepositorySubsystem = "repository" const metricsEnterpriseSubsystem = "enterprise" const metricsWebhookSubsystem = "webhook" +const metricsGithubSubsystem = "github" // RegisterMetrics registers all the metrics func RegisterMetrics() error { var collectors []prometheus.Collector collectors = append(collectors, + + // metrics created during the periodically update of the metrics + // // runner metrics InstanceStatus, // organization metrics @@ -39,6 +43,14 @@ func RegisterMetrics() error { PoolBootstrapTimeout, // health metrics GarmHealth, + + // metrics used within normal garm operations + // e.g. count instance creations, count github api calls, ... + // + // runner instances + InstanceOperationCount, + InstanceOperationFailedCount, + // webhook metrics WebhooksReceived, ) diff --git a/runner/providers/external/external.go b/runner/providers/external/external.go index d1574047..11b614de 100644 --- a/runner/providers/external/external.go +++ b/runner/providers/external/external.go @@ -14,6 +14,7 @@ import ( garmErrors "github.com/cloudbase/garm-provider-common/errors" garmExec "github.com/cloudbase/garm-provider-common/util/exec" "github.com/cloudbase/garm/config" + "github.com/cloudbase/garm/metrics" "github.com/cloudbase/garm/params" "github.com/cloudbase/garm/runner/common" @@ -82,17 +83,34 @@ func (e *external) CreateInstance(ctx context.Context, bootstrapParams commonPar return commonParams.ProviderInstance{}, errors.Wrap(err, "serializing bootstrap params") } + metrics.InstanceOperationCount.WithLabelValues( + "CreateInstance", // label: operation + e.cfg.Name, // label: provider + ).Inc() + out, err := garmExec.Exec(ctx, e.execPath, asJs, asEnv) if err != nil { + metrics.InstanceOperationFailedCount.WithLabelValues( + "CreateInstance", // label: operation + e.cfg.Name, // label: provider + ).Inc() return commonParams.ProviderInstance{}, garmErrors.NewProviderError("provider binary %s returned error: %s", e.execPath, err) } var param commonParams.ProviderInstance if err := json.Unmarshal(out, ¶m); err != nil { + metrics.InstanceOperationFailedCount.WithLabelValues( + "CreateInstance", // label: operation + e.cfg.Name, // label: provider + ).Inc() return commonParams.ProviderInstance{}, garmErrors.NewProviderError("failed to decode response from binary: %s", err) } if err := e.validateResult(ctx, param); err != nil { + metrics.InstanceOperationFailedCount.WithLabelValues( + "CreateInstance", // label: operation + e.cfg.Name, // label: provider + ).Inc() return commonParams.ProviderInstance{}, garmErrors.NewProviderError("failed to validate result: %s", err) } @@ -113,10 +131,18 @@ func (e *external) DeleteInstance(ctx context.Context, instance string) error { } asEnv = append(asEnv, e.environmentVariables...) + metrics.InstanceOperationCount.WithLabelValues( + "DeleteInstance", // label: operation + e.cfg.Name, // label: provider + ).Inc() _, err := garmExec.Exec(ctx, e.execPath, nil, asEnv) if err != nil { var exitErr *exec.ExitError if !errors.As(err, &exitErr) || exitErr.ExitCode() != execution.ExitCodeNotFound { + metrics.InstanceOperationFailedCount.WithLabelValues( + "DeleteInstance", // label: operation + e.cfg.Name, // label: provider + ).Inc() return garmErrors.NewProviderError("provider binary %s returned error: %s", e.execPath, err) } @@ -134,19 +160,35 @@ func (e *external) GetInstance(ctx context.Context, instance string) (commonPara } asEnv = append(asEnv, e.environmentVariables...) - // TODO(gabriel-samfira): handle error types. Of particular insterest is to + // TODO(gabriel-samfira): handle error types. Of particular interest is to // know when the error is ErrNotFound. + metrics.InstanceOperationCount.WithLabelValues( + "GetInstance", // label: operation + e.cfg.Name, // label: provider + ).Inc() out, err := garmExec.Exec(ctx, e.execPath, nil, asEnv) if err != nil { + metrics.InstanceOperationFailedCount.WithLabelValues( + "GetInstance", // label: operation + e.cfg.Name, // label: provider + ).Inc() return commonParams.ProviderInstance{}, garmErrors.NewProviderError("provider binary %s returned error: %s", e.execPath, err) } var param commonParams.ProviderInstance if err := json.Unmarshal(out, ¶m); err != nil { + metrics.InstanceOperationFailedCount.WithLabelValues( + "GetInstance", // label: operation + e.cfg.Name, // label: provider + ).Inc() return commonParams.ProviderInstance{}, garmErrors.NewProviderError("failed to decode response from binary: %s", err) } if err := e.validateResult(ctx, param); err != nil { + metrics.InstanceOperationFailedCount.WithLabelValues( + "GetInstance", // label: operation + e.cfg.Name, // label: provider + ).Inc() return commonParams.ProviderInstance{}, garmErrors.NewProviderError("failed to validate result: %s", err) } @@ -163,19 +205,36 @@ func (e *external) ListInstances(ctx context.Context, poolID string) ([]commonPa } asEnv = append(asEnv, e.environmentVariables...) + metrics.InstanceOperationCount.WithLabelValues( + "ListInstances", // label: operation + e.cfg.Name, // label: provider + ).Inc() + out, err := garmExec.Exec(ctx, e.execPath, nil, asEnv) if err != nil { + metrics.InstanceOperationFailedCount.WithLabelValues( + "ListInstances", // label: operation + e.cfg.Name, // label: provider + ).Inc() return []commonParams.ProviderInstance{}, garmErrors.NewProviderError("provider binary %s returned error: %s", e.execPath, err) } var param []commonParams.ProviderInstance if err := json.Unmarshal(out, ¶m); err != nil { + metrics.InstanceOperationFailedCount.WithLabelValues( + "ListInstances", // label: operation + e.cfg.Name, // label: provider + ).Inc() return []commonParams.ProviderInstance{}, garmErrors.NewProviderError("failed to decode response from binary: %s", err) } ret := make([]commonParams.ProviderInstance, len(param)) for idx, inst := range param { if err := e.validateResult(ctx, inst); err != nil { + metrics.InstanceOperationFailedCount.WithLabelValues( + "ListInstances", // label: operation + e.cfg.Name, // label: provider + ).Inc() return []commonParams.ProviderInstance{}, garmErrors.NewProviderError("failed to validate result: %s", err) } ret[idx] = inst @@ -192,8 +251,17 @@ func (e *external) RemoveAllInstances(ctx context.Context) error { } asEnv = append(asEnv, e.environmentVariables...) + metrics.InstanceOperationCount.WithLabelValues( + "RemoveAllInstances", // label: operation + e.cfg.Name, // label: provider + ).Inc() + _, err := garmExec.Exec(ctx, e.execPath, nil, asEnv) if err != nil { + metrics.InstanceOperationFailedCount.WithLabelValues( + "RemoveAllInstances", // label: operation + e.cfg.Name, // label: provider + ).Inc() return garmErrors.NewProviderError("provider binary %s returned error: %s", e.execPath, err) } return nil @@ -209,8 +277,16 @@ func (e *external) Stop(ctx context.Context, instance string, force bool) error } asEnv = append(asEnv, e.environmentVariables...) + metrics.InstanceOperationCount.WithLabelValues( + "Stop", // label: operation + e.cfg.Name, // label: provider + ).Inc() _, err := garmExec.Exec(ctx, e.execPath, nil, asEnv) if err != nil { + metrics.InstanceOperationFailedCount.WithLabelValues( + "Stop", // label: operation + e.cfg.Name, // label: provider + ).Inc() return garmErrors.NewProviderError("provider binary %s returned error: %s", e.execPath, err) } return nil @@ -226,8 +302,17 @@ func (e *external) Start(ctx context.Context, instance string) error { } asEnv = append(asEnv, e.environmentVariables...) + metrics.InstanceOperationCount.WithLabelValues( + "Start", // label: operation + e.cfg.Name, // label: provider + ).Inc() + _, err := garmExec.Exec(ctx, e.execPath, nil, asEnv) if err != nil { + metrics.InstanceOperationFailedCount.WithLabelValues( + "Start", // label: operation + e.cfg.Name, // label: provider + ).Inc() return garmErrors.NewProviderError("provider binary %s returned error: %s", e.execPath, err) } return nil From b36b5137b6c9cc9237341d2907a0cc1c94e98321 Mon Sep 17 00:00:00 2001 From: Mario Constanti Date: Wed, 21 Feb 2024 14:22:45 +0100 Subject: [PATCH 2/5] feat: count github api calls introduce metrics counter for github api calls Signed-off-by: Mario Constanti --- metrics/github.go | 19 +++++++ metrics/metrics.go | 4 +- runner/pool/enterprise.go | 60 +++++++++++++++++++++ runner/pool/organization.go | 101 +++++++++++++++++++++++++++++++++++- runner/pool/pool.go | 4 ++ runner/pool/repository.go | 83 ++++++++++++++++++++++++++++- 6 files changed, 268 insertions(+), 3 deletions(-) create mode 100644 metrics/github.go diff --git a/metrics/github.go b/metrics/github.go new file mode 100644 index 00000000..c4f043a1 --- /dev/null +++ b/metrics/github.go @@ -0,0 +1,19 @@ +package metrics + +import "github.com/prometheus/client_golang/prometheus" + +var ( + GithubOperationCount = prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: metricsNamespace, + Subsystem: metricsGithubSubsystem, + Name: "operation_total", + Help: "Total number of github operation attempts", + }, []string{"operation", "scope"}) + + GithubOperationFailedCount = prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: metricsNamespace, + Subsystem: metricsGithubSubsystem, + Name: "operation_failed_total", + Help: "Total number of failed github operation attempts", + }, []string{"operation", "scope"}) +) diff --git a/metrics/metrics.go b/metrics/metrics.go index 43c0c294..62bdf2c2 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -50,7 +50,9 @@ func RegisterMetrics() error { // runner instances InstanceOperationCount, InstanceOperationFailedCount, - + // github + GithubOperationCount, + GithubOperationFailedCount, // webhook metrics WebhooksReceived, ) diff --git a/runner/pool/enterprise.go b/runner/pool/enterprise.go index f03947b5..cf318c65 100644 --- a/runner/pool/enterprise.go +++ b/runner/pool/enterprise.go @@ -13,6 +13,7 @@ import ( runnerErrors "github.com/cloudbase/garm-provider-common/errors" commonParams "github.com/cloudbase/garm-provider-common/params" dbCommon "github.com/cloudbase/garm/database/common" + "github.com/cloudbase/garm/metrics" "github.com/cloudbase/garm/params" "github.com/cloudbase/garm/runner/common" "github.com/cloudbase/garm/util" @@ -85,8 +86,16 @@ func (r *enterprise) findRunnerGroupByName(ctx context.Context, name string) (*g } for { + metrics.GithubOperationCount.WithLabelValues( + "ListOrganizationRunnerGroups", // label: operation + metricsLabelEnterpriseScope, // label: scope + ).Inc() runnerGroups, ghResp, err := r.ghcEnterpriseCli.ListRunnerGroups(r.ctx, r.cfg.Name, &opts) if err != nil { + metrics.GithubOperationFailedCount.WithLabelValues( + "ListOrganizationRunnerGroups", // label: operation + metricsLabelEnterpriseScope, // label: scope + ).Inc() if ghResp != nil && ghResp.StatusCode == http.StatusUnauthorized { return nil, errors.Wrap(runnerErrors.ErrUnauthorized, "fetching runners") } @@ -123,8 +132,16 @@ func (r *enterprise) GetJITConfig(ctx context.Context, instance string, pool par // TODO(gabriel-samfira): Should we make this configurable? WorkFolder: github.String("_work"), } + metrics.GithubOperationCount.WithLabelValues( + "GenerateEnterpriseJITConfig", // label: operation + metricsLabelEnterpriseScope, // label: scope + ).Inc() jitConfig, resp, err := r.ghcEnterpriseCli.GenerateEnterpriseJITConfig(ctx, r.cfg.Name, &req) if err != nil { + metrics.GithubOperationFailedCount.WithLabelValues( + "GenerateEnterpriseJITConfig", // label: operation + metricsLabelEnterpriseScope, // label: scope + ).Inc() if resp != nil && resp.StatusCode == http.StatusUnauthorized { return nil, nil, fmt.Errorf("failed to get JIT config: %w", err) } @@ -134,7 +151,17 @@ func (r *enterprise) GetJITConfig(ctx context.Context, instance string, pool par runner = jitConfig.Runner defer func() { if err != nil && runner != nil { + metrics.GithubOperationCount.WithLabelValues( + "RemoveRunner", // label: operation + metricsLabelEnterpriseScope, // label: scope + ).Inc() _, innerErr := r.ghcEnterpriseCli.RemoveRunner(r.ctx, r.cfg.Name, runner.GetID()) + if innerErr != nil { + metrics.GithubOperationFailedCount.WithLabelValues( + "RemoveRunner", // label: operation + metricsLabelEnterpriseScope, // label: scope + ).Inc() + } slog.With(slog.Any("error", innerErr)).ErrorContext( ctx, "failed to remove runner", "runner_id", runner.GetID(), "organization", r.cfg.Name) @@ -166,8 +193,16 @@ func (r *enterprise) GetRunnerInfoFromWorkflow(job params.WorkflowJob) (params.R if err := r.ValidateOwner(job); err != nil { return params.RunnerInfo{}, errors.Wrap(err, "validating owner") } + metrics.GithubOperationCount.WithLabelValues( + "GetWorkflowJobByID", // label: operation + metricsLabelEnterpriseScope, // label: scope + ).Inc() workflow, ghResp, err := r.ghcli.GetWorkflowJobByID(r.ctx, job.Repository.Owner.Login, job.Repository.Name, job.WorkflowJob.ID) if err != nil { + metrics.GithubOperationFailedCount.WithLabelValues( + "GetWorkflowJobByID", // label: operation + metricsLabelEnterpriseScope, // label: scope + ).Inc() if ghResp != nil && ghResp.StatusCode == http.StatusUnauthorized { return params.RunnerInfo{}, errors.Wrap(runnerErrors.ErrUnauthorized, "fetching workflow info") } @@ -212,8 +247,16 @@ func (r *enterprise) GetGithubRunners() ([]*github.Runner, error) { var allRunners []*github.Runner for { + metrics.GithubOperationCount.WithLabelValues( + "ListRunners", // label: operation + metricsLabelEnterpriseScope, // label: scope + ).Inc() runners, ghResp, err := r.ghcEnterpriseCli.ListRunners(r.ctx, r.cfg.Name, &opts) if err != nil { + metrics.GithubOperationFailedCount.WithLabelValues( + "ListRunners", // label: operation + metricsLabelEnterpriseScope, // label: scope + ).Inc() if ghResp != nil && ghResp.StatusCode == http.StatusUnauthorized { return nil, errors.Wrap(runnerErrors.ErrUnauthorized, "fetching runners") } @@ -231,8 +274,16 @@ func (r *enterprise) GetGithubRunners() ([]*github.Runner, error) { func (r *enterprise) FetchTools() ([]commonParams.RunnerApplicationDownload, error) { r.mux.Lock() defer r.mux.Unlock() + metrics.GithubOperationCount.WithLabelValues( + "ListRunnerApplicationDownloads", // label: operation + metricsLabelEnterpriseScope, // label: scope + ).Inc() tools, ghResp, err := r.ghcEnterpriseCli.ListRunnerApplicationDownloads(r.ctx, r.cfg.Name) if err != nil { + metrics.GithubOperationFailedCount.WithLabelValues( + "ListRunnerApplicationDownloads", // label: operation + metricsLabelEnterpriseScope, // label: scope + ).Inc() if ghResp != nil && ghResp.StatusCode == http.StatusUnauthorized { return nil, errors.Wrap(runnerErrors.ErrUnauthorized, "fetching runners") } @@ -275,9 +326,18 @@ func (r *enterprise) JwtToken() string { } func (r *enterprise) GetGithubRegistrationToken() (string, error) { + metrics.GithubOperationCount.WithLabelValues( + "CreateRegistrationToken", // label: operation + metricsLabelEnterpriseScope, // label: scope + ).Inc() + tk, ghResp, err := r.ghcEnterpriseCli.CreateRegistrationToken(r.ctx, r.cfg.Name) if err != nil { + metrics.GithubOperationFailedCount.WithLabelValues( + "CreateRegistrationToken", // label: operation + metricsLabelEnterpriseScope, // label: scope + ).Inc() if ghResp != nil && ghResp.StatusCode == http.StatusUnauthorized { return "", errors.Wrap(runnerErrors.ErrUnauthorized, "fetching registration token") } diff --git a/runner/pool/organization.go b/runner/pool/organization.go index 0f534508..3de5a9b1 100644 --- a/runner/pool/organization.go +++ b/runner/pool/organization.go @@ -27,6 +27,7 @@ import ( runnerErrors "github.com/cloudbase/garm-provider-common/errors" commonParams "github.com/cloudbase/garm-provider-common/params" dbCommon "github.com/cloudbase/garm/database/common" + "github.com/cloudbase/garm/metrics" "github.com/cloudbase/garm/params" "github.com/cloudbase/garm/runner/common" "github.com/cloudbase/garm/util" @@ -97,8 +98,16 @@ func (r *organization) findRunnerGroupByName(ctx context.Context, name string) ( } for { + metrics.GithubOperationCount.WithLabelValues( + "ListOrganizationRunnerGroups", // label: operation + metricsLabelOrganizationScope, // label: scope + ).Inc() runnerGroups, ghResp, err := r.ghcli.ListOrganizationRunnerGroups(r.ctx, r.cfg.Name, &opts) if err != nil { + metrics.GithubOperationFailedCount.WithLabelValues( + "ListOrganizationRunnerGroups", // label: operation + metricsLabelOrganizationScope, // label: scope + ).Inc() if ghResp != nil && ghResp.StatusCode == http.StatusUnauthorized { return nil, errors.Wrap(runnerErrors.ErrUnauthorized, "fetching runners") } @@ -135,8 +144,16 @@ func (r *organization) GetJITConfig(ctx context.Context, instance string, pool p // TODO(gabriel-samfira): Should we make this configurable? WorkFolder: github.String("_work"), } + metrics.GithubOperationCount.WithLabelValues( + "GenerateOrgJITConfig", // label: operation + metricsLabelOrganizationScope, // label: scope + ).Inc() jitConfig, resp, err := r.ghcli.GenerateOrgJITConfig(ctx, r.cfg.Name, &req) if err != nil { + metrics.GithubOperationFailedCount.WithLabelValues( + "GenerateOrgJITConfig", // label: operation + metricsLabelOrganizationScope, // label: scope + ).Inc() if resp != nil && resp.StatusCode == http.StatusUnauthorized { return nil, nil, fmt.Errorf("failed to get JIT config: %w", err) } @@ -146,7 +163,17 @@ func (r *organization) GetJITConfig(ctx context.Context, instance string, pool p runner = jitConfig.GetRunner() defer func() { if err != nil && runner != nil { + metrics.GithubOperationCount.WithLabelValues( + "RemoveOrganizationRunner", // label: operation + metricsLabelOrganizationScope, // label: scope + ).Inc() _, innerErr := r.ghcli.RemoveOrganizationRunner(r.ctx, r.cfg.Name, runner.GetID()) + if innerErr != nil { + metrics.GithubOperationFailedCount.WithLabelValues( + "RemoveOrganizationRunner", // label: operation + metricsLabelOrganizationScope, // label: scope + ).Inc() + } slog.With(slog.Any("error", innerErr)).ErrorContext( ctx, "failed to remove runner", "runner_id", runner.GetID(), "organization", r.cfg.Name) @@ -178,8 +205,16 @@ func (r *organization) GetRunnerInfoFromWorkflow(job params.WorkflowJob) (params if err := r.ValidateOwner(job); err != nil { return params.RunnerInfo{}, errors.Wrap(err, "validating owner") } + metrics.GithubOperationCount.WithLabelValues( + "GetWorkflowJobByID", // label: operation + metricsLabelOrganizationScope, // label: scope + ).Inc() workflow, ghResp, err := r.ghcli.GetWorkflowJobByID(r.ctx, job.Organization.Login, job.Repository.Name, job.WorkflowJob.ID) if err != nil { + metrics.GithubOperationFailedCount.WithLabelValues( + "GetWorkflowJobByID", // label: operation + metricsLabelOrganizationScope, // label: scope + ).Inc() if ghResp != nil && ghResp.StatusCode == http.StatusUnauthorized { return params.RunnerInfo{}, errors.Wrap(runnerErrors.ErrUnauthorized, "fetching workflow info") } @@ -223,8 +258,16 @@ func (r *organization) GetGithubRunners() ([]*github.Runner, error) { var allRunners []*github.Runner for { + metrics.GithubOperationCount.WithLabelValues( + "ListOrganizationRunners", // label: operation + metricsLabelOrganizationScope, // label: scope + ).Inc() runners, ghResp, err := r.ghcli.ListOrganizationRunners(r.ctx, r.cfg.Name, &opts) if err != nil { + metrics.GithubOperationFailedCount.WithLabelValues( + "ListOrganizationRunners", // label: operation + metricsLabelOrganizationScope, // label: scope + ).Inc() if ghResp != nil && ghResp.StatusCode == http.StatusUnauthorized { return nil, errors.Wrap(runnerErrors.ErrUnauthorized, "fetching runners") } @@ -243,8 +286,16 @@ func (r *organization) GetGithubRunners() ([]*github.Runner, error) { func (r *organization) FetchTools() ([]commonParams.RunnerApplicationDownload, error) { r.mux.Lock() defer r.mux.Unlock() + metrics.GithubOperationCount.WithLabelValues( + "ListOrganizationRunnerApplicationDownloads", // label: operation + metricsLabelOrganizationScope, // label: scope + ).Inc() tools, ghResp, err := r.ghcli.ListOrganizationRunnerApplicationDownloads(r.ctx, r.cfg.Name) if err != nil { + metrics.GithubOperationFailedCount.WithLabelValues( + "ListOrganizationRunnerApplicationDownloads", // label: operation + metricsLabelOrganizationScope, // label: scope + ).Inc() if ghResp != nil && ghResp.StatusCode == http.StatusUnauthorized { return nil, errors.Wrap(runnerErrors.ErrUnauthorized, "fetching tools") } @@ -267,7 +318,21 @@ func (r *organization) FetchDbInstances() ([]params.Instance, error) { } func (r *organization) RemoveGithubRunner(runnerID int64) (*github.Response, error) { - return r.ghcli.RemoveOrganizationRunner(r.ctx, r.cfg.Name, runnerID) + metrics.GithubOperationCount.WithLabelValues( + "RemoveRunner", // label: operation + metricsLabelOrganizationScope, // label: scope + ).Inc() + + ghResp, err := r.ghcli.RemoveOrganizationRunner(r.ctx, r.cfg.Name, runnerID) + if err != nil { + metrics.GithubOperationFailedCount.WithLabelValues( + "RemoveRunner", // label: operation + metricsLabelOrganizationScope, // label: scope + ).Inc() + return nil, err + } + + return ghResp, nil } func (r *organization) ListPools() ([]params.Pool, error) { @@ -340,8 +405,16 @@ func (r *organization) listHooks(ctx context.Context) ([]*github.Hook, error) { } var allHooks []*github.Hook for { + metrics.GithubOperationCount.WithLabelValues( + "ListOrgHooks", // label: operation + metricsLabelOrganizationScope, // label: scope + ).Inc() hooks, ghResp, err := r.ghcli.ListOrgHooks(ctx, r.cfg.Name, &opts) if err != nil { + metrics.GithubOperationFailedCount.WithLabelValues( + "ListOrgHooks", // label: operation + metricsLabelOrganizationScope, // label: scope + ).Inc() if ghResp != nil && ghResp.StatusCode == http.StatusNotFound { return nil, runnerErrors.NewBadRequestError("organization not found or your PAT does not have access to manage webhooks") } @@ -366,12 +439,30 @@ func (r *organization) InstallHook(ctx context.Context, req *github.Hook) (param return params.HookInfo{}, errors.Wrap(err, "validating hook request") } + metrics.GithubOperationCount.WithLabelValues( + "CreateOrgHook", // label: operation + metricsLabelOrganizationScope, // label: scope + ).Inc() + hook, _, err := r.ghcli.CreateOrgHook(ctx, r.cfg.Name, req) if err != nil { + metrics.GithubOperationFailedCount.WithLabelValues( + "CreateOrgHook", // label: operation + metricsLabelOrganizationScope, // label: scope + ).Inc() return params.HookInfo{}, errors.Wrap(err, "creating organization hook") } + metrics.GithubOperationCount.WithLabelValues( + "PingOrgHook", // label: operation + metricsLabelOrganizationScope, // label: scope + ).Inc() + if _, err := r.ghcli.PingOrgHook(ctx, r.cfg.Name, hook.GetID()); err != nil { + metrics.GithubOperationFailedCount.WithLabelValues( + "PingOrgHook", // label: operation + metricsLabelOrganizationScope, // label: scope + ).Inc() slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to ping hook", "hook_id", hook.GetID()) } @@ -386,8 +477,16 @@ func (r *organization) UninstallHook(ctx context.Context, url string) error { for _, hook := range allHooks { if hook.Config["url"] == url { + metrics.GithubOperationCount.WithLabelValues( + "DeleteOrgHook", // label: operation + metricsLabelOrganizationScope, // label: scope + ).Inc() _, err = r.ghcli.DeleteOrgHook(ctx, r.cfg.Name, hook.GetID()) if err != nil { + metrics.GithubOperationFailedCount.WithLabelValues( + "DeleteOrgHook", // label: operation + metricsLabelOrganizationScope, // label: scope + ).Inc() return errors.Wrap(err, "deleting hook") } return nil diff --git a/runner/pool/pool.go b/runner/pool/pool.go index b15e3677..8e69ff03 100644 --- a/runner/pool/pool.go +++ b/runner/pool/pool.go @@ -57,6 +57,10 @@ const ( // before we give up. // TODO: make this configurable(?) maxCreateAttempts = 5 + + metricsLabelEnterpriseScope = "Enterprise" + metricsLabelRepositoryScope = "Repository" + metricsLabelOrganizationScope = "Organization" ) type keyMutex struct { diff --git a/runner/pool/repository.go b/runner/pool/repository.go index 0505eece..c21e49a0 100644 --- a/runner/pool/repository.go +++ b/runner/pool/repository.go @@ -27,6 +27,7 @@ import ( runnerErrors "github.com/cloudbase/garm-provider-common/errors" commonParams "github.com/cloudbase/garm-provider-common/params" dbCommon "github.com/cloudbase/garm/database/common" + "github.com/cloudbase/garm/metrics" "github.com/cloudbase/garm/params" "github.com/cloudbase/garm/runner/common" "github.com/cloudbase/garm/util" @@ -110,7 +111,17 @@ func (r *repository) GetJITConfig(ctx context.Context, instance string, pool par defer func() { if err != nil && runner != nil { + metrics.GithubOperationCount.WithLabelValues( + "RemoveRunner", // label: operation + metricsLabelRepositoryScope, // label: scope + ).Inc() _, innerErr := r.ghcli.RemoveRunner(r.ctx, r.cfg.Owner, r.cfg.Name, runner.GetID()) + if innerErr != nil { + metrics.GithubOperationFailedCount.WithLabelValues( + "RemoveRunner", // label: operation + metricsLabelRepositoryScope, // label: scope + ).Inc() + } slog.With(slog.Any("error", innerErr)).ErrorContext( ctx, "failed to remove runner", "runner_id", runner.GetID(), @@ -144,8 +155,16 @@ func (r *repository) GetRunnerInfoFromWorkflow(job params.WorkflowJob) (params.R if err := r.ValidateOwner(job); err != nil { return params.RunnerInfo{}, errors.Wrap(err, "validating owner") } + metrics.GithubOperationCount.WithLabelValues( + "GetWorkflowJobByID", // label: operation + metricsLabelRepositoryScope, // label: scope + ).Inc() workflow, ghResp, err := r.ghcli.GetWorkflowJobByID(r.ctx, job.Repository.Owner.Login, job.Repository.Name, job.WorkflowJob.ID) if err != nil { + metrics.GithubOperationFailedCount.WithLabelValues( + "GetWorkflowJobByID", // label: operation + metricsLabelRepositoryScope, // label: scope + ).Inc() if ghResp != nil && ghResp.StatusCode == http.StatusUnauthorized { return params.RunnerInfo{}, errors.Wrap(runnerErrors.ErrUnauthorized, "fetching workflow info") } @@ -189,8 +208,16 @@ func (r *repository) GetGithubRunners() ([]*github.Runner, error) { var allRunners []*github.Runner for { + metrics.GithubOperationCount.WithLabelValues( + "ListRunners", // label: operation + metricsLabelRepositoryScope, // label: scope + ).Inc() runners, ghResp, err := r.ghcli.ListRunners(r.ctx, r.cfg.Owner, r.cfg.Name, &opts) if err != nil { + metrics.GithubOperationFailedCount.WithLabelValues( + "ListRunners", // label: operation + metricsLabelRepositoryScope, // label: scope + ).Inc() if ghResp != nil && ghResp.StatusCode == http.StatusUnauthorized { return nil, errors.Wrap(runnerErrors.ErrUnauthorized, "fetching runners") } @@ -209,8 +236,16 @@ func (r *repository) GetGithubRunners() ([]*github.Runner, error) { func (r *repository) FetchTools() ([]commonParams.RunnerApplicationDownload, error) { r.mux.Lock() defer r.mux.Unlock() + metrics.GithubOperationCount.WithLabelValues( + "ListRunnerApplicationDownloads", // label: operation + metricsLabelRepositoryScope, // label: scope + ).Inc() tools, ghResp, err := r.ghcli.ListRunnerApplicationDownloads(r.ctx, r.cfg.Owner, r.cfg.Name) if err != nil { + metrics.GithubOperationFailedCount.WithLabelValues( + "ListRunnerApplicationDownloads", // label: operation + metricsLabelRepositoryScope, // label: scope + ).Inc() if ghResp != nil && ghResp.StatusCode == http.StatusUnauthorized { return nil, errors.Wrap(runnerErrors.ErrUnauthorized, "fetching tools") } @@ -233,7 +268,20 @@ func (r *repository) FetchDbInstances() ([]params.Instance, error) { } func (r *repository) RemoveGithubRunner(runnerID int64) (*github.Response, error) { - return r.ghcli.RemoveRunner(r.ctx, r.cfg.Owner, r.cfg.Name, runnerID) + metrics.GithubOperationCount.WithLabelValues( + "RemoveRunner", // label: operation + metricsLabelRepositoryScope, // label: scope + ).Inc() + ghResp, err := r.ghcli.RemoveRunner(r.ctx, r.cfg.Owner, r.cfg.Name, runnerID) + if err != nil { + metrics.GithubOperationFailedCount.WithLabelValues( + "RemoveRunner", // label: operation + metricsLabelRepositoryScope, // label: scope + ).Inc() + return nil, err + } + + return ghResp, nil } func (r *repository) ListPools() ([]params.Pool, error) { @@ -253,9 +301,17 @@ func (r *repository) JwtToken() string { } func (r *repository) GetGithubRegistrationToken() (string, error) { + metrics.GithubOperationCount.WithLabelValues( + "CreateRegistrationToken", // label: operation + metricsLabelRepositoryScope, // label: scope + ).Inc() tk, ghResp, err := r.ghcli.CreateRegistrationToken(r.ctx, r.cfg.Owner, r.cfg.Name) if err != nil { + metrics.GithubOperationFailedCount.WithLabelValues( + "CreateRegistrationToken", // label: operation + metricsLabelRepositoryScope, // label: scope + ).Inc() if ghResp != nil && ghResp.StatusCode == http.StatusUnauthorized { return "", errors.Wrap(runnerErrors.ErrUnauthorized, "fetching token") } @@ -305,8 +361,16 @@ func (r *repository) listHooks(ctx context.Context) ([]*github.Hook, error) { } var allHooks []*github.Hook for { + metrics.GithubOperationCount.WithLabelValues( + "ListRepoHooks", // label: operation + metricsLabelRepositoryScope, // label: scope + ).Inc() hooks, ghResp, err := r.ghcli.ListRepoHooks(ctx, r.cfg.Owner, r.cfg.Name, &opts) if err != nil { + metrics.GithubOperationCount.WithLabelValues( + "ListRepoHooks", // label: operation + metricsLabelRepositoryScope, // label: scope + ).Inc() if ghResp != nil && ghResp.StatusCode == http.StatusNotFound { return nil, runnerErrors.NewBadRequestError("repository not found or your PAT does not have access to manage webhooks") } @@ -336,7 +400,16 @@ func (r *repository) InstallHook(ctx context.Context, req *github.Hook) (params. return params.HookInfo{}, errors.Wrap(err, "creating repository hook") } + metrics.GithubOperationCount.WithLabelValues( + "PingRepoHook", // label: operation + metricsLabelRepositoryScope, // label: scope + ).Inc() + if _, err := r.ghcli.PingRepoHook(ctx, r.cfg.Owner, r.cfg.Name, hook.GetID()); err != nil { + metrics.GithubOperationFailedCount.WithLabelValues( + "PingRepoHook", // label: operation + metricsLabelRepositoryScope, // label: scope + ).Inc() slog.With(slog.Any("error", err)).ErrorContext( ctx, "failed to ping hook", "hook_id", hook.GetID(), @@ -355,8 +428,16 @@ func (r *repository) UninstallHook(ctx context.Context, url string) error { for _, hook := range allHooks { if hook.Config["url"] == url { + metrics.GithubOperationCount.WithLabelValues( + "DeleteRepoHook", // label: operation + metricsLabelRepositoryScope, // label: scope + ).Inc() _, err = r.ghcli.DeleteRepoHook(ctx, r.cfg.Owner, r.cfg.Name, hook.GetID()) if err != nil { + metrics.GithubOperationFailedCount.WithLabelValues( + "DeleteRepoHook", // label: operation + metricsLabelRepositoryScope, // label: scope + ).Inc() return errors.Wrap(err, "deleting hook") } return nil From 9716b1d8c9a7e700e5dded2ad433b066df331145 Mon Sep 17 00:00:00 2001 From: Mario Constanti Date: Wed, 21 Feb 2024 14:29:43 +0100 Subject: [PATCH 3/5] fix: align metric names Signed-off-by: Mario Constanti --- metrics/github.go | 4 ++-- metrics/instance.go | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/metrics/github.go b/metrics/github.go index c4f043a1..0c050652 100644 --- a/metrics/github.go +++ b/metrics/github.go @@ -6,14 +6,14 @@ var ( GithubOperationCount = prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: metricsNamespace, Subsystem: metricsGithubSubsystem, - Name: "operation_total", + Name: "operations_total", Help: "Total number of github operation attempts", }, []string{"operation", "scope"}) GithubOperationFailedCount = prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: metricsNamespace, Subsystem: metricsGithubSubsystem, - Name: "operation_failed_total", + Name: "errors_total", Help: "Total number of failed github operation attempts", }, []string{"operation", "scope"}) ) diff --git a/metrics/instance.go b/metrics/instance.go index 1dc61d16..7c2f2f96 100644 --- a/metrics/instance.go +++ b/metrics/instance.go @@ -15,14 +15,14 @@ var ( InstanceOperationCount = prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: metricsNamespace, Subsystem: metricsRunnerSubsystem, - Name: "operation_total", + Name: "operations_total", Help: "Total number of instance operation attempts", }, []string{"operation", "provider"}) InstanceOperationFailedCount = prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: metricsNamespace, Subsystem: metricsRunnerSubsystem, - Name: "operation_failed_total", + Name: "errors_total", Help: "Total number of failed instance operation attempts", }, []string{"operation", "provider"}) ) From 6cb6350602ec6f609cbee8aa17ce94dd6ed199cf Mon Sep 17 00:00:00 2001 From: Mario Constanti Date: Wed, 21 Feb 2024 14:41:08 +0100 Subject: [PATCH 4/5] doc: document new operations/errors metrics Signed-off-by: Mario Constanti --- doc/config_metrics.md | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/doc/config_metrics.md b/doc/config_metrics.md index 17161616..7e5318e3 100644 --- a/doc/config_metrics.md +++ b/doc/config_metrics.md @@ -48,11 +48,18 @@ This is one of the features in GARM that I really love having. For one thing, it ## Runner metrics -| Metric name | Type | Labels | Description | -|----------------------|-------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------| -| `garm_runner_status` | Gauge | `name`=<runner name>
`pool_owner`=<owner name>
`pool_type`=<repository\|organization\|enterprise>
`provider`=<provider name>
`runner_status`=<running\|stopped\|error\|pending_delete\|deleting\|pending_create\|creating\|unknown>
`status`=<idle\|pending\|terminated\|installing\|failed\|active>
| This is a gauge value that gives us details about the runners garm spawns | - -More metrics will be added in the future. +| Metric name | Type | Labels | Description | +|--------------------------------|---------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------| +| `garm_runner_status` | Gauge | `name`=<runner name>
`pool_owner`=<owner name>
`pool_type`=<repository\|organization\|enterprise>
`provider`=<provider name>
`runner_status`=<running\|stopped\|error\|pending_delete\|deleting\|pending_create\|creating\|unknown>
`status`=<idle\|pending\|terminated\|installing\|failed\|active>
| This is a gauge value that gives us details about the runners garm spawns | +| `garm_runner_operations_total` | Counter | `provider`=<provider name>
`operation`=<CreateInstance\|DeleteInstance\|GetInstance\|ListInstances\|RemoveAllInstances\|Start\Stop> | This is a counter that increments every time a runner operation is performed | +| `garm_runner_errors_total` | Counter | `provider`=<provider name>
`operation`=<CreateInstance\|DeleteInstance\|GetInstance\|ListInstances\|RemoveAllInstances\|Start\Stop> | This is a counter that increments every time a runner operation errored | + +## Github metrics + +| Metric name | Type | Labels | Description | +|--------------------------------|---------|------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------| +| `garm_github_operations_total` | Counter | `operation`=<ListRunners\|CreateRegistrationToken\|...>
`scope`=<Organization\|Repository\|Enterprise> | This is a counter that increments every time a github operation is performed | +| `garm_github_errors_total` | Counter | `operation`=<ListRunners\|CreateRegistrationToken\|...>
`scope`=<Organization\|Repository\|Enterprise> | This is a counter that increments every time a github operation errored | ## Enabling metrics From d68cc3bf051833ba893779c874ff0dcad3f1faf2 Mon Sep 17 00:00:00 2001 From: Mario Constanti Date: Thu, 22 Feb 2024 05:51:34 +0100 Subject: [PATCH 5/5] fix: add missing metrics for few gh api callS Signed-off-by: Mario Constanti --- metrics/metrics.go | 1 - runner/pool/enterprise.go | 14 +++++++++++++- runner/pool/organization.go | 8 ++++++++ runner/pool/repository.go | 17 +++++++++++++++++ 4 files changed, 38 insertions(+), 2 deletions(-) diff --git a/metrics/metrics.go b/metrics/metrics.go index 62bdf2c2..44b75031 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -16,7 +16,6 @@ const metricsGithubSubsystem = "github" // RegisterMetrics registers all the metrics func RegisterMetrics() error { - var collectors []prometheus.Collector collectors = append(collectors, diff --git a/runner/pool/enterprise.go b/runner/pool/enterprise.go index cf318c65..e71f04d4 100644 --- a/runner/pool/enterprise.go +++ b/runner/pool/enterprise.go @@ -306,7 +306,19 @@ func (r *enterprise) FetchDbInstances() ([]params.Instance, error) { } func (r *enterprise) RemoveGithubRunner(runnerID int64) (*github.Response, error) { - return r.ghcEnterpriseCli.RemoveRunner(r.ctx, r.cfg.Name, runnerID) + metrics.GithubOperationCount.WithLabelValues( + "RemoveRunner", // label: operation + metricsLabelEnterpriseScope, // label: scope + ).Inc() + ghResp, err := r.ghcEnterpriseCli.RemoveRunner(r.ctx, r.cfg.Name, runnerID) + if err != nil { + metrics.GithubOperationFailedCount.WithLabelValues( + "RemoveRunner", // label: operation + metricsLabelEnterpriseScope, // label: scope + ).Inc() + return nil, err + } + return ghResp, nil } func (r *enterprise) ListPools() ([]params.Pool, error) { diff --git a/runner/pool/organization.go b/runner/pool/organization.go index 3de5a9b1..0eee23f4 100644 --- a/runner/pool/organization.go +++ b/runner/pool/organization.go @@ -352,9 +352,17 @@ func (r *organization) JwtToken() string { } func (r *organization) GetGithubRegistrationToken() (string, error) { + metrics.GithubOperationCount.WithLabelValues( + "CreateOrganizationRegistrationToken", // label: operation + metricsLabelOrganizationScope, // label: scope + ).Inc() tk, ghResp, err := r.ghcli.CreateOrganizationRegistrationToken(r.ctx, r.cfg.Name) if err != nil { + metrics.GithubOperationFailedCount.WithLabelValues( + "CreateOrganizationRegistrationToken", // label: operation + metricsLabelOrganizationScope, // label: scope + ).Inc() if ghResp != nil && ghResp.StatusCode == http.StatusUnauthorized { return "", errors.Wrap(runnerErrors.ErrUnauthorized, "fetching token") } diff --git a/runner/pool/repository.go b/runner/pool/repository.go index c21e49a0..9ba77c13 100644 --- a/runner/pool/repository.go +++ b/runner/pool/repository.go @@ -100,8 +100,16 @@ func (r *repository) GetJITConfig(ctx context.Context, instance string, pool par // TODO(gabriel-samfira): Should we make this configurable? WorkFolder: github.String("_work"), } + metrics.GithubOperationCount.WithLabelValues( + "GenerateRepoJITConfig", // label: operation + metricsLabelRepositoryScope, // label: scope + ).Inc() jitConfig, resp, err := r.ghcli.GenerateRepoJITConfig(ctx, r.cfg.Owner, r.cfg.Name, &req) if err != nil { + metrics.GithubOperationFailedCount.WithLabelValues( + "GenerateRepoJITConfig", // label: operation + metricsLabelRepositoryScope, // label: scope + ).Inc() if resp != nil && resp.StatusCode == http.StatusUnauthorized { return nil, nil, fmt.Errorf("failed to get JIT config: %w", err) } @@ -395,8 +403,17 @@ func (r *repository) InstallHook(ctx context.Context, req *github.Hook) (params. return params.HookInfo{}, errors.Wrap(err, "validating hook request") } + metrics.GithubOperationCount.WithLabelValues( + "CreateRepoHook", // label: operation + metricsLabelRepositoryScope, // label: scope + ).Inc() + hook, _, err := r.ghcli.CreateRepoHook(ctx, r.cfg.Owner, r.cfg.Name, req) if err != nil { + metrics.GithubOperationFailedCount.WithLabelValues( + "CreateRepoHook", // label: operation + metricsLabelRepositoryScope, // label: scope + ).Inc() return params.HookInfo{}, errors.Wrap(err, "creating repository hook") }