Skip to content

Commit

Permalink
roachprod: add prometheus/grafana monitoring
Browse files Browse the repository at this point in the history
Release note: none
  • Loading branch information
msbutler committed Jun 16, 2022
1 parent d82ac30 commit fef4dd6
Show file tree
Hide file tree
Showing 8 changed files with 824 additions and 0 deletions.
24 changes: 24 additions & 0 deletions pkg/cmd/roachprod/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -885,6 +885,28 @@ var getProvidersCmd = &cobra.Command{
},
}

// startGrafanaCmd defines the `start-grafana` subcommand. It forwards the
// cluster name and an optional grafana config URL to roachprod.InitGrafana.
var startGrafanaCmd = &cobra.Command{
	Use:   `start-grafana <cluster> [<grafanaConfigURL>]`,
	Short: `spins up a promethius and grafana instance on an each roachprod node.`,
	Long: `by default, the prom and grafana instances on the lowest numbered node in the cluster
and will scrape from all nodes provided`,
	Args: cobra.RangeArgs(1, 2),
	RunE: func(cmd *cobra.Command, args []string) error {
		// The config URL is optional (RangeArgs(1, 2)), so args may hold only
		// the cluster name; indexing args[1] unconditionally would panic.
		// NOTE(review): an omitted URL is passed as "" — confirm InitGrafana
		// treats the empty string as "use the default config".
		var grafanaConfigURL string
		if len(args) > 1 {
			grafanaConfigURL = args[1]
		}
		return roachprod.InitGrafana(
			context.Background(), roachprodLibraryLogger, args[0], grafanaConfigURL,
		)
	},
}

// stopGrafanaCmd defines the `stop-grafana` subcommand. It takes exactly one
// argument, the cluster name, and delegates to roachprod.StopGrafana.
var stopGrafanaCmd = &cobra.Command{
	Use:   `stop-grafana <cluster>`,
	Short: `spins down promethius and grafana instances`,
	Long: fmt.Sprintf(`spins down the promethius and grafana instances on provided roachprod node and
dumps the promethius data into %s`, config.ClustersDir),
	Args: cobra.ExactArgs(1),
	RunE: func(_ *cobra.Command, positional []string) error {
		// ExactArgs(1) guarantees the cluster name is present.
		clusterName := positional[0]
		ctx := context.Background()
		return roachprod.StopGrafana(ctx, roachprodLibraryLogger, clusterName)
	},
}

func main() {
loggerCfg := logger.Config{Stdout: os.Stdout, Stderr: os.Stderr}
var loggerError error
Expand Down Expand Up @@ -935,6 +957,8 @@ func main() {
cachedHostsCmd,
versionCmd,
getProvidersCmd,
startGrafanaCmd,
stopGrafanaCmd,
)
setBashCompletionFunction()

Expand Down
1 change: 1 addition & 0 deletions pkg/roachprod/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ go_library(
"//pkg/roachprod/config",
"//pkg/roachprod/install",
"//pkg/roachprod/logger",
"//pkg/roachprod/prometheus",
"//pkg/roachprod/vm",
"//pkg/roachprod/vm/aws",
"//pkg/roachprod/vm/azure",
Expand Down
1 change: 1 addition & 0 deletions pkg/roachprod/install/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ go_library(
"//pkg/util",
"//pkg/util/httputil",
"//pkg/util/log",
"//pkg/util/retry",
"//pkg/util/syncutil",
"//pkg/util/timeutil",
"@com_github_alessio_shellescape//:shellescape",
Expand Down
57 changes: 57 additions & 0 deletions pkg/roachprod/install/cluster_synced.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/roachprod/vm/aws"
"github.com/cockroachdb/cockroach/pkg/roachprod/vm/local"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/retry"
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/cockroachdb/errors"
Expand Down Expand Up @@ -749,6 +750,35 @@ func (c *SyncedCluster) RunWithDetails(
return results, nil
}

// roachprodRetryOptions is the retry policy handed to retry.StartWithCtx by
// RepeatRun. The exact meaning of each field is defined by retry.Options.
var roachprodRetryOptions = retry.Options{
	// Retry budget.
	MaxRetries: 10,
	// Backoff schedule: initial delay, growth factor, and upper bound.
	InitialBackoff: 10 * time.Second,
	Multiplier:     2,
	MaxBackoff:     5 * time.Minute,
}

// RepeatRun behaves like c.Run but re-issues the command whenever it returns
// an error, pacing the attempts with roachprodRetryOptions. Every attempt and
// every failure is logged to l. It returns nil as soon as one attempt
// succeeds, the context's error if ctx is done when an attempt is about to
// start, and otherwise the last error from c.Run wrapped with the title.
func (c *SyncedCluster) RepeatRun(
	ctx context.Context, l *logger.Logger, stdout, stderr io.Writer, nodes Nodes, title,
	cmd string,
) error {
	var runErr error
	attempt := 0
	r := retry.StartWithCtx(ctx, roachprodRetryOptions)
	for r.Next() {
		// Don't start another attempt once the caller has given up.
		if err := ctx.Err(); err != nil {
			return err
		}
		attempt++
		l.Printf("attempt %d - %s", attempt, title)
		if runErr = c.Run(ctx, l, stdout, stderr, nodes, title, cmd); runErr == nil {
			return nil
		}
		l.Printf("error - retrying: %s", runErr)
	}
	return errors.Wrapf(runErr, "all attempts failed for %s", title)
}

// Wait TODO(peter): document
func (c *SyncedCluster) Wait(ctx context.Context, l *logger.Logger) error {
display := fmt.Sprintf("%s: waiting for nodes to start", c.Name)
Expand Down Expand Up @@ -1195,6 +1225,33 @@ func formatProgress(p float64) string {
return fmt.Sprintf("[%s%s] %.0f%%", progressDone[i:], progressTodo[:i], 100*p)
}

// PutString writes content to a local temporary file, sets that file's
// permissions to mode, and then copies it to dest on the remote(s) via c.Put.
//
// The temporary file is named after the base name of dest and is created in
// the default temp directory. It returns a wrapped error if ctx is already
// done, if the temp file cannot be created, written, closed or chmod'ed, or
// if c.Put fails.
func (c *SyncedCluster) PutString(
	ctx context.Context, l *logger.Logger, content string, dest string, mode os.FileMode,
) error {
	if ctx.Err() != nil {
		return errors.Wrap(ctx.Err(), "syncedCluster.PutString")
	}

	temp, err := ioutil.TempFile("", filepath.Base(dest))
	if err != nil {
		return errors.Wrap(err, "cluster.PutString")
	}
	src := temp.Name()

	if _, err := temp.WriteString(content); err != nil {
		// Release the file handle on the failure path; the write error is the
		// one worth reporting, so the Close error is deliberately dropped.
		_ = temp.Close()
		return errors.Wrap(err, "cluster.PutString")
	}
	// A failed Close can mean the content never reached disk, in which case
	// uploading the file would silently ship truncated data.
	if err := temp.Close(); err != nil {
		return errors.Wrap(err, "cluster.PutString")
	}

	if err := os.Chmod(src, mode); err != nil {
		return errors.Wrap(err, "cluster.PutString")
	}
	// NB: we intentionally don't remove the temp files. This is because roachprod
	// will symlink them when running locally.

	return errors.Wrap(c.Put(ctx, l, src, dest), "syncedCluster.PutString")
}

// Put TODO(peter): document
func (c *SyncedCluster) Put(ctx context.Context, l *logger.Logger, src, dest string) error {
// Check if source file exists and if it's a symlink.
Expand Down
27 changes: 27 additions & 0 deletions pkg/roachprod/prometheus/BUILD.bazel
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Bazel build definitions for the pkg/roachprod/prometheus Go package.
load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")

# Library target built from prometheus.go; publicly visible.
go_library(
name = "prometheus",
srcs = ["prometheus.go"],
importpath = "github.com/cockroachdb/cockroach/pkg/roachprod/prometheus",
visibility = ["//visibility:public"],
deps = [
"//pkg/roachprod/config",
"//pkg/roachprod/install",
"//pkg/roachprod/logger",
"@com_github_cockroachdb_errors//:errors",
"@com_github_prometheus_client_golang//api/prometheus/v1:prometheus",
"@com_github_prometheus_common//model",
"@in_gopkg_yaml_v2//:yaml_v2",
],
)

# Test target for prometheus_test.go; it embeds the library target above.
go_test(
name = "prometheus_test",
srcs = ["prometheus_test.go"],
embed = [":prometheus"],
deps = [
"//pkg/roachprod/install",
"@com_github_stretchr_testify//require",
],
)
Loading

0 comments on commit fef4dd6

Please sign in to comment.