Skip to content

Commit

Permalink
roachprod: add prometheus/grafana monitoring
Browse files Browse the repository at this point in the history
Previously, only roachtests could spin up prom/grafana servers that lasted the
lifetime of the roachtest. This PR introduces new roachprod cmds that allow
a roachprod user to easily spin up/down their own prom/grafana instances. The PR
also hooks up roachtests that rely on prom/grafana into this new infrastructure.

Release note: none
  • Loading branch information
msbutler committed Jun 28, 2022
1 parent 443c4d5 commit d1d3c42
Show file tree
Hide file tree
Showing 20 changed files with 707 additions and 488 deletions.
16 changes: 16 additions & 0 deletions pkg/cmd/roachprod/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ var (
destroyAllLocal bool
extendLifetime time.Duration
wipePreserveCerts bool
grafanaConfig string
grafanaurlOpen bool
grafanaDumpDir string
listDetails bool
listJSON bool
listMine bool
Expand Down Expand Up @@ -232,6 +235,19 @@ func initFlags() {
cachedHostsCmd.Flags().StringVar(&cachedHostsCluster,
"cluster", "", "print hosts matching cluster")

// TODO (msbutler): this flag should instead point to a relative file path that's checked into
// the repo, not some random URL.
grafanaStartCmd.Flags().StringVar(&grafanaConfig,
"grafana-config", "", "URL to grafana json config")

grafanaURLCmd.Flags().BoolVar(&grafanaurlOpen,
"open", false, "open the grafana dashboard url on the browser")

grafanaStopCmd.Flags().StringVar(&grafanaDumpDir, "dump-dir", "",
"the absolute path, on the machine running roachprod, to dump prometheus data to.\n"+
"In the dump-dir, the 'prometheus-docker-run.sh' script spins up a prometheus UI accessible on \n"+
" 0.0.0.0:9090. If dump-dir is empty, no data will get dumped.")

for _, cmd := range []*cobra.Command{createCmd, destroyCmd, extendCmd, logsCmd} {
cmd.Flags().StringVarP(&username, "username", "u", os.Getenv("ROACHPROD_USER"),
"Username to run under, detect if blank")
Expand Down
43 changes: 43 additions & 0 deletions pkg/cmd/roachprod/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -885,6 +885,46 @@ var getProvidersCmd = &cobra.Command{
},
}

// grafanaStartCmd implements `roachprod grafana-start <cluster>`. It takes
// exactly one positional argument, the cluster name, and calls
// roachprod.StartGrafana with the value of the --grafana-config flag and a nil
// prometheus config.
var grafanaStartCmd = &cobra.Command{
	Use:   `grafana-start <cluster>`,
	Short: `spins up prometheus and grafana instances on the last node in the cluster`,
	Long: `spins up prometheus and grafana instances on the highest numbered node in the cluster
and will scrape from all nodes in the cluster`,
	Args: cobra.ExactArgs(1),
	Run: wrap(func(cmd *cobra.Command, args []string) error {
		// args[0] is the cluster name; cobra.ExactArgs(1) guarantees it exists.
		return roachprod.StartGrafana(context.Background(), roachprodLibraryLogger, args[0],
			grafanaConfig, nil)
	}),
}

// grafanaStopCmd implements `roachprod grafana-stop <cluster>`. It takes
// exactly one positional argument, the cluster name, and calls
// roachprod.StopGrafana with the value of the --dump-dir flag.
var grafanaStopCmd = &cobra.Command{
	Use:   `grafana-stop <cluster>`,
	Short: `spins down prometheus and grafana instances on the last node in the cluster`,
	Long:  `spins down the prometheus and grafana instances on the last node in the cluster`,
	Args:  cobra.ExactArgs(1),
	Run: wrap(func(_ *cobra.Command, positional []string) error {
		// cobra.ExactArgs(1) guarantees the cluster name is present.
		clusterName := positional[0]
		return roachprod.StopGrafana(
			context.Background(), roachprodLibraryLogger, clusterName, grafanaDumpDir,
		)
	}),
}

// grafanaURLCmd implements `roachprod grafanaurl <cluster>`. It asks
// roachprod.GrafanaURL for the dashboard URL(s) of the named cluster, passing
// along the --open flag, prints each URL on its own line, and then prints the
// login credentials.
var grafanaURLCmd = &cobra.Command{
	Use:   `grafanaurl <cluster>`,
	Short: `returns a url to the grafana dashboard`,
	Args:  cobra.ExactArgs(1),
	Run: wrap(func(_ *cobra.Command, positional []string) error {
		dashboardURLs, err := roachprod.GrafanaURL(
			context.Background(), roachprodLibraryLogger, positional[0], grafanaurlOpen,
		)
		if err != nil {
			return err
		}
		for _, dashboardURL := range dashboardURLs {
			fmt.Println(dashboardURL)
		}
		fmt.Println("username: admin; pwd: admin")
		return nil
	}),
}

func main() {
loggerCfg := logger.Config{Stdout: os.Stdout, Stderr: os.Stderr}
var loggerError error
Expand Down Expand Up @@ -935,6 +975,9 @@ func main() {
cachedHostsCmd,
versionCmd,
getProvidersCmd,
grafanaStartCmd,
grafanaStopCmd,
grafanaURLCmd,
)
setBashCompletionFunction()

Expand Down
1 change: 1 addition & 0 deletions pkg/cmd/roachtest/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ go_library(
"//pkg/roachprod/config",
"//pkg/roachprod/install",
"//pkg/roachprod/logger",
"//pkg/roachprod/prometheus",
"//pkg/roachprod/vm",
"//pkg/testutils/skip",
"//pkg/util/contextutil",
Expand Down
11 changes: 11 additions & 0 deletions pkg/cmd/roachtest/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/roachprod"
"github.com/cockroachdb/cockroach/pkg/roachprod/install"
"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
"github.com/cockroachdb/cockroach/pkg/roachprod/prometheus"
"github.com/cockroachdb/cockroach/pkg/roachprod/vm"
"github.com/cockroachdb/cockroach/pkg/util/contextutil"
"github.com/cockroachdb/cockroach/pkg/util/log"
Expand Down Expand Up @@ -2384,3 +2385,13 @@ func (c *clusterImpl) Extend(ctx context.Context, d time.Duration, l *logger.Log
func (c *clusterImpl) NewMonitor(ctx context.Context, opts ...option.Option) cluster.Monitor {
return newMonitor(ctx, c.t, c, opts...)
}

// StartGrafana is part of the cluster.Cluster interface. It delegates to
// roachprod.StartGrafana for this cluster, passing promCfg through and
// supplying an empty string for the grafana config argument.
func (c *clusterImpl) StartGrafana(
	ctx context.Context, l *logger.Logger, promCfg *prometheus.Config,
) error {
	// No grafana config is supplied on this path; only promCfg is forwarded.
	const noGrafanaConfig = ""
	return roachprod.StartGrafana(ctx, l, c.name, noGrafanaConfig, promCfg)
}

// StopGrafana is part of the cluster.Cluster interface. It delegates to
// roachprod.StopGrafana for this cluster, forwarding dumpDir unchanged.
func (c *clusterImpl) StopGrafana(ctx context.Context, l *logger.Logger, dumpDir string) error {
	clusterName := c.name
	return roachprod.StopGrafana(ctx, l, clusterName, dumpDir)
}
1 change: 1 addition & 0 deletions pkg/cmd/roachtest/cluster/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ go_library(
"//pkg/cmd/roachtest/test",
"//pkg/roachprod/install",
"//pkg/roachprod/logger",
"//pkg/roachprod/prometheus",
"@com_github_cockroachdb_errors//:errors",
],
)
4 changes: 4 additions & 0 deletions pkg/cmd/roachtest/cluster/cluster_interface.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
"github.com/cockroachdb/cockroach/pkg/roachprod/install"
"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
"github.com/cockroachdb/cockroach/pkg/roachprod/prometheus"
)

// Cluster is the interface through which a given roachtest interacts with the
Expand Down Expand Up @@ -131,4 +132,7 @@ type Cluster interface {

FetchTimeseriesData(ctx context.Context, t test.Test) error
RefetchCertsFromNode(ctx context.Context, node int) error

StartGrafana(ctx context.Context, l *logger.Logger, promCfg *prometheus.Config) error
StopGrafana(ctx context.Context, l *logger.Logger, dumpDir string) error
}
11 changes: 11 additions & 0 deletions pkg/cmd/roachtest/option/node_list_option.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ import (
"math/rand"
"sort"
"strconv"

"github.com/cockroachdb/cockroach/pkg/roachprod/install"
)

// A NodeListOption is a slice of roachprod node identifiers. The first node is
Expand Down Expand Up @@ -116,3 +118,12 @@ func (n NodeListOption) String() string {
}
return buf.String()
}

// InstallNodes returns the receiver's node identifiers as install.Nodes,
// converting each element to install.Node and preserving order. The result is
// always a freshly allocated, non-nil slice.
func (n NodeListOption) InstallNodes() install.Nodes {
	converted := make(install.Nodes, len(n))
	for idx, nodeID := range n {
		converted[idx] = install.Node(nodeID)
	}
	return converted
}
11 changes: 0 additions & 11 deletions pkg/cmd/roachtest/tests/canary.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,17 +109,6 @@ var canaryRetryOptions = retry.Options{
MaxRetries: 10,
}

// repeatRunner bundles a test and a cluster so that the package-level
// repeatRunE can be invoked through a method that does not take them as
// explicit arguments.
type repeatRunner struct {
	T test.Test
	C cluster.Cluster
}

// repeatRunE forwards to the package-level repeatRunE, supplying the test and
// cluster stored on the receiver.
func (r repeatRunner) repeatRunE(
	ctx context.Context, node option.NodeListOption, operation string, args ...string,
) error {
	t, c := r.T, r.C
	return repeatRunE(ctx, t, c, node, operation, args...)
}

// repeatRunE is the same function as c.RunE but with an automatic retry loop.
func repeatRunE(
ctx context.Context,
Expand Down
45 changes: 21 additions & 24 deletions pkg/cmd/roachtest/tests/tpcc.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,6 @@ type workloadInstance struct {
}

const workloadPProfStartPort = 33333
const workloadPrometheusPort = 2112

// tpccImportCmd generates the command string to load tpcc data for the
// specified warehouse count into a cluster.
Expand Down Expand Up @@ -183,8 +182,7 @@ func runTPCC(ctx context.Context, t test.Test, c cluster.Cluster, opts tpccOptio
workloadInstances = append(
workloadInstances,
workloadInstance{
nodes: c.Range(1, c.Spec().NodeCount-1),
prometheusPort: workloadPrometheusPort,
nodes: c.Range(1, c.Spec().NodeCount-1),
},
)
}
Expand All @@ -202,7 +200,7 @@ func runTPCC(ctx context.Context, t test.Test, c cluster.Cluster, opts tpccOptio
return
}
cep, err := opts.ChaosEventsProcessor(
promCfg.PrometheusNode,
c.Nodes(int(promCfg.PrometheusNode[0])),
workloadInstances,
)
if err != nil {
Expand Down Expand Up @@ -1401,13 +1399,13 @@ func registerTPCCBench(r registry.Registry) {

// makeWorkloadScrapeNodes creates a ScrapeNode for every workloadInstance.
func makeWorkloadScrapeNodes(
workloadNode option.NodeListOption, workloadInstances []workloadInstance,
workloadNode install.Node, workloadInstances []workloadInstance,
) []prometheus.ScrapeNode {
workloadScrapeNodes := make([]prometheus.ScrapeNode, len(workloadInstances))
for i, workloadInstance := range workloadInstances {
workloadScrapeNodes[i] = prometheus.ScrapeNode{
Nodes: workloadNode,
Port: workloadInstance.prometheusPort,
Node: workloadNode,
Port: workloadInstance.prometheusPort,
}
}
return workloadScrapeNodes
Expand Down Expand Up @@ -1435,14 +1433,14 @@ func setupPrometheusForTPCC(
if opts.DisablePrometheus {
return nil, func() {}
}
workloadNode := c.Node(c.Spec().NodeCount)
cfg = &prometheus.Config{
PrometheusNode: workloadNode,
// Scrape each CockroachDB node and the workload node.
ScrapeConfigs: append(prometheus.MakeInsecureCockroachScrapeConfig(c.Range(1, c.Spec().NodeCount-1)),
prometheus.MakeWorkloadScrapeConfig("workload", makeWorkloadScrapeNodes(workloadNode, workloadInstances)),
),
}
cfg = &prometheus.Config{}
workloadNode := c.Node(c.Spec().NodeCount).InstallNodes()[0]
cfg.WithPrometheusNode(workloadNode)
cfg.WithNodeExporter(c.Range(1, c.Spec().NodeCount-1).InstallNodes())
cfg.WithCluster(c.Range(1, c.Spec().NodeCount-1).InstallNodes())
cfg.ScrapeConfigs = append(cfg.ScrapeConfigs, prometheus.MakeWorkloadScrapeConfig("workload",
"/", makeWorkloadScrapeNodes(workloadNode, workloadInstances)))

}
if opts.DisablePrometheus {
t.Fatal("test has PrometheusConfig but DisablePrometheus was on")
Expand All @@ -1451,15 +1449,14 @@ func setupPrometheusForTPCC(
t.Skip("skipping test as prometheus is needed, but prometheus does not yet work locally")
return nil, func() {}
}
_, saveSnap, err := prometheus.Init(
ctx,
*cfg,
c,
t.L(),
repeatRunner{C: c, T: t}.repeatRunE,
)
if err != nil {

if err := c.StartGrafana(ctx, t.L(), cfg); err != nil {
t.Fatal(err)
}
return cfg, func() { saveSnap(t.ArtifactsDir()) }
cleanupFunc := func() {
if err := c.StopGrafana(ctx, t.L(), t.ArtifactsDir()); err != nil {
t.L().ErrorfCtx(ctx, "Error(s) shutting down prom/grafana %s", err)
}
}
return cfg, cleanupFunc
}
1 change: 0 additions & 1 deletion pkg/gen/gomock.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ GOMOCK_SRCS = [
"//pkg/kv/kvclient/rangecache/rangecachemock:mocks_generated.go",
"//pkg/kv/kvclient/rangefeed:mocks_generated_test.go",
"//pkg/roachpb/roachpbmock:mocks_generated.go",
"//pkg/roachprod/prometheus:mocks_generated_test.go",
"//pkg/security/certmgr:mocks_generated_test.go",
"//pkg/sql/schemachanger/scexec:mocks_generated_test.go",
"//pkg/util/log:mocks_generated_test.go",
Expand Down
1 change: 1 addition & 0 deletions pkg/roachprod/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ go_library(
"//pkg/roachprod/config",
"//pkg/roachprod/install",
"//pkg/roachprod/logger",
"//pkg/roachprod/prometheus",
"//pkg/roachprod/vm",
"//pkg/roachprod/vm/aws",
"//pkg/roachprod/vm/azure",
Expand Down
1 change: 1 addition & 0 deletions pkg/roachprod/install/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ go_library(
"//pkg/util",
"//pkg/util/httputil",
"//pkg/util/log",
"//pkg/util/retry",
"//pkg/util/syncutil",
"//pkg/util/timeutil",
"//pkg/util/version",
Expand Down
Loading

0 comments on commit d1d3c42

Please sign in to comment.