Skip to content

Commit

Permalink
prometheus: improve UX, add grafana, node_exporter, custom dashboards
Browse files Browse the repository at this point in the history
We already had the ability to deploy a prometheus instance to a node in
the cluster. However, to run experiments / long investigations[^1] we
often need a Grafana instance with the dashboards du jour. This commit
dramatically cuts down on the manual steps needed to get this set up.

All it takes is adding setup like this to the roachtest:

```
clusNodes := c.Range(1, c.Spec().NodeCount-1)
workloadNode := c.Node(c.Spec().NodeCount)
promNode := workloadNode
cfg := (&prometheus.Config{}).
        WithCluster(clusNodes).
        WithPrometheusNode(promNode).
        WithGrafanaDashboard("https://gist.githubusercontent.com/tbg/f238d578269143187e71a1046562225f/raw").
        WithNodeExporter(clusNodes).
        WithWorkload(workloadNode, 2112).
        WithWorkload(workloadNode, 2113)

p, saveSnap, err := prometheus.Init(
        ctx,
        *cfg,
        c,
        t.L(),
        repeatRunner{C: c, T: t}.repeatRunE,
)
require.NoError(t, err)
defer saveSnap(t.ArtifactsDir())
```

There has been talk[^2] of adding some of this tooling to `roachprod`.
Probably a good idea, but we can pour an infinite amount of work into this,
and for now I think this is a good stepping stone and satisfies my
immediate needs.

[^1]: cockroachdb#82109
[^2]: [internal slack](https://cockroachlabs.slack.com/archives/CAC6K3SLU/p1654267035695569?thread_ts=1654153265.215669&cid=CAC6K3SLU)

Release note: None
  • Loading branch information
tbg committed Jun 9, 2022
1 parent 6336800 commit a442b27
Show file tree
Hide file tree
Showing 3 changed files with 177 additions and 38 deletions.
169 changes: 160 additions & 9 deletions pkg/cmd/roachtest/prometheus/prometheus.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,67 @@ type ScrapeConfig struct {
type Config struct {
// PrometheusNode is the node on which prometheus is set up. When Grafana is
// enabled, it is installed on this node as well.
PrometheusNode option.NodeListOption
// ScrapeConfigs are the scrape jobs that Init passes on when generating the
// prometheus YAML configuration.
ScrapeConfigs []ScrapeConfig
// NodeExporter lists the nodes on which node_exporter is set up. When it is
// non-empty, Init also adds a "node_exporter" scrape job (port 9100) for
// these nodes.
NodeExporter option.NodeListOption
// Grafana holds the options for the optional Grafana instance.
Grafana GrafanaConfig
}

// GrafanaConfig holds the options related to setting up a Grafana instance.
type GrafanaConfig struct {
// Enabled controls whether Init installs and starts Grafana on the
// prometheus node.
Enabled bool
// DashboardURLs are URLs (must be accessible by the prometheus node, e.g.
// gists) of dashboards to provision into Grafana. A failure to download a
// dashboard is logged and otherwise ignored. The datasource UID used by
// these dashboards should be "localprom" or they won't load properly.
//
// NB: when using gists, https://gist.github.com/[gist_user]/[gist_id]/raw/
// provides a link that always references the most up to date version.
DashboardURLs []string
}

// WithWorkload registers the `workload` process(es) listening on the given
// node(s) and port as scrape targets. All such targets are collected under a
// single "workload" scrape job, which is created on first use. Returns cfg so
// that calls can be chained.
func (cfg *Config) WithWorkload(nodes option.NodeListOption, port int) *Config {
	target := ScrapeNode{Nodes: nodes, Port: port}
	// Reuse the existing "workload" job if there is one.
	for idx := range cfg.ScrapeConfigs {
		if cfg.ScrapeConfigs[idx].JobName != "workload" {
			continue
		}
		cfg.ScrapeConfigs[idx].ScrapeNodes = append(cfg.ScrapeConfigs[idx].ScrapeNodes, target)
		return cfg
	}
	// No such job yet: create it with this target as its only member.
	workloadJob := MakeWorkloadScrapeConfig("workload", []ScrapeNode{target})
	cfg.ScrapeConfigs = append(cfg.ScrapeConfigs, workloadJob)
	return cfg
}

// WithPrometheusNode specifies the node to set up prometheus on, replacing any
// node specified earlier. Chains for convenience.
func (cfg *Config) WithPrometheusNode(node option.NodeListOption) *Config {
cfg.PrometheusNode = node
return cfg
}

// WithCluster adds scraping for a CockroachDB cluster running on the given
// nodes. All nodes are collected under a single "cockroach" scrape job:
// a repeated call extends the existing job instead of appending a second job
// with the same name, because Prometheus requires job names to be unique
// across scrape configurations. Chains for convenience.
func (cfg *Config) WithCluster(nodes option.NodeListOption) *Config {
	const jobName = "cockroach"
	clusterJob := MakeInsecureCockroachScrapeConfig(jobName, nodes)
	for i := range cfg.ScrapeConfigs {
		sc := &cfg.ScrapeConfigs[i]
		if sc.JobName == jobName {
			// Merge only the targets; the remaining settings of the job were
			// already established when it was first added.
			sc.ScrapeNodes = append(sc.ScrapeNodes, clusterJob.ScrapeNodes...)
			return cfg
		}
	}
	cfg.ScrapeConfigs = append(cfg.ScrapeConfigs, clusterJob)
	return cfg
}

// WithGrafanaDashboard queues the dashboard at the given URL for provisioning
// into Grafana and turns Grafana on if it was not enabled yet. See the
// documentation of cfg.Grafana.DashboardURLs for requirements on the URL and
// the dashboard. Returns cfg so that calls can be chained.
func (cfg *Config) WithGrafanaDashboard(url string) *Config {
	grafana := &cfg.Grafana
	grafana.DashboardURLs = append(grafana.DashboardURLs, url)
	grafana.Enabled = true
	return cfg
}

// WithNodeExporter requests that node_exporter be set up on the given
// machines. The nodes are merged into those requested by earlier calls.
// Returns cfg so that calls can be chained.
func (cfg *Config) WithNodeExporter(nodes option.NodeListOption) *Config {
	merged := cfg.NodeExporter.Merge(nodes)
	cfg.NodeExporter = merged
	return cfg
}

// Cluster is a subset of roachtest.Cluster.
Expand All @@ -74,23 +135,48 @@ func Init(
c Cluster,
l *logger.Logger,
repeatFunc func(context.Context, option.NodeListOption, string, ...string) error,
) (*Prometheus, error) {
if err := c.RunE(
) (_ *Prometheus, saveSnap func(artifactsDir string), _ error) {
if len(cfg.NodeExporter) > 0 {
if err := repeatFunc(ctx, cfg.NodeExporter, "download node exporter",
`
(sudo systemctl stop node_exporter || true) &&
rm -rf node_exporter && mkdir -p node_exporter && curl -fsSL \
https://github.com/prometheus/node_exporter/releases/download/v1.3.1/node_exporter-1.3.1.linux-amd64.tar.gz |
tar zxv --strip-components 1 -C node_exporter
`); err != nil {
return nil, nil, err
}

// Start node_exporter.
if err := c.RunE(ctx, cfg.NodeExporter, `cd node_exporter &&
sudo systemd-run --unit node_exporter --same-dir ./node_exporter`,
); err != nil {
return nil, nil, err
}
cfg.ScrapeConfigs = append(cfg.ScrapeConfigs, ScrapeConfig{
JobName: "node_exporter",
MetricsPath: "/metrics",
ScrapeNodes: []ScrapeNode{{Nodes: cfg.NodeExporter, Port: 9100}},
})
}

if err := repeatFunc(
ctx,
cfg.PrometheusNode,
"reset prometheus",
"sudo systemctl stop prometheus || echo 'no prometheus is running'",
); err != nil {
return nil, err
return nil, nil, err
}

if err := repeatFunc(
ctx,
cfg.PrometheusNode,
"download prometheus",
`rm -rf /tmp/prometheus && mkdir /tmp/prometheus && cd /tmp/prometheus &&
`sudo rm -rf /tmp/prometheus && mkdir /tmp/prometheus && cd /tmp/prometheus &&
curl -fsSL https://storage.googleapis.com/cockroach-fixtures/prometheus/prometheus-2.27.1.linux-amd64.tar.gz | tar zxv --strip-components=1`,
); err != nil {
return nil, err
return nil, nil, err
}

yamlCfg, err := makeYAMLConfig(
Expand All @@ -100,7 +186,7 @@ func Init(
cfg.ScrapeConfigs,
)
if err != nil {
return nil, err
return nil, nil, err
}

if err := c.PutString(
Expand All @@ -110,7 +196,7 @@ func Init(
0644,
cfg.PrometheusNode,
); err != nil {
return nil, err
return nil, nil, err
}

// Start prometheus as systemd.
Expand All @@ -121,9 +207,74 @@ func Init(
sudo systemd-run --unit prometheus --same-dir \
./prometheus --config.file=prometheus.yml --storage.tsdb.path=data/ --web.enable-admin-api`,
); err != nil {
return nil, err
return nil, nil, err
}
return &Prometheus{Config: cfg}, nil

if cfg.Grafana.Enabled {
// Install Grafana.
if err := repeatFunc(ctx, cfg.PrometheusNode, "install grafana",
`sudo apt-get install -qqy apt-transport-https &&
sudo apt-get install -qqy software-properties-common wget &&
wget -q -O - https://packages.grafana.com/gpg.key | sudo apt-key add - &&
echo "deb https://packages.grafana.com/enterprise/deb stable main" | sudo tee -a /etc/apt/sources.list.d/grafana.list &&
sudo apt-get update -qqy && sudo apt-get install -qqy grafana-enterprise && sudo mkdir -p /var/lib/grafana/dashboards`,
); err != nil {
return nil, nil, err
}

// Provision local prometheus instance as data source.
if err := repeatFunc(ctx, cfg.PrometheusNode, "permissions",
`sudo chmod 777 /etc/grafana/provisioning/datasources /etc/grafana/provisioning/dashboards /var/lib/grafana/dashboards`,
); err != nil {
return nil, nil, err
}
if err := c.PutString(ctx, `apiVersion: 1
datasources:
- name: prometheusdata
type: prometheus
uid: localprom
url: http://localhost:9090
`, "/etc/grafana/provisioning/datasources/prometheus.yaml", 0644, cfg.PrometheusNode); err != nil {
return nil, nil, err
}

if err := c.PutString(ctx, `apiVersion: 1
providers:
- name: 'default'
orgId: 1
folder: ''
folderUid: ''
type: file
options:
path: /var/lib/grafana/dashboards
`, "/etc/grafana/provisioning/dashboards/cockroach.yaml", 0644, cfg.PrometheusNode); err != nil {
return nil, nil, err
}

for idx, u := range cfg.Grafana.DashboardURLs {
if err := c.RunE(ctx, cfg.PrometheusNode,
"curl", "-fsSL", u, "-o", fmt.Sprintf("/var/lib/grafana/dashboards/%d.json", idx),
); err != nil {
l.PrintfCtx(ctx, "failed to download dashboard from %s: %s", u, err)
}
}

// Start Grafana. Default port is 3000.
if err := c.RunE(ctx, cfg.PrometheusNode, `sudo systemctl restart grafana-server`); err != nil {
return nil, nil, err
}
}

p := &Prometheus{Config: cfg}
return p, func(destDir string) {
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
defer cancel()
if err := p.Snapshot(ctx, c, l, destDir); err != nil {
l.Printf("failed to get prometheus snapshot: %v", err)
}
}, nil
}

// Snapshot takes a snapshot of prometheus and stores the snapshot and a script to spin up
Expand Down
11 changes: 11 additions & 0 deletions pkg/cmd/roachtest/tests/canary.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,17 @@ var canaryRetryOptions = retry.Options{
MaxRetries: 10,
}

// repeatRunner bundles a test and a cluster so that its repeatRunE method can
// be handed out as a function value that needs neither of them as arguments
// (e.g. as the repeat function given to prometheus.Init).
type repeatRunner struct {
// T is the test handed to repeatRunE.
T test.Test
// C is the cluster handed to repeatRunE.
C cluster.Cluster
}

// repeatRunE forwards to the package-level repeatRunE, supplying the test and
// cluster stored in the receiver and passing all other arguments through
// unchanged.
func (rr repeatRunner) repeatRunE(
	ctx context.Context, node option.NodeListOption, operation string, args ...string,
) error {
	t, c := rr.T, rr.C
	return repeatRunE(ctx, t, c, node, operation, args...)
}

// repeatRunE is the same function as c.RunE but with an automatic retry loop.
func repeatRunE(
ctx context.Context,
Expand Down
35 changes: 6 additions & 29 deletions pkg/cmd/roachtest/tests/tpcc.go
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ func runTPCC(ctx context.Context, t test.Test, c cluster.Cluster, opts tpccOptio
}

var ep *tpccChaosEventProcessor
promCfg, cleanupFunc := setupPrometheus(ctx, t, c, opts, workloadInstances)
promCfg, cleanupFunc := setupPrometheusForTPCC(ctx, t, c, opts, workloadInstances)
defer cleanupFunc()
if opts.ChaosEventsProcessor != nil {
if promCfg == nil {
Expand Down Expand Up @@ -1413,13 +1413,13 @@ func makeWorkloadScrapeNodes(
return workloadScrapeNodes
}

// setupPrometheus initializes prometheus to run against the provided
// setupPrometheusForTPCC initializes prometheus to run against the provided
// PrometheusConfig. If no PrometheusConfig is provided, it creates a prometheus
// scraper for all CockroachDB nodes in the TPC-C setup, as well as one for
// each workloadInstance.
// Returns the created PrometheusConfig if prometheus is initialized, as well
// as a cleanup function which should be called in a defer statement.
func setupPrometheus(
func setupPrometheusForTPCC(
ctx context.Context,
t test.Test,
c cluster.Cluster,
Expand Down Expand Up @@ -1455,38 +1455,15 @@ func setupPrometheus(
t.Skip("skipping test as prometheus is needed, but prometheus does not yet work locally")
return nil, func() {}
}
p, err := prometheus.Init(
_, saveSnap, err := prometheus.Init(
ctx,
*cfg,
c,
t.L(),
func(ctx context.Context, nodes option.NodeListOption, operation string, args ...string) error {
return repeatRunE(
ctx,
t,
c,
nodes,
operation,
args...,
)
},
repeatRunner{C: c, T: t}.repeatRunE,
)
if err != nil {
t.Fatal(err)
}

return cfg, func() {
// Use a context that will not time out to avoid the issue where
// ctx gets canceled if t.Fatal gets called.
snapshotCtx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
defer cancel()
if err := p.Snapshot(
snapshotCtx,
c,
t.L(),
t.ArtifactsDir(),
); err != nil {
t.L().Printf("failed to get prometheus snapshot: %v", err)
}
}
return cfg, func() { saveSnap(t.ArtifactsDir()) }
}

0 comments on commit a442b27

Please sign in to comment.