Skip to content

Commit

Permalink
roachprod: support for dynamic admin url port
Browse files Browse the repository at this point in the history
In order to support multiple tenants on the same host, a unique, custom port can be assigned for DefaultAdminUIPort. This breaks because the port is hard-coded in prometheus config for scraping.

The change is to have a http server running in prometheus server that can dynamically update the configuration when a new cluster is brought up. More details of the solution is explained in the page - https://cockroachlabs.atlassian.net/wiki/spaces/~7120207825326fb5e546c194029506f2c5335e/pages/3458531376/Dynamic+Scrape+Configs+on+Prometheus+for+Roachprod

Fixes: cockroachdb#117125
Epic: none
  • Loading branch information
nameisbhaskar committed May 7, 2024
1 parent e90d15c commit 59123f6
Show file tree
Hide file tree
Showing 11 changed files with 697 additions and 5 deletions.
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ require (
)

require (
cloud.google.com/go/secretmanager v1.10.0
github.com/Azure/azure-sdk-for-go/sdk/azcore v1.3.0
github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.1.0
github.com/Azure/azure-sdk-for-go/sdk/keyvault/azkeys v0.9.0
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ cloud.google.com/go/pubsub v1.2.0/go.mod h1:jhfEVHT8odbXTkndysNHCcx0awwzvfOlguIA
cloud.google.com/go/pubsub v1.3.1/go.mod h1:i+ucay31+CNRpDW4Lu78I4xXG+O1r/MAHgjpRVR+TSU=
cloud.google.com/go/pubsub v1.28.0 h1:XzabfdPx/+eNrsVVGLFgeUnQQKPGkMb8klRCeYK52is=
cloud.google.com/go/pubsub v1.28.0/go.mod h1:vuXFpwaVoIPQMGXqRyUQigu/AX1S3IWugR9xznmcXX8=
cloud.google.com/go/secretmanager v1.10.0 h1:pu03bha7ukxF8otyPKTFdDz+rr9sE3YauS5PliDXK60=
cloud.google.com/go/secretmanager v1.10.0/go.mod h1:MfnrdvKMPNra9aZtQFvBcvRU54hbPD8/HayQdlUgJpU=
cloud.google.com/go/storage v1.0.0/go.mod h1:IhtSnM/ZTZV8YYJWCY8RULGVqBDmpoyjwiyrjsg+URw=
cloud.google.com/go/storage v1.5.0/go.mod h1:tpKbwo567HUNpVclU5sGELwQWBDZ8gh0ZeosJ0Rtdos=
cloud.google.com/go/storage v1.6.0/go.mod h1:N7U0C8pVQ/+NIKOBQyamJIeKQKkZ+mxpohlUTyfDhBk=
Expand Down
3 changes: 3 additions & 0 deletions pkg/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,7 @@ ALL_TESTS = [
"//pkg/roachprod/config:config_test",
"//pkg/roachprod/install:install_test",
"//pkg/roachprod/prometheus:prometheus_test",
"//pkg/roachprod/promhelperclient:promhelperclient_test",
"//pkg/roachprod/ssh:ssh_test",
"//pkg/roachprod/vm/gce:gce_test",
"//pkg/roachprod/vm/local:local_test",
Expand Down Expand Up @@ -1560,6 +1561,8 @@ GO_TARGETS = [
"//pkg/roachprod/logger:logger",
"//pkg/roachprod/prometheus:prometheus",
"//pkg/roachprod/prometheus:prometheus_test",
"//pkg/roachprod/promhelperclient:promhelperclient",
"//pkg/roachprod/promhelperclient:promhelperclient_test",
"//pkg/roachprod/ssh:ssh",
"//pkg/roachprod/ssh:ssh_test",
"//pkg/roachprod/ui:ui",
Expand Down
27 changes: 25 additions & 2 deletions pkg/cmd/roachprod/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -544,6 +544,30 @@ cluster setting will be set to its value.
}),
}

var updateTargetsCmd = &cobra.Command{
Use: "update-targets <cluster>",
Short: "update prometheus target configurations for a cluster",
Long: `Update prometheus target configurations of each node of a cluster.
The "start" command updates the prometheus target configuration every time. But, in case of any
failure, this command can be used to update the configurations.
The --args and --env flags can be used to pass arbitrary command line flags and
environment variables to the cockroach process.
` + tagHelp + `
The default prometheus url is https://grafana.testeng.crdb.io/. This can be overwritten by using the
environment variable COCKROACH_PROM_HOST_URL
`,
Args: cobra.ExactArgs(1),
Run: wrap(func(cmd *cobra.Command, args []string) error {
clusterSettingsOpts := []install.ClusterSettingOption{
install.TagOption(tag),
install.EnvOption(nodeEnv),
}
return roachprod.UpdateTargets(context.Background(), config.Logger, args[0], clusterSettingsOpts...)
}),
}

var stopCmd = &cobra.Command{
Use: "stop <cluster> [--sig] [--wait]",
Short: "stop nodes on a cluster",
Expand Down Expand Up @@ -619,8 +643,6 @@ environment variables to the cockroach process.
install.EnvOption(nodeEnv),
install.NumRacksOption(numRacks),
}
// TODO(DarrylWong): remove once #117125 is addressed.
startOpts.AdminUIPort = 0

startOpts.Target = install.StartSharedProcessForVirtualCluster
// If the user passed an `--external-nodes` option, we are
Expand Down Expand Up @@ -1718,6 +1740,7 @@ func main() {
statusCmd,
monitorCmd,
startCmd,
updateTargetsCmd,
stopCmd,
startInstanceCmd,
stopInstanceCmd,
Expand Down
2 changes: 2 additions & 0 deletions pkg/roachprod/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,15 @@ go_library(
"//pkg/roachprod/lock",
"//pkg/roachprod/logger",
"//pkg/roachprod/prometheus",
"//pkg/roachprod/promhelperclient",
"//pkg/roachprod/vm",
"//pkg/roachprod/vm/aws",
"//pkg/roachprod/vm/azure",
"//pkg/roachprod/vm/gce",
"//pkg/roachprod/vm/local",
"//pkg/server/debug/replay",
"//pkg/util/ctxgroup",
"//pkg/util/envutil",
"//pkg/util/httputil",
"//pkg/util/retry",
"//pkg/util/syncutil",
Expand Down
33 changes: 33 additions & 0 deletions pkg/roachprod/promhelperclient/BUILD.bazel
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test")

go_library(
name = "promhelperclient",
srcs = [
"client.go",
"promhelper_utils.go",
],
importpath = "github.com/cockroachdb/cockroach/pkg/roachprod/promhelperclient",
visibility = ["//visibility:public"],
deps = [
"//pkg/roachprod/logger",
"//pkg/util/httputil",
"@com_github_cockroachdb_errors//:errors",
"@com_google_cloud_go_secretmanager//apiv1",
"@com_google_cloud_go_secretmanager//apiv1/secretmanagerpb",
"@org_golang_google_api//idtoken",
"@org_golang_x_oauth2//:oauth2",
],
)

go_test(
name = "promhelperclient_test",
srcs = ["client_test.go"],
embed = [":promhelperclient"],
deps = [
"//pkg/roachprod/logger",
"//pkg/util/httputil",
"@com_github_stretchr_testify//require",
"@org_golang_google_api//idtoken",
"@org_golang_x_oauth2//:oauth2",
],
)
213 changes: 213 additions & 0 deletions pkg/roachprod/promhelperclient/client.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
// Copyright 2024 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

// Utility to connect and invoke APIs of the promhelperservice.
// Doc reference - https://cockroachlabs.atlassian.net/wiki/x/MAAlzg

package promhelperclient

import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"os"
"strconv"
"strings"
"text/template"

"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
"github.com/cockroachdb/cockroach/pkg/util/httputil"
"github.com/cockroachdb/errors"
"golang.org/x/oauth2"
"google.golang.org/api/idtoken"
)

const (
resourceName = "instance-configs"
resourceVersion = "v1"

ServiceAccountJson = "PROM_HELPER_SERVICE_ACCOUNT_JSON"
ServiceAccountAudience = "PROM_HELPER_SERVICE_ACCOUNT_AUDIENCE"
)

// PromClient is used to communicate with the prometheus helper service
// keeping the functions as a variable enables us to override the value for unit testing
type PromClient struct {
// httpPut is used for http PUT operation.
httpPut func(
ctx context.Context, url string, h *httputil.RequestHeaders, body io.Reader,
) (resp *http.Response, err error)
// httpDelete is used for http DELETE operation.
httpDelete func(ctx context.Context, url string, h *httputil.RequestHeaders) (
resp *http.Response, err error)
// newTokenSource is the token generator source.
newTokenSource func(ctx context.Context, audience string, opts ...idtoken.ClientOption) (
oauth2.TokenSource, error)
}

// NewPromClient returns a new instance of PromClient
func NewPromClient() *PromClient {
return &PromClient{
httpPut: httputil.Put,
httpDelete: httputil.Delete,
newTokenSource: idtoken.NewTokenSource,
}
}

// instanceConfigRequest is the HTTP request received for generating instance config
type instanceConfigRequest struct {
//Config is the content of the yaml file
Config string `json:"config"`
}

// UpdatePrometheusTargets updates the cluster config in the promUrl
func (c *PromClient) UpdatePrometheusTargets(
ctx context.Context,
promUrl, clusterName string,
forceFetchCreds bool,
nodes []string,
l *logger.Logger,
) error {
req, err := buildCreateRequest(nodes)
if err != nil {
return err
}
token, err := c.getToken(ctx, promUrl, forceFetchCreds, l)
if err != nil {
return err
}
url := getUrl(promUrl, clusterName)
l.Printf("invoking PUT for URL: %s", url)
response, err := c.httpPut(ctx, url, &httputil.RequestHeaders{
ContentType: "application/json",
Authorization: token,
}, req)
if err != nil {
return err
}
if response.StatusCode != http.StatusOK {
defer func() { _ = response.Body.Close() }()
if response.StatusCode == http.StatusUnauthorized && !forceFetchCreds {
l.Printf("request failed - this may be due to a stale token. retrying with forceFetchCreds true ...")
return c.UpdatePrometheusTargets(ctx, promUrl, clusterName, true, nodes, l)
}
body, err := io.ReadAll(response.Body)
if err != nil {
return err
}
return errors.Newf("request failed with status %d and error %s", response.StatusCode,
string(body))
}
return nil
}

// DeleteClusterConfig deletes the cluster config in the promUrl
func (c *PromClient) DeleteClusterConfig(
ctx context.Context, promUrl, clusterName string, forceFetchCreds bool, l *logger.Logger,
) error {
if _, err := SetPromHelperCredsEnv(ctx, false, l); err != nil {
return err
}
token, err := c.getToken(ctx, promUrl, forceFetchCreds, l)
if err != nil {
return err
}
url := getUrl(promUrl, clusterName)
l.Printf("invoking DELETE for URL: %s", url)
response, err := c.httpDelete(ctx, url, &httputil.RequestHeaders{
Authorization: token,
})
if err != nil {
return err
}
if response.StatusCode != http.StatusNoContent {
defer func() { _ = response.Body.Close() }()
if response.StatusCode == http.StatusUnauthorized && !forceFetchCreds {
return c.DeleteClusterConfig(ctx, promUrl, clusterName, true, l)
}
body, err := io.ReadAll(response.Body)
if err != nil {
return err
}
return errors.Newf("request failed with status %d and error %s", response.StatusCode,
string(body))
}
return nil
}

func getUrl(promUrl, clusterName string) string {
return fmt.Sprintf("%s/%s/%s/%s", promUrl, resourceVersion, resourceName, clusterName)
}

// ccParams are the params for the clusterConfFileTemplate
type ccParams struct {
Targets []string
Labels []string
}

const clusterConfFileTemplate = `- targets:
{{range $val := .Targets}} - {{$val}}
{{end}} labels:
{{range $val := .Labels}} {{$val}}
{{end}}
`

// createClusterConfigFile creates the cluster config file per node
func buildCreateRequest(nodes []string) (io.Reader, error) {
buffer := bytes.NewBufferString("---\n")
for i, n := range nodes {
params := &ccParams{
Targets: []string{n},
Labels: make([]string, 0),
}
params.Labels = append(params.Labels,
fmt.Sprintf("node: \"%s\"", strconv.Itoa(i+1)),
"tenant: system",
)
t := template.Must(template.New("start").Parse(clusterConfFileTemplate))
if err := t.Execute(buffer, params); err != nil {
return nil, err
}
}

b, err := json.Marshal(&instanceConfigRequest{Config: buffer.String()})
if err != nil {
return nil, err
}
return bytes.NewReader(b), nil
}

// getToken gets the Authorization token for grafana
func (c *PromClient) getToken(
ctx context.Context, promUrl string, forceFetchCreds bool, l *logger.Logger,
) (string, error) {
if strings.HasPrefix(promUrl, "http:/") {
// no token needed for insecure URL
return "", nil
}
// Read in the service account key and audience, so we can retrieve the identity token.
if _, err := SetPromHelperCredsEnv(ctx, forceFetchCreds, l); err != nil {
return "", err
}
grafanaKey := os.Getenv(ServiceAccountJson)
grafanaAudience := os.Getenv(ServiceAccountAudience)
ts, err := c.newTokenSource(ctx, grafanaAudience, idtoken.WithCredentialsJSON([]byte(grafanaKey)))
if err != nil {
return "", errors.Wrap(err, "error creating GCS oauth token source from specified credential")
}
token, err := ts.Token()
if err != nil {
return "", errors.Wrap(err, "error getting identity token")
}
return fmt.Sprintf("Bearer %s", token.AccessToken), nil
}
Loading

0 comments on commit 59123f6

Please sign in to comment.