From d29043574b23f44fa344c0428afe78c0f8456295 Mon Sep 17 00:00:00 2001 From: Bhaskarjyoti Bora Date: Thu, 2 May 2024 12:00:46 +0530 Subject: [PATCH] roachprod: support for dynamic admin url port In order to support multiple tenants on the same host, a unique, custom port can be assigned for DefaultAdminUIPort. This breaks because the port is hard-coded in prometheus config for scraping. The change is to have a http server running in prometheus server that can dynamically update the configuration when a new cluster is brought up. More details of the solution is explained in the page - https://cockroachlabs.atlassian.net/wiki/spaces/~7120207825326fb5e546c194029506f2c5335e/pages/3458531376/Dynamic+Scrape+Configs+on+Prometheus+for+Roachprod Fixes: #117125 Epic: none --- go.mod | 1 + go.sum | 2 + pkg/BUILD.bazel | 3 + pkg/cmd/roachprod/main.go | 27 ++- pkg/roachprod/BUILD.bazel | 2 + pkg/roachprod/promhelperclient/BUILD.bazel | 33 +++ pkg/roachprod/promhelperclient/client.go | 214 ++++++++++++++++++ pkg/roachprod/promhelperclient/client_test.go | 189 ++++++++++++++++ .../promhelperclient/promhelper_utils.go | 109 +++++++++ pkg/roachprod/roachprod.go | 78 ++++++- pkg/util/httputil/client.go | 56 +++++ 11 files changed, 709 insertions(+), 5 deletions(-) create mode 100644 pkg/roachprod/promhelperclient/BUILD.bazel create mode 100644 pkg/roachprod/promhelperclient/client.go create mode 100644 pkg/roachprod/promhelperclient/client_test.go create mode 100644 pkg/roachprod/promhelperclient/promhelper_utils.go diff --git a/go.mod b/go.mod index 642f0a0d41d3..32e757d9b00a 100644 --- a/go.mod +++ b/go.mod @@ -87,6 +87,7 @@ require ( ) require ( + cloud.google.com/go/secretmanager v1.10.0 github.com/Azure/azure-sdk-for-go/sdk/azcore v1.3.0 github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.1.0 github.com/Azure/azure-sdk-for-go/sdk/keyvault/azkeys v0.9.0 diff --git a/go.sum b/go.sum index 09d6e5a787ab..c389cf175c05 100644 --- a/go.sum +++ b/go.sum @@ -61,6 +61,8 @@ cloud.google.com/go/pubsub v1.2.0/go.mod h1:jhfEVHT8odbXTkndysNHCcx0awwzvfOlguIA cloud.google.com/go/pubsub v1.3.1/go.mod h1:i+ucay31+CNRpDW4Lu78I4xXG+O1r/MAHgjpRVR+TSU= cloud.google.com/go/pubsub v1.28.0 h1:XzabfdPx/+eNrsVVGLFgeUnQQKPGkMb8klRCeYK52is= cloud.google.com/go/pubsub v1.28.0/go.mod h1:vuXFpwaVoIPQMGXqRyUQigu/AX1S3IWugR9xznmcXX8= +cloud.google.com/go/secretmanager v1.10.0 h1:pu03bha7ukxF8otyPKTFdDz+rr9sE3YauS5PliDXK60= +cloud.google.com/go/secretmanager v1.10.0/go.mod h1:MfnrdvKMPNra9aZtQFvBcvRU54hbPD8/HayQdlUgJpU= cloud.google.com/go/storage v1.0.0/go.mod h1:IhtSnM/ZTZV8YYJWCY8RULGVqBDmpoyjwiyrjsg+URw= cloud.google.com/go/storage v1.5.0/go.mod h1:tpKbwo567HUNpVclU5sGELwQWBDZ8gh0ZeosJ0Rtdos= cloud.google.com/go/storage v1.6.0/go.mod h1:N7U0C8pVQ/+NIKOBQyamJIeKQKkZ+mxpohlUTyfDhBk= diff --git a/pkg/BUILD.bazel b/pkg/BUILD.bazel index 70c9718311dd..0b9d3fef311f 100644 --- a/pkg/BUILD.bazel +++ b/pkg/BUILD.bazel @@ -294,6 +294,7 @@ ALL_TESTS = [ "//pkg/roachprod/config:config_test", "//pkg/roachprod/install:install_test", "//pkg/roachprod/prometheus:prometheus_test", + "//pkg/roachprod/promhelperclient:promhelperclient_test", "//pkg/roachprod/ssh:ssh_test", "//pkg/roachprod/vm/gce:gce_test", "//pkg/roachprod/vm/local:local_test", @@ -1563,6 +1564,8 @@ GO_TARGETS = [ "//pkg/roachprod/logger:logger", "//pkg/roachprod/prometheus:prometheus", "//pkg/roachprod/prometheus:prometheus_test", + "//pkg/roachprod/promhelperclient:promhelperclient", + "//pkg/roachprod/promhelperclient:promhelperclient_test", "//pkg/roachprod/ssh:ssh", "//pkg/roachprod/ssh:ssh_test", "//pkg/roachprod/ui:ui", diff --git a/pkg/cmd/roachprod/main.go b/pkg/cmd/roachprod/main.go index b5d6682ecb56..c02013e34612 100644 --- a/pkg/cmd/roachprod/main.go +++ b/pkg/cmd/roachprod/main.go @@ -551,6 +551,30 @@ cluster setting will be set to its value. }), } +var updateTargetsCmd = &cobra.Command{ + Use: "update-targets ", + Short: "update prometheus target configurations for a cluster", + Long: `Update prometheus target configurations of each node of a cluster. + +The "start" command updates the prometheus target configuration every time. But, in case of any +failure, this command can be used to update the configurations. + +The --args and --env flags can be used to pass arbitrary command line flags and +environment variables to the cockroach process. +` + tagHelp + ` +The default prometheus url is https://grafana.testeng.crdb.io/. This can be overwritten by using the +environment variable COCKROACH_PROM_HOST_URL +`, + Args: cobra.ExactArgs(1), + Run: wrap(func(cmd *cobra.Command, args []string) error { + clusterSettingsOpts := []install.ClusterSettingOption{ + install.TagOption(tag), + install.EnvOption(nodeEnv), + } + return roachprod.UpdateTargets(context.Background(), config.Logger, args[0], clusterSettingsOpts...) + }), +} + var stopCmd = &cobra.Command{ Use: "stop [--sig] [--wait]", Short: "stop nodes on a cluster", @@ -626,8 +650,6 @@ environment variables to the cockroach process. install.EnvOption(nodeEnv), install.NumRacksOption(numRacks), } - // TODO(DarrylWong): remove once #117125 is addressed. - startOpts.AdminUIPort = 0 startOpts.Target = install.StartSharedProcessForVirtualCluster // If the user passed an `--external-nodes` option, we are @@ -1755,6 +1777,7 @@ func main() { statusCmd, monitorCmd, startCmd, + updateTargetsCmd, stopCmd, startInstanceCmd, stopInstanceCmd, diff --git a/pkg/roachprod/BUILD.bazel b/pkg/roachprod/BUILD.bazel index b5c9cac3b621..cf264585eccf 100644 --- a/pkg/roachprod/BUILD.bazel +++ b/pkg/roachprod/BUILD.bazel @@ -20,6 +20,7 @@ go_library( "//pkg/roachprod/lock", "//pkg/roachprod/logger", "//pkg/roachprod/prometheus", + "//pkg/roachprod/promhelperclient", "//pkg/roachprod/vm", "//pkg/roachprod/vm/aws", "//pkg/roachprod/vm/azure", @@ -27,6 +28,7 @@ go_library( "//pkg/roachprod/vm/local", "//pkg/server/debug/replay", "//pkg/util/ctxgroup", + "//pkg/util/envutil", "//pkg/util/httputil", "//pkg/util/retry", "//pkg/util/syncutil", diff --git a/pkg/roachprod/promhelperclient/BUILD.bazel b/pkg/roachprod/promhelperclient/BUILD.bazel new file mode 100644 index 000000000000..414455850422 --- /dev/null +++ b/pkg/roachprod/promhelperclient/BUILD.bazel @@ -0,0 +1,33 @@ +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") + +go_library( + name = "promhelperclient", + srcs = [ + "client.go", + "promhelper_utils.go", + ], + importpath = "github.com/cockroachdb/cockroach/pkg/roachprod/promhelperclient", + visibility = ["//visibility:public"], + deps = [ + "//pkg/roachprod/logger", + "//pkg/util/httputil", + "@com_github_cockroachdb_errors//:errors", + "@com_google_cloud_go_secretmanager//apiv1", + "@com_google_cloud_go_secretmanager//apiv1/secretmanagerpb", + "@org_golang_google_api//idtoken", + "@org_golang_x_oauth2//:oauth2", + ], +) + +go_test( + name = "promhelperclient_test", + srcs = ["client_test.go"], + embed = [":promhelperclient"], + deps = [ + "//pkg/roachprod/logger", + "//pkg/util/httputil", + "@com_github_stretchr_testify//require", + "@org_golang_google_api//idtoken", + "@org_golang_x_oauth2//:oauth2", + ], +) diff --git a/pkg/roachprod/promhelperclient/client.go b/pkg/roachprod/promhelperclient/client.go new file mode 100644 index 000000000000..0adbf9505e9f --- /dev/null +++ b/pkg/roachprod/promhelperclient/client.go @@ -0,0 +1,214 @@ +// Copyright 2024 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +// Utility to connect and invoke APIs of the promhelperservice. +// Doc reference - https://cockroachlabs.atlassian.net/wiki/x/MAAlzg + +package promhelperclient + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "os" + "strconv" + "strings" + "text/template" + + "github.com/cockroachdb/cockroach/pkg/roachprod/logger" + "github.com/cockroachdb/cockroach/pkg/util/httputil" + "github.com/cockroachdb/errors" + "golang.org/x/oauth2" + "google.golang.org/api/idtoken" +) + +const ( + resourceName = "instance-configs" + resourceVersion = "v1" + + ServiceAccountJson = "PROM_HELPER_SERVICE_ACCOUNT_JSON" + ServiceAccountAudience = "PROM_HELPER_SERVICE_ACCOUNT_AUDIENCE" +) + +// PromClient is used to communicate with the prometheus helper service +// keeping the functions as a variable enables us to override the value for unit testing +type PromClient struct { + // httpPut is used for http PUT operation. + httpPut func( + ctx context.Context, url string, h *httputil.RequestHeaders, body io.Reader, + ) (resp *http.Response, err error) + // httpDelete is used for http DELETE operation. + httpDelete func(ctx context.Context, url string, h *httputil.RequestHeaders) ( + resp *http.Response, err error) + // newTokenSource is the token generator source. + newTokenSource func(ctx context.Context, audience string, opts ...idtoken.ClientOption) ( + oauth2.TokenSource, error) +} + +// NewPromClient returns a new instance of PromClient +func NewPromClient() *PromClient { + return &PromClient{ + httpPut: httputil.Put, + httpDelete: httputil.Delete, + newTokenSource: idtoken.NewTokenSource, + } +} + +// instanceConfigRequest is the HTTP request received for generating instance config +type instanceConfigRequest struct { + //Config is the content of the yaml file + Config string `json:"config"` +} + +// UpdatePrometheusTargets updates the cluster config in the promUrl +func (c *PromClient) UpdatePrometheusTargets( + ctx context.Context, + promUrl, clusterName string, + forceFetchCreds bool, + nodes []string, + l *logger.Logger, +) error { + req, err := buildCreateRequest(nodes) + if err != nil { + return err + } + token, err := c.getToken(ctx, promUrl, forceFetchCreds, l) + if err != nil { + return err + } + url := getUrl(promUrl, clusterName) + l.Printf("invoking PUT for URL: %s", url) + response, err := c.httpPut(ctx, url, &httputil.RequestHeaders{ + ContentType: "application/json", + Authorization: token, + }, req) + if err != nil { + return err + } + if response.StatusCode != http.StatusOK { + defer func() { _ = response.Body.Close() }() + if response.StatusCode == http.StatusUnauthorized && !forceFetchCreds { + l.Printf("request failed - this may be due to a stale token. retrying with forceFetchCreds true ...") + return c.UpdatePrometheusTargets(ctx, promUrl, clusterName, true, nodes, l) + } + body, err := io.ReadAll(response.Body) + if err != nil { + return err + } + return errors.Newf("request failed with status %d and error %s", response.StatusCode, + string(body)) + } + return nil +} + +// DeleteClusterConfig deletes the cluster config in the promUrl +func (c *PromClient) DeleteClusterConfig( + ctx context.Context, promUrl, clusterName string, forceFetchCreds bool, l *logger.Logger, +) error { + token, err := c.getToken(ctx, promUrl, forceFetchCreds, l) + if err != nil { + return err + } + url := getUrl(promUrl, clusterName) + l.Printf("invoking DELETE for URL: %s", url) + response, err := c.httpDelete(ctx, url, &httputil.RequestHeaders{ + Authorization: token, + }) + if err != nil { + return err + } + if response.StatusCode != http.StatusNoContent { + defer func() { _ = response.Body.Close() }() + if response.StatusCode == http.StatusUnauthorized && !forceFetchCreds { + return c.DeleteClusterConfig(ctx, promUrl, clusterName, true, l) + } + body, err := io.ReadAll(response.Body) + if err != nil { + return err + } + return errors.Newf("request failed with status %d and error %s", response.StatusCode, + string(body)) + } + return nil +} + +func getUrl(promUrl, clusterName string) string { + return fmt.Sprintf("%s/%s/%s/%s", promUrl, resourceVersion, resourceName, clusterName) +} + +// ccParams are the params for the clusterConfFileTemplate +type ccParams struct { + Targets []string + Labels []string +} + +const clusterConfFileTemplate = `- targets: +{{range $val := .Targets}} - {{$val}} +{{end}} labels: +{{range $val := .Labels}} {{$val}} +{{end}} +` + +// createClusterConfigFile creates the cluster config file per node +func buildCreateRequest(nodes []string) (io.Reader, error) { + buffer := bytes.NewBufferString("---\n") + for i, n := range nodes { + if n == "" { + // this is an unsupported node + continue + } + params := &ccParams{ + Targets: []string{n}, + Labels: make([]string, 0), + } + params.Labels = append(params.Labels, + fmt.Sprintf("node: \"%s\"", strconv.Itoa(i+1)), + "tenant: system", + ) + t := template.Must(template.New("start").Parse(clusterConfFileTemplate)) + if err := t.Execute(buffer, params); err != nil { + return nil, err + } + } + + b, err := json.Marshal(&instanceConfigRequest{Config: buffer.String()}) + if err != nil { + return nil, err + } + return bytes.NewReader(b), nil +} + +// getToken gets the Authorization token for grafana +func (c *PromClient) getToken( + ctx context.Context, promUrl string, forceFetchCreds bool, l *logger.Logger, +) (string, error) { + if strings.HasPrefix(promUrl, "http:/") { + // no token needed for insecure URL + return "", nil + } + // Read in the service account key and audience, so we can retrieve the identity token. + if _, err := SetPromHelperCredsEnv(ctx, forceFetchCreds, l); err != nil { + return "", err + } + grafanaKey := os.Getenv(ServiceAccountJson) + grafanaAudience := os.Getenv(ServiceAccountAudience) + ts, err := c.newTokenSource(ctx, grafanaAudience, idtoken.WithCredentialsJSON([]byte(grafanaKey))) + if err != nil { + return "", errors.Wrap(err, "error creating GCS oauth token source from specified credential") + } + token, err := ts.Token() + if err != nil { + return "", errors.Wrap(err, "error getting identity token") + } + return fmt.Sprintf("Bearer %s", token.AccessToken), nil +} diff --git a/pkg/roachprod/promhelperclient/client_test.go b/pkg/roachprod/promhelperclient/client_test.go new file mode 100644 index 000000000000..cc4fb6177471 --- /dev/null +++ b/pkg/roachprod/promhelperclient/client_test.go @@ -0,0 +1,189 @@ +// Copyright 2024 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package promhelperclient + +import ( + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "os" + "path/filepath" + "strings" + "testing" + + "github.com/cockroachdb/cockroach/pkg/roachprod/logger" + "github.com/cockroachdb/cockroach/pkg/util/httputil" + "github.com/stretchr/testify/require" + "golang.org/x/oauth2" + "google.golang.org/api/idtoken" +) + +func TestUpdatePrometheusTargets(t *testing.T) { + l := func() *logger.Logger { + l, err := logger.RootLogger("", logger.TeeToStdout) + if err != nil { + panic(err) + } + return l + }() + ctx := context.Background() + promUrl := "http://prom_url.com" + c := NewPromClient() + t.Run("UpdatePrometheusTargets fails with 400", func(t *testing.T) { + c.httpPut = func(ctx context.Context, reqUrl string, h *httputil.RequestHeaders, body io.Reader) ( + resp *http.Response, err error) { + require.Equal(t, getUrl(promUrl, "c1"), reqUrl) + return &http.Response{ + StatusCode: 400, + Body: io.NopCloser(strings.NewReader("failed")), + }, nil + } + err := c.UpdatePrometheusTargets(ctx, promUrl, "c1", false, []string{"n1"}, l) + require.NotNil(t, err) + require.Equal(t, "request failed with status 400 and error failed", err.Error()) + }) + t.Run("UpdatePrometheusTargets succeeds", func(t *testing.T) { + c.httpPut = func(ctx context.Context, url string, h *httputil.RequestHeaders, body io.Reader) ( + resp *http.Response, err error) { + require.Equal(t, getUrl(promUrl, "c1"), url) + ir, err := getInstanceConfigRequest(io.NopCloser(body)) + require.Nil(t, err) + require.Equal(t, `--- +- targets: + - n1 + labels: + node: "1" + tenant: system + +- targets: + - n3 + labels: + node: "3" + tenant: system + +`, ir.Config) + return &http.Response{ + StatusCode: 200, + }, nil + } + err := c.UpdatePrometheusTargets(ctx, promUrl, "c1", false, []string{"n1", "", "n3"}, l) + require.Nil(t, err) + }) +} + +func TestDeleteClusterConfig(t *testing.T) { + l := func() *logger.Logger { + l, err := logger.RootLogger(filepath.Join(t.TempDir(), "test.log"), logger.TeeToStdout) + if err != nil { + panic(err) + } + return l + }() + ctx := context.Background() + promUrl := "http://prom_url.com" + c := NewPromClient() + t.Run("DeleteClusterConfig fails with 400", func(t *testing.T) { + c.httpDelete = func(ctx context.Context, url string, h *httputil.RequestHeaders) ( + resp *http.Response, err error) { + require.Equal(t, getUrl(promUrl, "c1"), url) + return &http.Response{ + StatusCode: 400, + Body: io.NopCloser(strings.NewReader("failed")), + }, nil + } + err := c.DeleteClusterConfig(ctx, promUrl, "c1", false, l) + require.NotNil(t, err) + require.Equal(t, "request failed with status 400 and error failed", err.Error()) + }) + t.Run("DeleteClusterConfig succeeds", func(t *testing.T) { + c.httpDelete = func(ctx context.Context, url string, h *httputil.RequestHeaders) ( + resp *http.Response, err error) { + require.Equal(t, getUrl(promUrl, "c1"), url) + return &http.Response{ + StatusCode: 204, + }, nil + } + err := c.DeleteClusterConfig(ctx, promUrl, "c1", false, l) + require.Nil(t, err) + }) +} + +// getInstanceConfigRequest returns the instanceConfigRequest after parsing the request json +func getInstanceConfigRequest(body io.ReadCloser) (*instanceConfigRequest, error) { + var insConfigReq instanceConfigRequest + if err := json.NewDecoder(body).Decode(&insConfigReq); err != nil { + return nil, err + } + return &insConfigReq, nil +} + +func Test_getToken(t *testing.T) { + ctx := context.Background() + l := func() *logger.Logger { + l, err := logger.RootLogger("", logger.TeeToStdout) + if err != nil { + panic(err) + } + return l + }() + c := NewPromClient() + t.Run("insecure url", func(t *testing.T) { + token, err := c.getToken(ctx, "http://test.com", false, l) + require.Nil(t, err) + require.Empty(t, token) + }) + t.Run("invalid credentials", func(t *testing.T) { + os.Setenv(ServiceAccountJson, "{}") + os.Setenv(ServiceAccountAudience, "dummy_audience") + c.newTokenSource = func(ctx context.Context, audience string, opts ...idtoken.ClientOption) (oauth2.TokenSource, error) { + return nil, fmt.Errorf("invalid") + } + token, err := c.getToken(ctx, "https://test.com", false, l) + require.NotNil(t, err) + require.Empty(t, token) + require.Equal(t, "error creating GCS oauth token source from specified credential: invalid", err.Error()) + }) + t.Run("invalid token", func(t *testing.T) { + os.Setenv(ServiceAccountJson, "{}") + os.Setenv(ServiceAccountAudience, "dummy_audience") + c.newTokenSource = func(ctx context.Context, audience string, opts ...idtoken.ClientOption) (oauth2.TokenSource, error) { + return &mockToken{token: "", err: fmt.Errorf("failed")}, nil + } + token, err := c.getToken(ctx, "https://test.com", false, l) + require.NotNil(t, err) + require.Empty(t, token) + require.Equal(t, "error getting identity token: failed", err.Error()) + }) + t.Run("success", func(t *testing.T) { + os.Setenv(ServiceAccountJson, "{}") + os.Setenv(ServiceAccountAudience, "dummy_audience") + c.newTokenSource = func(ctx context.Context, audience string, opts ...idtoken.ClientOption) (oauth2.TokenSource, error) { + return &mockToken{token: "token"}, nil + } + token, err := c.getToken(ctx, "https://test.com", false, l) + require.Nil(t, err) + require.Equal(t, "Bearer token", token) + }) +} + +type mockToken struct { + token string + err error +} + +func (tk *mockToken) Token() (*oauth2.Token, error) { + if tk.err != nil { + return nil, tk.err + } + return &oauth2.Token{AccessToken: tk.token}, nil +} diff --git a/pkg/roachprod/promhelperclient/promhelper_utils.go b/pkg/roachprod/promhelperclient/promhelper_utils.go new file mode 100644 index 000000000000..a90283e17f50 --- /dev/null +++ b/pkg/roachprod/promhelperclient/promhelper_utils.go @@ -0,0 +1,109 @@ +// Copyright 2024 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package promhelperclient + +import ( + "context" + "fmt" + "os" + "strings" + + secretmanager "cloud.google.com/go/secretmanager/apiv1" + "cloud.google.com/go/secretmanager/apiv1/secretmanagerpb" + "github.com/cockroachdb/cockroach/pkg/roachprod/logger" +) + +var ( + // promCredFile is where the prom helper credentials are stored + promCredFile = os.TempDir() + "promhelpers-secrets" +) + +// FetchedFrom indicates where the credentials have been fetched from. +// this helps in debugging and testing. +type FetchedFrom string + +const ( + Env FetchedFrom = "Env" // fetched from environment + File FetchedFrom = "File" // fetched from the promCredFile + SecretMgr FetchedFrom = "SecretMgr" // fetched from the secrets manager + + // secretsDelimiter is used as a delimeter between service account audience and JSON when stored in promCredFile or + // secrets manager + secretsDelimiter = "--||--" + + // project secrets and versions are for fetching the creds from secrets manager + project = "cockroach-ephemeral" + secrets = "prom-helpers-access" + versions = "latest" +) + +// SetPromHelperCredsEnv sets the environment variables ServiceAccountAudience and +// ServiceAccountJson based on the following conditions: +// > check if forFetch is false +// +// > forceFetch is false +// > if env is set return +// > if env is not set read from the promCredFile +// > if file is available and the creds can be read, set env return +// > read the creds from secrets manager +// +// > set the env variable and save the creds to the promCredFile +func SetPromHelperCredsEnv( + ctx context.Context, forceFetch bool, l *logger.Logger, +) (FetchedFrom, error) { + creds := "" + fetchedFrom := Env + if !forceFetch { // bypass environment anf creds file if forceFetch is false + // check if environment is set + audience := os.Getenv(ServiceAccountAudience) + saJson := os.Getenv(ServiceAccountJson) + if audience != "" && saJson != "" { + l.Printf("Secrets obtained from environment.") + return fetchedFrom, nil + } + // check if the secrets file is available + b, err := os.ReadFile(promCredFile) + if err == nil { + l.Printf("Secrets obtained from temp file: %s", promCredFile) + creds = string(b) + fetchedFrom = File + } + } + if creds == "" { + // creds == "" means (env is not set and the file does not have the creds) or forFetch is true + l.Printf("creds need to be fetched from secret manager.") + client, err := secretmanager.NewClient(ctx) + if err != nil { + return fetchedFrom, err + } + defer func() { _ = client.Close() }() + req := &secretmanagerpb.AccessSecretVersionRequest{ + Name: fmt.Sprintf("projects/%s/secrets/%s/versions/%s", project, secrets, versions), + } + fetchedFrom = SecretMgr + secrets, err := client.AccessSecretVersion(ctx, req) + if err != nil { + return fetchedFrom, err + } + creds = string(secrets.GetPayload().GetData()) + err = os.WriteFile(promCredFile, []byte(creds), 0700) + if err != nil { + l.Errorf("error writing to the credential file: %v", err) + } + } + secretValues := strings.Split(creds, secretsDelimiter) + if len(secretValues) == 2 { + _ = os.Setenv(ServiceAccountAudience, secretValues[0]) + _ = os.Setenv(ServiceAccountJson, secretValues[1]) + return fetchedFrom, nil + } + return fetchedFrom, fmt.Errorf("invalid secret values - %s", creds) +} diff --git a/pkg/roachprod/roachprod.go b/pkg/roachprod/roachprod.go index d40768db9d09..1493c63ba5ec 100644 --- a/pkg/roachprod/roachprod.go +++ b/pkg/roachprod/roachprod.go @@ -41,6 +41,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/roachprod/lock" "github.com/cockroachdb/cockroach/pkg/roachprod/logger" "github.com/cockroachdb/cockroach/pkg/roachprod/prometheus" + "github.com/cockroachdb/cockroach/pkg/roachprod/promhelperclient" "github.com/cockroachdb/cockroach/pkg/roachprod/vm" "github.com/cockroachdb/cockroach/pkg/roachprod/vm/aws" "github.com/cockroachdb/cockroach/pkg/roachprod/vm/azure" @@ -48,6 +49,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/roachprod/vm/local" "github.com/cockroachdb/cockroach/pkg/server/debug/replay" "github.com/cockroachdb/cockroach/pkg/util/ctxgroup" + "github.com/cockroachdb/cockroach/pkg/util/envutil" "github.com/cockroachdb/cockroach/pkg/util/httputil" "github.com/cockroachdb/cockroach/pkg/util/retry" "github.com/cockroachdb/cockroach/pkg/util/syncutil" @@ -57,6 +59,13 @@ import ( "golang.org/x/sys/unix" ) +const ( + // defaultPrometheusHostUrl for prometheus cluster config + defaultPrometheusHostUrl = "https://grafana.testeng.crdb.io/promhelpers" + // prometheusHostUrlEnv is the environment variable to override defaultPrometheusHostUrl + prometheusHostUrlEnv = "COCKROACH_PROM_HOST_URL" +) + // verifyClusterName ensures that the given name conforms to // our naming pattern of "-". The // username must match one of the vm.Provider account names @@ -723,8 +732,7 @@ func DefaultStartOpts() install.StartOpts { InitTarget: 1, SQLPort: 0, VirtualClusterName: install.SystemInterfaceName, - // TODO(DarrylWong): revert back to 0 once #117125 is addressed. - AdminUIPort: config.DefaultAdminUIPort, + AdminUIPort: 0, } } @@ -740,7 +748,60 @@ func Start( if err != nil { return err } - return c.Start(ctx, l, startOpts) + if err = c.Start(ctx, l, startOpts); err != nil { + return err + } + updatePrometheusTargets(ctx, l, c) + return nil +} + +// UpdateTargets updates prometheus target configurations for a cluster. +func UpdateTargets( + ctx context.Context, + l *logger.Logger, + clusterName string, + clusterSettingsOpts ...install.ClusterSettingOption, +) error { + if err := LoadClusters(); err != nil { + return err + } + c, err := newCluster(l, clusterName, clusterSettingsOpts...) + if err != nil { + return err + } + updatePrometheusTargets(ctx, l, c) + return nil +} + +// updatePrometheusTargets updates the prometheus instance cluster config. Any error is logged and ignored. +func updatePrometheusTargets(ctx context.Context, l *logger.Logger, c *install.SyncedCluster) { + nodeIPPorts := make([]string, len(c.Nodes)) + var wg sync.WaitGroup + for i, node := range c.VMs { + if node.Provider == gce.ProviderName { + wg.Add(1) + go func(index int, v vm.VM) { + defer wg.Done() + // only gce is supported for prometheus + desc, err := c.DiscoverService(ctx, install.Node(index+1), "", install.ServiceTypeUI, 0) + if err != nil { + l.Errorf("error getting the port for node %d: %v", index+1, err) + return + } + nodeInfo := fmt.Sprintf("%s:%d", v.PublicIP, desc.Port) + l.Printf("node information obtained for node index %d: %s", index, nodeInfo) + nodeIPPorts[index] = nodeInfo + }(i, node) + } + } + wg.Wait() + if len(nodeIPPorts) > 0 { + if err := promhelperclient.NewPromClient().UpdatePrometheusTargets(ctx, + envutil.EnvOrDefaultString(prometheusHostUrlEnv, defaultPrometheusHostUrl), + c.Name, false, nodeIPPorts, l); err != nil { + l.Errorf("creating cluster config failed for the ip:ports %v: %v", nodeIPPorts, err) + } + } } // Monitor monitors the status of cockroach nodes in a cluster. @@ -787,6 +848,8 @@ func Stop(ctx context.Context, l *logger.Logger, clusterName string, opts StopOp if err != nil { return err } + + _ = deleteClusterConfig(clusterName, l) return c.Stop(ctx, l, opts.Sig, opts.Wait, opts.MaxWait, "") } @@ -1358,9 +1421,18 @@ func destroyCluster(cld *cloud.Cloud, l *logger.Logger, clusterName string) erro l.Printf("Destroying cluster %s with %d nodes", clusterName, len(c.VMs)) } + _ = deleteClusterConfig(clusterName, l) + return cloud.DestroyCluster(l, c) } +// deleteClusterConfig deletes the prometheus instance cluster config. Any error is logged and ignored. +func deleteClusterConfig(clusterName string, l *logger.Logger) error { + return promhelperclient.NewPromClient().DeleteClusterConfig(context.Background(), + envutil.EnvOrDefaultString(prometheusHostUrlEnv, defaultPrometheusHostUrl), + clusterName, false, l) +} + func destroyLocalCluster(ctx context.Context, l *logger.Logger, clusterName string) error { if _, ok := readSyncedClusters(clusterName); !ok { return fmt.Errorf("cluster %s does not exist", clusterName) diff --git a/pkg/util/httputil/client.go b/pkg/util/httputil/client.go index bbeebc3ef200..7c2537593528 100644 --- a/pkg/util/httputil/client.go +++ b/pkg/util/httputil/client.go @@ -25,6 +25,12 @@ var DefaultClient = NewClientWithTimeout(StandardHTTPTimeout) // StandardHTTPTimeout is the default timeout to use for HTTP connections. const StandardHTTPTimeout time.Duration = 3 * time.Second +// RequestHeaders are the headers to be part of the request +type RequestHeaders struct { + ContentType string + Authorization string +} + // NewClientWithTimeout defines a http.Client with the given timeout. func NewClientWithTimeout(timeout time.Duration) *Client { return NewClientWithTimeouts(timeout, timeout) @@ -69,6 +75,20 @@ func Post( return DefaultClient.Post(ctx, url, contentType, body) } +// Put is like http.Put but uses the provided context and obeys its cancellation. +// It also uses the default client with a default 3 second timeout. +func Put( + ctx context.Context, url string, h *RequestHeaders, body io.Reader, +) (resp *http.Response, err error) { + return DefaultClient.Put(ctx, url, h, body) +} + +// Delete is like http.Delete but uses the provided context and obeys its cancellation. +// It also uses the default client with a default 3 second timeout. +func Delete(ctx context.Context, url string, h *RequestHeaders) (resp *http.Response, err error) { + return DefaultClient.Delete(ctx, url, h) +} + // Get does like http.Client.Get but uses the provided context and obeys its cancellation. func (c *Client) Get(ctx context.Context, url string) (resp *http.Response, err error) { req, err := http.NewRequestWithContext(ctx, "GET", url, nil) @@ -78,6 +98,42 @@ func (c *Client) Get(ctx context.Context, url string) (resp *http.Response, err return c.Do(req) } +// Put is like http.Client.Put but uses the provided context and obeys its cancellation. +// RequestHeaders can be used to set the following http headers: +// 1. ContentType +// 2. Authorization +func (c *Client) Put( + ctx context.Context, url string, h *RequestHeaders, body io.Reader, +) (resp *http.Response, err error) { + req, err := http.NewRequestWithContext(ctx, "PUT", url, body) + if err != nil { + return nil, err + } + if h != nil { + if h.ContentType != "" { + req.Header.Set("Content-Type", h.ContentType) + } + if h.Authorization != "" { + req.Header.Set("Authorization", h.Authorization) + } + } + return c.Do(req) +} + +// Delete is like http.Client.Delete but uses the provided context and obeys its cancellation. +func (c *Client) Delete( + ctx context.Context, url string, h *RequestHeaders, +) (resp *http.Response, err error) { + req, err := http.NewRequestWithContext(ctx, "DELETE", url, nil) + if err != nil { + return nil, err + } + if h != nil && h.Authorization != "" { + req.Header.Set("Authorization", h.Authorization) + } + return c.Do(req) +} + // Head does like http.Client.Head but uses the provided context and obeys its cancellation. func (c *Client) Head(ctx context.Context, url string) (resp *http.Response, err error) { req, err := http.NewRequestWithContext(ctx, "HEAD", url, nil)