From b437271d1f7ea47a693e85cb63fb84e66bab89c7 Mon Sep 17 00:00:00 2001 From: Andrei Matei Date: Thu, 23 May 2024 14:08:34 -0400 Subject: [PATCH] roachprod: permit disabling of Prometheus registration This patch makes an empty value for the COCKROACH_PROM_HOST_URL env var disable the roachprod's cluster registration with the Prometheus registration service. This is useful for people who don't work at CockroachLabs and don't have access to this service. Release note: None Epic: None --- pkg/roachprod/BUILD.bazel | 1 - pkg/roachprod/promhelperclient/BUILD.bazel | 1 + pkg/roachprod/promhelperclient/client.go | 48 +++++++++++++++---- pkg/roachprod/promhelperclient/client_test.go | 22 +++++---- pkg/roachprod/roachprod.go | 25 ++++------ 5 files changed, 61 insertions(+), 36 deletions(-) diff --git a/pkg/roachprod/BUILD.bazel b/pkg/roachprod/BUILD.bazel index b91703411bda..1839c8cc4b66 100644 --- a/pkg/roachprod/BUILD.bazel +++ b/pkg/roachprod/BUILD.bazel @@ -28,7 +28,6 @@ go_library( "//pkg/roachprod/vm/local", "//pkg/server/debug/replay", "//pkg/util/ctxgroup", - "//pkg/util/envutil", "//pkg/util/httputil", "//pkg/util/retry", "//pkg/util/syncutil", diff --git a/pkg/roachprod/promhelperclient/BUILD.bazel b/pkg/roachprod/promhelperclient/BUILD.bazel index 21cbaf2cd4fc..45e2e8ef1788 100644 --- a/pkg/roachprod/promhelperclient/BUILD.bazel +++ b/pkg/roachprod/promhelperclient/BUILD.bazel @@ -11,6 +11,7 @@ go_library( deps = [ "//pkg/roachprod/install", "//pkg/roachprod/logger", + "//pkg/util/envutil", "//pkg/util/httputil", "@com_github_cockroachdb_errors//:errors", "@com_google_cloud_go_storage//:storage", diff --git a/pkg/roachprod/promhelperclient/client.go b/pkg/roachprod/promhelperclient/client.go index 3f0193f82961..ae94f8018315 100644 --- a/pkg/roachprod/promhelperclient/client.go +++ b/pkg/roachprod/promhelperclient/client.go @@ -26,6 +26,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/roachprod/install" "github.com/cockroachdb/cockroach/pkg/roachprod/logger" + "github.com/cockroachdb/cockroach/pkg/util/envutil" "github.com/cockroachdb/cockroach/pkg/util/httputil" "github.com/cockroachdb/errors" "golang.org/x/oauth2" @@ -41,9 +42,18 @@ const ( ServiceAccountAudience = "PROM_HELPER_SERVICE_ACCOUNT_AUDIENCE" ) +// The URL for the Prometheus registration service. An empty string means that the +// Prometheus integration is disabled. Should be accessed through +// getPrometheusRegistrationUrl(). +var promRegistrationUrl = envutil.EnvOrDefaultString("COCKROACH_PROM_HOST_URL", + "https://grafana.testeng.crdb.io/promhelpers") + // PromClient is used to communicate with the prometheus helper service // keeping the functions as a variable enables us to override the value for unit testing type PromClient struct { + promUrl string + disabled bool + // httpPut is used for http PUT operation. httpPut func( ctx context.Context, url string, h *http.Header, body io.Reader, @@ -56,15 +66,26 @@ type PromClient struct { oauth2.TokenSource, error) } +// DefaultPromClient is the default instance of PromClient. This instance should +// be used unless custom configuration is needed. +var DefaultPromClient = NewPromClient() + // NewPromClient returns a new instance of PromClient func NewPromClient() *PromClient { return &PromClient{ + promUrl: promRegistrationUrl, + disabled: promRegistrationUrl == "", httpPut: httputil.Put, httpDelete: httputil.Delete, newTokenSource: idtoken.NewTokenSource, } } +func (c *PromClient) setUrl(url string) { + c.promUrl = url + c.disabled = false +} + // instanceConfigRequest is the HTTP request received for generating instance config type instanceConfigRequest struct { //Config is the content of the yaml file @@ -75,21 +96,25 @@ type instanceConfigRequest struct { // UpdatePrometheusTargets updates the cluster config in the promUrl func (c *PromClient) UpdatePrometheusTargets( ctx context.Context, - promUrl, clusterName string, + clusterName string, forceFetchCreds bool, nodes map[int]*NodeInfo, insecure bool, l *logger.Logger, ) error { + if c.disabled { + l.Printf("Prometheus registration is disabled") + return nil + } req, err := buildCreateRequest(nodes, insecure) if err != nil { return err } - token, err := c.getToken(ctx, promUrl, forceFetchCreds, l) + token, err := c.getToken(ctx, forceFetchCreds, l) if err != nil { return err } - url := getUrl(promUrl, clusterName) + url := getUrl(c.promUrl, clusterName) l.Printf("invoking PUT for URL: %s", url) h := &http.Header{} h.Set("ContentType", "application/json") @@ -104,7 +129,7 @@ func (c *PromClient) UpdatePrometheusTargets( defer func() { _ = response.Body.Close() }() if response.StatusCode == http.StatusUnauthorized && !forceFetchCreds { l.Printf("request failed - this may be due to a stale token. retrying with forceFetchCreds true ...") - return c.UpdatePrometheusTargets(ctx, promUrl, clusterName, true, nodes, insecure, l) + return c.UpdatePrometheusTargets(ctx, clusterName, true, nodes, insecure, l) } body, err := io.ReadAll(response.Body) if err != nil { @@ -118,13 +143,16 @@ func (c *PromClient) UpdatePrometheusTargets( // DeleteClusterConfig deletes the cluster config in the promUrl func (c *PromClient) DeleteClusterConfig( - ctx context.Context, promUrl, clusterName string, forceFetchCreds bool, l *logger.Logger, + ctx context.Context, clusterName string, forceFetchCreds bool, l *logger.Logger, ) error { - token, err := c.getToken(ctx, promUrl, forceFetchCreds, l) + if c.disabled { + return nil + } + token, err := c.getToken(ctx, forceFetchCreds, l) if err != nil { return err } - url := getUrl(promUrl, clusterName) + url := getUrl(c.promUrl, clusterName) l.Printf("invoking DELETE for URL: %s", url) h := &http.Header{} h.Set("Authorization", token) @@ -135,7 +163,7 @@ func (c *PromClient) DeleteClusterConfig( if response.StatusCode != http.StatusNoContent { defer func() { _ = response.Body.Close() }() if response.StatusCode == http.StatusUnauthorized && !forceFetchCreds { - return c.DeleteClusterConfig(ctx, promUrl, clusterName, true, l) + return c.DeleteClusterConfig(ctx, clusterName, true, l) } body, err := io.ReadAll(response.Body) if err != nil { @@ -195,9 +223,9 @@ func buildCreateRequest(nodes map[int]*NodeInfo, insecure bool) (io.Reader, erro // getToken gets the Authorization token for grafana func (c *PromClient) getToken( - ctx context.Context, promUrl string, forceFetchCreds bool, l *logger.Logger, + ctx context.Context, forceFetchCreds bool, l *logger.Logger, ) (string, error) { - if strings.HasPrefix(promUrl, "http:/") { + if strings.HasPrefix(c.promUrl, "http:/") { // no token needed for insecure URL return "", nil } diff --git a/pkg/roachprod/promhelperclient/client_test.go b/pkg/roachprod/promhelperclient/client_test.go index 748988c3bf8b..f09f3ef3d394 100644 --- a/pkg/roachprod/promhelperclient/client_test.go +++ b/pkg/roachprod/promhelperclient/client_test.go @@ -40,6 +40,7 @@ func TestUpdatePrometheusTargets(t *testing.T) { ctx := context.Background() promUrl := "http://prom_url.com" c := NewPromClient() + c.setUrl(promUrl) t.Run("UpdatePrometheusTargets fails with 400", func(t *testing.T) { c.httpPut = func(ctx context.Context, reqUrl string, h *http.Header, body io.Reader) ( resp *http.Response, err error) { @@ -49,7 +50,7 @@ func TestUpdatePrometheusTargets(t *testing.T) { Body: io.NopCloser(strings.NewReader("failed")), }, nil } - err := c.UpdatePrometheusTargets(ctx, promUrl, "c1", false, + err := c.UpdatePrometheusTargets(ctx, "c1", false, map[int]*NodeInfo{1: {Target: "n1"}}, true, l) require.NotNil(t, err) require.Equal(t, "request failed with status 400 and error failed", err.Error()) @@ -82,7 +83,7 @@ func TestUpdatePrometheusTargets(t *testing.T) { StatusCode: 200, }, nil } - err := c.UpdatePrometheusTargets(ctx, promUrl, "c1", false, nodeInfos, true, l) + err := c.UpdatePrometheusTargets(ctx, "c1", false, nodeInfos, true, l) require.Nil(t, err) }) } @@ -98,6 +99,7 @@ func TestDeleteClusterConfig(t *testing.T) { ctx := context.Background() promUrl := "http://prom_url.com" c := NewPromClient() + c.setUrl(promUrl) t.Run("DeleteClusterConfig fails with 400", func(t *testing.T) { c.httpDelete = func(ctx context.Context, url string, h *http.Header) ( resp *http.Response, err error) { @@ -107,7 +109,7 @@ func TestDeleteClusterConfig(t *testing.T) { Body: io.NopCloser(strings.NewReader("failed")), }, nil } - err := c.DeleteClusterConfig(ctx, promUrl, "c1", false, l) + err := c.DeleteClusterConfig(ctx, "c1", false, l) require.NotNil(t, err) require.Equal(t, "request failed with status 400 and error failed", err.Error()) }) @@ -119,7 +121,7 @@ func TestDeleteClusterConfig(t *testing.T) { StatusCode: 204, }, nil } - err := c.DeleteClusterConfig(ctx, promUrl, "c1", false, l) + err := c.DeleteClusterConfig(ctx, "c1", false, l) require.Nil(t, err) }) } @@ -144,7 +146,8 @@ func Test_getToken(t *testing.T) { }() c := NewPromClient() t.Run("insecure url", func(t *testing.T) { - token, err := c.getToken(ctx, "http://test.com", false, l) + c.setUrl("http://test.com") + token, err := c.getToken(ctx, false, l) require.Nil(t, err) require.Empty(t, token) }) @@ -156,7 +159,8 @@ func Test_getToken(t *testing.T) { c.newTokenSource = func(ctx context.Context, audience string, opts ...idtoken.ClientOption) (oauth2.TokenSource, error) { return nil, fmt.Errorf("invalid") } - token, err := c.getToken(ctx, "https://test.com", false, l) + c.setUrl("https://test.com") + token, err := c.getToken(ctx, false, l) require.NotNil(t, err) require.Empty(t, token) require.Equal(t, "error creating GCS oauth token source from specified credential: invalid", err.Error()) @@ -169,7 +173,8 @@ func Test_getToken(t *testing.T) { c.newTokenSource = func(ctx context.Context, audience string, opts ...idtoken.ClientOption) (oauth2.TokenSource, error) { return &mockToken{token: "", err: fmt.Errorf("failed")}, nil } - token, err := c.getToken(ctx, "https://test.com", false, l) + c.setUrl("https://test.com") + token, err := c.getToken(ctx, false, l) require.NotNil(t, err) require.Empty(t, token) require.Equal(t, "error getting identity token: failed", err.Error()) @@ -182,7 +187,8 @@ func Test_getToken(t *testing.T) { c.newTokenSource = func(ctx context.Context, audience string, opts ...idtoken.ClientOption) (oauth2.TokenSource, error) { return &mockToken{token: "token"}, nil } - token, err := c.getToken(ctx, "https://test.com", false, l) + c.setUrl("https://test.com") + token, err := c.getToken(ctx, false, l) require.Nil(t, err) require.Equal(t, "Bearer token", token) }) diff --git a/pkg/roachprod/roachprod.go b/pkg/roachprod/roachprod.go index 7bae10513d11..21866e4bfdb3 100644 --- a/pkg/roachprod/roachprod.go +++ b/pkg/roachprod/roachprod.go @@ -50,7 +50,6 @@ import ( "github.com/cockroachdb/cockroach/pkg/roachprod/vm/local" "github.com/cockroachdb/cockroach/pkg/server/debug/replay" "github.com/cockroachdb/cockroach/pkg/util/ctxgroup" - "github.com/cockroachdb/cockroach/pkg/util/envutil" "github.com/cockroachdb/cockroach/pkg/util/httputil" "github.com/cockroachdb/cockroach/pkg/util/retry" "github.com/cockroachdb/cockroach/pkg/util/syncutil" @@ -60,13 +59,6 @@ import ( "golang.org/x/sys/unix" ) -const ( - // defaultPrometheusHostUrl for prometheus cluster config - defaultPrometheusHostUrl = "https://grafana.testeng.crdb.io/promhelpers" - // prometheusHostUrlEnv is the environment variable to override defaultPrometheusHostUrl - prometheusHostUrlEnv = "COCKROACH_PROM_HOST_URL" -) - // MalformedClusterNameError is returned when the cluster name passed to Create is invalid. type MalformedClusterNameError struct { name string @@ -828,8 +820,7 @@ func updatePrometheusTargets(ctx context.Context, l *logger.Logger, c *install.S } wg.Wait() if len(nodeIPPorts) > 0 { - if err := promhelperclient.NewPromClient().UpdatePrometheusTargets(ctx, - envutil.EnvOrDefaultString(prometheusHostUrlEnv, defaultPrometheusHostUrl), + if err := promhelperclient.DefaultPromClient.UpdatePrometheusTargets(ctx, c.Name, false, nodeIPPorts, !c.Secure, l); err != nil { l.Errorf("creating cluster config failed for the ip:ports %v: %v", nodeIPPorts, err) } @@ -1459,7 +1450,7 @@ func Destroy( // and let the caller retry. cld, _ = cloud.ListCloud(l, vm.ListOptions{IncludeEmptyClusters: true}) } - return destroyCluster(cld, l, name) + return destroyCluster(ctx, cld, l, name) }); err != nil { return err } @@ -1467,7 +1458,9 @@ func Destroy( return nil } -func destroyCluster(cld *cloud.Cloud, l *logger.Logger, clusterName string) error { +func destroyCluster( + ctx context.Context, cld *cloud.Cloud, l *logger.Logger, clusterName string, +) error { c, ok := cld.Clusters[clusterName] if !ok { return fmt.Errorf("cluster %s does not exist", clusterName) @@ -1478,7 +1471,7 @@ func destroyCluster(cld *cloud.Cloud, l *logger.Logger, clusterName string) erro l.Printf("Destroying cluster %s with %d nodes", clusterName, len(c.VMs)) } - if err := deleteClusterConfig(clusterName, l); err != nil { + if err := deleteClusterConfig(ctx, clusterName, l); err != nil { l.Printf("Failed to delete cluster config: %v", err) } @@ -1486,10 +1479,8 @@ func destroyCluster(cld *cloud.Cloud, l *logger.Logger, clusterName string) erro } // deleteClusterConfig deletes the prometheus instance cluster config. Any error is logged and ignored. -func deleteClusterConfig(clusterName string, l *logger.Logger) error { - return promhelperclient.NewPromClient().DeleteClusterConfig(context.Background(), - envutil.EnvOrDefaultString(prometheusHostUrlEnv, defaultPrometheusHostUrl), - clusterName, false, l) +func deleteClusterConfig(ctx context.Context, clusterName string, l *logger.Logger) error { + return promhelperclient.DefaultPromClient.DeleteClusterConfig(ctx, clusterName, false, l) } func destroyLocalCluster(ctx context.Context, l *logger.Logger, clusterName string) error {