diff --git a/pkg/roachprod/BUILD.bazel b/pkg/roachprod/BUILD.bazel index b91703411bda..1839c8cc4b66 100644 --- a/pkg/roachprod/BUILD.bazel +++ b/pkg/roachprod/BUILD.bazel @@ -28,7 +28,6 @@ go_library( "//pkg/roachprod/vm/local", "//pkg/server/debug/replay", "//pkg/util/ctxgroup", - "//pkg/util/envutil", "//pkg/util/httputil", "//pkg/util/retry", "//pkg/util/syncutil", diff --git a/pkg/roachprod/promhelperclient/BUILD.bazel b/pkg/roachprod/promhelperclient/BUILD.bazel index 21cbaf2cd4fc..45e2e8ef1788 100644 --- a/pkg/roachprod/promhelperclient/BUILD.bazel +++ b/pkg/roachprod/promhelperclient/BUILD.bazel @@ -11,6 +11,7 @@ go_library( deps = [ "//pkg/roachprod/install", "//pkg/roachprod/logger", + "//pkg/util/envutil", "//pkg/util/httputil", "@com_github_cockroachdb_errors//:errors", "@com_google_cloud_go_storage//:storage", diff --git a/pkg/roachprod/promhelperclient/client.go b/pkg/roachprod/promhelperclient/client.go index 3f0193f82961..ae94f8018315 100644 --- a/pkg/roachprod/promhelperclient/client.go +++ b/pkg/roachprod/promhelperclient/client.go @@ -26,6 +26,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/roachprod/install" "github.com/cockroachdb/cockroach/pkg/roachprod/logger" + "github.com/cockroachdb/cockroach/pkg/util/envutil" "github.com/cockroachdb/cockroach/pkg/util/httputil" "github.com/cockroachdb/errors" "golang.org/x/oauth2" @@ -41,9 +42,18 @@ const ( ServiceAccountAudience = "PROM_HELPER_SERVICE_ACCOUNT_AUDIENCE" ) +// The URL for the Prometheus registration service. An empty string means that the +// Prometheus integration is disabled. Should be accessed through +// getPrometheusRegistrationUrl(). +var promRegistrationUrl = envutil.EnvOrDefaultString("COCKROACH_PROM_HOST_URL", + "https://grafana.testeng.crdb.io/promhelpers") + // PromClient is used to communicate with the prometheus helper service // keeping the functions as a variable enables us to override the value for unit testing type PromClient struct { + promUrl string + disabled bool + // httpPut is used for http PUT operation. httpPut func( ctx context.Context, url string, h *http.Header, body io.Reader, @@ -56,15 +66,26 @@ type PromClient struct { oauth2.TokenSource, error) } +// DefaultPromClient is the default instance of PromClient. This instance should +// be used unless custom configuration is needed. +var DefaultPromClient = NewPromClient() + // NewPromClient returns a new instance of PromClient func NewPromClient() *PromClient { return &PromClient{ + promUrl: promRegistrationUrl, + disabled: promRegistrationUrl == "", httpPut: httputil.Put, httpDelete: httputil.Delete, newTokenSource: idtoken.NewTokenSource, } } +func (c *PromClient) setUrl(url string) { + c.promUrl = url + c.disabled = false +} + // instanceConfigRequest is the HTTP request received for generating instance config type instanceConfigRequest struct { //Config is the content of the yaml file @@ -75,21 +96,25 @@ type instanceConfigRequest struct { // UpdatePrometheusTargets updates the cluster config in the promUrl func (c *PromClient) UpdatePrometheusTargets( ctx context.Context, - promUrl, clusterName string, + clusterName string, forceFetchCreds bool, nodes map[int]*NodeInfo, insecure bool, l *logger.Logger, ) error { + if c.disabled { + l.Printf("Prometheus registration is disabled") + return nil + } req, err := buildCreateRequest(nodes, insecure) if err != nil { return err } - token, err := c.getToken(ctx, promUrl, forceFetchCreds, l) + token, err := c.getToken(ctx, forceFetchCreds, l) if err != nil { return err } - url := getUrl(promUrl, clusterName) + url := getUrl(c.promUrl, clusterName) l.Printf("invoking PUT for URL: %s", url) h := &http.Header{} h.Set("ContentType", "application/json") @@ -104,7 +129,7 @@ func (c *PromClient) UpdatePrometheusTargets( defer func() { _ = response.Body.Close() }() if response.StatusCode == http.StatusUnauthorized && !forceFetchCreds { l.Printf("request failed - this may be due to a stale token. retrying with forceFetchCreds true ...") - return c.UpdatePrometheusTargets(ctx, promUrl, clusterName, true, nodes, insecure, l) + return c.UpdatePrometheusTargets(ctx, clusterName, true, nodes, insecure, l) } body, err := io.ReadAll(response.Body) if err != nil { @@ -118,13 +143,16 @@ func (c *PromClient) UpdatePrometheusTargets( // DeleteClusterConfig deletes the cluster config in the promUrl func (c *PromClient) DeleteClusterConfig( - ctx context.Context, promUrl, clusterName string, forceFetchCreds bool, l *logger.Logger, + ctx context.Context, clusterName string, forceFetchCreds bool, l *logger.Logger, ) error { - token, err := c.getToken(ctx, promUrl, forceFetchCreds, l) + if c.disabled { + return nil + } + token, err := c.getToken(ctx, forceFetchCreds, l) if err != nil { return err } - url := getUrl(promUrl, clusterName) + url := getUrl(c.promUrl, clusterName) l.Printf("invoking DELETE for URL: %s", url) h := &http.Header{} h.Set("Authorization", token) @@ -135,7 +163,7 @@ func (c *PromClient) DeleteClusterConfig( if response.StatusCode != http.StatusNoContent { defer func() { _ = response.Body.Close() }() if response.StatusCode == http.StatusUnauthorized && !forceFetchCreds { - return c.DeleteClusterConfig(ctx, promUrl, clusterName, true, l) + return c.DeleteClusterConfig(ctx, clusterName, true, l) } body, err := io.ReadAll(response.Body) if err != nil { @@ -195,9 +223,9 @@ func buildCreateRequest(nodes map[int]*NodeInfo, insecure bool) (io.Reader, erro // getToken gets the Authorization token for grafana func (c *PromClient) getToken( - ctx context.Context, promUrl string, forceFetchCreds bool, l *logger.Logger, + ctx context.Context, forceFetchCreds bool, l *logger.Logger, ) (string, error) { - if strings.HasPrefix(promUrl, "http:/") { + if strings.HasPrefix(c.promUrl, "http:/") { // no token needed for insecure URL return "", nil } diff --git a/pkg/roachprod/promhelperclient/client_test.go b/pkg/roachprod/promhelperclient/client_test.go index 748988c3bf8b..f09f3ef3d394 100644 --- a/pkg/roachprod/promhelperclient/client_test.go +++ b/pkg/roachprod/promhelperclient/client_test.go @@ -40,6 +40,7 @@ func TestUpdatePrometheusTargets(t *testing.T) { ctx := context.Background() promUrl := "http://prom_url.com" c := NewPromClient() + c.setUrl(promUrl) t.Run("UpdatePrometheusTargets fails with 400", func(t *testing.T) { c.httpPut = func(ctx context.Context, reqUrl string, h *http.Header, body io.Reader) ( resp *http.Response, err error) { @@ -49,7 +50,7 @@ func TestUpdatePrometheusTargets(t *testing.T) { Body: io.NopCloser(strings.NewReader("failed")), }, nil } - err := c.UpdatePrometheusTargets(ctx, promUrl, "c1", false, + err := c.UpdatePrometheusTargets(ctx, "c1", false, map[int]*NodeInfo{1: {Target: "n1"}}, true, l) require.NotNil(t, err) require.Equal(t, "request failed with status 400 and error failed", err.Error()) @@ -82,7 +83,7 @@ func TestUpdatePrometheusTargets(t *testing.T) { StatusCode: 200, }, nil } - err := c.UpdatePrometheusTargets(ctx, promUrl, "c1", false, nodeInfos, true, l) + err := c.UpdatePrometheusTargets(ctx, "c1", false, nodeInfos, true, l) require.Nil(t, err) }) } @@ -98,6 +99,7 @@ func TestDeleteClusterConfig(t *testing.T) { ctx := context.Background() promUrl := "http://prom_url.com" c := NewPromClient() + c.setUrl(promUrl) t.Run("DeleteClusterConfig fails with 400", func(t *testing.T) { c.httpDelete = func(ctx context.Context, url string, h *http.Header) ( resp *http.Response, err error) { @@ -107,7 +109,7 @@ func TestDeleteClusterConfig(t *testing.T) { Body: io.NopCloser(strings.NewReader("failed")), }, nil } - err := c.DeleteClusterConfig(ctx, promUrl, "c1", false, l) + err := c.DeleteClusterConfig(ctx, "c1", false, l) require.NotNil(t, err) require.Equal(t, "request failed with status 400 and error failed", err.Error()) }) @@ -119,7 +121,7 @@ func TestDeleteClusterConfig(t *testing.T) { StatusCode: 204, }, nil } - err := c.DeleteClusterConfig(ctx, promUrl, "c1", false, l) + err := c.DeleteClusterConfig(ctx, "c1", false, l) require.Nil(t, err) }) } @@ -144,7 +146,8 @@ func Test_getToken(t *testing.T) { }() c := NewPromClient() t.Run("insecure url", func(t *testing.T) { - token, err := c.getToken(ctx, "http://test.com", false, l) + c.setUrl("http://test.com") + token, err := c.getToken(ctx, false, l) require.Nil(t, err) require.Empty(t, token) }) @@ -156,7 +159,8 @@ func Test_getToken(t *testing.T) { c.newTokenSource = func(ctx context.Context, audience string, opts ...idtoken.ClientOption) (oauth2.TokenSource, error) { return nil, fmt.Errorf("invalid") } - token, err := c.getToken(ctx, "https://test.com", false, l) + c.setUrl("https://test.com") + token, err := c.getToken(ctx, false, l) require.NotNil(t, err) require.Empty(t, token) require.Equal(t, "error creating GCS oauth token source from specified credential: invalid", err.Error()) @@ -169,7 +173,8 @@ func Test_getToken(t *testing.T) { c.newTokenSource = func(ctx context.Context, audience string, opts ...idtoken.ClientOption) (oauth2.TokenSource, error) { return &mockToken{token: "", err: fmt.Errorf("failed")}, nil } - token, err := c.getToken(ctx, "https://test.com", false, l) + c.setUrl("https://test.com") + token, err := c.getToken(ctx, false, l) require.NotNil(t, err) require.Empty(t, token) require.Equal(t, "error getting identity token: failed", err.Error()) @@ -182,7 +187,8 @@ func Test_getToken(t *testing.T) { c.newTokenSource = func(ctx context.Context, audience string, opts ...idtoken.ClientOption) (oauth2.TokenSource, error) { return &mockToken{token: "token"}, nil } - token, err := c.getToken(ctx, "https://test.com", false, l) + c.setUrl("https://test.com") + token, err := c.getToken(ctx, false, l) require.Nil(t, err) require.Equal(t, "Bearer token", token) }) diff --git a/pkg/roachprod/roachprod.go b/pkg/roachprod/roachprod.go index 7bae10513d11..21866e4bfdb3 100644 --- a/pkg/roachprod/roachprod.go +++ b/pkg/roachprod/roachprod.go @@ -50,7 +50,6 @@ import ( "github.com/cockroachdb/cockroach/pkg/roachprod/vm/local" "github.com/cockroachdb/cockroach/pkg/server/debug/replay" "github.com/cockroachdb/cockroach/pkg/util/ctxgroup" - "github.com/cockroachdb/cockroach/pkg/util/envutil" "github.com/cockroachdb/cockroach/pkg/util/httputil" "github.com/cockroachdb/cockroach/pkg/util/retry" "github.com/cockroachdb/cockroach/pkg/util/syncutil" @@ -60,13 +59,6 @@ import ( "golang.org/x/sys/unix" ) -const ( - // defaultPrometheusHostUrl for prometheus cluster config - defaultPrometheusHostUrl = "https://grafana.testeng.crdb.io/promhelpers" - // prometheusHostUrlEnv is the environment variable to override defaultPrometheusHostUrl - prometheusHostUrlEnv = "COCKROACH_PROM_HOST_URL" -) - // MalformedClusterNameError is returned when the cluster name passed to Create is invalid. type MalformedClusterNameError struct { name string @@ -828,8 +820,7 @@ func updatePrometheusTargets(ctx context.Context, l *logger.Logger, c *install.S } wg.Wait() if len(nodeIPPorts) > 0 { - if err := promhelperclient.NewPromClient().UpdatePrometheusTargets(ctx, - envutil.EnvOrDefaultString(prometheusHostUrlEnv, defaultPrometheusHostUrl), + if err := promhelperclient.DefaultPromClient.UpdatePrometheusTargets(ctx, c.Name, false, nodeIPPorts, !c.Secure, l); err != nil { l.Errorf("creating cluster config failed for the ip:ports %v: %v", nodeIPPorts, err) } @@ -1459,7 +1450,7 @@ func Destroy( // and let the caller retry. cld, _ = cloud.ListCloud(l, vm.ListOptions{IncludeEmptyClusters: true}) } - return destroyCluster(cld, l, name) + return destroyCluster(ctx, cld, l, name) }); err != nil { return err } @@ -1467,7 +1458,9 @@ func Destroy( return nil } -func destroyCluster(cld *cloud.Cloud, l *logger.Logger, clusterName string) error { +func destroyCluster( + ctx context.Context, cld *cloud.Cloud, l *logger.Logger, clusterName string, +) error { c, ok := cld.Clusters[clusterName] if !ok { return fmt.Errorf("cluster %s does not exist", clusterName) @@ -1478,7 +1471,7 @@ func destroyCluster(cld *cloud.Cloud, l *logger.Logger, clusterName string) erro l.Printf("Destroying cluster %s with %d nodes", clusterName, len(c.VMs)) } - if err := deleteClusterConfig(clusterName, l); err != nil { + if err := deleteClusterConfig(ctx, clusterName, l); err != nil { l.Printf("Failed to delete cluster config: %v", err) } @@ -1486,10 +1479,8 @@ func destroyCluster(cld *cloud.Cloud, l *logger.Logger, clusterName string) erro } // deleteClusterConfig deletes the prometheus instance cluster config. Any error is logged and ignored. -func deleteClusterConfig(clusterName string, l *logger.Logger) error { - return promhelperclient.NewPromClient().DeleteClusterConfig(context.Background(), - envutil.EnvOrDefaultString(prometheusHostUrlEnv, defaultPrometheusHostUrl), - clusterName, false, l) +func deleteClusterConfig(ctx context.Context, clusterName string, l *logger.Logger) error { + return promhelperclient.DefaultPromClient.DeleteClusterConfig(ctx, clusterName, false, l) } func destroyLocalCluster(ctx context.Context, l *logger.Logger, clusterName string) error {