From df671c9014255787c430fa46a702f88f3f11d876 Mon Sep 17 00:00:00 2001 From: Bhaskarjyoti Bora Date: Thu, 23 May 2024 13:58:51 +0530 Subject: [PATCH] roachprod: delete cluster config on GC currently, prometheus cluster configs are not deleted on gc. This makes stale entries to remain. This PR deletes the cluster configs on GC. Fixes: #124599 Epic: none --- pkg/roachprod/BUILD.bazel | 1 - pkg/roachprod/cloud/BUILD.bazel | 1 + pkg/roachprod/cloud/gc.go | 3 +++ pkg/roachprod/promhelperclient/BUILD.bazel | 2 +- pkg/roachprod/promhelperclient/client.go | 19 +++++++---------- pkg/roachprod/promhelperclient/client_test.go | 12 +++++------ .../promhelperclient/promhelper_utils.go | 7 +++++++ pkg/roachprod/roachprod.go | 21 ++++--------------- 8 files changed, 30 insertions(+), 36 deletions(-) diff --git a/pkg/roachprod/BUILD.bazel b/pkg/roachprod/BUILD.bazel index b91703411bda..1839c8cc4b66 100644 --- a/pkg/roachprod/BUILD.bazel +++ b/pkg/roachprod/BUILD.bazel @@ -28,7 +28,6 @@ go_library( "//pkg/roachprod/vm/local", "//pkg/server/debug/replay", "//pkg/util/ctxgroup", - "//pkg/util/envutil", "//pkg/util/httputil", "//pkg/util/retry", "//pkg/util/syncutil", diff --git a/pkg/roachprod/cloud/BUILD.bazel b/pkg/roachprod/cloud/BUILD.bazel index 31d519d34853..4890ef5d8692 100644 --- a/pkg/roachprod/cloud/BUILD.bazel +++ b/pkg/roachprod/cloud/BUILD.bazel @@ -12,6 +12,7 @@ go_library( deps = [ "//pkg/roachprod/config", "//pkg/roachprod/logger", + "//pkg/roachprod/promhelperclient", "//pkg/roachprod/vm", "//pkg/roachprod/vm/gce", "//pkg/util/timeutil", diff --git a/pkg/roachprod/cloud/gc.go b/pkg/roachprod/cloud/gc.go index ae73bcf9dd37..ae594fe95040 100644 --- a/pkg/roachprod/cloud/gc.go +++ b/pkg/roachprod/cloud/gc.go @@ -24,6 +24,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/roachprod/config" "github.com/cockroachdb/cockroach/pkg/roachprod/logger" + "github.com/cockroachdb/cockroach/pkg/roachprod/promhelperclient" "github.com/cockroachdb/cockroach/pkg/roachprod/vm" "github.com/cockroachdb/cockroach/pkg/util/timeutil" "github.com/cockroachdb/errors" @@ -442,6 +443,8 @@ func GCClusters(l *logger.Logger, cloud *Cloud, dryrun bool) error { var destroyedClusters []resourceDescription for _, c := range s.destroy { + _ = promhelperclient.NewPromClient().DeleteClusterConfig(context.Background(), + c.Name, false, l) if err := destroyResource(dryrun, func() error { return DestroyCluster(l, c) }); err == nil { diff --git a/pkg/roachprod/promhelperclient/BUILD.bazel b/pkg/roachprod/promhelperclient/BUILD.bazel index 21cbaf2cd4fc..d1d0f43968d4 100644 --- a/pkg/roachprod/promhelperclient/BUILD.bazel +++ b/pkg/roachprod/promhelperclient/BUILD.bazel @@ -9,8 +9,8 @@ go_library( importpath = "github.com/cockroachdb/cockroach/pkg/roachprod/promhelperclient", visibility = ["//visibility:public"], deps = [ - "//pkg/roachprod/install", "//pkg/roachprod/logger", + "//pkg/util/envutil", "//pkg/util/httputil", "@com_github_cockroachdb_errors//:errors", "@com_google_cloud_go_storage//:storage", diff --git a/pkg/roachprod/promhelperclient/client.go b/pkg/roachprod/promhelperclient/client.go index 3f0193f82961..12e2b12d1ba7 100644 --- a/pkg/roachprod/promhelperclient/client.go +++ b/pkg/roachprod/promhelperclient/client.go @@ -24,8 +24,8 @@ import ( "strconv" "strings" - "github.com/cockroachdb/cockroach/pkg/roachprod/install" "github.com/cockroachdb/cockroach/pkg/roachprod/logger" + "github.com/cockroachdb/cockroach/pkg/util/envutil" "github.com/cockroachdb/cockroach/pkg/util/httputil" "github.com/cockroachdb/errors" "golang.org/x/oauth2" @@ -75,12 +75,13 @@ type instanceConfigRequest struct { // UpdatePrometheusTargets updates the cluster config in the promUrl func (c *PromClient) UpdatePrometheusTargets( ctx context.Context, - promUrl, clusterName string, + clusterName string, forceFetchCreds bool, nodes map[int]*NodeInfo, insecure bool, l *logger.Logger, ) error { + promUrl := envutil.EnvOrDefaultString(PrometheusHostUrlEnv, DefaultPrometheusHostUrl) req, err := buildCreateRequest(nodes, insecure) if err != nil { return err @@ -104,7 +105,7 @@ func (c *PromClient) UpdatePrometheusTargets( defer func() { _ = response.Body.Close() }() if response.StatusCode == http.StatusUnauthorized && !forceFetchCreds { l.Printf("request failed - this may be due to a stale token. retrying with forceFetchCreds true ...") - return c.UpdatePrometheusTargets(ctx, promUrl, clusterName, true, nodes, insecure, l) + return c.UpdatePrometheusTargets(ctx, clusterName, true, nodes, insecure, l) } body, err := io.ReadAll(response.Body) if err != nil { @@ -118,8 +119,9 @@ func (c *PromClient) UpdatePrometheusTargets( // DeleteClusterConfig deletes the cluster config in the promUrl func (c *PromClient) DeleteClusterConfig( - ctx context.Context, promUrl, clusterName string, forceFetchCreds bool, l *logger.Logger, + ctx context.Context, clusterName string, forceFetchCreds bool, l *logger.Logger, ) error { + promUrl := envutil.EnvOrDefaultString(PrometheusHostUrlEnv, DefaultPrometheusHostUrl) token, err := c.getToken(ctx, promUrl, forceFetchCreds, l) if err != nil { return err @@ -135,7 +137,7 @@ func (c *PromClient) DeleteClusterConfig( if response.StatusCode != http.StatusNoContent { defer func() { _ = response.Body.Close() }() if response.StatusCode == http.StatusUnauthorized && !forceFetchCreds { - return c.DeleteClusterConfig(ctx, promUrl, clusterName, true, l) + return c.DeleteClusterConfig(ctx, clusterName, true, l) } body, err := io.ReadAll(response.Body) if err != nil { @@ -169,12 +171,7 @@ func buildCreateRequest(nodes map[int]*NodeInfo, insecure bool) (io.Reader, erro for i, n := range nodes { params := &CCParams{ Targets: []string{n.Target}, - Labels: map[string]string{ - // default labels - "node": strconv.Itoa(i), - "tenant": install.SystemInterfaceName, - "job": "cockroachdb", - }, + Labels: map[string]string{"node": strconv.Itoa(i)}, } // custom labels - this can override the default labels if needed for n, v := range n.CustomLabels { diff --git a/pkg/roachprod/promhelperclient/client_test.go b/pkg/roachprod/promhelperclient/client_test.go index 748988c3bf8b..97f0429b2ef5 100644 --- a/pkg/roachprod/promhelperclient/client_test.go +++ b/pkg/roachprod/promhelperclient/client_test.go @@ -39,6 +39,7 @@ func TestUpdatePrometheusTargets(t *testing.T) { }() ctx := context.Background() promUrl := "http://prom_url.com" + _ = os.Setenv(PrometheusHostUrlEnv, promUrl) c := NewPromClient() t.Run("UpdatePrometheusTargets fails with 400", func(t *testing.T) { c.httpPut = func(ctx context.Context, reqUrl string, h *http.Header, body io.Reader) ( @@ -49,7 +50,7 @@ func TestUpdatePrometheusTargets(t *testing.T) { Body: io.NopCloser(strings.NewReader("failed")), }, nil } - err := c.UpdatePrometheusTargets(ctx, promUrl, "c1", false, + err := c.UpdatePrometheusTargets(ctx, "c1", false, map[int]*NodeInfo{1: {Target: "n1"}}, true, l) require.NotNil(t, err) require.Equal(t, "request failed with status 400 and error failed", err.Error()) @@ -72,8 +73,6 @@ func TestUpdatePrometheusTargets(t *testing.T) { nodeID, err := strconv.Atoi(c.Labels["node"]) require.NoError(t, err) require.Equal(t, nodeInfos[nodeID].Target, c.Targets[0]) - require.Equal(t, "system", c.Labels["tenant"]) - require.Equal(t, "cockroachdb", c.Labels["job"]) for k, v := range nodeInfos[nodeID].CustomLabels { require.Equal(t, v, c.Labels[k]) } @@ -82,7 +81,7 @@ func TestUpdatePrometheusTargets(t *testing.T) { StatusCode: 200, }, nil } - err := c.UpdatePrometheusTargets(ctx, promUrl, "c1", false, nodeInfos, true, l) + err := c.UpdatePrometheusTargets(ctx, "c1", false, nodeInfos, true, l) require.Nil(t, err) }) } @@ -97,6 +96,7 @@ func TestDeleteClusterConfig(t *testing.T) { }() ctx := context.Background() promUrl := "http://prom_url.com" + _ = os.Setenv(PrometheusHostUrlEnv, promUrl) c := NewPromClient() t.Run("DeleteClusterConfig fails with 400", func(t *testing.T) { c.httpDelete = func(ctx context.Context, url string, h *http.Header) ( @@ -107,7 +107,7 @@ func TestDeleteClusterConfig(t *testing.T) { Body: io.NopCloser(strings.NewReader("failed")), }, nil } - err := c.DeleteClusterConfig(ctx, promUrl, "c1", false, l) + err := c.DeleteClusterConfig(ctx, "c1", false, l) require.NotNil(t, err) require.Equal(t, "request failed with status 400 and error failed", err.Error()) }) @@ -119,7 +119,7 @@ func TestDeleteClusterConfig(t *testing.T) { StatusCode: 204, }, nil } - err := c.DeleteClusterConfig(ctx, promUrl, "c1", false, l) + err := c.DeleteClusterConfig(ctx, "c1", false, l) require.Nil(t, err) }) } diff --git a/pkg/roachprod/promhelperclient/promhelper_utils.go b/pkg/roachprod/promhelperclient/promhelper_utils.go index ca3269649f3a..db691c4f6135 100644 --- a/pkg/roachprod/promhelperclient/promhelper_utils.go +++ b/pkg/roachprod/promhelperclient/promhelper_utils.go @@ -23,6 +23,13 @@ import ( "google.golang.org/api/option" ) +const ( + // DefaultPrometheusHostUrl for prometheus cluster config + DefaultPrometheusHostUrl = "https://grafana.testeng.crdb.io/promhelpers" + // PrometheusHostUrlEnv is the environment variable to override defaultPrometheusHostUrl + PrometheusHostUrlEnv = "COCKROACH_PROM_HOST_URL" +) + var ( userHome, _ = os.UserHomeDir() // promCredFile is where the prom helper credentials are stored diff --git a/pkg/roachprod/roachprod.go b/pkg/roachprod/roachprod.go index a7f9103ea381..25aaff83d1ad 100644 --- a/pkg/roachprod/roachprod.go +++ b/pkg/roachprod/roachprod.go @@ -50,7 +50,6 @@ import ( "github.com/cockroachdb/cockroach/pkg/roachprod/vm/local" "github.com/cockroachdb/cockroach/pkg/server/debug/replay" "github.com/cockroachdb/cockroach/pkg/util/ctxgroup" - "github.com/cockroachdb/cockroach/pkg/util/envutil" "github.com/cockroachdb/cockroach/pkg/util/httputil" "github.com/cockroachdb/cockroach/pkg/util/retry" "github.com/cockroachdb/cockroach/pkg/util/syncutil" @@ -60,13 +59,6 @@ import ( "golang.org/x/sys/unix" ) -const ( - // defaultPrometheusHostUrl for prometheus cluster config - defaultPrometheusHostUrl = "https://grafana.testeng.crdb.io/promhelpers" - // prometheusHostUrlEnv is the environment variable to override defaultPrometheusHostUrl - prometheusHostUrlEnv = "COCKROACH_PROM_HOST_URL" -) - // supportedPromProjects are the projects supported for prometheus target var supportedPromProjects = map[string]struct{}{gce.DefaultProject(): {}} @@ -822,7 +814,6 @@ func updatePrometheusTargets(ctx context.Context, l *logger.Logger, c *install.S wg.Wait() if len(nodeIPPorts) > 0 { if err := promhelperclient.NewPromClient().UpdatePrometheusTargets(ctx, - envutil.EnvOrDefaultString(prometheusHostUrlEnv, defaultPrometheusHostUrl), c.Name, false, nodeIPPorts, !c.Secure, l); err != nil { l.Errorf("creating cluster config failed for the ip:ports %v: %v", nodeIPPorts, err) } @@ -840,6 +831,8 @@ func getLabels(v vm.VM) map[string]string { "host_ip": v.PrivateIP, "project": v.Project, "zone": v.Zone, + "tenant": install.SystemInterfaceName, + "job": "cockroachdb", } match := regionRegEx.FindStringSubmatch(v.Zone) if len(match) > 1 { @@ -1471,20 +1464,14 @@ func destroyCluster(cld *cloud.Cloud, l *logger.Logger, clusterName string) erro l.Printf("Destroying cluster %s with %d nodes", clusterName, len(c.VMs)) } - if err := deleteClusterConfig(clusterName, l); err != nil { + if err := promhelperclient.NewPromClient().DeleteClusterConfig(context.Background(), + clusterName, false, l); err != nil { l.Printf("Failed to delete cluster config: %v", err) } return cloud.DestroyCluster(l, c) } -// deleteClusterConfig deletes the prometheus instance cluster config. Any error is logged and ignored. -func deleteClusterConfig(clusterName string, l *logger.Logger) error { - return promhelperclient.NewPromClient().DeleteClusterConfig(context.Background(), - envutil.EnvOrDefaultString(prometheusHostUrlEnv, defaultPrometheusHostUrl), - clusterName, false, l) -} - func destroyLocalCluster(ctx context.Context, l *logger.Logger, clusterName string) error { if _, ok := readSyncedClusters(clusterName); !ok { return fmt.Errorf("cluster %s does not exist", clusterName)