Skip to content

Commit

Permalink
roachprod: delete cluster config on GC
Browse files Browse the repository at this point in the history
Currently, Prometheus cluster configs are not deleted on
GC, which leaves stale entries behind. This PR deletes the
cluster configs on GC.

Fixes: #124599
Epic: none
  • Loading branch information
nameisbhaskar committed May 28, 2024
1 parent 46b6fab commit df671c9
Show file tree
Hide file tree
Showing 8 changed files with 30 additions and 36 deletions.
1 change: 0 additions & 1 deletion pkg/roachprod/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ go_library(
"//pkg/roachprod/vm/local",
"//pkg/server/debug/replay",
"//pkg/util/ctxgroup",
"//pkg/util/envutil",
"//pkg/util/httputil",
"//pkg/util/retry",
"//pkg/util/syncutil",
Expand Down
1 change: 1 addition & 0 deletions pkg/roachprod/cloud/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ go_library(
deps = [
"//pkg/roachprod/config",
"//pkg/roachprod/logger",
"//pkg/roachprod/promhelperclient",
"//pkg/roachprod/vm",
"//pkg/roachprod/vm/gce",
"//pkg/util/timeutil",
Expand Down
3 changes: 3 additions & 0 deletions pkg/roachprod/cloud/gc.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (

"github.com/cockroachdb/cockroach/pkg/roachprod/config"
"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
"github.com/cockroachdb/cockroach/pkg/roachprod/promhelperclient"
"github.com/cockroachdb/cockroach/pkg/roachprod/vm"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/cockroachdb/errors"
Expand Down Expand Up @@ -442,6 +443,8 @@ func GCClusters(l *logger.Logger, cloud *Cloud, dryrun bool) error {

var destroyedClusters []resourceDescription
for _, c := range s.destroy {
_ = promhelperclient.NewPromClient().DeleteClusterConfig(context.Background(),
c.Name, false, l)
if err := destroyResource(dryrun, func() error {
return DestroyCluster(l, c)
}); err == nil {
Expand Down
2 changes: 1 addition & 1 deletion pkg/roachprod/promhelperclient/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ go_library(
importpath = "github.com/cockroachdb/cockroach/pkg/roachprod/promhelperclient",
visibility = ["//visibility:public"],
deps = [
"//pkg/roachprod/install",
"//pkg/roachprod/logger",
"//pkg/util/envutil",
"//pkg/util/httputil",
"@com_github_cockroachdb_errors//:errors",
"@com_google_cloud_go_storage//:storage",
Expand Down
19 changes: 8 additions & 11 deletions pkg/roachprod/promhelperclient/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@ import (
"strconv"
"strings"

"github.com/cockroachdb/cockroach/pkg/roachprod/install"
"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
"github.com/cockroachdb/cockroach/pkg/util/envutil"
"github.com/cockroachdb/cockroach/pkg/util/httputil"
"github.com/cockroachdb/errors"
"golang.org/x/oauth2"
Expand Down Expand Up @@ -75,12 +75,13 @@ type instanceConfigRequest struct {
// UpdatePrometheusTargets updates the cluster config on the Prometheus host whose URL
// is read from PrometheusHostUrlEnv, falling back to DefaultPrometheusHostUrl.
func (c *PromClient) UpdatePrometheusTargets(
ctx context.Context,
promUrl, clusterName string,
clusterName string,
forceFetchCreds bool,
nodes map[int]*NodeInfo,
insecure bool,
l *logger.Logger,
) error {
promUrl := envutil.EnvOrDefaultString(PrometheusHostUrlEnv, DefaultPrometheusHostUrl)
req, err := buildCreateRequest(nodes, insecure)
if err != nil {
return err
Expand All @@ -104,7 +105,7 @@ func (c *PromClient) UpdatePrometheusTargets(
defer func() { _ = response.Body.Close() }()
if response.StatusCode == http.StatusUnauthorized && !forceFetchCreds {
l.Printf("request failed - this may be due to a stale token. retrying with forceFetchCreds true ...")
return c.UpdatePrometheusTargets(ctx, promUrl, clusterName, true, nodes, insecure, l)
return c.UpdatePrometheusTargets(ctx, clusterName, true, nodes, insecure, l)
}
body, err := io.ReadAll(response.Body)
if err != nil {
Expand All @@ -118,8 +119,9 @@ func (c *PromClient) UpdatePrometheusTargets(

// DeleteClusterConfig deletes the cluster config on the Prometheus host whose URL
// is read from PrometheusHostUrlEnv, falling back to DefaultPrometheusHostUrl.
func (c *PromClient) DeleteClusterConfig(
ctx context.Context, promUrl, clusterName string, forceFetchCreds bool, l *logger.Logger,
ctx context.Context, clusterName string, forceFetchCreds bool, l *logger.Logger,
) error {
promUrl := envutil.EnvOrDefaultString(PrometheusHostUrlEnv, DefaultPrometheusHostUrl)
token, err := c.getToken(ctx, promUrl, forceFetchCreds, l)
if err != nil {
return err
Expand All @@ -135,7 +137,7 @@ func (c *PromClient) DeleteClusterConfig(
if response.StatusCode != http.StatusNoContent {
defer func() { _ = response.Body.Close() }()
if response.StatusCode == http.StatusUnauthorized && !forceFetchCreds {
return c.DeleteClusterConfig(ctx, promUrl, clusterName, true, l)
return c.DeleteClusterConfig(ctx, clusterName, true, l)
}
body, err := io.ReadAll(response.Body)
if err != nil {
Expand Down Expand Up @@ -169,12 +171,7 @@ func buildCreateRequest(nodes map[int]*NodeInfo, insecure bool) (io.Reader, erro
for i, n := range nodes {
params := &CCParams{
Targets: []string{n.Target},
Labels: map[string]string{
// default labels
"node": strconv.Itoa(i),
"tenant": install.SystemInterfaceName,
"job": "cockroachdb",
},
Labels: map[string]string{"node": strconv.Itoa(i)},
}
// custom labels - this can override the default labels if needed
for n, v := range n.CustomLabels {
Expand Down
12 changes: 6 additions & 6 deletions pkg/roachprod/promhelperclient/client_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ func TestUpdatePrometheusTargets(t *testing.T) {
}()
ctx := context.Background()
promUrl := "http://prom_url.com"
_ = os.Setenv(PrometheusHostUrlEnv, promUrl)
c := NewPromClient()
t.Run("UpdatePrometheusTargets fails with 400", func(t *testing.T) {
c.httpPut = func(ctx context.Context, reqUrl string, h *http.Header, body io.Reader) (
Expand All @@ -49,7 +50,7 @@ func TestUpdatePrometheusTargets(t *testing.T) {
Body: io.NopCloser(strings.NewReader("failed")),
}, nil
}
err := c.UpdatePrometheusTargets(ctx, promUrl, "c1", false,
err := c.UpdatePrometheusTargets(ctx, "c1", false,
map[int]*NodeInfo{1: {Target: "n1"}}, true, l)
require.NotNil(t, err)
require.Equal(t, "request failed with status 400 and error failed", err.Error())
Expand All @@ -72,8 +73,6 @@ func TestUpdatePrometheusTargets(t *testing.T) {
nodeID, err := strconv.Atoi(c.Labels["node"])
require.NoError(t, err)
require.Equal(t, nodeInfos[nodeID].Target, c.Targets[0])
require.Equal(t, "system", c.Labels["tenant"])
require.Equal(t, "cockroachdb", c.Labels["job"])
for k, v := range nodeInfos[nodeID].CustomLabels {
require.Equal(t, v, c.Labels[k])
}
Expand All @@ -82,7 +81,7 @@ func TestUpdatePrometheusTargets(t *testing.T) {
StatusCode: 200,
}, nil
}
err := c.UpdatePrometheusTargets(ctx, promUrl, "c1", false, nodeInfos, true, l)
err := c.UpdatePrometheusTargets(ctx, "c1", false, nodeInfos, true, l)
require.Nil(t, err)
})
}
Expand All @@ -97,6 +96,7 @@ func TestDeleteClusterConfig(t *testing.T) {
}()
ctx := context.Background()
promUrl := "http://prom_url.com"
_ = os.Setenv(PrometheusHostUrlEnv, promUrl)
c := NewPromClient()
t.Run("DeleteClusterConfig fails with 400", func(t *testing.T) {
c.httpDelete = func(ctx context.Context, url string, h *http.Header) (
Expand All @@ -107,7 +107,7 @@ func TestDeleteClusterConfig(t *testing.T) {
Body: io.NopCloser(strings.NewReader("failed")),
}, nil
}
err := c.DeleteClusterConfig(ctx, promUrl, "c1", false, l)
err := c.DeleteClusterConfig(ctx, "c1", false, l)
require.NotNil(t, err)
require.Equal(t, "request failed with status 400 and error failed", err.Error())
})
Expand All @@ -119,7 +119,7 @@ func TestDeleteClusterConfig(t *testing.T) {
StatusCode: 204,
}, nil
}
err := c.DeleteClusterConfig(ctx, promUrl, "c1", false, l)
err := c.DeleteClusterConfig(ctx, "c1", false, l)
require.Nil(t, err)
})
}
Expand Down
7 changes: 7 additions & 0 deletions pkg/roachprod/promhelperclient/promhelper_utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,13 @@ import (
"google.golang.org/api/option"
)

const (
// DefaultPrometheusHostUrl is the host URL used for prometheus cluster config
// requests when PrometheusHostUrlEnv is not set.
DefaultPrometheusHostUrl = "https://grafana.testeng.crdb.io/promhelpers"
// PrometheusHostUrlEnv is the name of the environment variable that overrides
// DefaultPrometheusHostUrl.
PrometheusHostUrlEnv = "COCKROACH_PROM_HOST_URL"
)

var (
userHome, _ = os.UserHomeDir()
// promCredFile is where the prom helper credentials are stored
Expand Down
21 changes: 4 additions & 17 deletions pkg/roachprod/roachprod.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,6 @@ import (
"github.com/cockroachdb/cockroach/pkg/roachprod/vm/local"
"github.com/cockroachdb/cockroach/pkg/server/debug/replay"
"github.com/cockroachdb/cockroach/pkg/util/ctxgroup"
"github.com/cockroachdb/cockroach/pkg/util/envutil"
"github.com/cockroachdb/cockroach/pkg/util/httputil"
"github.com/cockroachdb/cockroach/pkg/util/retry"
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
Expand All @@ -60,13 +59,6 @@ import (
"golang.org/x/sys/unix"
)

const (
// defaultPrometheusHostUrl for prometheus cluster config
defaultPrometheusHostUrl = "https://grafana.testeng.crdb.io/promhelpers"
// prometheusHostUrlEnv is the environment variable to override defaultPrometheusHostUrl
prometheusHostUrlEnv = "COCKROACH_PROM_HOST_URL"
)

// supportedPromProjects are the projects supported for prometheus target
var supportedPromProjects = map[string]struct{}{gce.DefaultProject(): {}}

Expand Down Expand Up @@ -822,7 +814,6 @@ func updatePrometheusTargets(ctx context.Context, l *logger.Logger, c *install.S
wg.Wait()
if len(nodeIPPorts) > 0 {
if err := promhelperclient.NewPromClient().UpdatePrometheusTargets(ctx,
envutil.EnvOrDefaultString(prometheusHostUrlEnv, defaultPrometheusHostUrl),
c.Name, false, nodeIPPorts, !c.Secure, l); err != nil {
l.Errorf("creating cluster config failed for the ip:ports %v: %v", nodeIPPorts, err)
}
Expand All @@ -840,6 +831,8 @@ func getLabels(v vm.VM) map[string]string {
"host_ip": v.PrivateIP,
"project": v.Project,
"zone": v.Zone,
"tenant": install.SystemInterfaceName,
"job": "cockroachdb",
}
match := regionRegEx.FindStringSubmatch(v.Zone)
if len(match) > 1 {
Expand Down Expand Up @@ -1471,20 +1464,14 @@ func destroyCluster(cld *cloud.Cloud, l *logger.Logger, clusterName string) erro
l.Printf("Destroying cluster %s with %d nodes", clusterName, len(c.VMs))
}

if err := deleteClusterConfig(clusterName, l); err != nil {
if err := promhelperclient.NewPromClient().DeleteClusterConfig(context.Background(),
clusterName, false, l); err != nil {
l.Printf("Failed to delete cluster config: %v", err)
}

return cloud.DestroyCluster(l, c)
}

// deleteClusterConfig deletes the prometheus instance cluster config. Any error is logged and ignored.
func deleteClusterConfig(clusterName string, l *logger.Logger) error {
return promhelperclient.NewPromClient().DeleteClusterConfig(context.Background(),
envutil.EnvOrDefaultString(prometheusHostUrlEnv, defaultPrometheusHostUrl),
clusterName, false, l)
}

func destroyLocalCluster(ctx context.Context, l *logger.Logger, clusterName string) error {
if _, ok := readSyncedClusters(clusterName); !ok {
return fmt.Errorf("cluster %s does not exist", clusterName)
Expand Down

0 comments on commit df671c9

Please sign in to comment.