Skip to content

Commit

Permalink
roachprod: permit disabling of Prometheus registration
Browse files Browse the repository at this point in the history
This patch makes an empty value for the COCKROACH_PROM_HOST_URL env var
disable the roachprod's cluster registration with the Prometheus
registration service. This is useful for people who don't work at
CockroachLabs and don't have access to this service.

Release note: None
Epic: None
  • Loading branch information
andreimatei committed May 23, 2024
1 parent 4de35f5 commit b437271
Show file tree
Hide file tree
Showing 5 changed files with 61 additions and 36 deletions.
1 change: 0 additions & 1 deletion pkg/roachprod/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ go_library(
"//pkg/roachprod/vm/local",
"//pkg/server/debug/replay",
"//pkg/util/ctxgroup",
"//pkg/util/envutil",
"//pkg/util/httputil",
"//pkg/util/retry",
"//pkg/util/syncutil",
Expand Down
1 change: 1 addition & 0 deletions pkg/roachprod/promhelperclient/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ go_library(
deps = [
"//pkg/roachprod/install",
"//pkg/roachprod/logger",
"//pkg/util/envutil",
"//pkg/util/httputil",
"@com_github_cockroachdb_errors//:errors",
"@com_google_cloud_go_storage//:storage",
Expand Down
48 changes: 38 additions & 10 deletions pkg/roachprod/promhelperclient/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (

"github.com/cockroachdb/cockroach/pkg/roachprod/install"
"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
"github.com/cockroachdb/cockroach/pkg/util/envutil"
"github.com/cockroachdb/cockroach/pkg/util/httputil"
"github.com/cockroachdb/errors"
"golang.org/x/oauth2"
Expand All @@ -41,9 +42,18 @@ const (
ServiceAccountAudience = "PROM_HELPER_SERVICE_ACCOUNT_AUDIENCE"
)

// The URL for the Prometheus registration service. An empty string means that the
// Prometheus integration is disabled. Should be accessed through
// getPrometheusRegistrationUrl().
var promRegistrationUrl = envutil.EnvOrDefaultString("COCKROACH_PROM_HOST_URL",
"https://grafana.testeng.crdb.io/promhelpers")

// PromClient is used to communicate with the prometheus helper service
// keeping the functions as a variable enables us to override the value for unit testing
type PromClient struct {
promUrl string
disabled bool

// httpPut is used for http PUT operation.
httpPut func(
ctx context.Context, url string, h *http.Header, body io.Reader,
Expand All @@ -56,15 +66,26 @@ type PromClient struct {
oauth2.TokenSource, error)
}

// DefaultPromClient is the default instance of PromClient. This instance should
// be used unless custom configuration is needed.
var DefaultPromClient = NewPromClient()

// NewPromClient returns a new instance of PromClient
func NewPromClient() *PromClient {
return &PromClient{
promUrl: promRegistrationUrl,
disabled: promRegistrationUrl == "",
httpPut: httputil.Put,
httpDelete: httputil.Delete,
newTokenSource: idtoken.NewTokenSource,
}
}

func (c *PromClient) setUrl(url string) {
c.promUrl = url
c.disabled = false
}

// instanceConfigRequest is the HTTP request received for generating instance config
type instanceConfigRequest struct {
//Config is the content of the yaml file
Expand All @@ -75,21 +96,25 @@ type instanceConfigRequest struct {
// UpdatePrometheusTargets updates the cluster config in the promUrl
func (c *PromClient) UpdatePrometheusTargets(
ctx context.Context,
promUrl, clusterName string,
clusterName string,
forceFetchCreds bool,
nodes map[int]*NodeInfo,
insecure bool,
l *logger.Logger,
) error {
if c.disabled {
l.Printf("Prometheus registration is disabled")
return nil
}
req, err := buildCreateRequest(nodes, insecure)
if err != nil {
return err
}
token, err := c.getToken(ctx, promUrl, forceFetchCreds, l)
token, err := c.getToken(ctx, forceFetchCreds, l)
if err != nil {
return err
}
url := getUrl(promUrl, clusterName)
url := getUrl(c.promUrl, clusterName)
l.Printf("invoking PUT for URL: %s", url)
h := &http.Header{}
h.Set("ContentType", "application/json")
Expand All @@ -104,7 +129,7 @@ func (c *PromClient) UpdatePrometheusTargets(
defer func() { _ = response.Body.Close() }()
if response.StatusCode == http.StatusUnauthorized && !forceFetchCreds {
l.Printf("request failed - this may be due to a stale token. retrying with forceFetchCreds true ...")
return c.UpdatePrometheusTargets(ctx, promUrl, clusterName, true, nodes, insecure, l)
return c.UpdatePrometheusTargets(ctx, clusterName, true, nodes, insecure, l)
}
body, err := io.ReadAll(response.Body)
if err != nil {
Expand All @@ -118,13 +143,16 @@ func (c *PromClient) UpdatePrometheusTargets(

// DeleteClusterConfig deletes the cluster config in the promUrl
func (c *PromClient) DeleteClusterConfig(
ctx context.Context, promUrl, clusterName string, forceFetchCreds bool, l *logger.Logger,
ctx context.Context, clusterName string, forceFetchCreds bool, l *logger.Logger,
) error {
token, err := c.getToken(ctx, promUrl, forceFetchCreds, l)
if c.disabled {
return nil
}
token, err := c.getToken(ctx, forceFetchCreds, l)
if err != nil {
return err
}
url := getUrl(promUrl, clusterName)
url := getUrl(c.promUrl, clusterName)
l.Printf("invoking DELETE for URL: %s", url)
h := &http.Header{}
h.Set("Authorization", token)
Expand All @@ -135,7 +163,7 @@ func (c *PromClient) DeleteClusterConfig(
if response.StatusCode != http.StatusNoContent {
defer func() { _ = response.Body.Close() }()
if response.StatusCode == http.StatusUnauthorized && !forceFetchCreds {
return c.DeleteClusterConfig(ctx, promUrl, clusterName, true, l)
return c.DeleteClusterConfig(ctx, clusterName, true, l)
}
body, err := io.ReadAll(response.Body)
if err != nil {
Expand Down Expand Up @@ -195,9 +223,9 @@ func buildCreateRequest(nodes map[int]*NodeInfo, insecure bool) (io.Reader, erro

// getToken gets the Authorization token for grafana
func (c *PromClient) getToken(
ctx context.Context, promUrl string, forceFetchCreds bool, l *logger.Logger,
ctx context.Context, forceFetchCreds bool, l *logger.Logger,
) (string, error) {
if strings.HasPrefix(promUrl, "http:/") {
if strings.HasPrefix(c.promUrl, "http:/") {
// no token needed for insecure URL
return "", nil
}
Expand Down
22 changes: 14 additions & 8 deletions pkg/roachprod/promhelperclient/client_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ func TestUpdatePrometheusTargets(t *testing.T) {
ctx := context.Background()
promUrl := "http://prom_url.com"
c := NewPromClient()
c.setUrl(promUrl)
t.Run("UpdatePrometheusTargets fails with 400", func(t *testing.T) {
c.httpPut = func(ctx context.Context, reqUrl string, h *http.Header, body io.Reader) (
resp *http.Response, err error) {
Expand All @@ -49,7 +50,7 @@ func TestUpdatePrometheusTargets(t *testing.T) {
Body: io.NopCloser(strings.NewReader("failed")),
}, nil
}
err := c.UpdatePrometheusTargets(ctx, promUrl, "c1", false,
err := c.UpdatePrometheusTargets(ctx, "c1", false,
map[int]*NodeInfo{1: {Target: "n1"}}, true, l)
require.NotNil(t, err)
require.Equal(t, "request failed with status 400 and error failed", err.Error())
Expand Down Expand Up @@ -82,7 +83,7 @@ func TestUpdatePrometheusTargets(t *testing.T) {
StatusCode: 200,
}, nil
}
err := c.UpdatePrometheusTargets(ctx, promUrl, "c1", false, nodeInfos, true, l)
err := c.UpdatePrometheusTargets(ctx, "c1", false, nodeInfos, true, l)
require.Nil(t, err)
})
}
Expand All @@ -98,6 +99,7 @@ func TestDeleteClusterConfig(t *testing.T) {
ctx := context.Background()
promUrl := "http://prom_url.com"
c := NewPromClient()
c.setUrl(promUrl)
t.Run("DeleteClusterConfig fails with 400", func(t *testing.T) {
c.httpDelete = func(ctx context.Context, url string, h *http.Header) (
resp *http.Response, err error) {
Expand All @@ -107,7 +109,7 @@ func TestDeleteClusterConfig(t *testing.T) {
Body: io.NopCloser(strings.NewReader("failed")),
}, nil
}
err := c.DeleteClusterConfig(ctx, promUrl, "c1", false, l)
err := c.DeleteClusterConfig(ctx, "c1", false, l)
require.NotNil(t, err)
require.Equal(t, "request failed with status 400 and error failed", err.Error())
})
Expand All @@ -119,7 +121,7 @@ func TestDeleteClusterConfig(t *testing.T) {
StatusCode: 204,
}, nil
}
err := c.DeleteClusterConfig(ctx, promUrl, "c1", false, l)
err := c.DeleteClusterConfig(ctx, "c1", false, l)
require.Nil(t, err)
})
}
Expand All @@ -144,7 +146,8 @@ func Test_getToken(t *testing.T) {
}()
c := NewPromClient()
t.Run("insecure url", func(t *testing.T) {
token, err := c.getToken(ctx, "http://test.com", false, l)
c.setUrl("http://test.com")
token, err := c.getToken(ctx, false, l)
require.Nil(t, err)
require.Empty(t, token)
})
Expand All @@ -156,7 +159,8 @@ func Test_getToken(t *testing.T) {
c.newTokenSource = func(ctx context.Context, audience string, opts ...idtoken.ClientOption) (oauth2.TokenSource, error) {
return nil, fmt.Errorf("invalid")
}
token, err := c.getToken(ctx, "https://test.com", false, l)
c.setUrl("https://test.com")
token, err := c.getToken(ctx, false, l)
require.NotNil(t, err)
require.Empty(t, token)
require.Equal(t, "error creating GCS oauth token source from specified credential: invalid", err.Error())
Expand All @@ -169,7 +173,8 @@ func Test_getToken(t *testing.T) {
c.newTokenSource = func(ctx context.Context, audience string, opts ...idtoken.ClientOption) (oauth2.TokenSource, error) {
return &mockToken{token: "", err: fmt.Errorf("failed")}, nil
}
token, err := c.getToken(ctx, "https://test.com", false, l)
c.setUrl("https://test.com")
token, err := c.getToken(ctx, false, l)
require.NotNil(t, err)
require.Empty(t, token)
require.Equal(t, "error getting identity token: failed", err.Error())
Expand All @@ -182,7 +187,8 @@ func Test_getToken(t *testing.T) {
c.newTokenSource = func(ctx context.Context, audience string, opts ...idtoken.ClientOption) (oauth2.TokenSource, error) {
return &mockToken{token: "token"}, nil
}
token, err := c.getToken(ctx, "https://test.com", false, l)
c.setUrl("https://test.com")
token, err := c.getToken(ctx, false, l)
require.Nil(t, err)
require.Equal(t, "Bearer token", token)
})
Expand Down
25 changes: 8 additions & 17 deletions pkg/roachprod/roachprod.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,6 @@ import (
"github.com/cockroachdb/cockroach/pkg/roachprod/vm/local"
"github.com/cockroachdb/cockroach/pkg/server/debug/replay"
"github.com/cockroachdb/cockroach/pkg/util/ctxgroup"
"github.com/cockroachdb/cockroach/pkg/util/envutil"
"github.com/cockroachdb/cockroach/pkg/util/httputil"
"github.com/cockroachdb/cockroach/pkg/util/retry"
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
Expand All @@ -60,13 +59,6 @@ import (
"golang.org/x/sys/unix"
)

const (
// defaultPrometheusHostUrl for prometheus cluster config
defaultPrometheusHostUrl = "https://grafana.testeng.crdb.io/promhelpers"
// prometheusHostUrlEnv is the environment variable to override defaultPrometheusHostUrl
prometheusHostUrlEnv = "COCKROACH_PROM_HOST_URL"
)

// MalformedClusterNameError is returned when the cluster name passed to Create is invalid.
type MalformedClusterNameError struct {
name string
Expand Down Expand Up @@ -828,8 +820,7 @@ func updatePrometheusTargets(ctx context.Context, l *logger.Logger, c *install.S
}
wg.Wait()
if len(nodeIPPorts) > 0 {
if err := promhelperclient.NewPromClient().UpdatePrometheusTargets(ctx,
envutil.EnvOrDefaultString(prometheusHostUrlEnv, defaultPrometheusHostUrl),
if err := promhelperclient.DefaultPromClient.UpdatePrometheusTargets(ctx,
c.Name, false, nodeIPPorts, !c.Secure, l); err != nil {
l.Errorf("creating cluster config failed for the ip:ports %v: %v", nodeIPPorts, err)
}
Expand Down Expand Up @@ -1459,15 +1450,17 @@ func Destroy(
// and let the caller retry.
cld, _ = cloud.ListCloud(l, vm.ListOptions{IncludeEmptyClusters: true})
}
return destroyCluster(cld, l, name)
return destroyCluster(ctx, cld, l, name)
}); err != nil {
return err
}
l.Printf("OK")
return nil
}

func destroyCluster(cld *cloud.Cloud, l *logger.Logger, clusterName string) error {
func destroyCluster(
ctx context.Context, cld *cloud.Cloud, l *logger.Logger, clusterName string,
) error {
c, ok := cld.Clusters[clusterName]
if !ok {
return fmt.Errorf("cluster %s does not exist", clusterName)
Expand All @@ -1478,18 +1471,16 @@ func destroyCluster(cld *cloud.Cloud, l *logger.Logger, clusterName string) erro
l.Printf("Destroying cluster %s with %d nodes", clusterName, len(c.VMs))
}

if err := deleteClusterConfig(clusterName, l); err != nil {
if err := deleteClusterConfig(ctx, clusterName, l); err != nil {
l.Printf("Failed to delete cluster config: %v", err)
}

return cloud.DestroyCluster(l, c)
}

// deleteClusterConfig deletes the prometheus instance cluster config. Any error is logged and ignored.
func deleteClusterConfig(clusterName string, l *logger.Logger) error {
return promhelperclient.NewPromClient().DeleteClusterConfig(context.Background(),
envutil.EnvOrDefaultString(prometheusHostUrlEnv, defaultPrometheusHostUrl),
clusterName, false, l)
func deleteClusterConfig(ctx context.Context, clusterName string, l *logger.Logger) error {
return promhelperclient.DefaultPromClient.DeleteClusterConfig(ctx, clusterName, false, l)
}

func destroyLocalCluster(ctx context.Context, l *logger.Logger, clusterName string) error {
Expand Down

0 comments on commit b437271

Please sign in to comment.