Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

roachprod: enable roachprod to be configured for external use #120340

Merged
merged 5 commits into from
May 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion pkg/cmd/roachprod/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,8 @@ func initFlags() {
rootCmd.PersistentFlags().IntVarP(&config.MaxConcurrency, "max-concurrency", "", 32,
"maximum number of operations to execute on nodes concurrently, set to zero for infinite",
)
rootCmd.PersistentFlags().StringVarP(&config.EmailDomain, "email-domain", "",
config.DefaultEmailDomain, "email domain for users")

createCmd.Flags().DurationVarP(&createVMOpts.Lifetime,
"lifetime", "l", 12*time.Hour, "Lifetime of the cluster")
Expand Down Expand Up @@ -162,7 +164,7 @@ func initFlags() {
providerOptsContainer[providerName].ConfigureCreateFlags(createCmd.Flags())

for _, cmd := range []*cobra.Command{
destroyCmd, extendCmd, listCmd, syncCmd, gcCmd,
destroyCmd, extendCmd, listCmd, syncCmd, gcCmd, setupSSHCmd, startCmd, pgurlCmd, adminurlCmd,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Methinks only setupSSHCmd calls Sync, which in turn may invoke SyncDNS. The others don't need parameterized DNS flags.

} {
providerOptsContainer[providerName].ConfigureClusterFlags(cmd.Flags(), vm.AcceptMultipleProjects)
}
Expand Down Expand Up @@ -448,6 +450,13 @@ func initFlags() {
"sql-instance", 0, "specific SQL/HTTP instance to connect to (this is a roachprod abstraction distinct from the internal instance ID)")
}

for _, cmd := range []*cobra.Command{startCmd, listCmd, syncCmd} {
cmd.Flags().StringSliceVar(&config.DNSRequiredProviders,
"dns-required-providers", config.DefaultDNSRequiredProviders,
"the cloud providers that must be active to refresh DNS entries",
)
}

grafanaAnnotationCmd.Flags().StringArrayVar(&grafanaTags,
"tags", []string{}, "grafana annotation tags")
grafanaAnnotationCmd.Flags().StringVar(&grafanaDashboardUID,
Expand Down
39 changes: 36 additions & 3 deletions pkg/roachprod/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (
"path"
"regexp"
"slices"
"strings"

"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
"github.com/cockroachdb/cockroach/pkg/util/envutil"
Expand Down Expand Up @@ -49,8 +50,43 @@ var (
// SSHDirectory is the path to search for SSH keys needed to set up
// set up new roachprod clusters.
SSHDirectory = os.ExpandEnv("${HOME}/.ssh")

// DefaultEmailDomain is used to form the full account name for GCE and Slack.
DefaultEmailDomain = EnvOrDefaultString(
"ROACHPROD_EMAIL_DOMAIN", "@cockroachlabs.com",
)

// EmailDomain used to form fully qualified usernames for gcloud and slack.
EmailDomain string

// DNSRequiredProviders is the list of cloud providers that must be active for
// DNS records to be synced when roachprod syncs its state.
DefaultDNSRequiredProviders = envOrDefaultStrings(
"ROACHPROD_DNS_REQUIRED_PROVIDERS", []string{"gce", "aws"},
)

// DNSRequiredProviders is the list of cloud providers that must be active for
DNSRequiredProviders []string
)

// EnvOrDefaultString returns the value of the environment variable with the
// given key, or the default value if the environment variable is not set.
//
// Unlike envutil.EnvOrDefaultString, it does not assert properties of the key.
func EnvOrDefaultString(key, def string) string {
if v, ok := os.LookupEnv(key); ok {
return v
}
return def
}

func envOrDefaultStrings(key string, def []string) []string {
if v, ok := os.LookupEnv(key); ok {
return strings.Split(v, ",")
}
return def
}

func init() {
var err error
OSUser, err = user.Current()
Expand All @@ -70,9 +106,6 @@ const (
// DefaultDebugDir is used to stash debug information.
DefaultDebugDir = "${HOME}/.roachprod/debug"

// EmailDomain is used to form the full account name for GCE and Slack.
EmailDomain = "@cockroachlabs.com"

// Local is the prefix used to identify local clusters.
// It is also used as the zone for local clusters.
Local = "local"
Expand Down
1 change: 0 additions & 1 deletion pkg/roachprod/install/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,6 @@ go_test(
"//pkg/roachprod/cloud",
"//pkg/roachprod/logger",
"//pkg/roachprod/vm",
"//pkg/roachprod/vm/gce",
"//pkg/roachprod/vm/gce/testutils",
"//pkg/roachprod/vm/local",
"//pkg/testutils/datapathutils",
Expand Down
7 changes: 3 additions & 4 deletions pkg/roachprod/install/services_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ import (
"github.com/cockroachdb/cockroach/pkg/roachprod/cloud"
"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
"github.com/cockroachdb/cockroach/pkg/roachprod/vm"
"github.com/cockroachdb/cockroach/pkg/roachprod/vm/gce"
"github.com/cockroachdb/cockroach/pkg/roachprod/vm/gce/testutils"
"github.com/cockroachdb/cockroach/pkg/roachprod/vm/local"
"github.com/cockroachdb/cockroach/pkg/util/randutil"
Expand Down Expand Up @@ -106,14 +105,14 @@ func TestServiceNameComponents(t *testing.T) {
func TestMaybeRegisterServices(t *testing.T) {
ctx := context.Background()
rng, _ := randutil.NewTestRand()
dnsServer, _, providerName := testutils.ProviderWithTestDNSServer(rng)
dnsServer, dnsProvider, providerName := testutils.ProviderWithTestDNSServer(rng)

// Create a cluster with 3 nodes.
makeVM := func(clusterName string, i int) vm.VM {
return vm.VM{
Provider: providerName,
DNSProvider: providerName,
PublicDNS: fmt.Sprintf("%s.%s", vm.Name(clusterName, i), gce.Subdomain),
PublicDNS: fmt.Sprintf("%s.%s", vm.Name(clusterName, i), dnsProvider.Domain()),
}
}
makeCluster := func() *SyncedCluster {
Expand Down Expand Up @@ -171,7 +170,7 @@ func TestMultipleRegistrations(t *testing.T) {
return vm.VM{
Provider: providerName,
DNSProvider: providerName,
PublicDNS: fmt.Sprintf("%s.%s", vm.Name(clusterName, i), gce.Subdomain),
PublicDNS: fmt.Sprintf("%s.%s", vm.Name(clusterName, i), testDNS.Domain()),
}
}
clusterName := fmt.Sprintf("cluster-%d", rng.Uint32())
Expand Down
1 change: 1 addition & 0 deletions pkg/roachprod/promhelperclient/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ go_library(
importpath = "github.com/cockroachdb/cockroach/pkg/roachprod/promhelperclient",
visibility = ["//visibility:public"],
deps = [
"//pkg/roachprod/config",
"//pkg/roachprod/logger",
"//pkg/roachprod/vm/gce",
"//pkg/util/httputil",
Expand Down
55 changes: 35 additions & 20 deletions pkg/roachprod/promhelperclient/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
"strconv"
"strings"

"github.com/cockroachdb/cockroach/pkg/roachprod/config"
"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
"github.com/cockroachdb/cockroach/pkg/roachprod/vm/gce"
"github.com/cockroachdb/cockroach/pkg/util/httputil"
Expand All @@ -34,24 +35,27 @@ import (
)

const (
resourceName = "instance-configs"
resourceVersion = "v1"

// defaultPrometheusHostUrl for prometheus cluster config
defaultPrometheusHostUrl = "https://grafana.testeng.crdb.io/promhelpers"
// prometheusHostUrlEnv is the environment variable to override defaultPrometheusHostUrl
prometheusHostUrlEnv = "ROACHPROD_PROM_HOST_URL"

resourceName = "instance-configs"
resourceVersion = "v1"
serviceAccountJson = "PROM_HELPER_SERVICE_ACCOUNT_JSON"
serviceAccountAudience = "PROM_HELPER_SERVICE_ACCOUNT_AUDIENCE"
)

// SupportedPromProjects are the projects supported for prometheus target
var SupportedPromProjects = map[string]struct{}{gce.DefaultProject(): {}}

// The URL for the Prometheus registration service. An empty string means that the
// Prometheus integration is disabled. Should be accessed through
// getPrometheusRegistrationUrl().
var promRegistrationUrl = config.EnvOrDefaultString("ROACHPROD_PROM_HOST_URL",
"https://grafana.testeng.crdb.io/promhelpers")

// PromClient is used to communicate with the prometheus helper service
// keeping the functions as a variable enables us to override the value for unit testing
type PromClient struct {
promUrl string
disabled bool

// httpPut is used for http PUT operation.
httpPut func(
ctx context.Context, url string, h *http.Header, body io.Reader,
Expand All @@ -64,15 +68,26 @@ type PromClient struct {
oauth2.TokenSource, error)
}

// DefaultPromClient is the default instance of PromClient. This instance should
// be used unless custom configuration is needed.
var DefaultPromClient = NewPromClient()

// NewPromClient returns a new instance of PromClient
func NewPromClient() *PromClient {
return &PromClient{
promUrl: promRegistrationUrl,
disabled: promRegistrationUrl == "",
httpPut: httputil.Put,
httpDelete: httputil.Delete,
newTokenSource: idtoken.NewTokenSource,
}
}

func (c *PromClient) setUrl(url string) {
c.promUrl = url
c.disabled = false
}

// instanceConfigRequest is the HTTP request received for generating instance config
type instanceConfigRequest struct {
//Config is the content of the yaml file
Expand All @@ -89,19 +104,19 @@ func (c *PromClient) UpdatePrometheusTargets(
insecure bool,
l *logger.Logger,
) error {
promUrl := defaultPrometheusHostUrl
if v, ok := os.LookupEnv(prometheusHostUrlEnv); ok {
promUrl = v
if c.disabled {
l.Printf("Prometheus registration is disabled")
return nil
}
req, err := buildCreateRequest(nodes, insecure)
if err != nil {
return err
}
token, err := c.getToken(ctx, promUrl, forceFetchCreds, l)
token, err := c.getToken(ctx, forceFetchCreds, l)
if err != nil {
return err
}
url := getUrl(promUrl, clusterName)
url := getUrl(c.promUrl, clusterName)
l.Printf("invoking PUT for URL: %s", url)
h := &http.Header{}
h.Set("ContentType", "application/json")
Expand Down Expand Up @@ -132,15 +147,15 @@ func (c *PromClient) UpdatePrometheusTargets(
func (c *PromClient) DeleteClusterConfig(
ctx context.Context, clusterName string, forceFetchCreds bool, l *logger.Logger,
) error {
promUrl := defaultPrometheusHostUrl
if v, ok := os.LookupEnv(prometheusHostUrlEnv); ok {
promUrl = v

if c.disabled {
return nil
}
token, err := c.getToken(ctx, promUrl, forceFetchCreds, l)
token, err := c.getToken(ctx, forceFetchCreds, l)
if err != nil {
return err
}
url := getUrl(promUrl, clusterName)
url := getUrl(c.promUrl, clusterName)
l.Printf("invoking DELETE for URL: %s", url)
h := &http.Header{}
h.Set("Authorization", token)
Expand Down Expand Up @@ -206,9 +221,9 @@ func buildCreateRequest(nodes map[int]*NodeInfo, insecure bool) (io.Reader, erro

// getToken gets the Authorization token for grafana
func (c *PromClient) getToken(
ctx context.Context, promUrl string, forceFetchCreds bool, l *logger.Logger,
ctx context.Context, forceFetchCreds bool, l *logger.Logger,
) (string, error) {
if strings.HasPrefix(promUrl, "http:/") {
if strings.HasPrefix(c.promUrl, "http:/") {
// no token needed for insecure URL
return "", nil
}
Expand Down
16 changes: 10 additions & 6 deletions pkg/roachprod/promhelperclient/client_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,8 @@ func TestUpdatePrometheusTargets(t *testing.T) {
}()
ctx := context.Background()
promUrl := "http://prom_url.com"
_ = os.Setenv(prometheusHostUrlEnv, promUrl)
c := NewPromClient()
c.setUrl(promUrl)
t.Run("UpdatePrometheusTargets fails with 400", func(t *testing.T) {
c.httpPut = func(ctx context.Context, reqUrl string, h *http.Header, body io.Reader) (
resp *http.Response, err error) {
Expand Down Expand Up @@ -96,8 +96,8 @@ func TestDeleteClusterConfig(t *testing.T) {
}()
ctx := context.Background()
promUrl := "http://prom_url.com"
_ = os.Setenv(prometheusHostUrlEnv, promUrl)
c := NewPromClient()
c.setUrl(promUrl)
t.Run("DeleteClusterConfig fails with 400", func(t *testing.T) {
c.httpDelete = func(ctx context.Context, url string, h *http.Header) (
resp *http.Response, err error) {
Expand Down Expand Up @@ -144,7 +144,8 @@ func Test_getToken(t *testing.T) {
}()
c := NewPromClient()
t.Run("insecure url", func(t *testing.T) {
token, err := c.getToken(ctx, "http://test.com", false, l)
c.setUrl("http://test.com")
token, err := c.getToken(ctx, false, l)
require.Nil(t, err)
require.Empty(t, token)
})
Expand All @@ -156,7 +157,8 @@ func Test_getToken(t *testing.T) {
c.newTokenSource = func(ctx context.Context, audience string, opts ...idtoken.ClientOption) (oauth2.TokenSource, error) {
return nil, fmt.Errorf("invalid")
}
token, err := c.getToken(ctx, "https://test.com", false, l)
c.setUrl("https://test.com")
token, err := c.getToken(ctx, false, l)
require.NotNil(t, err)
require.Empty(t, token)
require.Equal(t, "error creating GCS oauth token source from specified credential: invalid", err.Error())
Expand All @@ -169,7 +171,8 @@ func Test_getToken(t *testing.T) {
c.newTokenSource = func(ctx context.Context, audience string, opts ...idtoken.ClientOption) (oauth2.TokenSource, error) {
return &mockToken{token: "", err: fmt.Errorf("failed")}, nil
}
token, err := c.getToken(ctx, "https://test.com", false, l)
c.setUrl("https://test.com")
token, err := c.getToken(ctx, false, l)
require.NotNil(t, err)
require.Empty(t, token)
require.Equal(t, "error getting identity token: failed", err.Error())
Expand All @@ -182,7 +185,8 @@ func Test_getToken(t *testing.T) {
c.newTokenSource = func(ctx context.Context, audience string, opts ...idtoken.ClientOption) (oauth2.TokenSource, error) {
return &mockToken{token: "token"}, nil
}
token, err := c.getToken(ctx, "https://test.com", false, l)
c.setUrl("https://test.com")
token, err := c.getToken(ctx, false, l)
require.Nil(t, err)
require.Equal(t, "Bearer token", token)
})
Expand Down
24 changes: 18 additions & 6 deletions pkg/roachprod/roachprod.go
Original file line number Diff line number Diff line change
Expand Up @@ -319,8 +319,18 @@ func Sync(l *logger.Logger, options vm.ListOptions) (*cloud.Cloud, error) {
refreshDNS = false
}
}
if !vm.Providers[aws.ProviderName].Active() {
// If there are no DNS required providers, we shouldn't refresh DNS,
// it's probably a misconfiguration.
if len(config.DNSRequiredProviders) == 0 {
refreshDNS = false
} else {
// If any of the required providers is not active, we shouldn't refresh DNS.
for _, p := range config.DNSRequiredProviders {
if !vm.Providers[p].Active() {
refreshDNS = false
break
}
}
}
// DNS entries are maintained in the GCE DNS registry for all vms, from all
// clouds.
Expand All @@ -329,7 +339,7 @@ func Sync(l *logger.Logger, options vm.ListOptions) (*cloud.Cloud, error) {
l.Printf("Refreshing DNS entries...")
}
if err := gce.SyncDNS(l, vms); err != nil {
l.Errorf("failed to update %s DNS: %v", gce.Subdomain, err)
l.Errorf("failed to update DNS: %v", err)
}
} else {
if !config.Quiet {
Expand Down Expand Up @@ -811,7 +821,7 @@ func updatePrometheusTargets(ctx context.Context, l *logger.Logger, c *install.S
}
wg.Wait()
if len(nodeIPPorts) > 0 {
if err := promhelperclient.NewPromClient().UpdatePrometheusTargets(ctx,
if err := promhelperclient.DefaultPromClient.UpdatePrometheusTargets(ctx,
c.Name, false, nodeIPPorts, !c.Secure, l); err != nil {
l.Errorf("creating cluster config failed for the ip:ports %v: %v", nodeIPPorts, err)
}
Expand Down Expand Up @@ -1114,7 +1124,7 @@ func urlGenerator(
) ([]string, error) {
var urls []string
for i, node := range nodes {
host := vm.Name(c.Name, int(node)) + "." + gce.Subdomain
host := vm.Name(c.Name, int(node)) + "." + gce.DNSDomain()

// There are no DNS entries for local clusters.
if c.IsLocal() {
Expand Down Expand Up @@ -1443,15 +1453,17 @@ func Destroy(
// and let the caller retry.
cld, _ = cloud.ListCloud(l, vm.ListOptions{IncludeEmptyClusters: true})
}
return destroyCluster(cld, l, name)
return destroyCluster(ctx, cld, l, name)
}); err != nil {
return err
}
l.Printf("OK")
return nil
}

func destroyCluster(cld *cloud.Cloud, l *logger.Logger, clusterName string) error {
func destroyCluster(
ctx context.Context, cld *cloud.Cloud, l *logger.Logger, clusterName string,
) error {
c, ok := cld.Clusters[clusterName]
if !ok {
return fmt.Errorf("cluster %s does not exist", clusterName)
Expand Down
Loading
Loading