Skip to content

Commit

Permalink
Merge #120340
Browse files Browse the repository at this point in the history
120340: roachprod: enable roachprod to be configured for external use r=srosenberg a=ajwerner

After this change, roachprod fully works on GCP at my company with the following configuration:

```
ROACHPROD_DNS_REQUIRED_PROVIDERS=gce
ROACHPROD_EMAIL_DOMAIN=`@dataexmachina.dev`
ROACHPROD_GCE_DEFAULT_PROJECT=dem-ephemeral
ROACHPROD_GCE_DEFAULT_SERVICE_ACCOUNT=roachprod@dem-ephemeral.iam.gserviceaccount.com
ROACHPROD_GCE_DNS_DOMAIN=roachprod.exray.cloud
ROACHPROD_GCE_DNS_MANAGED_DOMAIN=roachprod-managed.dataexmachina.dev
ROACHPROD_GCE_DNS_MANAGED_ZONE=roachprod-managed
ROACHPROD_GCE_DNS_PROJECT=dem-ephemeral
ROACHPROD_GCE_DNS_ZONE=roachprod
```

All of these configuration options can also be set via flags.

See individual commits.

Epic: none
Release note: none

Co-authored-by: Andrew Werner <[email protected]>
Co-authored-by: Andrei Matei <[email protected]>
  • Loading branch information
3 people committed May 31, 2024
2 parents 9e61527 + ff46295 commit f2e7709
Show file tree
Hide file tree
Showing 11 changed files with 339 additions and 118 deletions.
11 changes: 10 additions & 1 deletion pkg/cmd/roachprod/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,8 @@ func initFlags() {
rootCmd.PersistentFlags().IntVarP(&config.MaxConcurrency, "max-concurrency", "", 32,
"maximum number of operations to execute on nodes concurrently, set to zero for infinite",
)
rootCmd.PersistentFlags().StringVarP(&config.EmailDomain, "email-domain", "",
config.DefaultEmailDomain, "email domain for users")

createCmd.Flags().DurationVarP(&createVMOpts.Lifetime,
"lifetime", "l", 12*time.Hour, "Lifetime of the cluster")
Expand Down Expand Up @@ -162,7 +164,7 @@ func initFlags() {
providerOptsContainer[providerName].ConfigureCreateFlags(createCmd.Flags())

for _, cmd := range []*cobra.Command{
destroyCmd, extendCmd, listCmd, syncCmd, gcCmd,
destroyCmd, extendCmd, listCmd, syncCmd, gcCmd, setupSSHCmd, startCmd, pgurlCmd, adminurlCmd,
} {
providerOptsContainer[providerName].ConfigureClusterFlags(cmd.Flags(), vm.AcceptMultipleProjects)
}
Expand Down Expand Up @@ -448,6 +450,13 @@ func initFlags() {
"sql-instance", 0, "specific SQL/HTTP instance to connect to (this is a roachprod abstraction distinct from the internal instance ID)")
}

for _, cmd := range []*cobra.Command{startCmd, listCmd, syncCmd} {
cmd.Flags().StringSliceVar(&config.DNSRequiredProviders,
"dns-required-providers", config.DefaultDNSRequiredProviders,
"the cloud providers that must be active to refresh DNS entries",
)
}

grafanaAnnotationCmd.Flags().StringArrayVar(&grafanaTags,
"tags", []string{}, "grafana annotation tags")
grafanaAnnotationCmd.Flags().StringVar(&grafanaDashboardUID,
Expand Down
39 changes: 36 additions & 3 deletions pkg/roachprod/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (
"path"
"regexp"
"slices"
"strings"

"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
"github.com/cockroachdb/cockroach/pkg/util/envutil"
Expand Down Expand Up @@ -49,8 +50,43 @@ var (
// SSHDirectory is the path to search for SSH keys needed to set up
// set up new roachprod clusters.
SSHDirectory = os.ExpandEnv("${HOME}/.ssh")

// DefaultEmailDomain is used to form the full account name for GCE and Slack.
DefaultEmailDomain = EnvOrDefaultString(
"ROACHPROD_EMAIL_DOMAIN", "@cockroachlabs.com",
)

// EmailDomain used to form fully qualified usernames for gcloud and slack.
EmailDomain string

// DNSRequiredProviders is the list of cloud providers that must be active for
// DNS records to be synced when roachprod syncs its state.
DefaultDNSRequiredProviders = envOrDefaultStrings(
"ROACHPROD_DNS_REQUIRED_PROVIDERS", []string{"gce", "aws"},
)

// DNSRequiredProviders is the list of cloud providers that must be active for
DNSRequiredProviders []string
)

// EnvOrDefaultString returns the value of the environment variable with the
// given key, or the default value if the environment variable is not set.
//
// Unlike envutil.EnvOrDefaultString, it does not assert properties of the key.
func EnvOrDefaultString(key, def string) string {
if v, ok := os.LookupEnv(key); ok {
return v
}
return def
}

func envOrDefaultStrings(key string, def []string) []string {
if v, ok := os.LookupEnv(key); ok {
return strings.Split(v, ",")
}
return def
}

func init() {
var err error
OSUser, err = user.Current()
Expand All @@ -70,9 +106,6 @@ const (
// DefaultDebugDir is used to stash debug information.
DefaultDebugDir = "${HOME}/.roachprod/debug"

// EmailDomain is used to form the full account name for GCE and Slack.
EmailDomain = "@cockroachlabs.com"

// Local is the prefix used to identify local clusters.
// It is also used as the zone for local clusters.
Local = "local"
Expand Down
1 change: 0 additions & 1 deletion pkg/roachprod/install/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,6 @@ go_test(
"//pkg/roachprod/cloud",
"//pkg/roachprod/logger",
"//pkg/roachprod/vm",
"//pkg/roachprod/vm/gce",
"//pkg/roachprod/vm/gce/testutils",
"//pkg/roachprod/vm/local",
"//pkg/testutils/datapathutils",
Expand Down
7 changes: 3 additions & 4 deletions pkg/roachprod/install/services_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ import (
"github.com/cockroachdb/cockroach/pkg/roachprod/cloud"
"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
"github.com/cockroachdb/cockroach/pkg/roachprod/vm"
"github.com/cockroachdb/cockroach/pkg/roachprod/vm/gce"
"github.com/cockroachdb/cockroach/pkg/roachprod/vm/gce/testutils"
"github.com/cockroachdb/cockroach/pkg/roachprod/vm/local"
"github.com/cockroachdb/cockroach/pkg/util/randutil"
Expand Down Expand Up @@ -106,14 +105,14 @@ func TestServiceNameComponents(t *testing.T) {
func TestMaybeRegisterServices(t *testing.T) {
ctx := context.Background()
rng, _ := randutil.NewTestRand()
dnsServer, _, providerName := testutils.ProviderWithTestDNSServer(rng)
dnsServer, dnsProvider, providerName := testutils.ProviderWithTestDNSServer(rng)

// Create a cluster with 3 nodes.
makeVM := func(clusterName string, i int) vm.VM {
return vm.VM{
Provider: providerName,
DNSProvider: providerName,
PublicDNS: fmt.Sprintf("%s.%s", vm.Name(clusterName, i), gce.Subdomain),
PublicDNS: fmt.Sprintf("%s.%s", vm.Name(clusterName, i), dnsProvider.Domain()),
}
}
makeCluster := func() *SyncedCluster {
Expand Down Expand Up @@ -171,7 +170,7 @@ func TestMultipleRegistrations(t *testing.T) {
return vm.VM{
Provider: providerName,
DNSProvider: providerName,
PublicDNS: fmt.Sprintf("%s.%s", vm.Name(clusterName, i), gce.Subdomain),
PublicDNS: fmt.Sprintf("%s.%s", vm.Name(clusterName, i), testDNS.Domain()),
}
}
clusterName := fmt.Sprintf("cluster-%d", rng.Uint32())
Expand Down
1 change: 1 addition & 0 deletions pkg/roachprod/promhelperclient/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ go_library(
importpath = "github.com/cockroachdb/cockroach/pkg/roachprod/promhelperclient",
visibility = ["//visibility:public"],
deps = [
"//pkg/roachprod/config",
"//pkg/roachprod/logger",
"//pkg/roachprod/vm/gce",
"//pkg/util/httputil",
Expand Down
55 changes: 35 additions & 20 deletions pkg/roachprod/promhelperclient/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
"strconv"
"strings"

"github.com/cockroachdb/cockroach/pkg/roachprod/config"
"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
"github.com/cockroachdb/cockroach/pkg/roachprod/vm/gce"
"github.com/cockroachdb/cockroach/pkg/util/httputil"
Expand All @@ -34,24 +35,27 @@ import (
)

const (
resourceName = "instance-configs"
resourceVersion = "v1"

// defaultPrometheusHostUrl for prometheus cluster config
defaultPrometheusHostUrl = "https://grafana.testeng.crdb.io/promhelpers"
// prometheusHostUrlEnv is the environment variable to override defaultPrometheusHostUrl
prometheusHostUrlEnv = "ROACHPROD_PROM_HOST_URL"

resourceName = "instance-configs"
resourceVersion = "v1"
serviceAccountJson = "PROM_HELPER_SERVICE_ACCOUNT_JSON"
serviceAccountAudience = "PROM_HELPER_SERVICE_ACCOUNT_AUDIENCE"
)

// SupportedPromProjects are the projects supported for prometheus target
var SupportedPromProjects = map[string]struct{}{gce.DefaultProject(): {}}

// The URL for the Prometheus registration service. An empty string means that the
// Prometheus integration is disabled. Should be accessed through
// getPrometheusRegistrationUrl().
var promRegistrationUrl = config.EnvOrDefaultString("ROACHPROD_PROM_HOST_URL",
"https://grafana.testeng.crdb.io/promhelpers")

// PromClient is used to communicate with the prometheus helper service
// keeping the functions as a variable enables us to override the value for unit testing
type PromClient struct {
promUrl string
disabled bool

// httpPut is used for http PUT operation.
httpPut func(
ctx context.Context, url string, h *http.Header, body io.Reader,
Expand All @@ -64,15 +68,26 @@ type PromClient struct {
oauth2.TokenSource, error)
}

// DefaultPromClient is the default instance of PromClient. This instance should
// be used unless custom configuration is needed.
var DefaultPromClient = NewPromClient()

// NewPromClient returns a new instance of PromClient
func NewPromClient() *PromClient {
return &PromClient{
promUrl: promRegistrationUrl,
disabled: promRegistrationUrl == "",
httpPut: httputil.Put,
httpDelete: httputil.Delete,
newTokenSource: idtoken.NewTokenSource,
}
}

func (c *PromClient) setUrl(url string) {
c.promUrl = url
c.disabled = false
}

// instanceConfigRequest is the HTTP request received for generating instance config
type instanceConfigRequest struct {
//Config is the content of the yaml file
Expand All @@ -89,19 +104,19 @@ func (c *PromClient) UpdatePrometheusTargets(
insecure bool,
l *logger.Logger,
) error {
promUrl := defaultPrometheusHostUrl
if v, ok := os.LookupEnv(prometheusHostUrlEnv); ok {
promUrl = v
if c.disabled {
l.Printf("Prometheus registration is disabled")
return nil
}
req, err := buildCreateRequest(nodes, insecure)
if err != nil {
return err
}
token, err := c.getToken(ctx, promUrl, forceFetchCreds, l)
token, err := c.getToken(ctx, forceFetchCreds, l)
if err != nil {
return err
}
url := getUrl(promUrl, clusterName)
url := getUrl(c.promUrl, clusterName)
l.Printf("invoking PUT for URL: %s", url)
h := &http.Header{}
h.Set("ContentType", "application/json")
Expand Down Expand Up @@ -132,15 +147,15 @@ func (c *PromClient) UpdatePrometheusTargets(
func (c *PromClient) DeleteClusterConfig(
ctx context.Context, clusterName string, forceFetchCreds bool, l *logger.Logger,
) error {
promUrl := defaultPrometheusHostUrl
if v, ok := os.LookupEnv(prometheusHostUrlEnv); ok {
promUrl = v

if c.disabled {
return nil
}
token, err := c.getToken(ctx, promUrl, forceFetchCreds, l)
token, err := c.getToken(ctx, forceFetchCreds, l)
if err != nil {
return err
}
url := getUrl(promUrl, clusterName)
url := getUrl(c.promUrl, clusterName)
l.Printf("invoking DELETE for URL: %s", url)
h := &http.Header{}
h.Set("Authorization", token)
Expand Down Expand Up @@ -206,9 +221,9 @@ func buildCreateRequest(nodes map[int]*NodeInfo, insecure bool) (io.Reader, erro

// getToken gets the Authorization token for grafana
func (c *PromClient) getToken(
ctx context.Context, promUrl string, forceFetchCreds bool, l *logger.Logger,
ctx context.Context, forceFetchCreds bool, l *logger.Logger,
) (string, error) {
if strings.HasPrefix(promUrl, "http:/") {
if strings.HasPrefix(c.promUrl, "http:/") {
// no token needed for insecure URL
return "", nil
}
Expand Down
16 changes: 10 additions & 6 deletions pkg/roachprod/promhelperclient/client_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,8 @@ func TestUpdatePrometheusTargets(t *testing.T) {
}()
ctx := context.Background()
promUrl := "http://prom_url.com"
_ = os.Setenv(prometheusHostUrlEnv, promUrl)
c := NewPromClient()
c.setUrl(promUrl)
t.Run("UpdatePrometheusTargets fails with 400", func(t *testing.T) {
c.httpPut = func(ctx context.Context, reqUrl string, h *http.Header, body io.Reader) (
resp *http.Response, err error) {
Expand Down Expand Up @@ -96,8 +96,8 @@ func TestDeleteClusterConfig(t *testing.T) {
}()
ctx := context.Background()
promUrl := "http://prom_url.com"
_ = os.Setenv(prometheusHostUrlEnv, promUrl)
c := NewPromClient()
c.setUrl(promUrl)
t.Run("DeleteClusterConfig fails with 400", func(t *testing.T) {
c.httpDelete = func(ctx context.Context, url string, h *http.Header) (
resp *http.Response, err error) {
Expand Down Expand Up @@ -144,7 +144,8 @@ func Test_getToken(t *testing.T) {
}()
c := NewPromClient()
t.Run("insecure url", func(t *testing.T) {
token, err := c.getToken(ctx, "http://test.com", false, l)
c.setUrl("http://test.com")
token, err := c.getToken(ctx, false, l)
require.Nil(t, err)
require.Empty(t, token)
})
Expand All @@ -156,7 +157,8 @@ func Test_getToken(t *testing.T) {
c.newTokenSource = func(ctx context.Context, audience string, opts ...idtoken.ClientOption) (oauth2.TokenSource, error) {
return nil, fmt.Errorf("invalid")
}
token, err := c.getToken(ctx, "https://test.com", false, l)
c.setUrl("https://test.com")
token, err := c.getToken(ctx, false, l)
require.NotNil(t, err)
require.Empty(t, token)
require.Equal(t, "error creating GCS oauth token source from specified credential: invalid", err.Error())
Expand All @@ -169,7 +171,8 @@ func Test_getToken(t *testing.T) {
c.newTokenSource = func(ctx context.Context, audience string, opts ...idtoken.ClientOption) (oauth2.TokenSource, error) {
return &mockToken{token: "", err: fmt.Errorf("failed")}, nil
}
token, err := c.getToken(ctx, "https://test.com", false, l)
c.setUrl("https://test.com")
token, err := c.getToken(ctx, false, l)
require.NotNil(t, err)
require.Empty(t, token)
require.Equal(t, "error getting identity token: failed", err.Error())
Expand All @@ -182,7 +185,8 @@ func Test_getToken(t *testing.T) {
c.newTokenSource = func(ctx context.Context, audience string, opts ...idtoken.ClientOption) (oauth2.TokenSource, error) {
return &mockToken{token: "token"}, nil
}
token, err := c.getToken(ctx, "https://test.com", false, l)
c.setUrl("https://test.com")
token, err := c.getToken(ctx, false, l)
require.Nil(t, err)
require.Equal(t, "Bearer token", token)
})
Expand Down
24 changes: 18 additions & 6 deletions pkg/roachprod/roachprod.go
Original file line number Diff line number Diff line change
Expand Up @@ -319,8 +319,18 @@ func Sync(l *logger.Logger, options vm.ListOptions) (*cloud.Cloud, error) {
refreshDNS = false
}
}
if !vm.Providers[aws.ProviderName].Active() {
// If there are no DNS required providers, we shouldn't refresh DNS,
// it's probably a misconfiguration.
if len(config.DNSRequiredProviders) == 0 {
refreshDNS = false
} else {
// If any of the required providers is not active, we shouldn't refresh DNS.
for _, p := range config.DNSRequiredProviders {
if !vm.Providers[p].Active() {
refreshDNS = false
break
}
}
}
// DNS entries are maintained in the GCE DNS registry for all vms, from all
// clouds.
Expand All @@ -329,7 +339,7 @@ func Sync(l *logger.Logger, options vm.ListOptions) (*cloud.Cloud, error) {
l.Printf("Refreshing DNS entries...")
}
if err := gce.SyncDNS(l, vms); err != nil {
l.Errorf("failed to update %s DNS: %v", gce.Subdomain, err)
l.Errorf("failed to update DNS: %v", err)
}
} else {
if !config.Quiet {
Expand Down Expand Up @@ -811,7 +821,7 @@ func updatePrometheusTargets(ctx context.Context, l *logger.Logger, c *install.S
}
wg.Wait()
if len(nodeIPPorts) > 0 {
if err := promhelperclient.NewPromClient().UpdatePrometheusTargets(ctx,
if err := promhelperclient.DefaultPromClient.UpdatePrometheusTargets(ctx,
c.Name, false, nodeIPPorts, !c.Secure, l); err != nil {
l.Errorf("creating cluster config failed for the ip:ports %v: %v", nodeIPPorts, err)
}
Expand Down Expand Up @@ -1114,7 +1124,7 @@ func urlGenerator(
) ([]string, error) {
var urls []string
for i, node := range nodes {
host := vm.Name(c.Name, int(node)) + "." + gce.Subdomain
host := vm.Name(c.Name, int(node)) + "." + gce.DNSDomain()

// There are no DNS entries for local clusters.
if c.IsLocal() {
Expand Down Expand Up @@ -1443,15 +1453,17 @@ func Destroy(
// and let the caller retry.
cld, _ = cloud.ListCloud(l, vm.ListOptions{IncludeEmptyClusters: true})
}
return destroyCluster(cld, l, name)
return destroyCluster(ctx, cld, l, name)
}); err != nil {
return err
}
l.Printf("OK")
return nil
}

func destroyCluster(cld *cloud.Cloud, l *logger.Logger, clusterName string) error {
func destroyCluster(
ctx context.Context, cld *cloud.Cloud, l *logger.Logger, clusterName string,
) error {
c, ok := cld.Clusters[clusterName]
if !ok {
return fmt.Errorf("cluster %s does not exist", clusterName)
Expand Down
Loading

0 comments on commit f2e7709

Please sign in to comment.