diff --git a/pkg/cmd/roachprod/flags.go b/pkg/cmd/roachprod/flags.go index 2a06e52f028a..7df30a8fdfe8 100644 --- a/pkg/cmd/roachprod/flags.go +++ b/pkg/cmd/roachprod/flags.go @@ -40,8 +40,8 @@ var ( wipePreserveCerts bool grafanaConfig string grafanaArch string - grafanaurlOpen bool grafanaDumpDir string + jaegerConfigNodes string listDetails bool listJSON bool listMine bool @@ -54,9 +54,9 @@ var ( tag string external = false pgurlCertsDir string - adminurlOpen = false adminurlPath = "" adminurlIPs = false + urlOpen = false useTreeDist = true sig = 9 waitFlag = false @@ -159,7 +159,6 @@ func initFlags() { listCmd.Flags().StringVar(&listPattern, "pattern", "", "Show only clusters matching the regex pattern. Empty string matches everything.") - adminurlCmd.Flags().BoolVar(&adminurlOpen, "open", false, "Open the url in a browser") adminurlCmd.Flags().StringVar(&adminurlPath, "path", "/", "Path to add to URL (e.g. to open a same page on each node)") adminurlCmd.Flags().BoolVar(&adminurlIPs, @@ -272,12 +271,12 @@ Default is "RECURRING '*/15 * * * *' FULL BACKUP '@hourly' WITH SCHEDULE OPTIONS grafanaStartCmd.Flags().StringVar(&grafanaArch, "arch", "", "binary architecture override [amd64, arm64]") - grafanaURLCmd.Flags().BoolVar(&grafanaurlOpen, - "open", false, "open the grafana dashboard url on the browser") - grafanaDumpCmd.Flags().StringVar(&grafanaDumpDir, "dump-dir", "", "the absolute path to dump prometheus data to (use the contained 'prometheus-docker-run.sh' to visualize") + jaegerStartCmd.Flags().StringVar(&jaegerConfigNodes, "configure-nodes", "", + "the nodes on which to set the relevant CRDB cluster settings") + initCmd.Flags().IntVar(&startOpts.InitTarget, "init-target", startOpts.InitTarget, "node on which to run initialization") @@ -320,6 +319,10 @@ Default is "RECURRING '*/15 * * * *' FULL BACKUP '@hourly' WITH SCHEDULE OPTIONS updateCmd.Flags().BoolVar(&revertUpdate, "revert", false, "restore roachprod to the previous version "+ "which would have been 
renamed to roachprod.bak during the update process") + for _, cmd := range []*cobra.Command{adminurlCmd, grafanaURLCmd, jaegerURLCmd} { + cmd.Flags().BoolVar(&urlOpen, "open", false, "Open the url in a browser") + } + for _, cmd := range []*cobra.Command{createCmd, destroyCmd, extendCmd, logsCmd} { cmd.Flags().StringVarP(&username, "username", "u", os.Getenv("ROACHPROD_USER"), "Username to run under, detect if blank") @@ -355,11 +358,11 @@ Default is "RECURRING '*/15 * * * *' FULL BACKUP '@hourly' WITH SCHEDULE OPTIONS cmd.Flags().StringVarP(&config.Binary, "binary", "b", config.Binary, "the remote cockroach binary to use") } - for _, cmd := range []*cobra.Command{startCmd, startInstanceCmd, stopInstanceCmd, sqlCmd, pgurlCmd, adminurlCmd, runCmd} { + for _, cmd := range []*cobra.Command{startCmd, startInstanceCmd, stopInstanceCmd, sqlCmd, pgurlCmd, adminurlCmd, runCmd, jaegerStartCmd} { cmd.Flags().BoolVar(&secure, "secure", false, "use a secure cluster") } - for _, cmd := range []*cobra.Command{pgurlCmd, sqlCmd, adminurlCmd, stopInstanceCmd} { + for _, cmd := range []*cobra.Command{pgurlCmd, sqlCmd, adminurlCmd, stopInstanceCmd, jaegerStartCmd} { cmd.Flags().StringVar(&virtualClusterName, "cluster", "", "specific virtual cluster to connect to") cmd.Flags().IntVar(&sqlInstance, diff --git a/pkg/cmd/roachprod/main.go b/pkg/cmd/roachprod/main.go index ac0002fb00a8..f2e37dd099cc 100644 --- a/pkg/cmd/roachprod/main.go +++ b/pkg/cmd/roachprod/main.go @@ -1014,7 +1014,7 @@ var adminurlCmd = &cobra.Command{ Args: cobra.ExactArgs(1), Run: wrap(func(cmd *cobra.Command, args []string) error { urls, err := roachprod.AdminURL( - context.Background(), config.Logger, args[0], virtualClusterName, sqlInstance, adminurlPath, adminurlIPs, adminurlOpen, secure, + context.Background(), config.Logger, args[0], virtualClusterName, sqlInstance, adminurlPath, adminurlIPs, urlOpen, secure, ) if err != nil { return err @@ -1131,7 +1131,41 @@ var grafanaURLCmd = &cobra.Command{ Args: 
cobra.ExactArgs(1), Run: wrap(func(cmd *cobra.Command, args []string) error { url, err := roachprod.GrafanaURL(context.Background(), config.Logger, args[0], - grafanaurlOpen) + urlOpen) + if err != nil { + return err + } + fmt.Println(url) + return nil + }), +} + +var jaegerStartCmd = &cobra.Command{ + Use: `jaeger-start <cluster>`, + Short: `starts a jaeger container on the last node in the cluster`, + Args: cobra.ExactArgs(1), + Run: wrap(func(cmd *cobra.Command, args []string) error { + return roachprod.StartJaeger(context.Background(), config.Logger, args[0], + virtualClusterName, secure, jaegerConfigNodes) + }), +} + +var jaegerStopCmd = &cobra.Command{ + Use: `jaeger-stop <cluster>`, + Short: `stops a running jaeger container on the last node in the cluster`, + Args: cobra.ExactArgs(1), + Run: wrap(func(cmd *cobra.Command, args []string) error { + return roachprod.StopJaeger(context.Background(), config.Logger, args[0]) + }), +} + +var jaegerURLCmd = &cobra.Command{ + Use: `jaegerurl <cluster>`, + Short: `returns the URL of the cluster's jaeger UI`, + Args: cobra.ExactArgs(1), + Run: wrap(func(cmd *cobra.Command, args []string) error { + url, err := roachprod.JaegerURL(context.Background(), config.Logger, args[0], + urlOpen) if err != nil { return err } @@ -1443,6 +1477,9 @@ func main() { rootStorageCmd, snapshotCmd, updateCmd, + jaegerStartCmd, + jaegerStopCmd, + jaegerURLCmd, ) setBashCompletionFunction() diff --git a/pkg/roachprod/install/install.go b/pkg/roachprod/install/install.go index 41402f7505e4..baecde3c8221 100644 --- a/pkg/roachprod/install/install.go +++ b/pkg/roachprod/install/install.go @@ -14,6 +14,7 @@ import ( "bytes" "context" "fmt" + "io" "sort" "github.com/cockroachdb/cockroach/pkg/roachprod/logger" @@ -95,6 +96,7 @@ sudo add-apt-repository \ sudo apt-get update; sudo apt-get install -y docker-ce; +sudo usermod -aG docker ubuntu; `, "gcc": ` @@ -159,26 +161,29 @@ func SortedCmds() []string { // Install TODO(peter): document func Install(ctx context.Context, l 
*logger.Logger, c *SyncedCluster, args []string) error { - do := func(title, cmd string) error { + for _, arg := range args { var buf bytes.Buffer - err := c.Run(ctx, l, &buf, &buf, OnNodes(c.Nodes), "installing "+title, cmd) - if err != nil { + if err := InstallTool(ctx, l, c, c.Nodes, arg, &buf, &buf); err != nil { l.Printf(buf.String()) - } - return err - } - - for _, arg := range args { - cmd, ok := installCmds[arg] - if !ok { - return fmt.Errorf("unknown tool %q", arg) - } - - // Ensure that we early exit if any of the shell statements fail. - cmd = "set -exuo pipefail;" + cmd - if err := do(arg, cmd); err != nil { return err } } return nil } + +func InstallTool( + ctx context.Context, + l *logger.Logger, + c *SyncedCluster, + nodes Nodes, + softwareName string, + stdout, stderr io.Writer, +) error { + cmd, ok := installCmds[softwareName] + if !ok { + return fmt.Errorf("unknown tool %q", softwareName) + } + // Ensure that we early exit if any of the shell statements fail. + cmd = "set -exuo pipefail;" + cmd + return c.Run(ctx, l, stdout, stderr, OnNodes(nodes), "installing "+softwareName, cmd) +} diff --git a/pkg/roachprod/roachprod.go b/pkg/roachprod/roachprod.go index 0020faf18a60..771f95387c82 100644 --- a/pkg/roachprod/roachprod.go +++ b/pkg/roachprod/roachprod.go @@ -1975,6 +1975,123 @@ func isWorkloadCollectorVolume(v vm.Volume) bool { return false } +const ( + otelCollectorPort = 4317 + jaegerUIPort = 16686 + jaegerContainerName = "jaeger" + jaegerImageName = "jaegertracing/all-in-one:latest" +) + +// StartJaeger starts a jaeger instance on the last node in the given +// cluster and configures the cluster to use it. 
+func StartJaeger( + ctx context.Context, + l *logger.Logger, + clusterName string, + virtualClusterName string, + secure bool, + configureNodes string, +) error { + if err := LoadClusters(); err != nil { + return err + } + c, err := newCluster(l, clusterName, install.SecureOption(secure)) + if err != nil { + return err + } + + // TODO(ssd): Currently this just uses the all-in-one docker + // container with in memory storage. Might be nicer to just + // install from source or get linux binaries and start them + // with systemd. For now this just matches what we've been + // copy and pasting. + jaegerNode := c.TargetNodes()[len(c.TargetNodes())-1:] + err = install.InstallTool(ctx, l, c, jaegerNode, "docker", l.Stdout, l.Stderr) + if err != nil { + return err + } + startCmd := fmt.Sprintf("docker run -d --name %s -p %[2]d:%[2]d -p %[3]d:%[3]d %s", + jaegerContainerName, + otelCollectorPort, + jaegerUIPort, + jaegerImageName) + err = c.Run(ctx, l, l.Stdout, l.Stderr, install.OnNodes(jaegerNode), "start jaegertracing/all-in-one using docker", startCmd) + if err != nil { + return err + } + + otelCollectionHost, err := c.GetInternalIP(jaegerNode[0]) + if err != nil { + return err + } + otelCollectionHostPort := net.JoinHostPort(otelCollectionHost, strconv.Itoa(otelCollectorPort)) + setupStmt := fmt.Sprintf("SET CLUSTER SETTING trace.opentelemetry.collector='%s'", otelCollectionHostPort) + + if configureNodes != "" { + nodes, err := install.ListNodes(configureNodes, len(c.VMs)) + if err != nil { + return err + } + _, err = c.ExecSQL(ctx, l, nodes, virtualClusterName, 0, []string{"-e", setupStmt}) + if err != nil { + return err + } + } + + url, err := JaegerURL(ctx, l, clusterName, false) + if err != nil { + return err + } + + l.Printf("To use with CRDB: %s", setupStmt) + l.Printf("Jaeger UI: %s", url) + return nil +} + +// StopJaeger stops and removes the jaeger container. 
+func StopJaeger(ctx context.Context, l *logger.Logger, clusterName string) error { + if err := LoadClusters(); err != nil { + return err + } + c, err := newCluster(l, clusterName) + if err != nil { + return err + } + jaegerNode := c.TargetNodes()[len(c.TargetNodes())-1:] + stopCmd := fmt.Sprintf("docker stop %s", jaegerContainerName) + err = c.Run(ctx, l, l.Stdout, l.Stderr, install.OnNodes(jaegerNode), stopCmd, stopCmd) + if err != nil { + return err + } + rmCmd := fmt.Sprintf("docker rm %s", jaegerContainerName) + return c.Run(ctx, l, l.Stdout, l.Stderr, install.OnNodes(jaegerNode), rmCmd, rmCmd) +} + +// JaegerURL returns a url to the jaeger UI, assuming it was installed +// on the last node in the given cluster. +func JaegerURL( + ctx context.Context, l *logger.Logger, clusterName string, openInBrowser bool, +) (string, error) { + if err := LoadClusters(); err != nil { + return "", err + } + c, err := newCluster(l, clusterName) + if err != nil { + return "", err + } + jaegerNode := c.TargetNodes()[len(c.TargetNodes())-1:] + urls, err := urlGenerator(ctx, c, l, jaegerNode, urlConfig{ + usePublicIP: true, + openInBrowser: openInBrowser, + secure: false, + port: jaegerUIPort, + }) + if err != nil { + return "", err + } + return urls[0], nil +} + // StorageCollectionPerformAction either starts or stops workload collection on // a target cluster. //