From d6dbedcc4a075502799b75d378d75675c2dc13a3 Mon Sep 17 00:00:00 2001 From: Miral Gadani Date: Tue, 1 Aug 2023 18:36:13 +0000 Subject: [PATCH] roachtest: roachprod: add test name labels to vms for prometheus This commit will add a `test_name` label to each VM when a particular roachtest is about to be executed on the cluster, with the label being removed at the end of the roachtest. The `test_name` label is being scraped by Prometheus to allow filtering of dashboards based on the roachtest name. GCE labelling rules mean that test names are sanitised to match `[a-zA-Z-]`. Epic: none Fixes: #98658 Release note: None --- pkg/cmd/roachtest/cluster.go | 8 +++ pkg/cmd/roachtest/test_runner.go | 4 ++ pkg/roachprod/roachprod.go | 28 ++++++++++ pkg/roachprod/vm/aws/aws.go | 79 ++++++++++++++++++++------- pkg/roachprod/vm/azure/azure.go | 10 ++++ pkg/roachprod/vm/flagstub/flagstub.go | 8 +++ pkg/roachprod/vm/gce/gcloud.go | 67 +++++++++++++++++------ pkg/roachprod/vm/local/local.go | 8 +++ pkg/roachprod/vm/vm.go | 19 +++++++ 9 files changed, 192 insertions(+), 39 deletions(-) diff --git a/pkg/cmd/roachtest/cluster.go b/pkg/cmd/roachtest/cluster.go index a72393ac4cd2..74062fa2a062 100644 --- a/pkg/cmd/roachtest/cluster.go +++ b/pkg/cmd/roachtest/cluster.go @@ -1873,6 +1873,14 @@ func (c *clusterImpl) doDestroy(ctx context.Context, l *logger.Logger) <-chan st return ch } +func (c *clusterImpl) addLabels(labels map[string]string) error { + return roachprod.AddLabels(c.l, c.name, labels) +} + +func (c *clusterImpl) removeLabels(labels []string) error { + return roachprod.RemoveLabels(c.l, c.name, labels) +} + func (c *clusterImpl) ListSnapshots( ctx context.Context, vslo vm.VolumeSnapshotListOpts, ) ([]vm.VolumeSnapshot, error) { diff --git a/pkg/cmd/roachtest/test_runner.go b/pkg/cmd/roachtest/test_runner.go index c7e87abcbf7f..32d2d2d46dd2 100644 --- a/pkg/cmd/roachtest/test_runner.go +++ b/pkg/cmd/roachtest/test_runner.go @@ -911,7 +911,11 @@ func (r *testRunner) runTest( t.runnerID = goid.Get() s := t.Spec().(*registry.TestSpec) + _ = c.addLabels(map[string]string{ + "test_name": s.Name, + }) defer func() { + _ = c.removeLabels([]string{"test_name"}) t.end = timeutil.Now() // We only have to record panics if the panic'd value is not the sentinel diff --git a/pkg/roachprod/roachprod.go b/pkg/roachprod/roachprod.go index 5682bcc3b005..5404804bb382 100644 --- a/pkg/roachprod/roachprod.go +++ b/pkg/roachprod/roachprod.go @@ -1290,6 +1290,34 @@ func cleanupFailedCreate(l *logger.Logger, clusterName string) error { return cloud.DestroyCluster(l, c) } +func AddLabels(l *logger.Logger, clusterName string, labels map[string]string) error { + if err := LoadClusters(); err != nil { + return err + } + c, err := newCluster(l, clusterName) + if err != nil { + return err + } + + return vm.FanOut(c.VMs, func(p vm.Provider, vms vm.List) error { + return p.AddLabels(l, vms, labels) + }) +} + +func RemoveLabels(l *logger.Logger, clusterName string, labels []string) error { + if err := LoadClusters(); err != nil { + return err + } + c, err := newCluster(l, clusterName) + if err != nil { + return err + } + + return vm.FanOut(c.VMs, func(p vm.Provider, vms vm.List) error { + return p.RemoveLabels(l, vms, labels) + }) +} + // Create TODO func Create( ctx context.Context, diff --git a/pkg/roachprod/vm/aws/aws.go b/pkg/roachprod/vm/aws/aws.go index 93867e74e1de..ad17b7983ac8 100644 --- a/pkg/roachprod/vm/aws/aws.go +++ b/pkg/roachprod/vm/aws/aws.go @@ -421,6 +421,61 @@ func (p *Provider) ConfigSSH(l *logger.Logger, zones []string) error { return g.Wait() } +// editLabels is a helper that adds or removes labels from the given VMs. +func (p *Provider) editLabels( + l *logger.Logger, vms vm.List, labels map[string]string, remove bool, +) error { + args := []string{"ec2"} + if remove { + args = append(args, "delete-tags") + } else { + args = append(args, "create-tags") + } + + args = append(args, "--tags") + tagArgs := make([]string, 0, len(labels)) + for key, value := range labels { + if remove { + tagArgs = append(tagArgs, fmt.Sprintf("Key=%s", key)) + } else { + tagArgs = append(tagArgs, fmt.Sprintf("Key=%s,Value=%s", key, vm.SanitizeLabel(value))) + } + } + args = append(args, tagArgs...) + + byRegion, err := regionMap(vms) + if err != nil { + return err + } + g := errgroup.Group{} + for region, list := range byRegion { + // Capture loop vars here + regionArgs := append(args, "--region", region) + regionArgs = append(regionArgs, "--resources") + regionArgs = append(regionArgs, list.ProviderIDs()...) + + g.Go(func() error { + _, err := p.runCommand(l, regionArgs) + return err + }) + } + return g.Wait() +} + +// AddLabels adds the given labels to the given VMs. +func (p *Provider) AddLabels(l *logger.Logger, vms vm.List, labels map[string]string) error { + return p.editLabels(l, vms, labels, false) +} + +// RemoveLabels removes the given labels from the given VMs. +func (p *Provider) RemoveLabels(l *logger.Logger, vms vm.List, labels []string) error { + labelMap := make(map[string]string, len(labels)) + for _, label := range labels { + labelMap[label] = "" + } + return p.editLabels(l, vms, labelMap, true) +} + // Create is part of the vm.Provider interface. func (p *Provider) Create( l *logger.Logger, names []string, opts vm.CreateOpts, vmProviderOpts vm.ProviderOpts, @@ -594,27 +649,9 @@ func (p *Provider) Reset(l *logger.Logger, vms vm.List) error { // Extend is part of the vm.Provider interface. // This will update the Lifetime tag on the instances. func (p *Provider) Extend(l *logger.Logger, vms vm.List, lifetime time.Duration) error { - byRegion, err := regionMap(vms) - if err != nil { - return err - } - g := errgroup.Group{} - for region, list := range byRegion { - // Capture loop vars here - args := []string{ - "ec2", "create-tags", - "--region", region, - "--tags", "Key=Lifetime,Value=" + lifetime.String(), - "--resources", - } - args = append(args, list.ProviderIDs()...) - - g.Go(func() error { - _, err := p.runCommand(l, args) - return err - }) - } - return g.Wait() + return p.AddLabels(l, vms, map[string]string{ + "Lifetime": lifetime.String(), + }) } // cachedActiveAccount memoizes the return value from FindActiveAccount diff --git a/pkg/roachprod/vm/azure/azure.go b/pkg/roachprod/vm/azure/azure.go index 3e0d8f507430..339405443295 100644 --- a/pkg/roachprod/vm/azure/azure.go +++ b/pkg/roachprod/vm/azure/azure.go @@ -159,6 +159,16 @@ func getAzureDefaultLabelMap(opts vm.CreateOpts) map[string]string { return m } +func (p *Provider) AddLabels(l *logger.Logger, vms vm.List, labels map[string]string) error { + l.Printf("adding labels to Azure VMs not yet supported") + return nil +} + +func (p *Provider) RemoveLabels(l *logger.Logger, vms vm.List, labels []string) error { + l.Printf("removing labels from Azure VMs not yet supported") + return nil +} + // Create implements vm.Provider. func (p *Provider) Create( l *logger.Logger, names []string, opts vm.CreateOpts, vmProviderOpts vm.ProviderOpts, diff --git a/pkg/roachprod/vm/flagstub/flagstub.go b/pkg/roachprod/vm/flagstub/flagstub.go index e79d50955eec..40a7bc7ddd62 100644 --- a/pkg/roachprod/vm/flagstub/flagstub.go +++ b/pkg/roachprod/vm/flagstub/flagstub.go @@ -74,6 +74,14 @@ func (p *provider) ConfigSSH(l *logger.Logger, zones []string) error { return nil } +func (p *provider) AddLabels(l *logger.Logger, vms vm.List, labels map[string]string) error { + return nil +} + +func (p *provider) RemoveLabels(l *logger.Logger, vms vm.List, labels []string) error { + return nil +} + // Create implements vm.Provider and returns Unimplemented. func (p *provider) Create( l *logger.Logger, names []string, opts vm.CreateOpts, providerOpts vm.ProviderOpts, diff --git a/pkg/roachprod/vm/gce/gcloud.go b/pkg/roachprod/vm/gce/gcloud.go index 61f59c3f2780..f4de859ff57a 100644 --- a/pkg/roachprod/vm/gce/gcloud.go +++ b/pkg/roachprod/vm/gce/gcloud.go @@ -883,6 +883,52 @@ func (p *Provider) ConfigSSH(l *logger.Logger, zones []string) error { return nil } +func (p *Provider) editLabels( + l *logger.Logger, vms vm.List, labels map[string]string, remove bool, +) error { + cmdArgs := []string{"compute", "instances"} + if remove { + cmdArgs = append(cmdArgs, "remove-labels") + } else { + cmdArgs = append(cmdArgs, "add-labels") + } + + tagArgs := make([]string, 0, len(labels)) + for key, value := range labels { + if remove { + tagArgs = append(tagArgs, key) + } else { + tagArgs = append(tagArgs, fmt.Sprintf("%s=%s", key, vm.SanitizeLabel(value))) + } + } + tagArgsString := strings.Join(tagArgs, ",") + commonArgs := []string{"--project", p.GetProject(), fmt.Sprintf("--labels=%s", tagArgsString)} + + for _, v := range vms { + vmArgs := append(cmdArgs, v.Name, "--zone", v.Zone) + vmArgs = append(vmArgs, commonArgs...) + //fmt.Printf("gcloud %s\n", strings.Join(vmArgs, " ")) + cmd := exec.Command("gcloud", vmArgs...) + if b, err := cmd.CombinedOutput(); err != nil { + return errors.Wrapf(err, "Command: gcloud %s\nOutput: %s", vmArgs, string(b)) + } + } + return nil +} + +// AddLabels adds the given labels to the given VMs. +func (p *Provider) AddLabels(l *logger.Logger, vms vm.List, labels map[string]string) error { + return p.editLabels(l, vms, labels, false /* remove */) +} + +func (p *Provider) RemoveLabels(l *logger.Logger, vms vm.List, labels []string) error { + labelsMap := make(map[string]string, len(labels)) + for _, label := range labels { + labelsMap[label] = "" + } + return p.editLabels(l, vms, labelsMap, true /* remove */) +} + // Create TODO(peter): document func (p *Provider) Create( l *logger.Logger, names []string, opts vm.CreateOpts, vmProviderOpts vm.ProviderOpts, @@ -1292,24 +1338,9 @@ func (p *Provider) Reset(l *logger.Logger, vms vm.List) error { // Extend TODO(peter): document func (p *Provider) Extend(l *logger.Logger, vms vm.List, lifetime time.Duration) error { - // The gcloud command only takes a single instance. Unlike Delete() above, we have to - // perform the iteration here. - for _, v := range vms { - args := []string{"compute", "instances", "add-labels"} - - args = append(args, "--project", v.Project) - args = append(args, "--zone", v.Zone) - args = append(args, "--labels", fmt.Sprintf("lifetime=%s", lifetime)) - args = append(args, v.Name) - - cmd := exec.Command("gcloud", args...) - - output, err := cmd.CombinedOutput() - if err != nil { - return errors.Wrapf(err, "Command: gcloud %s\nOutput: %s", args, output) - } - } - return nil + return p.AddLabels(l, vms, map[string]string{ + "lifetime": lifetime.String(), + }) } // FindActiveAccount TODO(peter): document diff --git a/pkg/roachprod/vm/local/local.go b/pkg/roachprod/vm/local/local.go index 7025894c276c..a006a0c5fb03 100644 --- a/pkg/roachprod/vm/local/local.go +++ b/pkg/roachprod/vm/local/local.go @@ -173,6 +173,14 @@ func (p *Provider) ConfigSSH(l *logger.Logger, zones []string) error { return nil } +func (p *Provider) AddLabels(l *logger.Logger, vms vm.List, labels map[string]string) error { + return nil +} + +func (p *Provider) RemoveLabels(l *logger.Logger, vms vm.List, labels []string) error { + return nil +} + // Create just creates fake host-info entries in the local filesystem func (p *Provider) Create( l *logger.Logger, names []string, opts vm.CreateOpts, unusedProviderOpts vm.ProviderOpts, diff --git a/pkg/roachprod/vm/vm.go b/pkg/roachprod/vm/vm.go index f42aa3a5737b..cb2b93d4a273 100644 --- a/pkg/roachprod/vm/vm.go +++ b/pkg/roachprod/vm/vm.go @@ -406,6 +406,9 @@ type Provider interface { FindActiveAccount(l *logger.Logger) (string, error) List(l *logger.Logger, opts ListOptions) (List, error) // The name of the Provider, which will also surface in the top-level Providers map. + + AddLabels(l *logger.Logger, vms List, labels map[string]string) error + RemoveLabels(l *logger.Logger, vms List, labels []string) error Name() string // Active returns true if the provider is properly installed and capable of @@ -643,3 +646,19 @@ func DNSSafeAccount(account string) string { } return strings.Map(safe, account) } + +func SanitizeLabel(label string) string { + // Replace any non-alphanumeric characters with hyphens + re := regexp.MustCompile("[^a-zA-Z0-9]+") + label = re.ReplaceAllString(label, "-") + + // Remove any leading or trailing hyphens + label = strings.Trim(label, "-") + + // Truncate the label to 63 characters (the maximum allowed by GCP) + if len(label) > 63 { + label = label[:63] + } + + return label +}