diff --git a/pkg/ccl/serverccl/admin_test.go b/pkg/ccl/serverccl/admin_test.go index ec929dcb5537..25bab7e51fdb 100644 --- a/pkg/ccl/serverccl/admin_test.go +++ b/pkg/ccl/serverccl/admin_test.go @@ -224,6 +224,12 @@ func TestTableAndDatabaseDetailsAndStats(t *testing.T) { require.Equal(t, dbResp.TableNames[0], "public.test") + var dbDetailsResp serverpb.DatabaseDetailsResponse + err = getAdminJSONProto(st, "databases/defaultdb?include_stats=true", &dbDetailsResp) + require.NoError(t, err) + + require.Greater(t, dbDetailsResp.Stats.RangeCount, int64(0)) + // TableStats tableStatsResp := &serverpb.TableStatsResponse{} err = getAdminJSONProto(st, "databases/defaultdb/tables/public.test/stats", tableStatsResp) diff --git a/pkg/cmd/roachtest/cluster.go b/pkg/cmd/roachtest/cluster.go index 43e9a27abb4e..11199ede3cac 100644 --- a/pkg/cmd/roachtest/cluster.go +++ b/pkg/cmd/roachtest/cluster.go @@ -615,6 +615,11 @@ func (r *clusterRegistry) registerCluster(c *clusterImpl) error { return fmt.Errorf("cluster named %q already exists in registry", c.name) } r.mu.clusters[c.name] = c + if err := c.addLabels(map[string]string{ + VmLabelTestRunID: runID, + }); err != nil && c.l != nil { + c.l.Printf("failed to add %s label to cluster: %s", VmLabelTestRunID, err) + } return nil } @@ -626,6 +631,9 @@ func (r *clusterRegistry) unregisterCluster(c *clusterImpl) bool { // method to be called defensively. return false } + if err := c.removeLabels([]string{VmLabelTestRunID}); err != nil && c.l != nil { + c.l.Printf("failed to remove %s label from cluster: %s", VmLabelTestRunID, err) + } delete(r.mu.clusters, c.name) if c.tag != "" { if _, ok := r.mu.tagCount[c.tag]; !ok { @@ -1870,6 +1878,14 @@ func (c *clusterImpl) doDestroy(ctx context.Context, l *logger.Logger) <-chan st return ch } +func (c *clusterImpl) addLabels(labels map[string]string) error { + return roachprod.AddLabels(c.l, c.name, labels) +} + +func (c *clusterImpl) removeLabels(labels []string) error { + return roachprod.RemoveLabels(c.l, c.name, labels) +} + func (c *clusterImpl) ListSnapshots( ctx context.Context, vslo vm.VolumeSnapshotListOpts, ) ([]vm.VolumeSnapshot, error) { diff --git a/pkg/cmd/roachtest/main.go b/pkg/cmd/roachtest/main.go index bebf18e13824..daeb652aeef4 100644 --- a/pkg/cmd/roachtest/main.go +++ b/pkg/cmd/roachtest/main.go @@ -279,6 +279,29 @@ Examples: listCmd.Flags().StringVar( &cloud, "cloud", cloud, "cloud provider to use (aws, azure, or gce)") + runFn := func(args []string, benchOnly bool) error { + if literalArtifacts == "" { + literalArtifacts = artifacts + } + return runTests(tests.RegisterTests, cliCfg{ + args: args, + count: count, + cpuQuota: cpuQuota, + runSkipped: runSkipped, + debugMode: debugModeFromOpts(), + skipInit: skipInit, + httpPort: httpPort, + promPort: promPort, + parallelism: parallelism, + artifactsDir: artifacts, + literalArtifactsDir: literalArtifacts, + user: getUser(username), + clusterID: clusterID, + versionsBinaryOverride: versionsBinaryOverride, + selectProbability: selectProbability, + }, benchOnly) + } + var runCmd = &cobra.Command{ // Don't display usage when tests fail. SilenceUsage: true, @@ -298,26 +321,7 @@ COCKROACH_ environment variables in the local environment are passed through to the cluster nodes on start. `, RunE: func(_ *cobra.Command, args []string) error { - if literalArtifacts == "" { - literalArtifacts = artifacts - } - return runTests(tests.RegisterTests, cliCfg{ - args: args, - count: count, - cpuQuota: cpuQuota, - runSkipped: runSkipped, - debugMode: debugModeFromOpts(), - skipInit: skipInit, - httpPort: httpPort, - promPort: promPort, - parallelism: parallelism, - artifactsDir: artifacts, - literalArtifactsDir: literalArtifacts, - user: username, - clusterID: clusterID, - versionsBinaryOverride: versionsBinaryOverride, - selectProbability: selectProbability, - }, false /* benchOnly */) + return runFn(args, false /* benchOnly */) }, } @@ -341,24 +345,7 @@ the cluster nodes on start. Short: "run automated benchmarks on cockroach cluster", Long: `Run automated benchmarks on existing or ephemeral cockroach clusters.`, RunE: func(_ *cobra.Command, args []string) error { - if literalArtifacts == "" { - literalArtifacts = artifacts - } - return runTests(tests.RegisterTests, cliCfg{ - args: args, - count: count, - cpuQuota: cpuQuota, - runSkipped: runSkipped, - debugMode: debugModeFromOpts(), - skipInit: skipInit, - httpPort: httpPort, - parallelism: parallelism, - artifactsDir: artifacts, - user: username, - clusterID: clusterID, - versionsBinaryOverride: versionsBinaryOverride, - selectProbability: selectProbability, - }, true /* benchOnly */) + return runFn(args, true /* benchOnly */) }, } @@ -498,10 +485,11 @@ func runTests(register func(registry.Registry), cfg cliCfg, benchOnly bool) erro opt := clustersOpt{ typ: clusterType, clusterName: clusterName, - user: getUser(cfg.user), - cpuQuota: cfg.cpuQuota, - debugMode: cfg.debugMode, - clusterID: cfg.clusterID, + // Precedence for resolving the user: cli arg, env.ROACHPROD_USER, current user. + user: cfg.user, + cpuQuota: cfg.cpuQuota, + debugMode: cfg.debugMode, + clusterID: cfg.clusterID, } if err := runner.runHTTPServer(cfg.httpPort, os.Stdout, bindTo); err != nil { return err diff --git a/pkg/cmd/roachtest/test_runner.go b/pkg/cmd/roachtest/test_runner.go index c7e87abcbf7f..05f27f15997a 100644 --- a/pkg/cmd/roachtest/test_runner.go +++ b/pkg/cmd/roachtest/test_runner.go @@ -70,8 +70,16 @@ var ( prometheusScrapeInterval = time.Second * 15 prng, _ = randutil.NewLockedPseudoRand() + + runID string ) +// VmLabelTestName is the label used to identify the test name in the VM metadata +const VmLabelTestName string = "test_name" + +// VmLabelTestRunID is the label used to identify the test run id in the VM metadata +const VmLabelTestRunID string = "test_run_id" + // testRunner runs tests. type testRunner struct { stopper *stop.Stopper @@ -296,6 +304,8 @@ func (r *testRunner) Run( qp := quotapool.NewIntPool("cloud cpu", uint64(clustersOpt.cpuQuota)) l := lopt.l + runID = generateRunID(clustersOpt.user) + shout(ctx, l, lopt.stdout, "%s: %s", VmLabelTestRunID, runID) var wg sync.WaitGroup for i := 0; i < parallelism; i++ { @@ -386,6 +396,16 @@ func numConcurrentClusterCreations() int { return res } +// This will be added as a label to all cluster nodes when the +// cluster is registered. +func generateRunID(user string) string { + uniqueId := os.Getenv("TC_BUILD_ID") + if uniqueId == "" { + uniqueId = fmt.Sprintf("%d", timeutil.Now().Unix()) + } + return fmt.Sprintf("%s-%s", user, uniqueId) +} + // defaultClusterAllocator is used by workers to create new clusters (or to attach // to an existing one). // @@ -911,7 +931,11 @@ func (r *testRunner) runTest( t.runnerID = goid.Get() s := t.Spec().(*registry.TestSpec) + _ = c.addLabels(map[string]string{ + VmLabelTestName: s.Name, + }) defer func() { + _ = c.removeLabels([]string{VmLabelTestName}) t.end = timeutil.Now() // We only have to record panics if the panic'd value is not the sentinel diff --git a/pkg/roachprod/roachprod.go b/pkg/roachprod/roachprod.go index 5682bcc3b005..5404804bb382 100644 --- a/pkg/roachprod/roachprod.go +++ b/pkg/roachprod/roachprod.go @@ -1290,6 +1290,34 @@ func cleanupFailedCreate(l *logger.Logger, clusterName string) error { return cloud.DestroyCluster(l, c) } +func AddLabels(l *logger.Logger, clusterName string, labels map[string]string) error { + if err := LoadClusters(); err != nil { + return err + } + c, err := newCluster(l, clusterName) + if err != nil { + return err + } + + return vm.FanOut(c.VMs, func(p vm.Provider, vms vm.List) error { + return p.AddLabels(l, vms, labels) + }) +} + +func RemoveLabels(l *logger.Logger, clusterName string, labels []string) error { + if err := LoadClusters(); err != nil { + return err + } + c, err := newCluster(l, clusterName) + if err != nil { + return err + } + + return vm.FanOut(c.VMs, func(p vm.Provider, vms vm.List) error { + return p.RemoveLabels(l, vms, labels) + }) +} + // Create TODO func Create( ctx context.Context, diff --git a/pkg/roachprod/vm/aws/aws.go b/pkg/roachprod/vm/aws/aws.go index 93867e74e1de..20bfb1aec427 100644 --- a/pkg/roachprod/vm/aws/aws.go +++ b/pkg/roachprod/vm/aws/aws.go @@ -421,6 +421,64 @@ func (p *Provider) ConfigSSH(l *logger.Logger, zones []string) error { return g.Wait() } +// editLabels is a helper that adds or removes labels from the given VMs. +func (p *Provider) editLabels( + l *logger.Logger, vms vm.List, labels map[string]string, remove bool, +) error { + args := []string{"ec2"} + if remove { + args = append(args, "delete-tags") + } else { + args = append(args, "create-tags") + } + + args = append(args, "--tags") + tagArgs := make([]string, 0, len(labels)) + for key, value := range labels { + if remove { + tagArgs = append(tagArgs, fmt.Sprintf("Key=%s", key)) + } else { + tagArgs = append(tagArgs, fmt.Sprintf("Key=%s,Value=%s", key, vm.SanitizeLabel(value))) + } + } + args = append(args, tagArgs...) + + byRegion, err := regionMap(vms) + if err != nil { + return err + } + g := errgroup.Group{} + for region, list := range byRegion { + // Capture loop vars here + regionArgs := make([]string, len(args)) + copy(regionArgs, args) + + regionArgs = append(regionArgs, "--region", region) + regionArgs = append(regionArgs, "--resources") + regionArgs = append(regionArgs, list.ProviderIDs()...) + + g.Go(func() error { + _, err := p.runCommand(l, regionArgs) + return err + }) + } + return g.Wait() +} + +// AddLabels adds the given labels to the given VMs. +func (p *Provider) AddLabels(l *logger.Logger, vms vm.List, labels map[string]string) error { + return p.editLabels(l, vms, labels, false) +} + +// RemoveLabels removes the given labels from the given VMs. +func (p *Provider) RemoveLabels(l *logger.Logger, vms vm.List, labels []string) error { + labelMap := make(map[string]string, len(labels)) + for _, label := range labels { + labelMap[label] = "" + } + return p.editLabels(l, vms, labelMap, true) +} + // Create is part of the vm.Provider interface. func (p *Provider) Create( l *logger.Logger, names []string, opts vm.CreateOpts, vmProviderOpts vm.ProviderOpts, @@ -594,27 +652,9 @@ func (p *Provider) Reset(l *logger.Logger, vms vm.List) error { // Extend is part of the vm.Provider interface. // This will update the Lifetime tag on the instances. func (p *Provider) Extend(l *logger.Logger, vms vm.List, lifetime time.Duration) error { - byRegion, err := regionMap(vms) - if err != nil { - return err - } - g := errgroup.Group{} - for region, list := range byRegion { - // Capture loop vars here - args := []string{ - "ec2", "create-tags", - "--region", region, - "--tags", "Key=Lifetime,Value=" + lifetime.String(), - "--resources", - } - args = append(args, list.ProviderIDs()...) - - g.Go(func() error { - _, err := p.runCommand(l, args) - return err - }) - } - return g.Wait() + return p.AddLabels(l, vms, map[string]string{ + "Lifetime": lifetime.String(), + }) } // cachedActiveAccount memoizes the return value from FindActiveAccount diff --git a/pkg/roachprod/vm/azure/azure.go b/pkg/roachprod/vm/azure/azure.go index 3e0d8f507430..339405443295 100644 --- a/pkg/roachprod/vm/azure/azure.go +++ b/pkg/roachprod/vm/azure/azure.go @@ -159,6 +159,16 @@ func getAzureDefaultLabelMap(opts vm.CreateOpts) map[string]string { return m } +func (p *Provider) AddLabels(l *logger.Logger, vms vm.List, labels map[string]string) error { + l.Printf("adding labels to Azure VMs not yet supported") + return nil +} + +func (p *Provider) RemoveLabels(l *logger.Logger, vms vm.List, labels []string) error { + l.Printf("removing labels from Azure VMs not yet supported") + return nil +} + // Create implements vm.Provider. func (p *Provider) Create( l *logger.Logger, names []string, opts vm.CreateOpts, vmProviderOpts vm.ProviderOpts, diff --git a/pkg/roachprod/vm/flagstub/flagstub.go b/pkg/roachprod/vm/flagstub/flagstub.go index e79d50955eec..40a7bc7ddd62 100644 --- a/pkg/roachprod/vm/flagstub/flagstub.go +++ b/pkg/roachprod/vm/flagstub/flagstub.go @@ -74,6 +74,14 @@ func (p *provider) ConfigSSH(l *logger.Logger, zones []string) error { return nil } +func (p *provider) AddLabels(l *logger.Logger, vms vm.List, labels map[string]string) error { + return nil +} + +func (p *provider) RemoveLabels(l *logger.Logger, vms vm.List, labels []string) error { + return nil +} + // Create implements vm.Provider and returns Unimplemented. func (p *provider) Create( l *logger.Logger, names []string, opts vm.CreateOpts, providerOpts vm.ProviderOpts, diff --git a/pkg/roachprod/vm/gce/gcloud.go b/pkg/roachprod/vm/gce/gcloud.go index 5a2cb04063bc..ea2dea4a1c2f 100644 --- a/pkg/roachprod/vm/gce/gcloud.go +++ b/pkg/roachprod/vm/gce/gcloud.go @@ -883,6 +883,54 @@ func (p *Provider) ConfigSSH(l *logger.Logger, zones []string) error { return nil } +func (p *Provider) editLabels( + l *logger.Logger, vms vm.List, labels map[string]string, remove bool, +) error { + cmdArgs := []string{"compute", "instances"} + if remove { + cmdArgs = append(cmdArgs, "remove-labels") + } else { + cmdArgs = append(cmdArgs, "add-labels") + } + + tagArgs := make([]string, 0, len(labels)) + for key, value := range labels { + if remove { + tagArgs = append(tagArgs, key) + } else { + tagArgs = append(tagArgs, fmt.Sprintf("%s=%s", key, vm.SanitizeLabel(value))) + } + } + tagArgsString := strings.Join(tagArgs, ",") + commonArgs := []string{"--project", p.GetProject(), fmt.Sprintf("--labels=%s", tagArgsString)} + + for _, v := range vms { + vmArgs := make([]string, len(cmdArgs)) + copy(vmArgs, cmdArgs) + + vmArgs = append(vmArgs, v.Name, "--zone", v.Zone) + vmArgs = append(vmArgs, commonArgs...) + cmd := exec.Command("gcloud", vmArgs...) + if b, err := cmd.CombinedOutput(); err != nil { + return errors.Wrapf(err, "Command: gcloud %s\nOutput: %s", vmArgs, string(b)) + } + } + return nil +} + +// AddLabels adds the given labels to the given VMs. +func (p *Provider) AddLabels(l *logger.Logger, vms vm.List, labels map[string]string) error { + return p.editLabels(l, vms, labels, false /* remove */) +} + +func (p *Provider) RemoveLabels(l *logger.Logger, vms vm.List, labels []string) error { + labelsMap := make(map[string]string, len(labels)) + for _, label := range labels { + labelsMap[label] = "" + } + return p.editLabels(l, vms, labelsMap, true /* remove */) +} + // Create TODO(peter): document func (p *Provider) Create( l *logger.Logger, names []string, opts vm.CreateOpts, vmProviderOpts vm.ProviderOpts, @@ -1295,24 +1343,9 @@ func (p *Provider) Reset(l *logger.Logger, vms vm.List) error { // Extend TODO(peter): document func (p *Provider) Extend(l *logger.Logger, vms vm.List, lifetime time.Duration) error { - // The gcloud command only takes a single instance. Unlike Delete() above, we have to - // perform the iteration here. - for _, v := range vms { - args := []string{"compute", "instances", "add-labels"} - - args = append(args, "--project", v.Project) - args = append(args, "--zone", v.Zone) - args = append(args, "--labels", fmt.Sprintf("lifetime=%s", lifetime)) - args = append(args, v.Name) - - cmd := exec.Command("gcloud", args...) - - output, err := cmd.CombinedOutput() - if err != nil { - return errors.Wrapf(err, "Command: gcloud %s\nOutput: %s", args, output) - } - } - return nil + return p.AddLabels(l, vms, map[string]string{ + "lifetime": lifetime.String(), + }) } // FindActiveAccount TODO(peter): document diff --git a/pkg/roachprod/vm/local/local.go b/pkg/roachprod/vm/local/local.go index 7025894c276c..a006a0c5fb03 100644 --- a/pkg/roachprod/vm/local/local.go +++ b/pkg/roachprod/vm/local/local.go @@ -173,6 +173,14 @@ func (p *Provider) ConfigSSH(l *logger.Logger, zones []string) error { return nil } +func (p *Provider) AddLabels(l *logger.Logger, vms vm.List, labels map[string]string) error { + return nil +} + +func (p *Provider) RemoveLabels(l *logger.Logger, vms vm.List, labels []string) error { + return nil +} + // Create just creates fake host-info entries in the local filesystem func (p *Provider) Create( l *logger.Logger, names []string, opts vm.CreateOpts, unusedProviderOpts vm.ProviderOpts, diff --git a/pkg/roachprod/vm/vm.go b/pkg/roachprod/vm/vm.go index f42aa3a5737b..be04d5ebed2e 100644 --- a/pkg/roachprod/vm/vm.go +++ b/pkg/roachprod/vm/vm.go @@ -406,6 +406,9 @@ type Provider interface { FindActiveAccount(l *logger.Logger) (string, error) List(l *logger.Logger, opts ListOptions) (List, error) // The name of the Provider, which will also surface in the top-level Providers map. + + AddLabels(l *logger.Logger, vms List, labels map[string]string) error + RemoveLabels(l *logger.Logger, vms List, labels []string) error Name() string // Active returns true if the provider is properly installed and capable of @@ -643,3 +646,20 @@ func DNSSafeAccount(account string) string { } return strings.Map(safe, account) } + +// SanitizeLabel returns a version of the string that can be used as a label. +func SanitizeLabel(label string) string { + // Replace any non-alphanumeric characters with hyphens + re := regexp.MustCompile("[^a-zA-Z0-9]+") + label = re.ReplaceAllString(label, "-") + + // Remove any leading or trailing hyphens + label = strings.Trim(label, "-") + + // Truncate the label to 63 characters (the maximum allowed by GCP) + if len(label) > 63 { + label = label[:63] + } + + return label +} diff --git a/pkg/server/admin.go b/pkg/server/admin.go index df7dd7ccf848..3172f3d75200 100644 --- a/pkg/server/admin.go +++ b/pkg/server/admin.go @@ -67,6 +67,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/mon" "github.com/cockroachdb/cockroach/pkg/util/protoutil" "github.com/cockroachdb/cockroach/pkg/util/quotapool" + "github.com/cockroachdb/cockroach/pkg/util/rangedesc" "github.com/cockroachdb/cockroach/pkg/util/safesql" "github.com/cockroachdb/cockroach/pkg/util/stop" "github.com/cockroachdb/cockroach/pkg/util/timeutil" @@ -1328,16 +1329,9 @@ func (s *adminServer) statsForSpan( ctx, cancel := context.WithCancel(ctx) defer cancel() - rSpan, err := keys.SpanAddr(span) - if err != nil { - return nil, err - } - // Get a list of nodeIDs, range counts, and replica counts per node // for the specified span. - nodeIDs, rangeCount, replCounts, err := getNodeIDsRangeCountReplCountForSpan( - ctx, s.distSender, rSpan, - ) + nodeIDs, rangeCount, replCounts, err := s.getSpanDetails(ctx, span) if err != nil { return nil, err } @@ -1450,25 +1444,29 @@ func (s *adminServer) statsForSpan( // Returns the list of node ids, range count, // and replica count for the specified span. -func getNodeIDsRangeCountReplCountForSpan( - ctx context.Context, ds *kvcoord.DistSender, rSpan roachpb.RSpan, +func (s *adminServer) getSpanDetails( + ctx context.Context, span roachpb.Span, ) (nodeIDList []roachpb.NodeID, rangeCount int64, replCounts map[roachpb.NodeID]int64, _ error) { nodeIDs := make(map[roachpb.NodeID]struct{}) replCountForNodeID := make(map[roachpb.NodeID]int64) - ri := kvcoord.MakeRangeIterator(ds) - ri.Seek(ctx, rSpan.Key, kvcoord.Ascending) - for ; ri.Valid(); ri.Next(ctx) { + var it rangedesc.Iterator + var err error + if s.sqlServer.tenantConnect == nil { + it, err = s.sqlServer.execCfg.RangeDescIteratorFactory.NewIterator(ctx, span) + } else { + it, err = s.sqlServer.tenantConnect.NewIterator(ctx, span) + } + if err != nil { + return nil, 0, nil, err + } + var rangeDesc roachpb.RangeDescriptor + for ; it.Valid(); it.Next() { rangeCount++ - for _, repl := range ri.Desc().Replicas().Descriptors() { + rangeDesc = it.CurRangeDescriptor() + for _, repl := range rangeDesc.Replicas().Descriptors() { replCountForNodeID[repl.NodeID]++ nodeIDs[repl.NodeID] = struct{}{} } - if !ri.NeedAnother(rSpan) { - break - } - } - if err := ri.Error(); err != nil { - return nil, 0, nil, err } nodeIDList = make([]roachpb.NodeID, 0, len(nodeIDs)) diff --git a/pkg/upgrade/upgrades/role_members_ids_migration_test.go b/pkg/upgrade/upgrades/role_members_ids_migration_test.go index adc8c796e7bd..217cc21067fe 100644 --- a/pkg/upgrade/upgrades/role_members_ids_migration_test.go +++ b/pkg/upgrade/upgrades/role_members_ids_migration_test.go @@ -36,10 +36,12 @@ func TestRoleMembersIDMigration10Users(t *testing.T) { runTestRoleMembersIDMigration(t, 10) } -func TestRoleMembersIDMigration1500Users(t *testing.T) { +func TestRoleMembersIDMigration1200Users(t *testing.T) { skip.UnderRace(t) skip.UnderStress(t) - runTestRoleMembersIDMigration(t, 1500) + // Choosing a number larger than 1000 tests that the batching logic in + // this upgrade works correctly. + runTestRoleMembersIDMigration(t, 1200) } func runTestRoleMembersIDMigration(t *testing.T, numUsers int) {