From 5a9b75e0d41dbf228e331e8a1fc0299963245641 Mon Sep 17 00:00:00 2001 From: Miral Gadani Date: Tue, 1 Aug 2023 18:36:13 +0000 Subject: [PATCH 1/4] roachtest: roachprod: add test name labels to vms for prometheus This commit will add a `test_name` label to each VM when a particular roachtest is about to be executed on the cluster, with the label being removed at the end of the roachtest. The `test_name` label is being scraped by Prometheus to allow filtering of dashboards based on the roachtest name. GCE labelling rules mean that test names are sanitised to match `[a-zA-Z-]`. Epic: none Fixes: #98658 Release note: None --- pkg/cmd/roachtest/cluster.go | 8 +++ pkg/cmd/roachtest/test_runner.go | 4 ++ pkg/roachprod/roachprod.go | 28 +++++++++ pkg/roachprod/vm/aws/aws.go | 82 ++++++++++++++++++++------- pkg/roachprod/vm/azure/azure.go | 10 ++++ pkg/roachprod/vm/flagstub/flagstub.go | 8 +++ pkg/roachprod/vm/gce/gcloud.go | 69 ++++++++++++++++------ pkg/roachprod/vm/local/local.go | 8 +++ pkg/roachprod/vm/vm.go | 19 +++++++ 9 files changed, 197 insertions(+), 39 deletions(-) diff --git a/pkg/cmd/roachtest/cluster.go b/pkg/cmd/roachtest/cluster.go index 86a75fce72b9..297c453de58f 100644 --- a/pkg/cmd/roachtest/cluster.go +++ b/pkg/cmd/roachtest/cluster.go @@ -1872,6 +1872,14 @@ func (c *clusterImpl) doDestroy(ctx context.Context, l *logger.Logger) <-chan st return ch } +func (c *clusterImpl) addLabels(labels map[string]string) error { + return roachprod.AddLabels(c.l, c.name, labels) +} + +func (c *clusterImpl) removeLabels(labels []string) error { + return roachprod.RemoveLabels(c.l, c.name, labels) +} + func (c *clusterImpl) ListSnapshots( ctx context.Context, vslo vm.VolumeSnapshotListOpts, ) ([]vm.VolumeSnapshot, error) { diff --git a/pkg/cmd/roachtest/test_runner.go b/pkg/cmd/roachtest/test_runner.go index c7e87abcbf7f..32d2d2d46dd2 100644 --- a/pkg/cmd/roachtest/test_runner.go +++ b/pkg/cmd/roachtest/test_runner.go @@ -911,7 +911,11 @@ func (r *testRunner) runTest( t.runnerID = goid.Get() s := t.Spec().(*registry.TestSpec) + _ = c.addLabels(map[string]string{ + "test_name": s.Name, + }) defer func() { + _ = c.removeLabels([]string{"test_name"}) t.end = timeutil.Now() // We only have to record panics if the panic'd value is not the sentinel diff --git a/pkg/roachprod/roachprod.go b/pkg/roachprod/roachprod.go index 5682bcc3b005..5404804bb382 100644 --- a/pkg/roachprod/roachprod.go +++ b/pkg/roachprod/roachprod.go @@ -1290,6 +1290,34 @@ func cleanupFailedCreate(l *logger.Logger, clusterName string) error { return cloud.DestroyCluster(l, c) } +func AddLabels(l *logger.Logger, clusterName string, labels map[string]string) error { + if err := LoadClusters(); err != nil { + return err + } + c, err := newCluster(l, clusterName) + if err != nil { + return err + } + + return vm.FanOut(c.VMs, func(p vm.Provider, vms vm.List) error { + return p.AddLabels(l, vms, labels) + }) +} + +func RemoveLabels(l *logger.Logger, clusterName string, labels []string) error { + if err := LoadClusters(); err != nil { + return err + } + c, err := newCluster(l, clusterName) + if err != nil { + return err + } + + return vm.FanOut(c.VMs, func(p vm.Provider, vms vm.List) error { + return p.RemoveLabels(l, vms, labels) + }) +} + // Create TODO func Create( ctx context.Context, diff --git a/pkg/roachprod/vm/aws/aws.go b/pkg/roachprod/vm/aws/aws.go index 93867e74e1de..20bfb1aec427 100644 --- a/pkg/roachprod/vm/aws/aws.go +++ b/pkg/roachprod/vm/aws/aws.go @@ -421,6 +421,64 @@ func (p *Provider) ConfigSSH(l *logger.Logger, zones []string) error { return g.Wait() } +// editLabels is a helper that adds or removes labels from the given VMs. +func (p *Provider) editLabels( + l *logger.Logger, vms vm.List, labels map[string]string, remove bool, +) error { + args := []string{"ec2"} + if remove { + args = append(args, "delete-tags") + } else { + args = append(args, "create-tags") + } + + args = append(args, "--tags") + tagArgs := make([]string, 0, len(labels)) + for key, value := range labels { + if remove { + tagArgs = append(tagArgs, fmt.Sprintf("Key=%s", key)) + } else { + tagArgs = append(tagArgs, fmt.Sprintf("Key=%s,Value=%s", key, vm.SanitizeLabel(value))) + } + } + args = append(args, tagArgs...) + + byRegion, err := regionMap(vms) + if err != nil { + return err + } + g := errgroup.Group{} + for region, list := range byRegion { + // Capture loop vars here + regionArgs := make([]string, len(args)) + copy(regionArgs, args) + + regionArgs = append(regionArgs, "--region", region) + regionArgs = append(regionArgs, "--resources") + regionArgs = append(regionArgs, list.ProviderIDs()...) + + g.Go(func() error { + _, err := p.runCommand(l, regionArgs) + return err + }) + } + return g.Wait() +} + +// AddLabels adds the given labels to the given VMs. +func (p *Provider) AddLabels(l *logger.Logger, vms vm.List, labels map[string]string) error { + return p.editLabels(l, vms, labels, false) +} + +// RemoveLabels removes the given labels from the given VMs. +func (p *Provider) RemoveLabels(l *logger.Logger, vms vm.List, labels []string) error { + labelMap := make(map[string]string, len(labels)) + for _, label := range labels { + labelMap[label] = "" + } + return p.editLabels(l, vms, labelMap, true) +} + // Create is part of the vm.Provider interface. func (p *Provider) Create( l *logger.Logger, names []string, opts vm.CreateOpts, vmProviderOpts vm.ProviderOpts, @@ -594,27 +652,9 @@ func (p *Provider) Reset(l *logger.Logger, vms vm.List) error { // Extend is part of the vm.Provider interface. // This will update the Lifetime tag on the instances. func (p *Provider) Extend(l *logger.Logger, vms vm.List, lifetime time.Duration) error { - byRegion, err := regionMap(vms) - if err != nil { - return err - } - g := errgroup.Group{} - for region, list := range byRegion { - // Capture loop vars here - args := []string{ - "ec2", "create-tags", - "--region", region, - "--tags", "Key=Lifetime,Value=" + lifetime.String(), - "--resources", - } - args = append(args, list.ProviderIDs()...) - - g.Go(func() error { - _, err := p.runCommand(l, args) - return err - }) - } - return g.Wait() + return p.AddLabels(l, vms, map[string]string{ + "Lifetime": lifetime.String(), + }) } // cachedActiveAccount memoizes the return value from FindActiveAccount diff --git a/pkg/roachprod/vm/azure/azure.go b/pkg/roachprod/vm/azure/azure.go index 3e0d8f507430..339405443295 100644 --- a/pkg/roachprod/vm/azure/azure.go +++ b/pkg/roachprod/vm/azure/azure.go @@ -159,6 +159,16 @@ func getAzureDefaultLabelMap(opts vm.CreateOpts) map[string]string { return m } +func (p *Provider) AddLabels(l *logger.Logger, vms vm.List, labels map[string]string) error { + l.Printf("adding labels to Azure VMs not yet supported") + return nil +} + +func (p *Provider) RemoveLabels(l *logger.Logger, vms vm.List, labels []string) error { + l.Printf("removing labels from Azure VMs not yet supported") + return nil +} + // Create implements vm.Provider. func (p *Provider) Create( l *logger.Logger, names []string, opts vm.CreateOpts, vmProviderOpts vm.ProviderOpts, diff --git a/pkg/roachprod/vm/flagstub/flagstub.go b/pkg/roachprod/vm/flagstub/flagstub.go index e79d50955eec..40a7bc7ddd62 100644 --- a/pkg/roachprod/vm/flagstub/flagstub.go +++ b/pkg/roachprod/vm/flagstub/flagstub.go @@ -74,6 +74,14 @@ func (p *provider) ConfigSSH(l *logger.Logger, zones []string) error { return nil } +func (p *provider) AddLabels(l *logger.Logger, vms vm.List, labels map[string]string) error { + return nil +} + +func (p *provider) RemoveLabels(l *logger.Logger, vms vm.List, labels []string) error { + return nil +} + // Create implements vm.Provider and returns Unimplemented. func (p *provider) Create( l *logger.Logger, names []string, opts vm.CreateOpts, providerOpts vm.ProviderOpts, diff --git a/pkg/roachprod/vm/gce/gcloud.go b/pkg/roachprod/vm/gce/gcloud.go index 61f59c3f2780..7847a637ee4a 100644 --- a/pkg/roachprod/vm/gce/gcloud.go +++ b/pkg/roachprod/vm/gce/gcloud.go @@ -883,6 +883,54 @@ func (p *Provider) ConfigSSH(l *logger.Logger, zones []string) error { return nil } +func (p *Provider) editLabels( + l *logger.Logger, vms vm.List, labels map[string]string, remove bool, +) error { + cmdArgs := []string{"compute", "instances"} + if remove { + cmdArgs = append(cmdArgs, "remove-labels") + } else { + cmdArgs = append(cmdArgs, "add-labels") + } + + tagArgs := make([]string, 0, len(labels)) + for key, value := range labels { + if remove { + tagArgs = append(tagArgs, key) + } else { + tagArgs = append(tagArgs, fmt.Sprintf("%s=%s", key, vm.SanitizeLabel(value))) + } + } + tagArgsString := strings.Join(tagArgs, ",") + commonArgs := []string{"--project", p.GetProject(), fmt.Sprintf("--labels=%s", tagArgsString)} + + for _, v := range vms { + vmArgs := make([]string, len(cmdArgs)) + copy(vmArgs, cmdArgs) + + vmArgs = append(vmArgs, v.Name, "--zone", v.Zone) + vmArgs = append(vmArgs, commonArgs...) + cmd := exec.Command("gcloud", vmArgs...) + if b, err := cmd.CombinedOutput(); err != nil { + return errors.Wrapf(err, "Command: gcloud %s\nOutput: %s", vmArgs, string(b)) + } + } + return nil +} + +// AddLabels adds the given labels to the given VMs. +func (p *Provider) AddLabels(l *logger.Logger, vms vm.List, labels map[string]string) error { + return p.editLabels(l, vms, labels, false /* remove */) +} + +func (p *Provider) RemoveLabels(l *logger.Logger, vms vm.List, labels []string) error { + labelsMap := make(map[string]string, len(labels)) + for _, label := range labels { + labelsMap[label] = "" + } + return p.editLabels(l, vms, labelsMap, true /* remove */) +} + // Create TODO(peter): document func (p *Provider) Create( l *logger.Logger, names []string, opts vm.CreateOpts, vmProviderOpts vm.ProviderOpts, @@ -1292,24 +1340,9 @@ func (p *Provider) Reset(l *logger.Logger, vms vm.List) error { // Extend TODO(peter): document func (p *Provider) Extend(l *logger.Logger, vms vm.List, lifetime time.Duration) error { - // The gcloud command only takes a single instance. Unlike Delete() above, we have to - // perform the iteration here. - for _, v := range vms { - args := []string{"compute", "instances", "add-labels"} - - args = append(args, "--project", v.Project) - args = append(args, "--zone", v.Zone) - args = append(args, "--labels", fmt.Sprintf("lifetime=%s", lifetime)) - args = append(args, v.Name) - - cmd := exec.Command("gcloud", args...) - - output, err := cmd.CombinedOutput() - if err != nil { - return errors.Wrapf(err, "Command: gcloud %s\nOutput: %s", args, output) - } - } - return nil + return p.AddLabels(l, vms, map[string]string{ + "lifetime": lifetime.String(), + }) } // FindActiveAccount TODO(peter): document diff --git a/pkg/roachprod/vm/local/local.go b/pkg/roachprod/vm/local/local.go index 7025894c276c..a006a0c5fb03 100644 --- a/pkg/roachprod/vm/local/local.go +++ b/pkg/roachprod/vm/local/local.go @@ -173,6 +173,14 @@ func (p *Provider) ConfigSSH(l *logger.Logger, zones []string) error { return nil } +func (p *Provider) AddLabels(l *logger.Logger, vms vm.List, labels map[string]string) error { + return nil +} + +func (p *Provider) RemoveLabels(l *logger.Logger, vms vm.List, labels []string) error { + return nil +} + // Create just creates fake host-info entries in the local filesystem func (p *Provider) Create( l *logger.Logger, names []string, opts vm.CreateOpts, unusedProviderOpts vm.ProviderOpts, diff --git a/pkg/roachprod/vm/vm.go b/pkg/roachprod/vm/vm.go index f42aa3a5737b..cb2b93d4a273 100644 --- a/pkg/roachprod/vm/vm.go +++ b/pkg/roachprod/vm/vm.go @@ -406,6 +406,9 @@ type Provider interface { FindActiveAccount(l *logger.Logger) (string, error) List(l *logger.Logger, opts ListOptions) (List, error) // The name of the Provider, which will also surface in the top-level Providers map. + + AddLabels(l *logger.Logger, vms List, labels map[string]string) error + RemoveLabels(l *logger.Logger, vms List, labels []string) error Name() string // Active returns true if the provider is properly installed and capable of @@ -643,3 +646,19 @@ func DNSSafeAccount(account string) string { } return strings.Map(safe, account) } + +func SanitizeLabel(label string) string { + // Replace any non-alphanumeric characters with hyphens + re := regexp.MustCompile("[^a-zA-Z0-9]+") + label = re.ReplaceAllString(label, "-") + + // Remove any leading or trailing hyphens + label = strings.Trim(label, "-") + + // Truncate the label to 63 characters (the maximum allowed by GCP) + if len(label) > 63 { + label = label[:63] + } + + return label +} From 45b20bc95b01bf27be07f1769d1110b072d44b04 Mon Sep 17 00:00:00 2001 From: Miral Gadani Date: Wed, 2 Aug 2023 19:43:37 +0000 Subject: [PATCH 2/4] roachtest: set `test_run_id` label which will be scraped for metrics The `test_run_id` will be unique per invocation of `roachtest`. It's purpose is to simplify finding metrics for a given run. TeamCity invoked roachtests, such as GCE Roachtest Nightly, will have a `test_run_id` in the form `-` to make it easy to find metrics for a particular build. Roachtests run by individual users will have a `test_run_id` in the form `-`. Epic: none Release note: None --- pkg/cmd/roachtest/cluster.go | 8 ++++ pkg/cmd/roachtest/main.go | 72 +++++++++++++------------------- pkg/cmd/roachtest/test_runner.go | 24 ++++++++++- pkg/roachprod/vm/vm.go | 1 + 4 files changed, 61 insertions(+), 44 deletions(-) diff --git a/pkg/cmd/roachtest/cluster.go b/pkg/cmd/roachtest/cluster.go index 297c453de58f..c3f8b22130ed 100644 --- a/pkg/cmd/roachtest/cluster.go +++ b/pkg/cmd/roachtest/cluster.go @@ -615,6 +615,11 @@ func (r *clusterRegistry) registerCluster(c *clusterImpl) error { return fmt.Errorf("cluster named %q already exists in registry", c.name) } r.mu.clusters[c.name] = c + if err := c.addLabels(map[string]string{ + VmLabelTestRunID: runID, + }); err != nil && c.l != nil { + c.l.Printf("failed to add %s label to cluster: %s", VmLabelTestRunID, err) + } return nil } @@ -626,6 +631,9 @@ func (r *clusterRegistry) unregisterCluster(c *clusterImpl) bool { // method to be called defensively. return false } + if err := c.removeLabels([]string{VmLabelTestRunID}); err != nil && c.l != nil { + c.l.Printf("failed to remove %s label from cluster: %s", VmLabelTestRunID, err) + } delete(r.mu.clusters, c.name) if c.tag != "" { if _, ok := r.mu.tagCount[c.tag]; !ok { diff --git a/pkg/cmd/roachtest/main.go b/pkg/cmd/roachtest/main.go index bebf18e13824..daeb652aeef4 100644 --- a/pkg/cmd/roachtest/main.go +++ b/pkg/cmd/roachtest/main.go @@ -279,6 +279,29 @@ Examples: listCmd.Flags().StringVar( &cloud, "cloud", cloud, "cloud provider to use (aws, azure, or gce)") + runFn := func(args []string, benchOnly bool) error { + if literalArtifacts == "" { + literalArtifacts = artifacts + } + return runTests(tests.RegisterTests, cliCfg{ + args: args, + count: count, + cpuQuota: cpuQuota, + runSkipped: runSkipped, + debugMode: debugModeFromOpts(), + skipInit: skipInit, + httpPort: httpPort, + promPort: promPort, + parallelism: parallelism, + artifactsDir: artifacts, + literalArtifactsDir: literalArtifacts, + user: getUser(username), + clusterID: clusterID, + versionsBinaryOverride: versionsBinaryOverride, + selectProbability: selectProbability, + }, benchOnly) + } + var runCmd = &cobra.Command{ // Don't display usage when tests fail. SilenceUsage: true, @@ -298,26 +321,7 @@ COCKROACH_ environment variables in the local environment are passed through to the cluster nodes on start. `, RunE: func(_ *cobra.Command, args []string) error { - if literalArtifacts == "" { - literalArtifacts = artifacts - } - return runTests(tests.RegisterTests, cliCfg{ - args: args, - count: count, - cpuQuota: cpuQuota, - runSkipped: runSkipped, - debugMode: debugModeFromOpts(), - skipInit: skipInit, - httpPort: httpPort, - promPort: promPort, - parallelism: parallelism, - artifactsDir: artifacts, - literalArtifactsDir: literalArtifacts, - user: username, - clusterID: clusterID, - versionsBinaryOverride: versionsBinaryOverride, - selectProbability: selectProbability, - }, false /* benchOnly */) + return runFn(args, false /* benchOnly */) }, } @@ -341,24 +345,7 @@ the cluster nodes on start. Short: "run automated benchmarks on cockroach cluster", Long: `Run automated benchmarks on existing or ephemeral cockroach clusters.`, RunE: func(_ *cobra.Command, args []string) error { - if literalArtifacts == "" { - literalArtifacts = artifacts - } - return runTests(tests.RegisterTests, cliCfg{ - args: args, - count: count, - cpuQuota: cpuQuota, - runSkipped: runSkipped, - debugMode: debugModeFromOpts(), - skipInit: skipInit, - httpPort: httpPort, - parallelism: parallelism, - artifactsDir: artifacts, - user: username, - clusterID: clusterID, - versionsBinaryOverride: versionsBinaryOverride, - selectProbability: selectProbability, - }, true /* benchOnly */) + return runFn(args, true /* benchOnly */) }, } @@ -498,10 +485,11 @@ func runTests(register func(registry.Registry), cfg cliCfg, benchOnly bool) erro opt := clustersOpt{ typ: clusterType, clusterName: clusterName, - user: getUser(cfg.user), - cpuQuota: cfg.cpuQuota, - debugMode: cfg.debugMode, - clusterID: cfg.clusterID, + // Precedence for resolving the user: cli arg, env.ROACHPROD_USER, current user. + user: cfg.user, + cpuQuota: cfg.cpuQuota, + debugMode: cfg.debugMode, + clusterID: cfg.clusterID, } if err := runner.runHTTPServer(cfg.httpPort, os.Stdout, bindTo); err != nil { return err diff --git a/pkg/cmd/roachtest/test_runner.go b/pkg/cmd/roachtest/test_runner.go index 32d2d2d46dd2..05f27f15997a 100644 --- a/pkg/cmd/roachtest/test_runner.go +++ b/pkg/cmd/roachtest/test_runner.go @@ -70,8 +70,16 @@ var ( prometheusScrapeInterval = time.Second * 15 prng, _ = randutil.NewLockedPseudoRand() + + runID string ) +// VmLabelTestName is the label used to identify the test name in the VM metadata +const VmLabelTestName string = "test_name" + +// VmLabelTestRunID is the label used to identify the test run id in the VM metadata +const VmLabelTestRunID string = "test_run_id" + // testRunner runs tests. type testRunner struct { stopper *stop.Stopper @@ -296,6 +304,8 @@ func (r *testRunner) Run( qp := quotapool.NewIntPool("cloud cpu", uint64(clustersOpt.cpuQuota)) l := lopt.l + runID = generateRunID(clustersOpt.user) + shout(ctx, l, lopt.stdout, "%s: %s", VmLabelTestRunID, runID) var wg sync.WaitGroup for i := 0; i < parallelism; i++ { @@ -386,6 +396,16 @@ func numConcurrentClusterCreations() int { return res } +// This will be added as a label to all cluster nodes when the +// cluster is registered. +func generateRunID(user string) string { + uniqueId := os.Getenv("TC_BUILD_ID") + if uniqueId == "" { + uniqueId = fmt.Sprintf("%d", timeutil.Now().Unix()) + } + return fmt.Sprintf("%s-%s", user, uniqueId) +} + // defaultClusterAllocator is used by workers to create new clusters (or to attach // to an existing one). // @@ -912,10 +932,10 @@ func (r *testRunner) runTest( s := t.Spec().(*registry.TestSpec) _ = c.addLabels(map[string]string{ - "test_name": s.Name, + VmLabelTestName: s.Name, }) defer func() { - _ = c.removeLabels([]string{"test_name"}) + _ = c.removeLabels([]string{VmLabelTestName}) t.end = timeutil.Now() // We only have to record panics if the panic'd value is not the sentinel diff --git a/pkg/roachprod/vm/vm.go b/pkg/roachprod/vm/vm.go index cb2b93d4a273..be04d5ebed2e 100644 --- a/pkg/roachprod/vm/vm.go +++ b/pkg/roachprod/vm/vm.go @@ -647,6 +647,7 @@ func DNSSafeAccount(account string) string { return strings.Map(safe, account) } +// SanitizeLabel returns a version of the string that can be used as a label. func SanitizeLabel(label string) string { // Replace any non-alphanumeric characters with hyphens re := regexp.MustCompile("[^a-zA-Z0-9]+") From b2faa2b25fe36f6fdebb7a7fe06ef2f180a69436 Mon Sep 17 00:00:00 2001 From: Thomas Hardy Date: Wed, 2 Aug 2023 13:25:31 -0400 Subject: [PATCH 3/4] server: return authoritative span statistics for db details endpoint Resolves: #96163 This change makes the admin API endpoint getting database statistics scan KV for span statistics instead of using the range descriptor cache. This provides authoritative output, helping deflake `TestMultiRegionDatabaseStats`. Release note (sql change): admin API database details endpoint now returns authoritative range statistics. --- pkg/ccl/serverccl/admin_test.go | 6 ++++++ pkg/server/admin.go | 38 ++++++++++++++++----------------- 2 files changed, 24 insertions(+), 20 deletions(-) diff --git a/pkg/ccl/serverccl/admin_test.go b/pkg/ccl/serverccl/admin_test.go index 07a90515a766..958db1707605 100644 --- a/pkg/ccl/serverccl/admin_test.go +++ b/pkg/ccl/serverccl/admin_test.go @@ -225,6 +225,12 @@ func TestTableAndDatabaseDetailsAndStats(t *testing.T) { require.Equal(t, dbResp.TableNames[0], "public.test") + var dbDetailsResp serverpb.DatabaseDetailsResponse + err = getAdminJSONProto(st, "databases/defaultdb?include_stats=true", &dbDetailsResp) + require.NoError(t, err) + + require.Greater(t, dbDetailsResp.Stats.RangeCount, int64(0)) + // TableStats tableStatsResp := &serverpb.TableStatsResponse{} err = getAdminJSONProto(st, "databases/defaultdb/tables/public.test/stats", tableStatsResp) diff --git a/pkg/server/admin.go b/pkg/server/admin.go index ba0d7035df6b..2f4128237b63 100644 --- a/pkg/server/admin.go +++ b/pkg/server/admin.go @@ -67,6 +67,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/mon" "github.com/cockroachdb/cockroach/pkg/util/protoutil" "github.com/cockroachdb/cockroach/pkg/util/quotapool" + "github.com/cockroachdb/cockroach/pkg/util/rangedesc" "github.com/cockroachdb/cockroach/pkg/util/safesql" "github.com/cockroachdb/cockroach/pkg/util/stop" "github.com/cockroachdb/cockroach/pkg/util/timeutil" @@ -1328,16 +1329,9 @@ func (s *adminServer) statsForSpan( ctx, cancel := context.WithCancel(ctx) defer cancel() - rSpan, err := keys.SpanAddr(span) - if err != nil { - return nil, err - } - // Get a list of nodeIDs, range counts, and replica counts per node // for the specified span. - nodeIDs, rangeCount, replCounts, err := getNodeIDsRangeCountReplCountForSpan( - ctx, s.distSender, rSpan, - ) + nodeIDs, rangeCount, replCounts, err := s.getSpanDetails(ctx, span) if err != nil { return nil, err } @@ -1450,25 +1444,29 @@ func (s *adminServer) statsForSpan( // Returns the list of node ids, range count, // and replica count for the specified span. -func getNodeIDsRangeCountReplCountForSpan( - ctx context.Context, ds *kvcoord.DistSender, rSpan roachpb.RSpan, +func (s *adminServer) getSpanDetails( + ctx context.Context, span roachpb.Span, ) (nodeIDList []roachpb.NodeID, rangeCount int64, replCounts map[roachpb.NodeID]int64, _ error) { nodeIDs := make(map[roachpb.NodeID]struct{}) replCountForNodeID := make(map[roachpb.NodeID]int64) - ri := kvcoord.MakeRangeIterator(ds) - ri.Seek(ctx, rSpan.Key, kvcoord.Ascending) - for ; ri.Valid(); ri.Next(ctx) { + var it rangedesc.Iterator + var err error + if s.sqlServer.tenantConnect == nil { + it, err = s.sqlServer.execCfg.RangeDescIteratorFactory.NewIterator(ctx, span) + } else { + it, err = s.sqlServer.tenantConnect.NewIterator(ctx, span) + } + if err != nil { + return nil, 0, nil, err + } + var rangeDesc roachpb.RangeDescriptor + for ; it.Valid(); it.Next() { rangeCount++ - for _, repl := range ri.Desc().Replicas().Descriptors() { + rangeDesc = it.CurRangeDescriptor() + for _, repl := range rangeDesc.Replicas().Descriptors() { replCountForNodeID[repl.NodeID]++ nodeIDs[repl.NodeID] = struct{}{} } - if !ri.NeedAnother(rSpan) { - break - } - } - if err := ri.Error(); err != nil { - return nil, 0, nil, err } nodeIDList = make([]roachpb.NodeID, 0, len(nodeIDs)) From 08b2eb70e5e168429795f3a294a15daec93f349c Mon Sep 17 00:00:00 2001 From: Rafi Shamim Date: Mon, 14 Aug 2023 10:24:29 -0400 Subject: [PATCH 4/4] upgrades: deflake TestRoleMembersIDMigration1500Users TeamCity has a new machine type where this test has started to time out more, so this change will make it take less time. Release note: None --- pkg/upgrade/upgrades/role_members_ids_migration_test.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pkg/upgrade/upgrades/role_members_ids_migration_test.go b/pkg/upgrade/upgrades/role_members_ids_migration_test.go index adc8c796e7bd..217cc21067fe 100644 --- a/pkg/upgrade/upgrades/role_members_ids_migration_test.go +++ b/pkg/upgrade/upgrades/role_members_ids_migration_test.go @@ -36,10 +36,12 @@ func TestRoleMembersIDMigration10Users(t *testing.T) { runTestRoleMembersIDMigration(t, 10) } -func TestRoleMembersIDMigration1500Users(t *testing.T) { +func TestRoleMembersIDMigration1200Users(t *testing.T) { skip.UnderRace(t) skip.UnderStress(t) - runTestRoleMembersIDMigration(t, 1500) + // Choosing a number larger than 1000 tests that the batching logic in + // this upgrade works correctly. + runTestRoleMembersIDMigration(t, 1200) } func runTestRoleMembersIDMigration(t *testing.T, numUsers int) {