Skip to content

Commit

Permalink
roachtest: set test_run_id label which will be scraped for metrics
Browse files Browse the repository at this point in the history
The `test_run_id` will be unique per invocation of `roachtest`. Its
purpose is to simplify finding metrics for a given run.

TeamCity-invoked roachtests, such as GCE Roachtest Nightly, will have
a `test_run_id` in the form `<teamcity_user>-<tc_build_id>` to make it
easy to find metrics for a particular build.

Roachtests run by individual users will have a `test_run_id` in the
form `<user>-<unix_ts>`.

Epic: none

Release note: None
  • Loading branch information
Miral Gadani committed Aug 10, 2023
1 parent 5a9b75e commit 45b20bc
Show file tree
Hide file tree
Showing 4 changed files with 61 additions and 44 deletions.
8 changes: 8 additions & 0 deletions pkg/cmd/roachtest/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -615,6 +615,11 @@ func (r *clusterRegistry) registerCluster(c *clusterImpl) error {
return fmt.Errorf("cluster named %q already exists in registry", c.name)
}
r.mu.clusters[c.name] = c
if err := c.addLabels(map[string]string{
VmLabelTestRunID: runID,
}); err != nil && c.l != nil {
c.l.Printf("failed to add %s label to cluster: %s", VmLabelTestRunID, err)
}
return nil
}

Expand All @@ -626,6 +631,9 @@ func (r *clusterRegistry) unregisterCluster(c *clusterImpl) bool {
// method to be called defensively.
return false
}
if err := c.removeLabels([]string{VmLabelTestRunID}); err != nil && c.l != nil {
c.l.Printf("failed to remove %s label from cluster: %s", VmLabelTestRunID, err)
}
delete(r.mu.clusters, c.name)
if c.tag != "" {
if _, ok := r.mu.tagCount[c.tag]; !ok {
Expand Down
72 changes: 30 additions & 42 deletions pkg/cmd/roachtest/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,29 @@ Examples:
listCmd.Flags().StringVar(
&cloud, "cloud", cloud, "cloud provider to use (aws, azure, or gce)")

// runFn assembles a cliCfg from the option variables in the enclosing scope
// and passes it, together with the positional args and the benchOnly switch,
// to runTests. It returns whatever error runTests returns.
runFn := func(args []string, benchOnly bool) error {
	// When no literal artifacts directory was given, reuse the regular
	// artifacts directory.
	if literalArtifacts == "" {
		literalArtifacts = artifacts
	}

	cfg := cliCfg{
		args:                   args,
		count:                  count,
		cpuQuota:               cpuQuota,
		runSkipped:             runSkipped,
		debugMode:              debugModeFromOpts(),
		skipInit:               skipInit,
		httpPort:               httpPort,
		promPort:               promPort,
		parallelism:            parallelism,
		artifactsDir:           artifacts,
		literalArtifactsDir:    literalArtifacts,
		user:                   getUser(username),
		clusterID:              clusterID,
		versionsBinaryOverride: versionsBinaryOverride,
		selectProbability:      selectProbability,
	}
	return runTests(tests.RegisterTests, cfg, benchOnly)
}

var runCmd = &cobra.Command{
// Don't display usage when tests fail.
SilenceUsage: true,
Expand All @@ -298,26 +321,7 @@ COCKROACH_ environment variables in the local environment are passed through to
the cluster nodes on start.
`,
RunE: func(_ *cobra.Command, args []string) error {
if literalArtifacts == "" {
literalArtifacts = artifacts
}
return runTests(tests.RegisterTests, cliCfg{
args: args,
count: count,
cpuQuota: cpuQuota,
runSkipped: runSkipped,
debugMode: debugModeFromOpts(),
skipInit: skipInit,
httpPort: httpPort,
promPort: promPort,
parallelism: parallelism,
artifactsDir: artifacts,
literalArtifactsDir: literalArtifacts,
user: username,
clusterID: clusterID,
versionsBinaryOverride: versionsBinaryOverride,
selectProbability: selectProbability,
}, false /* benchOnly */)
return runFn(args, false /* benchOnly */)
},
}

Expand All @@ -341,24 +345,7 @@ the cluster nodes on start.
Short: "run automated benchmarks on cockroach cluster",
Long: `Run automated benchmarks on existing or ephemeral cockroach clusters.`,
RunE: func(_ *cobra.Command, args []string) error {
if literalArtifacts == "" {
literalArtifacts = artifacts
}
return runTests(tests.RegisterTests, cliCfg{
args: args,
count: count,
cpuQuota: cpuQuota,
runSkipped: runSkipped,
debugMode: debugModeFromOpts(),
skipInit: skipInit,
httpPort: httpPort,
parallelism: parallelism,
artifactsDir: artifacts,
user: username,
clusterID: clusterID,
versionsBinaryOverride: versionsBinaryOverride,
selectProbability: selectProbability,
}, true /* benchOnly */)
return runFn(args, true /* benchOnly */)
},
}

Expand Down Expand Up @@ -498,10 +485,11 @@ func runTests(register func(registry.Registry), cfg cliCfg, benchOnly bool) erro
opt := clustersOpt{
typ: clusterType,
clusterName: clusterName,
user: getUser(cfg.user),
cpuQuota: cfg.cpuQuota,
debugMode: cfg.debugMode,
clusterID: cfg.clusterID,
// Precedence for resolving the user: cli arg, env.ROACHPROD_USER, current user.
user: cfg.user,
cpuQuota: cfg.cpuQuota,
debugMode: cfg.debugMode,
clusterID: cfg.clusterID,
}
if err := runner.runHTTPServer(cfg.httpPort, os.Stdout, bindTo); err != nil {
return err
Expand Down
24 changes: 22 additions & 2 deletions pkg/cmd/roachtest/test_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,16 @@ var (
prometheusScrapeInterval = time.Second * 15

prng, _ = randutil.NewLockedPseudoRand()

runID string
)

// Label keys written to the VM metadata.
const (
	// VmLabelTestName is the label used to identify the test name in the
	// VM metadata.
	VmLabelTestName string = "test_name"

	// VmLabelTestRunID is the label used to identify the test run id in the
	// VM metadata.
	VmLabelTestRunID string = "test_run_id"
)

// testRunner runs tests.
type testRunner struct {
stopper *stop.Stopper
Expand Down Expand Up @@ -296,6 +304,8 @@ func (r *testRunner) Run(

qp := quotapool.NewIntPool("cloud cpu", uint64(clustersOpt.cpuQuota))
l := lopt.l
runID = generateRunID(clustersOpt.user)
shout(ctx, l, lopt.stdout, "%s: %s", VmLabelTestRunID, runID)
var wg sync.WaitGroup

for i := 0; i < parallelism; i++ {
Expand Down Expand Up @@ -386,6 +396,16 @@ func numConcurrentClusterCreations() int {
return res
}

// generateRunID returns the run identifier in the form "<user>-<suffix>".
// The suffix is the value of the TC_BUILD_ID environment variable when that
// is non-empty; otherwise it is the current Unix timestamp in seconds.
//
// The result will be added as a label to all cluster nodes when the cluster
// is registered.
func generateRunID(user string) string {
	if buildID := os.Getenv("TC_BUILD_ID"); buildID != "" {
		return fmt.Sprintf("%s-%s", user, buildID)
	}
	// No build id in the environment: fall back to the current time.
	return fmt.Sprintf("%s-%d", user, timeutil.Now().Unix())
}

// defaultClusterAllocator is used by workers to create new clusters (or to attach
// to an existing one).
//
Expand Down Expand Up @@ -912,10 +932,10 @@ func (r *testRunner) runTest(

s := t.Spec().(*registry.TestSpec)
_ = c.addLabels(map[string]string{
"test_name": s.Name,
VmLabelTestName: s.Name,
})
defer func() {
_ = c.removeLabels([]string{"test_name"})
_ = c.removeLabels([]string{VmLabelTestName})
t.end = timeutil.Now()

// We only have to record panics if the panic'd value is not the sentinel
Expand Down
1 change: 1 addition & 0 deletions pkg/roachprod/vm/vm.go
Original file line number Diff line number Diff line change
Expand Up @@ -647,6 +647,7 @@ func DNSSafeAccount(account string) string {
return strings.Map(safe, account)
}

// SanitizeLabel returns a version of the string that can be used as a label.
func SanitizeLabel(label string) string {
// Replace any non-alphanumeric characters with hyphens
re := regexp.MustCompile("[^a-zA-Z0-9]+")
Expand Down

0 comments on commit 45b20bc

Please sign in to comment.