From 1fa4fac5d70cbe1a30f92e898a8d8296936b55e4 Mon Sep 17 00:00:00 2001 From: Stan Rosenberg Date: Wed, 17 May 2023 11:05:21 -0400 Subject: [PATCH] roachtest: metamorphic ARM64 and FIPS clusters Previously, all roachtests used (cloud) machine types with the AMD64 (cpu) architecture. Recently [1], new CI infrastructure was added to run a clone of all the nightly roachtests, configured with FIPS; i.e., same AMD64 machine types, different AMI and crdb binary, patched with FIPS-certified openssl native code. As of this PR, we add the capability to execute any roachtest in a cluster, configured with either ARM64, FIPS, or AMD64 (default). This is controlled via the two CLI args: `metamorphic-arm64-probability` and `metamorphic-fips-probability`. The former denotes the probability (over the uniform distribution) of a new cluster provisioned using ARM64 VMs. The latter denotes the probability of a new AMD64 cluster provisioned with the FIPS-compliant (kernel) configuration. In case a test is compatible only with AMD64, it's effectively excluded from the set; i.e., both probabilities apply to compatible tests only. Note, the two probabilities don't have to add up to 1. E.g., `metamorphic-arm64-probability==0.4`, `metamorphic-fips-probability==0.2` denotes that ARM64 clusters are chosen ~40% of the time, whereas of the remaining ~60% AMD clusters, FIPS is chosen ~20% of the time; i.e., ~12% of all clusters will use FIPS. Note, the values '0' and '1' are absolute. Setting both to '0' is tantamount to the behavior before this PR. Setting either to '1' enforces _all_ clusters are provisioned with either ARM64 or FIPS. A test can specify its required architecture, in which case, it takes precedence over metamorphic settings. This PR builds on [1], which enabled ARM64 provisioning for AWS in roachprod. 
We add ARM64 provisioning for GCE, i.e., T2A, as well as refactor 'arch' argument to denote one of: AMD64, ARM64, FIPS, where the latter isn't formally a CPU architecture; however, it simplifies provisioning and binary staging. We also modify roachprod.List to display CPU architecture, other than AMD64, with the machine type; this should make it easier to see which clusters are running ARM64 and FIPS configurations, as we ramp up their testing. The PR also adds validation to cockroach binaries and libs to ensure we can execute tests under ARM64 and FIPS. Furthermore, we add 'Enabled Assertions' header, generated at build time, to the cockroach binary; the header is used to validate whether or not the binary has runtime assertions enabled. Epic: none Release note: None Resolves: https://github.com/cockroachdb/cockroach/issues/94957 Resolves: https://github.com/cockroachdb/cockroach/issues/89268 Informs: https://github.com/cockroachdb/cockroach/issues/94986 [1] https://github.com/cockroachdb/cockroach/pull/99224 [2] https://github.com/cockroachdb/cockroach/pull/103243 --- pkg/build/BUILD.bazel | 1 + pkg/build/info.go | 38 +- pkg/build/info.proto | 2 + pkg/cmd/roachprod/flags.go | 9 +- pkg/cmd/roachprod/main.go | 17 +- pkg/cmd/roachtest/cluster.go | 374 +++++++++++++----- pkg/cmd/roachtest/cluster/BUILD.bazel | 1 + .../roachtest/cluster/cluster_interface.go | 4 + pkg/cmd/roachtest/cluster_test.go | 21 +- pkg/cmd/roachtest/github.go | 10 +- pkg/cmd/roachtest/github_test.go | 28 +- pkg/cmd/roachtest/main.go | 69 +++- pkg/cmd/roachtest/slack.go | 2 - pkg/cmd/roachtest/spec/cluster_spec.go | 29 +- pkg/cmd/roachtest/spec/machine_type.go | 55 ++- pkg/cmd/roachtest/spec/option.go | 28 +- pkg/cmd/roachtest/test_impl.go | 1 + pkg/cmd/roachtest/test_registry_test.go | 7 + pkg/cmd/roachtest/test_runner.go | 74 +++- pkg/cmd/roachtest/test_test.go | 1 + pkg/cmd/roachtest/tests/BUILD.bazel | 1 + pkg/cmd/roachtest/tests/autoupgrade.go | 1 + pkg/cmd/roachtest/tests/cdc.go | 96 +++-- 
pkg/cmd/roachtest/tests/follower_reads.go | 4 - pkg/cmd/roachtest/tests/import.go | 1 + pkg/cmd/roachtest/tests/indexes.go | 6 - pkg/cmd/roachtest/tests/mixed_version_cdc.go | 8 +- .../tests/mixed_version_schemachange.go | 1 + pkg/cmd/roachtest/tests/rebalance_load.go | 4 - pkg/cmd/roachtest/tests/tpcc.go | 113 +----- pkg/cmd/roachtest/tests/tpch_concurrency.go | 14 +- pkg/cmd/roachtest/tests/versionupgrade.go | 1 + pkg/roachprod/install/BUILD.bazel | 2 + pkg/roachprod/install/staging.go | 19 +- pkg/roachprod/install/staging_test.go | 3 +- pkg/roachprod/prometheus/BUILD.bazel | 1 + pkg/roachprod/prometheus/prometheus.go | 36 +- pkg/roachprod/roachprod.go | 7 +- pkg/roachprod/vm/aws/aws.go | 36 +- pkg/roachprod/vm/gce/gcloud.go | 29 +- pkg/roachprod/vm/vm.go | 24 +- pkg/util/randutil/rand.go | 6 + 42 files changed, 792 insertions(+), 392 deletions(-) diff --git a/pkg/build/BUILD.bazel b/pkg/build/BUILD.bazel index 5bb789b75e29..0fbb18953466 100644 --- a/pkg/build/BUILD.bazel +++ b/pkg/build/BUILD.bazel @@ -23,6 +23,7 @@ go_library( "github.com/cockroachdb/cockroach/pkg/build.utcTime": "{BUILD_UTCTIME}", }, deps = [ + "//pkg/util/buildutil", "//pkg/util/envutil", "//pkg/util/version", ], diff --git a/pkg/build/info.go b/pkg/build/info.go index d4ab09ef72e5..1f257c75fd30 100644 --- a/pkg/build/info.go +++ b/pkg/build/info.go @@ -17,6 +17,7 @@ import ( "text/tabwriter" "time" + "github.com/cockroachdb/cockroach/pkg/util/buildutil" "github.com/cockroachdb/cockroach/pkg/util/envutil" "github.com/cockroachdb/cockroach/pkg/util/version" ) @@ -35,11 +36,12 @@ var ( cgoTargetTriple string platform = fmt.Sprintf("%s %s", runtime.GOOS, runtime.GOARCH) // Distribution is changed by the CCL init-time hook in non-APL builds. 
- Distribution = "OSS" - typ string // Type of this build: , "development", or "release" - channel = "unknown" - envChannel = envutil.EnvOrDefaultString("COCKROACH_CHANNEL", "unknown") - binaryVersion = computeVersion(tag) + Distribution = "OSS" + typ string // Type of this build: , "development", or "release" + channel = "unknown" + envChannel = envutil.EnvOrDefaultString("COCKROACH_CHANNEL", "unknown") + enabledAssertions = buildutil.CrdbTestBuild + binaryVersion = computeVersion(tag) ) const ( @@ -113,7 +115,8 @@ func (b Info) Long() string { fmt.Fprintf(tw, "Go Version: %s\n", b.GoVersion) fmt.Fprintf(tw, "C Compiler: %s\n", b.CgoCompiler) fmt.Fprintf(tw, "Build Commit ID: %s\n", b.Revision) - fmt.Fprintf(tw, "Build Type: %s", b.Type) // No final newline: cobra prints one for us. + fmt.Fprintf(tw, "Build Type: %s\n", b.Type) + fmt.Fprintf(tw, "Enabled Assertions: %t", b.EnabledAssertions) // No final newline: cobra prints one for us. _ = tw.Flush() return buf.String() } @@ -139,17 +142,18 @@ func (b Info) Timestamp() (int64, error) { // GetInfo returns an Info struct populated with the build information. 
func GetInfo() Info { return Info{ - GoVersion: runtime.Version(), - Tag: tag, - Time: utcTime, - Revision: rev, - CgoCompiler: cgoCompiler, - CgoTargetTriple: cgoTargetTriple, - Platform: platform, - Distribution: Distribution, - Type: typ, - Channel: channel, - EnvChannel: envChannel, + GoVersion: runtime.Version(), + Tag: tag, + Time: utcTime, + Revision: rev, + CgoCompiler: cgoCompiler, + CgoTargetTriple: cgoTargetTriple, + Platform: platform, + Distribution: Distribution, + Type: typ, + Channel: channel, + EnvChannel: envChannel, + EnabledAssertions: enabledAssertions, } } diff --git a/pkg/build/info.proto b/pkg/build/info.proto index 199666501739..c4e0a4b1657e 100644 --- a/pkg/build/info.proto +++ b/pkg/build/info.proto @@ -38,6 +38,8 @@ message Info { optional string channel = 9 [(gogoproto.nullable) = false]; // env_channel identifies the product channel as overridden by the COCKROACH_CHANNEL environment variable. optional string env_channel = 11 [(gogoproto.nullable) = false]; + // enabled_assertions returns the value of 'CrdbTestBuild' (true iff compiled with 'crdb_test' tag) + optional bool enabled_assertions = 12 [(gogoproto.nullable) = false]; // dependencies exists to allow tests that run against old clusters // to unmarshal JSON containing this field. 
The tag is unimportant, diff --git a/pkg/cmd/roachprod/flags.go b/pkg/cmd/roachprod/flags.go index c5e8380b528f..66bf054204d6 100644 --- a/pkg/cmd/roachprod/flags.go +++ b/pkg/cmd/roachprod/flags.go @@ -40,6 +40,7 @@ var ( extendLifetime time.Duration wipePreserveCerts bool grafanaConfig string + grafanaArch string grafanaurlOpen bool grafanaDumpDir string listDetails bool @@ -107,8 +108,9 @@ func initFlags() { vm.AllProviderNames())) createCmd.Flags().BoolVar(&createVMOpts.GeoDistributed, "geo", false, "Create geo-distributed cluster") - createCmd.Flags().BoolVar(&createVMOpts.EnableFIPS, - "fips", false, "Enable FIPS mode (uses custom AMI)") + createCmd.Flags().StringVar(&createVMOpts.Arch, "arch", "", + "architecture override for VM [amd64, arm64, fips]; N.B. fips implies amd64 with openssl") + // N.B. We set "usage=roachprod" as the default, custom label for billing tracking. createCmd.Flags().StringToStringVar(&createVMOpts.CustomLabels, "label", map[string]string{"usage": "roachprod"}, @@ -249,6 +251,9 @@ func initFlags() { grafanaStartCmd.Flags().StringVar(&grafanaConfig, "grafana-config", "", "URL to grafana json config") + grafanaStartCmd.Flags().StringVar(&grafanaArch, "arch", "", + "binary architecture override [amd64, arm64]") + grafanaURLCmd.Flags().BoolVar(&grafanaurlOpen, "open", false, "open the grafana dashboard url on the browser") diff --git a/pkg/cmd/roachprod/main.go b/pkg/cmd/roachprod/main.go index e91a6d68c62c..abec0008f585 100644 --- a/pkg/cmd/roachprod/main.go +++ b/pkg/cmd/roachprod/main.go @@ -273,6 +273,7 @@ hosts file. 
c.PrintDetails(roachprodLibraryLogger) } else { fmt.Fprintf(tw, "%s\t%s\t%d", c.Name, c.Clouds(), len(c.VMs)) + if !c.IsLocal() { fmt.Fprintf(tw, "\t(%s)", c.LifetimeRemaining().Round(time.Second)) } else { @@ -904,10 +905,14 @@ var grafanaStartCmd = &cobra.Command{ Use: `grafana-start `, Short: `spins up a prometheus and grafana instances on the last node in the cluster`, Long: `spins up a prometheus and grafana instances on the highest numbered node in the cluster -and will scrape from all nodes in the cluster`, +and will scrape from all nodes in the cluster; NOTE: for arm64 clusters, use --arch arm64`, Args: cobra.ExactArgs(1), Run: wrap(func(cmd *cobra.Command, args []string) error { - return roachprod.StartGrafana(context.Background(), roachprodLibraryLogger, args[0], + arch := vm.ArchAMD64 + if grafanaArch == "arm64" { + arch = vm.ArchARM64 + } + return roachprod.StartGrafana(context.Background(), roachprodLibraryLogger, args[0], arch, grafanaConfig, nil) }), } @@ -954,14 +959,14 @@ func validateAndConfigure(cmd *cobra.Command, args []string) { // Validate architecture flag, if set. if archOpt := cmd.Flags().Lookup("arch"); archOpt != nil && archOpt.Changed { - arch := strings.ToLower(archOpt.Value.String()) + arch := vm.CPUArch(strings.ToLower(archOpt.Value.String())) - if arch != "amd64" && arch != "arm64" && arch != "fips" { + if arch != vm.ArchAMD64 && arch != vm.ArchARM64 && arch != vm.ArchFIPS { printErrAndExit(fmt.Errorf("unsupported architecture %q", arch)) } - if arch != archOpt.Value.String() { + if string(arch) != archOpt.Value.String() { // Set the canonical value. - _ = cmd.Flags().Set("arch", arch) + _ = cmd.Flags().Set("arch", string(arch)) } } } diff --git a/pkg/cmd/roachtest/cluster.go b/pkg/cmd/roachtest/cluster.go index a0141cf704f5..a112dd40a4ba 100644 --- a/pkg/cmd/roachtest/cluster.go +++ b/pkg/cmd/roachtest/cluster.go @@ -57,13 +57,20 @@ func init() { } var ( - // TODO(tbg): this is redundant with --cloud==local. 
Make the --local flag an - // alias for `--cloud=local` and remove this variable. - local bool - - cockroach string - cockroachShort string - libraryFilePaths []string + // user-specified path to crdb binary + cockroachPath string + // maps cpuArch to the corresponding crdb binary's absolute path + cockroach = make(map[vm.CPUArch]string) + // user-specified path to short crdb binary + cockroachShortPath string + // maps cpuArch to the corresponding short crdb (i.e., without UI) binary's absolute path + cockroachShort = make(map[vm.CPUArch]string) + // user-specified path to workload binary + workloadPath string + // maps cpuArch to the corresponding workload binary's absolute path + workload = make(map[vm.CPUArch]string) + // maps cpuArch to the corresponding dynamically-linked libraries' absolute paths + libraryFilePaths = make(map[vm.CPUArch][]string) cloud = spec.GCE // encryptionProbability controls when encryption-at-rest is enabled // in a cluster for tests that have opted-in to metamorphic @@ -73,10 +80,18 @@ var ( // encryption enabled by default (probability 1). In order to run // them with encryption disabled (perhaps to reproduce a test // failure), roachtest can be invoked with --metamorphic-encryption-probability=0 - encryptionProbability float64 + encryptionProbability float64 + // Total probability with which new ARM64 clusters are provisioned, modulo test specs. which are incompatible. + // N.B. if all selected tests are incompatible with ARM64, then arm64Probability is effectively 0. + // In other words, ClusterSpec.Arch takes precedence over the arm64Probability flag. + arm64Probability float64 + // Conditional probability with which new FIPS clusters are provisioned, modulo test specs. The total probability + // is the product of this and 1-arm64Probability. + // As in the case of arm64Probability, ClusterSpec.Arch takes precedence over the fipsProbability flag. 
+ fipsProbability float64 + instanceType string localSSDArg bool - workload string deprecatedRoachprodBinary string // overrideOpts contains vm.CreateOpts override values passed from the cli. overrideOpts vm.CreateOpts @@ -97,6 +112,9 @@ var ( const ( defaultEncryptionProbability = 1 + defaultFIPSProbability = 0 + defaultARM64Probability = 0 + defaultCockroachPath = "./cockroach-default" ) type errBinaryOrLibraryNotFound struct { @@ -107,29 +125,59 @@ func (e errBinaryOrLibraryNotFound) Error() string { return fmt.Sprintf("binary or library %q not found (or was not executable)", e.binary) } -func filepathAbs(path string) (string, error) { - path, err := filepath.Abs(path) +func validateBinaryFormat(path string, arch vm.CPUArch, checkEA bool) (string, error) { + abspath, err := filepath.Abs(path) if err != nil { return "", errors.WithStack(err) } - return path, nil -} - -func findBinary(binary, defValue string) (abspath string, err error) { - if binary == "" { - binary = defValue + // Check that the binary ELF format matches the expected architecture. + cmd := exec.Command("file", "-b", abspath) + var out bytes.Buffer + cmd.Stdout = &out + if err := cmd.Run(); err != nil { + return "", errors.Wrapf(err, "error executing 'file %s'", abspath) + } + fileFormat := strings.ToLower(out.String()) + // N.B. 'arm64' is returned on macOS, while 'aarch64' is returned on Linux; + // "x86_64" string is returned on macOS, while "x86-64" is returned on Linux. + if arch == vm.ArchARM64 && !strings.Contains(fileFormat, "arm64") && !strings.Contains(fileFormat, "aarch64") { + return "", errors.Newf("%s has incompatible architecture; want: %q, got: %q", abspath, arch, fileFormat) + } else if arch == vm.ArchAMD64 && !strings.Contains(fileFormat, "x86-64") && !strings.Contains(fileFormat, "x86_64") { + // Otherwise, we expect a binary that was built for amd64. 
+ return "", errors.Newf("%s has incompatible architecture; want: %q, got: %q", abspath, arch, fileFormat) + } + if arch == vm.ArchFIPS && strings.HasSuffix(abspath, "cockroach") { + // Check that the binary is patched to use OpenSSL FIPS. + // N.B. only the cockroach binary is patched, so we exclude this check for dynamically-linked libraries. + cmd = exec.Command("bash", "-c", fmt.Sprintf("nm %s | grep golang-fips |head -1", abspath)) + if err := cmd.Run(); err != nil { + return "", errors.Newf("%s is not compiled with FIPS", abspath) + } + } + if checkEA { + // Check that the binary was compiled with assertions _enabled_. + cmd = exec.Command("bash", "-c", fmt.Sprintf("%s version |grep \"Enabled Assertions\" |grep true", abspath)) + if err := cmd.Run(); err != nil { + return "", errors.Newf("%s is not compiled with assertions enabled", abspath) + } } + return abspath, nil +} + +func findBinary( + name string, osName string, arch vm.CPUArch, checkEA bool, +) (abspath string, err error) { // Check to see if binary exists and is a regular file and executable. 
- if fi, err := os.Stat(binary); err == nil && fi.Mode().IsRegular() && (fi.Mode()&0111) != 0 { - return filepathAbs(binary) + if fi, err := os.Stat(name); err == nil && fi.Mode().IsRegular() && (fi.Mode()&0111) != 0 { + return validateBinaryFormat(name, arch, checkEA) } - return findBinaryOrLibrary("bin", binary) + return findBinaryOrLibrary("bin", name, "", osName, arch, checkEA) } -func findLibrary(libraryName string) (string, error) { +func findLibrary(libraryName string, os string, arch vm.CPUArch) (string, error) { suffix := ".so" - if local { + if cloud == spec.Local { switch runtime.GOOS { case "linux": case "freebsd": @@ -143,65 +191,102 @@ func findLibrary(libraryName string) (string, error) { return "", errors.Newf("failed to find suffix for runtime %s", runtime.GOOS) } } - return findBinaryOrLibrary("lib", libraryName+suffix) + + return findBinaryOrLibrary("lib", libraryName, suffix, os, arch, false) } -func findBinaryOrLibrary(binOrLib string, name string) (string, error) { +// findBinaryOrLibrary searches for a binary or library, _first_ in the $PATH, _then_ in the following hardcoded paths, +// +// $GOPATH/src/github.com/cockroachdb/cockroach/ +// $GOPATH/src/github.com/cockroachdb/cockroach/artifacts/ +// $PWD/binOrLib +// $GOPATH/src/github.com/cockroachdb/cockroach/binOrLib +// +// in the above order, unless 'name' is an absolute path, in which case the hardcoded paths are skipped. +// +// binOrLib is either 'bin' or 'lib'; nameSuffix is either empty, '.so', '.dll', or '.dylib'. +// Both osName and arch are used to derive a fully qualified binary or library name by inserting the +// corresponding arch suffix (see install.ArchInfoForOS), e.g. '.linux-arm64' or '.darwin-amd64'. +// That is, each hardcoded path is searched for a file named 'name' or 'name.archSuffix.nameSuffix', respectively. +// +// If no binary or library is found, an error is returned. 
+// Otherwise, if multiple binaries or libraries are located at the above paths, the first one found is returned. +// If the found binary or library happens to be of the wrong type, e.g., architecture is different from 'arch', or +// checkEA is true, and the binary was not compiled with runtime assertions enabled, an error is returned. +// While we could continue the search instead of returning an error, it is assumed the user can stage the binaries +// to avoid such ambiguity. Alternatively, the user can specify the absolute path to the binary or library, +// e.g., via --cockroach; in this case, only the absolute path is checked and validated. +func findBinaryOrLibrary( + binOrLib string, name string, nameSuffix string, osName string, arch vm.CPUArch, checkEA bool, +) (string, error) { // Find the binary to run and translate it to an absolute path. First, look // for the binary in PATH. - path, err := exec.LookPath(name) + pathFromEnv, err := exec.LookPath(name) + if err == nil { + // Found it in PATH, validate and return absolute path. + return validateBinaryFormat(pathFromEnv, arch, checkEA) + } + if strings.HasPrefix(name, "/") { + // Specified name is an absolute path, but we couldn't find it; bail out. + return "", errors.WithStack(err) + } + // We're unable to find the name in PATH and "name" is a relative path: + // look in the cockroach repo. 
+ gopath := os.Getenv("GOPATH") + if gopath == "" { + gopath = filepath.Join(os.Getenv("HOME"), "go") + } + + dirs := []string{ + filepath.Join(gopath, "/src/github.com/cockroachdb/cockroach/"), + filepath.Join(gopath, "/src/github.com/cockroachdb/cockroach/artifacts/"), + filepath.Join(os.ExpandEnv("$PWD"), binOrLib), + filepath.Join(gopath, "/src/github.com/cockroachdb/cockroach", binOrLib), + } + + archInfo, err := install.ArchInfoForOS(osName, arch) if err != nil { - if strings.HasPrefix(name, "/") { - return "", errors.WithStack(err) - } - - // We're unable to find the name in PATH and "name" is a relative path: - // look in the cockroach repo. - gopath := os.Getenv("GOPATH") - if gopath == "" { - gopath = filepath.Join(os.Getenv("HOME"), "go") - } - - var suffix string - if !local { - suffix = ".docker_amd64" - } - dirs := []string{ - filepath.Join(gopath, "/src/github.com/cockroachdb/cockroach/"), - filepath.Join(gopath, "/src/github.com/cockroachdb/cockroach/artifacts/"), - filepath.Join(gopath, "/src/github.com/cockroachdb/cockroach", binOrLib+suffix), - filepath.Join(os.ExpandEnv("$PWD"), binOrLib+suffix), - filepath.Join(gopath, "/src/github.com/cockroachdb/cockroach", binOrLib), - } - for _, dir := range dirs { - path = filepath.Join(dir, name) - var err2 error - path, err2 = exec.LookPath(path) - if err2 == nil { - return filepathAbs(path) + return "", err + } + archSuffixes := []string{"." + archInfo.DebugArchitecture, "." 
+ archInfo.ReleaseArchitecture} + + for _, dir := range dirs { + var path string + + if path, err = exec.LookPath(filepath.Join(dir, name)); err == nil { + return validateBinaryFormat(path, arch, checkEA) + } + for _, archSuffix := range archSuffixes { + if path, err = exec.LookPath(filepath.Join(dir, name+archSuffix+nameSuffix)); err == nil { + return validateBinaryFormat(path, arch, checkEA) } } - return "", errBinaryOrLibraryNotFound{name} } - return filepathAbs(path) + return "", errBinaryOrLibraryNotFound{name} } // VerifyLibraries verifies that the required libraries, specified by name, are // available for the target environment. -func VerifyLibraries(requiredLibs []string) error { +func VerifyLibraries(requiredLibs []string, arch vm.CPUArch) error { + foundLibraryPaths := libraryFilePaths[arch] + for _, requiredLib := range requiredLibs { - if !contains(libraryFilePaths, libraryNameFromPath, requiredLib) { - return errors.Wrap(errors.Errorf("missing required library %s", requiredLib), "cluster.VerifyLibraries") + if !contains(foundLibraryPaths, libraryNameFromPath, requiredLib) { + return errors.Wrap(errors.Errorf("missing required library %s (arch=%q)", requiredLib, arch), "cluster.VerifyLibraries") } } return nil } -// libraryNameFromPath returns the name of a library without the extension, for a +// libraryNameFromPath returns the name of a library without the extension(s), for a // given path. func libraryNameFromPath(path string) string { filename := filepath.Base(path) - return strings.TrimSuffix(filename, filepath.Ext(filename)) + // N.B. filename may contain multiple extensions, e.g. "libgeos.linux-amd64.fips.so". 
+ for ext := filepath.Ext(filename); ext != ""; ext = filepath.Ext(filename) { + filename = strings.TrimSuffix(filename, ext) + } + return filename } func contains(list []string, transformString func(s string) string, str string) bool { @@ -217,50 +302,128 @@ func contains(list []string, transformString func(s string) string, str string) } func initBinariesAndLibraries() { - // If we're running against an existing "local" cluster, force the local flag - // to true in order to get the "local" test configurations. - if clusterName == "local" { - local = true - } - if local { - cloud = spec.Local - } + // TODO(srosenberg): enable metamorphic local clusters; currently, spec.Local means run all tests locally. + // This could be revisited after we have a way to specify which clouds a given test supports, + // see https://github.com/cockroachdb/cockroach/issues/104029. + defaultOSName := "linux" + defaultArch := vm.ArchAMD64 + if cloud == spec.Local { + defaultOSName = runtime.GOOS + if arm64Probability == 1 { + // N.B. if arm64Probability != 1, then we're running a local cluster with both arm64 and amd64. + defaultArch = vm.ArchARM64 + } + if string(defaultArch) != runtime.GOARCH { + fmt.Printf("WARN: local cluster's architecture (%q) differs from default (%q)\n", runtime.GOARCH, defaultArch) + } + } + fmt.Printf("Locating and verifying binaries for os=%q, arch=%q\n", defaultOSName, defaultArch) + + // Finds and validates a binary. If the binary 'isRequired', but not found, exit and print the error. 
+ resolveBinary := func(binName string, userSpecified string, arch vm.CPUArch, isRequired bool, checkEA bool) (string, error) { + path := binName + if userSpecified != "" { + path = userSpecified + } + abspath, err := findBinary(path, defaultOSName, arch, checkEA) + if err != nil { + if isRequired { + fmt.Fprintf(os.Stderr, "ERROR: unable to find required binary %q for %q: %v\n", binName, arch, err) + os.Exit(1) + } + return "", err + } + if userSpecified == "" { + // No user-specified path, so return the found absolute path. + return abspath, nil + } + // Bail out if a path other than the user-specified was found. + userPath, err := filepath.Abs(userSpecified) + + if err != nil || userPath != abspath { + err = errors.Wrapf(err, "ERROR: found %q at: %s instead of the user-specified path: %q\n", binName, abspath, userSpecified) - cockroachDefault := "cockroach" - if !local { - cockroachDefault = "cockroach-linux-2.6.32-gnu-amd64" + if isRequired { + fmt.Fprintf(os.Stderr, "%v", err) + os.Exit(1) + } + return "", err + } + return abspath, nil } + // We need to verify we have at least both the cockroach and the workload binaries. var err error - cockroach, err = findBinary(cockroach, cockroachDefault) + + cockroach[defaultArch], _ = resolveBinary("cockroach", cockroachPath, defaultArch, true, false) + workload[defaultArch], _ = resolveBinary("workload", workloadPath, defaultArch, true, false) + cockroachShort[defaultArch], err = resolveBinary("cockroach-short", cockroachShortPath, defaultArch, false, true) if err != nil { - fmt.Fprintf(os.Stderr, "%+v\n", err) - os.Exit(1) + fmt.Fprintf(os.Stderr, "WARN: unable to find %q for %q: %s\n", "cockroach-short", defaultArch, err) } - if cockroachShort != "" { - // defValue doesn't matter since cockroachShort is a non-empty string. 
- cockroachShort, err = findBinary(cockroachShort, "" /* defValue */) + if arm64Probability > 0 && defaultArch != vm.ArchARM64 { + fmt.Printf("Locating and verifying binaries for os=%q, arch=%q\n", defaultOSName, vm.ArchARM64) + // We need to verify we have all the required binaries for arm64. + cockroach[vm.ArchARM64], _ = resolveBinary("cockroach", cockroachPath, vm.ArchARM64, true, false) + workload[vm.ArchARM64], _ = resolveBinary("workload", workloadPath, vm.ArchARM64, true, false) + cockroachShort[vm.ArchARM64], err = resolveBinary("cockroach-short", cockroachShortPath, vm.ArchARM64, false, true) if err != nil { - fmt.Fprintf(os.Stderr, "%+v\n", err) - os.Exit(1) + fmt.Fprintf(os.Stderr, "WARN: unable to find %q for %q: %s\n", "cockroach-short", vm.ArchARM64, err) } } - - workload, err = findBinary(workload, "workload") - if errors.As(err, &errBinaryOrLibraryNotFound{}) { - fmt.Fprintln(os.Stderr, "workload binary not provided, proceeding anyway") - } else if err != nil { - fmt.Fprintf(os.Stderr, "%+v\n", err) - os.Exit(1) + if fipsProbability > 0 && defaultArch != vm.ArchFIPS { + fmt.Printf("Locating and verifying binaries for os=%q, arch=%q\n", defaultOSName, vm.ArchFIPS) + // We need to verify we have all the required binaries for fips. + cockroach[vm.ArchFIPS], _ = resolveBinary("cockroach", cockroachPath, vm.ArchFIPS, true, false) + workload[vm.ArchFIPS], _ = resolveBinary("workload", workloadPath, vm.ArchFIPS, true, false) + cockroachShort[vm.ArchFIPS], err = resolveBinary("cockroach-short", cockroachShortPath, vm.ArchFIPS, false, true) + if err != nil { + fmt.Fprintf(os.Stderr, "WARN: unable to find %q for %q: %s\n", "cockroach-short", vm.ArchFIPS, err) + } } // In v20.2 or higher, optionally expect certain library files to exist. // Since they may not be found in older versions, do not hard error if they are not found. 
- for _, libraryName := range []string{"libgeos", "libgeos_c"} { - if libraryFilePath, err := findLibrary(libraryName); err != nil { - fmt.Fprintf(os.Stderr, "error finding library %s, ignoring: %+v\n", libraryName, err) - } else { - libraryFilePaths = append(libraryFilePaths, libraryFilePath) + for _, arch := range []vm.CPUArch{vm.ArchAMD64, vm.ArchARM64, vm.ArchFIPS} { + if arm64Probability == 0 && defaultArch != vm.ArchARM64 && arch == vm.ArchARM64 { + // arm64 isn't used, skip finding libs for it. + continue + } + if fipsProbability == 0 && arch == vm.ArchFIPS { + // fips isn't used, skip finding libs for it. + continue + } + paths := []string(nil) + + for _, libraryName := range []string{"libgeos", "libgeos_c"} { + if libraryFilePath, err := findLibrary(libraryName, defaultOSName, arch); err != nil { + fmt.Fprintf(os.Stderr, "WARN: unable to find library %s, ignoring: %s\n", libraryName, err) + } else { + paths = append(paths, libraryFilePath) + } + } + libraryFilePaths[arch] = paths + } + // Looks like we have all the binaries we'll need. Let's print them out. + fmt.Printf("\nFound the following binaries:\n") + for arch, path := range cockroach { + if path != "" { + fmt.Printf("\tcockroach %q at: %s\n", arch, path) + } + } + for arch, path := range workload { + if path != "" { + fmt.Printf("\tworkload %q at: %s\n", arch, path) + } + } + for arch, path := range cockroachShort { + if path != "" { + fmt.Printf("\tcockroach-short %q at: %s\n", arch, path) + } + } + for arch, paths := range libraryFilePaths { + if len(paths) > 0 { + fmt.Printf("\tlibraries %q at: %s\n", arch, strings.Join(paths, ", ")) } } } @@ -654,6 +817,8 @@ type clusterImpl struct { expiration time.Time encAtRest bool // use encryption at rest + os string // OS of the cluster + arch vm.CPUArch // CPU architecture of the cluster // destroyState contains state related to the cluster's destruction. 
destroyState destroyState } @@ -737,6 +902,10 @@ type clusterConfig struct { localCluster bool useIOBarrier bool alloc *quotapool.IntAlloc + // Specifies CPU architecture which may require a custom AMI and cockroach binary. + arch vm.CPUArch + // Specifies the OS which may require a custom AMI and cockroach binary. + os string } // clusterFactory is a creator of clusters. @@ -873,7 +1042,8 @@ func (f *clusterFactory) newCluster( providerOptsContainer := vm.CreateProviderOptionsContainer() // The ClusterName is set below in the retry loop to ensure // that each create attempt gets a unique cluster name. - createVMOpts, providerOpts, err := cfg.spec.RoachprodOpts("", cfg.useIOBarrier) + createVMOpts, providerOpts, err := cfg.spec.RoachprodOpts("", cfg.useIOBarrier, cfg.arch) + if err != nil { // We must release the allocation because cluster creation is not possible at this point. cfg.alloc.Release() @@ -909,6 +1079,8 @@ func (f *clusterFactory) newCluster( spec: cfg.spec, expiration: cfg.spec.Expiration(), r: f.r, + arch: cfg.arch, + os: cfg.os, destroyState: destroyState{ owned: true, alloc: cfg.alloc, @@ -1698,11 +1870,13 @@ func (c *clusterImpl) PutLibraries( if err := c.RunE(ctx, c.All(), "mkdir", "-p", libraryDir); err != nil { return err } - for _, libraryFilePath := range libraryFilePaths { - if !contains(libraries, nil, libraryNameFromPath(libraryFilePath)) { + + for _, libraryFilePath := range libraryFilePaths[c.arch] { + libName := libraryNameFromPath(libraryFilePath) + if !contains(libraries, nil, libName) { continue } - putPath := filepath.Join(libraryDir, filepath.Base(libraryFilePath)) + putPath := filepath.Join(libraryDir, libName) if err := c.PutE( ctx, c.l, @@ -1728,7 +1902,7 @@ func (c *clusterImpl) Stage( c.status("staging binary") defer c.status("") return errors.Wrap(roachprod.Stage(ctx, l, c.MakeNodes(opts...), - "" /* stageOS */, "" /* stageArch */, dir, application, versionOrSHA), "cluster.Stage") + c.os, string(c.arch), dir, application, 
versionOrSHA), "cluster.Stage") } // Get gets files from remote hosts. @@ -2421,6 +2595,10 @@ func (c *clusterImpl) IsSecure() bool { return c.localCertsDir != "" } +func (c *clusterImpl) Architecture() vm.CPUArch { + return c.arch +} + // Extend extends the cluster's expiration by d. func (c *clusterImpl) Extend(ctx context.Context, d time.Duration, l *logger.Logger) error { if ctx.Err() != nil { @@ -2443,7 +2621,9 @@ func (c *clusterImpl) NewMonitor(ctx context.Context, opts ...option.Option) clu func (c *clusterImpl) StartGrafana( ctx context.Context, l *logger.Logger, promCfg *prometheus.Config, ) error { - return roachprod.StartGrafana(ctx, l, c.name, "", promCfg) + + return roachprod.StartGrafana(ctx, l, c.name, c.arch, "", promCfg) + } func (c *clusterImpl) StopGrafana(ctx context.Context, l *logger.Logger, dumpDir string) error { diff --git a/pkg/cmd/roachtest/cluster/BUILD.bazel b/pkg/cmd/roachtest/cluster/BUILD.bazel index afd151fae747..a5c0b9f53899 100644 --- a/pkg/cmd/roachtest/cluster/BUILD.bazel +++ b/pkg/cmd/roachtest/cluster/BUILD.bazel @@ -16,6 +16,7 @@ go_library( "//pkg/roachprod/install", "//pkg/roachprod/logger", "//pkg/roachprod/prometheus", + "//pkg/roachprod/vm", "@com_github_cockroachdb_errors//:errors", ], ) diff --git a/pkg/cmd/roachtest/cluster/cluster_interface.go b/pkg/cmd/roachtest/cluster/cluster_interface.go index bbd538676ee9..b855cb39b18c 100644 --- a/pkg/cmd/roachtest/cluster/cluster_interface.go +++ b/pkg/cmd/roachtest/cluster/cluster_interface.go @@ -20,6 +20,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/roachprod/install" "github.com/cockroachdb/cockroach/pkg/roachprod/logger" "github.com/cockroachdb/cockroach/pkg/roachprod/prometheus" + "github.com/cockroachdb/cockroach/pkg/roachprod/vm" ) // Cluster is the interface through which a given roachtest interacts with the @@ -107,7 +108,10 @@ type Cluster interface { Spec() spec.ClusterSpec Name() string IsLocal() bool + // IsSecure returns true iff the cluster uses TLS. 
IsSecure() bool + // Returns CPU architecture of the nodes. + Architecture() vm.CPUArch // Deleting CockroachDB data and logs on nodes. diff --git a/pkg/cmd/roachtest/cluster_test.go b/pkg/cmd/roachtest/cluster_test.go index fb26d417e285..4aaddbab3e1a 100644 --- a/pkg/cmd/roachtest/cluster_test.go +++ b/pkg/cmd/roachtest/cluster_test.go @@ -18,6 +18,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec" test2 "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/roachprod/logger" + "github.com/cockroachdb/cockroach/pkg/roachprod/vm" "github.com/cockroachdb/cockroach/pkg/util/version" "github.com/cockroachdb/errors" "github.com/stretchr/testify/assert" @@ -199,14 +200,14 @@ func TestVerifyLibraries(t *testing.T) { name: "no match", verifyLibs: []string{"required_c"}, libraryFilePaths: []string{"/some/path/lib.so"}, - expectedError: errors.Wrap(errors.Errorf("missing required library %s", + expectedError: errors.Wrap(errors.Errorf("missing required library %s (arch=\"amd64\")", "required_c"), "cluster.VerifyLibraries"), }, { name: "no match on nil libs", verifyLibs: []string{"required_b"}, libraryFilePaths: nil, - expectedError: errors.Wrap(errors.Errorf("missing required library %s", + expectedError: errors.Wrap(errors.Errorf("missing required library %s (arch=\"amd64\")", "required_b"), "cluster.VerifyLibraries"), }, { @@ -215,17 +216,29 @@ func TestVerifyLibraries(t *testing.T) { libraryFilePaths: []string{"/lib/geos.so"}, expectedError: nil, }, + { + name: "single match, multiple extensions", + verifyLibs: []string{"geos"}, + libraryFilePaths: []string{"/lib/geos.linux-amd.so"}, + expectedError: nil, + }, { name: "multiple matches", verifyLibs: []string{"lib", "ltwo", "geos"}, libraryFilePaths: []string{"ltwo.so", "a/geos.so", "/some/path/to/lib.so"}, expectedError: nil, }, + { + name: "multiple matches, multiple extensions", + verifyLibs: []string{"lib", "ltwo", "geos"}, + libraryFilePaths: 
[]string{"ltwo.linux-arm64.so", "a/geos.linux-amd64.fips.so", "/some/path/to/lib.darwin-arm64.so"}, + expectedError: nil, + }, } for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { - libraryFilePaths = tc.libraryFilePaths - actualError := VerifyLibraries(tc.verifyLibs) + libraryFilePaths = map[vm.CPUArch][]string{vm.ArchAMD64: tc.libraryFilePaths} + actualError := VerifyLibraries(tc.verifyLibs, vm.ArchAMD64) if tc.expectedError == nil { require.NoError(t, actualError) } else { diff --git a/pkg/cmd/roachtest/github.go b/pkg/cmd/roachtest/github.go index 05768e49d606..8d1414f1e664 100644 --- a/pkg/cmd/roachtest/github.go +++ b/pkg/cmd/roachtest/github.go @@ -125,7 +125,10 @@ func (g *githubIssues) createPostRequest( roachtestPrefix("cpu"): fmt.Sprintf("%d", spec.Cluster.CPUs), roachtestPrefix("ssd"): fmt.Sprintf("%d", spec.Cluster.SSDs), } - + // Emit CPU architecture only if it was specified; otherwise, it's captured below, assuming cluster was created. + if spec.Cluster.Arch != "" { + clusterParams[roachtestPrefix("arch")] = string(spec.Cluster.Arch) + } // These params can be probabilistically set, so we pass them here to // show what their actual values are in the posted issue. if g.vmCreateOpts != nil { @@ -135,6 +138,11 @@ func (g *githubIssues) createPostRequest( if g.cluster != nil { clusterParams[roachtestPrefix("encrypted")] = fmt.Sprintf("%v", g.cluster.encAtRest) + if spec.Cluster.Arch == "" { + // N.B. when Arch is specified, it cannot differ from cluster's arch. + // Hence, we only emit when arch was unspecified. 
+ clusterParams[roachtestPrefix("arch")] = string(g.cluster.arch) + } } return issues.PostRequest{ diff --git a/pkg/cmd/roachtest/github_test.go b/pkg/cmd/roachtest/github_test.go index 8b49dfe86c7a..b2c2f139ad9f 100644 --- a/pkg/cmd/roachtest/github_test.go +++ b/pkg/cmd/roachtest/github_test.go @@ -74,7 +74,8 @@ func TestShouldPost(t *testing.T) { {false, 1, "token", "master", true}, } - reg := makeTestRegistry(spec.GCE, "", "", false, false) + reg, err := makeTestRegistry(spec.GCE, "", "", false, false) + require.NoError(t, err) for _, c := range testCases { t.Setenv("GITHUB_API_TOKEN", c.envGithubAPIToken) @@ -108,34 +109,40 @@ func TestCreatePostRequest(t *testing.T) { clusterCreationFailed bool loadTeamsFailed bool localSSD bool + arch vm.CPUArch category issueCategory expectedPost bool expectedParams map[string]string }{ - {true, false, false, false, otherErr, true, + {true, false, false, false, "", otherErr, true, + prefixAll(map[string]string{ "cloud": "gce", "encrypted": "false", "fs": "ext4", "ssd": "0", "cpu": "4", + "arch": "amd64", "localSSD": "false", }), }, - {true, false, false, true, clusterCreationErr, true, + {true, false, false, true, vm.ArchARM64, clusterCreationErr, true, + prefixAll(map[string]string{ "cloud": "gce", "encrypted": "false", "fs": "ext4", "ssd": "0", "cpu": "4", + "arch": "arm64", "localSSD": "true", }), }, // Assert that release-blocker label exists when !nonReleaseBlocker // Also ensure that in the event of a failed cluster creation, // nil `vmOptions` and `clusterImpl` are not dereferenced - {false, true, false, false, sshErr, true, + {false, true, false, false, "", sshErr, true, + prefixAll(map[string]string{ "cloud": "gce", "ssd": "0", @@ -143,17 +150,14 @@ func TestCreatePostRequest(t *testing.T) { }), }, //Simulate failure loading TEAMS.yaml - {true, false, true, false, otherErr, false, nil}, + {true, false, true, false, "", otherErr, false, nil}, } -<<<<<<< HEAD - reg, _ := makeTestRegistry(spec.GCE, "", "", false) + 
reg, err := makeTestRegistry(spec.GCE, "", "", false, false) + require.NoError(t, err) -======= - reg := makeTestRegistry(spec.GCE, "", "", false, false) ->>>>>>> 0df3a03e781 (roachtest: require perf. tests to opt in via TestSpec.Benchmark) for _, c := range testCases { - clusterSpec := reg.MakeClusterSpec(1) + clusterSpec := reg.MakeClusterSpec(1, spec.Arch(c.arch)) testSpec := &registry.TestSpec{ Name: "github_test", @@ -167,7 +171,7 @@ func TestCreatePostRequest(t *testing.T) { l: nilLogger(), } - testClusterImpl := &clusterImpl{spec: clusterSpec} + testClusterImpl := &clusterImpl{spec: clusterSpec, arch: vm.ArchAMD64} vo := vm.DefaultCreateOpts() vmOpts := &vo diff --git a/pkg/cmd/roachtest/main.go b/pkg/cmd/roachtest/main.go index 9e03a453e81e..a71688ce6652 100644 --- a/pkg/cmd/roachtest/main.go +++ b/pkg/cmd/roachtest/main.go @@ -18,10 +18,12 @@ import ( "os/signal" "os/user" "path/filepath" + "runtime" "time" "github.com/cockroachdb/cockroach/pkg/build" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/tests" "github.com/cockroachdb/cockroach/pkg/roachprod" "github.com/cockroachdb/cockroach/pkg/roachprod/config" @@ -102,16 +104,50 @@ func main() { if cmd.Name() == "help" { return nil } - - if clusterName != "" && local { - return fmt.Errorf( - "cannot specify both an existing cluster (%s) and --local. However, if a local cluster "+ - "already exists, --clusters=local will use it", - clusterName) + local := cmd.Flags().Lookup("local").Value.String() == "true" + if local { + if clusterName != "" { + return fmt.Errorf( + "cannot specify both an existing cluster (%s) and --local. 
However, if a local cluster "+ + "already exists, --clusters=local will use it", + clusterName) + } + cloud = spec.Local } switch cmd.Name() { case "run", "bench", "store-gen": + if !(0 <= arm64Probability && arm64Probability <= 1) { + return fmt.Errorf("'metamorphic-arm64-probability' must be in [0,1]") + } + if !(0 <= fipsProbability && fipsProbability <= 1) { + return fmt.Errorf("'metamorphic-fips-probability' must be in [0,1]") + } + if arm64Probability == 1 && fipsProbability != 0 { + return fmt.Errorf("'metamorphic-fips-probability' must be 0 when 'metamorphic-arm64-probability' is 1") + } + if fipsProbability == 1 && arm64Probability != 0 { + return fmt.Errorf("'metamorphic-arm64-probability' must be 0 when 'metamorphic-fips-probability' is 1") + } + arm64Opt := cmd.Flags().Lookup("metamorphic-arm64-probability") + if !arm64Opt.Changed && runtime.GOARCH == "arm64" && cloud == spec.Local { + fmt.Printf("Detected 'arm64' in 'local mode', setting 'metamorphic-arm64-probability' to 1; use --metamorphic-arm64-probability to run (emulated) with other binaries\n") + arm64Probability = 1 + } + // Find and validate all required binaries and libraries. initBinariesAndLibraries() + + if arm64Probability > 0 { + fmt.Printf("ARM64 clusters will be provisioned with probability %.2f\n", arm64Probability) + } + amd64Probability := 1 - arm64Probability + if amd64Probability > 0 { + fmt.Printf("AMD64 clusters will be provisioned with probability %.2f\n", amd64Probability) + } + if fipsProbability > 0 { + // N.B. arm64Probability < 1, otherwise fipsProbability == 0, as per above check. + // Hence, amd64Probability > 0 is implied. + fmt.Printf("FIPS clusters will be provisioned with probability %.2f\n", fipsProbability*amd64Probability) + } } return nil }, @@ -123,6 +159,7 @@ func main() { "If fewer than --parallelism names are specified, then the parallelism "+ "is capped to the number of clusters specified. 
When a cluster does not exist "+ "yet, it is created according to the spec.") + var local bool rootCmd.PersistentFlags().BoolVarP( &local, "local", "l", local, "run tests locally") rootCmd.PersistentFlags().StringVarP( @@ -130,15 +167,25 @@ func main() { "Username to use as a cluster name prefix. "+ "If blank, the current OS user is detected and specified.") rootCmd.PersistentFlags().StringVar( - &cockroach, "cockroach", "", "path to cockroach binary to use") + &cockroachPath, "cockroach", "", "path to cockroach binary to use") rootCmd.PersistentFlags().StringVar( - &cockroachShort, "cockroach-short", "", "path to cockroach-short binary (compiled with crdb_test build tag) to use") + &cockroachShortPath, "cockroach-short", "", "path to cockroach-short binary (compiled with crdb_test build tag) to use") rootCmd.PersistentFlags().StringVar( - &workload, "workload", "", "path to workload binary to use") + &workloadPath, "workload", "", "path to workload binary to use") rootCmd.PersistentFlags().Float64Var( &encryptionProbability, "metamorphic-encryption-probability", defaultEncryptionProbability, "probability that clusters will be created with encryption-at-rest enabled "+ "for tests that support metamorphic encryption (default 1.0)") + rootCmd.PersistentFlags().Float64Var( + &fipsProbability, "metamorphic-fips-probability", defaultFIPSProbability, + "conditional probability that amd64 clusters will be created with FIPS, i.e., P(fips | amd64), "+ + "for tests that support FIPS and whose CPU architecture is 'amd64' (default 0) "+ + "NOTE: amd64 clusters are created with probability 1-P(arm64), where P(arm64) is 'metamorphic-arm64-probability'. 
"+ + "Hence, P(fips | amd64) = P(fips) * (1 - P(arm64))") + rootCmd.PersistentFlags().Float64Var( + &arm64Probability, "metamorphic-arm64-probability", defaultARM64Probability, + "probability that clusters will be created with 'arm64' CPU architecture "+ + "for tests that support 'arm64' (default 0)") rootCmd.AddCommand(&cobra.Command{ Use: `version`, @@ -224,7 +271,6 @@ runner itself. user: username, clusterID: clusterID, versionsBinaryOverride: versionsBinaryOverride, - enableFIPS: enableFIPS, }, false /* benchOnly */) }, } @@ -263,7 +309,6 @@ runner itself. user: username, clusterID: clusterID, versionsBinaryOverride: versionsBinaryOverride, - enableFIPS: enableFIPS, }, true /* benchOnly */) }, } @@ -378,7 +423,7 @@ func runTests(register func(registry.Registry), cfg cliCfg, benchOnly bool) erro filter := registry.NewTestFilter(cfg.args) clusterType := roachprodCluster bindTo := "" - if local { + if cloud == spec.Local { clusterType = localCluster // This will suppress the annoying "Allow incoming network connections" popup from diff --git a/pkg/cmd/roachtest/slack.go b/pkg/cmd/roachtest/slack.go index 1e653d142f61..40f2505003e0 100644 --- a/pkg/cmd/roachtest/slack.go +++ b/pkg/cmd/roachtest/slack.go @@ -75,8 +75,6 @@ func postSlackReport(pass, fail, skip map[*testImpl]struct{}) { switch { case cloud != "": prefix = strings.ToUpper(cloud) - case local: - prefix = "LOCAL" default: prefix = "GCE" } diff --git a/pkg/cmd/roachtest/spec/cluster_spec.go b/pkg/cmd/roachtest/spec/cluster_spec.go index dfae8d44733b..592a7054c268 100644 --- a/pkg/cmd/roachtest/spec/cluster_spec.go +++ b/pkg/cmd/roachtest/spec/cluster_spec.go @@ -40,7 +40,8 @@ const ( // look like. It becomes part of a clusterConfig when the cluster is created. 
type ClusterSpec struct { Cloud string - InstanceType string // auto-chosen if left empty + Arch vm.CPUArch // CPU architecture; auto-chosen if left empty + InstanceType string // auto-chosen if left empty NodeCount int // CPUs is the number of CPUs per node. CPUs int @@ -156,7 +157,7 @@ func getAzureOpts(machineType string, zones []string) vm.ProviderOpts { // RoachprodOpts returns the opts to use when calling `roachprod.Create()` // in order to create the cluster described in the spec. func (s *ClusterSpec) RoachprodOpts( - clusterName string, useIOBarrier bool, + clusterName string, useIOBarrier bool, arch vm.CPUArch, ) (vm.CreateOpts, vm.ProviderOpts, error) { createVMOpts := vm.DefaultCreateOpts() @@ -187,29 +188,41 @@ func (s *ClusterSpec) RoachprodOpts( } createVMOpts.GeoDistributed = s.Geo + createVMOpts.Arch = string(arch) machineType := s.InstanceType ssdCount := s.SSDs + if s.CPUs != 0 { // Default to the user-supplied machine type, if any. // Otherwise, pick based on requested CPU count. + var selectedArch vm.CPUArch + if len(machineType) == 0 { // If no machine type was specified, choose one // based on the cloud and CPU count. switch s.Cloud { case AWS: - machineType = AWSMachineType(s.CPUs, s.HighMem) + machineType, selectedArch = AWSMachineType(s.CPUs, s.HighMem, arch) case GCE: - machineType = GCEMachineType(s.CPUs, s.HighMem) + machineType, selectedArch = GCEMachineType(s.CPUs, s.HighMem, arch) case Azure: machineType = AzureMachineType(s.CPUs, s.HighMem) } } + if selectedArch != "" && selectedArch != arch { + // TODO(srosenberg): we need a better way to monitor the rate of this mismatch, i.e., + // other than grepping cluster creation logs. 
+ fmt.Printf("WARN: requested arch %s for machineType %s, but selected %s\n", arch, machineType, selectedArch) + createVMOpts.Arch = string(selectedArch) + } // Local SSD can only be requested // - if configured to prefer doing so, // - if no particular volume size is requested, and, // - on AWS, if the machine type supports it. - if s.PreferLocalSSD && s.VolumeSize == 0 && (s.Cloud != AWS || awsMachineSupportsSSD(machineType)) { + // - on GCE, if the machine type is not ARM64. + if s.PreferLocalSSD && s.VolumeSize == 0 && (s.Cloud != AWS || awsMachineSupportsSSD(machineType)) && + (s.Cloud != GCE || selectedArch != vm.ArchARM64) { // Ensure SSD count is at least 1 if UseLocalSSD is true. if ssdCount == 0 { ssdCount = 1 @@ -242,6 +255,12 @@ func (s *ClusterSpec) RoachprodOpts( } } + if createVMOpts.Arch == string(vm.ArchFIPS) && !(s.Cloud == GCE || s.Cloud == AWS) { + return vm.CreateOpts{}, nil, errors.Errorf( + "FIPS not yet supported on %s", s.Cloud, + ) + } + var providerOpts vm.ProviderOpts switch s.Cloud { case AWS: diff --git a/pkg/cmd/roachtest/spec/machine_type.go b/pkg/cmd/roachtest/spec/machine_type.go index af77a4e62e64..db9ce94520ee 100644 --- a/pkg/cmd/roachtest/spec/machine_type.go +++ b/pkg/cmd/roachtest/spec/machine_type.go @@ -10,15 +10,31 @@ package spec -import "fmt" +import ( + "fmt" + + "github.com/cockroachdb/cockroach/pkg/roachprod/vm" +) // AWSMachineType selects a machine type given the desired number of CPUs. -func AWSMachineType(cpus int, highmem bool) string { +// Also returns the architecture of the selected machine type. +func AWSMachineType(cpus int, highmem bool, arch vm.CPUArch) (string, vm.CPUArch) { // TODO(erikgrinaker): These have significantly less RAM than // their GCE counterparts. Consider harmonizing them. 
family := "c5d" // 2 GB RAM per CPU + selectedArch := vm.ArchAMD64 + if arch == vm.ArchFIPS { + selectedArch = vm.ArchFIPS + } else if arch == vm.ArchARM64 { + family = "c7g" // 2 GB RAM per CPU (graviton3) + selectedArch = vm.ArchARM64 + } + if highmem { family = "m5d" // 4 GB RAM per CPU + if arch == vm.ArchARM64 { + family = "m7g" // 4 GB RAM per CPU (graviton3) + } } var size string @@ -33,36 +49,63 @@ func AWSMachineType(cpus int, highmem bool) string { size = "4xlarge" case cpus <= 36: size = "9xlarge" + if family == "c7g" || family == "m7g" { + size = "8xlarge" + } case cpus <= 72: size = "18xlarge" + if family == "c7g" || family == "m7g" { + size = "16xlarge" + } case cpus <= 96: size = "24xlarge" default: panic(fmt.Sprintf("no aws machine type with %d cpus", cpus)) } - // There is no c5d.24xlarge. + // There is no m7g.24xlarge, fall back to m5d.24xlarge. + if family == "m7g" && size == "24xlarge" { + family = "m5d" + selectedArch = vm.ArchAMD64 + } + // There is no c7g.24xlarge, fall back to c5d.24xlarge. + if family == "c7g" && size == "24xlarge" { + family = "c5d" + selectedArch = vm.ArchAMD64 + } + + // There is no c5d.24xlarge, fall back to m5d.24xlarge. if family == "c5d" && size == "24xlarge" { family = "m5d" } - return fmt.Sprintf("%s.%s", family, size) + return fmt.Sprintf("%s.%s", family, size), selectedArch } // GCEMachineType selects a machine type given the desired number of CPUs. -func GCEMachineType(cpus int, highmem bool) string { +// Also returns the architecture of the selected machine type. +func GCEMachineType(cpus int, highmem bool, arch vm.CPUArch) (string, vm.CPUArch) { // TODO(peter): This is awkward: at or below 16 cpus, use n1-standard so that // the machines have a decent amount of RAM. We could use custom machine // configurations, but the rules for the amount of RAM per CPU need to be // determined (you can't request any arbitrary amount of RAM). 
series := "n1" + selectedArch := vm.ArchAMD64 + if arch == vm.ArchFIPS { + selectedArch = vm.ArchFIPS + } kind := "standard" // 3.75 GB RAM per CPU if highmem { kind = "highmem" // 6.5 GB RAM per CPU } else if cpus > 16 { kind = "highcpu" // 0.9 GB RAM per CPU } - return fmt.Sprintf("%s-%s-%d", series, kind, cpus) + if arch == vm.ArchARM64 && !highmem && cpus <= 48 { + series = "t2a" + kind = "standard" + selectedArch = vm.ArchARM64 + } + return fmt.Sprintf("%s-%s-%d", series, kind, cpus), selectedArch } // AzureMachineType selects a machine type given the desired number of CPUs. diff --git a/pkg/cmd/roachtest/spec/option.go b/pkg/cmd/roachtest/spec/option.go index 00acc79c0dcf..ee8080f2adc2 100644 --- a/pkg/cmd/roachtest/spec/option.go +++ b/pkg/cmd/roachtest/spec/option.go @@ -10,13 +10,39 @@ package spec -import "time" +import ( + "time" + + "github.com/cockroachdb/cockroach/pkg/roachprod/vm" +) // Option is the interface satisfied by options to MakeClusterSpec. type Option interface { apply(spec *ClusterSpec) } +type cloudOption string + +func (o cloudOption) apply(spec *ClusterSpec) { + spec.Cloud = string(o) +} + +// Cloud controls what cloud is used to create the cluster. +func Cloud(s string) Option { + return cloudOption(s) +} + +type archOption string + +func (o archOption) apply(spec *ClusterSpec) { + spec.Arch = vm.CPUArch(o) +} + +// Request specific CPU architecture. +func Arch(arch vm.CPUArch) Option { + return archOption(arch) +} + type nodeCPUOption int func (o nodeCPUOption) apply(spec *ClusterSpec) { diff --git a/pkg/cmd/roachtest/test_impl.go b/pkg/cmd/roachtest/test_impl.go index a3600257fa51..23424b7da8f4 100644 --- a/pkg/cmd/roachtest/test_impl.go +++ b/pkg/cmd/roachtest/test_impl.go @@ -122,6 +122,7 @@ func (t *testImpl) BuildVersion() *version.Version { return &t.buildVersion } +// Cockroach returns the path to the cockroach binary. 
func (t *testImpl) Cockroach() string { return t.cockroach } diff --git a/pkg/cmd/roachtest/test_registry_test.go b/pkg/cmd/roachtest/test_registry_test.go index 9ab9c2780537..5678d25a09d2 100644 --- a/pkg/cmd/roachtest/test_registry_test.go +++ b/pkg/cmd/roachtest/test_registry_test.go @@ -14,6 +14,7 @@ import ( "testing" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec" + "github.com/cockroachdb/cockroach/pkg/roachprod/vm" "github.com/cockroachdb/cockroach/pkg/testutils" "github.com/stretchr/testify/require" ) @@ -41,5 +42,11 @@ func TestMakeTestRegistry(t *testing.T) { require.Equal(t, "foo", s.InstanceType) require.EqualValues(t, 4, s.CPUs) require.True(t, s.TerminateOnMigration) + + s = r.MakeClusterSpec(10, spec.CPU(16), spec.Arch(vm.ArchARM64)) + require.EqualValues(t, 10, s.NodeCount) + require.Equal(t, "foo", s.InstanceType) + require.EqualValues(t, 16, s.CPUs) + require.EqualValues(t, vm.ArchARM64, s.Arch) }) } diff --git a/pkg/cmd/roachtest/test_runner.go b/pkg/cmd/roachtest/test_runner.go index 7160cd4b9ec3..1c165880601a 100644 --- a/pkg/cmd/roachtest/test_runner.go +++ b/pkg/cmd/roachtest/test_runner.go @@ -58,6 +58,8 @@ var ( // reference error used when cluster creation fails for a test errClusterProvisioningFailed = fmt.Errorf("cluster could not be created") + + prng, _ = randutil.NewLockedPseudoRand() ) // testRunner runs tests. 
@@ -351,11 +353,12 @@ func defaultClusterAllocator( allocateCluster := func( ctx context.Context, t registry.TestSpec, + arch vm.CPUArch, alloc *quotapool.IntAlloc, artifactsDir string, wStatus *workerStatus, ) (*clusterImpl, *vm.CreateOpts, error) { - wStatus.SetStatus("creating cluster") + wStatus.SetStatus(fmt.Sprintf("creating cluster (arch=%q)", arch)) defer wStatus.SetStatus("") existingClusterName := clustersOpt.clusterName @@ -372,6 +375,9 @@ func defaultClusterAllocator( skipStop: r.config.skipClusterStopOnAttach, skipWipe: r.config.skipClusterWipeOnAttach, } + // TODO(srosenberg): we need to think about validation here. Attaching to an incompatible cluster, e.g., + // using arm64 AMI with amd64 binary, would result in obscure errors. The test runner ensures compatibility + // during cluster reuse, whereas attachment via CLI (e.g., via roachprod) does not. lopt.l.PrintfCtx(ctx, "Attaching to existing cluster %s for test %s", existingClusterName, t.Name) c, err := attachToExistingCluster(ctx, existingClusterName, clusterL, t.Cluster, opt, r.cr) if err == nil { @@ -382,11 +388,11 @@ func defaultClusterAllocator( } // Fall through to create new cluster with name override. 
lopt.l.PrintfCtx( - ctx, "Creating new cluster with custom name %q for test %s: %s", - clustersOpt.clusterName, t.Name, t.Cluster, + ctx, "Creating new cluster with custom name %q for test %s: %s (arch=%q)", + clustersOpt.clusterName, t.Name, t.Cluster, arch, ) } else { - lopt.l.PrintfCtx(ctx, "Creating new cluster for test %s: %s", t.Name, t.Cluster) + lopt.l.PrintfCtx(ctx, "Creating new cluster for test %s: %s (arch=%q)", t.Name, t.Cluster, arch) } cfg := clusterConfig{ @@ -396,6 +402,7 @@ func defaultClusterAllocator( username: clustersOpt.user, localCluster: clustersOpt.typ == localCluster, alloc: alloc, + arch: arch, } return clusterFactory.newCluster(ctx, cfg, wStatus.SetStatus, lopt.tee) } @@ -405,6 +412,7 @@ func defaultClusterAllocator( type clusterAllocatorFn func( ctx context.Context, t registry.TestSpec, + arch vm.CPUArch, alloc *quotapool.IntAlloc, artifactsDir string, wStatus *workerStatus, @@ -485,8 +493,6 @@ func (r *testRunner) runWorker( } }() - prng, _ := randutil.NewPseudoRand() - // Loop until there's no more work in the pool, we get interrupted, or an // error occurs. for { @@ -532,7 +538,7 @@ func (r *testRunner) runWorker( // Attempt to reuse existing cluster. if c != nil && testToRun.canReuseCluster { err = func() error { - l.PrintfCtx(ctx, "Using existing cluster: %s. Wiping", c.name) + l.PrintfCtx(ctx, "Using existing cluster: %s (arch=%q). Wiping", c.name, c.arch) if err := c.WipeE(ctx, l); err != nil { return err } @@ -559,10 +565,48 @@ func (r *testRunner) runWorker( // Let's attempt to create a fresh one. testToRun.canReuseCluster = false } + // sanity check + if c.spec.Cloud != spec.Local && c.spec.Arch != "" && c.arch != c.spec.Arch { + return errors.Newf("cluster arch %q does not match specified arch %q on cloud: %q", c.arch, c.spec.Arch, c.spec.Cloud) + } + } + arch := testToRun.spec.Cluster.Arch + // N.B. local cluster can mix different CPU architectures via emulation; e.g., mac silicon running x86. 
+ if testToRun.canReuseCluster && c != nil && c.spec.Cloud != spec.Local { + // We're reusing a non-local cluster, so we must use the same arch. + arch = c.arch + } + if arch == "" { + // CPU architecture is unspecified, choose one according to the probability distribution. + arch = vm.ArchAMD64 + if prng.Float64() < arm64Probability { + arch = vm.ArchARM64 + } else if prng.Float64() < fipsProbability { + // N.B. branch is taken with probability (1 - arm64Probability) * fipsProbability which is P(fips | amd64). + // N.B. FIPS is only supported on 'amd64' at this time. + arch = vm.ArchFIPS + } + if testToRun.spec.Benchmark { + // TODO(srosenberg): enable after https://github.com/cockroachdb/cockroach/issues/104213 + l.PrintfCtx(ctx, "Disabling randomly chosen arch=%q, %s", arch, testToRun.spec.Name) + arch = vm.ArchAMD64 + } + l.PrintfCtx(ctx, "Using randomly chosen arch=%q, %s", arch, testToRun.spec.Name) + } else { + l.PrintfCtx(ctx, "Using specified arch=%q, %s", arch, testToRun.spec.Name) + } + // N.B. if canReuseCluster is false, then the previous cluster has been destroyed; new one will be created below. + if testToRun.canReuseCluster && c != nil && c.arch != arch { + // Non-local cluster that's being reused must have the same architecture as was ensured above. + if c.spec.Cloud != spec.Local { + return errors.New("infeasible path: non-local cluster arch mismatch") + } + // Local cluster is now reused to emulate a different CPU architecture. + c.arch = arch } // Verify that required native libraries are available. - if err = VerifyLibraries(testToRun.spec.NativeLibs); err != nil { + if err = VerifyLibraries(testToRun.spec.NativeLibs, arch); err != nil { shout(ctx, l, stdout, "Library verification failed: %s", err) return err } @@ -574,13 +618,14 @@ func (r *testRunner) runWorker( // Create a new cluster if can't reuse or reuse attempt failed. // N.B. non-reusable cluster would have been destroyed above. 
wStatus.SetTest(nil /* test */, testToRun) - wStatus.SetStatus("creating cluster") - c, vmCreateOpts, clusterCreateErr = allocateCluster(ctx, testToRun.spec, testToRun.alloc, artifactsRootDir, wStatus) + c, vmCreateOpts, clusterCreateErr = allocateCluster(ctx, testToRun.spec, arch, testToRun.alloc, artifactsRootDir, wStatus) if clusterCreateErr != nil { clusterCreateErr = errors.Mark(clusterCreateErr, errClusterProvisioningFailed) atomic.AddInt32(&r.numClusterErrs, 1) shout(ctx, l, stdout, "Unable to create (or reuse) cluster for test %s due to: %s.", testToRun.spec.Name, clusterCreateErr) + } else { + l.PrintfCtx(ctx, "Created new cluster for test %s: %s (arch=%q)", testToRun.spec.Name, c.Name(), arch) } } // Prepare the test's logger. Always set this up with real files, using a @@ -606,9 +651,9 @@ func (r *testRunner) runWorker( } t := &testImpl{ spec: &testToRun.spec, - cockroach: cockroach, - cockroachShort: cockroachShort, - deprecatedWorkload: workload, + cockroach: cockroach[arch], + cockroachShort: cockroachShort[arch], + deprecatedWorkload: workload[arch], buildVersion: r.buildVersion, artifactsDir: artifactsDir, artifactsSpec: artifactsSpec, @@ -633,6 +678,9 @@ func (r *testRunner) runWorker( shout(ctx, l, stdout, "failed to post issue: %s", err) } } else { + // Now run the test. 
+ l.PrintfCtx(ctx, "Starting test: %s:%d on cluster=%s (arch=%q)", testToRun.spec.Name, testToRun.runNum, c.Name(), arch) + c.setTest(t) err = c.PutLibraries(ctx, "./lib", t.spec.NativeLibs) diff --git a/pkg/cmd/roachtest/test_test.go b/pkg/cmd/roachtest/test_test.go index b42236a7dbd8..941dfd1ac50b 100644 --- a/pkg/cmd/roachtest/test_test.go +++ b/pkg/cmd/roachtest/test_test.go @@ -98,6 +98,7 @@ func nilLogger() *logger.Logger { func alwaysFailingClusterAllocator( ctx context.Context, t registry.TestSpec, + arch vm.CPUArch, alloc *quotapool.IntAlloc, artifactsDir string, wStatus *workerStatus, diff --git a/pkg/cmd/roachtest/tests/BUILD.bazel b/pkg/cmd/roachtest/tests/BUILD.bazel index 45ed875dc05a..304a41cf4ff0 100644 --- a/pkg/cmd/roachtest/tests/BUILD.bazel +++ b/pkg/cmd/roachtest/tests/BUILD.bazel @@ -184,6 +184,7 @@ go_library( "//pkg/roachprod/install", "//pkg/roachprod/logger", "//pkg/roachprod/prometheus", + "//pkg/roachprod/vm", "//pkg/security/username", "//pkg/server", "//pkg/server/serverpb", diff --git a/pkg/cmd/roachtest/tests/autoupgrade.go b/pkg/cmd/roachtest/tests/autoupgrade.go index 88b308b10ae2..c3aef0c3dea4 100644 --- a/pkg/cmd/roachtest/tests/autoupgrade.go +++ b/pkg/cmd/roachtest/tests/autoupgrade.go @@ -260,6 +260,7 @@ func registerAutoUpgrade(r registry.Registry) { t.Skip("Skip under ARM64. 
See https://github.com/cockroachdb/cockroach/issues/89268") } pred, err := PredecessorVersion(*t.BuildVersion()) + if err != nil { t.Fatal(err) } diff --git a/pkg/cmd/roachtest/tests/cdc.go b/pkg/cmd/roachtest/tests/cdc.go index 9a7cba8726b8..a656127e1ca0 100644 --- a/pkg/cmd/roachtest/tests/cdc.go +++ b/pkg/cmd/roachtest/tests/cdc.go @@ -25,7 +25,6 @@ import ( "net/url" "path/filepath" "regexp" - "runtime" "sort" "strconv" "strings" @@ -44,6 +43,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/jobs/jobspb" "github.com/cockroachdb/cockroach/pkg/roachprod/install" "github.com/cockroachdb/cockroach/pkg/roachprod/logger" + "github.com/cockroachdb/cockroach/pkg/roachprod/vm" "github.com/cockroachdb/cockroach/pkg/testutils/sqlutils" "github.com/cockroachdb/cockroach/pkg/util/log" "github.com/cockroachdb/cockroach/pkg/util/protoutil" @@ -313,9 +313,6 @@ func cdcBasicTest(ctx context.Context, t test.Test, c cluster.Cluster, args cdcT } func runCDCBank(ctx context.Context, t test.Test, c cluster.Cluster) { - if runtime.GOARCH == "arm64" { - t.Skip("Skipping cdc/bank under ARM64.") - } // Make the logs dir on every node to work around the `roachprod get logs` // spam. c.Run(ctx, c.All(), `mkdir -p logs`) @@ -683,9 +680,11 @@ func runCDCKafkaAuth(ctx context.Context, t test.Test, c cluster.Cluster) { func registerCDC(r registry.Registry) { r.Add(registry.TestSpec{ - Name: "cdc/tpcc-1000", - Owner: registry.OwnerCDC, - Cluster: r.MakeClusterSpec(4, spec.CPU(16)), + Name: "cdc/tpcc-1000", + Owner: registry.OwnerCDC, + Benchmark: true, + // N.B. ARM64 is not yet supported, see https://github.com/cockroachdb/cockroach/issues/103888. 
+ Cluster: r.MakeClusterSpec(4, spec.CPU(16), spec.Arch(vm.ArchAMD64)), RequiresLicense: true, Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { cdcBasicTest(ctx, t, c, cdcTestArgs{ @@ -698,9 +697,11 @@ func registerCDC(r registry.Registry) { }, }) r.Add(registry.TestSpec{ - Name: "cdc/tpcc-1000/sink=null", - Owner: registry.OwnerCDC, - Cluster: r.MakeClusterSpec(4, spec.CPU(16)), + Name: "cdc/tpcc-1000/sink=null", + Owner: registry.OwnerCDC, + Benchmark: true, + // N.B. ARM64 is not yet supported, see https://github.com/cockroachdb/cockroach/issues/103888. + Cluster: r.MakeClusterSpec(4, spec.CPU(16), spec.Arch(vm.ArchAMD64)), Tags: []string{"manual"}, RequiresLicense: true, Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { @@ -715,9 +716,11 @@ func registerCDC(r registry.Registry) { }, }) r.Add(registry.TestSpec{ - Name: "cdc/initial-scan", - Owner: registry.OwnerCDC, - Cluster: r.MakeClusterSpec(4, spec.CPU(16)), + Name: "cdc/initial-scan", + Owner: registry.OwnerCDC, + Benchmark: true, + // N.B. ARM64 is not yet supported, see https://github.com/cockroachdb/cockroach/issues/103888. + Cluster: r.MakeClusterSpec(4, spec.CPU(16), spec.Arch(vm.ArchAMD64)), RequiresLicense: true, Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { cdcBasicTest(ctx, t, c, cdcTestArgs{ @@ -731,9 +734,11 @@ func registerCDC(r registry.Registry) { }, }) r.Add(registry.TestSpec{ - Name: "cdc/sink-chaos", - Owner: `cdc`, - Cluster: r.MakeClusterSpec(4, spec.CPU(16)), + Name: "cdc/sink-chaos", + Owner: `cdc`, + Benchmark: true, + // N.B. ARM64 is not yet supported, see https://github.com/cockroachdb/cockroach/issues/103888. 
+ Cluster: r.MakeClusterSpec(4, spec.CPU(16), spec.Arch(vm.ArchAMD64)), RequiresLicense: true, Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { cdcBasicTest(ctx, t, c, cdcTestArgs{ @@ -747,9 +752,11 @@ func registerCDC(r registry.Registry) { }, }) r.Add(registry.TestSpec{ - Name: "cdc/crdb-chaos", - Owner: `cdc`, - Cluster: r.MakeClusterSpec(4, spec.CPU(16)), + Name: "cdc/crdb-chaos", + Owner: `cdc`, + Benchmark: true, + // N.B. ARM64 is not yet supported, see https://github.com/cockroachdb/cockroach/issues/103888. + Cluster: r.MakeClusterSpec(4, spec.CPU(16), spec.Arch(vm.ArchAMD64)), RequiresLicense: true, Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { cdcBasicTest(ctx, t, c, cdcTestArgs{ @@ -770,7 +777,9 @@ func registerCDC(r registry.Registry) { // TODO(mrtracy): This workload is designed to be running on a 20CPU nodes, // but this cannot be allocated without some sort of configuration outside // of this test. Look into it. - Cluster: r.MakeClusterSpec(4, spec.CPU(16)), + Benchmark: true, + // N.B. ARM64 is not yet supported, see https://github.com/cockroachdb/cockroach/issues/103888. + Cluster: r.MakeClusterSpec(4, spec.CPU(16), spec.Arch(vm.ArchAMD64)), RequiresLicense: true, Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { cdcBasicTest(ctx, t, c, cdcTestArgs{ @@ -789,9 +798,11 @@ func registerCDC(r registry.Registry) { }, }) r.Add(registry.TestSpec{ - Name: "cdc/cloud-sink-gcs/rangefeed=true", - Owner: `cdc`, - Cluster: r.MakeClusterSpec(4, spec.CPU(16)), + Name: "cdc/cloud-sink-gcs/rangefeed=true", + Owner: `cdc`, + Benchmark: true, + // N.B. ARM64 is not yet supported, see https://github.com/cockroachdb/cockroach/issues/103888. 
+ Cluster: r.MakeClusterSpec(4, spec.CPU(16), spec.Arch(vm.ArchAMD64)), RequiresLicense: true, Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { cdcBasicTest(ctx, t, c, cdcTestArgs{ @@ -810,9 +821,11 @@ func registerCDC(r registry.Registry) { }, }) r.Add(registry.TestSpec{ - Name: "cdc/pubsub-sink", - Owner: `cdc`, - Cluster: r.MakeClusterSpec(4, spec.CPU(16)), + Name: "cdc/pubsub-sink", + Owner: `cdc`, + Benchmark: true, + // N.B. ARM64 is not yet supported, see https://github.com/cockroachdb/cockroach/issues/103888. + Cluster: r.MakeClusterSpec(4, spec.CPU(16), spec.Arch(vm.ArchAMD64)), RequiresLicense: true, Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { cdcBasicTest(ctx, t, c, cdcTestArgs{ @@ -836,9 +849,11 @@ func registerCDC(r registry.Registry) { // TODO(rui): Change to a shorter test as it just needs to validate // permissions and shouldn't need to run a full 30m workload. r.Add(registry.TestSpec{ - Name: "cdc/pubsub-sink/assume-role", - Owner: `cdc`, - Cluster: r.MakeClusterSpec(4, spec.CPU(16)), + Name: "cdc/pubsub-sink/assume-role", + Owner: `cdc`, + Benchmark: true, + // N.B. ARM64 is not yet supported, see https://github.com/cockroachdb/cockroach/issues/103888. + Cluster: r.MakeClusterSpec(4, spec.CPU(16), spec.Arch(vm.ArchAMD64)), RequiresLicense: true, Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { cdcBasicTest(ctx, t, c, cdcTestArgs{ @@ -863,9 +878,11 @@ func registerCDC(r registry.Registry) { // TODO(rui): Change to a shorter test as it just needs to validate // permissions and shouldn't need to run a full 30m workload. r.Add(registry.TestSpec{ - Name: "cdc/cloud-sink-gcs/assume-role", - Owner: `cdc`, - Cluster: r.MakeClusterSpec(4, spec.CPU(16)), + Name: "cdc/cloud-sink-gcs/assume-role", + Owner: `cdc`, + Benchmark: true, + // N.B. ARM64 is not yet supported, see https://github.com/cockroachdb/cockroach/issues/103888. 
+ Cluster: r.MakeClusterSpec(4, spec.CPU(16), spec.Arch(vm.ArchAMD64)), RequiresLicense: true, Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { cdcBasicTest(ctx, t, c, cdcTestArgs{ @@ -903,18 +920,21 @@ func registerCDC(r registry.Registry) { }) */ r.Add(registry.TestSpec{ - Name: "cdc/kafka-auth", - Owner: `cdc`, - Cluster: r.MakeClusterSpec(1), + Name: "cdc/kafka-auth", + Owner: `cdc`, + Benchmark: true, + // N.B. ARM64 is not yet supported, see https://github.com/cockroachdb/cockroach/issues/103888. + Cluster: r.MakeClusterSpec(1, spec.Arch(vm.ArchAMD64)), RequiresLicense: true, Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { runCDCKafkaAuth(ctx, t, c) }, }) r.Add(registry.TestSpec{ - Name: "cdc/bank", - Owner: `cdc`, - Cluster: r.MakeClusterSpec(4), + Name: "cdc/bank", + Owner: `cdc`, + // N.B. ARM64 is not yet supported, see https://github.com/cockroachdb/cockroach/issues/103888. + Cluster: r.MakeClusterSpec(4, spec.Arch(vm.ArchAMD64)), RequiresLicense: true, Timeout: 30 * time.Minute, Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { diff --git a/pkg/cmd/roachtest/tests/follower_reads.go b/pkg/cmd/roachtest/tests/follower_reads.go index a2ee91ef5a7b..cebb3c439cb2 100644 --- a/pkg/cmd/roachtest/tests/follower_reads.go +++ b/pkg/cmd/roachtest/tests/follower_reads.go @@ -19,7 +19,6 @@ import ( "net/http" "reflect" "regexp" - "runtime" "strconv" "strings" "time" @@ -100,9 +99,6 @@ func registerFollowerReads(r registry.Registry) { spec.CPU(2), ), Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { - if runtime.GOARCH == "arm64" { - t.Skip("Skip under ARM64. 
See https://github.com/cockroachdb/cockroach/issues/89268") - } runFollowerReadsMixedVersionSingleRegionTest(ctx, t, c, *t.BuildVersion()) }, }) diff --git a/pkg/cmd/roachtest/tests/import.go b/pkg/cmd/roachtest/tests/import.go index 89e006e30589..5e6e35741fb0 100644 --- a/pkg/cmd/roachtest/tests/import.go +++ b/pkg/cmd/roachtest/tests/import.go @@ -359,6 +359,7 @@ func registerImportMixedVersion(r registry.Registry) { t.Skip("Skip under ARM64. See https://github.com/cockroachdb/cockroach/issues/89268") } predV, err := PredecessorVersion(*t.BuildVersion()) + if err != nil { t.Fatal(err) } diff --git a/pkg/cmd/roachtest/tests/indexes.go b/pkg/cmd/roachtest/tests/indexes.go index cb2cdff6124b..2e2e619531d7 100644 --- a/pkg/cmd/roachtest/tests/indexes.go +++ b/pkg/cmd/roachtest/tests/indexes.go @@ -140,9 +140,3 @@ func registerNIndexes(r registry.Registry, secondaryIndexes int) { func registerIndexes(r registry.Registry) { registerNIndexes(r, 2) } - -func registerIndexesBench(r registry.Registry) { - for i := 0; i <= 100; i++ { - registerNIndexes(r, i) - } -} diff --git a/pkg/cmd/roachtest/tests/mixed_version_cdc.go b/pkg/cmd/roachtest/tests/mixed_version_cdc.go index 1a86b1977add..e5f291e99f39 100644 --- a/pkg/cmd/roachtest/tests/mixed_version_cdc.go +++ b/pkg/cmd/roachtest/tests/mixed_version_cdc.go @@ -25,6 +25,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" + "github.com/cockroachdb/cockroach/pkg/roachprod/vm" "github.com/cockroachdb/cockroach/pkg/util/randutil" "github.com/cockroachdb/cockroach/pkg/util/syncutil" "github.com/cockroachdb/cockroach/pkg/util/timeutil" @@ -65,9 +66,10 @@ func registerCDCMixedVersions(r registry.Registry) { zones = teamcityAgentZone } r.Add(registry.TestSpec{ - Name: "cdc/mixed-versions", - Owner: registry.OwnerTestEng, - Cluster: r.MakeClusterSpec(5, spec.Zones(zones)), + Name: 
"cdc/mixed-versions", + Owner: registry.OwnerTestEng, + // N.B. ARM64 is not yet supported, see https://github.com/cockroachdb/cockroach/issues/103888. + Cluster: r.MakeClusterSpec(5, spec.Zones(zones), spec.Arch(vm.ArchAMD64)), Timeout: timeout, RequiresLicense: true, Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { diff --git a/pkg/cmd/roachtest/tests/mixed_version_schemachange.go b/pkg/cmd/roachtest/tests/mixed_version_schemachange.go index 797fd97ce65b..e8e53e438e94 100644 --- a/pkg/cmd/roachtest/tests/mixed_version_schemachange.go +++ b/pkg/cmd/roachtest/tests/mixed_version_schemachange.go @@ -34,6 +34,7 @@ func registerSchemaChangeMixedVersions(r registry.Registry) { if runtime.GOARCH == "arm64" { t.Skip("Skip under ARM64. See https://github.com/cockroachdb/cockroach/issues/89268") } + maxOps := 100 concurrency := 5 if c.IsLocal() { diff --git a/pkg/cmd/roachtest/tests/rebalance_load.go b/pkg/cmd/roachtest/tests/rebalance_load.go index d3223a0000d2..81485336de6e 100644 --- a/pkg/cmd/roachtest/tests/rebalance_load.go +++ b/pkg/cmd/roachtest/tests/rebalance_load.go @@ -15,7 +15,6 @@ import ( gosql "database/sql" "fmt" "math/rand" - "runtime" "sort" "strings" "time" @@ -157,9 +156,6 @@ func registerRebalanceLoad(r registry.Registry) { Owner: registry.OwnerKV, Cluster: r.MakeClusterSpec(4), // the last node is just used to generate load Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { - if runtime.GOARCH == "arm64" { - t.Skip("Skip under ARM64. See https://github.com/cockroachdb/cockroach/issues/89268") - } if c.IsLocal() { concurrency = 32 fmt.Printf("lowering concurrency to %d in local testing\n", concurrency) diff --git a/pkg/cmd/roachtest/tests/tpcc.go b/pkg/cmd/roachtest/tests/tpcc.go index c22666379bcc..5333ce823c89 100644 --- a/pkg/cmd/roachtest/tests/tpcc.go +++ b/pkg/cmd/roachtest/tests/tpcc.go @@ -297,6 +297,7 @@ var tpccSupportedWarehouses = []struct { // TODO(tbg): this number is copied from gce-n4cpu16. 
The real number should be a // little higher, find out what it is. {hardware: "gce-n5cpu16", v: version.MustParse(`v19.1.0-0`), warehouses: 1300}, + {hardware: "aws-n5cpu16", v: version.MustParse(`v19.1.0-0`), warehouses: 2100}, // Ditto. {hardware: "gce-n5cpu16", v: version.MustParse(`v2.1.0-0`), warehouses: 1300}, } @@ -504,6 +505,7 @@ func registerTPCC(r registry.Registry) { runTPCCMixedHeadroom(ctx, t, c, cloud, 1) }, }) + r.Add(registry.TestSpec{ // run the same mixed-headroom test, but going back two versions Name: "tpcc/mixed-headroom/multiple-upgrades/" + mixedHeadroomSpec.String(), @@ -1383,117 +1385,6 @@ func runTPCCBench(ctx context.Context, t test.Test, c cluster.Cluster, b tpccBen } } -func registerTPCCBench(r registry.Registry) { - specs := []tpccBenchSpec{ - { - Nodes: 3, - CPUs: 4, - - LoadWarehouses: 1000, - EstimatedMax: 325, - }, - { - Nodes: 3, - CPUs: 16, - - LoadWarehouses: 2000, - EstimatedMax: 1300, - }, - // objective 1, key result 1. - { - Nodes: 30, - CPUs: 16, - - LoadWarehouses: 10000, - EstimatedMax: 5300, - }, - // objective 1, key result 2. - { - Nodes: 18, - CPUs: 16, - LoadConfig: singlePartitionedLoadgen, - - LoadWarehouses: 10000, - EstimatedMax: 8000, - }, - // objective 2, key result 1. - { - Nodes: 7, - CPUs: 16, - Chaos: true, - - LoadWarehouses: 5000, - EstimatedMax: 2000, - }, - // objective 3, key result 1. - { - Nodes: 3, - CPUs: 16, - Distribution: multiZone, - - LoadWarehouses: 2000, - EstimatedMax: 1000, - }, - // objective 3, key result 2. - { - Nodes: 9, - CPUs: 16, - Distribution: multiRegion, - LoadConfig: multiLoadgen, - - LoadWarehouses: 12000, - EstimatedMax: 8000, - }, - // objective 4, key result 2. - { - Nodes: 64, - CPUs: 16, - - LoadWarehouses: 50000, - EstimatedMax: 40000, - }, - - // See https://github.com/cockroachdb/cockroach/issues/31409 for the next three specs. 
- { - Nodes: 6, - CPUs: 16, - - LoadWarehouses: 5000, - EstimatedMax: 3000, - LoadConfig: singlePartitionedLoadgen, - }, - { - Nodes: 12, - CPUs: 16, - - LoadWarehouses: 10000, - EstimatedMax: 6000, - LoadConfig: singlePartitionedLoadgen, - }, - { - Nodes: 24, - CPUs: 16, - - LoadWarehouses: 20000, - EstimatedMax: 12000, - LoadConfig: singlePartitionedLoadgen, - }, - - // Requested by @awoods87. - { - Nodes: 11, - CPUs: 32, - - LoadWarehouses: 10000, - EstimatedMax: 8000, - }, - } - - for _, b := range specs { - registerTPCCBenchSpec(r, b) - } -} - // makeWorkloadScrapeNodes creates a ScrapeNode for every workloadInstance. func makeWorkloadScrapeNodes( workloadNode install.Node, workloadInstances []workloadInstance, diff --git a/pkg/cmd/roachtest/tests/tpch_concurrency.go b/pkg/cmd/roachtest/tests/tpch_concurrency.go index 974cf9b9c55e..999a43c16608 100644 --- a/pkg/cmd/roachtest/tests/tpch_concurrency.go +++ b/pkg/cmd/roachtest/tests/tpch_concurrency.go @@ -198,10 +198,10 @@ func registerTPCHConcurrency(r registry.Registry) { } r.Add(registry.TestSpec{ - Name: "tpch_concurrency", - Owner: registry.OwnerSQLQueries, + Name: "tpch_concurrency", + Owner: registry.OwnerSQLQueries, Benchmark: true, - Cluster: r.MakeClusterSpec(numNodes), + Cluster: r.MakeClusterSpec(numNodes), Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { runTPCHConcurrency(ctx, t, c, true /* lowerRefreshSpansBytes */, false /* disableStreamer */) @@ -232,17 +232,11 @@ func registerTPCHConcurrency(r registry.Registry) { // TODO(yuzefovich): remove this once the streamer is stabilized. r.Add(registry.TestSpec{ -<<<<<<< HEAD - Name: "tpch_concurrency/no_streamer", - Owner: registry.OwnerSQLQueries, - Cluster: r.MakeClusterSpec(numNodes), -======= Name: "tpch_concurrency/no_streamer", Owner: registry.OwnerSQLQueries, Benchmark: true, - Timeout: timeout, Cluster: r.MakeClusterSpec(numNodes), ->>>>>>> 0df3a03e781 (roachtest: require perf. 
tests to opt in via TestSpec.Benchmark) + Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { runTPCHConcurrency(ctx, t, c, true /* lowerRefreshSpansBytes */, true /* disableStreamer */) }, diff --git a/pkg/cmd/roachtest/tests/versionupgrade.go b/pkg/cmd/roachtest/tests/versionupgrade.go index 1e6d52a42ef3..3e4f2998bc91 100644 --- a/pkg/cmd/roachtest/tests/versionupgrade.go +++ b/pkg/cmd/roachtest/tests/versionupgrade.go @@ -102,6 +102,7 @@ func runVersionUpgrade(ctx context.Context, t test.Test, c cluster.Cluster) { // of #58489 is being addressed. _ = schemaChangeStep backupStep := func(ctx context.Context, t test.Test, u *versionUpgradeTest) { + // Verify that backups can be created in various configurations. This is // important to test because changes in system tables might cause backups to // fail in mixed-version clusters. diff --git a/pkg/roachprod/install/BUILD.bazel b/pkg/roachprod/install/BUILD.bazel index 0c41730b9ac4..066c6e416245 100644 --- a/pkg/roachprod/install/BUILD.bazel +++ b/pkg/roachprod/install/BUILD.bazel @@ -29,6 +29,7 @@ go_library( "//pkg/roachprod/logger", "//pkg/roachprod/ssh", "//pkg/roachprod/ui", + "//pkg/roachprod/vm", "//pkg/roachprod/vm/aws", "//pkg/roachprod/vm/local", "//pkg/util", @@ -54,6 +55,7 @@ go_test( embed = [":install"], deps = [ "//pkg/roachprod/logger", + "//pkg/roachprod/vm", "//pkg/testutils", "//pkg/util/retry", "@com_github_cockroachdb_datadriven//:datadriven", diff --git a/pkg/roachprod/install/staging.go b/pkg/roachprod/install/staging.go index 09c31a6dd2fa..7a4382479fe5 100644 --- a/pkg/roachprod/install/staging.go +++ b/pkg/roachprod/install/staging.go @@ -17,6 +17,7 @@ import ( "path/filepath" "github.com/cockroachdb/cockroach/pkg/roachprod/logger" + "github.com/cockroachdb/cockroach/pkg/roachprod/vm" "github.com/cockroachdb/errors" ) @@ -98,30 +99,30 @@ var ( ) // ArchInfoForOS returns an ArchInfo for the given OS and Architecture if currently supported. 
-func ArchInfoForOS(os string, arch string) (archInfo, error) { - if arch != "" && arch != "amd64" && arch != "arm64" && arch != "fips" { +func ArchInfoForOS(os string, arch vm.CPUArch) (archInfo, error) { + if arch != "" && arch != vm.ArchAMD64 && arch != vm.ArchARM64 && arch != vm.ArchFIPS { return archInfo{}, errors.Errorf("unsupported architecture %q", arch) } switch os { case "linux": - if arch == "arm64" { + if arch == vm.ArchARM64 { return linux_arm64_ArchInfo, nil } - if arch == "fips" { + if arch == vm.ArchFIPS { return linux_x86_64_fips_ArchInfo, nil } return linux_x86_64_ArchInfo, nil case "darwin": - if arch == "arm64" { + if arch == vm.ArchARM64 { return darwin_arm64_ArchInfo, nil } - if arch == "fips" { + if arch == vm.ArchFIPS { return archInfo{}, errors.Errorf("%q is not supported on %q", arch, os) } return darwin_x86_64_ArchInfo, nil case "windows": - if arch == "fips" || arch == "arm64" { + if arch == vm.ArchFIPS || arch == vm.ArchARM64 { return archInfo{}, errors.Errorf("%q is not supported on %q", arch, os) } return windowsArchInfo, nil @@ -176,7 +177,7 @@ func StageApplication( applicationName string, version string, os string, - arch string, + arch vm.CPUArch, destDir string, ) error { archInfo, err := ArchInfoForOS(os, arch) @@ -226,7 +227,7 @@ func StageApplication( // URLsForApplication returns a slice of URLs that should be // downloaded for the given application. 
func URLsForApplication( - application string, version string, os string, arch string, + application string, version string, os string, arch vm.CPUArch, ) ([]*url.URL, error) { archInfo, err := ArchInfoForOS(os, arch) if err != nil { diff --git a/pkg/roachprod/install/staging_test.go b/pkg/roachprod/install/staging_test.go index 3455df5fff79..977d755bf401 100644 --- a/pkg/roachprod/install/staging_test.go +++ b/pkg/roachprod/install/staging_test.go @@ -13,6 +13,7 @@ package install import ( "testing" + "github.com/cockroachdb/cockroach/pkg/roachprod/vm" "github.com/stretchr/testify/require" ) @@ -322,7 +323,7 @@ func TestURLsForApplication(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - got, err := URLsForApplication(tt.args.application, tt.args.version, tt.args.os, tt.args.arch) + got, err := URLsForApplication(tt.args.application, tt.args.version, tt.args.os, vm.CPUArch(tt.args.arch)) if (err != nil) != tt.wantErr { t.Errorf("URLsForApplication() error = %v, wantErr %v", err, tt.wantErr) return diff --git a/pkg/roachprod/prometheus/BUILD.bazel b/pkg/roachprod/prometheus/BUILD.bazel index 6c763655eeb3..8da074a7c4e1 100644 --- a/pkg/roachprod/prometheus/BUILD.bazel +++ b/pkg/roachprod/prometheus/BUILD.bazel @@ -9,6 +9,7 @@ go_library( deps = [ "//pkg/roachprod/install", "//pkg/roachprod/logger", + "//pkg/roachprod/vm", "@com_github_cockroachdb_errors//:errors", "@com_github_prometheus_client_golang//api/prometheus/v1:prometheus", "@com_github_prometheus_common//model", diff --git a/pkg/roachprod/prometheus/prometheus.go b/pkg/roachprod/prometheus/prometheus.go index adc994e51f72..4798d7628eb7 100644 --- a/pkg/roachprod/prometheus/prometheus.go +++ b/pkg/roachprod/prometheus/prometheus.go @@ -20,6 +20,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/roachprod/install" "github.com/cockroachdb/cockroach/pkg/roachprod/logger" + "github.com/cockroachdb/cockroach/pkg/roachprod/vm" "github.com/cockroachdb/errors" promv1 
"github.com/prometheus/client_golang/api/prometheus/v1" "github.com/prometheus/common/model" @@ -189,20 +190,25 @@ type Prometheus struct { // Init creates a prometheus instance on the given cluster. func Init( - ctx context.Context, l *logger.Logger, c *install.SyncedCluster, cfg Config, + ctx context.Context, l *logger.Logger, c *install.SyncedCluster, arch vm.CPUArch, cfg Config, ) (_ *Prometheus, _ error) { + binArch := "amd64" + if arch == vm.ArchARM64 { + binArch = "arm64" + } + if len(cfg.NodeExporter) > 0 { // NB: when upgrading here, make sure to target a version that picks up this PR: // https://github.com/prometheus/node_exporter/pull/2311 // At time of writing, there hasn't been a release in over half a year. if err := c.RepeatRun(ctx, l, os.Stdout, os.Stderr, cfg.NodeExporter, "download node exporter", - ` + fmt.Sprintf(` (sudo systemctl stop node_exporter || true) && rm -rf node_exporter && mkdir -p node_exporter && curl -fsSL \ - https://github.com/prometheus/node_exporter/releases/download/v1.2.2/node_exporter-1.2.2.linux-amd64.tar.gz | + https://storage.googleapis.com/cockroach-fixtures/prometheus/node_exporter-1.2.2.linux-%s.tar.gz | tar zxv --strip-components 1 -C node_exporter -`); err != nil { +`, binArch)); err != nil { return nil, err } @@ -235,9 +241,9 @@ sudo systemd-run --unit node_exporter --same-dir ./node_exporter`, os.Stderr, cfg.PrometheusNode, "download prometheus", - `sudo rm -rf /tmp/prometheus && mkdir /tmp/prometheus && cd /tmp/prometheus && - curl -fsSL https://storage.googleapis.com/cockroach-fixtures/prometheus/prometheus-2.27.1.linux-amd64.tar.gz | tar zxv --strip-components=1`, - ); err != nil { + fmt.Sprintf(`sudo rm -rf /tmp/prometheus && mkdir /tmp/prometheus && cd /tmp/prometheus && + curl -fsSL https://storage.googleapis.com/cockroach-fixtures/prometheus/prometheus-2.27.1.linux-%s.tar.gz | tar zxv --strip-components=1`, + binArch)); err != nil { return nil, err } // create and upload prom config @@ -279,14 +285,16 @@ 
sudo systemd-run --unit prometheus --same-dir \ if cfg.Grafana.Enabled { // Install Grafana. if err := c.RepeatRun(ctx, l, - os.Stdout, - os.Stderr, cfg.PrometheusNode, "install grafana", - `sudo apt-get install -qqy apt-transport-https && + l.Stdout, + l.Stderr, cfg.PrometheusNode, "install grafana", + fmt.Sprintf(` +sudo apt-get install -qqy apt-transport-https && sudo apt-get install -qqy software-properties-common wget && -wget -q -O - https://packages.grafana.com/gpg.key | sudo apt-key add - && -echo "deb https://packages.grafana.com/enterprise/deb stable main" | sudo tee -a /etc/apt/sources.list.d/grafana.list && -sudo apt-get update -qqy && sudo apt-get install -qqy grafana-enterprise && sudo mkdir -p /var/lib/grafana/dashboards`, - ); err != nil { +sudo apt-get install -y adduser libfontconfig1 && +wget https://dl.grafana.com/enterprise/release/grafana-enterprise_9.2.3_%s.deb -O grafana-enterprise_9.2.3_%s.deb && +sudo dpkg -i grafana-enterprise_9.2.3_%s.deb && +sudo mkdir -p /var/lib/grafana/dashboards`, + binArch, binArch, binArch)); err != nil { return nil, err } diff --git a/pkg/roachprod/roachprod.go b/pkg/roachprod/roachprod.go index ff88cb0707c8..7ad6b7ef7d4b 100644 --- a/pkg/roachprod/roachprod.go +++ b/pkg/roachprod/roachprod.go @@ -517,7 +517,7 @@ func Stage( dir = stageDir } - return install.StageApplication(ctx, l, c, applicationName, version, os, arch, dir) + return install.StageApplication(ctx, l, c, applicationName, version, os, vm.CPUArch(arch), dir) } // Reset resets all VMs in a cluster. 
@@ -1357,7 +1357,7 @@ func StageURL( if stageArch != "" { arch = stageArch } - urls, err := install.URLsForApplication(applicationName, version, os, arch) + urls, err := install.URLsForApplication(applicationName, version, os, vm.CPUArch(arch)) if err != nil { return nil, err } @@ -1402,6 +1402,7 @@ func StartGrafana( ctx context.Context, l *logger.Logger, clusterName string, + arch vm.CPUArch, grafanaURL string, promCfg *prometheus.Config, // passed iff grafanaURL is empty ) error { @@ -1435,7 +1436,7 @@ func StartGrafana( promCfg.WithGrafanaDashboard(grafanaURL) } } - _, err = prometheus.Init(ctx, l, c, *promCfg) + _, err = prometheus.Init(ctx, l, c, arch, *promCfg) if err != nil { return err } diff --git a/pkg/roachprod/vm/aws/aws.go b/pkg/roachprod/vm/aws/aws.go index b1b20114bf58..cd60221c0eec 100644 --- a/pkg/roachprod/vm/aws/aws.go +++ b/pkg/roachprod/vm/aws/aws.go @@ -265,7 +265,9 @@ var defaultConfig = func() (cfg *awsConfig) { // cluster creation. If the geo flag is specified, nodes are distributed between // zones. var defaultCreateZones = []string{ - "us-east-2b", + // N.B. us-east-2a is the default zone for non-geo distributed clusters. It appears to have a higher on-demand + // capacity of c7g.8xlarge (graviton3) than us-east-2b. 
+ "us-east-2a", "us-west-2b", "eu-west-2b", } @@ -434,12 +436,13 @@ func (p *Provider) Create( var g errgroup.Group limiter := rate.NewLimiter(rate.Limit(providerOpts.CreateRateLimit), 2 /* buckets */) for i := range names { + index := i capName := names[i] placement := zones[i] res := limiter.Reserve() g.Go(func() error { time.Sleep(res.Delay()) - return p.runInstance(l, capName, placement, opts, providerOpts) + return p.runInstance(l, capName, index, placement, opts, providerOpts) }) } if err := g.Wait(); err != nil { @@ -797,7 +800,12 @@ func (p *Provider) listRegion(region string, opts ProviderOpts) (vm.List, error) // we need to do a bit of work to look up all of the various ids that // we need in order to actually allocate an instance. func (p *Provider) runInstance( - l *logger.Logger, name string, zone string, opts vm.CreateOpts, providerOpts *ProviderOpts, + l *logger.Logger, + name string, + instanceIdx int, + zone string, + opts vm.CreateOpts, + providerOpts *ProviderOpts, ) error { // There exist different flags to control the machine type when ssd is true. // This enables sane defaults for either setting but the behavior can be @@ -850,7 +858,10 @@ func (p *Provider) runInstance( var labelPairs []string addLabel := func(key, value string) { - labelPairs = append(labelPairs, fmt.Sprintf("{Key=%s,Value=%s}", key, value)) + // N.B. AWS does not allow empty values. 
+ if value != "" { + labelPairs = append(labelPairs, fmt.Sprintf("{Key=%s,Value=%s}", key, value)) + } } for key, value := range opts.CustomLabels { @@ -888,7 +899,8 @@ func (p *Provider) runInstance( extraMountOpts = "nobarrier" } } - filename, err := writeStartupScript(extraMountOpts, providerOpts.UseMultipleDisks, opts.EnableFIPS) + filename, err := writeStartupScript(extraMountOpts, providerOpts.UseMultipleDisks, opts.Arch == string(vm.ArchFIPS)) + if err != nil { return errors.Wrapf(err, "could not write AWS startup script to temp file") } @@ -904,14 +916,22 @@ func (p *Provider) runInstance( } imageID := withFlagOverride(az.region.AMI_X86_64, &providerOpts.ImageAMI) useArmAMI := strings.Index(machineType, "6g.") == 1 || strings.Index(machineType, "7g.") == 1 + if useArmAMI && (opts.Arch != "" && opts.Arch != string(vm.ArchARM64)) { + return errors.Errorf("machine type %s is arm64, but requested arch is %s", machineType, opts.Arch) + } //TODO(srosenberg): remove this once we have a better way to detect ARM64 machines if useArmAMI { imageID = withFlagOverride(az.region.AMI_ARM64, &providerOpts.ImageAMI) - l.Printf("Using ARM64 AMI: %s for machine type: %s", imageID, machineType) + // N.B. use arbitrary instanceIdx to suppress the same info for every other instance being created. 
+ if instanceIdx == 0 { + l.Printf("Using ARM64 AMI: %s for machine type: %s", imageID, machineType) + } } - if !useArmAMI && opts.EnableFIPS { + if opts.Arch == string(vm.ArchFIPS) { imageID = withFlagOverride(az.region.AMI_FIPS, &providerOpts.ImageAMI) - l.Printf("Using FIPS-enabled AMI: %s for machine type: %s", imageID, machineType) + if instanceIdx == 0 { + l.Printf("Using FIPS-enabled AMI: %s for machine type: %s", imageID, machineType) + } } args := []string{ "ec2", "run-instances", diff --git a/pkg/roachprod/vm/gce/gcloud.go b/pkg/roachprod/vm/gce/gcloud.go index 9e9e112c902d..660d8d37c548 100644 --- a/pkg/roachprod/vm/gce/gcloud.go +++ b/pkg/roachprod/vm/gce/gcloud.go @@ -36,6 +36,7 @@ const ( // ProviderName is gce. ProviderName = "gce" DefaultImage = "ubuntu-2004-focal-v20210603" + ARM64Image = "ubuntu-2004-focal-arm64-v20230523" FIPSImage = "ubuntu-pro-fips-2004-focal-v20230302" defaultImageProject = "ubuntu-os-cloud" FIPSImageProject = "ubuntu-os-pro-cloud" @@ -421,10 +422,34 @@ func (p *Provider) Create( // Fixed args. 
image := providerOpts.Image imageProject := defaultImageProject - if opts.EnableFIPS { + useArmAMI := strings.HasPrefix(strings.ToLower(providerOpts.MachineType), "t2a-") + if useArmAMI && (opts.Arch != "" && opts.Arch != string(vm.ArchARM64)) { + return errors.Errorf("machine type %s is arm64, but requested arch is %s", providerOpts.MachineType, opts.Arch) + } + if useArmAMI && opts.SSDOpts.UseLocalSSD { + return errors.New("local SSDs are not supported with T2A instances, use --local-ssd=false") + } + if useArmAMI { + if len(providerOpts.Zones) == 0 { + zones = []string{"us-central1-a"} + } else { + for _, zone := range providerOpts.Zones { + if !strings.HasPrefix(zone, "us-central1-") { + return errors.New("T2A instances are not supported outside of us-central1") + } + } + } + } + //TODO(srosenberg): remove this once we have a better way to detect ARM64 machines + if useArmAMI { + image = ARM64Image + l.Printf("Using ARM64 AMI: %s for machine type: %s", image, providerOpts.MachineType) + } + if opts.Arch == string(vm.ArchFIPS) { // NB: if FIPS is enabled, it overrides the image passed via CLI (--gce-image) image = FIPSImage imageProject = FIPSImageProject + l.Printf("Using FIPS-enabled AMI: %s for machine type: %s", image, providerOpts.MachineType) } args := []string{ "compute", "instances", "create", @@ -495,7 +520,7 @@ func (p *Provider) Create( } // Create GCE startup script file. 
- filename, err := writeStartupScript(extraMountOpts, opts.SSDOpts.FileSystem, providerOpts.UseMultipleDisks, opts.EnableFIPS) + filename, err := writeStartupScript(extraMountOpts, opts.SSDOpts.FileSystem, providerOpts.UseMultipleDisks, opts.Arch == string(vm.ArchFIPS)) if err != nil { return errors.Wrapf(err, "could not write GCE startup script to temp file") } diff --git a/pkg/roachprod/vm/vm.go b/pkg/roachprod/vm/vm.go index e0046c6f5287..b1d6dddbadfb 100644 --- a/pkg/roachprod/vm/vm.go +++ b/pkg/roachprod/vm/vm.go @@ -34,10 +34,30 @@ const ( TagLifetime = "lifetime" // TagRoachprod is roachprod tag const, value is true & false. TagRoachprod = "roachprod" + // TagUsage indicates where a certain resource is used. "roachtest" is used + // as the key for roachtest created resources. + TagUsage = "usage" + // TagArch is the CPU architecture tag const. + TagArch = "arch" + + ArchARM64 = CPUArch("arm64") + ArchAMD64 = CPUArch("amd64") + ArchFIPS = CPUArch("fips") ) +type CPUArch string + // GetDefaultLabelMap returns a label map for a common set of labels. func GetDefaultLabelMap(opts CreateOpts) map[string]string { + // Add architecture override tag, only if it was specified. + if opts.Arch != "" { + return map[string]string{ + TagCluster: opts.ClusterName, + TagLifetime: opts.Lifetime.String(), + TagRoachprod: "true", + TagArch: opts.Arch, + } + } return map[string]string{ TagCluster: opts.ClusterName, TagLifetime: opts.Lifetime.String(), @@ -176,7 +196,7 @@ type CreateOpts struct { CustomLabels map[string]string GeoDistributed bool - EnableFIPS bool + Arch string VMProviders []string SSDOpts struct { UseLocalSSD bool @@ -197,6 +217,8 @@ func DefaultCreateOpts() CreateOpts { GeoDistributed: false, VMProviders: []string{}, OsVolumeSize: 10, + // N.B. When roachprod is used via CLI, this will be overridden by {"roachprod":"true"}. 
+ CustomLabels: map[string]string{"roachtest": "true"}, } defaultCreateOpts.SSDOpts.UseLocalSSD = true defaultCreateOpts.SSDOpts.NoExt4Barrier = true diff --git a/pkg/util/randutil/rand.go b/pkg/util/randutil/rand.go index daa2be07a772..b83f8547974d 100644 --- a/pkg/util/randutil/rand.go +++ b/pkg/util/randutil/rand.go @@ -97,6 +97,12 @@ func NewPseudoRand() (*rand.Rand, int64) { return rand.New(rand.NewSource(seed)), seed } +// NewLockedPseudoRand is the same as NewPseudoRand, but the returned Rand uses a thread-safe underlying source. +func NewLockedPseudoRand() (*rand.Rand, int64) { + seed := envutil.EnvOrDefaultInt64("COCKROACH_RANDOM_SEED", NewPseudoSeed()) + return rand.New(NewLockedSource(seed)), seed +} + // NewTestRand returns an instance of math/rand.Rand seeded from rng, which is // seeded with the global seed. If the caller is a test with a different // path-qualified name than the previous caller, rng is reseeded from the global