diff --git a/pkg/build/BUILD.bazel b/pkg/build/BUILD.bazel index 5bb789b75e29..0fbb18953466 100644 --- a/pkg/build/BUILD.bazel +++ b/pkg/build/BUILD.bazel @@ -23,6 +23,7 @@ go_library( "github.com/cockroachdb/cockroach/pkg/build.utcTime": "{BUILD_UTCTIME}", }, deps = [ + "//pkg/util/buildutil", "//pkg/util/envutil", "//pkg/util/version", ], diff --git a/pkg/build/info.go b/pkg/build/info.go index d4ab09ef72e5..1f257c75fd30 100644 --- a/pkg/build/info.go +++ b/pkg/build/info.go @@ -17,6 +17,7 @@ import ( "text/tabwriter" "time" + "github.com/cockroachdb/cockroach/pkg/util/buildutil" "github.com/cockroachdb/cockroach/pkg/util/envutil" "github.com/cockroachdb/cockroach/pkg/util/version" ) @@ -35,11 +36,12 @@ var ( cgoTargetTriple string platform = fmt.Sprintf("%s %s", runtime.GOOS, runtime.GOARCH) // Distribution is changed by the CCL init-time hook in non-APL builds. - Distribution = "OSS" - typ string // Type of this build: , "development", or "release" - channel = "unknown" - envChannel = envutil.EnvOrDefaultString("COCKROACH_CHANNEL", "unknown") - binaryVersion = computeVersion(tag) + Distribution = "OSS" + typ string // Type of this build: , "development", or "release" + channel = "unknown" + envChannel = envutil.EnvOrDefaultString("COCKROACH_CHANNEL", "unknown") + enabledAssertions = buildutil.CrdbTestBuild + binaryVersion = computeVersion(tag) ) const ( @@ -113,7 +115,8 @@ func (b Info) Long() string { fmt.Fprintf(tw, "Go Version: %s\n", b.GoVersion) fmt.Fprintf(tw, "C Compiler: %s\n", b.CgoCompiler) fmt.Fprintf(tw, "Build Commit ID: %s\n", b.Revision) - fmt.Fprintf(tw, "Build Type: %s", b.Type) // No final newline: cobra prints one for us. + fmt.Fprintf(tw, "Build Type: %s\n", b.Type) + fmt.Fprintf(tw, "Enabled Assertions: %t", b.EnabledAssertions) // No final newline: cobra prints one for us. 
_ = tw.Flush() return buf.String() } @@ -139,17 +142,18 @@ func (b Info) Timestamp() (int64, error) { // GetInfo returns an Info struct populated with the build information. func GetInfo() Info { return Info{ - GoVersion: runtime.Version(), - Tag: tag, - Time: utcTime, - Revision: rev, - CgoCompiler: cgoCompiler, - CgoTargetTriple: cgoTargetTriple, - Platform: platform, - Distribution: Distribution, - Type: typ, - Channel: channel, - EnvChannel: envChannel, + GoVersion: runtime.Version(), + Tag: tag, + Time: utcTime, + Revision: rev, + CgoCompiler: cgoCompiler, + CgoTargetTriple: cgoTargetTriple, + Platform: platform, + Distribution: Distribution, + Type: typ, + Channel: channel, + EnvChannel: envChannel, + EnabledAssertions: enabledAssertions, } } diff --git a/pkg/build/info.proto b/pkg/build/info.proto index 199666501739..c4e0a4b1657e 100644 --- a/pkg/build/info.proto +++ b/pkg/build/info.proto @@ -38,6 +38,8 @@ message Info { optional string channel = 9 [(gogoproto.nullable) = false]; // env_channel identifies the product channel as overridden by the COCKROACH_CHANNEL environment variable. optional string env_channel = 11 [(gogoproto.nullable) = false]; + // enabled_assertions returns the value of 'CrdbTestBuild' (true iff compiled with 'crdb_test' tag) + optional bool enabled_assertions = 12 [(gogoproto.nullable) = false]; // dependencies exists to allow tests that run against old clusters // to unmarshal JSON containing this field. 
The tag is unimportant, diff --git a/pkg/cmd/roachprod/flags.go b/pkg/cmd/roachprod/flags.go index c5e8380b528f..66bf054204d6 100644 --- a/pkg/cmd/roachprod/flags.go +++ b/pkg/cmd/roachprod/flags.go @@ -40,6 +40,7 @@ var ( extendLifetime time.Duration wipePreserveCerts bool grafanaConfig string + grafanaArch string grafanaurlOpen bool grafanaDumpDir string listDetails bool @@ -107,8 +108,9 @@ func initFlags() { vm.AllProviderNames())) createCmd.Flags().BoolVar(&createVMOpts.GeoDistributed, "geo", false, "Create geo-distributed cluster") - createCmd.Flags().BoolVar(&createVMOpts.EnableFIPS, - "fips", false, "Enable FIPS mode (uses custom AMI)") + createCmd.Flags().StringVar(&createVMOpts.Arch, "arch", "", + "architecture override for VM [amd64, arm64, fips]; N.B. fips implies amd64 with openssl") + // N.B. We set "usage=roachprod" as the default, custom label for billing tracking. createCmd.Flags().StringToStringVar(&createVMOpts.CustomLabels, "label", map[string]string{"usage": "roachprod"}, @@ -249,6 +251,9 @@ func initFlags() { grafanaStartCmd.Flags().StringVar(&grafanaConfig, "grafana-config", "", "URL to grafana json config") + grafanaStartCmd.Flags().StringVar(&grafanaArch, "arch", "", + "binary architecture override [amd64, arm64]") + grafanaURLCmd.Flags().BoolVar(&grafanaurlOpen, "open", false, "open the grafana dashboard url on the browser") diff --git a/pkg/cmd/roachprod/main.go b/pkg/cmd/roachprod/main.go index e91a6d68c62c..abec0008f585 100644 --- a/pkg/cmd/roachprod/main.go +++ b/pkg/cmd/roachprod/main.go @@ -273,6 +273,7 @@ hosts file. 
c.PrintDetails(roachprodLibraryLogger) } else { fmt.Fprintf(tw, "%s\t%s\t%d", c.Name, c.Clouds(), len(c.VMs)) + if !c.IsLocal() { fmt.Fprintf(tw, "\t(%s)", c.LifetimeRemaining().Round(time.Second)) } else { @@ -904,10 +905,14 @@ var grafanaStartCmd = &cobra.Command{ Use: `grafana-start `, Short: `spins up a prometheus and grafana instances on the last node in the cluster`, Long: `spins up a prometheus and grafana instances on the highest numbered node in the cluster -and will scrape from all nodes in the cluster`, +and will scrape from all nodes in the cluster; NOTE: for arm64 clusters, use --arch arm64`, Args: cobra.ExactArgs(1), Run: wrap(func(cmd *cobra.Command, args []string) error { - return roachprod.StartGrafana(context.Background(), roachprodLibraryLogger, args[0], + arch := vm.ArchAMD64 + if grafanaArch == "arm64" { + arch = vm.ArchARM64 + } + return roachprod.StartGrafana(context.Background(), roachprodLibraryLogger, args[0], arch, grafanaConfig, nil) }), } @@ -954,14 +959,14 @@ func validateAndConfigure(cmd *cobra.Command, args []string) { // Validate architecture flag, if set. if archOpt := cmd.Flags().Lookup("arch"); archOpt != nil && archOpt.Changed { - arch := strings.ToLower(archOpt.Value.String()) + arch := vm.CPUArch(strings.ToLower(archOpt.Value.String())) - if arch != "amd64" && arch != "arm64" && arch != "fips" { + if arch != vm.ArchAMD64 && arch != vm.ArchARM64 && arch != vm.ArchFIPS { printErrAndExit(fmt.Errorf("unsupported architecture %q", arch)) } - if arch != archOpt.Value.String() { + if string(arch) != archOpt.Value.String() { // Set the canonical value. - _ = cmd.Flags().Set("arch", arch) + _ = cmd.Flags().Set("arch", string(arch)) } } } diff --git a/pkg/cmd/roachtest/cluster.go b/pkg/cmd/roachtest/cluster.go index a0141cf704f5..a112dd40a4ba 100644 --- a/pkg/cmd/roachtest/cluster.go +++ b/pkg/cmd/roachtest/cluster.go @@ -57,13 +57,20 @@ func init() { } var ( - // TODO(tbg): this is redundant with --cloud==local. 
Make the --local flag an - // alias for `--cloud=local` and remove this variable. - local bool - - cockroach string - cockroachShort string - libraryFilePaths []string + // user-specified path to crdb binary + cockroachPath string + // maps cpuArch to the corresponding crdb binary's absolute path + cockroach = make(map[vm.CPUArch]string) + // user-specified path to short crdb binary + cockroachShortPath string + // maps cpuArch to the corresponding short crdb (i.e., without UI) binary's absolute path + cockroachShort = make(map[vm.CPUArch]string) + // user-specified path to workload binary + workloadPath string + // maps cpuArch to the corresponding workload binary's absolute path + workload = make(map[vm.CPUArch]string) + // maps cpuArch to the corresponding dynamically-linked libraries' absolute paths + libraryFilePaths = make(map[vm.CPUArch][]string) cloud = spec.GCE // encryptionProbability controls when encryption-at-rest is enabled // in a cluster for tests that have opted-in to metamorphic @@ -73,10 +80,18 @@ var ( // encryption enabled by default (probability 1). In order to run // them with encryption disabled (perhaps to reproduce a test // failure), roachtest can be invoked with --metamorphic-encryption-probability=0 - encryptionProbability float64 + encryptionProbability float64 + // Total probability with which new ARM64 clusters are provisioned, modulo test specs. which are incompatible. + // N.B. if all selected tests are incompatible with ARM64, then arm64Probability is effectively 0. + // In other words, ClusterSpec.Arch takes precedence over the arm64Probability flag. + arm64Probability float64 + // Conditional probability with which new FIPS clusters are provisioned, modulo test specs. The total probability + // is the product of this and 1-arm64Probability. + // As in the case of arm64Probability, ClusterSpec.Arch takes precedence over the fipsProbability flag. 
+ fipsProbability float64 + instanceType string localSSDArg bool - workload string deprecatedRoachprodBinary string // overrideOpts contains vm.CreateOpts override values passed from the cli. overrideOpts vm.CreateOpts @@ -97,6 +112,9 @@ var ( const ( defaultEncryptionProbability = 1 + defaultFIPSProbability = 0 + defaultARM64Probability = 0 + defaultCockroachPath = "./cockroach-default" ) type errBinaryOrLibraryNotFound struct { @@ -107,29 +125,59 @@ func (e errBinaryOrLibraryNotFound) Error() string { return fmt.Sprintf("binary or library %q not found (or was not executable)", e.binary) } -func filepathAbs(path string) (string, error) { - path, err := filepath.Abs(path) +func validateBinaryFormat(path string, arch vm.CPUArch, checkEA bool) (string, error) { + abspath, err := filepath.Abs(path) if err != nil { return "", errors.WithStack(err) } - return path, nil -} - -func findBinary(binary, defValue string) (abspath string, err error) { - if binary == "" { - binary = defValue + // Check that the binary ELF format matches the expected architecture. + cmd := exec.Command("file", "-b", abspath) + var out bytes.Buffer + cmd.Stdout = &out + if err := cmd.Run(); err != nil { + return "", errors.Wrapf(err, "error executing 'file %s'", abspath) + } + fileFormat := strings.ToLower(out.String()) + // N.B. 'arm64' is returned on macOS, while 'aarch64' is returned on Linux; + // "x86_64" string is returned on macOS, while "x86-64" is returned on Linux. + if arch == vm.ArchARM64 && !strings.Contains(fileFormat, "arm64") && !strings.Contains(fileFormat, "aarch64") { + return "", errors.Newf("%s has incompatible architecture; want: %q, got: %q", abspath, arch, fileFormat) + } else if arch == vm.ArchAMD64 && !strings.Contains(fileFormat, "x86-64") && !strings.Contains(fileFormat, "x86_64") { + // Otherwise, we expect a binary that was built for amd64. 
+ return "", errors.Newf("%s has incompatible architecture; want: %q, got: %q", abspath, arch, fileFormat) + } + if arch == vm.ArchFIPS && strings.HasSuffix(abspath, "cockroach") { + // Check that the binary is patched to use OpenSSL FIPS. + // N.B. only the cockroach binary is patched, so we exclude this check for dynamically-linked libraries. + cmd = exec.Command("bash", "-c", fmt.Sprintf("nm %s | grep golang-fips |head -1", abspath)) + if err := cmd.Run(); err != nil { + return "", errors.Newf("%s is not compiled with FIPS", abspath) + } + } + if checkEA { + // Check that the binary was compiled with assertions _enabled_. + cmd = exec.Command("bash", "-c", fmt.Sprintf("%s version |grep \"Enabled Assertions\" |grep true", abspath)) + if err := cmd.Run(); err != nil { + return "", errors.Newf("%s is not compiled with assertions enabled", abspath) + } } + return abspath, nil +} + +func findBinary( + name string, osName string, arch vm.CPUArch, checkEA bool, +) (abspath string, err error) { // Check to see if binary exists and is a regular file and executable. 
- if fi, err := os.Stat(binary); err == nil && fi.Mode().IsRegular() && (fi.Mode()&0111) != 0 { - return filepathAbs(binary) + if fi, err := os.Stat(name); err == nil && fi.Mode().IsRegular() && (fi.Mode()&0111) != 0 { + return validateBinaryFormat(name, arch, checkEA) } - return findBinaryOrLibrary("bin", binary) + return findBinaryOrLibrary("bin", name, "", osName, arch, checkEA) } -func findLibrary(libraryName string) (string, error) { +func findLibrary(libraryName string, os string, arch vm.CPUArch) (string, error) { suffix := ".so" - if local { + if cloud == spec.Local { switch runtime.GOOS { case "linux": case "freebsd": @@ -143,65 +191,102 @@ func findLibrary(libraryName string) (string, error) { return "", errors.Newf("failed to find suffix for runtime %s", runtime.GOOS) } } - return findBinaryOrLibrary("lib", libraryName+suffix) + + return findBinaryOrLibrary("lib", libraryName, suffix, os, arch, false) } -func findBinaryOrLibrary(binOrLib string, name string) (string, error) { +// findBinaryOrLibrary searches for a binary or library, _first_ in the $PATH, _then_ in the following hardcoded paths, +// +// $GOPATH/src/github.com/cockroachdb/cockroach/ +// $GOPATH/src/github.com/cockroachdb/artifacts/ +// $PWD/binOrLib +// $GOPATH/src/github.com/cockroachdb/cockroach/binOrLib +// +// in the above order, unless 'name' is an absolute path, in which case the hardcoded paths are skipped. +// +// binOrLib is either 'bin' or 'lib'; nameSuffix is either empty, '.so', '.dll', or '.dylib'. +// Both osName and arch are used to derive a fully qualified binary or library name by inserting the +// corresponding arch suffix (see install.ArchInfoForOS), e.g. '.linux-arm64' or '.darwin-amd64'. +// That is, each hardcoded path is searched for a file named 'name' or 'name.nameSuffix.archSuffix', respectively. +// +// If no binary or library is found, an error is returned. 
+// Otherwise, if multiple binaries or libraries are located at the above paths, the first one found is returned. +// If the found binary or library happens to be of the wrong type, e.g., architecture is different from 'arch', or +// checkEA is true, and the binary was not compiled with runtime assertions enabled, an error is returned. +// While we could continue the search instead of returning an error, it is assumed the user can stage the binaries +// to avoid such ambiguity. Alternatively, the user can specify the absolute path to the binary or library, +// e.g., via --cockroach; in this case, only the absolute path is checked and validated. +func findBinaryOrLibrary( + binOrLib string, name string, nameSuffix string, osName string, arch vm.CPUArch, checkEA bool, +) (string, error) { // Find the binary to run and translate it to an absolute path. First, look // for the binary in PATH. - path, err := exec.LookPath(name) + pathFromEnv, err := exec.LookPath(name) + if err == nil { + // Found it in PATH, validate and return absolute path. + return validateBinaryFormat(pathFromEnv, arch, checkEA) + } + if strings.HasPrefix(name, "/") { + // Specified name is an absolute path, but we couldn't find it; bail out. + return "", errors.WithStack(err) + } + // We're unable to find the name in PATH and "name" is a relative path: + // look in the cockroach repo. 
+ gopath := os.Getenv("GOPATH") + if gopath == "" { + gopath = filepath.Join(os.Getenv("HOME"), "go") + } + + dirs := []string{ + filepath.Join(gopath, "/src/github.com/cockroachdb/cockroach/"), + filepath.Join(gopath, "/src/github.com/cockroachdb/cockroach/artifacts/"), + filepath.Join(os.ExpandEnv("$PWD"), binOrLib), + filepath.Join(gopath, "/src/github.com/cockroachdb/cockroach", binOrLib), + } + + archInfo, err := install.ArchInfoForOS(osName, arch) if err != nil { - if strings.HasPrefix(name, "/") { - return "", errors.WithStack(err) - } - - // We're unable to find the name in PATH and "name" is a relative path: - // look in the cockroach repo. - gopath := os.Getenv("GOPATH") - if gopath == "" { - gopath = filepath.Join(os.Getenv("HOME"), "go") - } - - var suffix string - if !local { - suffix = ".docker_amd64" - } - dirs := []string{ - filepath.Join(gopath, "/src/github.com/cockroachdb/cockroach/"), - filepath.Join(gopath, "/src/github.com/cockroachdb/cockroach/artifacts/"), - filepath.Join(gopath, "/src/github.com/cockroachdb/cockroach", binOrLib+suffix), - filepath.Join(os.ExpandEnv("$PWD"), binOrLib+suffix), - filepath.Join(gopath, "/src/github.com/cockroachdb/cockroach", binOrLib), - } - for _, dir := range dirs { - path = filepath.Join(dir, name) - var err2 error - path, err2 = exec.LookPath(path) - if err2 == nil { - return filepathAbs(path) + return "", err + } + archSuffixes := []string{"." + archInfo.DebugArchitecture, "." 
+ archInfo.ReleaseArchitecture} + + for _, dir := range dirs { + var path string + + if path, err = exec.LookPath(filepath.Join(dir, name)); err == nil { + return validateBinaryFormat(path, arch, checkEA) + } + for _, archSuffix := range archSuffixes { + if path, err = exec.LookPath(filepath.Join(dir, name+archSuffix+nameSuffix)); err == nil { + return validateBinaryFormat(path, arch, checkEA) } } - return "", errBinaryOrLibraryNotFound{name} } - return filepathAbs(path) + return "", errBinaryOrLibraryNotFound{name} } // VerifyLibraries verifies that the required libraries, specified by name, are // available for the target environment. -func VerifyLibraries(requiredLibs []string) error { +func VerifyLibraries(requiredLibs []string, arch vm.CPUArch) error { + foundLibraryPaths := libraryFilePaths[arch] + for _, requiredLib := range requiredLibs { - if !contains(libraryFilePaths, libraryNameFromPath, requiredLib) { - return errors.Wrap(errors.Errorf("missing required library %s", requiredLib), "cluster.VerifyLibraries") + if !contains(foundLibraryPaths, libraryNameFromPath, requiredLib) { + return errors.Wrap(errors.Errorf("missing required library %s (arch=%q)", requiredLib, arch), "cluster.VerifyLibraries") } } return nil } -// libraryNameFromPath returns the name of a library without the extension, for a +// libraryNameFromPath returns the name of a library without the extension(s), for a // given path. func libraryNameFromPath(path string) string { filename := filepath.Base(path) - return strings.TrimSuffix(filename, filepath.Ext(filename)) + // N.B. filename may contain multiple extensions, e.g. "libgeos.linux-amd64.fips.so". 
+ for ext := filepath.Ext(filename); ext != ""; ext = filepath.Ext(filename) { + filename = strings.TrimSuffix(filename, ext) + } + return filename } func contains(list []string, transformString func(s string) string, str string) bool { @@ -217,50 +302,128 @@ func contains(list []string, transformString func(s string) string, str string) } func initBinariesAndLibraries() { - // If we're running against an existing "local" cluster, force the local flag - // to true in order to get the "local" test configurations. - if clusterName == "local" { - local = true - } - if local { - cloud = spec.Local - } + // TODO(srosenberg): enable metamorphic local clusters; currently, spec.Local means run all tests locally. + // This could be revisited after we have a way to specify which clouds a given test supports, + // see https://github.com/cockroachdb/cockroach/issues/104029. + defaultOSName := "linux" + defaultArch := vm.ArchAMD64 + if cloud == spec.Local { + defaultOSName = runtime.GOOS + if arm64Probability == 1 { + // N.B. if arm64Probability != 1, then we're running a local cluster with both arm64 and amd64. + defaultArch = vm.ArchARM64 + } + if string(defaultArch) != runtime.GOARCH { + fmt.Printf("WARN: local cluster's architecture (%q) differs from default (%q)\n", runtime.GOARCH, defaultArch) + } + } + fmt.Printf("Locating and verifying binaries for os=%q, arch=%q\n", defaultOSName, defaultArch) + + // Finds and validates a binary. If the binary 'isRequired', but not found, exit and print the error. 
+ resolveBinary := func(binName string, userSpecified string, arch vm.CPUArch, isRequired bool, checkEA bool) (string, error) { + path := binName + if userSpecified != "" { + path = userSpecified + } + abspath, err := findBinary(path, defaultOSName, arch, checkEA) + if err != nil { + if isRequired { + fmt.Fprintf(os.Stderr, "ERROR: unable to find required binary %q for %q: %v\n", binName, arch, err) + os.Exit(1) + } + return "", err + } + if userSpecified == "" { + // No user-specified path, so return the found absolute path. + return abspath, nil + } + // Bail out if a path other than the user-specified was found. + userPath, err := filepath.Abs(userSpecified) + + if err != nil || userPath != abspath { + err = errors.Wrapf(err, "ERROR: found %q at: %s instead of the user-specified path: %q\n", binName, abspath, userSpecified) - cockroachDefault := "cockroach" - if !local { - cockroachDefault = "cockroach-linux-2.6.32-gnu-amd64" + if isRequired { + fmt.Fprintf(os.Stderr, "%v", err) + os.Exit(1) + } + return "", err + } + return abspath, nil } + // We need to verify we have at least both the cockroach and the workload binaries. var err error - cockroach, err = findBinary(cockroach, cockroachDefault) + + cockroach[defaultArch], _ = resolveBinary("cockroach", cockroachPath, defaultArch, true, false) + workload[defaultArch], _ = resolveBinary("workload", workloadPath, defaultArch, true, false) + cockroachShort[defaultArch], err = resolveBinary("cockroach-short", cockroachShortPath, defaultArch, false, true) if err != nil { - fmt.Fprintf(os.Stderr, "%+v\n", err) - os.Exit(1) + fmt.Fprintf(os.Stderr, "WARN: unable to find %q for %q: %s\n", "cockroach-short", defaultArch, err) } - if cockroachShort != "" { - // defValue doesn't matter since cockroachShort is a non-empty string. 
- cockroachShort, err = findBinary(cockroachShort, "" /* defValue */) + if arm64Probability > 0 && defaultArch != vm.ArchARM64 { + fmt.Printf("Locating and verifying binaries for os=%q, arch=%q\n", defaultOSName, vm.ArchARM64) + // We need to verify we have all the required binaries for arm64. + cockroach[vm.ArchARM64], _ = resolveBinary("cockroach", cockroachPath, vm.ArchARM64, true, false) + workload[vm.ArchARM64], _ = resolveBinary("workload", workloadPath, vm.ArchARM64, true, false) + cockroachShort[vm.ArchARM64], err = resolveBinary("cockroach-short", cockroachShortPath, vm.ArchARM64, false, true) if err != nil { - fmt.Fprintf(os.Stderr, "%+v\n", err) - os.Exit(1) + fmt.Fprintf(os.Stderr, "WARN: unable to find %q for %q: %s\n", "cockroach-short", vm.ArchARM64, err) } } - - workload, err = findBinary(workload, "workload") - if errors.As(err, &errBinaryOrLibraryNotFound{}) { - fmt.Fprintln(os.Stderr, "workload binary not provided, proceeding anyway") - } else if err != nil { - fmt.Fprintf(os.Stderr, "%+v\n", err) - os.Exit(1) + if fipsProbability > 0 && defaultArch != vm.ArchFIPS { + fmt.Printf("Locating and verifying binaries for os=%q, arch=%q\n", defaultOSName, vm.ArchFIPS) + // We need to verify we have all the required binaries for fips. + cockroach[vm.ArchFIPS], _ = resolveBinary("cockroach", cockroachPath, vm.ArchFIPS, true, false) + workload[vm.ArchFIPS], _ = resolveBinary("workload", workloadPath, vm.ArchFIPS, true, false) + cockroachShort[vm.ArchFIPS], err = resolveBinary("cockroach-short", cockroachShortPath, vm.ArchFIPS, false, true) + if err != nil { + fmt.Fprintf(os.Stderr, "WARN: unable to find %q for %q: %s\n", "cockroach-short", vm.ArchFIPS, err) + } } // In v20.2 or higher, optionally expect certain library files to exist. // Since they may not be found in older versions, do not hard error if they are not found. 
- for _, libraryName := range []string{"libgeos", "libgeos_c"} { - if libraryFilePath, err := findLibrary(libraryName); err != nil { - fmt.Fprintf(os.Stderr, "error finding library %s, ignoring: %+v\n", libraryName, err) - } else { - libraryFilePaths = append(libraryFilePaths, libraryFilePath) + for _, arch := range []vm.CPUArch{vm.ArchAMD64, vm.ArchARM64, vm.ArchFIPS} { + if arm64Probability == 0 && defaultArch != vm.ArchARM64 && arch == vm.ArchARM64 { + // arm64 isn't used, skip finding libs for it. + continue + } + if fipsProbability == 0 && arch == vm.ArchFIPS { + // fips isn't used, skip finding libs for it. + continue + } + paths := []string(nil) + + for _, libraryName := range []string{"libgeos", "libgeos_c"} { + if libraryFilePath, err := findLibrary(libraryName, defaultOSName, arch); err != nil { + fmt.Fprintf(os.Stderr, "WARN: unable to find library %s, ignoring: %s\n", libraryName, err) + } else { + paths = append(paths, libraryFilePath) + } + } + libraryFilePaths[arch] = paths + } + // Looks like we have all the binaries we'll need. Let's print them out. + fmt.Printf("\nFound the following binaries:\n") + for arch, path := range cockroach { + if path != "" { + fmt.Printf("\tcockroach %q at: %s\n", arch, path) + } + } + for arch, path := range workload { + if path != "" { + fmt.Printf("\tworkload %q at: %s\n", arch, path) + } + } + for arch, path := range cockroachShort { + if path != "" { + fmt.Printf("\tcockroach-short %q at: %s\n", arch, path) + } + } + for arch, paths := range libraryFilePaths { + if len(paths) > 0 { + fmt.Printf("\tlibraries %q at: %s\n", arch, strings.Join(paths, ", ")) } } } @@ -654,6 +817,8 @@ type clusterImpl struct { expiration time.Time encAtRest bool // use encryption at rest + os string // OS of the cluster + arch vm.CPUArch // CPU architecture of the cluster // destroyState contains state related to the cluster's destruction. 
destroyState destroyState } @@ -737,6 +902,10 @@ type clusterConfig struct { localCluster bool useIOBarrier bool alloc *quotapool.IntAlloc + // Specifies CPU architecture which may require a custom AMI and cockroach binary. + arch vm.CPUArch + // Specifies the OS which may require a custom AMI and cockroach binary. + os string } // clusterFactory is a creator of clusters. @@ -873,7 +1042,8 @@ func (f *clusterFactory) newCluster( providerOptsContainer := vm.CreateProviderOptionsContainer() // The ClusterName is set below in the retry loop to ensure // that each create attempt gets a unique cluster name. - createVMOpts, providerOpts, err := cfg.spec.RoachprodOpts("", cfg.useIOBarrier) + createVMOpts, providerOpts, err := cfg.spec.RoachprodOpts("", cfg.useIOBarrier, cfg.arch) + if err != nil { // We must release the allocation because cluster creation is not possible at this point. cfg.alloc.Release() @@ -909,6 +1079,8 @@ func (f *clusterFactory) newCluster( spec: cfg.spec, expiration: cfg.spec.Expiration(), r: f.r, + arch: cfg.arch, + os: cfg.os, destroyState: destroyState{ owned: true, alloc: cfg.alloc, @@ -1698,11 +1870,13 @@ func (c *clusterImpl) PutLibraries( if err := c.RunE(ctx, c.All(), "mkdir", "-p", libraryDir); err != nil { return err } - for _, libraryFilePath := range libraryFilePaths { - if !contains(libraries, nil, libraryNameFromPath(libraryFilePath)) { + + for _, libraryFilePath := range libraryFilePaths[c.arch] { + libName := libraryNameFromPath(libraryFilePath) + if !contains(libraries, nil, libName) { continue } - putPath := filepath.Join(libraryDir, filepath.Base(libraryFilePath)) + putPath := filepath.Join(libraryDir, libName) if err := c.PutE( ctx, c.l, @@ -1728,7 +1902,7 @@ func (c *clusterImpl) Stage( c.status("staging binary") defer c.status("") return errors.Wrap(roachprod.Stage(ctx, l, c.MakeNodes(opts...), - "" /* stageOS */, "" /* stageArch */, dir, application, versionOrSHA), "cluster.Stage") + c.os, string(c.arch), dir, application, 
versionOrSHA), "cluster.Stage") } // Get gets files from remote hosts. @@ -2421,6 +2595,10 @@ func (c *clusterImpl) IsSecure() bool { return c.localCertsDir != "" } +func (c *clusterImpl) Architecture() vm.CPUArch { + return c.arch +} + // Extend extends the cluster's expiration by d. func (c *clusterImpl) Extend(ctx context.Context, d time.Duration, l *logger.Logger) error { if ctx.Err() != nil { @@ -2443,7 +2621,9 @@ func (c *clusterImpl) NewMonitor(ctx context.Context, opts ...option.Option) clu func (c *clusterImpl) StartGrafana( ctx context.Context, l *logger.Logger, promCfg *prometheus.Config, ) error { - return roachprod.StartGrafana(ctx, l, c.name, "", promCfg) + + return roachprod.StartGrafana(ctx, l, c.name, c.arch, "", promCfg) + } func (c *clusterImpl) StopGrafana(ctx context.Context, l *logger.Logger, dumpDir string) error { diff --git a/pkg/cmd/roachtest/cluster/BUILD.bazel b/pkg/cmd/roachtest/cluster/BUILD.bazel index afd151fae747..a5c0b9f53899 100644 --- a/pkg/cmd/roachtest/cluster/BUILD.bazel +++ b/pkg/cmd/roachtest/cluster/BUILD.bazel @@ -16,6 +16,7 @@ go_library( "//pkg/roachprod/install", "//pkg/roachprod/logger", "//pkg/roachprod/prometheus", + "//pkg/roachprod/vm", "@com_github_cockroachdb_errors//:errors", ], ) diff --git a/pkg/cmd/roachtest/cluster/cluster_interface.go b/pkg/cmd/roachtest/cluster/cluster_interface.go index bbd538676ee9..b855cb39b18c 100644 --- a/pkg/cmd/roachtest/cluster/cluster_interface.go +++ b/pkg/cmd/roachtest/cluster/cluster_interface.go @@ -20,6 +20,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/roachprod/install" "github.com/cockroachdb/cockroach/pkg/roachprod/logger" "github.com/cockroachdb/cockroach/pkg/roachprod/prometheus" + "github.com/cockroachdb/cockroach/pkg/roachprod/vm" ) // Cluster is the interface through which a given roachtest interacts with the @@ -107,7 +108,10 @@ type Cluster interface { Spec() spec.ClusterSpec Name() string IsLocal() bool + // IsSecure returns true iff the cluster uses TLS. 
IsSecure() bool + // Returns CPU architecture of the nodes. + Architecture() vm.CPUArch // Deleting CockroachDB data and logs on nodes. diff --git a/pkg/cmd/roachtest/cluster_test.go b/pkg/cmd/roachtest/cluster_test.go index fb26d417e285..4aaddbab3e1a 100644 --- a/pkg/cmd/roachtest/cluster_test.go +++ b/pkg/cmd/roachtest/cluster_test.go @@ -18,6 +18,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec" test2 "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/roachprod/logger" + "github.com/cockroachdb/cockroach/pkg/roachprod/vm" "github.com/cockroachdb/cockroach/pkg/util/version" "github.com/cockroachdb/errors" "github.com/stretchr/testify/assert" @@ -199,14 +200,14 @@ func TestVerifyLibraries(t *testing.T) { name: "no match", verifyLibs: []string{"required_c"}, libraryFilePaths: []string{"/some/path/lib.so"}, - expectedError: errors.Wrap(errors.Errorf("missing required library %s", + expectedError: errors.Wrap(errors.Errorf("missing required library %s (arch=\"amd64\")", "required_c"), "cluster.VerifyLibraries"), }, { name: "no match on nil libs", verifyLibs: []string{"required_b"}, libraryFilePaths: nil, - expectedError: errors.Wrap(errors.Errorf("missing required library %s", + expectedError: errors.Wrap(errors.Errorf("missing required library %s (arch=\"amd64\")", "required_b"), "cluster.VerifyLibraries"), }, { @@ -215,17 +216,29 @@ func TestVerifyLibraries(t *testing.T) { libraryFilePaths: []string{"/lib/geos.so"}, expectedError: nil, }, + { + name: "single match, multiple extensions", + verifyLibs: []string{"geos"}, + libraryFilePaths: []string{"/lib/geos.linux-amd.so"}, + expectedError: nil, + }, { name: "multiple matches", verifyLibs: []string{"lib", "ltwo", "geos"}, libraryFilePaths: []string{"ltwo.so", "a/geos.so", "/some/path/to/lib.so"}, expectedError: nil, }, + { + name: "multiple matches, multiple extensions", + verifyLibs: []string{"lib", "ltwo", "geos"}, + libraryFilePaths: 
[]string{"ltwo.linux-arm64.so", "a/geos.linux-amd64.fips.so", "/some/path/to/lib.darwin-arm64.so"}, + expectedError: nil, + }, } for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { - libraryFilePaths = tc.libraryFilePaths - actualError := VerifyLibraries(tc.verifyLibs) + libraryFilePaths = map[vm.CPUArch][]string{vm.ArchAMD64: tc.libraryFilePaths} + actualError := VerifyLibraries(tc.verifyLibs, vm.ArchAMD64) if tc.expectedError == nil { require.NoError(t, actualError) } else { diff --git a/pkg/cmd/roachtest/github.go b/pkg/cmd/roachtest/github.go index 05768e49d606..8d1414f1e664 100644 --- a/pkg/cmd/roachtest/github.go +++ b/pkg/cmd/roachtest/github.go @@ -125,7 +125,10 @@ func (g *githubIssues) createPostRequest( roachtestPrefix("cpu"): fmt.Sprintf("%d", spec.Cluster.CPUs), roachtestPrefix("ssd"): fmt.Sprintf("%d", spec.Cluster.SSDs), } - + // Emit CPU architecture only if it was specified; otherwise, it's captured below, assuming cluster was created. + if spec.Cluster.Arch != "" { + clusterParams[roachtestPrefix("arch")] = string(spec.Cluster.Arch) + } // These params can be probabilistically set, so we pass them here to // show what their actual values are in the posted issue. if g.vmCreateOpts != nil { @@ -135,6 +138,11 @@ func (g *githubIssues) createPostRequest( if g.cluster != nil { clusterParams[roachtestPrefix("encrypted")] = fmt.Sprintf("%v", g.cluster.encAtRest) + if spec.Cluster.Arch == "" { + // N.B. when Arch is specified, it cannot differ from cluster's arch. + // Hence, we only emit when arch was unspecified. 
+ clusterParams[roachtestPrefix("arch")] = string(g.cluster.arch) + } } return issues.PostRequest{ diff --git a/pkg/cmd/roachtest/github_test.go b/pkg/cmd/roachtest/github_test.go index 8b49dfe86c7a..b2c2f139ad9f 100644 --- a/pkg/cmd/roachtest/github_test.go +++ b/pkg/cmd/roachtest/github_test.go @@ -74,7 +74,8 @@ func TestShouldPost(t *testing.T) { {false, 1, "token", "master", true}, } - reg := makeTestRegistry(spec.GCE, "", "", false, false) + reg, err := makeTestRegistry(spec.GCE, "", "", false, false) + require.NoError(t, err) for _, c := range testCases { t.Setenv("GITHUB_API_TOKEN", c.envGithubAPIToken) @@ -108,34 +109,40 @@ func TestCreatePostRequest(t *testing.T) { clusterCreationFailed bool loadTeamsFailed bool localSSD bool + arch vm.CPUArch category issueCategory expectedPost bool expectedParams map[string]string }{ - {true, false, false, false, otherErr, true, + {true, false, false, false, "", otherErr, true, + prefixAll(map[string]string{ "cloud": "gce", "encrypted": "false", "fs": "ext4", "ssd": "0", "cpu": "4", + "arch": "amd64", "localSSD": "false", }), }, - {true, false, false, true, clusterCreationErr, true, + {true, false, false, true, vm.ArchARM64, clusterCreationErr, true, + prefixAll(map[string]string{ "cloud": "gce", "encrypted": "false", "fs": "ext4", "ssd": "0", "cpu": "4", + "arch": "arm64", "localSSD": "true", }), }, // Assert that release-blocker label exists when !nonReleaseBlocker // Also ensure that in the event of a failed cluster creation, // nil `vmOptions` and `clusterImpl` are not dereferenced - {false, true, false, false, sshErr, true, + {false, true, false, false, "", sshErr, true, + prefixAll(map[string]string{ "cloud": "gce", "ssd": "0", @@ -143,17 +150,14 @@ func TestCreatePostRequest(t *testing.T) { }), }, //Simulate failure loading TEAMS.yaml - {true, false, true, false, otherErr, false, nil}, + {true, false, true, false, "", otherErr, false, nil}, } -<<<<<<< HEAD - reg, _ := makeTestRegistry(spec.GCE, "", "", false) + 
reg, err := makeTestRegistry(spec.GCE, "", "", false, false) + require.NoError(t, err) -======= - reg := makeTestRegistry(spec.GCE, "", "", false, false) ->>>>>>> 0df3a03e781 (roachtest: require perf. tests to opt in via TestSpec.Benchmark) for _, c := range testCases { - clusterSpec := reg.MakeClusterSpec(1) + clusterSpec := reg.MakeClusterSpec(1, spec.Arch(c.arch)) testSpec := &registry.TestSpec{ Name: "github_test", @@ -167,7 +171,7 @@ func TestCreatePostRequest(t *testing.T) { l: nilLogger(), } - testClusterImpl := &clusterImpl{spec: clusterSpec} + testClusterImpl := &clusterImpl{spec: clusterSpec, arch: vm.ArchAMD64} vo := vm.DefaultCreateOpts() vmOpts := &vo diff --git a/pkg/cmd/roachtest/main.go b/pkg/cmd/roachtest/main.go index 9e03a453e81e..a71688ce6652 100644 --- a/pkg/cmd/roachtest/main.go +++ b/pkg/cmd/roachtest/main.go @@ -18,10 +18,12 @@ import ( "os/signal" "os/user" "path/filepath" + "runtime" "time" "github.com/cockroachdb/cockroach/pkg/build" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/tests" "github.com/cockroachdb/cockroach/pkg/roachprod" "github.com/cockroachdb/cockroach/pkg/roachprod/config" @@ -102,16 +104,50 @@ func main() { if cmd.Name() == "help" { return nil } - - if clusterName != "" && local { - return fmt.Errorf( - "cannot specify both an existing cluster (%s) and --local. 
However, if a local cluster "+ + "already exists, --clusters=local will use it", + clusterName) + } + cloud = spec.Local } switch cmd.Name() { case "run", "bench", "store-gen": + if !(0 <= arm64Probability && arm64Probability <= 1) { + return fmt.Errorf("'metamorphic-arm64-probability' must be in [0,1]") + } + if !(0 <= fipsProbability && fipsProbability <= 1) { + return fmt.Errorf("'metamorphic-fips-probability' must be in [0,1]") + } + if arm64Probability == 1 && fipsProbability != 0 { + return fmt.Errorf("'metamorphic-fips-probability' must be 0 when 'metamorphic-arm64-probability' is 1") + } + if fipsProbability == 1 && arm64Probability != 0 { + return fmt.Errorf("'metamorphic-arm64-probability' must be 0 when 'metamorphic-fips-probability' is 1") + } + arm64Opt := cmd.Flags().Lookup("metamorphic-arm64-probability") + if !arm64Opt.Changed && runtime.GOARCH == "arm64" && cloud == spec.Local { + fmt.Printf("Detected 'arm64' in 'local mode', setting 'metamorphic-arm64-probability' to 1; use --metamorphic-arm64-probability to run (emulated) with other binaries\n") + arm64Probability = 1 + } + // Find and validate all required binaries and libraries. initBinariesAndLibraries() + + if arm64Probability > 0 { + fmt.Printf("ARM64 clusters will be provisioned with probability %.2f\n", arm64Probability) + } + amd64Probability := 1 - arm64Probability + if amd64Probability > 0 { + fmt.Printf("AMD64 clusters will be provisioned with probability %.2f\n", amd64Probability) + } + if fipsProbability > 0 { + // N.B. arm64Probability < 1, otherwise fipsProbability == 0, as per above check. + // Hence, amd64Probability > 0 is implied. + fmt.Printf("FIPS clusters will be provisioned with probability %.2f\n", fipsProbability*amd64Probability) + } } return nil }, @@ -123,6 +159,7 @@ func main() { "If fewer than --parallelism names are specified, then the parallelism "+ "is capped to the number of clusters specified. 
When a cluster does not exist "+ "yet, it is created according to the spec.") + var local bool rootCmd.PersistentFlags().BoolVarP( &local, "local", "l", local, "run tests locally") rootCmd.PersistentFlags().StringVarP( @@ -130,15 +167,25 @@ func main() { "Username to use as a cluster name prefix. "+ "If blank, the current OS user is detected and specified.") rootCmd.PersistentFlags().StringVar( - &cockroach, "cockroach", "", "path to cockroach binary to use") + &cockroachPath, "cockroach", "", "path to cockroach binary to use") rootCmd.PersistentFlags().StringVar( - &cockroachShort, "cockroach-short", "", "path to cockroach-short binary (compiled with crdb_test build tag) to use") + &cockroachShortPath, "cockroach-short", "", "path to cockroach-short binary (compiled with crdb_test build tag) to use") rootCmd.PersistentFlags().StringVar( - &workload, "workload", "", "path to workload binary to use") + &workloadPath, "workload", "", "path to workload binary to use") rootCmd.PersistentFlags().Float64Var( &encryptionProbability, "metamorphic-encryption-probability", defaultEncryptionProbability, "probability that clusters will be created with encryption-at-rest enabled "+ "for tests that support metamorphic encryption (default 1.0)") + rootCmd.PersistentFlags().Float64Var( + &fipsProbability, "metamorphic-fips-probability", defaultFIPSProbability, + "conditional probability that amd64 clusters will be created with FIPS, i.e., P(fips | amd64), "+ + "for tests that support FIPS and whose CPU architecture is 'amd64' (default 0) "+ + "NOTE: amd64 clusters are created with probability 1-P(arm64), where P(arm64) is 'metamorphic-arm64-probability'. 
"+ + "Hence, P(fips | amd64) = P(fips) * (1 - P(arm64))") + rootCmd.PersistentFlags().Float64Var( + &arm64Probability, "metamorphic-arm64-probability", defaultARM64Probability, + "probability that clusters will be created with 'arm64' CPU architecture "+ + "for tests that support 'arm64' (default 0)") rootCmd.AddCommand(&cobra.Command{ Use: `version`, @@ -224,7 +271,6 @@ runner itself. user: username, clusterID: clusterID, versionsBinaryOverride: versionsBinaryOverride, - enableFIPS: enableFIPS, }, false /* benchOnly */) }, } @@ -263,7 +309,6 @@ runner itself. user: username, clusterID: clusterID, versionsBinaryOverride: versionsBinaryOverride, - enableFIPS: enableFIPS, }, true /* benchOnly */) }, } @@ -378,7 +423,7 @@ func runTests(register func(registry.Registry), cfg cliCfg, benchOnly bool) erro filter := registry.NewTestFilter(cfg.args) clusterType := roachprodCluster bindTo := "" - if local { + if cloud == spec.Local { clusterType = localCluster // This will suppress the annoying "Allow incoming network connections" popup from diff --git a/pkg/cmd/roachtest/slack.go b/pkg/cmd/roachtest/slack.go index 1e653d142f61..40f2505003e0 100644 --- a/pkg/cmd/roachtest/slack.go +++ b/pkg/cmd/roachtest/slack.go @@ -75,8 +75,6 @@ func postSlackReport(pass, fail, skip map[*testImpl]struct{}) { switch { case cloud != "": prefix = strings.ToUpper(cloud) - case local: - prefix = "LOCAL" default: prefix = "GCE" } diff --git a/pkg/cmd/roachtest/spec/cluster_spec.go b/pkg/cmd/roachtest/spec/cluster_spec.go index dfae8d44733b..592a7054c268 100644 --- a/pkg/cmd/roachtest/spec/cluster_spec.go +++ b/pkg/cmd/roachtest/spec/cluster_spec.go @@ -40,7 +40,8 @@ const ( // look like. It becomes part of a clusterConfig when the cluster is created. 
type ClusterSpec struct { Cloud string - InstanceType string // auto-chosen if left empty + Arch vm.CPUArch // CPU architecture; auto-chosen if left empty + InstanceType string // auto-chosen if left empty NodeCount int // CPUs is the number of CPUs per node. CPUs int @@ -156,7 +157,7 @@ func getAzureOpts(machineType string, zones []string) vm.ProviderOpts { // RoachprodOpts returns the opts to use when calling `roachprod.Create()` // in order to create the cluster described in the spec. func (s *ClusterSpec) RoachprodOpts( - clusterName string, useIOBarrier bool, + clusterName string, useIOBarrier bool, arch vm.CPUArch, ) (vm.CreateOpts, vm.ProviderOpts, error) { createVMOpts := vm.DefaultCreateOpts() @@ -187,29 +188,41 @@ func (s *ClusterSpec) RoachprodOpts( } createVMOpts.GeoDistributed = s.Geo + createVMOpts.Arch = string(arch) machineType := s.InstanceType ssdCount := s.SSDs + if s.CPUs != 0 { // Default to the user-supplied machine type, if any. // Otherwise, pick based on requested CPU count. + var selectedArch vm.CPUArch + if len(machineType) == 0 { // If no machine type was specified, choose one // based on the cloud and CPU count. switch s.Cloud { case AWS: - machineType = AWSMachineType(s.CPUs, s.HighMem) + machineType, selectedArch = AWSMachineType(s.CPUs, s.HighMem, arch) case GCE: - machineType = GCEMachineType(s.CPUs, s.HighMem) + machineType, selectedArch = GCEMachineType(s.CPUs, s.HighMem, arch) case Azure: machineType = AzureMachineType(s.CPUs, s.HighMem) } } + if selectedArch != "" && selectedArch != arch { + // TODO(srosenberg): we need a better way to monitor the rate of this mismatch, i.e., + // other than grepping cluster creation logs. 
+ fmt.Printf("WARN: requested arch %s for machineType %s, but selected %s\n", arch, machineType, selectedArch) + createVMOpts.Arch = string(selectedArch) + } // Local SSD can only be requested // - if configured to prefer doing so, // - if no particular volume size is requested, and, // - on AWS, if the machine type supports it. - if s.PreferLocalSSD && s.VolumeSize == 0 && (s.Cloud != AWS || awsMachineSupportsSSD(machineType)) { + // - on GCE, if the machine type is not ARM64. + if s.PreferLocalSSD && s.VolumeSize == 0 && (s.Cloud != AWS || awsMachineSupportsSSD(machineType)) && + (s.Cloud != GCE || selectedArch != vm.ArchARM64) { // Ensure SSD count is at least 1 if UseLocalSSD is true. if ssdCount == 0 { ssdCount = 1 @@ -242,6 +255,12 @@ func (s *ClusterSpec) RoachprodOpts( } } + if createVMOpts.Arch == string(vm.ArchFIPS) && !(s.Cloud == GCE || s.Cloud == AWS) { + return vm.CreateOpts{}, nil, errors.Errorf( + "FIPS not yet supported on %s", s.Cloud, + ) + } + var providerOpts vm.ProviderOpts switch s.Cloud { case AWS: diff --git a/pkg/cmd/roachtest/spec/machine_type.go b/pkg/cmd/roachtest/spec/machine_type.go index af77a4e62e64..db9ce94520ee 100644 --- a/pkg/cmd/roachtest/spec/machine_type.go +++ b/pkg/cmd/roachtest/spec/machine_type.go @@ -10,15 +10,31 @@ package spec -import "fmt" +import ( + "fmt" + + "github.com/cockroachdb/cockroach/pkg/roachprod/vm" +) // AWSMachineType selects a machine type given the desired number of CPUs. -func AWSMachineType(cpus int, highmem bool) string { +// Also returns the architecture of the selected machine type. +func AWSMachineType(cpus int, highmem bool, arch vm.CPUArch) (string, vm.CPUArch) { // TODO(erikgrinaker): These have significantly less RAM than // their GCE counterparts. Consider harmonizing them. 
family := "c5d" // 2 GB RAM per CPU + selectedArch := vm.ArchAMD64 + if arch == vm.ArchFIPS { + selectedArch = vm.ArchFIPS + } else if arch == vm.ArchARM64 { + family = "c7g" // 2 GB RAM per CPU (graviton3) + selectedArch = vm.ArchARM64 + } + if highmem { family = "m5d" // 4 GB RAM per CPU + if arch == vm.ArchARM64 { + family = "m7g" // 4 GB RAM per CPU (graviton3) + } } var size string @@ -33,36 +49,63 @@ func AWSMachineType(cpus int, highmem bool) string { size = "4xlarge" case cpus <= 36: size = "9xlarge" + if family == "c7g" || family == "m7g" { + size = "8xlarge" + } case cpus <= 72: size = "18xlarge" + if family == "c7g" || family == "m7g" { + size = "16xlarge" + } case cpus <= 96: size = "24xlarge" default: panic(fmt.Sprintf("no aws machine type with %d cpus", cpus)) } - // There is no c5d.24xlarge. + // There is no m7g.24xlarge, fall back to m5d.24xlarge. + if family == "m7g" && size == "24xlarge" { + family = "m5d" + selectedArch = vm.ArchAMD64 + } + // There is no c7g.24xlarge, fall back to c5d.24xlarge. + if family == "c7g" && size == "24xlarge" { + family = "c5d" + selectedArch = vm.ArchAMD64 + } + + // There is no c5d.24xlarge, fall back to m5d.24xlarge. if family == "c5d" && size == "24xlarge" { family = "m5d" } - return fmt.Sprintf("%s.%s", family, size) + return fmt.Sprintf("%s.%s", family, size), selectedArch } // GCEMachineType selects a machine type given the desired number of CPUs. -func GCEMachineType(cpus int, highmem bool) string { +// Also returns the architecture of the selected machine type. +func GCEMachineType(cpus int, highmem bool, arch vm.CPUArch) (string, vm.CPUArch) { // TODO(peter): This is awkward: at or below 16 cpus, use n1-standard so that // the machines have a decent amount of RAM. We could use custom machine // configurations, but the rules for the amount of RAM per CPU need to be // determined (you can't request any arbitrary amount of RAM). 
series := "n1" + selectedArch := vm.ArchAMD64 + if arch == vm.ArchFIPS { + selectedArch = vm.ArchFIPS + } kind := "standard" // 3.75 GB RAM per CPU if highmem { kind = "highmem" // 6.5 GB RAM per CPU } else if cpus > 16 { kind = "highcpu" // 0.9 GB RAM per CPU } - return fmt.Sprintf("%s-%s-%d", series, kind, cpus) + if arch == vm.ArchARM64 && !highmem && cpus <= 48 { + series = "t2a" + kind = "standard" + selectedArch = vm.ArchARM64 + } + return fmt.Sprintf("%s-%s-%d", series, kind, cpus), selectedArch } // AzureMachineType selects a machine type given the desired number of CPUs. diff --git a/pkg/cmd/roachtest/spec/option.go b/pkg/cmd/roachtest/spec/option.go index 00acc79c0dcf..ee8080f2adc2 100644 --- a/pkg/cmd/roachtest/spec/option.go +++ b/pkg/cmd/roachtest/spec/option.go @@ -10,13 +10,39 @@ package spec -import "time" +import ( + "time" + + "github.com/cockroachdb/cockroach/pkg/roachprod/vm" +) // Option is the interface satisfied by options to MakeClusterSpec. type Option interface { apply(spec *ClusterSpec) } +type cloudOption string + +func (o cloudOption) apply(spec *ClusterSpec) { + spec.Cloud = string(o) +} + +// Cloud controls what cloud is used to create the cluster. +func Cloud(s string) Option { + return cloudOption(s) +} + +type archOption string + +func (o archOption) apply(spec *ClusterSpec) { + spec.Arch = vm.CPUArch(o) +} + +// Request specific CPU architecture. +func Arch(arch vm.CPUArch) Option { + return archOption(arch) +} + type nodeCPUOption int func (o nodeCPUOption) apply(spec *ClusterSpec) { diff --git a/pkg/cmd/roachtest/test_impl.go b/pkg/cmd/roachtest/test_impl.go index a3600257fa51..23424b7da8f4 100644 --- a/pkg/cmd/roachtest/test_impl.go +++ b/pkg/cmd/roachtest/test_impl.go @@ -122,6 +122,7 @@ func (t *testImpl) BuildVersion() *version.Version { return &t.buildVersion } +// Cockroach returns the path to the cockroach binary. 
func (t *testImpl) Cockroach() string { return t.cockroach } diff --git a/pkg/cmd/roachtest/test_registry_test.go b/pkg/cmd/roachtest/test_registry_test.go index 9ab9c2780537..5678d25a09d2 100644 --- a/pkg/cmd/roachtest/test_registry_test.go +++ b/pkg/cmd/roachtest/test_registry_test.go @@ -14,6 +14,7 @@ import ( "testing" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec" + "github.com/cockroachdb/cockroach/pkg/roachprod/vm" "github.com/cockroachdb/cockroach/pkg/testutils" "github.com/stretchr/testify/require" ) @@ -41,5 +42,11 @@ func TestMakeTestRegistry(t *testing.T) { require.Equal(t, "foo", s.InstanceType) require.EqualValues(t, 4, s.CPUs) require.True(t, s.TerminateOnMigration) + + s = r.MakeClusterSpec(10, spec.CPU(16), spec.Arch(vm.ArchARM64)) + require.EqualValues(t, 10, s.NodeCount) + require.Equal(t, "foo", s.InstanceType) + require.EqualValues(t, 16, s.CPUs) + require.EqualValues(t, vm.ArchARM64, s.Arch) }) } diff --git a/pkg/cmd/roachtest/test_runner.go b/pkg/cmd/roachtest/test_runner.go index 7160cd4b9ec3..1c165880601a 100644 --- a/pkg/cmd/roachtest/test_runner.go +++ b/pkg/cmd/roachtest/test_runner.go @@ -58,6 +58,8 @@ var ( // reference error used when cluster creation fails for a test errClusterProvisioningFailed = fmt.Errorf("cluster could not be created") + + prng, _ = randutil.NewLockedPseudoRand() ) // testRunner runs tests. 
@@ -351,11 +353,12 @@ func defaultClusterAllocator( allocateCluster := func( ctx context.Context, t registry.TestSpec, + arch vm.CPUArch, alloc *quotapool.IntAlloc, artifactsDir string, wStatus *workerStatus, ) (*clusterImpl, *vm.CreateOpts, error) { - wStatus.SetStatus("creating cluster") + wStatus.SetStatus(fmt.Sprintf("creating cluster (arch=%q)", arch)) defer wStatus.SetStatus("") existingClusterName := clustersOpt.clusterName @@ -372,6 +375,9 @@ func defaultClusterAllocator( skipStop: r.config.skipClusterStopOnAttach, skipWipe: r.config.skipClusterWipeOnAttach, } + // TODO(srosenberg): we need to think about validation here. Attaching to an incompatible cluster, e.g., + // using arm64 AMI with amd64 binary, would result in obscure errors. The test runner ensures compatibility + // during cluster reuse, whereas attachment via CLI (e.g., via roachprod) does not. lopt.l.PrintfCtx(ctx, "Attaching to existing cluster %s for test %s", existingClusterName, t.Name) c, err := attachToExistingCluster(ctx, existingClusterName, clusterL, t.Cluster, opt, r.cr) if err == nil { @@ -382,11 +388,11 @@ func defaultClusterAllocator( } // Fall through to create new cluster with name override. 
lopt.l.PrintfCtx( - ctx, "Creating new cluster with custom name %q for test %s: %s", - clustersOpt.clusterName, t.Name, t.Cluster, + ctx, "Creating new cluster with custom name %q for test %s: %s (arch=%q)", + clustersOpt.clusterName, t.Name, t.Cluster, arch, ) } else { - lopt.l.PrintfCtx(ctx, "Creating new cluster for test %s: %s", t.Name, t.Cluster) + lopt.l.PrintfCtx(ctx, "Creating new cluster for test %s: %s (arch=%q)", t.Name, t.Cluster, arch) } cfg := clusterConfig{ @@ -396,6 +402,7 @@ func defaultClusterAllocator( username: clustersOpt.user, localCluster: clustersOpt.typ == localCluster, alloc: alloc, + arch: arch, } return clusterFactory.newCluster(ctx, cfg, wStatus.SetStatus, lopt.tee) } @@ -405,6 +412,7 @@ func defaultClusterAllocator( type clusterAllocatorFn func( ctx context.Context, t registry.TestSpec, + arch vm.CPUArch, alloc *quotapool.IntAlloc, artifactsDir string, wStatus *workerStatus, @@ -485,8 +493,6 @@ func (r *testRunner) runWorker( } }() - prng, _ := randutil.NewPseudoRand() - // Loop until there's no more work in the pool, we get interrupted, or an // error occurs. for { @@ -532,7 +538,7 @@ func (r *testRunner) runWorker( // Attempt to reuse existing cluster. if c != nil && testToRun.canReuseCluster { err = func() error { - l.PrintfCtx(ctx, "Using existing cluster: %s. Wiping", c.name) + l.PrintfCtx(ctx, "Using existing cluster: %s (arch=%q). Wiping", c.name, c.arch) if err := c.WipeE(ctx, l); err != nil { return err } @@ -559,10 +565,48 @@ func (r *testRunner) runWorker( // Let's attempt to create a fresh one. testToRun.canReuseCluster = false } + // sanity check + if c.spec.Cloud != spec.Local && c.spec.Arch != "" && c.arch != c.spec.Arch { + return errors.Newf("cluster arch %q does not match specified arch %q on cloud: %q", c.arch, c.spec.Arch, c.spec.Cloud) + } + } + arch := testToRun.spec.Cluster.Arch + // N.B. local cluster can mix different CPU architectures via emulation; e.g., mac silicon running x86. 
+ if testToRun.canReuseCluster && c != nil && c.spec.Cloud != spec.Local { + // We're reusing a non-local cluster, so we must use the same arch. + arch = c.arch + } + if arch == "" { + // CPU architecture is unspecified, choose one according to the probability distribution. + arch = vm.ArchAMD64 + if prng.Float64() < arm64Probability { + arch = vm.ArchARM64 + } else if prng.Float64() < fipsProbability { + // N.B. branch is taken with probability (1 - arm64Probability) * fipsProbability, which is the unconditional P(fips); fipsProbability itself is P(fips | amd64). + // N.B. FIPS is only supported on 'amd64' at this time. + arch = vm.ArchFIPS + } + if testToRun.spec.Benchmark { + // TODO(srosenberg): enable after https://github.com/cockroachdb/cockroach/issues/104213 + l.PrintfCtx(ctx, "Disabling randomly chosen arch=%q, %s", arch, testToRun.spec.Name) + arch = vm.ArchAMD64 + } + l.PrintfCtx(ctx, "Using randomly chosen arch=%q, %s", arch, testToRun.spec.Name) + } else { + l.PrintfCtx(ctx, "Using specified arch=%q, %s", arch, testToRun.spec.Name) + } + // N.B. if canReuseCluster is false, then the previous cluster has been destroyed; new one will be created below. + if testToRun.canReuseCluster && c != nil && c.arch != arch { + // Non-local cluster that's being reused must have the same architecture as was ensured above. + if c.spec.Cloud != spec.Local { + return errors.New("infeasible path: non-local cluster arch mismatch") + } + // Local cluster is now reused to emulate a different CPU architecture. + c.arch = arch } // Verify that required native libraries are available. - if err = VerifyLibraries(testToRun.spec.NativeLibs); err != nil { + if err = VerifyLibraries(testToRun.spec.NativeLibs, arch); err != nil { shout(ctx, l, stdout, "Library verification failed: %s", err) return err } @@ -574,13 +618,14 @@ func (r *testRunner) runWorker( // Create a new cluster if can't reuse or reuse attempt failed. // N.B. non-reusable cluster would have been destroyed above. 
wStatus.SetTest(nil /* test */, testToRun) - wStatus.SetStatus("creating cluster") - c, vmCreateOpts, clusterCreateErr = allocateCluster(ctx, testToRun.spec, testToRun.alloc, artifactsRootDir, wStatus) + c, vmCreateOpts, clusterCreateErr = allocateCluster(ctx, testToRun.spec, arch, testToRun.alloc, artifactsRootDir, wStatus) if clusterCreateErr != nil { clusterCreateErr = errors.Mark(clusterCreateErr, errClusterProvisioningFailed) atomic.AddInt32(&r.numClusterErrs, 1) shout(ctx, l, stdout, "Unable to create (or reuse) cluster for test %s due to: %s.", testToRun.spec.Name, clusterCreateErr) + } else { + l.PrintfCtx(ctx, "Created new cluster for test %s: %s (arch=%q)", testToRun.spec.Name, c.Name(), arch) } } // Prepare the test's logger. Always set this up with real files, using a @@ -606,9 +651,9 @@ func (r *testRunner) runWorker( } t := &testImpl{ spec: &testToRun.spec, - cockroach: cockroach, - cockroachShort: cockroachShort, - deprecatedWorkload: workload, + cockroach: cockroach[arch], + cockroachShort: cockroachShort[arch], + deprecatedWorkload: workload[arch], buildVersion: r.buildVersion, artifactsDir: artifactsDir, artifactsSpec: artifactsSpec, @@ -633,6 +678,9 @@ func (r *testRunner) runWorker( shout(ctx, l, stdout, "failed to post issue: %s", err) } } else { + // Now run the test. 
+ l.PrintfCtx(ctx, "Starting test: %s:%d on cluster=%s (arch=%q)", testToRun.spec.Name, testToRun.runNum, c.Name(), arch) + c.setTest(t) err = c.PutLibraries(ctx, "./lib", t.spec.NativeLibs) diff --git a/pkg/cmd/roachtest/test_test.go b/pkg/cmd/roachtest/test_test.go index b42236a7dbd8..941dfd1ac50b 100644 --- a/pkg/cmd/roachtest/test_test.go +++ b/pkg/cmd/roachtest/test_test.go @@ -98,6 +98,7 @@ func nilLogger() *logger.Logger { func alwaysFailingClusterAllocator( ctx context.Context, t registry.TestSpec, + arch vm.CPUArch, alloc *quotapool.IntAlloc, artifactsDir string, wStatus *workerStatus, diff --git a/pkg/cmd/roachtest/tests/BUILD.bazel b/pkg/cmd/roachtest/tests/BUILD.bazel index 45ed875dc05a..304a41cf4ff0 100644 --- a/pkg/cmd/roachtest/tests/BUILD.bazel +++ b/pkg/cmd/roachtest/tests/BUILD.bazel @@ -184,6 +184,7 @@ go_library( "//pkg/roachprod/install", "//pkg/roachprod/logger", "//pkg/roachprod/prometheus", + "//pkg/roachprod/vm", "//pkg/security/username", "//pkg/server", "//pkg/server/serverpb", diff --git a/pkg/cmd/roachtest/tests/autoupgrade.go b/pkg/cmd/roachtest/tests/autoupgrade.go index 88b308b10ae2..c3aef0c3dea4 100644 --- a/pkg/cmd/roachtest/tests/autoupgrade.go +++ b/pkg/cmd/roachtest/tests/autoupgrade.go @@ -260,6 +260,7 @@ func registerAutoUpgrade(r registry.Registry) { t.Skip("Skip under ARM64. 
See https://github.com/cockroachdb/cockroach/issues/89268") } pred, err := PredecessorVersion(*t.BuildVersion()) + if err != nil { t.Fatal(err) } diff --git a/pkg/cmd/roachtest/tests/cdc.go b/pkg/cmd/roachtest/tests/cdc.go index 9a7cba8726b8..a656127e1ca0 100644 --- a/pkg/cmd/roachtest/tests/cdc.go +++ b/pkg/cmd/roachtest/tests/cdc.go @@ -25,7 +25,6 @@ import ( "net/url" "path/filepath" "regexp" - "runtime" "sort" "strconv" "strings" @@ -44,6 +43,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/jobs/jobspb" "github.com/cockroachdb/cockroach/pkg/roachprod/install" "github.com/cockroachdb/cockroach/pkg/roachprod/logger" + "github.com/cockroachdb/cockroach/pkg/roachprod/vm" "github.com/cockroachdb/cockroach/pkg/testutils/sqlutils" "github.com/cockroachdb/cockroach/pkg/util/log" "github.com/cockroachdb/cockroach/pkg/util/protoutil" @@ -313,9 +313,6 @@ func cdcBasicTest(ctx context.Context, t test.Test, c cluster.Cluster, args cdcT } func runCDCBank(ctx context.Context, t test.Test, c cluster.Cluster) { - if runtime.GOARCH == "arm64" { - t.Skip("Skipping cdc/bank under ARM64.") - } // Make the logs dir on every node to work around the `roachprod get logs` // spam. c.Run(ctx, c.All(), `mkdir -p logs`) @@ -683,9 +680,11 @@ func runCDCKafkaAuth(ctx context.Context, t test.Test, c cluster.Cluster) { func registerCDC(r registry.Registry) { r.Add(registry.TestSpec{ - Name: "cdc/tpcc-1000", - Owner: registry.OwnerCDC, - Cluster: r.MakeClusterSpec(4, spec.CPU(16)), + Name: "cdc/tpcc-1000", + Owner: registry.OwnerCDC, + Benchmark: true, + // N.B. ARM64 is not yet supported, see https://github.com/cockroachdb/cockroach/issues/103888. 
+ Cluster: r.MakeClusterSpec(4, spec.CPU(16), spec.Arch(vm.ArchAMD64)), RequiresLicense: true, Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { cdcBasicTest(ctx, t, c, cdcTestArgs{ @@ -698,9 +697,11 @@ func registerCDC(r registry.Registry) { }, }) r.Add(registry.TestSpec{ - Name: "cdc/tpcc-1000/sink=null", - Owner: registry.OwnerCDC, - Cluster: r.MakeClusterSpec(4, spec.CPU(16)), + Name: "cdc/tpcc-1000/sink=null", + Owner: registry.OwnerCDC, + Benchmark: true, + // N.B. ARM64 is not yet supported, see https://github.com/cockroachdb/cockroach/issues/103888. + Cluster: r.MakeClusterSpec(4, spec.CPU(16), spec.Arch(vm.ArchAMD64)), Tags: []string{"manual"}, RequiresLicense: true, Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { @@ -715,9 +716,11 @@ func registerCDC(r registry.Registry) { }, }) r.Add(registry.TestSpec{ - Name: "cdc/initial-scan", - Owner: registry.OwnerCDC, - Cluster: r.MakeClusterSpec(4, spec.CPU(16)), + Name: "cdc/initial-scan", + Owner: registry.OwnerCDC, + Benchmark: true, + // N.B. ARM64 is not yet supported, see https://github.com/cockroachdb/cockroach/issues/103888. + Cluster: r.MakeClusterSpec(4, spec.CPU(16), spec.Arch(vm.ArchAMD64)), RequiresLicense: true, Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { cdcBasicTest(ctx, t, c, cdcTestArgs{ @@ -731,9 +734,11 @@ func registerCDC(r registry.Registry) { }, }) r.Add(registry.TestSpec{ - Name: "cdc/sink-chaos", - Owner: `cdc`, - Cluster: r.MakeClusterSpec(4, spec.CPU(16)), + Name: "cdc/sink-chaos", + Owner: `cdc`, + Benchmark: true, + // N.B. ARM64 is not yet supported, see https://github.com/cockroachdb/cockroach/issues/103888. 
+ Cluster: r.MakeClusterSpec(4, spec.CPU(16), spec.Arch(vm.ArchAMD64)), RequiresLicense: true, Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { cdcBasicTest(ctx, t, c, cdcTestArgs{ @@ -747,9 +752,11 @@ func registerCDC(r registry.Registry) { }, }) r.Add(registry.TestSpec{ - Name: "cdc/crdb-chaos", - Owner: `cdc`, - Cluster: r.MakeClusterSpec(4, spec.CPU(16)), + Name: "cdc/crdb-chaos", + Owner: `cdc`, + Benchmark: true, + // N.B. ARM64 is not yet supported, see https://github.com/cockroachdb/cockroach/issues/103888. + Cluster: r.MakeClusterSpec(4, spec.CPU(16), spec.Arch(vm.ArchAMD64)), RequiresLicense: true, Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { cdcBasicTest(ctx, t, c, cdcTestArgs{ @@ -770,7 +777,9 @@ func registerCDC(r registry.Registry) { // TODO(mrtracy): This workload is designed to be running on a 20CPU nodes, // but this cannot be allocated without some sort of configuration outside // of this test. Look into it. - Cluster: r.MakeClusterSpec(4, spec.CPU(16)), + Benchmark: true, + // N.B. ARM64 is not yet supported, see https://github.com/cockroachdb/cockroach/issues/103888. + Cluster: r.MakeClusterSpec(4, spec.CPU(16), spec.Arch(vm.ArchAMD64)), RequiresLicense: true, Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { cdcBasicTest(ctx, t, c, cdcTestArgs{ @@ -789,9 +798,11 @@ func registerCDC(r registry.Registry) { }, }) r.Add(registry.TestSpec{ - Name: "cdc/cloud-sink-gcs/rangefeed=true", - Owner: `cdc`, - Cluster: r.MakeClusterSpec(4, spec.CPU(16)), + Name: "cdc/cloud-sink-gcs/rangefeed=true", + Owner: `cdc`, + Benchmark: true, + // N.B. ARM64 is not yet supported, see https://github.com/cockroachdb/cockroach/issues/103888. 
+ Cluster: r.MakeClusterSpec(4, spec.CPU(16), spec.Arch(vm.ArchAMD64)), RequiresLicense: true, Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { cdcBasicTest(ctx, t, c, cdcTestArgs{ @@ -810,9 +821,11 @@ func registerCDC(r registry.Registry) { }, }) r.Add(registry.TestSpec{ - Name: "cdc/pubsub-sink", - Owner: `cdc`, - Cluster: r.MakeClusterSpec(4, spec.CPU(16)), + Name: "cdc/pubsub-sink", + Owner: `cdc`, + Benchmark: true, + // N.B. ARM64 is not yet supported, see https://github.com/cockroachdb/cockroach/issues/103888. + Cluster: r.MakeClusterSpec(4, spec.CPU(16), spec.Arch(vm.ArchAMD64)), RequiresLicense: true, Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { cdcBasicTest(ctx, t, c, cdcTestArgs{ @@ -836,9 +849,11 @@ func registerCDC(r registry.Registry) { // TODO(rui): Change to a shorter test as it just needs to validate // permissions and shouldn't need to run a full 30m workload. r.Add(registry.TestSpec{ - Name: "cdc/pubsub-sink/assume-role", - Owner: `cdc`, - Cluster: r.MakeClusterSpec(4, spec.CPU(16)), + Name: "cdc/pubsub-sink/assume-role", + Owner: `cdc`, + Benchmark: true, + // N.B. ARM64 is not yet supported, see https://github.com/cockroachdb/cockroach/issues/103888. + Cluster: r.MakeClusterSpec(4, spec.CPU(16), spec.Arch(vm.ArchAMD64)), RequiresLicense: true, Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { cdcBasicTest(ctx, t, c, cdcTestArgs{ @@ -863,9 +878,11 @@ func registerCDC(r registry.Registry) { // TODO(rui): Change to a shorter test as it just needs to validate // permissions and shouldn't need to run a full 30m workload. r.Add(registry.TestSpec{ - Name: "cdc/cloud-sink-gcs/assume-role", - Owner: `cdc`, - Cluster: r.MakeClusterSpec(4, spec.CPU(16)), + Name: "cdc/cloud-sink-gcs/assume-role", + Owner: `cdc`, + Benchmark: true, + // N.B. ARM64 is not yet supported, see https://github.com/cockroachdb/cockroach/issues/103888. 
+ Cluster: r.MakeClusterSpec(4, spec.CPU(16), spec.Arch(vm.ArchAMD64)), RequiresLicense: true, Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { cdcBasicTest(ctx, t, c, cdcTestArgs{ @@ -903,18 +920,21 @@ func registerCDC(r registry.Registry) { }) */ r.Add(registry.TestSpec{ - Name: "cdc/kafka-auth", - Owner: `cdc`, - Cluster: r.MakeClusterSpec(1), + Name: "cdc/kafka-auth", + Owner: `cdc`, + Benchmark: true, + // N.B. ARM64 is not yet supported, see https://github.com/cockroachdb/cockroach/issues/103888. + Cluster: r.MakeClusterSpec(1, spec.Arch(vm.ArchAMD64)), RequiresLicense: true, Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { runCDCKafkaAuth(ctx, t, c) }, }) r.Add(registry.TestSpec{ - Name: "cdc/bank", - Owner: `cdc`, - Cluster: r.MakeClusterSpec(4), + Name: "cdc/bank", + Owner: `cdc`, + // N.B. ARM64 is not yet supported, see https://github.com/cockroachdb/cockroach/issues/103888. + Cluster: r.MakeClusterSpec(4, spec.Arch(vm.ArchAMD64)), RequiresLicense: true, Timeout: 30 * time.Minute, Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { diff --git a/pkg/cmd/roachtest/tests/follower_reads.go b/pkg/cmd/roachtest/tests/follower_reads.go index a2ee91ef5a7b..cebb3c439cb2 100644 --- a/pkg/cmd/roachtest/tests/follower_reads.go +++ b/pkg/cmd/roachtest/tests/follower_reads.go @@ -19,7 +19,6 @@ import ( "net/http" "reflect" "regexp" - "runtime" "strconv" "strings" "time" @@ -100,9 +99,6 @@ func registerFollowerReads(r registry.Registry) { spec.CPU(2), ), Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { - if runtime.GOARCH == "arm64" { - t.Skip("Skip under ARM64. 
See https://github.com/cockroachdb/cockroach/issues/89268") - } runFollowerReadsMixedVersionSingleRegionTest(ctx, t, c, *t.BuildVersion()) }, }) diff --git a/pkg/cmd/roachtest/tests/import.go b/pkg/cmd/roachtest/tests/import.go index 89e006e30589..5e6e35741fb0 100644 --- a/pkg/cmd/roachtest/tests/import.go +++ b/pkg/cmd/roachtest/tests/import.go @@ -359,6 +359,7 @@ func registerImportMixedVersion(r registry.Registry) { t.Skip("Skip under ARM64. See https://github.com/cockroachdb/cockroach/issues/89268") } predV, err := PredecessorVersion(*t.BuildVersion()) + if err != nil { t.Fatal(err) } diff --git a/pkg/cmd/roachtest/tests/indexes.go b/pkg/cmd/roachtest/tests/indexes.go index cb2cdff6124b..2e2e619531d7 100644 --- a/pkg/cmd/roachtest/tests/indexes.go +++ b/pkg/cmd/roachtest/tests/indexes.go @@ -140,9 +140,3 @@ func registerNIndexes(r registry.Registry, secondaryIndexes int) { func registerIndexes(r registry.Registry) { registerNIndexes(r, 2) } - -func registerIndexesBench(r registry.Registry) { - for i := 0; i <= 100; i++ { - registerNIndexes(r, i) - } -} diff --git a/pkg/cmd/roachtest/tests/mixed_version_cdc.go b/pkg/cmd/roachtest/tests/mixed_version_cdc.go index 1a86b1977add..e5f291e99f39 100644 --- a/pkg/cmd/roachtest/tests/mixed_version_cdc.go +++ b/pkg/cmd/roachtest/tests/mixed_version_cdc.go @@ -25,6 +25,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" + "github.com/cockroachdb/cockroach/pkg/roachprod/vm" "github.com/cockroachdb/cockroach/pkg/util/randutil" "github.com/cockroachdb/cockroach/pkg/util/syncutil" "github.com/cockroachdb/cockroach/pkg/util/timeutil" @@ -65,9 +66,10 @@ func registerCDCMixedVersions(r registry.Registry) { zones = teamcityAgentZone } r.Add(registry.TestSpec{ - Name: "cdc/mixed-versions", - Owner: registry.OwnerTestEng, - Cluster: r.MakeClusterSpec(5, spec.Zones(zones)), + Name: 
"cdc/mixed-versions", + Owner: registry.OwnerTestEng, + // N.B. ARM64 is not yet supported, see https://github.com/cockroachdb/cockroach/issues/103888. + Cluster: r.MakeClusterSpec(5, spec.Zones(zones), spec.Arch(vm.ArchAMD64)), Timeout: timeout, RequiresLicense: true, Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { diff --git a/pkg/cmd/roachtest/tests/mixed_version_schemachange.go b/pkg/cmd/roachtest/tests/mixed_version_schemachange.go index 797fd97ce65b..e8e53e438e94 100644 --- a/pkg/cmd/roachtest/tests/mixed_version_schemachange.go +++ b/pkg/cmd/roachtest/tests/mixed_version_schemachange.go @@ -34,6 +34,7 @@ func registerSchemaChangeMixedVersions(r registry.Registry) { if runtime.GOARCH == "arm64" { t.Skip("Skip under ARM64. See https://github.com/cockroachdb/cockroach/issues/89268") } + maxOps := 100 concurrency := 5 if c.IsLocal() { diff --git a/pkg/cmd/roachtest/tests/rebalance_load.go b/pkg/cmd/roachtest/tests/rebalance_load.go index d3223a0000d2..81485336de6e 100644 --- a/pkg/cmd/roachtest/tests/rebalance_load.go +++ b/pkg/cmd/roachtest/tests/rebalance_load.go @@ -15,7 +15,6 @@ import ( gosql "database/sql" "fmt" "math/rand" - "runtime" "sort" "strings" "time" @@ -157,9 +156,6 @@ func registerRebalanceLoad(r registry.Registry) { Owner: registry.OwnerKV, Cluster: r.MakeClusterSpec(4), // the last node is just used to generate load Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { - if runtime.GOARCH == "arm64" { - t.Skip("Skip under ARM64. See https://github.com/cockroachdb/cockroach/issues/89268") - } if c.IsLocal() { concurrency = 32 fmt.Printf("lowering concurrency to %d in local testing\n", concurrency) diff --git a/pkg/cmd/roachtest/tests/tpcc.go b/pkg/cmd/roachtest/tests/tpcc.go index c22666379bcc..5333ce823c89 100644 --- a/pkg/cmd/roachtest/tests/tpcc.go +++ b/pkg/cmd/roachtest/tests/tpcc.go @@ -297,6 +297,7 @@ var tpccSupportedWarehouses = []struct { // TODO(tbg): this number is copied from gce-n4cpu16. 
The real number should be a // little higher, find out what it is. {hardware: "gce-n5cpu16", v: version.MustParse(`v19.1.0-0`), warehouses: 1300}, + {hardware: "aws-n5cpu16", v: version.MustParse(`v19.1.0-0`), warehouses: 2100}, // Ditto. {hardware: "gce-n5cpu16", v: version.MustParse(`v2.1.0-0`), warehouses: 1300}, } @@ -504,6 +505,7 @@ func registerTPCC(r registry.Registry) { runTPCCMixedHeadroom(ctx, t, c, cloud, 1) }, }) + r.Add(registry.TestSpec{ // run the same mixed-headroom test, but going back two versions Name: "tpcc/mixed-headroom/multiple-upgrades/" + mixedHeadroomSpec.String(), @@ -1383,117 +1385,6 @@ func runTPCCBench(ctx context.Context, t test.Test, c cluster.Cluster, b tpccBen } } -func registerTPCCBench(r registry.Registry) { - specs := []tpccBenchSpec{ - { - Nodes: 3, - CPUs: 4, - - LoadWarehouses: 1000, - EstimatedMax: 325, - }, - { - Nodes: 3, - CPUs: 16, - - LoadWarehouses: 2000, - EstimatedMax: 1300, - }, - // objective 1, key result 1. - { - Nodes: 30, - CPUs: 16, - - LoadWarehouses: 10000, - EstimatedMax: 5300, - }, - // objective 1, key result 2. - { - Nodes: 18, - CPUs: 16, - LoadConfig: singlePartitionedLoadgen, - - LoadWarehouses: 10000, - EstimatedMax: 8000, - }, - // objective 2, key result 1. - { - Nodes: 7, - CPUs: 16, - Chaos: true, - - LoadWarehouses: 5000, - EstimatedMax: 2000, - }, - // objective 3, key result 1. - { - Nodes: 3, - CPUs: 16, - Distribution: multiZone, - - LoadWarehouses: 2000, - EstimatedMax: 1000, - }, - // objective 3, key result 2. - { - Nodes: 9, - CPUs: 16, - Distribution: multiRegion, - LoadConfig: multiLoadgen, - - LoadWarehouses: 12000, - EstimatedMax: 8000, - }, - // objective 4, key result 2. - { - Nodes: 64, - CPUs: 16, - - LoadWarehouses: 50000, - EstimatedMax: 40000, - }, - - // See https://github.com/cockroachdb/cockroach/issues/31409 for the next three specs. 
- { - Nodes: 6, - CPUs: 16, - - LoadWarehouses: 5000, - EstimatedMax: 3000, - LoadConfig: singlePartitionedLoadgen, - }, - { - Nodes: 12, - CPUs: 16, - - LoadWarehouses: 10000, - EstimatedMax: 6000, - LoadConfig: singlePartitionedLoadgen, - }, - { - Nodes: 24, - CPUs: 16, - - LoadWarehouses: 20000, - EstimatedMax: 12000, - LoadConfig: singlePartitionedLoadgen, - }, - - // Requested by @awoods87. - { - Nodes: 11, - CPUs: 32, - - LoadWarehouses: 10000, - EstimatedMax: 8000, - }, - } - - for _, b := range specs { - registerTPCCBenchSpec(r, b) - } -} - // makeWorkloadScrapeNodes creates a ScrapeNode for every workloadInstance. func makeWorkloadScrapeNodes( workloadNode install.Node, workloadInstances []workloadInstance, diff --git a/pkg/cmd/roachtest/tests/tpch_concurrency.go b/pkg/cmd/roachtest/tests/tpch_concurrency.go index 974cf9b9c55e..999a43c16608 100644 --- a/pkg/cmd/roachtest/tests/tpch_concurrency.go +++ b/pkg/cmd/roachtest/tests/tpch_concurrency.go @@ -198,10 +198,10 @@ func registerTPCHConcurrency(r registry.Registry) { } r.Add(registry.TestSpec{ - Name: "tpch_concurrency", - Owner: registry.OwnerSQLQueries, + Name: "tpch_concurrency", + Owner: registry.OwnerSQLQueries, Benchmark: true, - Cluster: r.MakeClusterSpec(numNodes), + Cluster: r.MakeClusterSpec(numNodes), Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { runTPCHConcurrency(ctx, t, c, true /* lowerRefreshSpansBytes */, false /* disableStreamer */) @@ -232,17 +232,11 @@ func registerTPCHConcurrency(r registry.Registry) { // TODO(yuzefovich): remove this once the streamer is stabilized. r.Add(registry.TestSpec{ -<<<<<<< HEAD - Name: "tpch_concurrency/no_streamer", - Owner: registry.OwnerSQLQueries, - Cluster: r.MakeClusterSpec(numNodes), -======= Name: "tpch_concurrency/no_streamer", Owner: registry.OwnerSQLQueries, Benchmark: true, - Timeout: timeout, Cluster: r.MakeClusterSpec(numNodes), ->>>>>>> 0df3a03e781 (roachtest: require perf. 
tests to opt in via TestSpec.Benchmark) + Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { runTPCHConcurrency(ctx, t, c, true /* lowerRefreshSpansBytes */, true /* disableStreamer */) }, diff --git a/pkg/cmd/roachtest/tests/versionupgrade.go b/pkg/cmd/roachtest/tests/versionupgrade.go index 1e6d52a42ef3..3e4f2998bc91 100644 --- a/pkg/cmd/roachtest/tests/versionupgrade.go +++ b/pkg/cmd/roachtest/tests/versionupgrade.go @@ -102,6 +102,7 @@ func runVersionUpgrade(ctx context.Context, t test.Test, c cluster.Cluster) { // of #58489 is being addressed. _ = schemaChangeStep backupStep := func(ctx context.Context, t test.Test, u *versionUpgradeTest) { + // Verify that backups can be created in various configurations. This is // important to test because changes in system tables might cause backups to // fail in mixed-version clusters. diff --git a/pkg/roachprod/install/BUILD.bazel b/pkg/roachprod/install/BUILD.bazel index 0c41730b9ac4..066c6e416245 100644 --- a/pkg/roachprod/install/BUILD.bazel +++ b/pkg/roachprod/install/BUILD.bazel @@ -29,6 +29,7 @@ go_library( "//pkg/roachprod/logger", "//pkg/roachprod/ssh", "//pkg/roachprod/ui", + "//pkg/roachprod/vm", "//pkg/roachprod/vm/aws", "//pkg/roachprod/vm/local", "//pkg/util", @@ -54,6 +55,7 @@ go_test( embed = [":install"], deps = [ "//pkg/roachprod/logger", + "//pkg/roachprod/vm", "//pkg/testutils", "//pkg/util/retry", "@com_github_cockroachdb_datadriven//:datadriven", diff --git a/pkg/roachprod/install/staging.go b/pkg/roachprod/install/staging.go index 09c31a6dd2fa..7a4382479fe5 100644 --- a/pkg/roachprod/install/staging.go +++ b/pkg/roachprod/install/staging.go @@ -17,6 +17,7 @@ import ( "path/filepath" "github.com/cockroachdb/cockroach/pkg/roachprod/logger" + "github.com/cockroachdb/cockroach/pkg/roachprod/vm" "github.com/cockroachdb/errors" ) @@ -98,30 +99,30 @@ var ( ) // ArchInfoForOS returns an ArchInfo for the given OS and Architecture if currently supported. 
-func ArchInfoForOS(os string, arch string) (archInfo, error) { - if arch != "" && arch != "amd64" && arch != "arm64" && arch != "fips" { +func ArchInfoForOS(os string, arch vm.CPUArch) (archInfo, error) { + if arch != "" && arch != vm.ArchAMD64 && arch != vm.ArchARM64 && arch != vm.ArchFIPS { return archInfo{}, errors.Errorf("unsupported architecture %q", arch) } switch os { case "linux": - if arch == "arm64" { + if arch == vm.ArchARM64 { return linux_arm64_ArchInfo, nil } - if arch == "fips" { + if arch == vm.ArchFIPS { return linux_x86_64_fips_ArchInfo, nil } return linux_x86_64_ArchInfo, nil case "darwin": - if arch == "arm64" { + if arch == vm.ArchARM64 { return darwin_arm64_ArchInfo, nil } - if arch == "fips" { + if arch == vm.ArchFIPS { return archInfo{}, errors.Errorf("%q is not supported on %q", arch, os) } return darwin_x86_64_ArchInfo, nil case "windows": - if arch == "fips" || arch == "arm64" { + if arch == vm.ArchFIPS || arch == vm.ArchARM64 { return archInfo{}, errors.Errorf("%q is not supported on %q", arch, os) } return windowsArchInfo, nil @@ -176,7 +177,7 @@ func StageApplication( applicationName string, version string, os string, - arch string, + arch vm.CPUArch, destDir string, ) error { archInfo, err := ArchInfoForOS(os, arch) @@ -226,7 +227,7 @@ func StageApplication( // URLsForApplication returns a slice of URLs that should be // downloaded for the given application. 
func URLsForApplication( - application string, version string, os string, arch string, + application string, version string, os string, arch vm.CPUArch, ) ([]*url.URL, error) { archInfo, err := ArchInfoForOS(os, arch) if err != nil { diff --git a/pkg/roachprod/install/staging_test.go b/pkg/roachprod/install/staging_test.go index 3455df5fff79..977d755bf401 100644 --- a/pkg/roachprod/install/staging_test.go +++ b/pkg/roachprod/install/staging_test.go @@ -13,6 +13,7 @@ package install import ( "testing" + "github.com/cockroachdb/cockroach/pkg/roachprod/vm" "github.com/stretchr/testify/require" ) @@ -322,7 +323,7 @@ func TestURLsForApplication(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - got, err := URLsForApplication(tt.args.application, tt.args.version, tt.args.os, tt.args.arch) + got, err := URLsForApplication(tt.args.application, tt.args.version, tt.args.os, vm.CPUArch(tt.args.arch)) if (err != nil) != tt.wantErr { t.Errorf("URLsForApplication() error = %v, wantErr %v", err, tt.wantErr) return diff --git a/pkg/roachprod/prometheus/BUILD.bazel b/pkg/roachprod/prometheus/BUILD.bazel index 6c763655eeb3..8da074a7c4e1 100644 --- a/pkg/roachprod/prometheus/BUILD.bazel +++ b/pkg/roachprod/prometheus/BUILD.bazel @@ -9,6 +9,7 @@ go_library( deps = [ "//pkg/roachprod/install", "//pkg/roachprod/logger", + "//pkg/roachprod/vm", "@com_github_cockroachdb_errors//:errors", "@com_github_prometheus_client_golang//api/prometheus/v1:prometheus", "@com_github_prometheus_common//model", diff --git a/pkg/roachprod/prometheus/prometheus.go b/pkg/roachprod/prometheus/prometheus.go index adc994e51f72..4798d7628eb7 100644 --- a/pkg/roachprod/prometheus/prometheus.go +++ b/pkg/roachprod/prometheus/prometheus.go @@ -20,6 +20,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/roachprod/install" "github.com/cockroachdb/cockroach/pkg/roachprod/logger" + "github.com/cockroachdb/cockroach/pkg/roachprod/vm" "github.com/cockroachdb/errors" promv1 
"github.com/prometheus/client_golang/api/prometheus/v1" "github.com/prometheus/common/model" @@ -189,20 +190,25 @@ type Prometheus struct { // Init creates a prometheus instance on the given cluster. func Init( - ctx context.Context, l *logger.Logger, c *install.SyncedCluster, cfg Config, + ctx context.Context, l *logger.Logger, c *install.SyncedCluster, arch vm.CPUArch, cfg Config, ) (_ *Prometheus, _ error) { + binArch := "amd64" + if arch == vm.ArchARM64 { + binArch = "arm64" + } + if len(cfg.NodeExporter) > 0 { // NB: when upgrading here, make sure to target a version that picks up this PR: // https://github.com/prometheus/node_exporter/pull/2311 // At time of writing, there hasn't been a release in over half a year. if err := c.RepeatRun(ctx, l, os.Stdout, os.Stderr, cfg.NodeExporter, "download node exporter", - ` + fmt.Sprintf(` (sudo systemctl stop node_exporter || true) && rm -rf node_exporter && mkdir -p node_exporter && curl -fsSL \ - https://github.com/prometheus/node_exporter/releases/download/v1.2.2/node_exporter-1.2.2.linux-amd64.tar.gz | + https://storage.googleapis.com/cockroach-fixtures/prometheus/node_exporter-1.2.2.linux-%s.tar.gz | tar zxv --strip-components 1 -C node_exporter -`); err != nil { +`, binArch)); err != nil { return nil, err } @@ -235,9 +241,9 @@ sudo systemd-run --unit node_exporter --same-dir ./node_exporter`, os.Stderr, cfg.PrometheusNode, "download prometheus", - `sudo rm -rf /tmp/prometheus && mkdir /tmp/prometheus && cd /tmp/prometheus && - curl -fsSL https://storage.googleapis.com/cockroach-fixtures/prometheus/prometheus-2.27.1.linux-amd64.tar.gz | tar zxv --strip-components=1`, - ); err != nil { + fmt.Sprintf(`sudo rm -rf /tmp/prometheus && mkdir /tmp/prometheus && cd /tmp/prometheus && + curl -fsSL https://storage.googleapis.com/cockroach-fixtures/prometheus/prometheus-2.27.1.linux-%s.tar.gz | tar zxv --strip-components=1`, + binArch)); err != nil { return nil, err } // create and upload prom config @@ -279,14 +285,16 @@ 
sudo systemd-run --unit prometheus --same-dir \ if cfg.Grafana.Enabled { // Install Grafana. if err := c.RepeatRun(ctx, l, - os.Stdout, - os.Stderr, cfg.PrometheusNode, "install grafana", - `sudo apt-get install -qqy apt-transport-https && + l.Stdout, + l.Stderr, cfg.PrometheusNode, "install grafana", + fmt.Sprintf(` +sudo apt-get install -qqy apt-transport-https && sudo apt-get install -qqy software-properties-common wget && -wget -q -O - https://packages.grafana.com/gpg.key | sudo apt-key add - && -echo "deb https://packages.grafana.com/enterprise/deb stable main" | sudo tee -a /etc/apt/sources.list.d/grafana.list && -sudo apt-get update -qqy && sudo apt-get install -qqy grafana-enterprise && sudo mkdir -p /var/lib/grafana/dashboards`, - ); err != nil { +sudo apt-get install -y adduser libfontconfig1 && +wget https://dl.grafana.com/enterprise/release/grafana-enterprise_9.2.3_%s.deb -O grafana-enterprise_9.2.3_%s.deb && +sudo dpkg -i grafana-enterprise_9.2.3_%s.deb && +sudo mkdir -p /var/lib/grafana/dashboards`, + binArch, binArch, binArch)); err != nil { return nil, err } diff --git a/pkg/roachprod/roachprod.go b/pkg/roachprod/roachprod.go index ff88cb0707c8..7ad6b7ef7d4b 100644 --- a/pkg/roachprod/roachprod.go +++ b/pkg/roachprod/roachprod.go @@ -517,7 +517,7 @@ func Stage( dir = stageDir } - return install.StageApplication(ctx, l, c, applicationName, version, os, arch, dir) + return install.StageApplication(ctx, l, c, applicationName, version, os, vm.CPUArch(arch), dir) } // Reset resets all VMs in a cluster. 
@@ -1357,7 +1357,7 @@ func StageURL( if stageArch != "" { arch = stageArch } - urls, err := install.URLsForApplication(applicationName, version, os, arch) + urls, err := install.URLsForApplication(applicationName, version, os, vm.CPUArch(arch)) if err != nil { return nil, err } @@ -1402,6 +1402,7 @@ func StartGrafana( ctx context.Context, l *logger.Logger, clusterName string, + arch vm.CPUArch, grafanaURL string, promCfg *prometheus.Config, // passed iff grafanaURL is empty ) error { @@ -1435,7 +1436,7 @@ func StartGrafana( promCfg.WithGrafanaDashboard(grafanaURL) } } - _, err = prometheus.Init(ctx, l, c, *promCfg) + _, err = prometheus.Init(ctx, l, c, arch, *promCfg) if err != nil { return err } diff --git a/pkg/roachprod/vm/aws/aws.go b/pkg/roachprod/vm/aws/aws.go index b1b20114bf58..cd60221c0eec 100644 --- a/pkg/roachprod/vm/aws/aws.go +++ b/pkg/roachprod/vm/aws/aws.go @@ -265,7 +265,9 @@ var defaultConfig = func() (cfg *awsConfig) { // cluster creation. If the geo flag is specified, nodes are distributed between // zones. var defaultCreateZones = []string{ - "us-east-2b", + // N.B. us-east-2a is the default zone for non-geo distributed clusters. It appears to have a higher on-demand + // capacity of c7g.8xlarge (graviton3) than us-east-2b. 
+ "us-east-2a", "us-west-2b", "eu-west-2b", } @@ -434,12 +436,13 @@ func (p *Provider) Create( var g errgroup.Group limiter := rate.NewLimiter(rate.Limit(providerOpts.CreateRateLimit), 2 /* buckets */) for i := range names { + index := i capName := names[i] placement := zones[i] res := limiter.Reserve() g.Go(func() error { time.Sleep(res.Delay()) - return p.runInstance(l, capName, placement, opts, providerOpts) + return p.runInstance(l, capName, index, placement, opts, providerOpts) }) } if err := g.Wait(); err != nil { @@ -797,7 +800,12 @@ func (p *Provider) listRegion(region string, opts ProviderOpts) (vm.List, error) // we need to do a bit of work to look up all of the various ids that // we need in order to actually allocate an instance. func (p *Provider) runInstance( - l *logger.Logger, name string, zone string, opts vm.CreateOpts, providerOpts *ProviderOpts, + l *logger.Logger, + name string, + instanceIdx int, + zone string, + opts vm.CreateOpts, + providerOpts *ProviderOpts, ) error { // There exist different flags to control the machine type when ssd is true. // This enables sane defaults for either setting but the behavior can be @@ -850,7 +858,10 @@ func (p *Provider) runInstance( var labelPairs []string addLabel := func(key, value string) { - labelPairs = append(labelPairs, fmt.Sprintf("{Key=%s,Value=%s}", key, value)) + // N.B. AWS does not allow empty values. 
+ if value != "" { + labelPairs = append(labelPairs, fmt.Sprintf("{Key=%s,Value=%s}", key, value)) + } } for key, value := range opts.CustomLabels { @@ -888,7 +899,8 @@ func (p *Provider) runInstance( extraMountOpts = "nobarrier" } } - filename, err := writeStartupScript(extraMountOpts, providerOpts.UseMultipleDisks, opts.EnableFIPS) + filename, err := writeStartupScript(extraMountOpts, providerOpts.UseMultipleDisks, opts.Arch == string(vm.ArchFIPS)) + if err != nil { return errors.Wrapf(err, "could not write AWS startup script to temp file") } @@ -904,14 +916,22 @@ func (p *Provider) runInstance( } imageID := withFlagOverride(az.region.AMI_X86_64, &providerOpts.ImageAMI) useArmAMI := strings.Index(machineType, "6g.") == 1 || strings.Index(machineType, "7g.") == 1 + if useArmAMI && (opts.Arch != "" && opts.Arch != string(vm.ArchARM64)) { + return errors.Errorf("machine type %s is arm64, but requested arch is %s", machineType, opts.Arch) + } //TODO(srosenberg): remove this once we have a better way to detect ARM64 machines if useArmAMI { imageID = withFlagOverride(az.region.AMI_ARM64, &providerOpts.ImageAMI) - l.Printf("Using ARM64 AMI: %s for machine type: %s", imageID, machineType) + // N.B. use arbitrary instanceIdx to suppress the same info for every other instance being created. 
+ if instanceIdx == 0 { + l.Printf("Using ARM64 AMI: %s for machine type: %s", imageID, machineType) + } } - if !useArmAMI && opts.EnableFIPS { + if opts.Arch == string(vm.ArchFIPS) { imageID = withFlagOverride(az.region.AMI_FIPS, &providerOpts.ImageAMI) - l.Printf("Using FIPS-enabled AMI: %s for machine type: %s", imageID, machineType) + if instanceIdx == 0 { + l.Printf("Using FIPS-enabled AMI: %s for machine type: %s", imageID, machineType) + } } args := []string{ "ec2", "run-instances", diff --git a/pkg/roachprod/vm/gce/gcloud.go b/pkg/roachprod/vm/gce/gcloud.go index 9e9e112c902d..660d8d37c548 100644 --- a/pkg/roachprod/vm/gce/gcloud.go +++ b/pkg/roachprod/vm/gce/gcloud.go @@ -36,6 +36,7 @@ const ( // ProviderName is gce. ProviderName = "gce" DefaultImage = "ubuntu-2004-focal-v20210603" + ARM64Image = "ubuntu-2004-focal-arm64-v20230523" FIPSImage = "ubuntu-pro-fips-2004-focal-v20230302" defaultImageProject = "ubuntu-os-cloud" FIPSImageProject = "ubuntu-os-pro-cloud" @@ -421,10 +422,34 @@ func (p *Provider) Create( // Fixed args. 
image := providerOpts.Image imageProject := defaultImageProject - if opts.EnableFIPS { + useArmAMI := strings.HasPrefix(strings.ToLower(providerOpts.MachineType), "t2a-") + if useArmAMI && (opts.Arch != "" && opts.Arch != string(vm.ArchARM64)) { + return errors.Errorf("machine type %s is arm64, but requested arch is %s", providerOpts.MachineType, opts.Arch) + } + if useArmAMI && opts.SSDOpts.UseLocalSSD { + return errors.New("local SSDs are not supported with T2A instances, use --local-ssd=false") + } + if useArmAMI { + if len(providerOpts.Zones) == 0 { + zones = []string{"us-central1-a"} + } else { + for _, zone := range providerOpts.Zones { + if !strings.HasPrefix(zone, "us-central1-") { + return errors.New("T2A instances are not supported outside of us-central1") + } + } + } + } + //TODO(srosenberg): remove this once we have a better way to detect ARM64 machines + if useArmAMI { + image = ARM64Image + l.Printf("Using ARM64 AMI: %s for machine type: %s", image, providerOpts.MachineType) + } + if opts.Arch == string(vm.ArchFIPS) { // NB: if FIPS is enabled, it overrides the image passed via CLI (--gce-image) image = FIPSImage imageProject = FIPSImageProject + l.Printf("Using FIPS-enabled AMI: %s for machine type: %s", image, providerOpts.MachineType) } args := []string{ "compute", "instances", "create", @@ -495,7 +520,7 @@ func (p *Provider) Create( } // Create GCE startup script file. 
- filename, err := writeStartupScript(extraMountOpts, opts.SSDOpts.FileSystem, providerOpts.UseMultipleDisks, opts.EnableFIPS) + filename, err := writeStartupScript(extraMountOpts, opts.SSDOpts.FileSystem, providerOpts.UseMultipleDisks, opts.Arch == string(vm.ArchFIPS)) if err != nil { return errors.Wrapf(err, "could not write GCE startup script to temp file") } diff --git a/pkg/roachprod/vm/vm.go b/pkg/roachprod/vm/vm.go index e0046c6f5287..b1d6dddbadfb 100644 --- a/pkg/roachprod/vm/vm.go +++ b/pkg/roachprod/vm/vm.go @@ -34,10 +34,30 @@ const ( TagLifetime = "lifetime" // TagRoachprod is roachprod tag const, value is true & false. TagRoachprod = "roachprod" + // TagUsage indicates where a certain resource is used. "roachtest" is used + // as the key for roachtest created resources. + TagUsage = "usage" + // TagArch is the CPU architecture tag const. + TagArch = "arch" + + ArchARM64 = CPUArch("arm64") + ArchAMD64 = CPUArch("amd64") + ArchFIPS = CPUArch("fips") ) +type CPUArch string + // GetDefaultLabelMap returns a label map for a common set of labels. func GetDefaultLabelMap(opts CreateOpts) map[string]string { + // Add architecture override tag, only if it was specified. + if opts.Arch != "" { + return map[string]string{ + TagCluster: opts.ClusterName, + TagLifetime: opts.Lifetime.String(), + TagRoachprod: "true", + TagArch: opts.Arch, + } + } return map[string]string{ TagCluster: opts.ClusterName, TagLifetime: opts.Lifetime.String(), @@ -176,7 +196,7 @@ type CreateOpts struct { CustomLabels map[string]string GeoDistributed bool - EnableFIPS bool + Arch string VMProviders []string SSDOpts struct { UseLocalSSD bool @@ -197,6 +217,8 @@ func DefaultCreateOpts() CreateOpts { GeoDistributed: false, VMProviders: []string{}, OsVolumeSize: 10, + // N.B. When roachprod is used via CLI, this will be overridden by {"roachprod":"true"}. 
+ CustomLabels: map[string]string{"roachtest": "true"}, } defaultCreateOpts.SSDOpts.UseLocalSSD = true defaultCreateOpts.SSDOpts.NoExt4Barrier = true diff --git a/pkg/util/randutil/rand.go b/pkg/util/randutil/rand.go index daa2be07a772..b83f8547974d 100644 --- a/pkg/util/randutil/rand.go +++ b/pkg/util/randutil/rand.go @@ -97,6 +97,12 @@ func NewPseudoRand() (*rand.Rand, int64) { return rand.New(rand.NewSource(seed)), seed } +// Same as NewPseudoRand, but the returned Rand is using thread safe underlying source. +func NewLockedPseudoRand() (*rand.Rand, int64) { + seed := envutil.EnvOrDefaultInt64("COCKROACH_RANDOM_SEED", NewPseudoSeed()) + return rand.New(NewLockedSource(seed)), seed +} + // NewTestRand returns an instance of math/rand.Rand seeded from rng, which is // seeded with the global seed. If the caller is a test with a different // path-qualified name than the previous caller, rng is reseeded from the global