diff --git a/pkg/build/BUILD.bazel b/pkg/build/BUILD.bazel index db1d660daa0b..a4b25782961c 100644 --- a/pkg/build/BUILD.bazel +++ b/pkg/build/BUILD.bazel @@ -24,6 +24,7 @@ go_library( "github.com/cockroachdb/cockroach/pkg/build.utcTime": "{BUILD_UTCTIME}", }, deps = [ + "//pkg/util/buildutil", "//pkg/util/envutil", "//pkg/util/version", ], diff --git a/pkg/build/info.go b/pkg/build/info.go index d7b249dc21ac..8d0a7cffc9b7 100644 --- a/pkg/build/info.go +++ b/pkg/build/info.go @@ -19,6 +19,7 @@ import ( "text/tabwriter" "time" + "github.com/cockroachdb/cockroach/pkg/util/buildutil" "github.com/cockroachdb/cockroach/pkg/util/envutil" "github.com/cockroachdb/cockroach/pkg/util/version" ) @@ -37,10 +38,11 @@ var ( cgoTargetTriple string platform = fmt.Sprintf("%s %s", runtime.GOOS, runtime.GOARCH) // Distribution is changed by the CCL init-time hook in non-APL builds. - Distribution = "OSS" - typ string // Type of this build: , "development", or "release" - channel string - envChannel = envutil.EnvOrDefaultString("COCKROACH_CHANNEL", "unknown") + Distribution = "OSS" + typ string // Type of this build: , "development", or "release" + channel string + envChannel = envutil.EnvOrDefaultString("COCKROACH_CHANNEL", "unknown") + enabledAssertions = buildutil.CrdbTestBuild //go:embed version.txt cockroachVersion string binaryVersion = computeBinaryVersion(cockroachVersion, rev) @@ -127,7 +129,8 @@ func (b Info) Long() string { fmt.Fprintf(tw, "Go Version: %s\n", b.GoVersion) fmt.Fprintf(tw, "C Compiler: %s\n", b.CgoCompiler) fmt.Fprintf(tw, "Build Commit ID: %s\n", b.Revision) - fmt.Fprintf(tw, "Build Type: %s", b.Type) // No final newline: cobra prints one for us. + fmt.Fprintf(tw, "Build Type: %s\n", b.Type) + fmt.Fprintf(tw, "Enabled Assertions: %t", b.EnabledAssertions) // No final newline: cobra prints one for us. 
_ = tw.Flush() return buf.String() } @@ -157,17 +160,18 @@ func GetInfo() Info { ch = "unknown" } return Info{ - GoVersion: runtime.Version(), - Tag: binaryVersion, - Time: utcTime, - Revision: rev, - CgoCompiler: cgoCompiler, - CgoTargetTriple: cgoTargetTriple, - Platform: platform, - Distribution: Distribution, - Type: typ, - Channel: ch, - EnvChannel: envChannel, + GoVersion: runtime.Version(), + Tag: binaryVersion, + Time: utcTime, + Revision: rev, + CgoCompiler: cgoCompiler, + CgoTargetTriple: cgoTargetTriple, + Platform: platform, + Distribution: Distribution, + Type: typ, + Channel: ch, + EnvChannel: envChannel, + EnabledAssertions: enabledAssertions, } } diff --git a/pkg/build/info.proto b/pkg/build/info.proto index 22d801551127..8e3e18a3a12f 100644 --- a/pkg/build/info.proto +++ b/pkg/build/info.proto @@ -40,6 +40,8 @@ message Info { optional string channel = 9 [(gogoproto.nullable) = false]; // env_channel identifies the product channel as overridden by the COCKROACH_CHANNEL environment variable. optional string env_channel = 11 [(gogoproto.nullable) = false]; + // enabled_assertions returns the value of 'CrdbTestBuild' (true iff compiled with 'crdb_test' tag) + optional bool enabled_assertions = 12 [(gogoproto.nullable) = false]; // dependencies exists to allow tests that run against old clusters // to unmarshal JSON containing this field. 
The tag is unimportant, diff --git a/pkg/cmd/roachprod/flags.go b/pkg/cmd/roachprod/flags.go index ea4fab89e132..83b8d9f4142e 100644 --- a/pkg/cmd/roachprod/flags.go +++ b/pkg/cmd/roachprod/flags.go @@ -39,6 +39,7 @@ var ( extendLifetime time.Duration wipePreserveCerts bool grafanaConfig string + grafanaArch string grafanaurlOpen bool grafanaDumpDir string listDetails bool @@ -107,8 +108,9 @@ func initFlags() { vm.AllProviderNames())) createCmd.Flags().BoolVar(&createVMOpts.GeoDistributed, "geo", false, "Create geo-distributed cluster") - createCmd.Flags().BoolVar(&createVMOpts.EnableFIPS, - "fips", false, "Enable FIPS mode (uses custom AMI)") + createCmd.Flags().StringVar(&createVMOpts.Arch, "arch", "", + "architecture override for VM [amd64, arm64, fips]; N.B. fips implies amd64 with openssl") + // N.B. We set "usage=roachprod" as the default, custom label for billing tracking. createCmd.Flags().StringToStringVar(&createVMOpts.CustomLabels, "label", map[string]string{"usage": "roachprod"}, @@ -258,6 +260,9 @@ Default is "RECURRING '*/15 * * * *' FULL BACKUP '@hourly' WITH SCHEDULE OPTIONS grafanaStartCmd.Flags().StringVar(&grafanaConfig, "grafana-config", "", "URI to grafana json config, supports local and http(s) schemes") + grafanaStartCmd.Flags().StringVar(&grafanaArch, "arch", "", + "binary architecture override [amd64, arm64]") + grafanaURLCmd.Flags().BoolVar(&grafanaurlOpen, "open", false, "open the grafana dashboard url on the browser") diff --git a/pkg/cmd/roachprod/main.go b/pkg/cmd/roachprod/main.go index daa00bea51f4..5852d3ed7804 100644 --- a/pkg/cmd/roachprod/main.go +++ b/pkg/cmd/roachprod/main.go @@ -277,6 +277,14 @@ hosts file. return err } } else { + machineType := func(clusterVMs vm.List) string { + res := clusterVMs[0].MachineType + // Display CPU architecture, other than amd64 (default). 
+ if arch := clusterVMs[0].Labels["arch"]; arch != "" && arch != string(vm.ArchAMD64) { + res += fmt.Sprintf(" [%s]", arch) + } + return res + } // Align columns left and separate with at least two spaces. tw := tabwriter.NewWriter(os.Stdout, 0, 8, 2, ' ', tabwriter.AlignRight) // N.B. colors use escape codes which don't play nice with tabwriter [1]. @@ -304,7 +312,7 @@ hosts file. // N.B. Tabwriter doesn't support per-column alignment. It looks odd to have the cluster names right-aligned, // so we make it left-aligned. fmt.Fprintf(tw, "%s\t%s\t%d\t%s", name+strings.Repeat(" ", maxClusterName-len(name)), c.Clouds(), - len(c.VMs), c.VMs[0].MachineType) + len(c.VMs), machineType(c.VMs)) if !c.IsLocal() { colorByCostBucket := func(cost float64) func(string, ...interface{}) string { switch { @@ -987,7 +995,7 @@ var getProvidersCmd = &cobra.Command{ var grafanaStartCmd = &cobra.Command{ Use: `grafana-start `, - Short: `spins up a prometheus and grafana instance on the last node in the cluster`, + Short: `spins up a prometheus and grafana instance on the last node in the cluster; NOTE: for arm64 clusters, use --arch arm64`, Args: cobra.ExactArgs(1), Run: wrap(func(cmd *cobra.Command, args []string) error { var grafanaDashboardJSONs []string @@ -1015,8 +1023,11 @@ var grafanaStartCmd = &cobra.Command{ return err } } - - return roachprod.StartGrafana(context.Background(), config.Logger, args[0], + arch := vm.ArchAMD64 + if grafanaArch == "arm64" { + arch = vm.ArchARM64 + } + return roachprod.StartGrafana(context.Background(), config.Logger, args[0], arch, grafanaConfigURL, grafanaDashboardJSONs, nil) }), } @@ -1271,14 +1282,14 @@ func validateAndConfigure(cmd *cobra.Command, args []string) { // Validate architecture flag, if set. 
if archOpt := cmd.Flags().Lookup("arch"); archOpt != nil && archOpt.Changed { - arch := strings.ToLower(archOpt.Value.String()) + arch := vm.CPUArch(strings.ToLower(archOpt.Value.String())) - if arch != "amd64" && arch != "arm64" && arch != "fips" { + if arch != vm.ArchAMD64 && arch != vm.ArchARM64 && arch != vm.ArchFIPS { printErrAndExit(fmt.Errorf("unsupported architecture %q", arch)) } - if arch != archOpt.Value.String() { + if string(arch) != archOpt.Value.String() { // Set the canonical value. - _ = cmd.Flags().Set("arch", arch) + _ = cmd.Flags().Set("arch", string(arch)) } } } diff --git a/pkg/cmd/roachtest/cluster.go b/pkg/cmd/roachtest/cluster.go index d41503580e2b..20660a1637af 100644 --- a/pkg/cmd/roachtest/cluster.go +++ b/pkg/cmd/roachtest/cluster.go @@ -59,13 +59,20 @@ func init() { } var ( - // TODO(tbg): this is redundant with --cloud==local. Make the --local flag an - // alias for `--cloud=local` and remove this variable. - local bool - - cockroach string - cockroachShort string - libraryFilePaths []string + // user-specified path to crdb binary + cockroachPath string + // maps cpuArch to the corresponding crdb binary's absolute path + cockroach = make(map[vm.CPUArch]string) + // user-specified path to short crdb binary + cockroachShortPath string + // maps cpuArch to the corresponding short crdb (i.e., without UI) binary's absolute path + cockroachShort = make(map[vm.CPUArch]string) + // user-specified path to workload binary + workloadPath string + // maps cpuArch to the corresponding workload binary's absolute path + workload = make(map[vm.CPUArch]string) + // maps cpuArch to the corresponding dynamically-linked libraries' absolute paths + libraryFilePaths = make(map[vm.CPUArch][]string) cloud = spec.GCE // encryptionProbability controls when encryption-at-rest is enabled // in a cluster for tests that have opted-in to metamorphic @@ -75,10 +82,18 @@ var ( // encryption enabled by default (probability 1). 
In order to run // them with encryption disabled (perhaps to reproduce a test // failure), roachtest can be invoked with --metamorphic-encryption-probability=0 - encryptionProbability float64 + encryptionProbability float64 + // Total probability with which new ARM64 clusters are provisioned, modulo test specs. which are incompatible. + // N.B. if all selected tests are incompatible with ARM64, then arm64Probability is effectively 0. + // In other words, ClusterSpec.Arch takes precedence over the arm64Probability flag. + arm64Probability float64 + // Conditional probability with which new FIPS clusters are provisioned, modulo test specs. The total probability + // is the product of this and 1-arm64Probability. + // As in the case of arm64Probability, ClusterSpec.Arch takes precedence over the fipsProbability flag. + fipsProbability float64 + instanceType string localSSDArg bool - workload string deprecatedRoachprodBinary string // overrideOpts contains vm.CreateOpts override values passed from the cli. overrideOpts vm.CreateOpts @@ -98,6 +113,8 @@ var ( const ( defaultEncryptionProbability = 1 + defaultFIPSProbability = 0 + defaultARM64Probability = 0 defaultCockroachPath = "./cockroach-default" ) @@ -109,29 +126,62 @@ func (e errBinaryOrLibraryNotFound) Error() string { return fmt.Sprintf("binary or library %q not found (or was not executable)", e.binary) } -func filepathAbs(path string) (string, error) { - path, err := filepath.Abs(path) +func validateBinaryFormat(path string, arch vm.CPUArch, checkEA bool) (string, error) { + abspath, err := filepath.Abs(path) if err != nil { return "", errors.WithStack(err) } - return path, nil -} - -func findBinary(binary, defValue string) (abspath string, err error) { - if binary == "" { - binary = defValue + // Check that the binary ELF format matches the expected architecture. 
+ cmd := exec.Command("file", "-b", abspath) + var out bytes.Buffer + cmd.Stdout = &out + if err := cmd.Run(); err != nil { + return "", errors.Wrapf(err, "error executing 'file %s'", abspath) + } + fileFormat := strings.ToLower(out.String()) + if arch == vm.ArchARM64 { + if !strings.Contains(fileFormat, "arm64") && + !strings.Contains(fileFormat, "aarch64") { + return "", errors.Newf("%s has incompatible architecture; want: %q, got: %q", abspath, arch, fileFormat) + } + } else if arch == vm.ArchAMD64 && + // N.B. the "x86_64" string is returned on macOS, while "x86-64" is returned on Linux. + !(strings.Contains(fileFormat, "x86-64") || strings.Contains(fileFormat, "x86_64")) { + // Otherwise, we expect a binary that was built for amd64. + return "", errors.Newf("%s has incompatible architecture; want: %q, got: %q", abspath, arch, fileFormat) + } + if arch == vm.ArchFIPS && strings.HasSuffix(abspath, "cockroach") { + // Check that the binary is patched to use OpenSSL FIPS. + // N.B. only the cockroach binary is patched, so we exclude this check for dynamically-linked libraries. + cmd = exec.Command("bash", "-c", fmt.Sprintf("nm %s | grep golang-fips |head -1", abspath)) + if err := cmd.Run(); err != nil { + return "", errors.Newf("%s is not compiled with FIPS", abspath) + } } + if checkEA { + // Check that the binary was compiled with assertions _enabled_. + cmd = exec.Command("bash", "-c", fmt.Sprintf("%s version |grep \"Enabled Assertions\" |grep true", abspath)) + if err := cmd.Run(); err != nil { + return "", errors.Newf("%s is not compiled with assertions enabled", abspath) + } + } + + return abspath, nil +} +func findBinary( + name string, osName string, arch vm.CPUArch, checkEA bool, +) (abspath string, err error) { // Check to see if binary exists and is a regular file and executable. 
- if fi, err := os.Stat(binary); err == nil && fi.Mode().IsRegular() && (fi.Mode()&0111) != 0 { - return filepathAbs(binary) + if fi, err := os.Stat(name); err == nil && fi.Mode().IsRegular() && (fi.Mode()&0111) != 0 { + return validateBinaryFormat(name, arch, checkEA) } - return findBinaryOrLibrary("bin", binary) + return findBinaryOrLibrary("bin", name, "", osName, arch, checkEA) } -func findLibrary(libraryName string) (string, error) { +func findLibrary(libraryName string, os string, arch vm.CPUArch) (string, error) { suffix := ".so" - if local { + if cloud == spec.Local { switch runtime.GOOS { case "linux": case "freebsd": @@ -145,65 +195,102 @@ func findLibrary(libraryName string) (string, error) { return "", errors.Newf("failed to find suffix for runtime %s", runtime.GOOS) } } - return findBinaryOrLibrary("lib", libraryName+suffix) + + return findBinaryOrLibrary("lib", libraryName, suffix, os, arch, false) } -func findBinaryOrLibrary(binOrLib string, name string) (string, error) { +// findBinaryOrLibrary searches for a binary or library, _first_ in the $PATH, _then_ in the following hardcoded paths, +// +// $GOPATH/src/github.com/cockroachdb/cockroach/ +// $GOPATH/src/github.com/cockroachdb/artifacts/ +// $PWD/binOrLib +// $GOPATH/src/github.com/cockroachdb/cockroach/binOrLib +// +// in the above order, unless 'name' is an absolute path, in which case the hardcoded paths are skipped. +// +// binOrLib is either 'bin' or 'lib'; nameSuffix is either empty, '.so', '.dll', or '.dylib'. +// Both osName and arch are used to derive a fully qualified binary or library name by inserting the +// corresponding arch suffix (see install.ArchInfoForOS), e.g. '.linux-arm64' or '.darwin-amd64'. +// That is, each hardcoded path is searched for a file named 'name' or 'name.nameSuffix.archSuffix', respectively. +// +// If no binary or library is found, an error is returned. 
+// Otherwise, if multiple binaries or libraries are located at the above paths, the first one found is returned. +// If the found binary or library happens to be of the wrong type, e.g., architecture is different from 'arch', or +// checkEA is true, and the binary was not compiled with runtime assertions enabled, an error is returned. +// While we could continue the search instead of returning an error, it is assumed the user can stage the binaries +// to avoid such ambiguity. Alternatively, the user can specify the absolute path to the binary or library, +// e.g., via --cockroach; in this case, only the absolute path is checked and validated. +func findBinaryOrLibrary( + binOrLib string, name string, nameSuffix string, osName string, arch vm.CPUArch, checkEA bool, +) (string, error) { // Find the binary to run and translate it to an absolute path. First, look // for the binary in PATH. - path, err := exec.LookPath(name) + pathFromEnv, err := exec.LookPath(name) + if err == nil { + // Found it in PATH, validate and return absolute path. + return validateBinaryFormat(pathFromEnv, arch, checkEA) + } + if strings.HasPrefix(name, "/") { + // Specified name is an absolute path, but we couldn't find it; bail out. + return "", errors.WithStack(err) + } + // We're unable to find the name in PATH and "name" is a relative path: + // look in the cockroach repo. 
+ gopath := os.Getenv("GOPATH") + if gopath == "" { + gopath = filepath.Join(os.Getenv("HOME"), "go") + } + + dirs := []string{ + filepath.Join(gopath, "/src/github.com/cockroachdb/cockroach/"), + filepath.Join(gopath, "/src/github.com/cockroachdb/cockroach/artifacts/"), + filepath.Join(os.ExpandEnv("$PWD"), binOrLib), + filepath.Join(gopath, "/src/github.com/cockroachdb/cockroach", binOrLib), + } + + archInfo, err := install.ArchInfoForOS(osName, arch) if err != nil { - if strings.HasPrefix(name, "/") { - return "", errors.WithStack(err) - } - - // We're unable to find the name in PATH and "name" is a relative path: - // look in the cockroach repo. - gopath := os.Getenv("GOPATH") - if gopath == "" { - gopath = filepath.Join(os.Getenv("HOME"), "go") - } - - var suffix string - if !local { - suffix = ".docker_amd64" - } - dirs := []string{ - filepath.Join(gopath, "/src/github.com/cockroachdb/cockroach/"), - filepath.Join(gopath, "/src/github.com/cockroachdb/cockroach/artifacts/"), - filepath.Join(gopath, "/src/github.com/cockroachdb/cockroach", binOrLib+suffix), - filepath.Join(os.ExpandEnv("$PWD"), binOrLib+suffix), - filepath.Join(gopath, "/src/github.com/cockroachdb/cockroach", binOrLib), - } - for _, dir := range dirs { - path = filepath.Join(dir, name) - var err2 error - path, err2 = exec.LookPath(path) - if err2 == nil { - return filepathAbs(path) + return "", err + } + archSuffixes := []string{"." + archInfo.DebugArchitecture, "." 
+ archInfo.ReleaseArchitecture} + + for _, dir := range dirs { + var path string + + if path, err = exec.LookPath(filepath.Join(dir, name)); err == nil { + return validateBinaryFormat(path, arch, checkEA) + } + for _, archSuffix := range archSuffixes { + if path, err = exec.LookPath(filepath.Join(dir, name+archSuffix+nameSuffix)); err == nil { + return validateBinaryFormat(path, arch, checkEA) } } - return "", errBinaryOrLibraryNotFound{name} } - return filepathAbs(path) + return "", errBinaryOrLibraryNotFound{name} } // VerifyLibraries verifies that the required libraries, specified by name, are // available for the target environment. -func VerifyLibraries(requiredLibs []string) error { +func VerifyLibraries(requiredLibs []string, arch vm.CPUArch) error { + foundLibraryPaths := libraryFilePaths[arch] + for _, requiredLib := range requiredLibs { - if !contains(libraryFilePaths, libraryNameFromPath, requiredLib) { - return errors.Wrap(errors.Errorf("missing required library %s", requiredLib), "cluster.VerifyLibraries") + if !contains(foundLibraryPaths, libraryNameFromPath, requiredLib) { + return errors.Wrap(errors.Errorf("missing required library %s (arch=%q)", requiredLib, arch), "cluster.VerifyLibraries") } } return nil } -// libraryNameFromPath returns the name of a library without the extension, for a +// libraryNameFromPath returns the name of a library without the extension(s), for a // given path. func libraryNameFromPath(path string) string { filename := filepath.Base(path) - return strings.TrimSuffix(filename, filepath.Ext(filename)) + // N.B. filename may contain multiple extensions, e.g. "libgeos.linux-amd64.fips.so". 
+ for ext := filepath.Ext(filename); ext != ""; ext = filepath.Ext(filename) { + filename = strings.TrimSuffix(filename, ext) + } + return filename } func contains(list []string, transformString func(s string) string, str string) bool { @@ -219,50 +306,128 @@ func contains(list []string, transformString func(s string) string, str string) } func initBinariesAndLibraries() { - // If we're running against an existing "local" cluster, force the local flag - // to true in order to get the "local" test configurations. - if clusterName == "local" { - local = true - } - if local { - cloud = spec.Local - } + // TODO(srosenberg): enable metamorphic local clusters; currently, spec.Local means run all tests locally. + // This could be revisited after we have a way to specify which clouds a given test supports, + // see https://github.com/cockroachdb/cockroach/issues/104029. + defaultOsName := "linux" + defaultArch := vm.ArchAMD64 + if cloud == spec.Local { + defaultOsName = runtime.GOOS + if arm64Probability == 1 { + // N.B. if arm64Probability != 1, then we're running a local cluster with both arm64 and amd64. + defaultArch = vm.ArchARM64 + } + if string(defaultArch) != runtime.GOARCH { + fmt.Printf("WARN: local cluster's architecture (%q) differs from default (%q)\n", runtime.GOARCH, defaultArch) + } + } + fmt.Printf("Locating and verifying binaries for os=%q, arch=%q\n", defaultOsName, defaultArch) + + // Finds and validates a binary. If the binary 'isRequired', but not found, exit and print the error. 
+ resolveBinary := func(binName string, userSpecified string, arch vm.CPUArch, isRequired bool, checkEA bool) (string, error) { + path := binName + if userSpecified != "" { + path = userSpecified + } + abspath, err := findBinary(path, defaultOsName, arch, checkEA) + if err != nil { + if isRequired { + fmt.Fprintf(os.Stderr, "ERROR: unable to find required binary %q for %q: %v\n", binName, arch, err) + os.Exit(1) + } + return "", err + } + if userSpecified == "" { + // No user-specified path, so return the found absolute path. + return abspath, nil + } + // Bail out if a path other than the user-specified was found. + userPath, err := filepath.Abs(userSpecified) + + if err != nil || userPath != abspath { + err = errors.Wrapf(err, "ERROR: found %q at: %s instead of the user-specified path: %q\n", binName, abspath, userSpecified) - cockroachDefault := "cockroach" - if !local { - cockroachDefault = "cockroach-linux-2.6.32-gnu-amd64" + if isRequired { + fmt.Fprintf(os.Stderr, "%v", err) + os.Exit(1) + } + return "", err + } + return abspath, nil } + // We need to verify we have at least both the cockroach and the workload binaries. var err error - cockroach, err = findBinary(cockroach, cockroachDefault) + + cockroach[defaultArch], _ = resolveBinary("cockroach", cockroachPath, defaultArch, true, false) + workload[defaultArch], _ = resolveBinary("workload", workloadPath, defaultArch, true, false) + cockroachShort[defaultArch], err = resolveBinary("cockroach-short", cockroachShortPath, defaultArch, false, true) if err != nil { - fmt.Fprintf(os.Stderr, "%+v\n", err) - os.Exit(1) + fmt.Fprintf(os.Stderr, "WARN: unable to find %q for %q: %s\n", "cockroach-short", defaultArch, err) } - if cockroachShort != "" { - // defValue doesn't matter since cockroachShort is a non-empty string. 
- cockroachShort, err = findBinary(cockroachShort, "" /* defValue */) + if arm64Probability > 0 && defaultArch != vm.ArchARM64 { + fmt.Printf("Locating and verifying binaries for os=%q, arch=%q\n", defaultOsName, vm.ArchARM64) + // We need to verify we have all the required binaries for arm64. + cockroach[vm.ArchARM64], _ = resolveBinary("cockroach", cockroachPath, vm.ArchARM64, true, false) + workload[vm.ArchARM64], _ = resolveBinary("workload", workloadPath, vm.ArchARM64, true, false) + cockroachShort[vm.ArchARM64], err = resolveBinary("cockroach-short", cockroachShortPath, vm.ArchARM64, false, true) if err != nil { - fmt.Fprintf(os.Stderr, "%+v\n", err) - os.Exit(1) + fmt.Fprintf(os.Stderr, "WARN: unable to find %q for %q: %s\n", "cockroach-short", vm.ArchARM64, err) } } - - workload, err = findBinary(workload, "workload") - if errors.As(err, &errBinaryOrLibraryNotFound{}) { - fmt.Fprintln(os.Stderr, "workload binary not provided, proceeding anyway") - } else if err != nil { - fmt.Fprintf(os.Stderr, "%+v\n", err) - os.Exit(1) + if fipsProbability > 0 && defaultArch != vm.ArchFIPS { + fmt.Printf("Locating and verifying binaries for os=%q, arch=%q\n", defaultOsName, vm.ArchFIPS) + // We need to verify we have all the required binaries for fips. + cockroach[vm.ArchFIPS], _ = resolveBinary("cockroach", cockroachPath, vm.ArchFIPS, true, false) + workload[vm.ArchFIPS], _ = resolveBinary("workload", workloadPath, vm.ArchFIPS, true, false) + cockroachShort[vm.ArchFIPS], err = resolveBinary("cockroach-short", cockroachShortPath, vm.ArchFIPS, false, true) + if err != nil { + fmt.Fprintf(os.Stderr, "WARN: unable to find %q for %q: %s\n", "cockroach-short", vm.ArchFIPS, err) + } } // In v20.2 or higher, optionally expect certain library files to exist. // Since they may not be found in older versions, do not hard error if they are not found. 
- for _, libraryName := range []string{"libgeos", "libgeos_c"} { - if libraryFilePath, err := findLibrary(libraryName); err != nil { - fmt.Fprintf(os.Stderr, "error finding library %s, ignoring: %+v\n", libraryName, err) - } else { - libraryFilePaths = append(libraryFilePaths, libraryFilePath) + for _, arch := range []vm.CPUArch{vm.ArchAMD64, vm.ArchARM64, vm.ArchFIPS} { + if arm64Probability == 0 && defaultArch != vm.ArchARM64 && arch == vm.ArchARM64 { + // arm64 isn't used, skip finding libs for it. + continue + } + if fipsProbability == 0 && arch == vm.ArchFIPS { + // fips isn't used, skip finding libs for it. + continue + } + paths := []string(nil) + + for _, libraryName := range []string{"libgeos", "libgeos_c"} { + if libraryFilePath, err := findLibrary(libraryName, defaultOsName, arch); err != nil { + fmt.Fprintf(os.Stderr, "WARN: unable to find library %s, ignoring: %s\n", libraryName, err) + } else { + paths = append(paths, libraryFilePath) + } + } + libraryFilePaths[arch] = paths + } + // Looks like we have all the binaries we'll need. Let's print them out. + fmt.Printf("\nFound the following binaries:\n") + for arch, path := range cockroach { + if path != "" { + fmt.Printf("\tcockroach %q at: %s\n", arch, path) + } + } + for arch, path := range workload { + if path != "" { + fmt.Printf("\tworkload %q at: %s\n", arch, path) + } + } + for arch, path := range cockroachShort { + if path != "" { + fmt.Printf("\tcockroach-short %q at: %s\n", arch, path) + } + } + for arch, paths := range libraryFilePaths { + if len(paths) > 0 { + fmt.Printf("\tlibraries %q at: %s\n", arch, strings.Join(paths, ", ")) } } } @@ -666,6 +831,8 @@ type clusterImpl struct { // clusterSettings are additional cluster settings set on cluster startup. clusterSettings map[string]string + os string // OS of the cluster + arch vm.CPUArch // CPU architecture of the cluster // destroyState contains state related to the cluster's destruction. 
destroyState destroyState } @@ -749,7 +916,10 @@ type clusterConfig struct { localCluster bool useIOBarrier bool alloc *quotapool.IntAlloc - enableFIPS bool + // Specifies CPU architecture which may require a custom AMI and cockroach binary. + arch vm.CPUArch + // Specifies the OS which may require a custom AMI and cockroach binary. + os string } // clusterFactory is a creator of clusters. @@ -886,7 +1056,7 @@ func (f *clusterFactory) newCluster( providerOptsContainer := vm.CreateProviderOptionsContainer() // The ClusterName is set below in the retry loop to ensure // that each create attempt gets a unique cluster name. - createVMOpts, providerOpts, err := cfg.spec.RoachprodOpts("", cfg.useIOBarrier, cfg.enableFIPS) + createVMOpts, providerOpts, err := cfg.spec.RoachprodOpts("", cfg.useIOBarrier, cfg.arch) if err != nil { // We must release the allocation because cluster creation is not possible at this point. cfg.alloc.Release() @@ -922,6 +1092,8 @@ func (f *clusterFactory) newCluster( spec: cfg.spec, expiration: cfg.spec.Expiration(), r: f.r, + arch: cfg.arch, + os: cfg.os, destroyState: destroyState{ owned: true, alloc: cfg.alloc, @@ -1775,11 +1947,13 @@ func (c *clusterImpl) PutLibraries( if err := c.RunE(ctx, c.All(), "mkdir", "-p", libraryDir); err != nil { return err } - for _, libraryFilePath := range libraryFilePaths { - if !contains(libraries, nil, libraryNameFromPath(libraryFilePath)) { + + for _, libraryFilePath := range libraryFilePaths[c.arch] { + libName := libraryNameFromPath(libraryFilePath) + if !contains(libraries, nil, libName) { continue } - putPath := filepath.Join(libraryDir, filepath.Base(libraryFilePath)) + putPath := filepath.Join(libraryDir, libName) if err := c.PutE( ctx, c.l, @@ -1805,7 +1979,7 @@ func (c *clusterImpl) Stage( c.status("staging binary") defer c.status("") return errors.Wrap(roachprod.Stage(ctx, l, c.MakeNodes(opts...), - "" /* stageOS */, "" /* stageArch */, dir, application, versionOrSHA), "cluster.Stage") + c.os, 
string(c.arch), dir, application, versionOrSHA), "cluster.Stage") } // Get gets files from remote hosts. @@ -2523,6 +2697,10 @@ func (c *clusterImpl) IsSecure() bool { return c.localCertsDir != "" } +func (c *clusterImpl) Architecture() vm.CPUArch { + return c.arch +} + // Extend extends the cluster's expiration by d. func (c *clusterImpl) Extend(ctx context.Context, d time.Duration, l *logger.Logger) error { if ctx.Err() != nil { @@ -2551,7 +2729,7 @@ func (c *clusterImpl) NewMonitor(ctx context.Context, opts ...option.Option) clu func (c *clusterImpl) StartGrafana( ctx context.Context, l *logger.Logger, promCfg *prometheus.Config, ) error { - return roachprod.StartGrafana(ctx, l, c.name, "", nil, promCfg) + return roachprod.StartGrafana(ctx, l, c.name, c.arch, "", nil, promCfg) } func (c *clusterImpl) StopGrafana(ctx context.Context, l *logger.Logger, dumpDir string) error { diff --git a/pkg/cmd/roachtest/cluster/cluster_interface.go b/pkg/cmd/roachtest/cluster/cluster_interface.go index b4e85fe2ab2b..fd2547f70bf0 100644 --- a/pkg/cmd/roachtest/cluster/cluster_interface.go +++ b/pkg/cmd/roachtest/cluster/cluster_interface.go @@ -108,7 +108,10 @@ type Cluster interface { Spec() spec.ClusterSpec Name() string IsLocal() bool + // IsSecure returns true iff the cluster uses TLS. IsSecure() bool + // Returns CPU architecture of the nodes. + Architecture() vm.CPUArch // Deleting CockroachDB data and logs on nodes. 
diff --git a/pkg/cmd/roachtest/cluster_test.go b/pkg/cmd/roachtest/cluster_test.go index 654c4ece1d0a..a68bca5448e3 100644 --- a/pkg/cmd/roachtest/cluster_test.go +++ b/pkg/cmd/roachtest/cluster_test.go @@ -18,6 +18,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec" test2 "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/roachprod/logger" + "github.com/cockroachdb/cockroach/pkg/roachprod/vm" "github.com/cockroachdb/cockroach/pkg/util/version" "github.com/cockroachdb/errors" "github.com/stretchr/testify/assert" @@ -207,14 +208,14 @@ func TestVerifyLibraries(t *testing.T) { name: "no match", verifyLibs: []string{"required_c"}, libraryFilePaths: []string{"/some/path/lib.so"}, - expectedError: errors.Wrap(errors.Errorf("missing required library %s", + expectedError: errors.Wrap(errors.Errorf("missing required library %s (arch=\"amd64\")", "required_c"), "cluster.VerifyLibraries"), }, { name: "no match on nil libs", verifyLibs: []string{"required_b"}, libraryFilePaths: nil, - expectedError: errors.Wrap(errors.Errorf("missing required library %s", + expectedError: errors.Wrap(errors.Errorf("missing required library %s (arch=\"amd64\")", "required_b"), "cluster.VerifyLibraries"), }, { @@ -223,17 +224,29 @@ func TestVerifyLibraries(t *testing.T) { libraryFilePaths: []string{"/lib/geos.so"}, expectedError: nil, }, + { + name: "single match, multiple extensions", + verifyLibs: []string{"geos"}, + libraryFilePaths: []string{"/lib/geos.linux-amd.so"}, + expectedError: nil, + }, { name: "multiple matches", verifyLibs: []string{"lib", "ltwo", "geos"}, libraryFilePaths: []string{"ltwo.so", "a/geos.so", "/some/path/to/lib.so"}, expectedError: nil, }, + { + name: "multiple matches, multiple extensions", + verifyLibs: []string{"lib", "ltwo", "geos"}, + libraryFilePaths: []string{"ltwo.linux-arm64.so", "a/geos.linux-amd64.fips.so", "/some/path/to/lib.darwin-arm64.so"}, + expectedError: nil, + }, } for _, tc := range 
testCases { t.Run(tc.name, func(t *testing.T) { - libraryFilePaths = tc.libraryFilePaths - actualError := VerifyLibraries(tc.verifyLibs) + libraryFilePaths = map[vm.CPUArch][]string{vm.ArchAMD64: tc.libraryFilePaths} + actualError := VerifyLibraries(tc.verifyLibs, vm.ArchAMD64) if tc.expectedError == nil { require.NoError(t, actualError) } else { diff --git a/pkg/cmd/roachtest/github.go b/pkg/cmd/roachtest/github.go index f12806dcc1b4..cb1f96d573a8 100644 --- a/pkg/cmd/roachtest/github.go +++ b/pkg/cmd/roachtest/github.go @@ -165,7 +165,10 @@ func (g *githubIssues) createPostRequest( roachtestPrefix("cpu"): fmt.Sprintf("%d", spec.Cluster.CPUs), roachtestPrefix("ssd"): fmt.Sprintf("%d", spec.Cluster.SSDs), } - + // Emit CPU architecture only if it was specified; otherwise, it's captured below, assuming cluster was created. + if spec.Cluster.Arch != "" { + clusterParams[roachtestPrefix("arch")] = string(spec.Cluster.Arch) + } // These params can be probabilistically set, so we pass them here to // show what their actual values are in the posted issue. if g.vmCreateOpts != nil { @@ -175,6 +178,11 @@ func (g *githubIssues) createPostRequest( if g.cluster != nil { clusterParams[roachtestPrefix("encrypted")] = fmt.Sprintf("%v", g.cluster.encAtRest) + if spec.Cluster.Arch == "" { + // N.B. when Arch is specified, it cannot differ from cluster's arch. + // Hence, we only emit when arch was unspecified. 
+ clusterParams[roachtestPrefix("arch")] = string(g.cluster.arch) + } } issueMessage := messagePrefix + message diff --git a/pkg/cmd/roachtest/github_test.go b/pkg/cmd/roachtest/github_test.go index 6d62abb3232d..a891b0204a2f 100644 --- a/pkg/cmd/roachtest/github_test.go +++ b/pkg/cmd/roachtest/github_test.go @@ -106,28 +106,31 @@ func TestCreatePostRequest(t *testing.T) { clusterCreationFailed bool loadTeamsFailed bool localSSD bool + arch vm.CPUArch category issueCategory expectedPost bool expectedReleaseBlocker bool expectedParams map[string]string }{ - {true, false, false, false, otherErr, true, false, + {true, false, false, false, "", otherErr, true, false, prefixAll(map[string]string{ "cloud": "gce", "encrypted": "false", "fs": "ext4", "ssd": "0", "cpu": "4", + "arch": "amd64", "localSSD": "false", }), }, - {true, false, false, true, clusterCreationErr, true, false, + {true, false, false, true, vm.ArchARM64, clusterCreationErr, true, false, prefixAll(map[string]string{ "cloud": "gce", "encrypted": "false", "fs": "ext4", "ssd": "0", "cpu": "4", + "arch": "arm64", "localSSD": "true", }), }, @@ -135,7 +138,7 @@ func TestCreatePostRequest(t *testing.T) { // !nonReleaseBlocker and issue is an SSH flake. 
Also ensure that // in the event of a failed cluster creation, nil `vmOptions` and // `clusterImpl` are not dereferenced - {false, true, false, false, sshErr, true, false, + {false, true, false, false, "", sshErr, true, false, prefixAll(map[string]string{ "cloud": "gce", "ssd": "0", @@ -143,12 +146,12 @@ func TestCreatePostRequest(t *testing.T) { }), }, //Simulate failure loading TEAMS.yaml - {true, false, true, false, otherErr, false, false, nil}, + {true, false, true, false, "", otherErr, false, false, nil}, } reg := makeTestRegistry(spec.GCE, "", "", false) for _, c := range testCases { - clusterSpec := reg.MakeClusterSpec(1) + clusterSpec := reg.MakeClusterSpec(1, spec.Arch(c.arch)) testSpec := &registry.TestSpec{ Name: "github_test", @@ -162,7 +165,7 @@ func TestCreatePostRequest(t *testing.T) { l: nilLogger(), } - testClusterImpl := &clusterImpl{spec: clusterSpec} + testClusterImpl := &clusterImpl{spec: clusterSpec, arch: vm.ArchAMD64} vo := vm.DefaultCreateOpts() vmOpts := &vo diff --git a/pkg/cmd/roachtest/main.go b/pkg/cmd/roachtest/main.go index 88b0e0aed96b..9ca04f7d5df0 100644 --- a/pkg/cmd/roachtest/main.go +++ b/pkg/cmd/roachtest/main.go @@ -19,10 +19,12 @@ import ( "os/signal" "os/user" "path/filepath" + "runtime" "time" "github.com/cockroachdb/cockroach/pkg/build" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/tests" "github.com/cockroachdb/cockroach/pkg/roachprod" "github.com/cockroachdb/cockroach/pkg/roachprod/config" @@ -94,7 +96,6 @@ func main() { var clusterID string var count = 1 var versionsBinaryOverride map[string]string - var enableFIPS bool cobra.EnableCommandSorting = false @@ -119,16 +120,50 @@ func main() { if cmd.Name() == "help" { return nil } - - if clusterName != "" && local { - return fmt.Errorf( - "cannot specify both an existing cluster (%s) and --local. 
However, if a local cluster "+ - "already exists, --clusters=local will use it", - clusterName) + local := cmd.Flags().Lookup("local").Value.String() == "true" + if local { + if clusterName != "" { + return fmt.Errorf( + "cannot specify both an existing cluster (%s) and --local. However, if a local cluster "+ + "already exists, --clusters=local will use it", + clusterName) + } + cloud = spec.Local } switch cmd.Name() { case "run", "bench", "store-gen": + if !(0 <= arm64Probability && arm64Probability <= 1) { + return fmt.Errorf("'metamorphic-arm64-probability' must be in [0,1]") + } + if !(0 <= fipsProbability && fipsProbability <= 1) { + return fmt.Errorf("'metamorphic-fips-probability' must be in [0,1]") + } + if arm64Probability == 1 && fipsProbability != 0 { + return fmt.Errorf("'metamorphic-fips-probability' must be 0 when 'metamorphic-arm64-probability' is 1") + } + if fipsProbability == 1 && arm64Probability != 0 { + return fmt.Errorf("'metamorphic-arm64-probability' must be 0 when 'metamorphic-fips-probability' is 1") + } + arm64Opt := cmd.Flags().Lookup("metamorphic-arm64-probability") + if !arm64Opt.Changed && runtime.GOARCH == "arm64" && cloud == spec.Local { + fmt.Printf("Detected 'arm64' in 'local mode', setting 'metamorphic-arm64-probability' to 1; use --metamorphic-arm64-probability to run (emulated) with other binaries\n") + arm64Probability = 1 + } + // Find and validate all required binaries and libraries. initBinariesAndLibraries() + + if arm64Probability > 0 { + fmt.Printf("ARM64 clusters will be provisioned with probability %.2f\n", arm64Probability) + } + amd64Probability := 1 - arm64Probability + if amd64Probability > 0 { + fmt.Printf("AMD64 clusters will be provisioned with probability %.2f\n", amd64Probability) + } + if fipsProbability > 0 { + // N.B. arm64Probability < 1, otherwise fipsProbability == 0, as per above check. + // Hence, amd64Probability > 0 is implied. 
+ fmt.Printf("FIPS clusters will be provisioned with probability %.2f\n", fipsProbability*amd64Probability) + } } return nil }, @@ -140,6 +175,7 @@ func main() { "If fewer than --parallelism names are specified, then the parallelism "+ "is capped to the number of clusters specified. When a cluster does not exist "+ "yet, it is created according to the spec.") + var local bool rootCmd.PersistentFlags().BoolVarP( &local, "local", "l", local, "run tests locally") rootCmd.PersistentFlags().StringVarP( @@ -147,15 +183,25 @@ func main() { "Username to use as a cluster name prefix. "+ "If blank, the current OS user is detected and specified.") rootCmd.PersistentFlags().StringVar( - &cockroach, "cockroach", "", "path to cockroach binary to use") + &cockroachPath, "cockroach", "", "path to cockroach binary to use") rootCmd.PersistentFlags().StringVar( - &cockroachShort, "cockroach-short", "", "path to cockroach-short binary (compiled with crdb_test build tag) to use") + &cockroachShortPath, "cockroach-short", "", "path to cockroach-short binary (compiled with crdb_test build tag) to use") rootCmd.PersistentFlags().StringVar( - &workload, "workload", "", "path to workload binary to use") + &workloadPath, "workload", "", "path to workload binary to use") rootCmd.PersistentFlags().Float64Var( &encryptionProbability, "metamorphic-encryption-probability", defaultEncryptionProbability, "probability that clusters will be created with encryption-at-rest enabled "+ "for tests that support metamorphic encryption (default 1.0)") + rootCmd.PersistentFlags().Float64Var( + &fipsProbability, "metamorphic-fips-probability", defaultFIPSProbability, + "conditional probability that amd64 clusters will be created with FIPS, i.e., P(fips | amd64), "+ + "for tests that support FIPS and whose CPU architecture is 'amd64' (default 0) "+ + "NOTE: amd64 clusters are created with probability 1-P(arm64), where P(arm64) is 'metamorphic-arm64-probability'. 
"+ + "Hence, P(fips | amd64) = P(fips) * (1 - P(arm64))") + rootCmd.PersistentFlags().Float64Var( + &arm64Probability, "metamorphic-arm64-probability", defaultARM64Probability, + "probability that clusters will be created with 'arm64' CPU architecture "+ + "for tests that support 'arm64' (default 0)") rootCmd.AddCommand(&cobra.Command{ Use: `version`, @@ -257,7 +303,6 @@ runner itself. user: username, clusterID: clusterID, versionsBinaryOverride: versionsBinaryOverride, - enableFIPS: enableFIPS, }) }, } @@ -295,7 +340,6 @@ runner itself. user: username, clusterID: clusterID, versionsBinaryOverride: versionsBinaryOverride, - enableFIPS: enableFIPS, }) }, } @@ -348,8 +392,6 @@ runner itself. "is present in the list,"+"the respective binary will be used when a "+ "multi-version test asks for the respective binary, instead of "+ "`roachprod stage `. Example: 20.1.4=cockroach-20.1,20.2.0=cockroach-20.2.") - cmd.Flags().BoolVar( - &enableFIPS, "fips", false, "Run tests in enableFIPS mode") } parseCreateOpts(runCmd.Flags(), &overrideOpts) @@ -401,7 +443,6 @@ type cliCfg struct { user string clusterID string versionsBinaryOverride map[string]string - enableFIPS bool } func runTests(register func(registry.Registry), cfg cliCfg) error { @@ -421,7 +462,7 @@ func runTests(register func(registry.Registry), cfg cliCfg) error { filter := registry.NewTestFilter(cfg.args, cfg.runSkipped) clusterType := roachprodCluster bindTo := "" - if local { + if cloud == spec.Local { clusterType = localCluster // This will suppress the annoying "Allow incoming network connections" popup from @@ -442,7 +483,6 @@ func runTests(register func(registry.Registry), cfg cliCfg) error { cpuQuota: cfg.cpuQuota, debugMode: cfg.debugMode, clusterID: cfg.clusterID, - enableFIPS: cfg.enableFIPS, } if err := runner.runHTTPServer(cfg.httpPort, os.Stdout, bindTo); err != nil { return err diff --git a/pkg/cmd/roachtest/roachtestutil/mixedversion/mixedversion.go 
b/pkg/cmd/roachtest/roachtestutil/mixedversion/mixedversion.go index 4a34fddc7079..880d9fb43ce7 100644 --- a/pkg/cmd/roachtest/roachtestutil/mixedversion/mixedversion.go +++ b/pkg/cmd/roachtest/roachtestutil/mixedversion/mixedversion.go @@ -258,7 +258,7 @@ func NewTest( t.Fatal(err) } - prng, seed := randutil.NewPseudoRand() + prng, seed := randutil.NewLockedPseudoRand() testLogger.Printf("mixed-version random seed: %d", seed) testCtx, cancel := context.WithCancel(ctx) diff --git a/pkg/cmd/roachtest/slack.go b/pkg/cmd/roachtest/slack.go index 1e653d142f61..40f2505003e0 100644 --- a/pkg/cmd/roachtest/slack.go +++ b/pkg/cmd/roachtest/slack.go @@ -75,8 +75,6 @@ func postSlackReport(pass, fail, skip map[*testImpl]struct{}) { switch { case cloud != "": prefix = strings.ToUpper(cloud) - case local: - prefix = "LOCAL" default: prefix = "GCE" } diff --git a/pkg/cmd/roachtest/spec/cluster_spec.go b/pkg/cmd/roachtest/spec/cluster_spec.go index 856e0d6eb708..cec686692157 100644 --- a/pkg/cmd/roachtest/spec/cluster_spec.go +++ b/pkg/cmd/roachtest/spec/cluster_spec.go @@ -63,7 +63,8 @@ func (m MemPerCPU) String() string { // look like. It becomes part of a clusterConfig when the cluster is created. type ClusterSpec struct { Cloud string - InstanceType string // auto-chosen if left empty + Arch vm.CPUArch // CPU architecture; auto-chosen if left empty + InstanceType string // auto-chosen if left empty NodeCount int // CPUs is the number of CPUs per node. CPUs int @@ -199,7 +200,7 @@ func getAzureOpts(machineType string, zones []string) vm.ProviderOpts { // RoachprodOpts returns the opts to use when calling `roachprod.Create()` // in order to create the cluster described in the spec. 
func (s *ClusterSpec) RoachprodOpts( - clusterName string, useIOBarrier bool, enableFIPS bool, + clusterName string, useIOBarrier bool, arch vm.CPUArch, ) (vm.CreateOpts, vm.ProviderOpts, error) { createVMOpts := vm.DefaultCreateOpts() @@ -232,30 +233,41 @@ func (s *ClusterSpec) RoachprodOpts( } createVMOpts.GeoDistributed = s.Geo - createVMOpts.EnableFIPS = enableFIPS + createVMOpts.Arch = string(arch) machineType := s.InstanceType ssdCount := s.SSDs + if s.CPUs != 0 { // Default to the user-supplied machine type, if any. // Otherwise, pick based on requested CPU count. + var selectedArch vm.CPUArch + if len(machineType) == 0 { // If no machine type was specified, choose one // based on the cloud and CPU count. switch s.Cloud { case AWS: - machineType = AWSMachineType(s.CPUs, s.Mem) + machineType, selectedArch = AWSMachineType(s.CPUs, s.Mem, arch) case GCE: - machineType = GCEMachineType(s.CPUs, s.Mem) + machineType, selectedArch = GCEMachineType(s.CPUs, s.Mem, arch) case Azure: machineType = AzureMachineType(s.CPUs, s.Mem) } } + if selectedArch != "" && selectedArch != arch { + // TODO(srosenberg): we need a better way to monitor the rate of this mismatch, i.e., + // other than grepping cluster creation logs. + fmt.Printf("WARN: requested arch %s for machineType %s, but selected %s\n", arch, machineType, selectedArch) + createVMOpts.Arch = string(selectedArch) + } // Local SSD can only be requested // - if configured to prefer doing so, // - if no particular volume size is requested, and, // - on AWS, if the machine type supports it. - if s.PreferLocalSSD && s.VolumeSize == 0 && (s.Cloud != AWS || awsMachineSupportsSSD(machineType)) { + // - on GCE, if the machine type is not ARM64. + if s.PreferLocalSSD && s.VolumeSize == 0 && (s.Cloud != AWS || awsMachineSupportsSSD(machineType)) && + (s.Cloud != GCE || selectedArch != vm.ArchARM64) { // Ensure SSD count is at least 1 if UseLocalSSD is true. 
if ssdCount == 0 { ssdCount = 1 @@ -288,9 +300,9 @@ func (s *ClusterSpec) RoachprodOpts( } } - if createVMOpts.EnableFIPS && !(s.Cloud == GCE || s.Cloud == AWS) { + if createVMOpts.Arch == string(vm.ArchFIPS) && !(s.Cloud == GCE || s.Cloud == AWS) { return vm.CreateOpts{}, nil, errors.Errorf( - "node creation with enableFIPS enabled not yet supported on %s", s.Cloud, + "FIPS not yet supported on %s", s.Cloud, ) } var providerOpts vm.ProviderOpts diff --git a/pkg/cmd/roachtest/spec/machine_type.go b/pkg/cmd/roachtest/spec/machine_type.go index 98e1a7aba508..445765bab8be 100644 --- a/pkg/cmd/roachtest/spec/machine_type.go +++ b/pkg/cmd/roachtest/spec/machine_type.go @@ -10,16 +10,31 @@ package spec -import "fmt" +import ( + "fmt" + + "github.com/cockroachdb/cockroach/pkg/roachprod/vm" +) // AWSMachineType selects a machine type given the desired number of CPUs and -// memory per CPU ratio. -func AWSMachineType(cpus int, mem MemPerCPU) string { +// memory per CPU ratio. Also returns the architecture of the selected machine type. +func AWSMachineType(cpus int, mem MemPerCPU, arch vm.CPUArch) (string, vm.CPUArch) { // TODO(erikgrinaker): These have significantly less RAM than // their GCE counterparts. Consider harmonizing them. 
family := "c5d" // 2 GB RAM per CPU + selectedArch := vm.ArchAMD64 + if arch == vm.ArchFIPS { + selectedArch = vm.ArchFIPS + } else if arch == vm.ArchARM64 { + family = "c7g" // 2 GB RAM per CPU (graviton3) + selectedArch = vm.ArchARM64 + } + if mem == High { family = "m5d" // 4 GB RAM per CPU + if arch == vm.ArchARM64 { + family = "m7g" // 4 GB RAM per CPU (graviton3) + } } else if mem == Low { panic("low memory per CPU not available for AWS") } @@ -36,30 +51,51 @@ func AWSMachineType(cpus int, mem MemPerCPU) string { size = "4xlarge" case cpus <= 36: size = "9xlarge" + if family == "c7g" || family == "m7g" { + size = "8xlarge" + } case cpus <= 72: size = "18xlarge" + if family == "c7g" || family == "m7g" { + size = "16xlarge" + } case cpus <= 96: size = "24xlarge" default: panic(fmt.Sprintf("no aws machine type with %d cpus", cpus)) } - // There is no c5d.24xlarge. + // There is no m7g.24xlarge, fall back to m5d.24xlarge. + if family == "m7g" && size == "24xlarge" { + family = "m5d" + selectedArch = vm.ArchAMD64 + } + // There is no c7g.24xlarge, fall back to c5d.24xlarge. + if family == "c7g" && size == "24xlarge" { + family = "c5d" + selectedArch = vm.ArchAMD64 + } + + // There is no c5d.24xlarge, fall back to m5d.24xlarge. if family == "c5d" && size == "24xlarge" { family = "m5d" } - return fmt.Sprintf("%s.%s", family, size) + return fmt.Sprintf("%s.%s", family, size), selectedArch } // GCEMachineType selects a machine type given the desired number of CPUs and -// memory per CPU ratio. -func GCEMachineType(cpus int, mem MemPerCPU) string { +// memory per CPU ratio. Also returns the architecture of the selected machine type. +func GCEMachineType(cpus int, mem MemPerCPU, arch vm.CPUArch) (string, vm.CPUArch) { // TODO(peter): This is awkward: at or below 16 cpus, use n1-standard so that // the machines have a decent amount of RAM. 
We could use custom machine // configurations, but the rules for the amount of RAM per CPU need to be // determined (you can't request any arbitrary amount of RAM). series := "n1" + selectedArch := vm.ArchAMD64 + if arch == vm.ArchFIPS { + selectedArch = vm.ArchFIPS + } var kind string switch mem { case Auto: @@ -75,7 +111,12 @@ func GCEMachineType(cpus int, mem MemPerCPU) string { case Low: kind = "highcpu" // 0.9 GB RAM per CPU } - return fmt.Sprintf("%s-%s-%d", series, kind, cpus) + if arch == vm.ArchARM64 && mem == Auto && cpus <= 48 { + series = "t2a" + kind = "standard" + selectedArch = vm.ArchARM64 + } + return fmt.Sprintf("%s-%s-%d", series, kind, cpus), selectedArch } // AzureMachineType selects a machine type given the desired number of CPUs and diff --git a/pkg/cmd/roachtest/spec/option.go b/pkg/cmd/roachtest/spec/option.go index ff59b5bb39ee..146a2b43d503 100644 --- a/pkg/cmd/roachtest/spec/option.go +++ b/pkg/cmd/roachtest/spec/option.go @@ -10,7 +10,11 @@ package spec -import "time" +import ( + "time" + + "github.com/cockroachdb/cockroach/pkg/roachprod/vm" +) // Option is the interface satisfied by options to MakeClusterSpec. type Option interface { @@ -28,6 +32,17 @@ func Cloud(s string) Option { return cloudOption(s) } +type archOption string + +func (o archOption) apply(spec *ClusterSpec) { + spec.Arch = vm.CPUArch(o) +} + +// Arch returns an Option that sets the cluster spec's CPU architecture to arch. +func Arch(arch vm.CPUArch) Option { + return archOption(arch) +} + type nodeCPUOption int func (o nodeCPUOption) apply(spec *ClusterSpec) { diff --git a/pkg/cmd/roachtest/test_impl.go b/pkg/cmd/roachtest/test_impl.go index 63cab78b582e..adcb5f2e3d9f 100644 --- a/pkg/cmd/roachtest/test_impl.go +++ b/pkg/cmd/roachtest/test_impl.go @@ -123,6 +123,7 @@ func (t *testImpl) BuildVersion() *version.Version { return t.buildVersion } +// Cockroach returns the path to the cockroach binary. 
func (t *testImpl) Cockroach() string { return t.cockroach } diff --git a/pkg/cmd/roachtest/test_registry_test.go b/pkg/cmd/roachtest/test_registry_test.go index 21002046bfd0..0d05c8c933f0 100644 --- a/pkg/cmd/roachtest/test_registry_test.go +++ b/pkg/cmd/roachtest/test_registry_test.go @@ -15,6 +15,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec" + "github.com/cockroachdb/cockroach/pkg/roachprod/vm" "github.com/cockroachdb/cockroach/pkg/testutils" "github.com/prometheus/client_golang/prometheus" "github.com/stretchr/testify/require" @@ -42,6 +43,12 @@ func TestMakeTestRegistry(t *testing.T) { require.Equal(t, "foo", s.InstanceType) require.EqualValues(t, 4, s.CPUs) require.True(t, s.TerminateOnMigration) + + s = r.MakeClusterSpec(10, spec.CPU(16), spec.Arch(vm.ArchARM64)) + require.EqualValues(t, 10, s.NodeCount) + require.Equal(t, "foo", s.InstanceType) + require.EqualValues(t, 16, s.CPUs) + require.EqualValues(t, vm.ArchARM64, s.Arch) }) } diff --git a/pkg/cmd/roachtest/test_runner.go b/pkg/cmd/roachtest/test_runner.go index 7cf8ebb54ee5..eeea467f4c1e 100644 --- a/pkg/cmd/roachtest/test_runner.go +++ b/pkg/cmd/roachtest/test_runner.go @@ -65,6 +65,8 @@ var ( // prometheusScrapeInterval should be consistent with the scrape interval defined in // https://grafana.testeng.crdb.io/prometheus/config prometheusScrapeInterval = time.Second * 15 + + prng, _ = randutil.NewLockedPseudoRand() ) // testRunner runs tests. @@ -163,8 +165,7 @@ type clustersOpt struct { cpuQuota int // Controls whether the cluster is cleaned up at the end of the test. 
- debugMode debugMode - enableFIPS bool + debugMode debugMode } type debugMode int @@ -395,11 +396,12 @@ func defaultClusterAllocator( allocateCluster := func( ctx context.Context, t registry.TestSpec, + arch vm.CPUArch, alloc *quotapool.IntAlloc, artifactsDir string, wStatus *workerStatus, ) (*clusterImpl, *vm.CreateOpts, error) { - wStatus.SetStatus("creating cluster") + wStatus.SetStatus(fmt.Sprintf("creating cluster (arch=%q)", arch)) defer wStatus.SetStatus("") existingClusterName := clustersOpt.clusterName @@ -416,6 +418,9 @@ func defaultClusterAllocator( skipStop: r.config.skipClusterStopOnAttach, skipWipe: r.config.skipClusterWipeOnAttach, } + // TODO(srosenberg): we need to think about validation here. Attaching to an incompatible cluster, e.g., + // using arm64 AMI with amd64 binary, would result in obscure errors. The test runner ensures compatibility + // during cluster reuse, whereas attachment via CLI (e.g., via roachprod) does not. lopt.l.PrintfCtx(ctx, "Attaching to existing cluster %s for test %s", existingClusterName, t.Name) c, err := attachToExistingCluster(ctx, existingClusterName, clusterL, t.Cluster, opt, r.cr) if err == nil { @@ -426,11 +431,11 @@ func defaultClusterAllocator( } // Fall through to create new cluster with name override. 
lopt.l.PrintfCtx( - ctx, "Creating new cluster with custom name %q for test %s: %s", - clustersOpt.clusterName, t.Name, t.Cluster, + ctx, "Creating new cluster with custom name %q for test %s: %s (arch=%q)", + clustersOpt.clusterName, t.Name, t.Cluster, arch, ) } else { - lopt.l.PrintfCtx(ctx, "Creating new cluster for test %s: %s", t.Name, t.Cluster) + lopt.l.PrintfCtx(ctx, "Creating new cluster for test %s: %s (arch=%q)", t.Name, t.Cluster, arch) } cfg := clusterConfig{ @@ -440,7 +445,7 @@ func defaultClusterAllocator( username: clustersOpt.user, localCluster: clustersOpt.typ == localCluster, alloc: alloc, - enableFIPS: clustersOpt.enableFIPS, + arch: arch, } return clusterFactory.newCluster(ctx, cfg, wStatus.SetStatus, lopt.tee) } @@ -450,6 +455,7 @@ func defaultClusterAllocator( type clusterAllocatorFn func( ctx context.Context, t registry.TestSpec, + arch vm.CPUArch, alloc *quotapool.IntAlloc, artifactsDir string, wStatus *workerStatus, @@ -530,8 +536,6 @@ func (r *testRunner) runWorker( } }() - prng, _ := randutil.NewPseudoRand() - // Loop until there's no more work in the pool, we get interrupted, or an // error occurs. for { @@ -577,7 +581,7 @@ func (r *testRunner) runWorker( // Attempt to reuse existing cluster. if c != nil && testToRun.canReuseCluster { err = func() error { - l.PrintfCtx(ctx, "Using existing cluster: %s. Wiping", c.name) + l.PrintfCtx(ctx, "Using existing cluster: %s (arch=%q). Wiping", c.name, c.arch) if err := c.WipeE(ctx, l); err != nil { return err } @@ -604,10 +608,43 @@ func (r *testRunner) runWorker( // Let's attempt to create a fresh one. testToRun.canReuseCluster = false } + // sanity check + if c.spec.Cloud != spec.Local && c.spec.Arch != "" && c.arch != c.spec.Arch { + return errors.Newf("cluster arch %q does not match specified arch %q on cloud: %q", c.arch, c.spec.Arch, c.spec.Cloud) + } + } + arch := testToRun.spec.Cluster.Arch + // N.B. 
local cluster can mix different CPU architectures via emulation; e.g., mac silicon running x86. + if testToRun.canReuseCluster && c != nil && c.spec.Cloud != spec.Local { + // We're reusing a non-local cluster, so we must use the same arch. + arch = c.arch + } + if arch == "" { + // CPU architecture is unspecified, choose one according to the probability distribution. + arch = vm.ArchAMD64 + if prng.Float64() < arm64Probability { + arch = vm.ArchARM64 + } else if prng.Float64() < fipsProbability { + // N.B. branch is taken with probability (1 - arm64Probability) * fipsProbability which is P(fips); fipsProbability itself is P(fips | amd64). + // N.B. FIPS is only supported on 'amd64' at this time. + arch = vm.ArchFIPS + } + l.PrintfCtx(ctx, "Using (randomly) chosen arch=%q for %s", arch, testToRun.spec.Name) + } else { + l.PrintfCtx(ctx, "Using (specified) arch=%q for %s", arch, testToRun.spec.Name) + } + // N.B. if canReuseCluster is false, then the previous cluster has been destroyed; new one will be created below. + if testToRun.canReuseCluster && c != nil && c.arch != arch { + // Non-local cluster that's being reused must have the same architecture as was ensured above. + if c.spec.Cloud != spec.Local { + return errors.New("infeasible path: non-local cluster arch mismatch") + } + // Local cluster is now reused to emulate a different CPU architecture. + c.arch = arch } // Verify that required native libraries are available. - if err = VerifyLibraries(testToRun.spec.NativeLibs); err != nil { + if err = VerifyLibraries(testToRun.spec.NativeLibs, arch); err != nil { shout(ctx, l, stdout, "Library verification failed: %s", err) return err } @@ -619,13 +656,14 @@ func (r *testRunner) runWorker( // Create a new cluster if can't reuse or reuse attempt failed. // N.B. non-reusable cluster would have been destroyed above. 
wStatus.SetTest(nil /* test */, testToRun) - wStatus.SetStatus("creating cluster") - c, vmCreateOpts, clusterCreateErr = allocateCluster(ctx, testToRun.spec, testToRun.alloc, artifactsRootDir, wStatus) + c, vmCreateOpts, clusterCreateErr = allocateCluster(ctx, testToRun.spec, arch, testToRun.alloc, artifactsRootDir, wStatus) if clusterCreateErr != nil { clusterCreateErr = errors.Mark(clusterCreateErr, errClusterProvisioningFailed) atomic.AddInt32(&r.numClusterErrs, 1) shout(ctx, l, stdout, "Unable to create (or reuse) cluster for test %s due to: %s.", testToRun.spec.Name, clusterCreateErr) + } else { + l.PrintfCtx(ctx, "Created new cluster for test %s: %s (arch=%q)", testToRun.spec.Name, c.Name(), arch) } } // Prepare the test's logger. Always set this up with real files, using a @@ -655,9 +693,9 @@ func (r *testRunner) runWorker( } t := &testImpl{ spec: &testToRun.spec, - cockroach: cockroach, - cockroachShort: cockroachShort, - deprecatedWorkload: workload, + cockroach: cockroach[arch], + cockroachShort: cockroachShort[arch], + deprecatedWorkload: workload[arch], buildVersion: binaryVersion, artifactsDir: artifactsDir, artifactsSpec: artifactsSpec, @@ -666,9 +704,6 @@ func (r *testRunner) runWorker( skipInit: topt.skipInit, debug: debugMode.IsDebug(), } - // Now run the test. - l.PrintfCtx(ctx, "starting test: %s:%d", testToRun.spec.Name, testToRun.runNum) - github := newGithubIssues(r.config.disableIssue, c, vmCreateOpts) if clusterCreateErr != nil { @@ -683,6 +718,9 @@ func (r *testRunner) runWorker( shout(ctx, l, stdout, "failed to post issue: %s", err) } } else { + // Now run the test. 
+ l.PrintfCtx(ctx, "Starting test: %s:%d on cluster=%s (arch=%q)", testToRun.spec.Name, testToRun.runNum, c.Name(), arch) + c.setTest(t) if c.spec.NodeCount > 0 { // skip during tests err = c.PutDefaultCockroach(ctx, l, t.Cockroach()) diff --git a/pkg/cmd/roachtest/test_test.go b/pkg/cmd/roachtest/test_test.go index 04bd1c882b6d..52b4514300a6 100644 --- a/pkg/cmd/roachtest/test_test.go +++ b/pkg/cmd/roachtest/test_test.go @@ -100,6 +100,7 @@ func nilLogger() *logger.Logger { func alwaysFailingClusterAllocator( ctx context.Context, t registry.TestSpec, + arch vm.CPUArch, alloc *quotapool.IntAlloc, artifactsDir string, wStatus *workerStatus, diff --git a/pkg/cmd/roachtest/tests/autoupgrade.go b/pkg/cmd/roachtest/tests/autoupgrade.go index c7bd5583da33..7941b8687df0 100644 --- a/pkg/cmd/roachtest/tests/autoupgrade.go +++ b/pkg/cmd/roachtest/tests/autoupgrade.go @@ -13,7 +13,6 @@ package tests import ( "context" "fmt" - "runtime" "time" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" @@ -257,9 +256,6 @@ func registerAutoUpgrade(r registry.Registry) { Owner: registry.OwnerTestEng, Cluster: r.MakeClusterSpec(5), Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { - if c.IsLocal() && runtime.GOARCH == "arm64" { - t.Skip("Skip under ARM64. 
See https://github.com/cockroachdb/cockroach/issues/89268") - } pred, err := version.PredecessorVersion(*t.BuildVersion()) if err != nil { t.Fatal(err) diff --git a/pkg/cmd/roachtest/tests/cdc.go b/pkg/cmd/roachtest/tests/cdc.go index 274c36ddccba..d16c5e646b56 100644 --- a/pkg/cmd/roachtest/tests/cdc.go +++ b/pkg/cmd/roachtest/tests/cdc.go @@ -25,7 +25,6 @@ import ( "net/url" "path/filepath" "regexp" - "runtime" "sort" "strconv" "strings" @@ -48,6 +47,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/roachprod/install" "github.com/cockroachdb/cockroach/pkg/roachprod/logger" "github.com/cockroachdb/cockroach/pkg/roachprod/prometheus" + "github.com/cockroachdb/cockroach/pkg/roachprod/vm" "github.com/cockroachdb/cockroach/pkg/testutils/jobutils" "github.com/cockroachdb/cockroach/pkg/testutils/sqlutils" "github.com/cockroachdb/cockroach/pkg/util/protoutil" @@ -583,9 +583,6 @@ type latencyTargets struct { } func runCDCBank(ctx context.Context, t test.Test, c cluster.Cluster) { - if runtime.GOARCH == "arm64" { - t.Skip("Skipping cdc/bank under ARM64.") - } // Make the logs dir on every node to work around the `roachprod get logs` // spam. c.Run(ctx, c.All(), `mkdir -p logs`) @@ -1317,9 +1314,10 @@ func registerCDC(r registry.Registry) { }, }) r.Add(registry.TestSpec{ - Name: "cdc/bank", - Owner: `cdc`, - Cluster: r.MakeClusterSpec(4), + Name: "cdc/bank", + Owner: `cdc`, + // N.B. ARM64 is not yet supported, see https://github.com/cockroachdb/cockroach/issues/103888. 
+ Cluster: r.MakeClusterSpec(4, spec.Arch(vm.ArchAMD64)), Leases: registry.MetamorphicLeases, RequiresLicense: true, Timeout: 30 * time.Minute, diff --git a/pkg/cmd/roachtest/tests/cluster_to_cluster.go b/pkg/cmd/roachtest/tests/cluster_to_cluster.go index fa506b0c08e1..6c7e19e7da84 100644 --- a/pkg/cmd/roachtest/tests/cluster_to_cluster.go +++ b/pkg/cmd/roachtest/tests/cluster_to_cluster.go @@ -497,8 +497,8 @@ func (rd *replicationDriver) preStreamingWorkload(ctx context.Context) { rd.t.Status("populating source cluster before replication") initStart := timeutil.Now() rd.c.Run(ctx, rd.setup.workloadNode, initCmd) - rd.t.L().Printf("src cluster workload initialization took %s minutes", - timeutil.Since(initStart).Minutes()) + rd.t.L().Printf("src cluster workload initialization took %s", + timeutil.Since(initStart)) } } diff --git a/pkg/cmd/roachtest/tests/decommission.go b/pkg/cmd/roachtest/tests/decommission.go index 3e612984dc59..58fc0eeee38b 100644 --- a/pkg/cmd/roachtest/tests/decommission.go +++ b/pkg/cmd/roachtest/tests/decommission.go @@ -17,7 +17,6 @@ import ( "math/rand" "reflect" "regexp" - "runtime" "strconv" "strings" "time" @@ -102,9 +101,6 @@ func registerDecommission(r registry.Registry) { Owner: registry.OwnerKV, Cluster: r.MakeClusterSpec(numNodes), Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { - if c.IsLocal() && runtime.GOARCH == "arm64" { - t.Skip("Skip under ARM64. 
See https://github.com/cockroachdb/cockroach/issues/89268") - } runDecommissionMixedVersions(ctx, t, c, *t.BuildVersion()) }, }) diff --git a/pkg/cmd/roachtest/tests/follower_reads.go b/pkg/cmd/roachtest/tests/follower_reads.go index 6ed247300866..c3393362fce3 100644 --- a/pkg/cmd/roachtest/tests/follower_reads.go +++ b/pkg/cmd/roachtest/tests/follower_reads.go @@ -19,7 +19,6 @@ import ( "net/http" "reflect" "regexp" - "runtime" "strconv" "strings" "time" @@ -103,9 +102,6 @@ func registerFollowerReads(r registry.Registry) { spec.CPU(2), ), Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { - if c.IsLocal() && runtime.GOARCH == "arm64" { - t.Skip("Skip under ARM64. See https://github.com/cockroachdb/cockroach/issues/89268") - } runFollowerReadsMixedVersionSingleRegionTest(ctx, t, c, *t.BuildVersion()) }, }) diff --git a/pkg/cmd/roachtest/tests/import.go b/pkg/cmd/roachtest/tests/import.go index 8b07bdbb2509..64c5ba102e33 100644 --- a/pkg/cmd/roachtest/tests/import.go +++ b/pkg/cmd/roachtest/tests/import.go @@ -15,7 +15,6 @@ import ( gosql "database/sql" "fmt" "path/filepath" - "runtime" "strings" "time" @@ -357,9 +356,6 @@ func registerImportMixedVersion(r registry.Registry) { // Mixed-version support was added in 21.1. Cluster: r.MakeClusterSpec(4), Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { - if c.IsLocal() && runtime.GOARCH == "arm64" { - t.Skip("Skip under ARM64. 
See https://github.com/cockroachdb/cockroach/issues/89268") - } predV, err := version.PredecessorVersion(*t.BuildVersion()) if err != nil { t.Fatal(err) diff --git a/pkg/cmd/roachtest/tests/mixed_version_backup.go b/pkg/cmd/roachtest/tests/mixed_version_backup.go index bf8e1c3e706c..54ba4fec1bc8 100644 --- a/pkg/cmd/roachtest/tests/mixed_version_backup.go +++ b/pkg/cmd/roachtest/tests/mixed_version_backup.go @@ -19,7 +19,6 @@ import ( "path/filepath" "reflect" "regexp" - "runtime" "sort" "strings" "sync/atomic" @@ -31,6 +30,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/mixedversion" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/jobs" "github.com/cockroachdb/cockroach/pkg/jobs/jobspb" @@ -2033,8 +2033,8 @@ func registerBackupMixedVersion(r registry.Registry) { EncryptionSupport: registry.EncryptionMetamorphic, RequiresLicense: true, Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { - if c.IsLocal() && runtime.GOARCH == "arm64" { - t.Skip("Skip under ARM64. 
See https://github.com/cockroachdb/cockroach/issues/89268") + if c.Spec().Cloud != spec.GCE { + t.Skip("uses gs://cockroachdb-backup-testing, available only in GCE") } roachNodes := c.Range(1, c.Spec().NodeCount-1) diff --git a/pkg/cmd/roachtest/tests/mixed_version_cdc.go b/pkg/cmd/roachtest/tests/mixed_version_cdc.go index bef4ecb237b6..21a7f4e86b9c 100644 --- a/pkg/cmd/roachtest/tests/mixed_version_cdc.go +++ b/pkg/cmd/roachtest/tests/mixed_version_cdc.go @@ -14,7 +14,6 @@ import ( "context" gosql "database/sql" "fmt" - "runtime" "strconv" "strings" "time" @@ -26,6 +25,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" + "github.com/cockroachdb/cockroach/pkg/roachprod/vm" "github.com/cockroachdb/cockroach/pkg/util/randutil" "github.com/cockroachdb/cockroach/pkg/util/syncutil" "github.com/cockroachdb/cockroach/pkg/util/timeutil" @@ -66,15 +66,13 @@ func registerCDCMixedVersions(r registry.Registry) { zones = teamcityAgentZone } r.Add(registry.TestSpec{ - Name: "cdc/mixed-versions", - Owner: registry.OwnerTestEng, - Cluster: r.MakeClusterSpec(5, spec.Zones(zones)), + Name: "cdc/mixed-versions", + Owner: registry.OwnerTestEng, + // N.B. ARM64 is not yet supported, see https://github.com/cockroachdb/cockroach/issues/103888. + Cluster: r.MakeClusterSpec(5, spec.Zones(zones), spec.Arch(vm.ArchAMD64)), Timeout: timeout, RequiresLicense: true, Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { - if c.IsLocal() && runtime.GOARCH == "arm64" { - t.Skip("Skip under ARM64. 
See https://github.com/cockroachdb/cockroach/issues/89268") - } runCDCMixedVersions(ctx, t, c, *t.BuildVersion()) }, }) diff --git a/pkg/cmd/roachtest/tests/mixed_version_decl_schemachange_compat.go b/pkg/cmd/roachtest/tests/mixed_version_decl_schemachange_compat.go index 6161bb99d67a..9d791b11d53f 100644 --- a/pkg/cmd/roachtest/tests/mixed_version_decl_schemachange_compat.go +++ b/pkg/cmd/roachtest/tests/mixed_version_decl_schemachange_compat.go @@ -16,11 +16,11 @@ import ( "os" "path/filepath" "regexp" - "runtime" "strings" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/util/version" ) @@ -31,8 +31,8 @@ func registerDeclSchemaChangeCompatMixedVersions(r registry.Registry) { Owner: registry.OwnerSQLFoundations, Cluster: r.MakeClusterSpec(1), Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { - if c.IsLocal() && runtime.GOARCH == "arm64" { - t.Skip("Skip under ARM64. 
See https://github.com/cockroachdb/cockroach/issues/89268") + if c.Spec().Cloud != spec.GCE { + t.Skip("uses gsutil with gs://cockroach-corpus, available only in GCE") } runDeclSchemaChangeCompatMixedVersions(ctx, t, c, *t.BuildVersion()) }, diff --git a/pkg/cmd/roachtest/tests/mixed_version_job_compatibility_in_declarative_schema_changer.go b/pkg/cmd/roachtest/tests/mixed_version_job_compatibility_in_declarative_schema_changer.go index 8eeeac3c93df..87343d58f523 100644 --- a/pkg/cmd/roachtest/tests/mixed_version_job_compatibility_in_declarative_schema_changer.go +++ b/pkg/cmd/roachtest/tests/mixed_version_job_compatibility_in_declarative_schema_changer.go @@ -12,7 +12,6 @@ package tests import ( "context" - "runtime" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" @@ -133,9 +132,6 @@ func registerDeclarativeSchemaChangerJobCompatibilityInMixedVersion(r registry.R Owner: registry.OwnerSQLFoundations, Cluster: r.MakeClusterSpec(4), Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { - if c.IsLocal() && runtime.GOARCH == "arm64" { - t.Skip("Skip under ARM64. See https://github.com/cockroachdb/cockroach/issues/89268") - } predV, err := version.PredecessorVersion(*t.BuildVersion()) require.NoError(t, err) diff --git a/pkg/cmd/roachtest/tests/mixed_version_jobs.go b/pkg/cmd/roachtest/tests/mixed_version_jobs.go index ed805f4aeef9..33019e8b6ce3 100644 --- a/pkg/cmd/roachtest/tests/mixed_version_jobs.go +++ b/pkg/cmd/roachtest/tests/mixed_version_jobs.go @@ -13,7 +13,6 @@ package tests import ( "context" "fmt" - "runtime" "time" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" @@ -333,9 +332,6 @@ func registerJobsMixedVersions(r registry.Registry) { // vice versa in order to detect regressions in the work done for 20.1. 
Cluster: r.MakeClusterSpec(4), Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { - if c.IsLocal() && runtime.GOARCH == "arm64" { - t.Skip("Skip under ARM64. See https://github.com/cockroachdb/cockroach/issues/89268") - } predV, err := version.PredecessorVersion(*t.BuildVersion()) if err != nil { t.Fatal(err) diff --git a/pkg/cmd/roachtest/tests/mixed_version_schemachange.go b/pkg/cmd/roachtest/tests/mixed_version_schemachange.go index fb66c4003e60..990b2e18ca3c 100644 --- a/pkg/cmd/roachtest/tests/mixed_version_schemachange.go +++ b/pkg/cmd/roachtest/tests/mixed_version_schemachange.go @@ -13,7 +13,6 @@ package tests import ( "context" "fmt" - "runtime" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" @@ -32,9 +31,6 @@ func registerSchemaChangeMixedVersions(r registry.Registry) { Cluster: r.MakeClusterSpec(4), NativeLibs: registry.LibGEOS, Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { - if c.IsLocal() && runtime.GOARCH == "arm64" { - t.Skip("Skip under ARM64. See https://github.com/cockroachdb/cockroach/issues/89268") - } maxOps := 100 concurrency := 5 if c.IsLocal() { diff --git a/pkg/cmd/roachtest/tests/rebalance_load.go b/pkg/cmd/roachtest/tests/rebalance_load.go index 154b9abc2e04..3ae35b08e001 100644 --- a/pkg/cmd/roachtest/tests/rebalance_load.go +++ b/pkg/cmd/roachtest/tests/rebalance_load.go @@ -14,7 +14,6 @@ import ( "context" "fmt" "math/rand" - "runtime" "strings" "time" @@ -193,9 +192,6 @@ func registerRebalanceLoad(r registry.Registry) { Cluster: r.MakeClusterSpec(4), // the last node is just used to generate load Leases: registry.MetamorphicLeases, Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { - if c.IsLocal() && runtime.GOARCH == "arm64" { - t.Skip("Skip under ARM64. 
See https://github.com/cockroachdb/cockroach/issues/89268") - } if c.IsLocal() { concurrency = 32 fmt.Printf("lowering concurrency to %d in local testing\n", concurrency) diff --git a/pkg/cmd/roachtest/tests/restore.go b/pkg/cmd/roachtest/tests/restore.go index 4b265fe37303..3f87107a60b9 100644 --- a/pkg/cmd/roachtest/tests/restore.go +++ b/pkg/cmd/roachtest/tests/restore.go @@ -32,6 +32,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/keys" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/roachprod/install" + "github.com/cockroachdb/cockroach/pkg/roachprod/vm" "github.com/cockroachdb/cockroach/pkg/testutils" "github.com/cockroachdb/cockroach/pkg/testutils/sqlutils" "github.com/cockroachdb/cockroach/pkg/ts/tspb" @@ -299,7 +300,7 @@ func registerRestore(r registry.Registry) { hardware: makeHardwareSpecs(hardwareSpecs{ nodes: 9, zones: []string{"us-east-2b", "us-west-2b", "eu-west-1b"}}), // These zones are AWS-specific. - backup: makeBackupSpecs(backupSpecs{}), + backup: makeBackupSpecs(backupSpecs{cloud: spec.AWS}), timeout: 90 * time.Minute, tags: registry.Tags("aws"), }, @@ -489,8 +490,9 @@ func (hw hardwareSpecs) makeClusterSpecs(r registry.Registry, backupCloud string // https://github.com/cockroachdb/cockroach/issues/98783. // // TODO(srosenberg): Remove this workaround when 98783 is addressed. 
- s.InstanceType = spec.AWSMachineType(s.CPUs, s.Mem) + s.InstanceType, _ = spec.AWSMachineType(s.CPUs, s.Mem, vm.ArchAMD64) s.InstanceType = strings.Replace(s.InstanceType, "d.", ".", 1) + s.Arch = vm.ArchAMD64 } return s } diff --git a/pkg/cmd/roachtest/tests/secondary_indexes.go b/pkg/cmd/roachtest/tests/secondary_indexes.go index 888eee17dffa..a26864022723 100644 --- a/pkg/cmd/roachtest/tests/secondary_indexes.go +++ b/pkg/cmd/roachtest/tests/secondary_indexes.go @@ -12,7 +12,6 @@ package tests import ( "context" - "runtime" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" @@ -140,9 +139,6 @@ func registerSecondaryIndexesMultiVersionCluster(r registry.Registry) { Owner: registry.OwnerSQLFoundations, Cluster: r.MakeClusterSpec(3), Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { - if c.IsLocal() && runtime.GOARCH == "arm64" { - t.Skip("Skip under ARM64. See https://github.com/cockroachdb/cockroach/issues/89268") - } predV, err := version.PredecessorVersion(*t.BuildVersion()) if err != nil { t.Fatal(err) diff --git a/pkg/cmd/roachtest/tests/tpcc.go b/pkg/cmd/roachtest/tests/tpcc.go index 22d09951bc7f..3d122a5e7425 100644 --- a/pkg/cmd/roachtest/tests/tpcc.go +++ b/pkg/cmd/roachtest/tests/tpcc.go @@ -17,7 +17,6 @@ import ( "math/rand" "os" "path/filepath" - "runtime" "strings" "time" @@ -29,6 +28,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/roachprod/install" "github.com/cockroachdb/cockroach/pkg/roachprod/logger" "github.com/cockroachdb/cockroach/pkg/roachprod/prometheus" + "github.com/cockroachdb/cockroach/pkg/roachprod/vm" "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode" "github.com/cockroachdb/cockroach/pkg/testutils/skip" "github.com/cockroachdb/cockroach/pkg/util/search" @@ -317,6 +317,7 @@ var tpccSupportedWarehouses = []struct { // TODO(tbg): this number is copied from gce-n4cpu16. The real number should be a // little higher, find out what it is. 
{hardware: "gce-n5cpu16", v: version.MustParse(`v19.1.0-0`), warehouses: 1300}, + {hardware: "aws-n5cpu16", v: version.MustParse(`v19.1.0-0`), warehouses: 2100}, // Ditto. {hardware: "gce-n5cpu16", v: version.MustParse(`v2.1.0-0`), warehouses: 1300}, } @@ -357,9 +358,6 @@ func maxSupportedTPCCWarehouses( func runTPCCMixedHeadroom( ctx context.Context, t test.Test, c cluster.Cluster, cloud string, versionsToUpgrade int, ) { - if c.IsLocal() && runtime.GOARCH == "arm64" { - t.Skip("Skip under ARM64. See https://github.com/cockroachdb/cockroach/issues/89268") - } crdbNodes := c.Range(1, c.Spec().NodeCount-1) workloadNode := c.Node(c.Spec().NodeCount) @@ -525,13 +523,17 @@ func registerTPCC(r registry.Registry) { runTPCCMixedHeadroom(ctx, t, c, cloud, 1) }, }) + + // N.B. Multiple upgrades may require a released version < 22.2.x, which wasn't built for ARM64. + mixedHeadroomMultiUpgradesSpec := r.MakeClusterSpec(5, spec.CPU(16), spec.RandomlyUseZfs(), spec.Arch(vm.ArchAMD64)) + r.Add(registry.TestSpec{ // run the same mixed-headroom test, but going back two versions - Name: "tpcc/mixed-headroom/multiple-upgrades/" + mixedHeadroomSpec.String(), + Name: "tpcc/mixed-headroom/multiple-upgrades/" + mixedHeadroomMultiUpgradesSpec.String(), Timeout: 5 * time.Hour, Owner: registry.OwnerTestEng, Tags: registry.Tags(`default`), - Cluster: mixedHeadroomSpec, + Cluster: mixedHeadroomMultiUpgradesSpec, EncryptionSupport: registry.EncryptionMetamorphic, Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { runTPCCMixedHeadroom(ctx, t, c, cloud, 2) diff --git a/pkg/cmd/roachtest/tests/validate_system_schema_after_version_upgrade.go b/pkg/cmd/roachtest/tests/validate_system_schema_after_version_upgrade.go index 63b023d1aa0c..90ed4d0941db 100644 --- a/pkg/cmd/roachtest/tests/validate_system_schema_after_version_upgrade.go +++ b/pkg/cmd/roachtest/tests/validate_system_schema_after_version_upgrade.go @@ -12,7 +12,6 @@ package tests import ( "context" - "runtime" "strings" 
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" @@ -36,9 +35,6 @@ func registerValidateSystemSchemaAfterVersionUpgrade(r registry.Registry) { Owner: registry.OwnerSQLFoundations, Cluster: r.MakeClusterSpec(1), Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { - if c.IsLocal() && runtime.GOARCH == "arm64" { - t.Skip("Skip under ARM64. See https://github.com/cockroachdb/cockroach/issues/89268") - } predecessorVersion, err := version.PredecessorVersion(*t.BuildVersion()) if err != nil { t.Fatal(err) diff --git a/pkg/cmd/roachtest/tests/version.go b/pkg/cmd/roachtest/tests/version.go index 940c5f3125f2..8cf4a2c44167 100644 --- a/pkg/cmd/roachtest/tests/version.go +++ b/pkg/cmd/roachtest/tests/version.go @@ -13,7 +13,6 @@ package tests import ( "context" "fmt" - "runtime" "strings" "time" @@ -223,9 +222,6 @@ func registerVersion(r registry.Registry) { Owner: registry.OwnerTestEng, Cluster: r.MakeClusterSpec(n + 1), Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { - if c.IsLocal() && runtime.GOARCH == "arm64" { - t.Skip("Skip under ARM64. See https://github.com/cockroachdb/cockroach/issues/89268") - } pred, err := version.PredecessorVersion(*t.BuildVersion()) if err != nil { t.Fatal(err) diff --git a/pkg/cmd/roachtest/tests/versionupgrade.go b/pkg/cmd/roachtest/tests/versionupgrade.go index 1e3e96b7f29b..e1d272a9bc2f 100644 --- a/pkg/cmd/roachtest/tests/versionupgrade.go +++ b/pkg/cmd/roachtest/tests/versionupgrade.go @@ -98,9 +98,6 @@ DROP TABLE splitmerge.t; } func runVersionUpgrade(ctx context.Context, t test.Test, c cluster.Cluster) { - if c.IsLocal() && runtime.GOARCH == "arm64" { - t.Skip("Skip under ARM64. 
See https://github.com/cockroachdb/cockroach/issues/89268") - } c.Put(ctx, t.DeprecatedWorkload(), "./workload", c.All()) mvt := mixedversion.NewTest(ctx, t, t.L(), c, c.All()) mvt.OnStartup("setup schema changer workload", func(ctx context.Context, l *logger.Logger, r *rand.Rand, helper *mixedversion.Helper) error { diff --git a/pkg/roachprod/install/BUILD.bazel b/pkg/roachprod/install/BUILD.bazel index 94b01bf661e8..e33256a62493 100644 --- a/pkg/roachprod/install/BUILD.bazel +++ b/pkg/roachprod/install/BUILD.bazel @@ -57,6 +57,7 @@ go_test( embed = [":install"], deps = [ "//pkg/roachprod/logger", + "//pkg/roachprod/vm", "//pkg/testutils/datapathutils", "//pkg/util/retry", "@com_github_cockroachdb_datadriven//:datadriven", diff --git a/pkg/roachprod/install/staging.go b/pkg/roachprod/install/staging.go index 09c31a6dd2fa..7a4382479fe5 100644 --- a/pkg/roachprod/install/staging.go +++ b/pkg/roachprod/install/staging.go @@ -17,6 +17,7 @@ import ( "path/filepath" "github.com/cockroachdb/cockroach/pkg/roachprod/logger" + "github.com/cockroachdb/cockroach/pkg/roachprod/vm" "github.com/cockroachdb/errors" ) @@ -98,30 +99,30 @@ var ( ) // ArchInfoForOS returns an ArchInfo for the given OS and Architecture if currently supported. 
-func ArchInfoForOS(os string, arch string) (archInfo, error) { - if arch != "" && arch != "amd64" && arch != "arm64" && arch != "fips" { +func ArchInfoForOS(os string, arch vm.CPUArch) (archInfo, error) { + if arch != "" && arch != vm.ArchAMD64 && arch != vm.ArchARM64 && arch != vm.ArchFIPS { return archInfo{}, errors.Errorf("unsupported architecture %q", arch) } switch os { case "linux": - if arch == "arm64" { + if arch == vm.ArchARM64 { return linux_arm64_ArchInfo, nil } - if arch == "fips" { + if arch == vm.ArchFIPS { return linux_x86_64_fips_ArchInfo, nil } return linux_x86_64_ArchInfo, nil case "darwin": - if arch == "arm64" { + if arch == vm.ArchARM64 { return darwin_arm64_ArchInfo, nil } - if arch == "fips" { + if arch == vm.ArchFIPS { return archInfo{}, errors.Errorf("%q is not supported on %q", arch, os) } return darwin_x86_64_ArchInfo, nil case "windows": - if arch == "fips" || arch == "arm64" { + if arch == vm.ArchFIPS || arch == vm.ArchARM64 { return archInfo{}, errors.Errorf("%q is not supported on %q", arch, os) } return windowsArchInfo, nil @@ -176,7 +177,7 @@ func StageApplication( applicationName string, version string, os string, - arch string, + arch vm.CPUArch, destDir string, ) error { archInfo, err := ArchInfoForOS(os, arch) @@ -226,7 +227,7 @@ func StageApplication( // URLsForApplication returns a slice of URLs that should be // downloaded for the given application. 
func URLsForApplication( - application string, version string, os string, arch string, + application string, version string, os string, arch vm.CPUArch, ) ([]*url.URL, error) { archInfo, err := ArchInfoForOS(os, arch) if err != nil { diff --git a/pkg/roachprod/install/staging_test.go b/pkg/roachprod/install/staging_test.go index 3455df5fff79..977d755bf401 100644 --- a/pkg/roachprod/install/staging_test.go +++ b/pkg/roachprod/install/staging_test.go @@ -13,6 +13,7 @@ package install import ( "testing" + "github.com/cockroachdb/cockroach/pkg/roachprod/vm" "github.com/stretchr/testify/require" ) @@ -322,7 +323,7 @@ func TestURLsForApplication(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - got, err := URLsForApplication(tt.args.application, tt.args.version, tt.args.os, tt.args.arch) + got, err := URLsForApplication(tt.args.application, tt.args.version, tt.args.os, vm.CPUArch(tt.args.arch)) if (err != nil) != tt.wantErr { t.Errorf("URLsForApplication() error = %v, wantErr %v", err, tt.wantErr) return diff --git a/pkg/roachprod/prometheus/BUILD.bazel b/pkg/roachprod/prometheus/BUILD.bazel index 57e36a1b18e7..0c5f2b55df38 100644 --- a/pkg/roachprod/prometheus/BUILD.bazel +++ b/pkg/roachprod/prometheus/BUILD.bazel @@ -9,6 +9,7 @@ go_library( deps = [ "//pkg/roachprod/install", "//pkg/roachprod/logger", + "//pkg/roachprod/vm", "@com_github_cockroachdb_errors//:errors", "@com_github_prometheus_client_golang//api/prometheus/v1:prometheus", "@com_github_prometheus_common//model", diff --git a/pkg/roachprod/prometheus/prometheus.go b/pkg/roachprod/prometheus/prometheus.go index bc23e239b428..22a59af72a24 100644 --- a/pkg/roachprod/prometheus/prometheus.go +++ b/pkg/roachprod/prometheus/prometheus.go @@ -20,6 +20,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/roachprod/install" "github.com/cockroachdb/cockroach/pkg/roachprod/logger" + "github.com/cockroachdb/cockroach/pkg/roachprod/vm" "github.com/cockroachdb/errors" promv1 
"github.com/prometheus/client_golang/api/prometheus/v1" "github.com/prometheus/common/model" @@ -241,20 +242,25 @@ type Prometheus struct { // Init creates a prometheus instance on the given cluster. func Init( - ctx context.Context, l *logger.Logger, c *install.SyncedCluster, cfg Config, + ctx context.Context, l *logger.Logger, c *install.SyncedCluster, arch vm.CPUArch, cfg Config, ) (_ *Prometheus, _ error) { + binArch := "amd64" + if arch == vm.ArchARM64 { + binArch = "arm64" + } + if len(cfg.NodeExporter) > 0 { // NB: when upgrading here, make sure to target a version that picks up this PR: // https://github.com/prometheus/node_exporter/pull/2311 // At time of writing, there hasn't been a release in over half a year. if err := c.RepeatRun(ctx, l, l.Stdout, l.Stderr, cfg.NodeExporter, "download node exporter", - ` + fmt.Sprintf(` (sudo systemctl stop node_exporter || true) && rm -rf node_exporter && mkdir -p node_exporter && curl -fsSL \ - https://github.com/prometheus/node_exporter/releases/download/v1.2.2/node_exporter-1.2.2.linux-amd64.tar.gz | + https://storage.googleapis.com/cockroach-fixtures/prometheus/node_exporter-1.2.2.linux-%s.tar.gz | tar zxv --strip-components 1 -C node_exporter -`); err != nil { +`, binArch)); err != nil { return nil, err } @@ -287,9 +293,9 @@ sudo systemd-run --unit node_exporter --same-dir ./node_exporter`, l.Stderr, cfg.PrometheusNode, "download prometheus", - `sudo rm -rf /tmp/prometheus && mkdir /tmp/prometheus && cd /tmp/prometheus && - curl -fsSL https://storage.googleapis.com/cockroach-fixtures/prometheus/prometheus-2.27.1.linux-amd64.tar.gz | tar zxv --strip-components=1`, - ); err != nil { + fmt.Sprintf(`sudo rm -rf /tmp/prometheus && mkdir /tmp/prometheus && cd /tmp/prometheus && + curl -fsSL https://storage.googleapis.com/cockroach-fixtures/prometheus/prometheus-2.27.1.linux-%s.tar.gz | tar zxv --strip-components=1`, + binArch)); err != nil { return nil, err } // create and upload prom config @@ -333,14 +339,14 @@ sudo 
systemd-run --unit prometheus --same-dir \ if err := c.RepeatRun(ctx, l, l.Stdout, l.Stderr, cfg.PrometheusNode, "install grafana", - ` + fmt.Sprintf(` sudo apt-get install -qqy apt-transport-https && sudo apt-get install -qqy software-properties-common wget && sudo apt-get install -y adduser libfontconfig1 && -wget https://dl.grafana.com/enterprise/release/grafana-enterprise_9.2.3_amd64.deb -O grafana-enterprise_9.2.3_amd64.deb && -sudo dpkg -i grafana-enterprise_9.2.3_amd64.deb && +wget https://dl.grafana.com/enterprise/release/grafana-enterprise_9.2.3_%s.deb -O grafana-enterprise_9.2.3_%s.deb && +sudo dpkg -i grafana-enterprise_9.2.3_%s.deb && sudo mkdir -p /var/lib/grafana/dashboards`, - ); err != nil { + binArch, binArch, binArch)); err != nil { return nil, err } diff --git a/pkg/roachprod/roachprod.go b/pkg/roachprod/roachprod.go index 9b3353d816bb..be799c3696a8 100644 --- a/pkg/roachprod/roachprod.go +++ b/pkg/roachprod/roachprod.go @@ -538,7 +538,7 @@ func Stage( dir = stageDir } - return install.StageApplication(ctx, l, c, applicationName, version, os, arch, dir) + return install.StageApplication(ctx, l, c, applicationName, version, os, vm.CPUArch(arch), dir) } // Reset resets all VMs in a cluster. 
@@ -1413,7 +1413,7 @@ func StageURL( if stageArch != "" { arch = stageArch } - urls, err := install.URLsForApplication(applicationName, version, os, arch) + urls, err := install.URLsForApplication(applicationName, version, os, vm.CPUArch(arch)) if err != nil { return nil, err } @@ -1458,6 +1458,7 @@ func StartGrafana( ctx context.Context, l *logger.Logger, clusterName string, + arch vm.CPUArch, grafanaURL string, grafanaJSON []string, promCfg *prometheus.Config, // passed iff grafanaURL is empty @@ -1499,7 +1500,7 @@ func StartGrafana( promCfg.WithGrafanaDashboardJSON(str) } } - _, err = prometheus.Init(ctx, l, c, *promCfg) + _, err = prometheus.Init(ctx, l, c, arch, *promCfg) if err != nil { return err } diff --git a/pkg/roachprod/vm/aws/aws.go b/pkg/roachprod/vm/aws/aws.go index bd5f1376fbac..20cfa743cbc1 100644 --- a/pkg/roachprod/vm/aws/aws.go +++ b/pkg/roachprod/vm/aws/aws.go @@ -265,7 +265,9 @@ var defaultConfig = func() (cfg *awsConfig) { // cluster creation. If the geo flag is specified, nodes are distributed between // zones. var defaultCreateZones = []string{ - "us-east-2b", + // N.B. us-east-2a is the default zone for non-geo distributed clusters. It appears to have a higher on-demand + // capacity of c7g.8xlarge (graviton3) than us-east-2b. + "us-east-2a", "us-west-2b", "eu-west-2b", } @@ -456,12 +458,13 @@ func (p *Provider) Create( var g errgroup.Group limiter := rate.NewLimiter(rate.Limit(providerOpts.CreateRateLimit), 2 /* buckets */) for i := range names { + index := i capName := names[i] placement := zones[i] res := limiter.Reserve() g.Go(func() error { time.Sleep(res.Delay()) - return p.runInstance(l, capName, placement, opts, providerOpts) + return p.runInstance(l, capName, index, placement, opts, providerOpts) }) } if err := g.Wait(); err != nil { @@ -914,7 +917,12 @@ func (p *Provider) listRegion( // we need to do a bit of work to look up all of the various ids that // we need in order to actually allocate an instance. 
func (p *Provider) runInstance( - l *logger.Logger, name string, zone string, opts vm.CreateOpts, providerOpts *ProviderOpts, + l *logger.Logger, + name string, + instanceIdx int, + zone string, + opts vm.CreateOpts, + providerOpts *ProviderOpts, ) error { // There exist different flags to control the machine type when ssd is true. // This enables sane defaults for either setting but the behavior can be @@ -1005,7 +1013,7 @@ func (p *Provider) runInstance( extraMountOpts = "nobarrier" } } - filename, err := writeStartupScript(name, extraMountOpts, providerOpts.UseMultipleDisks, opts.EnableFIPS) + filename, err := writeStartupScript(name, extraMountOpts, providerOpts.UseMultipleDisks, opts.Arch == string(vm.ArchFIPS)) if err != nil { return errors.Wrapf(err, "could not write AWS startup script to temp file") } @@ -1021,14 +1029,22 @@ func (p *Provider) runInstance( } imageID := withFlagOverride(az.region.AMI_X86_64, &providerOpts.ImageAMI) useArmAMI := strings.Index(machineType, "6g.") == 1 || strings.Index(machineType, "7g.") == 1 + if useArmAMI && (opts.Arch != "" && opts.Arch != string(vm.ArchARM64)) { + return errors.Errorf("machine type %s is arm64, but requested arch is %s", machineType, opts.Arch) + } //TODO(srosenberg): remove this once we have a better way to detect ARM64 machines if useArmAMI { imageID = withFlagOverride(az.region.AMI_ARM64, &providerOpts.ImageAMI) - l.Printf("Using ARM64 AMI: %s for machine type: %s", imageID, machineType) + // N.B. use arbitrary instanceIdx to suppress the same info for every other instance being created. 
+ if instanceIdx == 0 { + l.Printf("Using ARM64 AMI: %s for machine type: %s", imageID, machineType) + } } - if !useArmAMI && opts.EnableFIPS { + if opts.Arch == string(vm.ArchFIPS) { imageID = withFlagOverride(az.region.AMI_FIPS, &providerOpts.ImageAMI) - l.Printf("Using FIPS-enabled AMI: %s for machine type: %s", imageID, machineType) + if instanceIdx == 0 { + l.Printf("Using FIPS-enabled AMI: %s for machine type: %s", imageID, machineType) + } } args := []string{ "ec2", "run-instances", diff --git a/pkg/roachprod/vm/gce/gcloud.go b/pkg/roachprod/vm/gce/gcloud.go index 3eeadacb6060..713cfd27e171 100644 --- a/pkg/roachprod/vm/gce/gcloud.go +++ b/pkg/roachprod/vm/gce/gcloud.go @@ -40,6 +40,7 @@ const ( // ProviderName is gce. ProviderName = "gce" DefaultImage = "ubuntu-2004-focal-v20210603" + ARM64Image = "ubuntu-2004-focal-arm64-v20230523" FIPSImage = "ubuntu-pro-fips-2004-focal-v20230302" defaultImageProject = "ubuntu-os-cloud" FIPSImageProject = "ubuntu-os-pro-cloud" @@ -884,10 +885,34 @@ func (p *Provider) Create( // Fixed args. 
image := providerOpts.Image imageProject := defaultImageProject - if opts.EnableFIPS { + useArmAMI := strings.HasPrefix(strings.ToLower(providerOpts.MachineType), "t2a-") + if useArmAMI && (opts.Arch != "" && opts.Arch != string(vm.ArchARM64)) { + return errors.Errorf("machine type %s is arm64, but requested arch is %s", providerOpts.MachineType, opts.Arch) + } + if useArmAMI && opts.SSDOpts.UseLocalSSD { + return errors.New("local SSDs are not supported with T2A instances, use --local-ssd=false") + } + if useArmAMI { + if len(providerOpts.Zones) == 0 { + zones = []string{"us-central1-a"} + } else { + for _, zone := range providerOpts.Zones { + if !strings.HasPrefix(zone, "us-central1-") { + return errors.New("T2A instances are not supported outside of us-central1") + } + } + } + } + //TODO(srosenberg): remove this once we have a better way to detect ARM64 machines + if useArmAMI { + image = ARM64Image + l.Printf("Using ARM64 AMI: %s for machine type: %s", image, providerOpts.MachineType) + } + if opts.Arch == string(vm.ArchFIPS) { // NB: if FIPS is enabled, it overrides the image passed via CLI (--gce-image) image = FIPSImage imageProject = FIPSImageProject + l.Printf("Using FIPS-enabled AMI: %s for machine type: %s", image, providerOpts.MachineType) } args := []string{ "compute", "instances", "create", @@ -958,7 +983,7 @@ func (p *Provider) Create( } // Create GCE startup script file. 
- filename, err := writeStartupScript(extraMountOpts, opts.SSDOpts.FileSystem, providerOpts.UseMultipleDisks, opts.EnableFIPS) + filename, err := writeStartupScript(extraMountOpts, opts.SSDOpts.FileSystem, providerOpts.UseMultipleDisks, opts.Arch == string(vm.ArchFIPS)) if err != nil { return errors.Wrapf(err, "could not write GCE startup script to temp file") } diff --git a/pkg/roachprod/vm/vm.go b/pkg/roachprod/vm/vm.go index bc28a72cc294..ce7fbdc41e2d 100644 --- a/pkg/roachprod/vm/vm.go +++ b/pkg/roachprod/vm/vm.go @@ -38,14 +38,23 @@ const ( // TagUsage indicates where a certain resource is used. "roachtest" is used // as the key for roachtest created resources. TagUsage = "usage" + // TagArch is the label key under which CreateOpts.Arch is recorded (see GetDefaultLabelMap). + TagArch = "arch" + + ArchARM64 = CPUArch("arm64") + ArchAMD64 = CPUArch("amd64") + ArchFIPS = CPUArch("fips") ) +type CPUArch string + // GetDefaultLabelMap returns a label map for a common set of labels. func GetDefaultLabelMap(opts CreateOpts) map[string]string { return map[string]string{ TagCluster: opts.ClusterName, TagLifetime: opts.Lifetime.String(), TagRoachprod: "true", + TagArch: opts.Arch, } } @@ -230,7 +239,7 @@ type CreateOpts struct { CustomLabels map[string]string GeoDistributed bool - EnableFIPS bool + Arch string VMProviders []string SSDOpts struct { UseLocalSSD bool @@ -251,7 +260,8 @@ func DefaultCreateOpts() CreateOpts { GeoDistributed: false, VMProviders: []string{}, OsVolumeSize: 10, - CustomLabels: map[string]string{"roachtest": "true"}, + // N.B. When roachprod is used via CLI, this will be overridden by {"roachprod":"true"}. 
+ CustomLabels: map[string]string{"roachtest": "true"}, } defaultCreateOpts.SSDOpts.UseLocalSSD = true defaultCreateOpts.SSDOpts.NoExt4Barrier = true diff --git a/pkg/util/randutil/rand.go b/pkg/util/randutil/rand.go index 2a0755fdb910..ad1543e80fc8 100644 --- a/pkg/util/randutil/rand.go +++ b/pkg/util/randutil/rand.go @@ -98,6 +98,12 @@ func NewPseudoRand() (*rand.Rand, int64) { return rand.New(rand.NewSource(seed)), seed } +// NewLockedPseudoRand is the same as NewPseudoRand, but the returned Rand uses a thread-safe underlying source (NewLockedSource). +func NewLockedPseudoRand() (*rand.Rand, int64) { + seed := envutil.EnvOrDefaultInt64("COCKROACH_RANDOM_SEED", NewPseudoSeed()) + return rand.New(NewLockedSource(seed)), seed +} + // NewTestRand returns an instance of math/rand.Rand seeded from rng, which is // seeded with the global seed. If the caller is a test with a different // path-qualified name than the previous caller, rng is reseeded from the global