Skip to content

Commit

Permalink
Merge #103710
Browse files Browse the repository at this point in the history
103710: roachtest: metamorphic ARM64 and FIPS clusters r=smg260,herkolategan a=srosenberg

Previously, all roachtests used (cloud) machine types with the AMD64 (cpu) architecture. Recently [1], new CI infrastructure was added to run a clone of all the nightly roachtests, configured with FIPS; i.e., same AMD64 machine types, different AMI and crdb binary, patched with FIPS-certified openssl native code.

As of this PR, we add the capability to execute any roachtest in a cluster, configured with either
ARM64, FIPS, or AMD64 (default). This is controlled via the two CLI args: `metamorphic-arm64-probability` and `metamorphic-fips-probability`. The former denotes the probability (over the uniform distribution) of a new cluster provisioned using ARM64 VMs. The latter denotes the probability of a new AMD64 cluster provisioned with the FIPS-compliant (kernel) configuration.
In case a test is compatible only with AMD64, it's effectively excluded from the set; i.e., both
probabilities apply to compatible tests only.

Note, the two probabilties don't have to add up to 1. E.g., `metamorphic-arm64-probability==0.4`,
`metamorphic-fips-probability==0.2` denotes that ARM64 clusters are chosen ~40% of the time, whereas of the remaining ~60% AMD clusters, FIPS is chosen ~20%
of the time; i.e., ~12% of all clusters will use FIPS.

Note, the values '0' and '1' are absolute. Setting both 
to '0' is tantamount to the behavior before this PR.
Setting either to '1' enforces _all_ clusters 
are provisioned with either ARM64 or FIPS.
A test can specify its required architecture, in which 
case, it takes precedence over metamorphic settings.

This PR builds on [1], which enabled ARM64 provisioning for AWS in roachprod. We add ARM64 provisioning for GCE, i.e., T2A, as well as refactor 'arch' argument to
denote one of: AMD64, ARM64, FIPS, where the latter isn't formally a CPU architecture; however, it simplifies provisioning and binary staging.
We also modify roachprod.List to display CPU architecture, other than AMD64, with the machine type; this should make it easier to see which clusters are running ARM64 and FIPS configurations, as we ramp up their testing.

Epic: none
Release note: None

Resolves: #94957
Informs: #94986

[1] #99224
[2] #103243

Co-authored-by: Stan Rosenberg <[email protected]>
  • Loading branch information
craig[bot] and srosenberg committed Jun 2, 2023
2 parents 9e9cdee + beb35aa commit 4ac97a8
Show file tree
Hide file tree
Showing 70 changed files with 917 additions and 577 deletions.
1 change: 1 addition & 0 deletions pkg/build/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ go_library(
"github.com/cockroachdb/cockroach/pkg/build.utcTime": "{BUILD_UTCTIME}",
},
deps = [
"//pkg/util/buildutil",
"//pkg/util/envutil",
"//pkg/util/version",
],
Expand Down
36 changes: 20 additions & 16 deletions pkg/build/info.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"text/tabwriter"
"time"

"github.com/cockroachdb/cockroach/pkg/util/buildutil"
"github.com/cockroachdb/cockroach/pkg/util/envutil"
"github.com/cockroachdb/cockroach/pkg/util/version"
)
Expand All @@ -37,10 +38,11 @@ var (
cgoTargetTriple string
platform = fmt.Sprintf("%s %s", runtime.GOOS, runtime.GOARCH)
// Distribution is changed by the CCL init-time hook in non-APL builds.
Distribution = "OSS"
typ string // Type of this build: <empty>, "development", or "release"
channel string
envChannel = envutil.EnvOrDefaultString("COCKROACH_CHANNEL", "unknown")
Distribution = "OSS"
typ string // Type of this build: <empty>, "development", or "release"
channel string
envChannel = envutil.EnvOrDefaultString("COCKROACH_CHANNEL", "unknown")
enabledAssertions = buildutil.CrdbTestBuild
//go:embed version.txt
cockroachVersion string
binaryVersion = computeBinaryVersion(cockroachVersion, rev)
Expand Down Expand Up @@ -127,7 +129,8 @@ func (b Info) Long() string {
fmt.Fprintf(tw, "Go Version: %s\n", b.GoVersion)
fmt.Fprintf(tw, "C Compiler: %s\n", b.CgoCompiler)
fmt.Fprintf(tw, "Build Commit ID: %s\n", b.Revision)
fmt.Fprintf(tw, "Build Type: %s", b.Type) // No final newline: cobra prints one for us.
fmt.Fprintf(tw, "Build Type: %s\n", b.Type)
fmt.Fprintf(tw, "Enabled Assertions: %t", b.EnabledAssertions) // No final newline: cobra prints one for us.
_ = tw.Flush()
return buf.String()
}
Expand Down Expand Up @@ -157,17 +160,18 @@ func GetInfo() Info {
ch = "unknown"
}
return Info{
GoVersion: runtime.Version(),
Tag: binaryVersion,
Time: utcTime,
Revision: rev,
CgoCompiler: cgoCompiler,
CgoTargetTriple: cgoTargetTriple,
Platform: platform,
Distribution: Distribution,
Type: typ,
Channel: ch,
EnvChannel: envChannel,
GoVersion: runtime.Version(),
Tag: binaryVersion,
Time: utcTime,
Revision: rev,
CgoCompiler: cgoCompiler,
CgoTargetTriple: cgoTargetTriple,
Platform: platform,
Distribution: Distribution,
Type: typ,
Channel: ch,
EnvChannel: envChannel,
EnabledAssertions: enabledAssertions,
}
}

Expand Down
2 changes: 2 additions & 0 deletions pkg/build/info.proto
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ message Info {
optional string channel = 9 [(gogoproto.nullable) = false];
// env_channel identifies the product channel as overridden by the COCKROACH_CHANNEL environment variable.
optional string env_channel = 11 [(gogoproto.nullable) = false];
// enabled_assertions returns the value of 'CrdbTestBuild' (true iff compiled with 'crdb_test' tag)
optional bool enabled_assertions = 12 [(gogoproto.nullable) = false];

// dependencies exists to allow tests that run against old clusters
// to unmarshal JSON containing this field. The tag is unimportant,
Expand Down
9 changes: 7 additions & 2 deletions pkg/cmd/roachprod/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ var (
extendLifetime time.Duration
wipePreserveCerts bool
grafanaConfig string
grafanaArch string
grafanaurlOpen bool
grafanaDumpDir string
listDetails bool
Expand Down Expand Up @@ -109,8 +110,9 @@ func initFlags() {
vm.AllProviderNames()))
createCmd.Flags().BoolVar(&createVMOpts.GeoDistributed,
"geo", false, "Create geo-distributed cluster")
createCmd.Flags().BoolVar(&createVMOpts.EnableFIPS,
"fips", false, "Enable FIPS mode (uses custom AMI)")
createCmd.Flags().StringVar(&createVMOpts.Arch, "arch", "",
"architecture override for VM [amd64, arm64, fips]; N.B. fips implies amd64 with openssl")

// N.B. We set "usage=roachprod" as the default, custom label for billing tracking.
createCmd.Flags().StringToStringVar(&createVMOpts.CustomLabels,
"label", map[string]string{"usage": "roachprod"},
Expand Down Expand Up @@ -260,6 +262,9 @@ Default is "RECURRING '*/15 * * * *' FULL BACKUP '@hourly' WITH SCHEDULE OPTIONS
grafanaStartCmd.Flags().StringVar(&grafanaConfig,
"grafana-config", "", "URI to grafana json config, supports local and http(s) schemes")

grafanaStartCmd.Flags().StringVar(&grafanaArch, "arch", "",
"binary architecture override [amd64, arm64]")

grafanaURLCmd.Flags().BoolVar(&grafanaurlOpen,
"open", false, "open the grafana dashboard url on the browser")

Expand Down
27 changes: 19 additions & 8 deletions pkg/cmd/roachprod/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,14 @@ hosts file.
return err
}
} else {
machineType := func(clusterVMs vm.List) string {
res := clusterVMs[0].MachineType
// Display CPU architecture, other than amd64 (default).
if arch := clusterVMs[0].Labels["arch"]; arch != "" && arch != string(vm.ArchAMD64) {
res += fmt.Sprintf(" [%s]", arch)
}
return res
}
// Align columns left and separate with at least two spaces.
tw := tabwriter.NewWriter(os.Stdout, 0, 8, 2, ' ', tabwriter.AlignRight)
// N.B. colors use escape codes which don't play nice with tabwriter [1].
Expand Down Expand Up @@ -305,7 +313,7 @@ hosts file.
// N.B. Tabwriter doesn't support per-column alignment. It looks odd to have the cluster names right-aligned,
// so we make it left-aligned.
fmt.Fprintf(tw, "%s\t%s\t%d\t%s", name+strings.Repeat(" ", maxClusterName-len(name)), c.Clouds(),
len(c.VMs), c.VMs[0].MachineType)
len(c.VMs), machineType(c.VMs))
if !c.IsLocal() {
colorByCostBucket := func(cost float64) func(string, ...interface{}) string {
switch {
Expand Down Expand Up @@ -988,7 +996,7 @@ var getProvidersCmd = &cobra.Command{

var grafanaStartCmd = &cobra.Command{
Use: `grafana-start <cluster>`,
Short: `spins up a prometheus and grafana instance on the last node in the cluster`,
Short: `spins up a prometheus and grafana instance on the last node in the cluster; NOTE: for arm64 clusters, use --arch arm64`,
Args: cobra.ExactArgs(1),
Run: wrap(func(cmd *cobra.Command, args []string) error {
var grafanaDashboardJSONs []string
Expand Down Expand Up @@ -1016,8 +1024,11 @@ var grafanaStartCmd = &cobra.Command{
return err
}
}

return roachprod.StartGrafana(context.Background(), config.Logger, args[0],
arch := vm.ArchAMD64
if grafanaArch == "arm64" {
arch = vm.ArchARM64
}
return roachprod.StartGrafana(context.Background(), config.Logger, args[0], arch,
grafanaConfigURL, grafanaDashboardJSONs, nil)
}),
}
Expand Down Expand Up @@ -1272,14 +1283,14 @@ func validateAndConfigure(cmd *cobra.Command, args []string) {

// Validate architecture flag, if set.
if archOpt := cmd.Flags().Lookup("arch"); archOpt != nil && archOpt.Changed {
arch := strings.ToLower(archOpt.Value.String())
arch := vm.CPUArch(strings.ToLower(archOpt.Value.String()))

if arch != "amd64" && arch != "arm64" && arch != "fips" {
if arch != vm.ArchAMD64 && arch != vm.ArchARM64 && arch != vm.ArchFIPS {
printErrAndExit(fmt.Errorf("unsupported architecture %q", arch))
}
if arch != archOpt.Value.String() {
if string(arch) != archOpt.Value.String() {
// Set the canonical value.
_ = cmd.Flags().Set("arch", arch)
_ = cmd.Flags().Set("arch", string(arch))
}
}
}
Expand Down
Loading

0 comments on commit 4ac97a8

Please sign in to comment.