Skip to content

Commit

Permalink
roachtest: metamorphic ARM64 and FIPS clusters
Browse files Browse the repository at this point in the history
Previously, all roachtests used (cloud) machine types
with the AMD64 (cpu) architecture. Recently [1], new
CI infrastructure was added to run a clone of all the
nightly roachtests, configured with FIPS; i.e., same
AMD64 machine types, different AMI and crdb binary,
patched with FIPS-certified openssl native code.

As of this PR, we add the capability to execute any
roachtest in a cluster configured with one of
ARM64, FIPS, or AMD64 (default). This is controlled
via the two CLI args: `metamorphic-arm64-probability`
and `metamorphic-fips-probability`. The former denotes
the probability (over the uniform distribution) of a new
cluster being provisioned using ARM64 VMs. The latter denotes
the probability of a new AMD64 cluster being provisioned
with the FIPS-compliant (kernel) configuration.
In case a test is compatible only with AMD64, it's
effectively excluded from the set; i.e., both
probabilities apply to compatible tests only.

Note, the two probabilities don't have to add up to 1.
E.g., `metamorphic-arm64-probability==0.4`,
`metamorphic-fips-probability==0.2` denotes that ARM64
clusters are chosen ~40% of the time, whereas of the
remaining ~60% AMD64 clusters, FIPS is chosen ~20%
of the time; i.e., ~12% of all clusters will use FIPS.

Note, the values '0' and '1' are absolute. Setting both
to '0' is tantamount to the behavior before this PR.
Setting either to '1' ensures that _all_ clusters
are provisioned with ARM64 or FIPS, respectively.
A test can specify its required architecture, in which
case, it takes precedence over metamorphic settings.

This PR builds on [2], which enabled ARM64 provisioning
for AWS in roachprod. We add ARM64 provisioning for GCE,
i.e., T2A, as well as refactor 'arch' argument to
denote one of: AMD64, ARM64, FIPS, where the latter
isn't formally a CPU architecture; however, it simplifies
provisioning and binary staging.
We also modify roachprod.List to display CPU architecture,
other than AMD64, with the machine type; this should make it
easier to see which clusters are running ARM64 and FIPS
configurations, as we ramp up their testing.

The PR also adds validation to cockroach binaries and libs
to ensure we can execute tests under ARM64 and FIPS.
Furthermore, we add 'Enabled Assertions' header, generated
at build time, to the cockroach binary; the header is used
to validate whether or not the binary has runtime assertions
enabled.

Epic: none
Release note: None

Resolves: cockroachdb#94957
Resolves: cockroachdb#89268
Informs: cockroachdb#94986

[1] cockroachdb#99224
[2] cockroachdb#103243
  • Loading branch information
srosenberg committed May 31, 2023
1 parent f676a1a commit a25050c
Show file tree
Hide file tree
Showing 49 changed files with 705 additions and 303 deletions.
1 change: 1 addition & 0 deletions pkg/build/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ go_library(
"github.com/cockroachdb/cockroach/pkg/build.utcTime": "{BUILD_UTCTIME}",
},
deps = [
"//pkg/util/buildutil",
"//pkg/util/envutil",
"//pkg/util/version",
],
Expand Down
36 changes: 20 additions & 16 deletions pkg/build/info.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"text/tabwriter"
"time"

"github.com/cockroachdb/cockroach/pkg/util/buildutil"
"github.com/cockroachdb/cockroach/pkg/util/envutil"
"github.com/cockroachdb/cockroach/pkg/util/version"
)
Expand All @@ -37,10 +38,11 @@ var (
cgoTargetTriple string
platform = fmt.Sprintf("%s %s", runtime.GOOS, runtime.GOARCH)
// Distribution is changed by the CCL init-time hook in non-APL builds.
Distribution = "OSS"
typ string // Type of this build: <empty>, "development", or "release"
channel string
envChannel = envutil.EnvOrDefaultString("COCKROACH_CHANNEL", "unknown")
Distribution = "OSS"
typ string // Type of this build: <empty>, "development", or "release"
channel string
envChannel = envutil.EnvOrDefaultString("COCKROACH_CHANNEL", "unknown")
enabledAssertions = buildutil.CrdbTestBuild
//go:embed version.txt
cockroachVersion string
binaryVersion = computeBinaryVersion(cockroachVersion, rev)
Expand Down Expand Up @@ -127,7 +129,8 @@ func (b Info) Long() string {
fmt.Fprintf(tw, "Go Version: %s\n", b.GoVersion)
fmt.Fprintf(tw, "C Compiler: %s\n", b.CgoCompiler)
fmt.Fprintf(tw, "Build Commit ID: %s\n", b.Revision)
fmt.Fprintf(tw, "Build Type: %s", b.Type) // No final newline: cobra prints one for us.
fmt.Fprintf(tw, "Build Type: %s\n", b.Type)
fmt.Fprintf(tw, "Enabled Assertions: %t", b.EnabledAssertions) // No final newline: cobra prints one for us.
_ = tw.Flush()
return buf.String()
}
Expand Down Expand Up @@ -157,17 +160,18 @@ func GetInfo() Info {
ch = "unknown"
}
return Info{
GoVersion: runtime.Version(),
Tag: binaryVersion,
Time: utcTime,
Revision: rev,
CgoCompiler: cgoCompiler,
CgoTargetTriple: cgoTargetTriple,
Platform: platform,
Distribution: Distribution,
Type: typ,
Channel: ch,
EnvChannel: envChannel,
GoVersion: runtime.Version(),
Tag: binaryVersion,
Time: utcTime,
Revision: rev,
CgoCompiler: cgoCompiler,
CgoTargetTriple: cgoTargetTriple,
Platform: platform,
Distribution: Distribution,
Type: typ,
Channel: ch,
EnvChannel: envChannel,
EnabledAssertions: enabledAssertions,
}
}

Expand Down
2 changes: 2 additions & 0 deletions pkg/build/info.proto
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ message Info {
optional string channel = 9 [(gogoproto.nullable) = false];
// env_channel identifies the product channel as overridden by the COCKROACH_CHANNEL environment variable.
optional string env_channel = 11 [(gogoproto.nullable) = false];
// enabled_assertions returns the value of 'CrdbTestBuild' (true iff compiled with 'crdb_test' tag)
optional bool enabled_assertions = 12 [(gogoproto.nullable) = false];

// dependencies exists to allow tests that run against old clusters
// to unmarshal JSON containing this field. The tag is unimportant,
Expand Down
9 changes: 7 additions & 2 deletions pkg/cmd/roachprod/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ var (
extendLifetime time.Duration
wipePreserveCerts bool
grafanaConfig string
grafanaArch string
grafanaurlOpen bool
grafanaDumpDir string
listDetails bool
Expand Down Expand Up @@ -107,8 +108,9 @@ func initFlags() {
vm.AllProviderNames()))
createCmd.Flags().BoolVar(&createVMOpts.GeoDistributed,
"geo", false, "Create geo-distributed cluster")
createCmd.Flags().BoolVar(&createVMOpts.EnableFIPS,
"fips", false, "Enable FIPS mode (uses custom AMI)")
createCmd.Flags().StringVar(&createVMOpts.Arch, "arch", "",
"architecture override for VM [amd64, arm64, fips]; N.B. fips implies amd64 with openssl")

// N.B. We set "usage=roachprod" as the default, custom label for billing tracking.
createCmd.Flags().StringToStringVar(&createVMOpts.CustomLabels,
"label", map[string]string{"usage": "roachprod"},
Expand Down Expand Up @@ -258,6 +260,9 @@ Default is "RECURRING '*/15 * * * *' FULL BACKUP '@hourly' WITH SCHEDULE OPTIONS
grafanaStartCmd.Flags().StringVar(&grafanaConfig,
"grafana-config", "", "URI to grafana json config, supports local and http(s) schemes")

grafanaStartCmd.Flags().StringVar(&grafanaArch, "arch", "",
"binary architecture override [amd64, arm64]")

grafanaURLCmd.Flags().BoolVar(&grafanaurlOpen,
"open", false, "open the grafana dashboard url on the browser")

Expand Down
27 changes: 19 additions & 8 deletions pkg/cmd/roachprod/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,14 @@ hosts file.
return err
}
} else {
machineType := func(clusterVMs vm.List) string {
res := clusterVMs[0].MachineType
// Display CPU architecture, other than amd64 (default).
if arch := clusterVMs[0].Labels["arch"]; arch != "" && arch != string(vm.ArchAMD64) {
res += fmt.Sprintf(" [%s]", arch)
}
return res
}
// Align columns left and separate with at least two spaces.
tw := tabwriter.NewWriter(os.Stdout, 0, 8, 2, ' ', tabwriter.AlignRight)
// N.B. colors use escape codes which don't play nice with tabwriter [1].
Expand Down Expand Up @@ -304,7 +312,7 @@ hosts file.
// N.B. Tabwriter doesn't support per-column alignment. It looks odd to have the cluster names right-aligned,
// so we make it left-aligned.
fmt.Fprintf(tw, "%s\t%s\t%d\t%s", name+strings.Repeat(" ", maxClusterName-len(name)), c.Clouds(),
len(c.VMs), c.VMs[0].MachineType)
len(c.VMs), machineType(c.VMs))
if !c.IsLocal() {
colorByCostBucket := func(cost float64) func(string, ...interface{}) string {
switch {
Expand Down Expand Up @@ -987,7 +995,7 @@ var getProvidersCmd = &cobra.Command{

var grafanaStartCmd = &cobra.Command{
Use: `grafana-start <cluster>`,
Short: `spins up a prometheus and grafana instance on the last node in the cluster`,
Short: `spins up a prometheus and grafana instance on the last node in the cluster; NOTE: for arm64 clusters, use --arch arm64`,
Args: cobra.ExactArgs(1),
Run: wrap(func(cmd *cobra.Command, args []string) error {
var grafanaDashboardJSONs []string
Expand Down Expand Up @@ -1015,8 +1023,11 @@ var grafanaStartCmd = &cobra.Command{
return err
}
}

return roachprod.StartGrafana(context.Background(), config.Logger, args[0],
arch := vm.ArchAMD64
if grafanaArch == "arm64" {
arch = vm.ArchARM64
}
return roachprod.StartGrafana(context.Background(), config.Logger, args[0], arch,
grafanaConfigURL, grafanaDashboardJSONs, nil)
}),
}
Expand Down Expand Up @@ -1271,14 +1282,14 @@ func validateAndConfigure(cmd *cobra.Command, args []string) {

// Validate architecture flag, if set.
if archOpt := cmd.Flags().Lookup("arch"); archOpt != nil && archOpt.Changed {
arch := strings.ToLower(archOpt.Value.String())
arch := vm.CPUArch(strings.ToLower(archOpt.Value.String()))

if arch != "amd64" && arch != "arm64" && arch != "fips" {
if arch != vm.ArchAMD64 && arch != vm.ArchARM64 && arch != vm.ArchFIPS {
printErrAndExit(fmt.Errorf("unsupported architecture %q", arch))
}
if arch != archOpt.Value.String() {
if string(arch) != archOpt.Value.String() {
// Set the canonical value.
_ = cmd.Flags().Set("arch", arch)
_ = cmd.Flags().Set("arch", string(arch))
}
}
}
Expand Down
Loading

0 comments on commit a25050c

Please sign in to comment.