From 40562757e2ecb0df01f884a36d50a10b43c50490 Mon Sep 17 00:00:00 2001
From: Michael Butler
Date: Tue, 20 Dec 2022 10:31:41 -0500
Subject: [PATCH] backupccl: introduce new restore roachtest framework

This patch introduces a new framework for writing restore roachtests that
minimizes code duplication and leverages our new backup fixture organization.
The framework makes it easy to write a new test using a variety of knobs,
such as:
- hardware: cloud provider, disk volume, # of nodes, # of cpus
- backup fixture: workload, workload scale

The patch is the first in an ongoing effort to redo our roachtests, and
introduces 3 new roachtests:
- restore/tpce/400GB: the default configuration: 4 nodes, 8 vcpus, 1000 GB
  EBS, restoring a tpce backup fixture (25,000 customers, around 400 GB).
- restore/tpce/400GB/gce: same config as above, run on gce.
- restore/tpce/8TB/nodes=10: the big one!

Notice that this patch also introduces a new naming convention for restore
tests. The default test is named `restore/tpce/400GB` and contains only the
basic workload. Every other test name contains the workload plus any specs
that deviate from the default config. For example, `restore/tpce/400GB/gce`
only switches the cloud provider and holds all other variables constant;
thus only the workload and 'gce' are needed in the name.

Future patches will add more tests that use this framework.

Informs #92699

Release note: None
---
 pkg/cmd/roachtest/tests/restore.go | 267 ++++++++++++++++++++++++++++-
 1 file changed, 266 insertions(+), 1 deletion(-)

diff --git a/pkg/cmd/roachtest/tests/restore.go b/pkg/cmd/roachtest/tests/restore.go
index 2810cc9d0c2f..b86c0313f1a5 100644
--- a/pkg/cmd/roachtest/tests/restore.go
+++ b/pkg/cmd/roachtest/tests/restore.go
@@ -412,8 +412,11 @@ func (tpccIncData) runRestoreDetached(
 }
 
 func registerRestore(r registry.Registry) {
+	// TODO(msbutler): delete the tests created by the loop below. Specifically:
+	// - restore2TB/nodes=10
+	// - restore2TB/nodes=32
+	// - restore2TB/nodes=6/cpus=8/pd-volume=2500GB
 	largeVolumeSize := 2500 // the size in GB of disks in large volume configs
-
 	for _, item := range []struct {
 		nodes int
 		cpus  int
@@ -660,6 +663,268 @@ func registerRestore(r registry.Registry) {
 			require.NotEqual(t, 3, maxPauses, "the job should have paused at least once")
 		},
 	})
+
+	for _, sp := range []restoreSpecs{
+		{
+			hardware: makeHardwareSpecs(hardwareSpecs{}),
+			backup:   makeBackupSpecs(backupSpecs{}),
+			timeout:  1 * time.Hour,
+		},
+		{
+			// Note that the default specs in makeHardwareSpecs() spin up restore
+			// tests in AWS, so this entry only overrides the cloud provider.
+			hardware: makeHardwareSpecs(hardwareSpecs{cloud: spec.GCE}),
+			backup:   makeBackupSpecs(backupSpecs{}),
+			timeout:  1 * time.Hour,
+		},
+		{
+			hardware: makeHardwareSpecs(hardwareSpecs{nodes: 10, volumeSize: 2000}),
+			backup: makeBackupSpecs(backupSpecs{
+				version:  "v22.2.1",
+				aost:     "'2023-01-05 20:45:00'",
+				workload: tpceRestore{customers: 500000}}),
+			timeout: 5 * time.Hour,
+		},
+		// TODO(msbutler): add the following tests once roachperf/grafana is
+		// hooked up and old tests are removed:
+		// - restore/tpce/400GB/nodes=10
+		// - restore/tpce/400GB/nodes=30
+		// - restore/tpce/400GB/cpu=16
+		// - restore/tpce/45TB/nodes=15/cpu=16/
+		// - restore/tpce/400GB/encryption
+	} {
+		sp := sp
+		clusterOpts := make([]spec.Option, 0)
+		clusterOpts = append(clusterOpts, spec.CPU(sp.hardware.cpus))
+		if sp.hardware.volumeSize != 0 {
+			clusterOpts = append(clusterOpts, spec.VolumeSize(sp.hardware.volumeSize))
+		}
+		r.Add(registry.TestSpec{
+			Name:    sp.computeName(false),
+			Owner:   registry.OwnerDisasterRecovery,
+			Cluster: r.MakeClusterSpec(sp.hardware.nodes, clusterOpts...),
+			Timeout: sp.timeout,
+			// These tests measure performance. To ensure consistent perf,
+			// disable metamorphic encryption.
+			EncryptionSupport: registry.EncryptionAlwaysDisabled,
+			Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
+
+				t.L().Printf("Full test specs: %s", sp.computeName(true))
+
+				if c.Spec().Cloud != sp.hardware.cloud {
+					t.Skipf("test configured to run on %s", sp.hardware.cloud)
+				}
+				c.Put(ctx, t.Cockroach(), "./cockroach")
+				c.Start(ctx, t.L(), option.DefaultStartOpts(), install.MakeClusterSettings())
+				m := c.NewMonitor(ctx)
+
+				// Run the disk usage logger in the monitor to guarantee it has
+				// terminated by the time the test ends.
+				dul := NewDiskUsageLogger(t, c)
+				m.Go(dul.Runner)
+				hc := NewHealthChecker(t, c, c.All())
+				m.Go(hc.Runner)
+
+				m.Go(func(ctx context.Context) error {
+					defer dul.Done()
+					defer hc.Done()
+					t.Status(`running restore`)
+					if err := sp.run(ctx, c); err != nil {
+						return err
+					}
+					return nil
+				})
+				m.Wait()
+			},
+		})
+	}
+}
+
+var defaultHardware = hardwareSpecs{
+	cloud:      spec.AWS,
+	cpus:       8,
+	nodes:      4,
+	volumeSize: 1000,
+}
+
+type hardwareSpecs struct {
+	// cloud is the cloud provider the test will run on.
+	cloud string
+
+	// cpus is the per-node cpu count.
+	cpus int
+
+	// nodes is the number of nodes in the restore.
+	nodes int
+
+	// volumeSize indicates the size of per-node block storage (pd-ssd for gce,
+	// ebs for aws). If zero, local SSDs are used.
+	volumeSize int
+}
+
+// String prints the hardware specs. If full==false, only non-default specs are printed.
+func (hw hardwareSpecs) String(full bool) string {
+	var builder strings.Builder
+	if full || hw.cloud != defaultHardware.cloud {
+		builder.WriteString("/" + hw.cloud)
+	}
+	if full || hw.nodes != defaultHardware.nodes {
+		builder.WriteString(fmt.Sprintf("/nodes=%d", hw.nodes))
+	}
+	if full || hw.cpus != defaultHardware.cpus {
+		builder.WriteString(fmt.Sprintf("/cpus=%d", hw.cpus))
+	}
+	if full {
+		builder.WriteString(fmt.Sprintf("/volSize=%dGB", hw.volumeSize))
+	}
+	return builder.String()
+}
+
+// makeHardwareSpecs instantiates hardware specs for a restore roachtest.
+// Any field left unset in override falls back to the default specs.
+func makeHardwareSpecs(override hardwareSpecs) hardwareSpecs {
+	specs := defaultHardware
+	if override.cloud != "" {
+		specs.cloud = override.cloud
+	}
+	if override.cpus != 0 {
+		specs.cpus = override.cpus
+	}
+	if override.nodes != 0 {
+		specs.nodes = override.nodes
+	}
+	if override.volumeSize != 0 {
+		specs.volumeSize = override.volumeSize
+	}
+	return specs
+}
+
+var defaultBackupSpecs = backupSpecs{
+	// TODO(msbutler): write a script that automatically finds the latest versioned fixture for
+	// the given spec and a reasonable aost.
+	version:          "v22.2.0",
+	backupProperties: "inc-count=48",
+	fullBackupDir:    "LATEST",
+
+	// Restore as of a system time that lands in the middle of the incremental
+	// chain (roughly the 24th of the 48 incremental backups).
+	aost:     "'2022-12-21 05:15:00'",
+	workload: tpceRestore{customers: 25000},
+}
+
+type backupSpecs struct {
+	// version specifies the crdb version the backup was taken on.
+	version string
+
+	// backupProperties identifies the fixture variant within the version
+	// directory, e.g. the incremental backup count ("inc-count=48").
+	backupProperties string
+
+	// fullBackupDir specifies the full backup directory in the collection to restore from.
+	fullBackupDir string
+
+	// aost specifies the as of system time to restore to.
+	aost string
+
+	// workload defines the backed up workload.
+	workload backupWorkload
+}
+
+// String returns a stringified version of the backup specs. If full is false,
+// default backupProperties are omitted. Note that the backup version, backup
+// directory, and AOST are never included.
+func (bs backupSpecs) String(full bool) string {
+	var builder strings.Builder
+	builder.WriteString("/" + bs.workload.String())
+
+	if full || bs.backupProperties != defaultBackupSpecs.backupProperties {
+		builder.WriteString("/" + bs.backupProperties)
+	}
+	return builder.String()
+}
+
+// makeBackupSpecs initializes the default backup specs. The caller can override
+// any of the default backup specs by setting the corresponding fields in
+// override to non-zero values.
+func makeBackupSpecs(override backupSpecs) backupSpecs { + specs := defaultBackupSpecs + if override.version != "" { + specs.version = override.version + } + + if override.backupProperties != "" { + specs.backupProperties = override.backupProperties + } + + if override.fullBackupDir != "" { + specs.fullBackupDir = override.fullBackupDir + } + + if override.aost != "" { + specs.aost = override.aost + } + + if override.workload != nil { + specs.workload = override.workload + } + return specs +} + +type backupWorkload interface { + fixtureDir() string + String() string +} + +type tpceRestore struct { + customers int +} + +func (tpce tpceRestore) fixtureDir() string { + return fmt.Sprintf(`tpc-e/customers=%d`, tpce.customers) +} + +func (tpce tpceRestore) String() string { + var builder strings.Builder + builder.WriteString("tpce/") + switch tpce.customers { + case 25000: + builder.WriteString("400GB") + case 500000: + builder.WriteString("8TB") + default: + panic("tpce customer count not recognized") + } + return builder.String() +} + +type restoreSpecs struct { + hardware hardwareSpecs + backup backupSpecs + timeout time.Duration +} + +func (sp restoreSpecs) computeName(full bool) string { + return "restore" + sp.backup.String(full) + sp.hardware.String(full) +} + +func (sp restoreSpecs) storagePrefix() string { + if sp.hardware.cloud == spec.AWS { + return "s3" + } + return "gs" +} + +func (sp restoreSpecs) backupDir() string { + return fmt.Sprintf(`'%s://cockroach-fixtures/backups/%s/%s/%s?AUTH=implicit'`, + sp.storagePrefix(), sp.backup.workload.fixtureDir(), sp.backup.version, sp.backup.backupProperties) +} + +func (sp restoreSpecs) restoreCmd() string { + return fmt.Sprintf(`./cockroach sql --insecure -e "RESTORE FROM %s IN %s AS OF SYSTEM TIME %s"`, + sp.backup.fullBackupDir, sp.backupDir(), sp.backup.aost) +} + +func (sp restoreSpecs) run(ctx context.Context, c cluster.Cluster) error { + if err := c.RunE(ctx, c.Node(1), sp.restoreCmd()); err != nil { + return errors.Wrapf(err, "full test specs: %s", sp.computeName(true)) + } + return nil } // verifyMetrics loops, retrieving the timeseries metrics specified in m every
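
As a usage sketch only (not part of the patch above): the planned
restore/tpce/400GB/nodes=10 variant from the TODO list would presumably be
expressed as one more entry in the restoreSpecs slice inside registerRestore.
Only the node count deviates from the defaults, so computeName(false) yields
"restore/tpce/400GB/nodes=10" under the naming convention described in the
commit message; the 1-hour timeout mirrors the other 400GB entries and is an
assumption here.

	{
		// Hypothetical entry, for illustration only: everything except the
		// node count comes from the default hardware and backup specs
		// (AWS, 8 vcpus, 1000 GB volumes, 25,000-customer tpce fixture).
		hardware: makeHardwareSpecs(hardwareSpecs{nodes: 10}),
		backup:   makeBackupSpecs(backupSpecs{}),
		timeout:  1 * time.Hour,
	},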