backupccl: introduce new restore roachtest framework
This patch introduces a new framework for writing restore roachtests that
minimizes code duplication and leverages our new backup fixture organization.
The framework makes it easy to write a new test using a variety of knobs
(a minimal sketch follows the list below), such as:
- hardware: cloud provider, disk volume, # of nodes, # of cpus
- backup fixture: workload, workload scale
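
As a rough sketch (using the restoreSpecs, makeHardwareSpecs, and
makeBackupSpecs helpers added in this patch; the values mirror one of the new
tests below), registering a test only requires overriding the knobs that
deviate from the defaults:

```go
sp := restoreSpecs{
    // Defaults: AWS, 4 nodes, 8 vCPUs, 1000 GB volumes, tpce 25,000 customers.
    hardware: makeHardwareSpecs(hardwareSpecs{nodes: 10, volumeSize: 2000}),
    backup: makeBackupSpecs(backupSpecs{
        workload: tpceRestore{customers: 500000},
    }),
    timeout: 5 * time.Hour,
}
```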

The patch is the first in an ongoing effort to redo our restore roachtests,
and introduces 3 new roachtests:
- restore/tpce/400GB: the default configuration (4 nodes, 8 vCPUs, 1000 GB EBS
  volumes), restoring a tpce backup fixture (25,000 customers, around 400 GB).
- restore/tpce/400GB/gce: the same config as above, but run on GCE.
- restore/tpce/8TB/nodes=10: the big one: a 10 node cluster restoring an 8 TB
  tpce fixture (500,000 customers).

Note that this patch also introduces a new naming convention for restore
tests. The default test is named `restore/tpce/400GB` and its name contains
only the workload. Every other test name contains the workload plus any specs
that deviate from the default config. For example, `restore/tpce/400GB/gce`
only switches the cloud provider and holds all other variables constant; thus
only the workload and 'gce' appear in the name. A sketch of how these names
are derived follows.
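
For illustration, here is a rough sketch (hand-derived from the computeName,
hardwareSpecs.String, and backupSpecs.String helpers added in this diff, not
output from a run) of how the three test names above are composed:

```go
// computeName(false) prints "restore" + backup.String(false) + hardware.String(false),
// where each String(false) call only emits the specs that deviate from the defaults:
//
//    all defaults                         -> "restore/tpce/400GB"
//    hardware override cloud: spec.GCE    -> "restore/tpce/400GB/gce"
//    workload 500k customers, 10 nodes    -> "restore/tpce/8TB/nodes=10"
//
// computeName(true) logs the full config instead, e.g.
// "restore/tpce/400GB/inc-count=48/aws/nodes=4/cpus=8/volSize=1000GB" for the default test.
```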

Future patches will add more tests that use this framework.

Informs #92699

Release note: None

msbutler committed Jan 6, 2023
1 parent 7317b25 commit 4056275
pkg/cmd/roachtest/tests/restore.go: 266 additions, 1 deletion
@@ -412,8 +412,11 @@ func (tpccIncData) runRestoreDetached(
}

func registerRestore(r registry.Registry) {
    // TODO(msbutler): delete the tests created by the loop below. Specifically
    // - restore2TB/nodes=10
    // - restore2TB/nodes=32
    // - restore2TB/nodes=6/cpus=8/pd-volume=2500GB
    largeVolumeSize := 2500 // the size in GB of disks in large volume configs

    for _, item := range []struct {
        nodes int
        cpus  int
@@ -660,6 +663,268 @@ func registerRestore(r registry.Registry) {
            require.NotEqual(t, 3, maxPauses, "the job should have paused at least once")
        },
    })

    for _, sp := range []restoreSpecs{
        {
            hardware: makeHardwareSpecs(hardwareSpecs{}),
            backup:   makeBackupSpecs(backupSpecs{}),
            timeout:  1 * time.Hour,
        },
        {
            // Note that the default specs in makeHardwareSpecs() spin up
            // restore tests on AWS.
            hardware: makeHardwareSpecs(hardwareSpecs{cloud: spec.GCE}),
            backup:   makeBackupSpecs(backupSpecs{}),
            timeout:  1 * time.Hour,
        },
        {
            hardware: makeHardwareSpecs(hardwareSpecs{nodes: 10, volumeSize: 2000}),
            backup: makeBackupSpecs(backupSpecs{
                version:  "v22.2.1",
                aost:     "'2023-01-05 20:45:00'",
                workload: tpceRestore{customers: 500000}}),
            timeout: 5 * time.Hour,
        },
        // TODO(msbutler): add the following tests once roachperf/grafana is
        // hooked up and old tests are removed:
        // - restore/tpce/400GB/nodes=10
        // - restore/tpce/400GB/nodes=30
        // - restore/tpce/400GB/cpu=16
        // - restore/tpce/45TB/nodes=15/cpu=16/
        // - restore/tpce/400GB/encryption
    } {
        sp := sp
        clusterOpts := make([]spec.Option, 0)
        clusterOpts = append(clusterOpts, spec.CPU(sp.hardware.cpus))
        if sp.hardware.volumeSize != 0 {
            clusterOpts = append(clusterOpts, spec.VolumeSize(sp.hardware.volumeSize))
        }
        r.Add(registry.TestSpec{
            Name:    sp.computeName(false),
            Owner:   registry.OwnerDisasterRecovery,
            Cluster: r.MakeClusterSpec(sp.hardware.nodes, clusterOpts...),
            Timeout: sp.timeout,
            // These tests measure performance. To ensure consistent perf,
            // disable metamorphic encryption.
            EncryptionSupport: registry.EncryptionAlwaysDisabled,
            Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {

                t.L().Printf("Full test specs: %s", sp.computeName(true))

                if c.Spec().Cloud != sp.hardware.cloud {
                    t.Skip("test configured to run on %s", sp.hardware.cloud)
                }
                c.Put(ctx, t.Cockroach(), "./cockroach")
                c.Start(ctx, t.L(), option.DefaultStartOpts(), install.MakeClusterSettings())
                m := c.NewMonitor(ctx)

                // Run the disk usage logger in the monitor to guarantee it
                // has terminated when the test ends.
                dul := NewDiskUsageLogger(t, c)
                m.Go(dul.Runner)
                hc := NewHealthChecker(t, c, c.All())
                m.Go(hc.Runner)

                m.Go(func(ctx context.Context) error {
                    defer dul.Done()
                    defer hc.Done()
                    t.Status(`running restore`)
                    if err := sp.run(ctx, c); err != nil {
                        return err
                    }
                    return nil
                })
                m.Wait()
            },
        })
    }
}

var defaultHardware = hardwareSpecs{
    cloud:      spec.AWS,
    cpus:       8,
    nodes:      4,
    volumeSize: 1000,
}

type hardwareSpecs struct {
    // cloud is the cloud provider the test will run on.
    cloud string

    // cpus is the per node cpu count.
    cpus int

    // nodes is the number of nodes in the restore.
    nodes int

    // volumeSize indicates the size of per node block storage (pd-ssd for
    // gce, ebs for aws). If zero, local SSDs are used.
    volumeSize int
}

// String prints the hardware specs. If full==false, only non default specs are printed.
func (hw hardwareSpecs) String(full bool) string {
    var builder strings.Builder
    if full || hw.cloud != defaultHardware.cloud {
        builder.WriteString("/" + hw.cloud)
    }
    if full || hw.nodes != defaultHardware.nodes {
        builder.WriteString(fmt.Sprintf("/nodes=%d", hw.nodes))
    }
    if full || hw.cpus != defaultHardware.cpus {
        builder.WriteString(fmt.Sprintf("/cpus=%d", hw.cpus))
    }
    if full {
        builder.WriteString(fmt.Sprintf("/volSize=%dGB", hw.volumeSize))
    }
    return builder.String()
}
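
// As a rough illustration (hand-derived from defaultHardware above, not output
// from a run): the default specs stringify to "" with full==false and to
// "/aws/nodes=4/cpus=8/volSize=1000GB" with full==true, while a spec that only
// overrides the cloud to GCE stringifies to "/gce" with full==false.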

// makeHardwareSpecs instantiates hardware specs for a restore roachtest.
// Unless the caller provides any explicit specs, the default specs are used.
func makeHardwareSpecs(override hardwareSpecs) hardwareSpecs {
    specs := defaultHardware
    if override.cloud != "" {
        specs.cloud = override.cloud
    }
    if override.cpus != 0 {
        specs.cpus = override.cpus
    }
    if override.nodes != 0 {
        specs.nodes = override.nodes
    }
    if override.volumeSize != 0 {
        specs.volumeSize = override.volumeSize
    }
    return specs
}

var defaultBackupSpecs = backupSpecs{
    // TODO(msbutler): write a script that automatically finds the latest versioned fixture for
    // the given spec and a reasonable aost.
    version:          "v22.2.0",
    backupProperties: "inc-count=48",
    fullBackupDir:    "LATEST",

    // restore as of a system time that lands on the 24th incremental backup in the chain.
    aost:     "'2022-12-21 05:15:00'",
    workload: tpceRestore{customers: 25000},
}

type backupSpecs struct {
    // version specifies the crdb version the backup was taken on.
    version string

    // backupProperties identifies the backup fixture variant within the
    // version directory (e.g. the incremental backup count, "inc-count=48").
    backupProperties string

    // fullBackupDir specifies the full backup directory in the collection to restore from.
    fullBackupDir string

    // aost specifies the as of system time to restore to.
    aost string

    // workload defines the backed up workload.
    workload backupWorkload
}

// String returns a stringified version of the backup specs. If full is false,
// default backupProperties are omitted. Note that the backup version, backup
// directory, and AOST are never included.
func (bs backupSpecs) String(full bool) string {
    var builder strings.Builder
    builder.WriteString("/" + bs.workload.String())

    if full || bs.backupProperties != defaultBackupSpecs.backupProperties {
        builder.WriteString("/" + bs.backupProperties)
    }
    return builder.String()
}

// makeBackupSpecs initializes the default backup specs. The caller can override
// any of the defaults by setting the corresponding non-zero field in override.
func makeBackupSpecs(override backupSpecs) backupSpecs {
    specs := defaultBackupSpecs
    if override.version != "" {
        specs.version = override.version
    }

    if override.backupProperties != "" {
        specs.backupProperties = override.backupProperties
    }

    if override.fullBackupDir != "" {
        specs.fullBackupDir = override.fullBackupDir
    }

    if override.aost != "" {
        specs.aost = override.aost
    }

    if override.workload != nil {
        specs.workload = override.workload
    }
    return specs
}

type backupWorkload interface {
    fixtureDir() string
    String() string
}

type tpceRestore struct {
    customers int
}

func (tpce tpceRestore) fixtureDir() string {
    return fmt.Sprintf(`tpc-e/customers=%d`, tpce.customers)
}

func (tpce tpceRestore) String() string {
    var builder strings.Builder
    builder.WriteString("tpce/")
    switch tpce.customers {
    case 25000:
        builder.WriteString("400GB")
    case 500000:
        builder.WriteString("8TB")
    default:
        panic("tpce customer count not recognized")
    }
    return builder.String()
}

type restoreSpecs struct {
    hardware hardwareSpecs
    backup   backupSpecs
    timeout  time.Duration
}

func (sp restoreSpecs) computeName(full bool) string {
    return "restore" + sp.backup.String(full) + sp.hardware.String(full)
}

func (sp restoreSpecs) storagePrefix() string {
    if sp.hardware.cloud == spec.AWS {
        return "s3"
    }
    return "gs"
}

func (sp restoreSpecs) backupDir() string {
    return fmt.Sprintf(`'%s://cockroach-fixtures/backups/%s/%s/%s?AUTH=implicit'`,
        sp.storagePrefix(), sp.backup.workload.fixtureDir(), sp.backup.version, sp.backup.backupProperties)
}

func (sp restoreSpecs) restoreCmd() string {
    return fmt.Sprintf(`./cockroach sql --insecure -e "RESTORE FROM %s IN %s AS OF SYSTEM TIME %s"`,
        sp.backup.fullBackupDir, sp.backupDir(), sp.backup.aost)
}
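
// For reference, with the default specs this roughly expands to the following
// (hand-composed from defaultBackupSpecs and defaultHardware, wrapped here for
// readability, and not captured from an actual run):
//
//    ./cockroach sql --insecure -e "RESTORE FROM LATEST IN
//    's3://cockroach-fixtures/backups/tpc-e/customers=25000/v22.2.0/inc-count=48?AUTH=implicit'
//    AS OF SYSTEM TIME '2022-12-21 05:15:00'"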

func (sp restoreSpecs) run(ctx context.Context, c cluster.Cluster) error {
    if err := c.RunE(ctx, c.Node(1), sp.restoreCmd()); err != nil {
        return errors.Wrapf(err, "full test specs: %s", sp.computeName(true))
    }
    return nil
}

// verifyMetrics loops, retrieving the timeseries metrics specified in m every