backupccl: introduce new restore roachtest framework
This patch introduces a new framework for writing restore roachtests that
minimizes code duplication and leverages our new backup fixture organization.
The framework makes it easy to write a new test using a variety of knobs:
- hardware: cloud provider, disk volume, # of nodes, # of cpus
- backup fixture: workload, workload scale

The patch is the first in an ongoing effort to redo our restore roachtests,
and introduces two new roachtests:
- restore/nodes=4: the default configuration (4 nodes, 8 vCPUs, 1000 GB EBS
  volumes), which restores a tpce backup fixture (25,000 customers, around
  400 GB).
- restore/gce: same config as above, but run on GCE.

Note that this patch also introduces a new naming convention for restore
tests. The default test is named `restore/nodes=4`, and each test that
deviates from the default config highlights the deviation in its name. For
example, `restore/gce` only switches the cloud provider and holds all other
variables constant; thus only 'gce' is needed in the name.
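
As an illustration (the exact spec below is not part of this patch), a new
test that deviates only in node count would be registered with the framework
roughly as:

    {
        name:     "restore/nodes=10",
        hardware: makeHardwareSpecs(hardwareSpecs{nodes: 10}),
        backup:   makeBackupSpecs(backupSpecs{}),
        timeout:  2 * time.Hour,
    },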

Future patches will add more tests that use this framework.

Informs cockroachdb#92699

Release note: None
msbutler committed Jan 5, 2023
1 parent 9317a5f commit 7242ed3
Showing 1 changed file with 216 additions and 1 deletion.
pkg/cmd/roachtest/tests/restore.go
@@ -412,8 +412,11 @@ func (tpccIncData) runRestoreDetached(
}

func registerRestore(r registry.Registry) {
// TODO(msbutler): delete the tests created by the loop below. Specifically
// - restore2TB/nodes=10
// - restore2TB/nodes=32
// - restore2TB/nodes=6/cpus=8/pd-volume=2500GB
largeVolumeSize := 2500 // the size in GB of disks in large volume configs

for _, item := range []struct {
nodes int
cpus int
@@ -660,6 +663,218 @@ func registerRestore(r registry.Registry) {
require.NotEqual(t, 3, maxPauses, "the job should have paused at least once")
},
})

for _, sp := range []restoreSpecs{
{
name: "restore/nodes=4",
hardware: makeHardwareSpecs(hardwareSpecs{}),
backup: makeBackupSpecs(backupSpecs{}),
timeout: 2 * time.Hour,
},
{
name: "restore/gce",
// The default specs in makeHardwareSpecs() spin up restore tests in aws;
// here only the cloud provider is overridden.
hardware: makeHardwareSpecs(hardwareSpecs{cloud: spec.GCE}),
backup: makeBackupSpecs(backupSpecs{}),
timeout: 2 * time.Hour,
},
// TODO(msbutler): add the following tests once roachperf is hooked up and old tests are
// removed:
// - restore/nodes=4
// - restore/nodes=10
// - restore/cpu=16
// - restore/gce/8TB
// - restore/45TB
// - restore/encryption
} {
sp := sp
clusterOpts := make([]spec.Option, 0)
clusterOpts = append(clusterOpts, spec.CPU(sp.hardware.cpus))
if sp.hardware.volumeSize != 0 {
clusterOpts = append(clusterOpts, spec.VolumeSize(sp.hardware.volumeSize))
}
r.Add(registry.TestSpec{
Name: sp.name,
Owner: registry.OwnerDisasterRecovery,
Cluster: r.MakeClusterSpec(sp.hardware.nodes, clusterOpts...),
Timeout: sp.timeout,
// These tests measure performance. To ensure consistent perf,
// disable metamorphic encryption.
EncryptionSupport: registry.EncryptionAlwaysDisabled,
Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {

if c.Spec().Cloud != sp.hardware.cloud {
t.Skipf("test configured to run on %s", sp.hardware.cloud)
}
c.Put(ctx, t.Cockroach(), "./cockroach")
c.Start(ctx, t.L(), option.DefaultStartOpts(), install.MakeClusterSettings())
m := c.NewMonitor(ctx)

// Run the disk usage logger in the monitor to guarantee it has
// terminated when the test ends.
dul := NewDiskUsageLogger(t, c)
m.Go(dul.Runner)
hc := NewHealthChecker(t, c, c.All())
m.Go(hc.Runner)

// TODO(msbutler): export to prom/grafana
tick, perfBuf := initBulkJobPerfArtifacts(sp.name, sp.timeout)
m.Go(func(ctx context.Context) error {
defer dul.Done()
defer hc.Done()
t.Status(`running restore`)
tick()
sp.run(ctx, c)
tick()

// Upload the perf artifacts to any one of the nodes so that the test
// runner copies them into an appropriate directory path.
dest := filepath.Join(t.PerfArtifactsDir(), "stats.json")
if err := c.RunE(ctx, c.Node(1), "mkdir -p "+filepath.Dir(dest)); err != nil {
log.Errorf(ctx, "failed to create perf dir: %+v", err)
}
if err := c.PutString(ctx, perfBuf.String(), dest, 0755, c.Node(1)); err != nil {
log.Errorf(ctx, "failed to upload perf artifacts to node: %s", err.Error())
}
return nil
})
m.Wait()
},
})
}
}

type hardwareSpecs struct {
// cloud is the cloud provider the test will run on.
cloud string

// cpus is the per node cpu count.
cpus int

// nodes is the number of nodes in the restore.
nodes int

// volumeSize indicates the size of per-node block storage (pd-ssd for gce,
// ebs for aws). If zero, local SSDs are used.
volumeSize int
}

// makeHardwareSpecs instantiates hardware specs for a restore roachtest.
// Fields left unset by the caller fall back to the default specs.
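// For example, makeHardwareSpecs(hardwareSpecs{cloud: spec.GCE}) keeps the
// default 8 cpus, 4 nodes, and 1000 GB volume size and only switches the
// cloud provider to GCE.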
func makeHardwareSpecs(override hardwareSpecs) hardwareSpecs {
specs := hardwareSpecs{
cloud: spec.AWS,
cpus: 8,
nodes: 4,
volumeSize: 1000,
}

if override.cloud != "" {
specs.cloud = override.cloud
}
if override.cpus != 0 {
specs.cpus = override.cpus
}
if override.nodes != 0 {
specs.nodes = override.nodes
}
if override.volumeSize != 0 {
specs.volumeSize = override.volumeSize
}
return specs
}

type backupSpecs struct {
// version specifies the crdb version the backup was taken on.
version string

// backupProperties identifies the fixture variant within the version
// directory, e.g. the incremental backup count (inc-count=48).
backupProperties string

// fullBackupDir specifies the full backup directory in the collection to
// restore from.
fullBackupDir string

// aost specifies the timestamp to restore to (AS OF SYSTEM TIME).
aost string

// workload defines the backed up workload.
workload backupWorkload
}

// makeBackupSpecs initializes the default backup specs. The caller can
// override any of the defaults by passing non-zero fields.
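// For example (the version string here is illustrative only),
// makeBackupSpecs(backupSpecs{version: "v22.2.1"}) keeps every default below
// except the fixture version.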
func makeBackupSpecs(override backupSpecs) backupSpecs {
// By default, restore as of a time roughly at the 24th incremental backup in the chain.
specs := backupSpecs{
// TODO(msbutler): write a script that automatically finds the latest versioned fixture for
// the given spec and a reasonable aost.
version: "v22.2.0",
backupProperties: "inc-count=48",
fullBackupDir: "LATEST",
aost: "'2022-12-21 05:15:00'",
workload: tpceRestore{customers: 25000},
}

if override.version != "" {
specs.version = override.version
}

if override.backupProperties != "" {
specs.backupProperties = override.backupProperties
}

if override.fullBackupDir != "" {
specs.fullBackupDir = override.fullBackupDir
}

if override.aost != "" {
specs.aost = override.aost
}

if override.workload != nil {
specs.workload = override.workload
}
return specs
}

// backupWorkload identifies the workload captured in a backup fixture.
type backupWorkload interface {
fixtureDir() string
}

// tpceRestore identifies a tpc-e backup fixture by its customer count.
type tpceRestore struct {
customers int
}

func (tpce tpceRestore) fixtureDir() string {
return fmt.Sprintf(`tpc-e/customers=%d`, tpce.customers)
}
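// As an illustrative sketch only (not part of this patch), another fixture
// workload would simply implement backupWorkload, e.g. a hypothetical tpc-c
// fixture:
//
//	type tpccRestore struct {
//		warehouses int
//	}
//
//	func (tpcc tpccRestore) fixtureDir() string {
//		return fmt.Sprintf(`tpc-c/warehouses=%d`, tpcc.warehouses)
//	}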

// restoreSpecs fully describes a restore roachtest.
type restoreSpecs struct {
name string
hardware hardwareSpecs
backup backupSpecs
timeout time.Duration
}

func (sp restoreSpecs) storagePrefix() string {
if sp.hardware.cloud == spec.AWS {
return "s3"
}
return "gs"
}

func (sp restoreSpecs) backupDir() string {
return fmt.Sprintf(`'%s://cockroach-fixtures/backups/%s/%s/%s?AUTH=implicit'`,
sp.storagePrefix(), sp.backup.workload.fixtureDir(), sp.backup.version, sp.backup.backupProperties)
}

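// restoreCmd returns the SQL shell invocation that runs the restore. With the
// default specs above, the command renders approximately as:
//
//	./cockroach sql --insecure -e "RESTORE FROM LATEST IN
//	's3://cockroach-fixtures/backups/tpc-e/customers=25000/v22.2.0/inc-count=48?AUTH=implicit'
//	AS OF SYSTEM TIME '2022-12-21 05:15:00'"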
func (sp restoreSpecs) restoreCmd() string {
return fmt.Sprintf(`./cockroach sql --insecure -e "RESTORE FROM %s IN %s AS OF SYSTEM TIME %s"`,
sp.backup.fullBackupDir, sp.backupDir(), sp.backup.aost)
}

func (sp restoreSpecs) run(ctx context.Context, c cluster.Cluster) {
c.Run(ctx, c.Node(1), sp.restoreCmd())
}

// verifyMetrics loops, retrieving the timeseries metrics specified in m every