backupccl: introduce new restore roachtest framework
This patch introduces a new framework for writing restore roachtests that
minimizes code duplication and leverages our new backup fixture organization. The
framework makes it easy to write a new test using a variety of knobs, like:
- hardware: cloud provider, disk volume, # of nodes, # of cpus
- backup fixture: workload, workload scale

The patch is the first in an ongoing effort to redo our roachtests, and
introduces two new roachtests:
- restore/nodes=4: the default configuration: 4 nodes, 8 vCPUs, 1000 GB EBS
  volumes; restores a tpce backup fixture (25,000 customers, around 400 GB).
- restore/gce: same configuration as above, but run on GCE.

Future patches will add more tests that use this framework.
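
As a rough sketch of the intended usage (the node count and timeout below are
illustrative only, not part of this patch), a new test is added by appending
another entry to the restoreSpecs slice:

  {
      name:     "restore/nodes=10",
      hardware: makeHardwareSpecs(hardwareSpecs{nodes: 10}),
      backup:   makeBackupSpecs(backupSpecs{}),
      timeout:  3 * time.Hour,
  },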

Informs cockroachdb#92699

Release note: None
msbutler committed Jan 4, 2023
1 parent 6225ad6 commit 817e507
207 changes: 207 additions & 0 deletions pkg/cmd/roachtest/tests/restore.go
@@ -412,6 +412,7 @@ func (tpccIncData) runRestoreDetached(
}

func registerRestore(r registry.Registry) {
// TODO(msbutler): delete old tests
largeVolumeSize := 2500 // the size in GB of disks in large volume configs

for _, item := range []struct {
@@ -660,6 +661,212 @@ func registerRestore(r registry.Registry) {
require.NotEqual(t, 3, maxPauses, "the job should have paused at least once")
},
})

for _, sp := range []restoreSpecs{
{
name: "restore/nodes=4",
hardware: makeHardwareSpecs(hardwareSpecs{}),
backup: makeBackupSpecs(backupSpecs{}),
timeout: 2 * time.Hour,
},
{
name: "restore/gce",
hardware: makeHardwareSpecs(hardwareSpecs{cloud: spec.GCE}),
backup: makeBackupSpecs(backupSpecs{}),
timeout: 2 * time.Hour,
},
// TODO(msbutler): add the following tests once roachperf is hooked up and old tests are
// removed:
// - restore/nodes=4
// - restore/nodes=10
// - restore/cpu=16
// - restore/gce/8TB
// - restore/45TB
} {
sp := sp
clusterOpts := make([]spec.Option, 0)
clusterOpts = append(clusterOpts, spec.CPU(sp.hardware.cpus))
if sp.hardware.volumeSize != 0 {
clusterOpts = append(clusterOpts, spec.VolumeSize(sp.hardware.volumeSize))
}
r.Add(registry.TestSpec{
Name: sp.name,
Owner: registry.OwnerDisasterRecovery,
Cluster: r.MakeClusterSpec(sp.hardware.nodes, clusterOpts...),
Timeout: sp.timeout,
// These tests measure performance. To ensure consistent perf,
// disable metamorphic encryption.
EncryptionSupport: registry.EncryptionAlwaysDisabled,
Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {

if c.Spec().Cloud != sp.hardware.cloud {
t.Skipf("test configured to run on %s", sp.hardware.cloud)
}
c.Put(ctx, t.Cockroach(), "./cockroach")
c.Start(ctx, t.L(), option.DefaultStartOpts(), install.MakeClusterSettings())
m := c.NewMonitor(ctx)

// Run the disk usage logger in the monitor to guarantee it has terminated
// by the time the test ends.
dul := NewDiskUsageLogger(t, c)
m.Go(dul.Runner)
hc := NewHealthChecker(t, c, c.All())
m.Go(hc.Runner)

// TODO(msbutler): export to prom/grafana
tick, perfBuf := initBulkJobPerfArtifacts(sp.name, sp.timeout)
m.Go(func(ctx context.Context) error {
defer dul.Done()
defer hc.Done()
t.Status(`running restore`)
tick()
sp.run(ctx, c)
tick()

// Upload the perf artifacts to any one of the nodes so that the test
// runner copies them into an appropriate directory path.
dest := filepath.Join(t.PerfArtifactsDir(), "stats.json")
if err := c.RunE(ctx, c.Node(1), "mkdir -p "+filepath.Dir(dest)); err != nil {
log.Errorf(ctx, "failed to create perf dir: %+v", err)
}
if err := c.PutString(ctx, perfBuf.String(), dest, 0755, c.Node(1)); err != nil {
log.Errorf(ctx, "failed to upload perf artifacts to node: %s", err.Error())
}
return nil
})
m.Wait()
},
})
}
}

type hardwareSpecs struct {
// cloud is the cloud provider the test will run on
cloud string

// cpus is the per node cpu count
cpus int

// nodes is the number of nodes in the restore
nodes int

// volumeSize indicates the size of per node block storage (pd-ssd for GCE,
// EBS for AWS). If zero, local SSDs are used.
volumeSize int
}

func makeHardwareSpecs(override hardwareSpecs) hardwareSpecs {
specs := hardwareSpecs{
cloud: spec.AWS,
cpus: 8,
nodes: 4,
volumeSize: 1000,
}

if override.cloud != "" {
specs.cloud = override.cloud
}
if override.cpus != 0 {
specs.cpus = override.cpus
}
if override.nodes != 0 {
specs.nodes = override.nodes
}
if override.volumeSize != 0 {
specs.volumeSize = override.volumeSize
}
return specs
}
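
// Example (illustrative): makeHardwareSpecs(hardwareSpecs{cloud: spec.GCE})
// keeps the defaults above (4 nodes, 8 cpus, 1000 GB volumes) and only swaps
// the cloud provider, which is how the restore/gce test above is configured.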

type backupSpecs struct {
// version specifies the crdb version the backup was taken on
version string

// backupProperties identifies the fixture variant within the version
// directory, e.g. "inc-count=48".
backupProperties string

// fullBackupDir specifies the full backup directory in the collection to restore from
fullBackupDir string

// aost specifies the as of system time to restore to
aost string

// workload defines the backed up workload
workload backupWorkload
}

// makeBackupSpecs initializes the default backup specs. The caller can
// override any of the defaults by setting the corresponding field in override
// to a non-zero value.
func makeBackupSpecs(override backupSpecs) backupSpecs {
// By default, restore as of a system time that falls on the 24th incremental
// backup in the chain.
specs := backupSpecs{
version: "v22.2.0",
backupProperties: "inc-count=48",
fullBackupDir: "LATEST",
aost: "'2022-12-21 05:15:00'",
workload: tpceRestore{customers: 25000},
}

if override.version != "" {
specs.version = override.version
}

if override.backupProperties != "" {
specs.backupProperties = override.backupProperties
}

if override.fullBackupDir != "" {
specs.fullBackupDir = override.fullBackupDir
}

if override.aost != "" {
specs.aost = override.aost
}

if override.workload != nil {
specs.workload = override.workload
}
return specs
}
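
// Example (illustrative; the larger fixture named here is hypothetical):
//
//   makeBackupSpecs(backupSpecs{workload: tpceRestore{customers: 500000}})
//
// keeps the v22.2.0, inc-count=48, LATEST, and aost defaults above and only
// points the restore at a bigger tpc-e fixture.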

// backupWorkload identifies the workload contained in a backup fixture.
type backupWorkload interface {
// fixtureDir returns the path component that identifies the workload and its
// scale within the fixture bucket.
fixtureDir() string
}

type tpceRestore struct {
customers int
}

func (tpce tpceRestore) fixtureDir() string {
return fmt.Sprintf(`tpc-e/customers=%d`, tpce.customers)
}

// restoreSpecs fully describes a restore roachtest: the hardware to provision,
// the backup fixture to restore, and the test timeout.
type restoreSpecs struct {
name string
hardware hardwareSpecs
backup backupSpecs
timeout time.Duration
}

func (sp restoreSpecs) storagePrefix() string {
if sp.hardware.cloud == spec.AWS {
return "s3"
}
return "gs"
}

func (sp restoreSpecs) backupDir() string {
return fmt.Sprintf(`'%s://cockroach-fixtures/backups/%s/%s/%s?AUTH=implicit'`,
sp.storagePrefix(), sp.backup.workload.fixtureDir(), sp.backup.version, sp.backup.backupProperties)
}

func (sp restoreSpecs) restoreCmd(options string) string {
return fmt.Sprintf(`./cockroach sql --insecure -e "RESTORE FROM %s IN %s AS OF SYSTEM TIME %s %s"`,
sp.backup.fullBackupDir, sp.backupDir(), sp.backup.aost, options)
}
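
// For the defaults above (AWS, tpc-e with 25,000 customers, fixture version
// v22.2.0, inc-count=48), restoreCmd("") assembles roughly:
//
//   ./cockroach sql --insecure -e "RESTORE FROM LATEST IN
//   's3://cockroach-fixtures/backups/tpc-e/customers=25000/v22.2.0/inc-count=48?AUTH=implicit'
//   AS OF SYSTEM TIME '2022-12-21 05:15:00'"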

func (sp restoreSpecs) run(ctx context.Context, c cluster.Cluster) {
c.Run(ctx, c.Node(1), sp.restoreCmd(""))
}

// verifyMetrics loops, retrieving the timeseries metrics specified in m every