Skip to content

Commit

Permalink
backupccl: incremental schedules always wait on_previous_running
Browse files Browse the repository at this point in the history
An incremental backup schedule must always wait if there is a running job
that was previously scheduled by this incremental schedule. This is
because until the previous incremental backup job completes, all future
incremental jobs will attempt to backup data from the same `StartTime`
corresponding to the `EndTime` of the last incremental layer. In this
case only the first incremental job to complete will succeed, while the
remaining jobs will either be rejected or worse corrupt the chain of
backups.

This change overrides the Wait behaviour for an incremental schedule to
always default to `wait` during schedule creation or in an alter statement.
Note the user specified value will still be applied to the full backup schedule.

Ideally we'd have a way to configure options for both the full and
incremental schedule separately, in which case we could reject the
`on_previous_running` configuration for incremental schedules.
Until then this workaround will have to do and we should call out this
known limitation.

Release note (enterprise change): backup schedules created or altered to
have the option `on_previous_running` will have the full backup
schedule created with the user specified option, but will override the
incremental backup schedule to always default to `on_previous_running = wait`.
This ensures correctness of the backup chains created by the incremental
schedule by preventing duplicate incremental jobs from racing against each
other.
  • Loading branch information
adityamaru committed Mar 24, 2023
1 parent 9a97275 commit 018fc65
Show file tree
Hide file tree
Showing 4 changed files with 158 additions and 16 deletions.
18 changes: 15 additions & 3 deletions pkg/ccl/backupccl/alter_backup_schedule.go
Original file line number Diff line number Diff line change
Expand Up @@ -275,9 +275,21 @@ func processScheduleOptions(
if incDetails == nil {
continue
}
if err := parseWaitBehavior(v, incDetails); err != nil {
return err
}
// An incremental backup schedule must always wait if there is a running job
// that was previously scheduled by this incremental schedule. This is
// because until the previous incremental backup job completes, all future
// incremental jobs will attempt to backup data from the same `StartTime`
// corresponding to the `EndTime` of the last incremental layer. In this
// case only the first incremental job to complete will succeed, while the
// remaining jobs will either be rejected or worse corrupt the chain of
// backups. We can accept both `wait` and `skip` as valid
// `on_previous_running` options for an incremental schedule.
//
// NB: Ideally we'd have a way to configure options for both the full and
// incremental schedule separately, in which case we could reject the
// `on_previous_running = start` configuration for incremental schedules.
// Until then this interception will have to do.
incDetails.Wait = jobspb.ScheduleDetails_WAIT
s.incJob.SetScheduleDetails(*incDetails)
case optUpdatesLastBackupMetric:
// NB: as of 20.2, schedule creation requires admin so this is duplicative
Expand Down
20 changes: 18 additions & 2 deletions pkg/ccl/backupccl/create_scheduled_backup.go
Original file line number Diff line number Diff line change
Expand Up @@ -386,10 +386,26 @@ func doCreateBackupSchedules(
unpauseOnSuccessID := jobs.InvalidScheduleID

var chainProtectedTimestampRecords bool
// If needed, create incremental.
// If needed, create an incremental schedule.
var inc *jobs.ScheduledJob
var incScheduledBackupArgs *backuppb.ScheduledBackupExecutionArgs
if incRecurrence != nil {
incrementalScheduleDetails := details
// An incremental backup schedule must always wait if there is a running job
// that was previously scheduled by this incremental schedule. This is
// because until the previous incremental backup job completes, all future
// incremental jobs will attempt to backup data from the same `StartTime`
// corresponding to the `EndTime` of the last incremental layer. In this
// case only the first incremental job to complete will succeed, while the
// remaining jobs will either be rejected or worse corrupt the chain of
// backups. We can accept both `wait` and `skip` as valid
// `on_previous_running` options for an incremental schedule.
//
// NB: Ideally we'd have a way to configure options for both the full and
// incremental schedule separately, in which case we could reject the
// `on_previous_running = start` configuration for incremental schedules.
// Until then this interception will have to do.
incrementalScheduleDetails.Wait = jobspb.ScheduleDetails_WAIT
chainProtectedTimestampRecords = scheduledBackupGCProtectionEnabled.Get(&p.ExecCfg().Settings.SV)
backupNode.AppendToLatest = true

Expand All @@ -401,7 +417,7 @@ func doCreateBackupSchedules(
}
}
inc, incScheduledBackupArgs, err = makeBackupSchedule(
env, p.User(), scheduleLabel, incRecurrence, details, unpauseOnSuccessID,
env, p.User(), scheduleLabel, incRecurrence, incrementalScheduleDetails, unpauseOnSuccessID,
updateMetricOnSuccess, backupNode, chainProtectedTimestampRecords)
if err != nil {
return err
Expand Down
73 changes: 71 additions & 2 deletions pkg/ccl/backupccl/create_scheduled_backup_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -678,6 +678,74 @@ func TestSerializesScheduledBackupExecutionArgs(t *testing.T) {
}
}

// TestIncrementalScheduleBackupOnPreviousRunning tests that incremental
// schedules always set `on_previous_running` to `wait` regardless of what the
// full schedule is configured with.
// For an explanation please refer to `create_scheduled_backup.go` where we
// configure the option for the incremental schedule.
func TestIncrementalScheduleBackupOnPreviousRunning(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)

th, cleanup := newTestHelper(t)
defer cleanup()

th.sqlDB.Exec(t, `
CREATE DATABASE db;
USE db;
CREATE TABLE t1(a int);
INSERT INTO t1 values (1), (10), (100);
`)

// We'll be manipulating schedule time via th.env, but we can't fool actual
// backup when it comes to AsOf time. So, override AsOf backup clause to be
// the current time.
th.cfg.TestingKnobs.(*jobs.TestingKnobs).OverrideAsOfClause = func(clause *tree.AsOfClause, _ time.Time) {
expr, err := tree.MakeDTimestampTZ(th.cfg.DB.Clock().PhysicalTime(), time.Microsecond)
require.NoError(t, err)
clause.Expr = expr
}

checkScheduleDetailsWaitOption := func(schedules []*jobs.ScheduledJob,
expectedFullOption, expectedIncOption jobspb.ScheduleDetails_WaitBehavior) {
require.Len(t, schedules, 2)
full, inc := schedules[0], schedules[1]
if full.IsPaused() {
full, inc = inc, full // Swap: inc should be paused.
}
require.Equal(t, expectedFullOption, full.ScheduleDetails().Wait)
require.Equal(t, expectedIncOption, inc.ScheduleDetails().Wait)
}

t.Run("on_previous_running=start", func(t *testing.T) {
schedule := `CREATE SCHEDULE FOR BACKUP INTO $1 RECURRING '@hourly' WITH SCHEDULE OPTIONS on_previous_running = 'start'`
destination := "nodelocal://0/backup/"
schedules, err := th.createBackupSchedule(t, schedule, destination)
require.NoError(t, err)
require.Len(t, schedules, 2)
checkScheduleDetailsWaitOption(schedules, jobspb.ScheduleDetails_NO_WAIT, jobspb.ScheduleDetails_WAIT)
})

t.Run("on_previous_running=skip", func(t *testing.T) {
schedule := `CREATE SCHEDULE FOR BACKUP INTO $1 RECURRING '@hourly' WITH SCHEDULE OPTIONS on_previous_running = 'skip'`
destination := "nodelocal://0/backup/"
schedules, err := th.createBackupSchedule(t, schedule, destination)
require.NoError(t, err)
require.Len(t, schedules, 2)
checkScheduleDetailsWaitOption(schedules, jobspb.ScheduleDetails_SKIP, jobspb.ScheduleDetails_WAIT)
})

t.Run("on_previous_running=wait", func(t *testing.T) {
schedule := `CREATE SCHEDULE FOR BACKUP INTO $1 RECURRING '@hourly' WITH SCHEDULE OPTIONS on_previous_running = 'wait'`
destination := "nodelocal://0/backup/"
schedules, err := th.createBackupSchedule(t, schedule, destination)
require.NoError(t, err)
require.Len(t, schedules, 2)
checkScheduleDetailsWaitOption(schedules, jobspb.ScheduleDetails_WAIT, jobspb.ScheduleDetails_WAIT)
})
}

func TestScheduleBackup(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)
Expand Down Expand Up @@ -705,8 +773,9 @@ CREATE TABLE t1(a int);
INSERT INTO t1 values (-1), (10), (-100);
`)

// We'll be manipulating schedule time via th.env, but we can't fool actual backup
// when it comes to AsOf time. So, override AsOf backup clause to be the current time.
// We'll be manipulating schedule time via th.env, but we can't fool actual
// backup when it comes to AsOf time. So, override AsOf backup clause to be
// the current time.
th.cfg.TestingKnobs.(*jobs.TestingKnobs).OverrideAsOfClause = func(clause *tree.AsOfClause, _ time.Time) {
expr, err := tree.MakeDTimestampTZ(th.cfg.DB.Clock().PhysicalTime(), time.Microsecond)
require.NoError(t, err)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,30 +12,75 @@ with schedules as (show schedules) select id from schedules where label='datates
----

query-sql
with schedules as (show schedules) select id, label from schedules where id in ($fullID, $incID) order by command->>'backup_type' asc;
with schedules as (show schedules) select label from schedules where id in ($fullID, $incID) order by command->>'backup_type' asc;
----
$fullID datatest
$incID datatest
datatest
datatest

exec-sql
alter backup schedule $fullID set label 'datatest2'
----

query-sql
with schedules as (show schedules) select id, label from schedules where id in ($fullID, $incID) order by command->>'backup_type' asc;
with schedules as (show schedules) select label from schedules where id in ($fullID, $incID) order by command->>'backup_type' asc;
----
$fullID datatest2
$incID datatest2
datatest2
datatest2

exec-sql
alter backup schedule $fullID set into 'nodelocal://1/example-schedule-2'
----

query-sql
with schedules as (show schedules) select id, command->'backup_statement' from schedules where id in ($fullID, $incID) order by command->>'backup_type' asc;
with schedules as (show schedules) select command->'backup_statement' from schedules where id in ($fullID, $incID) order by command->>'backup_type' asc;
----
$fullID "BACKUP INTO 'nodelocal://1/example-schedule-2' WITH detached"
$incID "BACKUP INTO LATEST IN 'nodelocal://1/example-schedule-2' WITH detached"
"BACKUP INTO 'nodelocal://1/example-schedule-2' WITH detached"
"BACKUP INTO LATEST IN 'nodelocal://1/example-schedule-2' WITH detached"

# Alter the `on_previous_running` schedule option to test that incremental
# schedules always have their configuration set to wait.
exec-sql
alter backup schedule $fullID set schedule option on_previous_running = 'start';
----

query-sql
with schedules as (select * from system.scheduled_jobs)
select crdb_internal.pb_to_json('cockroach.jobs.jobspb.ScheduleDetails', schedule_details, true, false)
from schedules
where schedule_id in ($fullID, $incID)
order by crdb_internal.pb_to_json('cockroach.jobs.jobspb.ExecutionArguments', execution_args, true, true)->'args'->>'backup_type' asc;
----
{"onError": "RETRY_SCHED", "wait": "NO_WAIT"}
{"onError": "RETRY_SCHED", "wait": "WAIT"}

exec-sql
alter backup schedule $fullID set schedule option on_previous_running = 'skip';
----

query-sql
with schedules as (select * from system.scheduled_jobs)
select crdb_internal.pb_to_json('cockroach.jobs.jobspb.ScheduleDetails', schedule_details, true, false)
from schedules
where schedule_id in ($fullID, $incID)
order by crdb_internal.pb_to_json('cockroach.jobs.jobspb.ExecutionArguments', execution_args, true, true)->'args'->>'backup_type' asc;
----
{"onError": "RETRY_SCHED", "wait": "SKIP"}
{"onError": "RETRY_SCHED", "wait": "WAIT"}

exec-sql
alter backup schedule $fullID set schedule option on_previous_running = 'wait';
----

query-sql
with schedules as (select * from system.scheduled_jobs)
select crdb_internal.pb_to_json('cockroach.jobs.jobspb.ScheduleDetails', schedule_details, true, false)
from schedules
where schedule_id in ($fullID, $incID)
order by crdb_internal.pb_to_json('cockroach.jobs.jobspb.ExecutionArguments', execution_args, true, true)->'args'->>'backup_type' asc;
----
{"onError": "RETRY_SCHED", "wait": "WAIT"}
{"onError": "RETRY_SCHED", "wait": "WAIT"}


# Hard to validate these, so settle for checking they execute without errors.

Expand Down

0 comments on commit 018fc65

Please sign in to comment.