diff --git a/nomad/core_sched.go b/nomad/core_sched.go
index e5dbea5a706..a191effddbf 100644
--- a/nomad/core_sched.go
+++ b/nomad/core_sched.go
@@ -196,8 +196,6 @@ func (c *CoreScheduler) evalGC(eval *structs.Evaluation) error {
 
 	// The Evaluation GC should not handle batch jobs since those need to be
 	// garbage collected in one shot
-	// XXX believe there is a bug that if a batch job gets stopped, there is no
-	// way for it to GC the eval/allocs
 	gc, allocs, err := c.gcEval(eval, oldThreshold, false)
 	if err != nil {
 		return err
@@ -239,10 +237,6 @@ func (c *CoreScheduler) gcEval(eval *structs.Evaluation, thresholdIndex uint64,
 	// terminal allocations get GC'd the scheduler would re-run the
 	// allocations.
 	if eval.Type == structs.JobTypeBatch {
-		if !allowBatch {
-			return false, nil, nil
-		}
-
 		// Check if the job is running
 		job, err := c.snap.JobByID(ws, eval.JobID)
 		if err != nil {
@@ -250,7 +244,8 @@ func (c *CoreScheduler) gcEval(eval *structs.Evaluation, thresholdIndex uint64,
 		}
 
 		// We don't want to gc anything related to a job which is not dead
-		if job != nil && job.Status != structs.JobStatusDead {
+		// If the batch job doesn't exist we can GC it regardless of allowBatch
+		if job != nil && (!allowBatch || job.Status != structs.JobStatusDead) {
 			return false, nil, nil
 		}
 	}
diff --git a/nomad/core_sched_test.go b/nomad/core_sched_test.go
index 3f1c6d24716..ce1e39cd352 100644
--- a/nomad/core_sched_test.go
+++ b/nomad/core_sched_test.go
@@ -90,7 +90,7 @@ func TestCoreScheduler_EvalGC(t *testing.T) {
 	}
 }
 
-// An EvalGC should never reap a batch job
+// An EvalGC should never reap a batch job that has not been stopped
 func TestCoreScheduler_EvalGC_Batch(t *testing.T) {
 	s1 := testServer(t, nil)
 	defer s1.Shutdown()
@@ -190,6 +190,94 @@ func TestCoreScheduler_EvalGC_Batch(t *testing.T) {
 	}
 }
 
+// An EvalGC should reap a batch job that has been stopped
+func TestCoreScheduler_EvalGC_BatchStopped(t *testing.T) {
+	s1 := testServer(t, nil)
+	defer s1.Shutdown()
+	testutil.WaitForLeader(t, s1.RPC)
+
+	// COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0
+	s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10)
+
+	// Create a "dead" job
+	state := s1.fsm.State()
+	job := mock.Job()
+	job.Type = structs.JobTypeBatch
+	job.Status = structs.JobStatusDead
+
+	// Insert "complete" eval
+	eval := mock.Eval()
+	eval.Status = structs.EvalStatusComplete
+	eval.Type = structs.JobTypeBatch
+	eval.JobID = job.ID
+	err := state.UpsertEvals(1001, []*structs.Evaluation{eval})
+	if err != nil {
+		t.Fatalf("err: %v", err)
+	}
+
+	// Insert "failed" alloc
+	alloc := mock.Alloc()
+	alloc.JobID = job.ID
+	alloc.EvalID = eval.ID
+	alloc.DesiredStatus = structs.AllocDesiredStatusStop
+
+	// Insert "lost" alloc
+	alloc2 := mock.Alloc()
+	alloc2.JobID = job.ID
+	alloc2.EvalID = eval.ID
+	alloc2.DesiredStatus = structs.AllocDesiredStatusRun
+	alloc2.ClientStatus = structs.AllocClientStatusLost
+
+	err = state.UpsertAllocs(1002, []*structs.Allocation{alloc, alloc2})
+	if err != nil {
+		t.Fatalf("err: %v", err)
+	}
+
+	// Update the time tables to make this work
+	tt := s1.fsm.TimeTable()
+	tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.EvalGCThreshold))
+
+	// Create a core scheduler
+	snap, err := state.Snapshot()
+	if err != nil {
+		t.Fatalf("err: %v", err)
+	}
+	core := NewCoreScheduler(s1, snap)
+
+	// Attempt the GC
+	gc := s1.coreJobEval(structs.CoreJobEvalGC, 2000)
+	err = core.Process(gc)
+	if err != nil {
+		t.Fatalf("err: %v", err)
+	}
+
+	// Everything should be gone
+	ws := memdb.NewWatchSet()
+	out, err := state.EvalByID(ws, eval.ID)
+	if err != nil {
+		t.Fatalf("err: %v", err)
+	}
+	if out != nil {
+		t.Fatalf("bad: %v", out)
+	}
+
+	outA, err := state.AllocByID(ws, alloc.ID)
+	if err != nil {
+		t.Fatalf("err: %v", err)
+	}
+	if outA != nil {
+		t.Fatalf("bad: %v", outA)
+	}
+
+	outA2, err := state.AllocByID(ws, alloc2.ID)
+	if err != nil {
+		t.Fatalf("err: %v", err)
+	}
+	if outA2 != nil {
+		t.Fatalf("bad: %v", outA2)
+	}
+}
+
 func TestCoreScheduler_EvalGC_Partial(t *testing.T) {
 	s1 := testServer(t, nil)
 	defer s1.Shutdown()