From d1da2ce020df9f7d0a3fb86594a69f487296e266 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Wed, 24 Aug 2016 17:40:11 -0700 Subject: [PATCH 1/4] Don't fail other tasks when retrying artifact get The artifact fetching may be retried and succeed, so don't set the task as dead. Fixes #1558 --- client/alloc_runner_test.go | 47 ++++++++++++++++++++++++++++++++++--- client/task_runner.go | 2 +- 2 files changed, 45 insertions(+), 4 deletions(-) diff --git a/client/alloc_runner_test.go b/client/alloc_runner_test.go index 9bdfcf9e83d..fb0082596b8 100644 --- a/client/alloc_runner_test.go +++ b/client/alloc_runner_test.go @@ -25,22 +25,24 @@ func (m *MockAllocStateUpdater) Update(alloc *structs.Allocation) { m.Allocs = append(m.Allocs, alloc) } -func testAllocRunner(restarts bool) (*MockAllocStateUpdater, *AllocRunner) { +func testAllocRunnerFromAlloc(alloc *structs.Allocation, restarts bool) (*MockAllocStateUpdater, *AllocRunner) { logger := testLogger() conf := config.DefaultConfig() conf.StateDir = os.TempDir() conf.AllocDir = os.TempDir() upd := &MockAllocStateUpdater{} - alloc := mock.Alloc() if !restarts { *alloc.Job.LookupTaskGroup(alloc.TaskGroup).RestartPolicy = structs.RestartPolicy{Attempts: 0} alloc.Job.Type = structs.JobTypeBatch } - ar := NewAllocRunner(logger, conf, upd.Update, alloc) return upd, ar } +func testAllocRunner(restarts bool) (*MockAllocStateUpdater, *AllocRunner) { + return testAllocRunnerFromAlloc(mock.Alloc(), restarts) +} + func TestAllocRunner_SimpleRun(t *testing.T) { ctestutil.ExecCompatible(t) upd, ar := testAllocRunner(false) @@ -61,6 +63,45 @@ func TestAllocRunner_SimpleRun(t *testing.T) { }) } +// TestAllocRuner_RetryArtifact ensures that if one task in a task group is +// retrying fetching an artifact, other tasks in the the group should be able +// to proceed. See #1558 +func TestAllocRunner_RetryArtifact(t *testing.T) { + ctestutil.ExecCompatible(t) + + alloc := mock.Alloc() + + // Create a copy of the task for testing #1558 + badtask := alloc.Job.TaskGroups[0].Tasks[0].Copy() + badtask.Name = "bad" + + // Add a bad artifact to one of the tasks + badtask.Artifacts = []*structs.TaskArtifact{ + {GetterSource: "http://127.1.1.111:12315/foo/bar/baz"}, + } + + alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, badtask) + upd, ar := testAllocRunnerFromAlloc(alloc, true) + go ar.Run() + defer ar.Destroy() + + testutil.WaitForResult(func() (bool, error) { + if upd.Count < 6 { + return false, fmt.Errorf("Not enough updates") + } + last := upd.Allocs[upd.Count-1] + if last.TaskStates["web"].State != structs.TaskStatePending { + return false, fmt.Errorf("expected web to be pending but found %q", last.TaskStates["web"].State) + } + if last.TaskStates["bad"].State != structs.TaskStatePending { + return false, fmt.Errorf("expected bad to be pending but found %q", last.TaskStates["web"].State) + } + return true, nil + }, func(err error) { + t.Fatalf("err: %v", err) + }) +} + func TestAllocRunner_TerminalUpdate_Destroy(t *testing.T) { ctestutil.ExecCompatible(t) upd, ar := testAllocRunner(false) diff --git a/client/task_runner.go b/client/task_runner.go index 0544a1eae94..76b19f28cfc 100644 --- a/client/task_runner.go +++ b/client/task_runner.go @@ -318,7 +318,7 @@ func (r *TaskRunner) run() { for _, artifact := range r.task.Artifacts { if err := getter.GetArtifact(r.taskEnv, artifact, taskDir); err != nil { - r.setState(structs.TaskStateDead, + r.setState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskArtifactDownloadFailed).SetDownloadError(err)) r.restartTracker.SetStartError(dstructs.NewRecoverableError(err, true)) goto RESTART From 2a88e48f07626c46da725ba52cb80c2b543b54a8 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Thu, 25 Aug 2016 14:42:50 -0700 Subject: [PATCH 2/4] Ensure web task exited successfully Web task should run to completion successfully while the `bad` task is retrying artifact downloads. --- client/alloc_runner_test.go | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/client/alloc_runner_test.go b/client/alloc_runner_test.go index fb0082596b8..d8b48b7054b 100644 --- a/client/alloc_runner_test.go +++ b/client/alloc_runner_test.go @@ -65,17 +65,16 @@ func TestAllocRunner_SimpleRun(t *testing.T) { // TestAllocRuner_RetryArtifact ensures that if one task in a task group is // retrying fetching an artifact, other tasks in the the group should be able -// to proceed. See #1558 +// to proceed. func TestAllocRunner_RetryArtifact(t *testing.T) { ctestutil.ExecCompatible(t) alloc := mock.Alloc() + alloc.Job.Type = structs.JobTypeBatch - // Create a copy of the task for testing #1558 + // Create a new task with a bad artifact badtask := alloc.Job.TaskGroups[0].Tasks[0].Copy() badtask.Name = "bad" - - // Add a bad artifact to one of the tasks badtask.Artifacts = []*structs.TaskArtifact{ {GetterSource: "http://127.1.1.111:12315/foo/bar/baz"}, } @@ -90,8 +89,12 @@ func TestAllocRunner_RetryArtifact(t *testing.T) { return false, fmt.Errorf("Not enough updates") } last := upd.Allocs[upd.Count-1] - if last.TaskStates["web"].State != structs.TaskStatePending { - return false, fmt.Errorf("expected web to be pending but found %q", last.TaskStates["web"].State) + webstate := last.TaskStates["web"] + if webstate.State != structs.TaskStateDead { + return false, fmt.Errorf("expected web to be dead but found %q", last.TaskStates["web"].State) + } + if !webstate.Successful() { + return false, fmt.Errorf("expected web to have exited successfully") } if last.TaskStates["bad"].State != structs.TaskStatePending { return false, fmt.Errorf("expected bad to be pending but found %q", last.TaskStates["web"].State) From bbce6c7ad19b066f5e24ef344262fc9def0badb3 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Thu, 25 Aug 2016 16:05:19 -0700 Subject: [PATCH 3/4] Ensure bad task reaches terminal state within test --- client/alloc_runner_test.go | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/client/alloc_runner_test.go b/client/alloc_runner_test.go index d8b48b7054b..c6003a29670 100644 --- a/client/alloc_runner_test.go +++ b/client/alloc_runner_test.go @@ -71,6 +71,7 @@ func TestAllocRunner_RetryArtifact(t *testing.T) { alloc := mock.Alloc() alloc.Job.Type = structs.JobTypeBatch + alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 0 // Create a new task with a bad artifact badtask := alloc.Job.TaskGroups[0].Tasks[0].Copy() @@ -89,6 +90,9 @@ func TestAllocRunner_RetryArtifact(t *testing.T) { return false, fmt.Errorf("Not enough updates") } last := upd.Allocs[upd.Count-1] + + // web task should have completed successfully while bad task + // retries artififact fetching webstate := last.TaskStates["web"] if webstate.State != structs.TaskStateDead { return false, fmt.Errorf("expected web to be dead but found %q", last.TaskStates["web"].State) @@ -96,8 +100,14 @@ func TestAllocRunner_RetryArtifact(t *testing.T) { if !webstate.Successful() { return false, fmt.Errorf("expected web to have exited successfully") } - if last.TaskStates["bad"].State != structs.TaskStatePending { - return false, fmt.Errorf("expected bad to be pending but found %q", last.TaskStates["web"].State) + + // bad task should have failed + badstate := last.TaskStates["bad"] + if badstate.State != structs.TaskStateDead { + return false, fmt.Errorf("expected bad to be dead but found %q", last.TaskStates["web"].State) + } + if !badstate.Failed() { + return false, fmt.Errorf("expected bad to have failed") } return true, nil }, func(err error) { From 5b0dd7ff2ba5df356970dd95d72850a00b90f69c Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Thu, 25 Aug 2016 17:25:51 -0700 Subject: [PATCH 4/4] Make sure bad doesn't fail before web runs --- client/alloc_runner_test.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/client/alloc_runner_test.go b/client/alloc_runner_test.go index c6003a29670..6ee1f033949 100644 --- a/client/alloc_runner_test.go +++ b/client/alloc_runner_test.go @@ -71,7 +71,8 @@ func TestAllocRunner_RetryArtifact(t *testing.T) { alloc := mock.Alloc() alloc.Job.Type = structs.JobTypeBatch - alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 0 + alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 1 + alloc.Job.TaskGroups[0].RestartPolicy.Delay = time.Duration(4*testutil.TestMultiplier()) * time.Second // Create a new task with a bad artifact badtask := alloc.Job.TaskGroups[0].Tasks[0].Copy()