Merge pull request #1653 from hashicorp/b-fix-artifact-retry

Don't fail other tasks when retrying artifact get
2016-08-26 09:53:39 -07:00 · 2016-08-26 09:53:39 -07:00 · d31f373a5b
parent 9a9815b6d5 592b898f43
commit d31f373a5b
2 changed files with 59 additions and 4 deletions
--- a/client/alloc_runner_test.go
+++ b/client/alloc_runner_test.go
@ -25,22 +25,24 @@ func (m *MockAllocStateUpdater) Update(alloc *structs.Allocation) {
 	m.Allocs = append(m.Allocs, alloc)
 }

-func testAllocRunner(restarts bool) (*MockAllocStateUpdater, *AllocRunner) {
+func testAllocRunnerFromAlloc(alloc *structs.Allocation, restarts bool) (*MockAllocStateUpdater, *AllocRunner) {
 	logger := testLogger()
 	conf := config.DefaultConfig()
 	conf.StateDir = os.TempDir()
 	conf.AllocDir = os.TempDir()
 	upd := &MockAllocStateUpdater{}
-	alloc := mock.Alloc()
 	if !restarts {
 		*alloc.Job.LookupTaskGroup(alloc.TaskGroup).RestartPolicy = structs.RestartPolicy{Attempts: 0}
 		alloc.Job.Type = structs.JobTypeBatch
 	}
-
 	ar := NewAllocRunner(logger, conf, upd.Update, alloc)
 	return upd, ar
 }

+func testAllocRunner(restarts bool) (*MockAllocStateUpdater, *AllocRunner) {
+	return testAllocRunnerFromAlloc(mock.Alloc(), restarts)
+}
+
 func TestAllocRunner_SimpleRun(t *testing.T) {
 	ctestutil.ExecCompatible(t)
 	upd, ar := testAllocRunner(false)
@ -61,6 +63,59 @@ func TestAllocRunner_SimpleRun(t *testing.T) {
 	})
 }

+// TestAllocRuner_RetryArtifact ensures that if one task in a task group is
+// retrying fetching an artifact, other tasks in the the group should be able
+// to proceed.
+func TestAllocRunner_RetryArtifact(t *testing.T) {
+	ctestutil.ExecCompatible(t)
+
+	alloc := mock.Alloc()
+	alloc.Job.Type = structs.JobTypeBatch
+	alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 1
+	alloc.Job.TaskGroups[0].RestartPolicy.Delay = time.Duration(4*testutil.TestMultiplier()) * time.Second
+
+	// Create a new task with a bad artifact
+	badtask := alloc.Job.TaskGroups[0].Tasks[0].Copy()
+	badtask.Name = "bad"
+	badtask.Artifacts = []*structs.TaskArtifact{
+		{GetterSource: "http://127.1.1.111:12315/foo/bar/baz"},
+	}
+
+	alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, badtask)
+	upd, ar := testAllocRunnerFromAlloc(alloc, true)
+	go ar.Run()
+	defer ar.Destroy()
+
+	testutil.WaitForResult(func() (bool, error) {
+		if upd.Count < 6 {
+			return false, fmt.Errorf("Not enough updates")
+		}
+		last := upd.Allocs[upd.Count-1]
+
+		// web task should have completed successfully while bad task
+		// retries artififact fetching
+		webstate := last.TaskStates["web"]
+		if webstate.State != structs.TaskStateDead {
+			return false, fmt.Errorf("expected web to be dead but found %q", last.TaskStates["web"].State)
+		}
+		if !webstate.Successful() {
+			return false, fmt.Errorf("expected web to have exited successfully")
+		}
+
+		// bad task should have failed
+		badstate := last.TaskStates["bad"]
+		if badstate.State != structs.TaskStateDead {
+			return false, fmt.Errorf("expected bad to be dead but found %q", last.TaskStates["web"].State)
+		}
+		if !badstate.Failed() {
+			return false, fmt.Errorf("expected bad to have failed")
+		}
+		return true, nil
+	}, func(err error) {
+		t.Fatalf("err: %v", err)
+	})
+}
+
 func TestAllocRunner_TerminalUpdate_Destroy(t *testing.T) {
 	ctestutil.ExecCompatible(t)
 	upd, ar := testAllocRunner(false)
--- a/client/task_runner.go
+++ b/client/task_runner.go
@ -318,7 +318,7 @@ func (r *TaskRunner) run() {

 			for _, artifact := range r.task.Artifacts {
 				if err := getter.GetArtifact(r.taskEnv, artifact, taskDir); err != nil {
-					r.setState(structs.TaskStateDead,
+					r.setState(structs.TaskStatePending,
 						structs.NewTaskEvent(structs.TaskArtifactDownloadFailed).SetDownloadError(err))
 					r.restartTracker.SetStartError(dstructs.NewRecoverableError(err, true))
 					goto RESTART