Merge pull request #3496 from hashicorp/b-auto-revert-stable

Makes auto reverts robust against infinite revert cycles
Preetha 2017-11-03 17:28:28 -05:00 committed by GitHub
commit 0d0804d6ff
4 changed files with 289 additions and 4 deletions
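
In short, the deployment watcher now compares the candidate rollback job against the job currently being deployed before reverting; if the two specs are identical it skips the rollback and simply marks the deployment failed. A rough sketch of that guard, simplified from the handleRollbackValidity helper added in the diff below (guardRollback is an illustrative name, not the actual watcher code; structs is nomad/structs):

// guardRollback illustrates the new check: only revert when the stable job
// actually differs from the job that is failing.
func guardRollback(current, stable *structs.Job, desc string) (*structs.Job, string) {
	if current.SpecChanged(stable) {
		// Different spec: rolling back is meaningful.
		return stable, structs.DeploymentStatusDescriptionRollback(desc, stable.Version)
	}
	// Identical spec: reverting would redeploy the same job and fail again,
	// so drop the rollback target and record a no-op description instead.
	return nil, structs.DeploymentStatusDescriptionRollbackNoop(desc, stable.Version)
}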

View File

@ -293,6 +293,8 @@ func TestDeploymentEndpoint_Fail_Rollback(t *testing.T) {
// Create the second job, deployment and alloc
j2 := j.Copy()
j2.Stable = false
// Modify the job to make its specification different
j2.Meta["foo"] = "bar"
d := mock.Deployment()
d.TaskGroups["web"].AutoRevert = true
@ -792,7 +794,8 @@ func TestDeploymentEndpoint_SetAllocHealth_Rollback(t *testing.T) {
// Create the second job, deployment and alloc
j2 := j.Copy()
j2.Stable = false
// Modify the job to make its specification different
j2.Meta["foo"] = "bar"
d := mock.Deployment()
d.TaskGroups["web"].AutoRevert = true
d.JobID = j2.ID
@ -857,6 +860,93 @@ func TestDeploymentEndpoint_SetAllocHealth_Rollback(t *testing.T) {
assert.EqualValues(2, jout.Version, "reverted job version")
}
// Tests that an alloc health failure does not trigger a rollback when the stable job has an identical spec
func TestDeploymentEndpoint_SetAllocHealth_NoRollback(t *testing.T) {
t.Parallel()
s1 := testServer(t, func(c *Config) {
c.NumSchedulers = 0 // Prevent automatic dequeue
})
defer s1.Shutdown()
codec := rpcClient(t, s1)
testutil.WaitForLeader(t, s1.RPC)
assert := assert.New(t)
state := s1.fsm.State()
// Create the original job
j := mock.Job()
j.Stable = true
j.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
j.TaskGroups[0].Update.MaxParallel = 2
j.TaskGroups[0].Update.AutoRevert = true
assert.Nil(state.UpsertJob(998, j), "UpsertJob")
// Create the second job, deployment and alloc. Job has same spec as original
j2 := j.Copy()
j2.Stable = false
d := mock.Deployment()
d.TaskGroups["web"].AutoRevert = true
d.JobID = j2.ID
d.JobVersion = j2.Version
a := mock.Alloc()
a.JobID = j.ID
a.DeploymentID = d.ID
assert.Nil(state.UpsertJob(999, j2), "UpsertJob")
assert.Nil(state.UpsertDeployment(1000, d), "UpsertDeployment")
assert.Nil(state.UpsertAllocs(1001, []*structs.Allocation{a}), "UpsertAllocs")
// Set the alloc as unhealthy
req := &structs.DeploymentAllocHealthRequest{
DeploymentID: d.ID,
UnhealthyAllocationIDs: []string{a.ID},
WriteRequest: structs.WriteRequest{Region: "global"},
}
// Fetch the response
var resp structs.DeploymentUpdateResponse
assert.Nil(msgpackrpc.CallWithCodec(codec, "Deployment.SetAllocHealth", req, &resp), "RPC")
assert.NotZero(resp.Index, "bad response index")
assert.Nil(resp.RevertedJobVersion, "revert version must be nil")
// Lookup the evaluation
ws := memdb.NewWatchSet()
eval, err := state.EvalByID(ws, resp.EvalID)
assert.Nil(err, "EvalByID failed")
assert.NotNil(eval, "Expect eval")
assert.Equal(eval.CreateIndex, resp.EvalCreateIndex, "eval index mismatch")
assert.Equal(eval.TriggeredBy, structs.EvalTriggerDeploymentWatcher, "eval trigger")
assert.Equal(eval.JobID, d.JobID, "eval job id")
assert.Equal(eval.DeploymentID, d.ID, "eval deployment id")
assert.Equal(eval.Status, structs.EvalStatusPending, "eval status")
// Lookup the deployment
expectedDesc := structs.DeploymentStatusDescriptionRollbackNoop(structs.DeploymentStatusDescriptionFailedAllocations, 0)
dout, err := state.DeploymentByID(ws, d.ID)
assert.Nil(err, "DeploymentByID failed")
assert.Equal(dout.Status, structs.DeploymentStatusFailed, "wrong status")
assert.Equal(dout.StatusDescription, expectedDesc, "wrong status description")
assert.Equal(resp.DeploymentModifyIndex, dout.ModifyIndex, "wrong modify index")
assert.Len(dout.TaskGroups, 1, "should have one group")
assert.Contains(dout.TaskGroups, "web", "should have web group")
assert.Equal(1, dout.TaskGroups["web"].UnhealthyAllocs, "should have one unhealthy")
// Lookup the allocation
aout, err := state.AllocByID(ws, a.ID)
assert.Nil(err, "AllocByID")
assert.NotNil(aout, "alloc")
assert.NotNil(aout.DeploymentStatus, "alloc deployment status")
assert.NotNil(aout.DeploymentStatus.Healthy, "alloc deployment healthy")
assert.False(*aout.DeploymentStatus.Healthy, "alloc deployment healthy")
// Lookup the job, its version should not have changed
jout, err := state.JobByID(ws, j.Namespace, j.ID)
assert.Nil(err, "JobByID")
assert.NotNil(jout, "job")
assert.EqualValues(1, jout.Version, "job version should be unchanged (no revert)")
}
func TestDeploymentEndpoint_List(t *testing.T) {
t.Parallel()
s1 := testServer(t, nil)

View File

@ -149,7 +149,7 @@ func (w *deploymentWatcher) SetAllocHealth(
}
if j != nil {
desc = structs.DeploymentStatusDescriptionRollback(desc, j.Version)
j, desc = w.handleRollbackValidity(j, desc)
}
break
}
@ -185,6 +185,21 @@ func (w *deploymentWatcher) SetAllocHealth(
return nil
}
// handleRollbackValidity checks whether the job being rolled back to has the same spec as the current job.
// It returns the rollback job (nil if rolling back would be a no-op) and an updated status description.
func (w *deploymentWatcher) handleRollbackValidity(rollbackJob *structs.Job, desc string) (*structs.Job, string) {
// Only roll back if the job we would revert to has a different spec.
// This prevents an infinite revert cycle when a previously stable version of the job fails to start up during a rollback.
// If the job we are trying to roll back to is identical to the current job, stop because the rollback will not succeed.
if w.j.SpecChanged(rollbackJob) {
desc = structs.DeploymentStatusDescriptionRollback(desc, rollbackJob.Version)
} else {
desc = structs.DeploymentStatusDescriptionRollbackNoop(desc, rollbackJob.Version)
rollbackJob = nil
}
return rollbackJob, desc
}
func (w *deploymentWatcher) PromoteDeployment(
req *structs.DeploymentPromoteRequest,
resp *structs.DeploymentUpdateResponse) error {
@ -265,7 +280,7 @@ func (w *deploymentWatcher) FailDeployment(
}
if rollbackJob != nil {
desc = structs.DeploymentStatusDescriptionRollback(desc, rollbackJob.Version)
rollbackJob, desc = w.handleRollbackValidity(rollbackJob, desc)
} else {
desc = structs.DeploymentStatusDescriptionNoRollbackTarget(desc)
}
@ -371,7 +386,7 @@ func (w *deploymentWatcher) watch() {
// Description should include that the job is being rolled back to
// version N
if j != nil {
desc = structs.DeploymentStatusDescriptionRollback(desc, j.Version)
j, desc = w.handleRollbackValidity(j, desc)
} else {
desc = structs.DeploymentStatusDescriptionNoRollbackTarget(desc)
}
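
These tests lean on what structs.Job.SpecChanged counts as a spec change: bookkeeping fields such as Stable and Version are ignored, while user-specified fields such as Meta are compared (which is why the rollback tests set j2.Meta["foo"] = "bar" to force a different spec). A minimal sketch of that assumption, using the mock helpers the tests already import; the exact set of ignored fields is an assumption here:

package main

import (
	"fmt"

	"github.com/hashicorp/nomad/nomad/mock"
)

func main() {
	j := mock.Job()

	// Only Stable flipped: not a spec change, so rolling back to this
	// version would be a no-op and is now skipped.
	sameSpec := j.Copy()
	sameSpec.Stable = false
	fmt.Println(j.SpecChanged(sameSpec)) // expected: false

	// Meta edited (as the tests above do): the spec differs, so a
	// rollback is a real revert.
	newSpec := j.Copy()
	newSpec.Meta["foo"] = "bar"
	fmt.Println(j.SpecChanged(newSpec)) // expected: true
}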

View File

@ -281,6 +281,9 @@ func TestWatcher_SetAllocHealth_Unhealthy_Rollback(t *testing.T) {
// Upsert the job again to get a new version
j2 := j.Copy()
j2.Stable = false
// Modify the job to make its specification different
j2.Meta["foo"] = "bar"
assert.Nil(m.state.UpsertJob(m.nextIndex(), j2), "UpsertJob2")
w.SetEnabled(true, m.state)
@ -316,6 +319,66 @@ func TestWatcher_SetAllocHealth_Unhealthy_Rollback(t *testing.T) {
m.AssertNumberOfCalls(t, "UpdateDeploymentAllocHealth", 1)
}
// Tests that setting an allocation unhealthy does not trigger a rollback when the stable job has an identical spec
func TestWatcher_SetAllocHealth_Unhealthy_NoRollback(t *testing.T) {
t.Parallel()
assert := assert.New(t)
w, m := defaultTestDeploymentWatcher(t)
// Create a job, alloc, and a deployment
j := mock.Job()
j.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
j.TaskGroups[0].Update.MaxParallel = 2
j.TaskGroups[0].Update.AutoRevert = true
j.Stable = true
d := mock.Deployment()
d.JobID = j.ID
d.TaskGroups["web"].AutoRevert = true
a := mock.Alloc()
a.DeploymentID = d.ID
assert.Nil(m.state.UpsertJob(m.nextIndex(), j), "UpsertJob")
assert.Nil(m.state.UpsertDeployment(m.nextIndex(), d), "UpsertDeployment")
assert.Nil(m.state.UpsertAllocs(m.nextIndex(), []*structs.Allocation{a}), "UpsertAllocs")
// Upsert the job again to get a new version
j2 := j.Copy()
j2.Stable = false
assert.Nil(m.state.UpsertJob(m.nextIndex(), j2), "UpsertJob2")
w.SetEnabled(true, m.state)
testutil.WaitForResult(func() (bool, error) { return 1 == len(w.watchers), nil },
func(err error) { assert.Equal(1, len(w.watchers), "Should have 1 deployment") })
// Assert that we get a call to UpdateDeploymentAllocHealth
matchConfig := &matchDeploymentAllocHealthRequestConfig{
DeploymentID: d.ID,
Unhealthy: []string{a.ID},
Eval: true,
DeploymentUpdate: &structs.DeploymentStatusUpdate{
DeploymentID: d.ID,
Status: structs.DeploymentStatusFailed,
StatusDescription: structs.DeploymentStatusDescriptionFailedAllocations,
},
JobVersion: nil,
}
matcher := matchDeploymentAllocHealthRequest(matchConfig)
m.On("UpdateDeploymentAllocHealth", mocker.MatchedBy(matcher)).Return(nil)
// Call SetAllocHealth
req := &structs.DeploymentAllocHealthRequest{
DeploymentID: d.ID,
UnhealthyAllocationIDs: []string{a.ID},
}
var resp structs.DeploymentUpdateResponse
err := w.SetAllocHealth(req, &resp)
assert.Nil(err, "SetAllocHealth")
testutil.WaitForResult(func() (bool, error) { return 0 == len(w.watchers), nil },
func(err error) { assert.Equal(0, len(w.watchers), "Should have no deployment") })
m.AssertNumberOfCalls(t, "UpdateDeploymentAllocHealth", 1)
}
// Test promoting a deployment
func TestWatcher_PromoteDeployment_HealthyCanaries(t *testing.T) {
t.Parallel()
@ -639,6 +702,8 @@ func TestDeploymentWatcher_Watch(t *testing.T) {
// Upsert the job again to get a new version
j2 := j.Copy()
// Modify the job to make its specification different
j2.Meta["foo"] = "bar"
j2.Stable = false
assert.Nil(m.state.UpsertJob(m.nextIndex(), j2), "UpsertJob2")
@ -734,6 +799,115 @@ func TestDeploymentWatcher_Watch(t *testing.T) {
func(err error) { assert.Equal(0, len(w.watchers), "Should have no deployment") })
}
// Tests that the watcher fails the deployment without rolling back when the spec hasn't changed
func TestDeploymentWatcher_RollbackFailed(t *testing.T) {
t.Parallel()
assert := assert.New(t)
w, m := testDeploymentWatcher(t, 1000.0, 1*time.Millisecond)
// Create a job, alloc, and a deployment
j := mock.Job()
j.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
j.TaskGroups[0].Update.MaxParallel = 2
j.TaskGroups[0].Update.AutoRevert = true
j.Stable = true
d := mock.Deployment()
d.JobID = j.ID
d.TaskGroups["web"].AutoRevert = true
a := mock.Alloc()
a.DeploymentID = d.ID
assert.Nil(m.state.UpsertJob(m.nextIndex(), j), "UpsertJob")
assert.Nil(m.state.UpsertDeployment(m.nextIndex(), d), "UpsertDeployment")
assert.Nil(m.state.UpsertAllocs(m.nextIndex(), []*structs.Allocation{a}), "UpsertAllocs")
// Upsert the job again to get a new version
j2 := j.Copy()
// Leave the spec unchanged so the attempted rollback is a no-op; only mark the new version unstable
j2.Stable = false
assert.Nil(m.state.UpsertJob(m.nextIndex(), j2), "UpsertJob2")
w.SetEnabled(true, m.state)
testutil.WaitForResult(func() (bool, error) { return 1 == len(w.watchers), nil },
func(err error) { assert.Equal(1, len(w.watchers), "Should have 1 deployment") })
// Assert that we will get an UpsertEvals call only once. This
// verifies that the watcher is batching allocation changes
m1 := matchUpsertEvals([]string{d.ID})
m.On("UpsertEvals", mocker.MatchedBy(m1)).Return(nil).Once()
// Update the alloc's health to healthy, which should create an evaluation
for i := 0; i < 5; i++ {
req := &structs.ApplyDeploymentAllocHealthRequest{
DeploymentAllocHealthRequest: structs.DeploymentAllocHealthRequest{
DeploymentID: d.ID,
HealthyAllocationIDs: []string{a.ID},
},
}
assert.Nil(m.state.UpdateDeploymentAllocHealth(m.nextIndex(), req), "UpsertDeploymentAllocHealth")
}
// Wait for there to be one eval
testutil.WaitForResult(func() (bool, error) {
ws := memdb.NewWatchSet()
evals, err := m.state.EvalsByJob(ws, j.Namespace, j.ID)
if err != nil {
return false, err
}
if l := len(evals); l != 1 {
return false, fmt.Errorf("Got %d evals; want 1", l)
}
return true, nil
}, func(err error) {
t.Fatal(err)
})
// Assert that we get a call to UpdateDeploymentStatus with a failed status and a rollback-noop description
c := &matchDeploymentStatusUpdateConfig{
DeploymentID: d.ID,
Status: structs.DeploymentStatusFailed,
StatusDescription: structs.DeploymentStatusDescriptionRollbackNoop(structs.DeploymentStatusDescriptionFailedAllocations, 0),
JobVersion: nil,
Eval: true,
}
m2 := matchDeploymentStatusUpdateRequest(c)
m.On("UpdateDeploymentStatus", mocker.MatchedBy(m2)).Return(nil)
// Update the alloc's health to unhealthy, which should attempt a rollback,
// fail at that step because the spec is unchanged, and emit a status update and eval
req2 := &structs.ApplyDeploymentAllocHealthRequest{
DeploymentAllocHealthRequest: structs.DeploymentAllocHealthRequest{
DeploymentID: d.ID,
UnhealthyAllocationIDs: []string{a.ID},
},
}
assert.Nil(m.state.UpdateDeploymentAllocHealth(m.nextIndex(), req2), "UpsertDeploymentAllocHealth")
// Wait for there to be two evals
testutil.WaitForResult(func() (bool, error) {
ws := memdb.NewWatchSet()
evals, err := m.state.EvalsByJob(ws, j.Namespace, j.ID)
if err != nil {
return false, err
}
if l := len(evals); l != 2 {
return false, fmt.Errorf("Got %d evals; want 1", l)
}
return true, nil
}, func(err error) {
t.Fatal(err)
})
m.AssertCalled(t, "UpsertEvals", mocker.MatchedBy(m1))
// Verify that the latest job version is unchanged, i.e. no rollback version was created
jout, err := m.state.JobByID(nil, structs.DefaultNamespace, j.ID)
assert.Nil(err, "JobByID")
assert.NotNil(jout, "job")
assert.EqualValues(1, jout.Version, "job version should be unchanged because no rollback occurred")
}
// Test evaluations are batched between watchers
func TestWatcher_BatchEvals(t *testing.T) {
t.Parallel()

View File

@ -4488,6 +4488,12 @@ func DeploymentStatusDescriptionRollback(baseDescription string, jobVersion uint
return fmt.Sprintf("%s - rolling back to job version %d", baseDescription, jobVersion)
}
// DeploymentStatusDescriptionRollbackNoop is used to get the status description of
// a deployment when rolling back is skipped because the current job has the same specification as the stable version
func DeploymentStatusDescriptionRollbackNoop(baseDescription string, jobVersion uint64) string {
return fmt.Sprintf("%s - not rolling back to stable job version %d as current job has same specification", baseDescription, jobVersion)
}
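
For reference, combined with the failed-allocations base description this produces a message like the one the new tests assert on; the exact base text is assumed from the existing DeploymentStatusDescriptionFailedAllocations constant:

desc := DeploymentStatusDescriptionRollbackNoop(DeploymentStatusDescriptionFailedAllocations, 0)
// Assuming the base constant reads "Failed due to unhealthy allocations", desc is:
//   "Failed due to unhealthy allocations - not rolling back to stable job version 0 as current job has same specification"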
// DeploymentStatusDescriptionNoRollbackTarget is used to get the status description of
// a deployment when there is no target to rollback to but autorevert is desired.
func DeploymentStatusDescriptionNoRollbackTarget(baseDescription string) string {