Merge pull request #3830 from hashicorp/f-failed-allocs-during-deploy

Reconciler should consider failed allocs when marking deployment as failed
This commit is contained in:
Preetha 2018-02-05 17:21:37 -06:00 committed by GitHub
commit 9e27c4a2a2
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 122 additions and 1 deletions

View file

@ -2946,6 +2946,69 @@ func TestServiceSched_Reschedule_Multiple(t *testing.T) {
assert.Equal(5, len(out)) // 2 original, plus 3 reschedule attempts
}
// TestDeployment_FailedAllocs_NoReschedule verifies that processing an
// evaluation for a job whose deployment contains a failed allocation yields
// no plan at all, even though the task group carries a reschedule policy.
func TestDeployment_FailedAllocs_NoReschedule(t *testing.T) {
	h := NewHarness(t)
	req := require.New(t)

	// Register a pool of mock nodes in the state store.
	const nodeCount = 10
	nodes := make([]*structs.Node, 0, nodeCount)
	for len(nodes) < nodeCount {
		n := mock.Node()
		nodes = append(nodes, n)
		noErr(t, h.State.UpsertNode(h.NextIndex(), n))
	}

	// Build a two-instance job whose task group has a reschedule policy.
	job := mock.Job()
	job.TaskGroups[0].Count = 2
	job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
		Attempts: 1,
		Interval: 15 * time.Minute,
	}
	// The index is captured because the deployment below must reference it
	// as the job's create index.
	jobIdx := h.NextIndex()
	req.Nil(h.State.UpsertJob(jobIdx, job))

	// Tie a mock deployment to the job that was just stored.
	dep := mock.Deployment()
	dep.JobID = job.ID
	dep.JobCreateIndex = jobIdx
	dep.JobVersion = job.Version
	req.Nil(h.State.UpsertDeployment(h.NextIndex(), dep))

	// Two allocations belong to the deployment; the second one is failed.
	allocs := make([]*structs.Allocation, 2)
	for i := range allocs {
		a := mock.Alloc()
		a.Job = job
		a.JobID = job.ID
		a.NodeID = nodes[i].ID
		a.Name = fmt.Sprintf("my-job.web[%d]", i)
		a.DeploymentID = dep.ID
		if i == 1 {
			a.ClientStatus = structs.AllocClientStatusFailed
		}
		allocs[i] = a
	}
	req.Nil(h.State.UpsertAllocs(h.NextIndex(), allocs))

	// Store a pending node-update evaluation for the job.
	eval := &structs.Evaluation{
		Namespace:   structs.DefaultNamespace,
		ID:          uuid.Generate(),
		Priority:    50,
		TriggeredBy: structs.EvalTriggerNodeUpdate,
		JobID:       job.ID,
		Status:      structs.EvalStatusPending,
	}
	evals := []*structs.Evaluation{eval}
	req.Nil(h.State.UpsertEvals(h.NextIndex(), evals))

	// Run the service scheduler against the evaluation.
	req.Nil(h.Process(NewServiceScheduler, eval))

	// The scheduler must not have submitted any plan.
	planCount := len(h.Plans)
	req.Equal(0, planCount)
}
func TestBatchSched_Run_CompleteAlloc(t *testing.T) {
h := NewHarness(t)

View file

@ -159,8 +159,20 @@ func (a *allocReconciler) Compute() *reconcileResults {
// Detect if the deployment is paused
if a.deployment != nil {
// Detect if any allocs associated with this deploy have failed
// A failed allocation can edge-trigger an evaluation before the deployment
// watcher runs and marks the deployment as failed. This check ensures that
// such a deployment is still treated as failed.
failedAllocsInDeploy := false
for _, as := range m {
for _, alloc := range as {
if alloc.DeploymentID == a.deployment.ID && alloc.ClientStatus == structs.AllocClientStatusFailed {
failedAllocsInDeploy = true
}
}
}
a.deploymentPaused = a.deployment.Status == structs.DeploymentStatusPaused
a.deploymentFailed = a.deployment.Status == structs.DeploymentStatusFailed
a.deploymentFailed = a.deployment.Status == structs.DeploymentStatusFailed || failedAllocsInDeploy
}
// Reconcile each group

View file

@ -74,6 +74,7 @@ Update stanza Tests:
Change job change while scaling up
Update the job when all allocations from the previous job haven't been placed yet.
Paused or failed deployment doesn't do any rescheduling of failed allocs
Running deployment with failed allocs doesn't do any rescheduling of failed allocs
*/
var (
@ -3350,3 +3351,48 @@ func TestReconciler_FailedDeployment_DontReschedule(t *testing.T) {
},
})
}
// TestReconciler_DeploymentWithFailedAllocs_DontReschedule verifies that when
// a deployment is still marked running but some of its allocations have
// failed, the reconciler creates no placements for the failed allocations.
func TestReconciler_DeploymentWithFailedAllocs_DontReschedule(t *testing.T) {
	job := mock.Job()
	job.TaskGroups[0].Update = noCanaryUpdate
	groupName := job.TaskGroups[0].Name

	// The deployment is left in the running state: the deployment watcher
	// has not yet marked it as failed despite the failed allocations.
	d := structs.NewDeployment(job)
	d.Status = structs.DeploymentStatusRunning
	d.TaskGroups[groupName] = &structs.DeploymentState{
		Promoted:     false,
		DesiredTotal: 5,
		PlacedAllocs: 4,
	}

	// Four allocations belong to the deployment; the last two are failed.
	allocs := make([]*structs.Allocation, 4)
	for i := range allocs {
		a := mock.Alloc()
		a.Job = job
		a.JobID = job.ID
		a.NodeID = uuid.Generate()
		a.Name = structs.AllocName(job.ID, groupName, uint(i))
		a.TaskGroup = groupName
		a.DeploymentID = d.ID
		if i >= 2 {
			a.ClientStatus = structs.AllocClientStatusFailed
		}
		allocs[i] = a
	}

	results := NewAllocReconciler(testLogger(), allocUpdateFnDestructive, false, job.ID, job, d, allocs, nil).Compute()

	// No placements, no new deployment and no deployment updates are
	// expected; the group's desired updates should only report two ignores.
	want := &resultExpectation{
		place:             0,
		createDeployment:  nil,
		deploymentUpdates: nil,
		desiredTGUpdates: map[string]*structs.DesiredUpdates{
			groupName: &structs.DesiredUpdates{
				Ignore: 2,
			},
		},
	}
	assertResults(t, results, want)
}