Merge pull request #3830 from hashicorp/f-failed-allocs-during-deploy
Reconciler should consider failed allocs when marking deployment as failed
commit 9e27c4a2a2
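In short: when computing reconcile results, the reconciler now also scans the allocations it was handed and treats the deployment as failed if any allocation belonging to that deployment has a failed client status, even before the deployment watcher has updated the deployment's status. Below is a minimal, self-contained sketch of that check, using simplified stand-in types and a hypothetical helper name; the actual change is the allocReconciler.Compute hunk further down.

package main

import "fmt"

// Simplified stand-ins for the Nomad structs; the real reconciler works on
// *structs.Allocation values grouped by task group.
type Allocation struct {
	DeploymentID string
	ClientStatus string
}

const AllocClientStatusFailed = "failed"

// deploymentHasFailedAllocs (hypothetical helper) reports whether any
// allocation in the grouped allocations belongs to the given deployment
// and has failed on the client.
func deploymentHasFailedAllocs(deploymentID string, allocsByGroup map[string][]*Allocation) bool {
	for _, allocs := range allocsByGroup {
		for _, alloc := range allocs {
			if alloc.DeploymentID == deploymentID && alloc.ClientStatus == AllocClientStatusFailed {
				return true
			}
		}
	}
	return false
}

func main() {
	allocs := map[string][]*Allocation{
		"web": {
			{DeploymentID: "dep-1", ClientStatus: "running"},
			{DeploymentID: "dep-1", ClientStatus: AllocClientStatusFailed},
		},
	}
	// One failed alloc in the deployment is enough to treat the deploy as failed.
	fmt.Println(deploymentHasFailedAllocs("dep-1", allocs)) // true
}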
@@ -2946,6 +2946,69 @@ func TestServiceSched_Reschedule_Multiple(t *testing.T) {
	assert.Equal(5, len(out)) // 2 original, plus 3 reschedule attempts
}

// Tests that deployments with failed allocs don't result in placements
func TestDeployment_FailedAllocs_NoReschedule(t *testing.T) {
	h := NewHarness(t)
	require := require.New(t)
	// Create some nodes
	var nodes []*structs.Node
	for i := 0; i < 10; i++ {
		node := mock.Node()
		nodes = append(nodes, node)
		noErr(t, h.State.UpsertNode(h.NextIndex(), node))
	}

	// Generate a fake job with allocations and a reschedule policy.
	job := mock.Job()
	job.TaskGroups[0].Count = 2
	job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
		Attempts: 1,
		Interval: 15 * time.Minute,
	}
	jobIndex := h.NextIndex()
	require.Nil(h.State.UpsertJob(jobIndex, job))

	deployment := mock.Deployment()
	deployment.JobID = job.ID
	deployment.JobCreateIndex = jobIndex
	deployment.JobVersion = job.Version

	require.Nil(h.State.UpsertDeployment(h.NextIndex(), deployment))

	var allocs []*structs.Allocation
	for i := 0; i < 2; i++ {
		alloc := mock.Alloc()
		alloc.Job = job
		alloc.JobID = job.ID
		alloc.NodeID = nodes[i].ID
		alloc.Name = fmt.Sprintf("my-job.web[%d]", i)
		alloc.DeploymentID = deployment.ID
		allocs = append(allocs, alloc)
	}
	// Mark one of the allocations as failed
	allocs[1].ClientStatus = structs.AllocClientStatusFailed

	require.Nil(h.State.UpsertAllocs(h.NextIndex(), allocs))

	// Create a mock evaluation
	eval := &structs.Evaluation{
		Namespace:   structs.DefaultNamespace,
		ID:          uuid.Generate(),
		Priority:    50,
		TriggeredBy: structs.EvalTriggerNodeUpdate,
		JobID:       job.ID,
		Status:      structs.EvalStatusPending,
	}
	require.Nil(h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval}))

	// Process the evaluation
	require.Nil(h.Process(NewServiceScheduler, eval))

	// Verify no plan created
	require.Equal(0, len(h.Plans))

}

func TestBatchSched_Run_CompleteAlloc(t *testing.T) {
	h := NewHarness(t)

@@ -159,8 +159,20 @@ func (a *allocReconciler) Compute() *reconcileResults {

	// Detect if the deployment is paused
	if a.deployment != nil {
		// Detect if any allocs associated with this deploy have failed
		// Failed allocations could edge trigger an evaluation before the deployment watcher
		// runs and marks the deploy as failed. This block makes sure that is still
		// considered a failed deploy
		failedAllocsInDeploy := false
		for _, as := range m {
			for _, alloc := range as {
				if alloc.DeploymentID == a.deployment.ID && alloc.ClientStatus == structs.AllocClientStatusFailed {
					failedAllocsInDeploy = true
				}
			}
		}
		a.deploymentPaused = a.deployment.Status == structs.DeploymentStatusPaused
-		a.deploymentFailed = a.deployment.Status == structs.DeploymentStatusFailed
+		a.deploymentFailed = a.deployment.Status == structs.DeploymentStatusFailed || failedAllocsInDeploy
	}

	// Reconcile each group
@@ -74,6 +74,7 @@ Update stanza Tests:
√ Change job change while scaling up
√ Update the job when all allocations from the previous job haven't been placed yet.
√ Paused or failed deployment doesn't do any rescheduling of failed allocs
√ Running deployment with failed allocs doesn't do any rescheduling of failed allocs
*/

var (
@@ -3350,3 +3351,48 @@ func TestReconciler_FailedDeployment_DontReschedule(t *testing.T) {
		},
	})
}

// Test that a running deployment with failed allocs will not result in rescheduling failed allocations
func TestReconciler_DeploymentWithFailedAllocs_DontReschedule(t *testing.T) {
	job := mock.Job()
	job.TaskGroups[0].Update = noCanaryUpdate

	// Mock deployment with failed allocs, but deployment watcher hasn't marked it as failed yet
	d := structs.NewDeployment(job)
	d.Status = structs.DeploymentStatusRunning
	d.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{
		Promoted:     false,
		DesiredTotal: 5,
		PlacedAllocs: 4,
	}

	// Create 4 allocations and mark two as failed
	var allocs []*structs.Allocation
	for i := 0; i < 4; i++ {
		alloc := mock.Alloc()
		alloc.Job = job
		alloc.JobID = job.ID
		alloc.NodeID = uuid.Generate()
		alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
		alloc.TaskGroup = job.TaskGroups[0].Name
		alloc.DeploymentID = d.ID
		allocs = append(allocs, alloc)
	}
	allocs[2].ClientStatus = structs.AllocClientStatusFailed
	allocs[3].ClientStatus = structs.AllocClientStatusFailed

	reconciler := NewAllocReconciler(testLogger(), allocUpdateFnDestructive, false, job.ID, job, d, allocs, nil)
	r := reconciler.Compute()

	// Assert that no rescheduled placements were created
	assertResults(t, r, &resultExpectation{
		place:             0,
		createDeployment:  nil,
		deploymentUpdates: nil,
		desiredTGUpdates: map[string]*structs.DesiredUpdates{
			job.TaskGroups[0].Name: {
				Ignore: 2,
			},
		},
	})
}