Merge pull request #5205 from hashicorp/b-queuedallocs-bugfix
Reconcile child summaries correctly
This commit is contained in:
commit
2e71168e5f
|
@ -10,7 +10,6 @@ name = "client1"
|
|||
# Enable the client
|
||||
client {
|
||||
enabled = true
|
||||
|
||||
server_join {
|
||||
retry_join = ["127.0.0.1:4647", "127.0.0.1:5647", "127.0.0.1:6647"]
|
||||
}
|
||||
|
|
|
@ -1401,6 +1401,11 @@ func (n *nomadFSM) reconcileQueuedAllocations(index uint64) error {
|
|||
break
|
||||
}
|
||||
job := rawJob.(*structs.Job)
|
||||
|
||||
// Nothing to do for queued allocations if the job is a parent periodic/parameterized job
|
||||
if job.IsParameterized() || job.IsPeriodic() {
|
||||
continue
|
||||
}
|
||||
planner := &scheduler.Harness{
|
||||
State: &snap.StateStore,
|
||||
}
|
||||
|
|
|
@ -2816,6 +2816,78 @@ func TestFSM_ReconcileSummaries(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
// COMPAT: Remove in 0.11
|
||||
func TestFSM_ReconcileParentJobSummary(t *testing.T) {
|
||||
// This test exercises code to handle https://github.com/hashicorp/nomad/issues/3886
|
||||
t.Parallel()
|
||||
|
||||
require := require.New(t)
|
||||
// Add some state
|
||||
fsm := testFSM(t)
|
||||
state := fsm.State()
|
||||
|
||||
// Add a node
|
||||
node := mock.Node()
|
||||
state.UpsertNode(800, node)
|
||||
|
||||
// Make a parameterized job
|
||||
job1 := mock.BatchJob()
|
||||
job1.ID = "test"
|
||||
job1.ParameterizedJob = &structs.ParameterizedJobConfig{
|
||||
Payload: "random",
|
||||
}
|
||||
job1.TaskGroups[0].Count = 1
|
||||
state.UpsertJob(1000, job1)
|
||||
|
||||
// Make a child job
|
||||
childJob := job1.Copy()
|
||||
childJob.ID = job1.ID + "dispatch-23423423"
|
||||
childJob.ParentID = job1.ID
|
||||
childJob.Dispatched = true
|
||||
childJob.Status = structs.JobStatusRunning
|
||||
|
||||
// Create an alloc for child job
|
||||
alloc := mock.Alloc()
|
||||
alloc.NodeID = node.ID
|
||||
alloc.Job = childJob
|
||||
alloc.JobID = childJob.ID
|
||||
alloc.ClientStatus = structs.AllocClientStatusRunning
|
||||
|
||||
state.UpsertJob(1010, childJob)
|
||||
state.UpsertAllocs(1011, []*structs.Allocation{alloc})
|
||||
|
||||
// Make the summary incorrect in the state store
|
||||
summary, err := state.JobSummaryByID(nil, job1.Namespace, job1.ID)
|
||||
require.Nil(err)
|
||||
|
||||
summary.Children = nil
|
||||
summary.Summary = make(map[string]structs.TaskGroupSummary)
|
||||
summary.Summary["web"] = structs.TaskGroupSummary{
|
||||
Queued: 1,
|
||||
}
|
||||
|
||||
req := structs.GenericRequest{}
|
||||
buf, err := structs.Encode(structs.ReconcileJobSummariesRequestType, req)
|
||||
require.Nil(err)
|
||||
|
||||
resp := fsm.Apply(makeLog(buf))
|
||||
require.Nil(resp)
|
||||
|
||||
ws := memdb.NewWatchSet()
|
||||
out1, _ := state.JobSummaryByID(ws, job1.Namespace, job1.ID)
|
||||
expected := structs.JobSummary{
|
||||
JobID: job1.ID,
|
||||
Namespace: job1.Namespace,
|
||||
Summary: make(map[string]structs.TaskGroupSummary),
|
||||
CreateIndex: 1000,
|
||||
ModifyIndex: out1.ModifyIndex,
|
||||
Children: &structs.JobChildrenSummary{
|
||||
Running: 1,
|
||||
},
|
||||
}
|
||||
require.Equal(&expected, out1)
|
||||
}
|
||||
|
||||
func TestFSM_LeakedDeployments(t *testing.T) {
|
||||
t.Parallel()
|
||||
require := require.New(t)
|
||||
|
|
|
@ -6,6 +6,8 @@ import (
|
|||
"sort"
|
||||
"time"
|
||||
|
||||
"reflect"
|
||||
|
||||
log "github.com/hashicorp/go-hclog"
|
||||
"github.com/hashicorp/go-memdb"
|
||||
multierror "github.com/hashicorp/go-multierror"
|
||||
|
@ -3049,12 +3051,86 @@ func (s *StateStore) ReconcileJobSummaries(index uint64) error {
|
|||
if err != nil {
|
||||
return err
|
||||
}
|
||||
// COMPAT: Remove after 0.11
|
||||
// Iterate over jobs to build a list of parent jobs and their children
|
||||
parentMap := make(map[string][]*structs.Job)
|
||||
for {
|
||||
rawJob := iter.Next()
|
||||
if rawJob == nil {
|
||||
break
|
||||
}
|
||||
job := rawJob.(*structs.Job)
|
||||
if job.ParentID != "" {
|
||||
children := parentMap[job.ParentID]
|
||||
children = append(children, job)
|
||||
parentMap[job.ParentID] = children
|
||||
}
|
||||
}
|
||||
|
||||
// Get all the jobs again
|
||||
iter, err = txn.Get("jobs", "id")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for {
|
||||
rawJob := iter.Next()
|
||||
if rawJob == nil {
|
||||
break
|
||||
}
|
||||
job := rawJob.(*structs.Job)
|
||||
|
||||
if job.IsParameterized() || job.IsPeriodic() {
|
||||
// COMPAT: Remove after 0.11
|
||||
|
||||
// The following block of code fixes incorrect child summaries due to a bug
|
||||
// See https://github.com/hashicorp/nomad/issues/3886 for details
|
||||
rawSummary, err := txn.First("job_summary", "id", job.Namespace, job.ID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if rawSummary == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
oldSummary := rawSummary.(*structs.JobSummary)
|
||||
|
||||
// Create an empty summary
|
||||
summary := &structs.JobSummary{
|
||||
JobID: job.ID,
|
||||
Namespace: job.Namespace,
|
||||
Summary: make(map[string]structs.TaskGroupSummary),
|
||||
Children: &structs.JobChildrenSummary{},
|
||||
}
|
||||
|
||||
// Iterate over children of this job if any to fix summary counts
|
||||
children := parentMap[job.ID]
|
||||
for _, childJob := range children {
|
||||
switch childJob.Status {
|
||||
case structs.JobStatusPending:
|
||||
summary.Children.Pending++
|
||||
case structs.JobStatusDead:
|
||||
summary.Children.Dead++
|
||||
case structs.JobStatusRunning:
|
||||
summary.Children.Running++
|
||||
}
|
||||
}
|
||||
|
||||
// Insert the job summary if its different
|
||||
if !reflect.DeepEqual(summary, oldSummary) {
|
||||
// Set the create index of the summary same as the job's create index
|
||||
// and the modify index to the current index
|
||||
summary.CreateIndex = job.CreateIndex
|
||||
summary.ModifyIndex = index
|
||||
|
||||
if err := txn.Insert("job_summary", summary); err != nil {
|
||||
return fmt.Errorf("error inserting job summary: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Done with handling a parent job, continue to next
|
||||
continue
|
||||
}
|
||||
|
||||
// Create a job summary for the job
|
||||
summary := &structs.JobSummary{
|
||||
|
|
|
@ -4354,6 +4354,95 @@ func TestStateStore_ReconcileJobSummary(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
func TestStateStore_ReconcileParentJobSummary(t *testing.T) {
|
||||
t.Parallel()
|
||||
require := require.New(t)
|
||||
state := testStateStore(t)
|
||||
|
||||
// Add a node
|
||||
node := mock.Node()
|
||||
state.UpsertNode(80, node)
|
||||
|
||||
// Make a parameterized job
|
||||
job1 := mock.BatchJob()
|
||||
job1.ID = "test"
|
||||
job1.ParameterizedJob = &structs.ParameterizedJobConfig{
|
||||
Payload: "random",
|
||||
}
|
||||
job1.TaskGroups[0].Count = 1
|
||||
state.UpsertJob(100, job1)
|
||||
|
||||
// Make a child job
|
||||
childJob := job1.Copy()
|
||||
childJob.ID = job1.ID + "dispatch-23423423"
|
||||
childJob.ParentID = job1.ID
|
||||
childJob.Dispatched = true
|
||||
childJob.Status = structs.JobStatusRunning
|
||||
|
||||
// Make some allocs for child job
|
||||
alloc := mock.Alloc()
|
||||
alloc.NodeID = node.ID
|
||||
alloc.Job = childJob
|
||||
alloc.JobID = childJob.ID
|
||||
alloc.ClientStatus = structs.AllocClientStatusRunning
|
||||
|
||||
alloc2 := mock.Alloc()
|
||||
alloc2.NodeID = node.ID
|
||||
alloc2.Job = childJob
|
||||
alloc2.JobID = childJob.ID
|
||||
alloc2.ClientStatus = structs.AllocClientStatusFailed
|
||||
|
||||
require.Nil(state.UpsertJob(110, childJob))
|
||||
require.Nil(state.UpsertAllocs(111, []*structs.Allocation{alloc, alloc2}))
|
||||
|
||||
// Make the summary incorrect in the state store
|
||||
summary, err := state.JobSummaryByID(nil, job1.Namespace, job1.ID)
|
||||
require.Nil(err)
|
||||
|
||||
summary.Children = nil
|
||||
summary.Summary = make(map[string]structs.TaskGroupSummary)
|
||||
summary.Summary["web"] = structs.TaskGroupSummary{
|
||||
Queued: 1,
|
||||
}
|
||||
|
||||
// Delete the child job summary
|
||||
state.DeleteJobSummary(125, childJob.Namespace, childJob.ID)
|
||||
|
||||
state.ReconcileJobSummaries(120)
|
||||
|
||||
ws := memdb.NewWatchSet()
|
||||
|
||||
// Verify parent summary is corrected
|
||||
summary, _ = state.JobSummaryByID(ws, alloc.Namespace, job1.ID)
|
||||
expectedSummary := structs.JobSummary{
|
||||
JobID: job1.ID,
|
||||
Namespace: job1.Namespace,
|
||||
Summary: make(map[string]structs.TaskGroupSummary),
|
||||
Children: &structs.JobChildrenSummary{
|
||||
Running: 1,
|
||||
},
|
||||
CreateIndex: 100,
|
||||
ModifyIndex: 120,
|
||||
}
|
||||
require.Equal(&expectedSummary, summary)
|
||||
|
||||
// Verify child job summary is also correct
|
||||
childSummary, _ := state.JobSummaryByID(ws, childJob.Namespace, childJob.ID)
|
||||
expectedChildSummary := structs.JobSummary{
|
||||
JobID: childJob.ID,
|
||||
Namespace: childJob.Namespace,
|
||||
Summary: map[string]structs.TaskGroupSummary{
|
||||
"web": {
|
||||
Running: 1,
|
||||
Failed: 1,
|
||||
},
|
||||
},
|
||||
CreateIndex: 110,
|
||||
ModifyIndex: 120,
|
||||
}
|
||||
require.Equal(&expectedChildSummary, childSummary)
|
||||
}
|
||||
|
||||
func TestStateStore_UpdateAlloc_JobNotPresent(t *testing.T) {
|
||||
state := testStateStore(t)
|
||||
|
||||
|
|
Loading…
Reference in New Issue