This PR implements a new "System Batch" scheduler type. Jobs can make use of this new scheduler by setting their type to 'sysbatch'. As the name implies, sysbatch can be thought of as a hybrid between system and batch jobs: it is for running short-lived jobs intended to run on every compatible node in the cluster. As with batch jobs, sysbatch jobs can also be periodic and/or parameterized dispatch jobs. A sysbatch job is considered complete once it has run to a terminal state (success, or failure after exhausting retries) on every compatible node. Feasibility and preemption are governed the same way as for system jobs.

In this PR, the update stanza is not yet supported. The update stanza is still limited in functionality for the underlying system scheduler, and is not yet useful for sysbatch jobs. Further work in #4740 will improve support for the update stanza and deployments.

Closes #2527
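
For illustration, a minimal sysbatch job spec might look like the sketch below. The job name, datacenter, driver, and command are placeholders chosen for this example and are not values or files from this PR:

job "sweep-tmp" {
  datacenters = ["dc1"]

  # run each task group once to completion on every compatible node
  type = "sysbatch"

  # as with batch jobs, periodic and parameterized (dispatch) stanzas can
  # also be used, e.g.:
  #   periodic { cron = "@daily" }

  group "sweep" {
    task "sweep" {
      driver = "raw_exec"

      config {
        command = "/bin/sh"
        args    = ["-c", "rm -rf /tmp/scratch/*"]
      }
    }
  }
}

The e2e suite that follows registers fast and slow variants of jobs like this from scheduler_sysbatch/input/ and checks the resulting scheduling behavior on every Linux client.
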
package scheduler_sysbatch

import (
	"strings"
	"time"

	"github.com/hashicorp/nomad/api"
	"github.com/hashicorp/nomad/e2e/e2eutil"
	"github.com/hashicorp/nomad/e2e/framework"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

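// SysBatchSchedulerTest is the e2e test suite exercising the new sysbatch scheduler.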
type SysBatchSchedulerTest struct {
	framework.TC
	jobIDs []string
}

func init() {
	framework.AddSuites(&framework.TestSuite{
		Component:   "SysBatchScheduler",
		CanRunLocal: true,
		Cases: []framework.TestCase{
			new(SysBatchSchedulerTest),
		},
	})
}

func (tc *SysBatchSchedulerTest) BeforeAll(f *framework.F) {
	// Ensure cluster has leader before running tests
	e2eutil.WaitForLeader(f.T(), tc.Nomad())
	e2eutil.WaitForNodesReady(f.T(), tc.Nomad(), 4)
}

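// TestJobRunBasic runs a fast sysbatch job and expects an allocation on every
// Linux client, all of which should reach the complete state.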
func (tc *SysBatchSchedulerTest) TestJobRunBasic(f *framework.F) {
	t := f.T()
	nomadClient := tc.Nomad()

	// submit a fast sysbatch job
	jobID := "sysbatch_run_basic"
	tc.jobIDs = append(tc.jobIDs, jobID)
	e2eutil.RegisterAndWaitForAllocs(t, nomadClient, "scheduler_sysbatch/input/sysbatch_job_fast.nomad", jobID, "")

	// get our allocations for this sysbatch job
	jobs := nomadClient.Jobs()
	allocs, _, err := jobs.Allocations(jobID, true, nil)
	require.NoError(t, err)

	// make sure this job is being run on "all" the linux clients
	require.True(t, len(allocs) >= 3)

	// wait for every alloc to reach completion
	allocIDs := e2eutil.AllocIDsFromAllocationListStubs(allocs)
	e2eutil.WaitForAllocsStatus(t, nomadClient, allocIDs, structs.AllocClientStatusComplete)
}

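// TestJobStopEarly runs a slow sysbatch job and stops it while its allocations
// are still running.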
func (tc *SysBatchSchedulerTest) TestJobStopEarly(f *framework.F) {
	t := f.T()
	nomadClient := tc.Nomad()

	// submit a slow sysbatch job
	jobID := "sysbatch_stop_early"
	tc.jobIDs = append(tc.jobIDs, jobID)
	e2eutil.RegisterAndWaitForAllocs(t, nomadClient, "scheduler_sysbatch/input/sysbatch_job_slow.nomad", jobID, "")

	// get our allocations for this sysbatch job
	jobs := nomadClient.Jobs()
	allocs, _, err := jobs.Allocations(jobID, true, nil)
	require.NoError(t, err)

	// make sure this job is being run on "all" the linux clients
	require.True(t, len(allocs) >= 3)

	// wait for every alloc to reach running status
	allocIDs := e2eutil.AllocIDsFromAllocationListStubs(allocs)
	e2eutil.WaitForAllocsStatus(t, nomadClient, allocIDs, structs.AllocClientStatusRunning)

	// stop the job before allocs reach completion
	_, _, err = jobs.Deregister(jobID, false, nil)
	require.NoError(t, err)
}

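// TestJobReplaceRunning registers a slow sysbatch job, replaces it with the fast
// variant while the original allocations are still running, and waits for the new
// (version 1) allocations to complete.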
func (tc *SysBatchSchedulerTest) TestJobReplaceRunning(f *framework.F) {
	t := f.T()
	nomadClient := tc.Nomad()

	// submit a slow sysbatch job
	jobID := "sysbatch_replace_running"
	tc.jobIDs = append(tc.jobIDs, jobID)
	e2eutil.RegisterAndWaitForAllocs(t, nomadClient, "scheduler_sysbatch/input/sysbatch_job_slow.nomad", jobID, "")

	// get our allocations for this sysbatch job
	jobs := nomadClient.Jobs()
	allocs, _, err := jobs.Allocations(jobID, true, nil)
	require.NoError(t, err)

	// make sure this job is being run on "all" the linux clients
	require.True(t, len(allocs) >= 3)

	// wait for every alloc to reach running status
	allocIDs := e2eutil.AllocIDsFromAllocationListStubs(allocs)
	e2eutil.WaitForAllocsStatus(t, nomadClient, allocIDs, structs.AllocClientStatusRunning)

	// replace the slow job with the fast job
	intermediate := e2eutil.RegisterAndWaitForAllocs(t, nomadClient, "scheduler_sysbatch/input/sysbatch_job_fast.nomad", jobID, "")

	// get the allocs for the newly updated job
	var updated []*api.AllocationListStub
	for _, alloc := range intermediate {
		if alloc.JobVersion == 1 {
			updated = append(updated, alloc)
		}
	}

	// there should be an equal number of old and new allocs
	newAllocIDs := e2eutil.AllocIDsFromAllocationListStubs(updated)

	// make sure this new job is being run on "all" the linux clients
	require.True(t, len(updated) >= 3)

	// wait for the allocs of the fast job to complete
	e2eutil.WaitForAllocsStatus(t, nomadClient, newAllocIDs, structs.AllocClientStatusComplete)
}

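// TestJobReplaceDead registers a fast sysbatch job, waits for it to complete, then
// replaces it with the slow variant and waits for the new (version 1) allocations
// to start running.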
func (tc *SysBatchSchedulerTest) TestJobReplaceDead(f *framework.F) {
	t := f.T()
	nomadClient := tc.Nomad()

	// submit a fast sysbatch job
	jobID := "sysbatch_replace_dead"
	tc.jobIDs = append(tc.jobIDs, jobID)
	e2eutil.RegisterAndWaitForAllocs(t, nomadClient, "scheduler_sysbatch/input/sysbatch_job_fast.nomad", jobID, "")

	// get the allocations for this sysbatch job
	jobs := nomadClient.Jobs()
	allocs, _, err := jobs.Allocations(jobID, true, nil)
	require.NoError(t, err)

	// make sure this job is being run on "all" the linux clients
	require.True(t, len(allocs) >= 3)

	// wait for every alloc to reach complete status
	allocIDs := e2eutil.AllocIDsFromAllocationListStubs(allocs)
	e2eutil.WaitForAllocsStatus(t, nomadClient, allocIDs, structs.AllocClientStatusComplete)

	// replace the fast job with the slow job
	intermediate := e2eutil.RegisterAndWaitForAllocs(t, nomadClient, "scheduler_sysbatch/input/sysbatch_job_slow.nomad", jobID, "")

	// get the allocs for the newly updated job
	var updated []*api.AllocationListStub
	for _, alloc := range intermediate {
		if alloc.JobVersion == 1 {
			updated = append(updated, alloc)
		}
	}

	// there should be an equal number of old and new allocs
	upAllocIDs := e2eutil.AllocIDsFromAllocationListStubs(updated)

	// make sure this new job is being run on "all" the linux clients
	require.True(t, len(updated) >= 3)

	// wait for the allocs of the slow job to be running
	e2eutil.WaitForAllocsStatus(t, nomadClient, upAllocIDs, structs.AllocClientStatusRunning)
}

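// TestJobRunPeriodic registers a periodic sysbatch job, forces a launch, and waits
// for the launched instance's allocations to complete on every Linux client.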
func (tc *SysBatchSchedulerTest) TestJobRunPeriodic(f *framework.F) {
	t := f.T()
	nomadClient := tc.Nomad()

	// submit a fast sysbatch job
	jobID := "sysbatch_job_periodic"
	tc.jobIDs = append(tc.jobIDs, jobID)
	err := e2eutil.Register(jobID, "scheduler_sysbatch/input/sysbatch_periodic.nomad")
	require.NoError(t, err)

	// force the cron job to run
	jobs := nomadClient.Jobs()
	_, _, err = jobs.PeriodicForce(jobID, nil)
	require.NoError(t, err)

	// find the cron job that got launched
	jobsList, _, err := jobs.List(nil)
	require.NoError(t, err)
	cronJobID := ""
	for _, job := range jobsList {
		if strings.HasPrefix(job.Name, "sysbatch_job_periodic/periodic-") {
			cronJobID = job.Name
			break
		}
	}
	require.NotEmpty(t, cronJobID)
	tc.jobIDs = append(tc.jobIDs, cronJobID)

	// wait for allocs of the cron job
	var allocs []*api.AllocationListStub
	require.True(t, assert.Eventually(t, func() bool {
		var err error
		allocs, _, err = jobs.Allocations(cronJobID, false, nil)
		require.NoError(t, err)
		return len(allocs) >= 3
	}, 30*time.Second, time.Second))

	// wait for every cron job alloc to reach completion
	allocIDs := e2eutil.AllocIDsFromAllocationListStubs(allocs)
	e2eutil.WaitForAllocsStatus(t, nomadClient, allocIDs, structs.AllocClientStatusComplete)
}

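// TestJobRunDispatch registers a parameterized sysbatch job, dispatches it with
// metadata, and waits for the dispatched instance's allocations to complete.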
func (tc *SysBatchSchedulerTest) TestJobRunDispatch(f *framework.F) {
	t := f.T()
	nomadClient := tc.Nomad()

	// submit a fast sysbatch dispatch job
	jobID := "sysbatch_job_dispatch"
	tc.jobIDs = append(tc.jobIDs, jobID)
	err := e2eutil.Register(jobID, "scheduler_sysbatch/input/sysbatch_dispatch.nomad")
	require.NoError(t, err)

	// dispatch the sysbatch job
	jobs := nomadClient.Jobs()
	result, _, err := jobs.Dispatch(jobID, map[string]string{
		"KEY": "value",
	}, nil, nil)
	require.NoError(t, err)

	// grab the new dispatched jobID
	dispatchID := result.DispatchedJobID
	tc.jobIDs = append(tc.jobIDs, dispatchID)

	// wait for allocs of the dispatched job
	var allocs []*api.AllocationListStub
	require.True(t, assert.Eventually(t, func() bool {
		var err error
		allocs, _, err = jobs.Allocations(dispatchID, false, nil)
		require.NoError(t, err)
		return len(allocs) >= 3
	}, 30*time.Second, time.Second))

	// wait for every dispatch alloc to reach completion
	allocIDs := e2eutil.AllocIDsFromAllocationListStubs(allocs)
	e2eutil.WaitForAllocsStatus(t, nomadClient, allocIDs, structs.AllocClientStatusComplete)
}

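// AfterEach restores node eligibility, deregisters every job created by the test,
// and triggers a server garbage collection.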
func (tc *SysBatchSchedulerTest) AfterEach(f *framework.F) {
	nomadClient := tc.Nomad()

	// Mark all nodes eligible
	nodesAPI := tc.Nomad().Nodes()
	nodes, _, _ := nodesAPI.List(nil)
	for _, node := range nodes {
		_, _ = nodesAPI.ToggleEligibility(node.ID, true, nil)
	}

	jobs := nomadClient.Jobs()

	// Stop all jobs in test
	for _, id := range tc.jobIDs {
		_, _, _ = jobs.Deregister(id, true, nil)
	}
	tc.jobIDs = []string{}

	// Garbage collect
	_ = nomadClient.System().GarbageCollect()
}