// Source: open-nomad/e2e/scheduler_sysbatch/sysbatch.go
// Snapshot: 2023-04-10 15:36:59 +00:00
// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0
package scheduler_sysbatch
import (
"strings"
"time"
"github.com/hashicorp/nomad/api"
"github.com/hashicorp/nomad/e2e/e2eutil"
"github.com/hashicorp/nomad/e2e/framework"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// SysBatchSchedulerTest is the e2e test case collection for the sysbatch
// scheduler. jobIDs records every job registered during a test so that
// AfterEach can deregister them all.
type SysBatchSchedulerTest struct {
framework.TC
// jobIDs accumulates the IDs of jobs submitted by the test cases.
jobIDs []string
}
// init registers this test suite with the e2e framework.
func init() {
	suite := &framework.TestSuite{
		Component:   "SysBatchScheduler",
		CanRunLocal: true,
		Cases: []framework.TestCase{
			new(SysBatchSchedulerTest),
		},
	}
	framework.AddSuites(suite)
}
// BeforeAll gates the suite on cluster readiness: a leader must be elected
// and at least four client nodes must be ready before any case runs.
func (tc *SysBatchSchedulerTest) BeforeAll(f *framework.F) {
	t := f.T()
	client := tc.Nomad()
	e2eutil.WaitForLeader(t, client)
	e2eutil.WaitForNodesReady(t, client, 4)
}
// TestJobRunBasic submits a fast sysbatch job, verifies it is placed on
// every eligible linux client, and waits for all allocations to complete.
func (tc *SysBatchSchedulerTest) TestJobRunBasic(f *framework.F) {
	t := f.T()
	nomadClient := tc.Nomad()

	// submit a fast sysbatch job
	jobID := "sysbatch_run_basic"
	tc.jobIDs = append(tc.jobIDs, jobID)
	e2eutil.RegisterAndWaitForAllocs(t, nomadClient, "scheduler_sysbatch/input/sysbatch_job_fast.nomad", jobID, "")

	// get our allocations for this sysbatch job
	jobs := nomadClient.Jobs()
	allocs, _, err := jobs.Allocations(jobID, true, nil)
	require.NoError(t, err)

	// make sure this job is being run on "all" the linux clients;
	// GreaterOrEqual reports the actual count on failure, unlike True
	require.GreaterOrEqual(t, len(allocs), 3)

	// wait for every alloc to reach completion
	allocIDs := e2eutil.AllocIDsFromAllocationListStubs(allocs)
	e2eutil.WaitForAllocsStatus(t, nomadClient, allocIDs, structs.AllocClientStatusComplete)
}
// TestJobStopEarly submits a slow sysbatch job, waits for its allocations
// to start running on every eligible linux client, and then deregisters
// the job before the allocations can complete.
func (tc *SysBatchSchedulerTest) TestJobStopEarly(f *framework.F) {
	t := f.T()
	nomadClient := tc.Nomad()

	// submit a slow sysbatch job
	jobID := "sysbatch_stop_early"
	tc.jobIDs = append(tc.jobIDs, jobID)
	e2eutil.RegisterAndWaitForAllocs(t, nomadClient, "scheduler_sysbatch/input/sysbatch_job_slow.nomad", jobID, "")

	// get our allocations for this sysbatch job
	jobs := nomadClient.Jobs()
	allocs, _, err := jobs.Allocations(jobID, true, nil)
	require.NoError(t, err)

	// make sure this job is being run on "all" the linux clients;
	// GreaterOrEqual reports the actual count on failure, unlike True
	require.GreaterOrEqual(t, len(allocs), 3)

	// wait for every alloc to reach running status
	allocIDs := e2eutil.AllocIDsFromAllocationListStubs(allocs)
	e2eutil.WaitForAllocsStatus(t, nomadClient, allocIDs, structs.AllocClientStatusRunning)

	// stop the job before allocs reach completion (non-purge deregister)
	_, _, err = jobs.Deregister(jobID, false, nil)
	require.NoError(t, err)
}
// TestJobReplaceRunning registers a slow sysbatch job, waits until its
// allocations are running everywhere, then re-registers the same job ID
// with the fast job spec and verifies the replacement (version 1) allocs
// are placed on every client and run to completion.
func (tc *SysBatchSchedulerTest) TestJobReplaceRunning(f *framework.F) {
	t := f.T()
	nomadClient := tc.Nomad()

	// submit a slow sysbatch job
	jobID := "sysbatch_replace_running"
	tc.jobIDs = append(tc.jobIDs, jobID)
	e2eutil.RegisterAndWaitForAllocs(t, nomadClient, "scheduler_sysbatch/input/sysbatch_job_slow.nomad", jobID, "")

	// get our allocations for this sysbatch job
	jobs := nomadClient.Jobs()
	allocs, _, err := jobs.Allocations(jobID, true, nil)
	require.NoError(t, err)

	// make sure this job is being run on "all" the linux clients;
	// GreaterOrEqual reports the actual count on failure, unlike True
	require.GreaterOrEqual(t, len(allocs), 3)

	// wait for every alloc to reach running status
	allocIDs := e2eutil.AllocIDsFromAllocationListStubs(allocs)
	e2eutil.WaitForAllocsStatus(t, nomadClient, allocIDs, structs.AllocClientStatusRunning)

	// replace the slow job with the fast job under the same job ID
	intermediate := e2eutil.RegisterAndWaitForAllocs(t, nomadClient, "scheduler_sysbatch/input/sysbatch_job_fast.nomad", jobID, "")

	// keep only the allocs belonging to the replacement (version 1) job
	updated := make([]*api.AllocationListStub, 0, len(intermediate))
	for _, alloc := range intermediate {
		if alloc.JobVersion == 1 {
			updated = append(updated, alloc)
		}
	}

	// make sure the new job is also being run on "all" the linux clients
	require.GreaterOrEqual(t, len(updated), 3)

	// wait for the allocs of the fast job to complete
	newAllocIDs := e2eutil.AllocIDsFromAllocationListStubs(updated)
	e2eutil.WaitForAllocsStatus(t, nomadClient, newAllocIDs, structs.AllocClientStatusComplete)
}
// TestJobReplaceDead registers a fast sysbatch job, waits until all of its
// allocations have completed (the job is dead), then re-registers the same
// job ID with the slow job spec and verifies the replacement (version 1)
// allocs are placed on every client and reach running status.
func (tc *SysBatchSchedulerTest) TestJobReplaceDead(f *framework.F) {
	t := f.T()
	nomadClient := tc.Nomad()

	// submit a fast sysbatch job
	jobID := "sysbatch_replace_dead"
	tc.jobIDs = append(tc.jobIDs, jobID)
	e2eutil.RegisterAndWaitForAllocs(t, nomadClient, "scheduler_sysbatch/input/sysbatch_job_fast.nomad", jobID, "")

	// get the allocations for this sysbatch job
	jobs := nomadClient.Jobs()
	allocs, _, err := jobs.Allocations(jobID, true, nil)
	require.NoError(t, err)

	// make sure this job is being run on "all" the linux clients;
	// GreaterOrEqual reports the actual count on failure, unlike True
	require.GreaterOrEqual(t, len(allocs), 3)

	// wait for every alloc to reach complete status
	allocIDs := e2eutil.AllocIDsFromAllocationListStubs(allocs)
	e2eutil.WaitForAllocsStatus(t, nomadClient, allocIDs, structs.AllocClientStatusComplete)

	// replace the fast job with the slow job under the same job ID
	intermediate := e2eutil.RegisterAndWaitForAllocs(t, nomadClient, "scheduler_sysbatch/input/sysbatch_job_slow.nomad", jobID, "")

	// keep only the allocs belonging to the replacement (version 1) job
	updated := make([]*api.AllocationListStub, 0, len(intermediate))
	for _, alloc := range intermediate {
		if alloc.JobVersion == 1 {
			updated = append(updated, alloc)
		}
	}

	// make sure the new job is also being run on "all" the linux clients
	require.GreaterOrEqual(t, len(updated), 3)

	// wait for the allocs of the slow job to be running
	upAllocIDs := e2eutil.AllocIDsFromAllocationListStubs(updated)
	e2eutil.WaitForAllocsStatus(t, nomadClient, upAllocIDs, structs.AllocClientStatusRunning)
}
// TestJobRunPeriodic registers a periodic sysbatch job, forces a launch,
// locates the spawned child job, and verifies the child's allocations are
// placed on every eligible client and run to completion.
func (tc *SysBatchSchedulerTest) TestJobRunPeriodic(f *framework.F) {
	t := f.T()
	nomadClient := tc.Nomad()

	// submit a periodic sysbatch job
	jobID := "sysbatch_job_periodic"
	tc.jobIDs = append(tc.jobIDs, jobID)
	err := e2eutil.Register(jobID, "scheduler_sysbatch/input/sysbatch_periodic.nomad")
	require.NoError(t, err)

	// force the cron job to run
	jobs := nomadClient.Jobs()
	_, _, err = jobs.PeriodicForce(jobID, nil)
	require.NoError(t, err)

	// find the child job that the forced launch created
	jobsList, _, err := jobs.List(nil)
	require.NoError(t, err)
	cronJobID := ""
	for _, job := range jobsList {
		if strings.HasPrefix(job.Name, "sysbatch_job_periodic/periodic-") {
			cronJobID = job.Name
			break
		}
	}
	require.NotEmpty(t, cronJobID)
	tc.jobIDs = append(tc.jobIDs, cronJobID)

	// wait for allocs of the cron job; require.Eventually replaces the
	// redundant require.True(t, assert.Eventually(t, ...)) double-wrap
	var allocs []*api.AllocationListStub
	require.Eventually(t, func() bool {
		var err error
		allocs, _, err = jobs.Allocations(cronJobID, false, nil)
		require.NoError(t, err)
		return len(allocs) >= 3
	}, 30*time.Second, time.Second)

	// wait for every cron job alloc to reach completion
	allocIDs := e2eutil.AllocIDsFromAllocationListStubs(allocs)
	e2eutil.WaitForAllocsStatus(t, nomadClient, allocIDs, structs.AllocClientStatusComplete)
}
// TestJobRunDispatch registers a parameterized sysbatch job, dispatches it
// with a meta payload, and verifies the dispatched child's allocations are
// placed on every eligible client and run to completion.
func (tc *SysBatchSchedulerTest) TestJobRunDispatch(f *framework.F) {
	t := f.T()
	nomadClient := tc.Nomad()

	// submit a fast sysbatch dispatch job
	jobID := "sysbatch_job_dispatch"
	tc.jobIDs = append(tc.jobIDs, jobID)
	err := e2eutil.Register(jobID, "scheduler_sysbatch/input/sysbatch_dispatch.nomad")
	require.NoError(t, err)

	// dispatch the sysbatch job with a meta key/value
	jobs := nomadClient.Jobs()
	result, _, err := jobs.Dispatch(jobID, map[string]string{
		"KEY": "value",
	}, nil, "", nil)
	require.NoError(t, err)

	// grab the new dispatched jobID so AfterEach can clean it up too
	dispatchID := result.DispatchedJobID
	tc.jobIDs = append(tc.jobIDs, dispatchID)

	// wait for allocs of the dispatched job; require.Eventually replaces
	// the redundant require.True(t, assert.Eventually(t, ...)) double-wrap
	var allocs []*api.AllocationListStub
	require.Eventually(t, func() bool {
		var err error
		allocs, _, err = jobs.Allocations(dispatchID, false, nil)
		require.NoError(t, err)
		return len(allocs) >= 3
	}, 30*time.Second, time.Second)

	// wait for every dispatch alloc to reach completion
	allocIDs := e2eutil.AllocIDsFromAllocationListStubs(allocs)
	e2eutil.WaitForAllocsStatus(t, nomadClient, allocIDs, structs.AllocClientStatusComplete)
}
// AfterEach restores cluster state after every test case: re-marks all
// nodes as eligible, purges every job the case registered, resets the
// tracked job ID list, and triggers a garbage collection. Cleanup is
// best-effort, so errors are deliberately ignored.
func (tc *SysBatchSchedulerTest) AfterEach(f *framework.F) {
	nomadClient := tc.Nomad()

	// Mark all nodes eligible (reuse the single client rather than
	// calling tc.Nomad() a second time)
	nodesAPI := nomadClient.Nodes()
	nodes, _, _ := nodesAPI.List(nil)
	for _, node := range nodes {
		_, _ = nodesAPI.ToggleEligibility(node.ID, true, nil)
	}

	// Stop (purge) all jobs registered by the test
	jobs := nomadClient.Jobs()
	for _, id := range tc.jobIDs {
		_, _, _ = jobs.Deregister(id, true, nil)
	}
	// nil reset is the idiomatic empty slice in Go
	tc.jobIDs = nil

	// Garbage collect
	_ = nomadClient.System().GarbageCollect()
}