d9d4da1e9f
Processing an evaluation is nearly a pure function over the state snapshot, but we randomly shuffle the nodes. This means that developers can't take a given state snapshot and pass an evaluation through it and be guaranteed the same plan results. But the evaluation ID is already random, so if we use this as the seed for shuffling the nodes we can greatly reduce the sources of non-determinism. Unfortunately golang map iteration uses a global source of randomness and not a goroutine-local one, but arguably if the scheduler behavior is impacted by this, that's a bug in the iteration.
136 lines
5.5 KiB
Go
136 lines
5.5 KiB
Go
package scheduler
|
|
|
|
import (
|
|
"fmt"
|
|
|
|
log "github.com/hashicorp/go-hclog"
|
|
|
|
memdb "github.com/hashicorp/go-memdb"
|
|
"github.com/hashicorp/nomad/nomad/state"
|
|
"github.com/hashicorp/nomad/nomad/structs"
|
|
)
|
|
|
|
const (
	// SchedulerVersion is the version of the scheduler. Changes to the
	// scheduler that are incompatible with prior schedulers will increment
	// this version. It is used to disallow dequeueing when the versions do
	// not match across the leader and the dequeueing scheduler.
	SchedulerVersion uint16 = 1
)
|
|
|
|
// BuiltinSchedulers contains the built in registered schedulers
|
|
// which are available
|
|
var BuiltinSchedulers = map[string]Factory{
|
|
"service": NewServiceScheduler,
|
|
"batch": NewBatchScheduler,
|
|
"system": NewSystemScheduler,
|
|
"sysbatch": NewSysBatchScheduler,
|
|
}
|
|
|
|
// NewScheduler is used to instantiate and return a new scheduler
|
|
// given the scheduler name, initial state, and planner.
|
|
func NewScheduler(name string, logger log.Logger, eventsCh chan<- interface{}, state State, planner Planner) (Scheduler, error) {
|
|
// Lookup the factory function
|
|
factory, ok := BuiltinSchedulers[name]
|
|
if !ok {
|
|
return nil, fmt.Errorf("unknown scheduler '%s'", name)
|
|
}
|
|
|
|
// Instantiate the scheduler
|
|
sched := factory(logger, eventsCh, state, planner)
|
|
return sched, nil
|
|
}
|
|
|
|
// Factory is used to instantiate a new Scheduler
|
|
type Factory func(log.Logger, chan<- interface{}, State, Planner) Scheduler
|
|
|
|
// Scheduler is the top level instance for a scheduler. A scheduler is
|
|
// meant to only encapsulate business logic, pushing the various plumbing
|
|
// into Nomad itself. They are invoked to process a single evaluation at
|
|
// a time. The evaluation may result in task allocations which are computed
|
|
// optimistically, as there are many concurrent evaluations being processed.
|
|
// The task allocations are submitted as a plan, and the current leader will
|
|
// coordinate the commits to prevent oversubscription or improper allocations
|
|
// based on stale state.
|
|
type Scheduler interface {
|
|
// Process is used to handle a new evaluation. The scheduler is free to
|
|
// apply any logic necessary to make the task placements. The state and
|
|
// planner will be provided prior to any invocations of process.
|
|
Process(*structs.Evaluation) error
|
|
}
|
|
|
|
// State is an immutable view of the global state. This allows schedulers
|
|
// to make intelligent decisions based on allocations of other schedulers
|
|
// and to enforce complex constraints that require more information than
|
|
// is available to a local state scheduler.
|
|
type State interface {
|
|
// Config returns the configuration of the state store
|
|
Config() *state.StateStoreConfig
|
|
|
|
// Nodes returns an iterator over all the nodes.
|
|
// The type of each result is *structs.Node
|
|
Nodes(ws memdb.WatchSet) (memdb.ResultIterator, error)
|
|
|
|
// AllocsByJob returns the allocations by JobID
|
|
AllocsByJob(ws memdb.WatchSet, namespace, jobID string, all bool) ([]*structs.Allocation, error)
|
|
|
|
// AllocsByNode returns all the allocations by node
|
|
AllocsByNode(ws memdb.WatchSet, node string) ([]*structs.Allocation, error)
|
|
|
|
// AllocByID returns the allocation
|
|
AllocByID(ws memdb.WatchSet, allocID string) (*structs.Allocation, error)
|
|
|
|
// AllocsByNodeTerminal returns all the allocations by node filtering by terminal status
|
|
AllocsByNodeTerminal(ws memdb.WatchSet, node string, terminal bool) ([]*structs.Allocation, error)
|
|
|
|
// GetNodeByID is used to lookup a node by ID
|
|
NodeByID(ws memdb.WatchSet, nodeID string) (*structs.Node, error)
|
|
|
|
// GetJobByID is used to lookup a job by ID
|
|
JobByID(ws memdb.WatchSet, namespace, id string) (*structs.Job, error)
|
|
|
|
// DeploymentsByJobID returns the deployments associated with the job
|
|
DeploymentsByJobID(ws memdb.WatchSet, namespace, jobID string, all bool) ([]*structs.Deployment, error)
|
|
|
|
// JobByIDAndVersion returns the job associated with id and specific version
|
|
JobByIDAndVersion(ws memdb.WatchSet, namespace, id string, version uint64) (*structs.Job, error)
|
|
|
|
// LatestDeploymentByJobID returns the latest deployment matching the given
|
|
// job ID
|
|
LatestDeploymentByJobID(ws memdb.WatchSet, namespace, jobID string) (*structs.Deployment, error)
|
|
|
|
// SchedulerConfig returns config options for the scheduler
|
|
SchedulerConfig() (uint64, *structs.SchedulerConfiguration, error)
|
|
|
|
// CSIVolumeByID fetch CSI volumes, containing controller jobs
|
|
CSIVolumeByID(memdb.WatchSet, string, string) (*structs.CSIVolume, error)
|
|
|
|
// CSIVolumeByID fetch CSI volumes, containing controller jobs
|
|
CSIVolumesByNodeID(memdb.WatchSet, string, string) (memdb.ResultIterator, error)
|
|
|
|
// LatestIndex returns the greatest index value for all indexes.
|
|
LatestIndex() (uint64, error)
|
|
}
|
|
|
|
// Planner interface is used to submit a task allocation plan.
|
|
type Planner interface {
|
|
// SubmitPlan is used to submit a plan for consideration.
|
|
// This will return a PlanResult or an error. It is possible
|
|
// that this will result in a state refresh as well.
|
|
SubmitPlan(*structs.Plan) (*structs.PlanResult, State, error)
|
|
|
|
// UpdateEval is used to update an evaluation. This should update
|
|
// a copy of the input evaluation since that should be immutable.
|
|
UpdateEval(*structs.Evaluation) error
|
|
|
|
// CreateEval is used to create an evaluation. This should set the
|
|
// PreviousEval to that of the current evaluation.
|
|
CreateEval(*structs.Evaluation) error
|
|
|
|
// ReblockEval takes a blocked evaluation and re-inserts it into the blocked
|
|
// evaluation tracker. This update occurs only in-memory on the leader. The
|
|
// evaluation must exist in a blocked state prior to this being called such
|
|
// that on leader changes, the evaluation will be reblocked properly.
|
|
ReblockEval(*structs.Evaluation) error
|
|
}
|