// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0

package nomad

import (
	"sync"
	"time"

	"github.com/armon/go-metrics"
	"github.com/hashicorp/go-hclog"

	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// unblockBuffer is the buffer size for the unblock channel. The buffer
	// should be large to ensure that the FSM doesn't block when calling Unblock
	// as this would apply back-pressure on Raft.
	unblockBuffer = 8096

	// pruneInterval is the interval at which we prune objects from the
	// BlockedEvals tracker
	pruneInterval = 5 * time.Minute

	// pruneThreshold is the threshold after which objects will be pruned.
	pruneThreshold = 15 * time.Minute
)

// BlockedEvals is used to track evaluations that shouldn't be queued until a
// certain class of nodes becomes available. An evaluation is put into the
// blocked state when it is run through the scheduler and produces failed
// allocations. It is unblocked when the capacity of a node that could run the
// failed allocation becomes available.
type BlockedEvals struct {
	// logger is the logger to use by the blocked eval tracker.
	logger hclog.Logger

	evalBroker *EvalBroker
	enabled    bool
	stats      *BlockedStats
	l          sync.RWMutex

	// captured is the set of evaluations that are captured by computed node
	// classes.
	captured map[string]wrappedEval

	// escaped is the set of evaluations that have escaped computed node
	// classes.
	escaped map[string]wrappedEval

	// system is the set of system evaluations that failed to start on nodes
	// because of resource constraints.
	system *systemEvals

	// capacityChangeCh is used to buffer unblocking of evaluations.
	capacityChangeCh chan *capacityUpdate

	// jobs is the map of blocked jobs and is used to ensure that only one
	// blocked eval exists for each job. The value is the blocked evaluation ID.
	jobs map[structs.NamespacedID]string

	// unblockIndexes maps computed node classes or quota names to the index at
	// which they were unblocked. This is used to check if an evaluation could
	// have been unblocked between the time it was in the scheduler and the
	// time it is being blocked.
	unblockIndexes map[string]uint64

	// duplicates is the set of evaluations for jobs that had pre-existing
	// blocked evaluations. These should be marked as cancelled since only one
	// blocked eval is needed per job.
	duplicates []*structs.Evaluation

	// duplicateCh is used to signal that a duplicate eval was added to the
	// duplicate set. It can be used to unblock waiting callers looking for
	// duplicates.
	duplicateCh chan struct{}

	// timetable is used to correlate indexes with their insertion time. This
	// allows us to prune based on time.
	timetable *TimeTable

	// stopCh is used to stop any created goroutines.
	stopCh chan struct{}
}

// capacityUpdate stores unblock data.
type capacityUpdate struct {
	computedClass string
	quotaChange   string
	index         uint64
}

// wrappedEval captures both the evaluation and the optional token.
type wrappedEval struct {
	eval  *structs.Evaluation
	token string
}

// NewBlockedEvals creates a new blocked eval tracker that will enqueue
// unblocked evals into the passed broker.
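//
// A minimal lifecycle sketch (hypothetical caller; broker, logger, eval, and
// the class name are placeholders, not part of this file):
//
//	blocked := NewBlockedEvals(broker, logger)
//	blocked.SetEnabled(true)          // only on the active leader
//	blocked.Block(eval)               // track an eval that failed to place
//	blocked.Unblock("class-a", index) // capacity changed, re-enqueue matches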
func NewBlockedEvals(evalBroker *EvalBroker, logger hclog.Logger) *BlockedEvals {
	return &BlockedEvals{
		logger:           logger.Named("blocked_evals"),
		evalBroker:       evalBroker,
		captured:         make(map[string]wrappedEval),
		escaped:          make(map[string]wrappedEval),
		system:           newSystemEvals(),
		jobs:             make(map[structs.NamespacedID]string),
		unblockIndexes:   make(map[string]uint64),
		capacityChangeCh: make(chan *capacityUpdate, unblockBuffer),
		duplicateCh:      make(chan struct{}, 1),
		stopCh:           make(chan struct{}),
		stats:            NewBlockedStats(),
	}
}

// Enabled is used to check if the broker is enabled.
func (b *BlockedEvals) Enabled() bool {
	b.l.RLock()
	defer b.l.RUnlock()
	return b.enabled
}

// SetEnabled is used to control if the blocked eval tracker is enabled. The
// tracker should only be enabled on the active leader.
func (b *BlockedEvals) SetEnabled(enabled bool) {
	b.l.Lock()
	if b.enabled == enabled {
		// No-op
		b.l.Unlock()
		return
	} else if enabled {
		go b.watchCapacity(b.stopCh, b.capacityChangeCh)
		go b.prune(b.stopCh)
	} else {
		close(b.stopCh)
	}
	b.enabled = enabled
	b.l.Unlock()
	if !enabled {
		b.Flush()
	}
}

func (b *BlockedEvals) SetTimetable(timetable *TimeTable) {
	b.l.Lock()
	b.timetable = timetable
	b.l.Unlock()
}

// Block tracks the passed evaluation and enqueues it into the eval broker when
// a suitable node calls unblock.
func (b *BlockedEvals) Block(eval *structs.Evaluation) {
	b.processBlock(eval, "")
}

// Reblock tracks the passed evaluation and enqueues it into the eval broker
// when a suitable node calls unblock. Reblock should be used over Block when
// the blocking is caused by an outstanding evaluation. The token is the
// evaluation's token.
func (b *BlockedEvals) Reblock(eval *structs.Evaluation, token string) {
	b.processBlock(eval, token)
}

// processBlock is the implementation of blocking an evaluation. It supports
// taking an optional evaluation token to use when reblocking an evaluation
// that may be outstanding.
func (b *BlockedEvals) processBlock(eval *structs.Evaluation, token string) {
	b.l.Lock()
	defer b.l.Unlock()

	// Do nothing if not enabled
	if !b.enabled {
		return
	}

	// Handle the new evaluation being for a job we are already tracking.
	if b.processBlockJobDuplicate(eval) {
		// If process block job duplicate returns true, the new evaluation has
		// been marked as a duplicate and we have nothing to do, so return
		// early.
		return
	}

	// Check if the eval missed an unblock while it was in the scheduler at an
	// older index. The scheduler could have been invoked with a snapshot of
	// state that was prior to additional capacity being added or allocations
	// becoming terminal.
	if b.missedUnblock(eval) {
		// Just re-enqueue the eval immediately. We pass the token so that the
		// eval_broker can properly handle the case in which the evaluation is
		// still outstanding.
		b.evalBroker.EnqueueAll(map[*structs.Evaluation]string{eval: token})
		return
	}

	// Mark the job as tracked.
	b.jobs[structs.NewNamespacedID(eval.JobID, eval.Namespace)] = eval.ID
	b.stats.Block(eval)

	// Track that the evaluation is being added due to reaching the quota limit
	if eval.QuotaLimitReached != "" {
		b.stats.TotalQuotaLimit++
	}

	// Wrap the evaluation, capturing its token.
	wrapped := wrappedEval{
		eval:  eval,
		token: token,
	}

	// If the eval has escaped, meaning computed node classes could not capture
	// the constraints of the job, we store the eval separately as we have to
	// unblock it whenever node capacity changes. This is because we don't know
	// what node class is feasible for the job's constraints.
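	// Escaped evals are tracked only in b.escaped (they return early below and
	// are never added to b.captured), so they are unblocked on any capacity
	// change rather than per computed node class.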
	if eval.EscapedComputedClass {
		b.escaped[eval.ID] = wrapped
		b.stats.TotalEscaped++
		return
	}

	// System evals are indexed by node and re-processed on utilization changes
	// in existing nodes.
	if eval.Type == structs.JobTypeSystem {
		b.system.Add(eval, token)
	}

	// Add the eval to the set of blocked evals whose jobs' constraints are
	// captured by computed node class.
	b.captured[eval.ID] = wrapped
}

// processBlockJobDuplicate handles the case where the new eval is for a job
// that we are already tracking. If the eval is a duplicate, we add the older
// evaluation by Raft index to the list of duplicates such that it can be
// cancelled. We only ever want one blocked evaluation per job, otherwise we
// would create unnecessary work for the scheduler as multiple evals for the
// same job would be run, all producing the same outcome. It is critical to
// prefer the newer evaluation, since it will contain the most up to date set
// of class eligibility. The return value is true if the passed evaluation is
// cancelled. This should be called with the lock held.
func (b *BlockedEvals) processBlockJobDuplicate(eval *structs.Evaluation) (newCancelled bool) {
	existingID, hasExisting := b.jobs[structs.NewNamespacedID(eval.JobID, eval.Namespace)]
	if !hasExisting {
		return
	}

	var dup *structs.Evaluation
	existingW, ok := b.captured[existingID]
	if ok {
		if latestEvalIndex(existingW.eval) <= latestEvalIndex(eval) {
			delete(b.captured, existingID)
			dup = existingW.eval
			b.stats.Unblock(dup)
		} else {
			dup = eval
			newCancelled = true
		}
	} else {
		existingW, ok = b.escaped[existingID]
		if !ok {
			// This is a programming error
			b.logger.Error("existing blocked evaluation is neither tracked as captured nor escaped", "existing_id", existingID)
			delete(b.jobs, structs.NewNamespacedID(eval.JobID, eval.Namespace))
			return
		}

		if latestEvalIndex(existingW.eval) <= latestEvalIndex(eval) {
			delete(b.escaped, existingID)
			b.stats.TotalEscaped--
			dup = existingW.eval
		} else {
			dup = eval
			newCancelled = true
		}
	}

	b.duplicates = append(b.duplicates, dup)

	// Unblock any waiter.
	select {
	case b.duplicateCh <- struct{}{}:
	default:
	}

	return
}

// latestEvalIndex returns the max of the evaluation's create and snapshot
// indexes.
func latestEvalIndex(eval *structs.Evaluation) uint64 {
	if eval == nil {
		return 0
	}

	return max(eval.CreateIndex, eval.SnapshotIndex)
}

// missedUnblock returns whether an evaluation missed an unblock while it was
// in the scheduler. Since the scheduler can operate at an index in the past,
// the evaluation may have been processed missing data that would allow it to
// complete. This method returns true if that is the case and should be called
// with the lock held.
func (b *BlockedEvals) missedUnblock(eval *structs.Evaluation) bool {
	var max uint64 = 0
	for id, index := range b.unblockIndexes {
		// Calculate the max unblock index
		if max < index {
			max = index
		}

		// The evaluation is blocked because it has hit a quota limit, not
		// class eligibility.
		if eval.QuotaLimitReached != "" {
			if eval.QuotaLimitReached != id {
				// Not a match
				continue
			} else if eval.SnapshotIndex < index {
				// The evaluation was processed before the quota specification
				// was updated, so unblock the evaluation.
				return true
			}

			// The evaluation was processed having seen all changes to the quota
			return false
		}

		elig, ok := eval.ClassEligibility[id]
		if !ok && eval.SnapshotIndex < index {
			// The evaluation was processed and did not encounter this class
			// because it was added after it was processed. Thus for correctness
			// we need to unblock it.
			return true
		}

		// The evaluation could use the computed node class and the eval was
		// processed before the last unblock.
		if elig && eval.SnapshotIndex < index {
			return true
		}
	}

	// If the evaluation has escaped and the map contains an index newer than
	// the evaluation's snapshot index, it should be unblocked.
	if eval.EscapedComputedClass && eval.SnapshotIndex < max {
		return true
	}

	// The evaluation is ahead of all recent unblocks.
	return false
}

// Untrack causes any blocked evaluation for the passed job to be no longer
// tracked. Untrack is called when there is a successful evaluation for the job
// and a blocked evaluation is no longer needed.
func (b *BlockedEvals) Untrack(jobID, namespace string) {
	b.l.Lock()
	defer b.l.Unlock()

	// Do nothing if not enabled
	if !b.enabled {
		return
	}

	nsID := structs.NewNamespacedID(jobID, namespace)

	if evals, ok := b.system.JobEvals(nsID); ok {
		for _, e := range evals {
			b.system.Remove(e)
			b.stats.Unblock(e)
		}
		return
	}

	// Get the evaluation ID to cancel
	evalID, ok := b.jobs[nsID]
	if !ok {
		// No blocked evaluation so exit
		return
	}

	// Attempt to delete the evaluation
	if w, ok := b.captured[evalID]; ok {
		delete(b.jobs, nsID)
		delete(b.captured, evalID)
		b.stats.Unblock(w.eval)
		if w.eval.QuotaLimitReached != "" {
			b.stats.TotalQuotaLimit--
		}
	}

	if w, ok := b.escaped[evalID]; ok {
		delete(b.jobs, nsID)
		delete(b.escaped, evalID)
		b.stats.TotalEscaped--
		b.stats.Unblock(w.eval)
		if w.eval.QuotaLimitReached != "" {
			b.stats.TotalQuotaLimit--
		}
	}
}

// Unblock causes any evaluation that could potentially make progress on a
// capacity change on the passed computed node class to be enqueued into the
// eval broker.
func (b *BlockedEvals) Unblock(computedClass string, index uint64) {
	b.l.Lock()

	// Do nothing if not enabled
	if !b.enabled {
		b.l.Unlock()
		return
	}

	// Store the index in which the unblock happened. We use this on subsequent
	// block calls in case the evaluation was in the scheduler when a trigger
	// occurred.
	b.unblockIndexes[computedClass] = index

	// Capture chan in lock as Flush overwrites it
	ch := b.capacityChangeCh
	done := b.stopCh
	b.l.Unlock()

	select {
	case <-done:
	case ch <- &capacityUpdate{
		computedClass: computedClass,
		index:         index,
	}:
	}
}

// UnblockQuota causes any evaluation that could potentially make progress on a
// capacity change on the passed quota to be enqueued into the eval broker.
func (b *BlockedEvals) UnblockQuota(quota string, index uint64) {
	// Nothing to do
	if quota == "" {
		return
	}

	b.l.Lock()

	// Do nothing if not enabled
	if !b.enabled {
		b.l.Unlock()
		return
	}

	// Store the index in which the unblock happened. We use this on subsequent
	// block calls in case the evaluation was in the scheduler when a trigger
	// occurred.
	b.unblockIndexes[quota] = index
	ch := b.capacityChangeCh
	done := b.stopCh
	b.l.Unlock()

	select {
	case <-done:
	case ch <- &capacityUpdate{
		quotaChange: quota,
		index:       index,
	}:
	}
}

// UnblockClassAndQuota causes any evaluation that could potentially make
// progress on a capacity change on the passed computed node class or quota to
// be enqueued into the eval broker.
func (b *BlockedEvals) UnblockClassAndQuota(class, quota string, index uint64) {
	b.l.Lock()

	// Do nothing if not enabled
	if !b.enabled {
		b.l.Unlock()
		return
	}

	// Store the index in which the unblock happened. We use this on subsequent
	// block calls in case the evaluation was in the scheduler when a trigger
	// occurred.
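	// Both the quota (when set) and the class are recorded so that
	// missedUnblock can match a blocked eval on either key.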
	if quota != "" {
		b.unblockIndexes[quota] = index
	}
	b.unblockIndexes[class] = index

	// Capture chan inside the lock to prevent a race with it getting reset
	// in Flush.
	ch := b.capacityChangeCh
	done := b.stopCh
	b.l.Unlock()

	select {
	case <-done:
	case ch <- &capacityUpdate{
		computedClass: class,
		quotaChange:   quota,
		index:         index,
	}:
	}
}

// UnblockNode finds any blocked evaluation that's node specific (system jobs)
// and enqueues it on the eval broker.
func (b *BlockedEvals) UnblockNode(nodeID string, index uint64) {
	b.l.Lock()
	defer b.l.Unlock()

	evals, ok := b.system.NodeEvals(nodeID)

	// Do nothing if not enabled
	if !b.enabled || !ok || len(evals) == 0 {
		return
	}

	for e := range evals {
		b.system.Remove(e)
		b.stats.Unblock(e)
	}

	b.evalBroker.EnqueueAll(evals)
}

// watchCapacity is a long lived function that watches for capacity changes in
// nodes and unblocks the correct set of evals.
func (b *BlockedEvals) watchCapacity(stopCh <-chan struct{}, changeCh <-chan *capacityUpdate) {
	for {
		select {
		case <-stopCh:
			return
		case update := <-changeCh:
			b.unblock(update.computedClass, update.quotaChange, update.index)
		}
	}
}

func (b *BlockedEvals) unblock(computedClass, quota string, index uint64) {
	b.l.Lock()
	defer b.l.Unlock()

	// Protect against the case of a flush.
	if !b.enabled {
		return
	}

	// Every eval that has escaped computed node class has to be unblocked
	// because any node could potentially be feasible.
	numQuotaLimit := 0
	numEscaped := len(b.escaped)
	unblocked := make(map[*structs.Evaluation]string, max(uint64(numEscaped), 4))

	if numEscaped != 0 && computedClass != "" {
		for id, wrapped := range b.escaped {
			unblocked[wrapped.eval] = wrapped.token
			delete(b.escaped, id)
			delete(b.jobs, structs.NewNamespacedID(wrapped.eval.JobID, wrapped.eval.Namespace))

			if wrapped.eval.QuotaLimitReached != "" {
				numQuotaLimit++
			}
		}
	}

	// We unblock any eval that is explicitly eligible for the computed class
	// and also any eval whose eligibility for the class is unknown. The latter
	// signifies that when the evaluation was originally run through the
	// scheduler, it never saw a node with the given computed class and thus
	// needs to be unblocked for correctness.
	for id, wrapped := range b.captured {
		if quota != "" && wrapped.eval.QuotaLimitReached != "" && wrapped.eval.QuotaLimitReached != quota {
			// We are unblocking based on quota and this eval doesn't match
			continue
		}

		if elig, ok := wrapped.eval.ClassEligibility[computedClass]; ok && !elig {
			// Can skip because the eval has explicitly marked the node class
			// as ineligible.
			continue
		}

		// Unblock the evaluation because it is either for the matching quota,
		// is eligible based on the computed node class, or has never seen the
		// computed node class.
		unblocked[wrapped.eval] = wrapped.token
		delete(b.jobs, structs.NewNamespacedID(wrapped.eval.JobID, wrapped.eval.Namespace))
		delete(b.captured, id)

		if wrapped.eval.QuotaLimitReached != "" {
			numQuotaLimit++
		}
	}

	if len(unblocked) != 0 {
		// Update the counters
		b.stats.TotalEscaped = 0
		b.stats.TotalQuotaLimit -= numQuotaLimit
		for eval := range unblocked {
			b.stats.Unblock(eval)
		}

		// Enqueue all the unblocked evals into the broker.
		b.evalBroker.EnqueueAll(unblocked)
	}
}

// UnblockFailed unblocks all blocked evaluations that were due to scheduler
// failure.
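//
// A minimal sketch of a periodic caller (hypothetical; the interval and the
// blocked variable are assumptions, not part of this file):
//
//	ticker := time.NewTicker(30 * time.Second)
//	defer ticker.Stop()
//	for range ticker.C {
//		blocked.UnblockFailed()
//	}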
func (b *BlockedEvals) UnblockFailed() {
	b.l.Lock()
	defer b.l.Unlock()

	// Do nothing if not enabled
	if !b.enabled {
		return
	}

	quotaLimit := 0
	unblocked := make(map[*structs.Evaluation]string, 4)
	for id, wrapped := range b.captured {
		if wrapped.eval.TriggeredBy == structs.EvalTriggerMaxPlans {
			unblocked[wrapped.eval] = wrapped.token
			delete(b.captured, id)
			delete(b.jobs, structs.NewNamespacedID(wrapped.eval.JobID, wrapped.eval.Namespace))
			if wrapped.eval.QuotaLimitReached != "" {
				quotaLimit++
			}
		}
	}

	for id, wrapped := range b.escaped {
		if wrapped.eval.TriggeredBy == structs.EvalTriggerMaxPlans {
			unblocked[wrapped.eval] = wrapped.token
			delete(b.escaped, id)
			delete(b.jobs, structs.NewNamespacedID(wrapped.eval.JobID, wrapped.eval.Namespace))
			b.stats.TotalEscaped -= 1
			if wrapped.eval.QuotaLimitReached != "" {
				quotaLimit++
			}
		}
	}

	if len(unblocked) > 0 {
		b.stats.TotalQuotaLimit -= quotaLimit
		for eval := range unblocked {
			b.stats.Unblock(eval)
		}

		b.evalBroker.EnqueueAll(unblocked)
	}
}

// GetDuplicates returns all the duplicate evaluations and blocks until the
// passed timeout.
func (b *BlockedEvals) GetDuplicates(timeout time.Duration) []*structs.Evaluation {
	var timeoutTimer *time.Timer
	var timeoutCh <-chan time.Time
SCAN:
	b.l.Lock()
	if len(b.duplicates) != 0 {
		dups := b.duplicates
		b.duplicates = nil
		b.l.Unlock()
		return dups
	}

	// Capture chans inside the lock to prevent a race with them getting
	// reset in Flush
	dupCh := b.duplicateCh
	stopCh := b.stopCh
	b.l.Unlock()

	// Create the timer
	if timeoutTimer == nil && timeout != 0 {
		timeoutTimer = time.NewTimer(timeout)
		timeoutCh = timeoutTimer.C
		defer timeoutTimer.Stop()
	}

	select {
	case <-stopCh:
		return nil
	case <-timeoutCh:
		return nil
	case <-dupCh:
		goto SCAN
	}
}

// Flush is used to clear the state of blocked evaluations.
func (b *BlockedEvals) Flush() {
	b.l.Lock()
	defer b.l.Unlock()

	// Reset the blocked eval tracker.
	b.stats.TotalEscaped = 0
	b.stats.TotalBlocked = 0
	b.stats.TotalQuotaLimit = 0
	b.stats.BlockedResources = NewBlockedResourcesStats()
	b.captured = make(map[string]wrappedEval)
	b.escaped = make(map[string]wrappedEval)
	b.jobs = make(map[structs.NamespacedID]string)
	b.unblockIndexes = make(map[string]uint64)
	b.timetable = nil
	b.duplicates = nil
	b.capacityChangeCh = make(chan *capacityUpdate, unblockBuffer)
	b.stopCh = make(chan struct{})
	b.duplicateCh = make(chan struct{}, 1)
	b.system = newSystemEvals()
}

// Stats is used to query the state of the blocked eval tracker.
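//
// A minimal sketch of reading the counters (hypothetical caller; fmt usage is
// illustrative only):
//
//	s := blocked.Stats()
//	fmt.Printf("blocked=%d escaped=%d quota=%d\n",
//		s.TotalBlocked, s.TotalEscaped, s.TotalQuotaLimit)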
func (b *BlockedEvals) Stats() *BlockedStats {
	// Allocate a new stats struct
	stats := NewBlockedStats()

	b.l.RLock()
	defer b.l.RUnlock()

	// Copy all the stats
	stats.TotalEscaped = b.stats.TotalEscaped
	stats.TotalBlocked = b.stats.TotalBlocked
	stats.TotalQuotaLimit = b.stats.TotalQuotaLimit
	stats.BlockedResources = b.stats.BlockedResources.Copy()

	return stats
}

// EmitStats is used to export metrics about the blocked eval tracker while
// enabled.
func (b *BlockedEvals) EmitStats(period time.Duration, stopCh <-chan struct{}) {
	timer, stop := helper.NewSafeTimer(period)
	defer stop()

	for {
		timer.Reset(period)

		select {
		case <-timer.C:
			stats := b.Stats()
			metrics.SetGauge([]string{"nomad", "blocked_evals", "total_quota_limit"}, float32(stats.TotalQuotaLimit))
			metrics.SetGauge([]string{"nomad", "blocked_evals", "total_blocked"}, float32(stats.TotalBlocked))
			metrics.SetGauge([]string{"nomad", "blocked_evals", "total_escaped"}, float32(stats.TotalEscaped))

			for k, v := range stats.BlockedResources.ByJob {
				labels := []metrics.Label{
					{Name: "namespace", Value: k.Namespace},
					{Name: "job", Value: k.ID},
				}
				metrics.SetGaugeWithLabels([]string{"nomad", "blocked_evals", "job", "cpu"}, float32(v.CPU), labels)
				metrics.SetGaugeWithLabels([]string{"nomad", "blocked_evals", "job", "memory"}, float32(v.MemoryMB), labels)
			}

			for k, v := range stats.BlockedResources.ByClassInDC {
				labels := []metrics.Label{
					{Name: "datacenter", Value: k.dc},
					{Name: "node_class", Value: k.class},
				}
				metrics.SetGaugeWithLabels([]string{"nomad", "blocked_evals", "cpu"}, float32(v.CPU), labels)
				metrics.SetGaugeWithLabels([]string{"nomad", "blocked_evals", "memory"}, float32(v.MemoryMB), labels)
			}
		case <-stopCh:
			return
		}
	}
}

// prune is a long lived function that prunes unnecessary objects on a timer.
func (b *BlockedEvals) prune(stopCh <-chan struct{}) {
	ticker := time.NewTicker(pruneInterval)
	defer ticker.Stop()

	for {
		select {
		case <-stopCh:
			return
		case t := <-ticker.C:
			cutoff := t.UTC().Add(-1 * pruneThreshold)
			b.pruneUnblockIndexes(cutoff)
			b.pruneStats(cutoff)
		}
	}
}

// pruneUnblockIndexes is used to prune any tracked entry that is excessively
// old. This protects against unbounded growth of the map.
func (b *BlockedEvals) pruneUnblockIndexes(cutoff time.Time) {
	b.l.Lock()
	defer b.l.Unlock()

	if b.timetable == nil {
		return
	}

	oldThreshold := b.timetable.NearestIndex(cutoff)
	for key, index := range b.unblockIndexes {
		if index < oldThreshold {
			delete(b.unblockIndexes, key)
		}
	}
}

// pruneStats is used to prune any zero value stats that are excessively old.
func (b *BlockedEvals) pruneStats(cutoff time.Time) {
	b.l.Lock()
	defer b.l.Unlock()
	b.stats.prune(cutoff)
}
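// A minimal sketch of wiring EmitStats into a server's run loop (hypothetical;
// the period and stop channel are assumptions, not part of this file):
//
//	stopCh := make(chan struct{})
//	go blocked.EmitStats(10*time.Second, stopCh)
//	// ...
//	close(stopCh)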