scheduler: recover from panic (#12009)
If processing a specific evaluation causes the scheduler (and therefore the entire server) to panic, that evaluation never gets a chance to be nack'd and cleared from the state store. It is then dequeued by another scheduler, causing that server to panic as well, and so forth until all servers are in a panic loop. This prevents the operator from intervening to remove the evaluation or update the state. Recover from the panic in the top-level `Process` method of each scheduler so that this condition can be detected without crashing the server process. This will lead to a loop of recovering the scheduler goroutine until the eval can be removed or nack'd, but that's much better than taking downtime.
This commit is contained in:
parent
7a63a249ca
commit
464026c87b
|
@ -0,0 +1,3 @@
|
||||||
|
```release-note:improvement
|
||||||
|
scheduler: recover scheduler goroutines on panic
|
||||||
|
```
|
|
@ -125,7 +125,14 @@ func NewBatchScheduler(logger log.Logger, eventsCh chan<- interface{}, state Sta
|
||||||
}
|
}
|
||||||
|
|
||||||
// Process is used to handle a single evaluation
|
// Process is used to handle a single evaluation
|
||||||
func (s *GenericScheduler) Process(eval *structs.Evaluation) error {
|
func (s *GenericScheduler) Process(eval *structs.Evaluation) (err error) {
|
||||||
|
|
||||||
|
defer func() {
|
||||||
|
if r := recover(); r != nil {
|
||||||
|
err = fmt.Errorf("processing eval %q panicked scheduler - please report this as a bug! - %v", eval.ID, r)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
// Store the evaluation
|
// Store the evaluation
|
||||||
s.eval = eval
|
s.eval = eval
|
||||||
|
|
||||||
|
|
|
@ -72,7 +72,13 @@ func NewSysBatchScheduler(logger log.Logger, eventsCh chan<- interface{}, state
|
||||||
}
|
}
|
||||||
|
|
||||||
// Process is used to handle a single evaluation.
|
// Process is used to handle a single evaluation.
|
||||||
func (s *SystemScheduler) Process(eval *structs.Evaluation) error {
|
func (s *SystemScheduler) Process(eval *structs.Evaluation) (err error) {
|
||||||
|
|
||||||
|
defer func() {
|
||||||
|
if r := recover(); r != nil {
|
||||||
|
err = fmt.Errorf("processing eval %q panicked scheduler - please report this as a bug! - %v", eval.ID, r)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
// Store the evaluation
|
// Store the evaluation
|
||||||
s.eval = eval
|
s.eval = eval
|
||||||
|
|
Loading…
Reference in New Issue