From 464026c87bd0127b38dc8750d52b4a660569959f Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Mon, 7 Feb 2022 11:47:53 -0500 Subject: [PATCH] scheduler: recover from panic (#12009) If processing a specific evaluation causes the scheduler (and therefore the entire server) to panic, that evaluation will never get a chance to be nack'd and cleared from the state store. It will get dequeued by another scheduler, causing that server to panic, and so forth until all servers are in a panic loop. This prevents the operator from intervening to remove the evaluation or update the state. Recover the goroutine from the top-level `Process` methods for each scheduler so that this condition can be detected without panicking the server process. This will lead to a loop of recovering the scheduler goroutine until the eval can be removed or nack'd, but that's much better than taking a downtime. --- .changelog/12009.txt | 3 +++ scheduler/generic_sched.go | 9 ++++++++- scheduler/scheduler_system.go | 8 +++++++- 3 files changed, 18 insertions(+), 2 deletions(-) create mode 100644 .changelog/12009.txt diff --git a/.changelog/12009.txt b/.changelog/12009.txt new file mode 100644 index 000000000..e57cbca6a --- /dev/null +++ b/.changelog/12009.txt @@ -0,0 +1,3 @@ +```release-note:improvement +scheduler: recover scheduler goroutines on panic +``` diff --git a/scheduler/generic_sched.go b/scheduler/generic_sched.go index f263d864b..6d6c262db 100644 --- a/scheduler/generic_sched.go +++ b/scheduler/generic_sched.go @@ -125,7 +125,14 @@ func NewBatchScheduler(logger log.Logger, eventsCh chan<- interface{}, state Sta } // Process is used to handle a single evaluation -func (s *GenericScheduler) Process(eval *structs.Evaluation) error { +func (s *GenericScheduler) Process(eval *structs.Evaluation) (err error) { + + defer func() { + if r := recover(); r != nil { + err = fmt.Errorf("processing eval %q panicked scheduler - please report this as a bug! - %v", eval.ID, r) + } + }() + // Store the evaluation s.eval = eval diff --git a/scheduler/scheduler_system.go b/scheduler/scheduler_system.go index dff79aa8b..59de58d89 100644 --- a/scheduler/scheduler_system.go +++ b/scheduler/scheduler_system.go @@ -72,7 +72,13 @@ func NewSysBatchScheduler(logger log.Logger, eventsCh chan<- interface{}, state } // Process is used to handle a single evaluation. -func (s *SystemScheduler) Process(eval *structs.Evaluation) error { +func (s *SystemScheduler) Process(eval *structs.Evaluation) (err error) { + + defer func() { + if r := recover(); r != nil { + err = fmt.Errorf("processing eval %q panicked scheduler - please report this as a bug! - %v", eval.ID, r) + } + }() // Store the evaluation s.eval = eval