From c24e73ede7d0d3dbe3684933887367551b5d7fe8 Mon Sep 17 00:00:00 2001
From: Michael Schurter
Date: Tue, 18 Jul 2017 11:12:56 -0700
Subject: [PATCH] Fix deadlock caused by syncing during destroy

When replacing an alloc, the new alloc is blocked until the old alloc is
destroyed. This could cause a deadlock:

1. Destroying the old alloc includes a final sync of its status
2. Syncing status causes a GC
3. A GC looks for terminal allocs to clean up
4. The GC waits for an alloc to stop completely before GC'ing

If the GC chooses the currently-being-destroyed alloc to GC, the GC
deadlocks. If `client.max_parallel` deadlocks happen, the GC is wedged
until the Nomad process is restarted.

Performing the final sync asynchronously is an ugly hack but prevents
the deadlock by allowing the final sync to occur after the alloc runner
has shut down and been destroyed.
---
 client/alloc_runner.go | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/client/alloc_runner.go b/client/alloc_runner.go
index 80898f715..abcbd16c2 100644
--- a/client/alloc_runner.go
+++ b/client/alloc_runner.go
@@ -853,7 +853,19 @@ func (r *AllocRunner) destroyTaskRunners(destroyEvent *structs.TaskEvent) {
 func (r *AllocRunner) handleDestroy() {
 	// Final state sync. We do this to ensure that the server has the correct
 	// state as we wait for a destroy.
-	r.syncStatus()
+	alloc := r.Alloc()
+
+	//TODO(schmichael) updater can cause a GC which can block on this alloc
+	// runner shutting down. Since handleDestroy can be called by Run() we
+	// can't block shutdown here as it would cause a deadlock.
+	go r.updater(alloc)
+
+	// Broadcast and persist state synchronously
+	r.allocBroadcast.Send(alloc)
+	if err := r.saveAllocRunnerState(); err != nil {
+		r.logger.Printf("[WARN] client: alloc %q unable to persist state but should be GC'd soon anyway:%v",
+			r.allocID, err)
+	}
 
 	for {
 		select {
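
For readers tracing the commit message, here is a minimal standalone Go sketch of the deadlock shape the patch avoids. It is not Nomad code: the names destroyDone and updater and the timing are hypothetical stand-ins. A synchronous updater call would block the destroy path on a GC that is itself waiting for that destroy to finish; dispatching it with a goroutine, as the patch does with `go r.updater(alloc)`, breaks the cycle.

package main

import (
	"fmt"
	"time"
)

func main() {
	destroyDone := make(chan struct{})

	// updater stands in for the final status sync: syncing status kicks off
	// a GC, and that GC blocks until the alloc being destroyed has fully
	// stopped (modeled here by waiting on destroyDone).
	updater := func() {
		<-destroyDone
		fmt.Println("GC ran after the destroy completed")
	}

	// handleDestroy's final sync: calling updater() synchronously here would
	// never return, because destroyDone is only closed after this call, which
	// is the deadlock described above. Dispatching it in a goroutine lets the
	// destroy finish first.
	go updater()

	// The destroy completes; the GC goroutine can now proceed.
	close(destroyDone)

	// Give the asynchronous updater a moment to run before main exits.
	time.Sleep(100 * time.Millisecond)
}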