From c24e73ede7d0d3dbe3684933887367551b5d7fe8 Mon Sep 17 00:00:00 2001
From: Michael Schurter
Date: Tue, 18 Jul 2017 11:12:56 -0700
Subject: [PATCH] Fix deadlock caused by syncing during destroy

When replacing an alloc, the new alloc is blocked until the old alloc is
destroyed. This could cause a deadlock:

1. Destroying the old alloc includes a final sync of its status
2. Syncing status causes a GC
3. A GC looks for terminal allocs to clean up
4. The GC waits for an alloc to stop completely before GC'ing

If the GC chooses the currently-being-destroyed alloc to GC, the GC
deadlocks. If `client.max_parallel` deadlocks happen, the GC is wedged
until the Nomad process is restarted.

Performing the final sync asynchronously is an ugly hack but prevents
the deadlock by allowing the final sync to occur after the alloc runner
has shut down and been destroyed.
---
 client/alloc_runner.go | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/client/alloc_runner.go b/client/alloc_runner.go
index 80898f715..abcbd16c2 100644
--- a/client/alloc_runner.go
+++ b/client/alloc_runner.go
@@ -853,7 +853,19 @@ func (r *AllocRunner) destroyTaskRunners(destroyEvent *structs.TaskEvent) {
 func (r *AllocRunner) handleDestroy() {
 	// Final state sync. We do this to ensure that the server has the correct
 	// state as we wait for a destroy.
-	r.syncStatus()
+	alloc := r.Alloc()
+
+	//TODO(schmichael) updater can cause a GC which can block on this alloc
+	// runner shutting down. Since handleDestroy can be called by Run() we
+	// can't block shutdown here as it would cause a deadlock.
+	go r.updater(alloc)
+
+	// Broadcast and persist state synchronously
+	r.allocBroadcast.Send(alloc)
+	if err := r.saveAllocRunnerState(); err != nil {
+		r.logger.Printf("[WARN] client: alloc %q unable to persist state but should be GC'd soon anyway:%v",
+			r.allocID, err)
+	}
 
 	for {
 		select {
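
For readers tracing the commit message, here is a minimal standalone Go sketch of the deadlock shape the patch avoids. It is not Nomad code: the names destroyDone and updater and the timing are hypothetical stand-ins. A synchronous updater call would block the destroy path on a GC that is itself waiting for that destroy to finish; dispatching it with a goroutine, as the patch does with `go r.updater(alloc)`, breaks the cycle.

package main

import (
	"fmt"
	"time"
)

func main() {
	destroyDone := make(chan struct{})

	// updater stands in for the final status sync: syncing status kicks off
	// a GC, and that GC blocks until the alloc being destroyed has fully
	// stopped (modeled here by waiting on destroyDone).
	updater := func() {
		<-destroyDone
		fmt.Println("GC ran after the destroy completed")
	}

	// handleDestroy's final sync: calling updater() synchronously here would
	// never return, because destroyDone is only closed after this call, which
	// is the deadlock described above. Dispatching it in a goroutine lets the
	// destroy finish first.
	go updater()

	// The destroy completes; the GC goroutine can now proceed.
	close(destroyDone)

	// Give the asynchronous updater a moment to run before main exits.
	time.Sleep(100 * time.Millisecond)
}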