From 3e7017e119e10c63f7f0e53872172e4a09ef63b2 Mon Sep 17 00:00:00 2001
From: hc-github-team-nomad-core <82989552+hc-github-team-nomad-core@users.noreply.github.com>
Date: Thu, 7 Dec 2023 13:30:17 -0600
Subject: [PATCH] Backport of client: allow incomplete allocrunners to be
 removed on restore into release/1.6.x #19361

Co-authored-by: Tim Gross
---
 .changelog/16638.txt  |  3 +++
 client/client.go      | 48 +++++++++++++++++++------------------------
 client/client_test.go |  7 +++++++
 3 files changed, 31 insertions(+), 27 deletions(-)
 create mode 100644 .changelog/16638.txt

diff --git a/.changelog/16638.txt b/.changelog/16638.txt
new file mode 100644
index 000000000..444a20794
--- /dev/null
+++ b/.changelog/16638.txt
@@ -0,0 +1,3 @@
+```release-note:bug
+client: remove incomplete allocation entries from client state database during client restarts
+```
diff --git a/client/client.go b/client/client.go
index 6b3db811c..5c2185427 100644
--- a/client/client.go
+++ b/client/client.go
@@ -1170,43 +1170,35 @@ func (c *Client) restoreState() error {
 		return nil
 	}
 
-	//XXX REMOVED! make a note in backward compat / upgrading doc
-	// COMPAT: Remove in 0.7.0
-	// 0.6.0 transitioned from individual state files to a single bolt-db.
-	// The upgrade path is to:
-	// Check if old state exists
-	//   If so, restore from that and delete old state
-	// Restore using state database
-
 	// Restore allocations
 	allocs, allocErrs, err := c.stateDB.GetAllAllocations()
 	if err != nil {
 		return err
 	}
 
 	for allocID, err := range allocErrs {
+		// The boltdb values don't exist or didn't decode correctly. The state
+		// store is corrupt or there was a backwards incompatibility. Either way
+		// log it so we can debug it with operator client-state command.
 		c.logger.Error("error restoring alloc", "error", err, "alloc_id", allocID)
-		//TODO Cleanup
-		// Try to clean up alloc dir
-		// Remove boltdb entries?
-		// Send to server with clientstatus=failed
 	}
 
 	// Load each alloc back
 	for _, alloc := range allocs {
 
-		// COMPAT(0.12): remove once upgrading from 0.9.5 is no longer supported
-		// See hasLocalState for details. Skipping suspicious allocs
-		// now. If allocs should be run, they will be started when the client
-		// gets allocs from servers.
+		// If the alloc has no task state, we most likely stopped the client
+		// after the allocrunner was created but before tasks could
+		// start. Remove the client state so that we can start over with this
+		// alloc if the server still wants to place it here.
 		if !c.hasLocalState(alloc) {
-			c.logger.Warn("found an alloc without any local state, skipping restore", "alloc_id", alloc.ID)
+			c.logger.Warn(
+				"found an alloc without any local state, deleting from client state db",
+				"alloc_id", alloc.ID)
+			c.stateDB.DeleteAllocationBucket(alloc.ID, state.WithBatchMode())
 			continue
 		}
 
-		//XXX On Restore we give up on watching previous allocs because
-		//    we need the local AllocRunners initialized first. We could
-		//    add a second loop to initialize just the alloc watcher.
+		// On Restore we give up on watching previous allocs because we need the
+		// local AllocRunners initialized first.
 		prevAllocWatcher := allocwatcher.NoopPrevAlloc{}
 		prevAllocMigrator := allocwatcher.NoopPrevAlloc{}
 
@@ -1287,11 +1279,15 @@ func (c *Client) restoreState() error {
 }
 
 // hasLocalState returns true if we have any other associated state
-// with alloc beyond the task itself
+// with alloc beyond the task itself. We want to detect two possible scenarios:
 //
-// Useful for detecting if a potentially completed alloc got resurrected
-// after AR was destroyed. In such cases, re-running the alloc lead to
-// unexpected reruns and may lead to process and task exhaustion on node.
+// 1. A potentially completed alloc got resurrected after AR was destroyed. In
+//    such cases, re-running the alloc leads to unexpected reruns and may lead
+//    to process and task exhaustion on node.
+//
+// 2. The client stopped after the allocrunner was persisted to disk but before
+//    tasks started. In this case, we don't restart the AR and instead wait until
+//    we get the desired state from the server.
 //
 // The heuristic used here is an alloc is suspect if we see no other information
 // and no other task/status info is found.
@@ -1303,8 +1299,6 @@ func (c *Client) restoreState() error {
 // See:
 // - https://github.com/hashicorp/nomad/pull/6207
 // - https://github.com/hashicorp/nomad/issues/5984
-//
-// COMPAT(0.12): remove once upgrading from 0.9.5 is no longer supported
 func (c *Client) hasLocalState(alloc *structs.Allocation) bool {
 	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
 	if tg == nil {
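One detail worth calling out in the client.go hunk above: the new DeleteAllocationBucket call passes state.WithBatchMode(). The client state store wraps boltdb, and batch mode lets deletes that arrive close together share a transaction instead of each paying its own commit, which matters when a single restore prunes many incomplete allocs. Below is a minimal sketch of that idea written directly against bbolt; the "allocations" parent bucket and the pruneAllocBucket helper are illustrative stand-ins, not Nomad's actual state schema.

```go
package main

import (
	"errors"
	"log"

	bolt "go.etcd.io/bbolt"
)

// pruneAllocBucket deletes one per-alloc bucket inside a Batch transaction.
// bbolt coalesces Batch calls issued around the same time into a single
// commit, so deleting N stale allocs costs far fewer fsyncs than N Updates.
func pruneAllocBucket(db *bolt.DB, allocID string) error {
	return db.Batch(func(tx *bolt.Tx) error {
		parent := tx.Bucket([]byte("allocations")) // illustrative bucket name
		if parent == nil {
			return nil // nothing persisted yet, nothing to prune
		}
		err := parent.DeleteBucket([]byte(allocID))
		if errors.Is(err, bolt.ErrBucketNotFound) {
			return nil // already gone; deletion is idempotent
		}
		return err
	})
}

func main() {
	db, err := bolt.Open("client-state.db", 0o600, nil)
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	if err := pruneAllocBucket(db, "incomplete-alloc"); err != nil {
		log.Fatal(err)
	}
}
```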
diff --git a/client/client_test.go b/client/client_test.go
index 0aa00040a..ff0eb163b 100644
--- a/client/client_test.go
+++ b/client/client_test.go
@@ -750,6 +750,10 @@ func TestClient_SaveRestoreState(t *testing.T) {
 		wait.Gap(time.Millisecond*30),
 	))
 
+	// Create a corrupted allocation that will be removed during restore
+	corruptAlloc := mock.Alloc()
+	c1.stateDB.PutAllocation(corruptAlloc)
+
 	t.Log("shutting down client")
 	must.NoError(t, c1.Shutdown()) // note: this saves the client state DB
 
@@ -865,6 +869,9 @@ func TestClient_SaveRestoreState(t *testing.T) {
 				"alloc %s stopped during shutdown should have updated", a3.ID[:8])
 		}
 
+	case corruptAlloc.ID:
+		return fmt.Errorf("corrupted allocation should not have been restored")
+
 	default:
 		if ar.AllocState().ClientStatus != structs.AllocClientStatusComplete {
 			return fmt.Errorf("expected complete client status, got %v",
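The test seeds corruptAlloc with PutAllocation only, so the alloc entry is persisted but no task ever writes state; on restart hasLocalState reports false and the restore loop deletes the entry, which is exactly what the new case arm asserts. Here is a self-contained sketch of that round trip; stateDB, hasLocalState, and restore below are simplified toy models of the patched behavior, not Nomad's real client and state types.

```go
package main

import "fmt"

// stateDB is a toy stand-in for the client state database: a set of persisted
// allocrunner entries plus whatever task state those allocs wrote before the
// client stopped.
type stateDB struct {
	allocs     map[string]bool              // persisted allocrunner entries
	taskStates map[string]map[string]string // alloc ID -> task name -> state
}

// hasLocalState mirrors the patched heuristic: an alloc is only restorable if
// at least one of its tasks persisted state before the client stopped.
func (db *stateDB) hasLocalState(allocID string) bool {
	return len(db.taskStates[allocID]) > 0
}

// restore walks the persisted allocs, deleting entries that were written
// before tasks could start, and returns the IDs that were restored.
func (db *stateDB) restore() []string {
	var restored []string
	for allocID := range db.allocs {
		if !db.hasLocalState(allocID) {
			delete(db.allocs, allocID) // start over if the server re-places it
			continue
		}
		restored = append(restored, allocID)
	}
	return restored
}

func main() {
	db := &stateDB{
		allocs:     map[string]bool{"healthy-alloc": true},
		taskStates: map[string]map[string]string{"healthy-alloc": {"web": "running"}},
	}

	// Seed a "corrupted" entry the way the test does: the alloc is persisted,
	// but no task state ever follows.
	db.allocs["corrupt-alloc"] = true

	fmt.Println("restored:", db.restore())                         // [healthy-alloc]
	fmt.Println("corrupt entry kept:", db.allocs["corrupt-alloc"]) // false
}
```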