From 3e7017e119e10c63f7f0e53872172e4a09ef63b2 Mon Sep 17 00:00:00 2001
From: hc-github-team-nomad-core <82989552+hc-github-team-nomad-core@users.noreply.github.com>
Date: Thu, 7 Dec 2023 13:30:17 -0600
Subject: [PATCH] Backport of client: allow incomplete allocrunners to be
 removed on restore into release/1.6.x #19361

Co-authored-by: Tim Gross
---
 .changelog/16638.txt  |  3 +++
 client/client.go      | 48 +++++++++++++++++++------------------------
 client/client_test.go |  7 +++++++
 3 files changed, 31 insertions(+), 27 deletions(-)
 create mode 100644 .changelog/16638.txt

diff --git a/.changelog/16638.txt b/.changelog/16638.txt
new file mode 100644
index 000000000..444a20794
--- /dev/null
+++ b/.changelog/16638.txt
@@ -0,0 +1,3 @@
+```release-note:bug
+client: remove incomplete allocation entries from client state database during client restarts
+```
diff --git a/client/client.go b/client/client.go
index 6b3db811c..5c2185427 100644
--- a/client/client.go
+++ b/client/client.go
@@ -1170,43 +1170,35 @@ func (c *Client) restoreState() error {
 		return nil
 	}
 
-	//XXX REMOVED! make a note in backward compat / upgrading doc
-	// COMPAT: Remove in 0.7.0
-	// 0.6.0 transitioned from individual state files to a single bolt-db.
-	// The upgrade path is to:
-	// Check if old state exists
-	//   If so, restore from that and delete old state
-	// Restore using state database
-
 	// Restore allocations
 	allocs, allocErrs, err := c.stateDB.GetAllAllocations()
 	if err != nil {
 		return err
 	}
 
 	for allocID, err := range allocErrs {
+		// The boltdb values don't exist or didn't decode correctly. The state
+		// store is corrupt or there was a backwards incompatibility. Either way
+		// log it so we can debug it with operator client-state command.
 		c.logger.Error("error restoring alloc", "error", err, "alloc_id", allocID)
-		//TODO Cleanup
-		// Try to clean up alloc dir
-		// Remove boltdb entries?
-		// Send to server with clientstatus=failed
 	}
 
 	// Load each alloc back
 	for _, alloc := range allocs {
 
-		// COMPAT(0.12): remove once upgrading from 0.9.5 is no longer supported
-		// See hasLocalState for details. Skipping suspicious allocs
-		// now. If allocs should be run, they will be started when the client
-		// gets allocs from servers.
+		// If the alloc has no task state, we most likely stopped the client
+		// after the allocrunner was created but before tasks could
+		// start. Remove the client state so that we can start over with this
+		// alloc if the server still wants to place it here.
 		if !c.hasLocalState(alloc) {
-			c.logger.Warn("found an alloc without any local state, skipping restore", "alloc_id", alloc.ID)
+			c.logger.Warn(
+				"found an alloc without any local state, deleting from client state db",
+				"alloc_id", alloc.ID)
+			c.stateDB.DeleteAllocationBucket(alloc.ID, state.WithBatchMode())
 			continue
 		}
 
-		//XXX On Restore we give up on watching previous allocs because
-		//    we need the local AllocRunners initialized first. We could
-		//    add a second loop to initialize just the alloc watcher.
+		// On Restore we give up on watching previous allocs because we need the
+		// local AllocRunners initialized first.
 		prevAllocWatcher := allocwatcher.NoopPrevAlloc{}
 		prevAllocMigrator := allocwatcher.NoopPrevAlloc{}
 
@@ -1287,11 +1279,15 @@ func (c *Client) restoreState() error {
 }
 
 // hasLocalState returns true if we have any other associated state
-// with alloc beyond the task itself
+// with alloc beyond the task itself. We want to detect two possible scenarios:
 //
-// Useful for detecting if a potentially completed alloc got resurrected
-// after AR was destroyed. In such cases, re-running the alloc lead to
-// unexpected reruns and may lead to process and task exhaustion on node.
+// 1. A potentially completed alloc got resurrected after AR was destroyed. In
+//    such cases, re-running the alloc leads to unexpected reruns and may lead
+//    to process and task exhaustion on node.
+//
+// 2. The client stopped after the allocrunner was persisted to disk but before
+//    tasks started. In this case, we don't restart the AR and instead wait until
+//    we get the desired state from the server.
 //
 // The heuristic used here is an alloc is suspect if we see no other information
 // and no other task/status info is found.
@@ -1303,8 +1299,6 @@ func (c *Client) restoreState() error {
 // See:
 // - https://github.com/hashicorp/nomad/pull/6207
 // - https://github.com/hashicorp/nomad/issues/5984
-//
-// COMPAT(0.12): remove once upgrading from 0.9.5 is no longer supported
 func (c *Client) hasLocalState(alloc *structs.Allocation) bool {
 	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
 	if tg == nil {
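One detail worth calling out in the client.go hunk above: the new DeleteAllocationBucket call passes state.WithBatchMode(). The client state store wraps boltdb, and batch mode lets deletes that arrive close together share a transaction instead of each paying its own commit, which matters when a single restore prunes many incomplete allocs. Below is a minimal sketch of that idea written directly against bbolt; the "allocations" parent bucket and the pruneAllocBucket helper are illustrative stand-ins, not Nomad's actual state schema.

```go
package main

import (
	"errors"
	"log"

	bolt "go.etcd.io/bbolt"
)

// pruneAllocBucket deletes one per-alloc bucket inside a Batch transaction.
// bbolt coalesces Batch calls issued around the same time into a single
// commit, so deleting N stale allocs costs far fewer fsyncs than N Updates.
func pruneAllocBucket(db *bolt.DB, allocID string) error {
	return db.Batch(func(tx *bolt.Tx) error {
		parent := tx.Bucket([]byte("allocations")) // illustrative bucket name
		if parent == nil {
			return nil // nothing persisted yet, nothing to prune
		}
		err := parent.DeleteBucket([]byte(allocID))
		if errors.Is(err, bolt.ErrBucketNotFound) {
			return nil // already gone; deletion is idempotent
		}
		return err
	})
}

func main() {
	db, err := bolt.Open("client-state.db", 0o600, nil)
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	if err := pruneAllocBucket(db, "incomplete-alloc"); err != nil {
		log.Fatal(err)
	}
}
```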
diff --git a/client/client_test.go b/client/client_test.go
index 0aa00040a..ff0eb163b 100644
--- a/client/client_test.go
+++ b/client/client_test.go
@@ -750,6 +750,10 @@ func TestClient_SaveRestoreState(t *testing.T) {
 		wait.Gap(time.Millisecond*30),
 	))
 
+	// Create a corrupted allocation that will be removed during restore
+	corruptAlloc := mock.Alloc()
+	c1.stateDB.PutAllocation(corruptAlloc)
+
 	t.Log("shutting down client")
 	must.NoError(t, c1.Shutdown()) // note: this saves the client state DB
 
@@ -865,6 +869,9 @@ func TestClient_SaveRestoreState(t *testing.T) {
 				"alloc %s stopped during shutdown should have updated", a3.ID[:8])
 		}
 
+	case corruptAlloc.ID:
+		return fmt.Errorf("corrupted allocation should not have been restored")
+
 	default:
 		if ar.AllocState().ClientStatus != structs.AllocClientStatusComplete {
 			return fmt.Errorf("expected complete client status, got %v",
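The test seeds corruptAlloc with PutAllocation only, so the alloc entry is persisted but no task ever writes state; on restart hasLocalState reports false and the restore loop deletes the entry, which is exactly what the new case arm asserts. Here is a self-contained sketch of that round trip; stateDB, hasLocalState, and restore below are simplified toy models of the patched behavior, not Nomad's real client and state types.

```go
package main

import "fmt"

// stateDB is a toy stand-in for the client state database: a set of persisted
// allocrunner entries plus whatever task state those allocs wrote before the
// client stopped.
type stateDB struct {
	allocs     map[string]bool              // persisted allocrunner entries
	taskStates map[string]map[string]string // alloc ID -> task name -> state
}

// hasLocalState mirrors the patched heuristic: an alloc is only restorable if
// at least one of its tasks persisted state before the client stopped.
func (db *stateDB) hasLocalState(allocID string) bool {
	return len(db.taskStates[allocID]) > 0
}

// restore walks the persisted allocs, deleting entries that were written
// before tasks could start, and returns the IDs that were restored.
func (db *stateDB) restore() []string {
	var restored []string
	for allocID := range db.allocs {
		if !db.hasLocalState(allocID) {
			delete(db.allocs, allocID) // start over if the server re-places it
			continue
		}
		restored = append(restored, allocID)
	}
	return restored
}

func main() {
	db := &stateDB{
		allocs:     map[string]bool{"healthy-alloc": true},
		taskStates: map[string]map[string]string{"healthy-alloc": {"web": "running"}},
	}

	// Seed a "corrupted" entry the way the test does: the alloc is persisted,
	// but no task state ever follows.
	db.allocs["corrupt-alloc"] = true

	fmt.Println("restored:", db.restore())                         // [healthy-alloc]
	fmt.Println("corrupt entry kept:", db.allocs["corrupt-alloc"]) // false
}
```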