Backport of client: allow incomplete allocrunners to be removed on restore into release/1.6.x #19361

Co-authored-by: Tim Gross <tgross@hashicorp.com>
hc-github-team-nomad-core authored 2023-12-07 13:30:17 -06:00, committed by GitHub
parent a8c0b2ebb5
commit 3e7017e119
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 31 additions and 27 deletions

.changelog/16638.txt (new file, +3)

@@ -0,0 +1,3 @@
```release-note:bug
client: remove incomplete allocation entries from client state database during client restarts
```

client/client.go

@@ -1170,43 +1170,35 @@ func (c *Client) restoreState() error {
 		return nil
 	}

-	//XXX REMOVED! make a note in backward compat / upgrading doc
-	// COMPAT: Remove in 0.7.0
-	// 0.6.0 transitioned from individual state files to a single bolt-db.
-	// The upgrade path is to:
-	// Check if old state exists
-	// If so, restore from that and delete old state
-	// Restore using state database
-
 	// Restore allocations
 	allocs, allocErrs, err := c.stateDB.GetAllAllocations()
 	if err != nil {
 		return err
 	}

 	for allocID, err := range allocErrs {
+		// The boltdb values don't exist or didn't decode correctly. The state
+		// store is corrupt or there was a backwards incompatibility. Either way
+		// log it so we can debug it with operator client-state command.
 		c.logger.Error("error restoring alloc", "error", err, "alloc_id", allocID)
-		//TODO Cleanup
-		// Try to clean up alloc dir
-		// Remove boltdb entries?
-		// Send to server with clientstatus=failed
 	}

 	// Load each alloc back
 	for _, alloc := range allocs {
-		// COMPAT(0.12): remove once upgrading from 0.9.5 is no longer supported
-		// See hasLocalState for details. Skipping suspicious allocs
-		// now. If allocs should be run, they will be started when the client
-		// gets allocs from servers.
+		// If the alloc has no task state, we most likely stopped the client
+		// after the allocrunner was created but before tasks could
+		// start. Remove the client state so that we can start over with this
+		// alloc if the server still wants to place it here.
 		if !c.hasLocalState(alloc) {
-			c.logger.Warn("found an alloc without any local state, skipping restore", "alloc_id", alloc.ID)
+			c.logger.Warn(
+				"found an alloc without any local state, deleting from client state db",
+				"alloc_id", alloc.ID)
+			c.stateDB.DeleteAllocationBucket(alloc.ID, state.WithBatchMode())
 			continue
 		}

-		//XXX On Restore we give up on watching previous allocs because
-		//    we need the local AllocRunners initialized first. We could
-		//    add a second loop to initialize just the alloc watcher.
+		// On Restore we give up on watching previous allocs because we need the
+		// local AllocRunners initialized first.
 		prevAllocWatcher := allocwatcher.NoopPrevAlloc{}
 		prevAllocMigrator := allocwatcher.NoopPrevAlloc{}
@@ -1287,11 +1279,15 @@ func (c *Client) restoreState() error {
 }

 // hasLocalState returns true if we have any other associated state
-// with alloc beyond the task itself
+// with alloc beyond the task itself. We want to detect two possible scenarios:
 //
-// Useful for detecting if a potentially completed alloc got resurrected
-// after AR was destroyed. In such cases, re-running the alloc lead to
-// unexpected reruns and may lead to process and task exhaustion on node.
+// 1. A potentially completed alloc got resurrected after AR was destroyed. In
+//    such cases, re-running the alloc leads to unexpected reruns and may lead
+//    to process and task exhaustion on node.
+//
+// 2. The client stopped after the allocrunner was persisted to disk but before
+//    tasks started. In this case, we don't restart the AR and instead wait until
+//    we get the desired state from the server.
 //
 // The heuristic used here is an alloc is suspect if we see no other information
 // and no other task/status info is found.
@@ -1303,8 +1299,6 @@ func (c *Client) restoreState() error {
 // See:
 // - https://github.com/hashicorp/nomad/pull/6207
 // - https://github.com/hashicorp/nomad/issues/5984
-//
-// COMPAT(0.12): remove once upgrading from 0.9.5 is no longer supported
 func (c *Client) hasLocalState(alloc *structs.Allocation) bool {
 	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
 	if tg == nil {
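
For readers skimming the diff, here is a minimal, self-contained sketch of the behavior the changes above introduce: allocations that were persisted without any task state are deleted from the client state database during restore rather than restarted, so the server can re-place them cleanly. The `Alloc` and `StateDB` types and the `restoreAllocs` helper below are illustrative stand-ins only, not Nomad's actual types or its `restoreState` implementation.

```go
package sketch

import "log"

// Alloc is a hypothetical stand-in for a persisted allocation entry.
type Alloc struct {
	ID        string
	TaskState map[string]string // empty => no task ever started
}

// StateDB is a hypothetical stand-in for the boltdb-backed client state store.
type StateDB interface {
	GetAllAllocations() ([]*Alloc, map[string]error, error)
	DeleteAllocationBucket(allocID string) error
}

// restoreAllocs returns the allocations worth restoring and removes
// incomplete entries so the server can re-place them later.
func restoreAllocs(db StateDB) ([]*Alloc, error) {
	allocs, allocErrs, err := db.GetAllAllocations()
	if err != nil {
		return nil, err
	}

	// Entries that failed to decode are only logged so they can be inspected
	// with operator tooling; they are not deleted here.
	for allocID, decodeErr := range allocErrs {
		log.Printf("error restoring alloc %s: %v", allocID, decodeErr)
	}

	restorable := allocs[:0]
	for _, alloc := range allocs {
		// No task state means the client stopped after the allocrunner was
		// persisted but before any task started: delete instead of restoring.
		if len(alloc.TaskState) == 0 {
			log.Printf("alloc %s has no local state, deleting from state db", alloc.ID)
			if err := db.DeleteAllocationBucket(alloc.ID); err != nil {
				log.Printf("failed to delete alloc %s: %v", alloc.ID, err)
			}
			continue
		}
		restorable = append(restorable, alloc)
	}
	return restorable, nil
}
```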

client/client_test.go

@@ -750,6 +750,10 @@ func TestClient_SaveRestoreState(t *testing.T) {
 		wait.Gap(time.Millisecond*30),
 	))

+	// Create a corrupted allocation that will be removed during restore
+	corruptAlloc := mock.Alloc()
+	c1.stateDB.PutAllocation(corruptAlloc)
+
 	t.Log("shutting down client")
 	must.NoError(t, c1.Shutdown()) // note: this saves the client state DB
@@ -865,6 +869,9 @@ func TestClient_SaveRestoreState(t *testing.T) {
 				"alloc %s stopped during shutdown should have updated", a3.ID[:8])
 			}

+		case corruptAlloc.ID:
+			return fmt.Errorf("corrupted allocation should not have been restored")
+
 		default:
 			if ar.AllocState().ClientStatus != structs.AllocClientStatusComplete {
 				return fmt.Errorf("expected complete client status, got %v",