Backport of client: allow incomplete allocrunners to be removed on restore into release/1.6.x #19361
Co-authored-by: Tim Gross <tgross@hashicorp.com>
This commit is contained in:
parent
a8c0b2ebb5
commit
3e7017e119
|
@ -0,0 +1,3 @@
|
||||||
|
```release-note:bug
|
||||||
|
client: remove incomplete allocation entries from client state database during client restarts
|
||||||
|
```
|
|
@ -1170,43 +1170,35 @@ func (c *Client) restoreState() error {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
//XXX REMOVED! make a note in backward compat / upgrading doc
|
|
||||||
// COMPAT: Remove in 0.7.0
|
|
||||||
// 0.6.0 transitioned from individual state files to a single bolt-db.
|
|
||||||
// The upgrade path is to:
|
|
||||||
// Check if old state exists
|
|
||||||
// If so, restore from that and delete old state
|
|
||||||
// Restore using state database
|
|
||||||
|
|
||||||
// Restore allocations
|
// Restore allocations
|
||||||
allocs, allocErrs, err := c.stateDB.GetAllAllocations()
|
allocs, allocErrs, err := c.stateDB.GetAllAllocations()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
for allocID, err := range allocErrs {
|
for allocID, err := range allocErrs {
|
||||||
|
// The boltdb values don't exist or didn't decode correctly. The state
|
||||||
|
// store is corrupt or there was a backwards incompatibility. Either way
|
||||||
|
// log it so we can debug it with operator client-state command.
|
||||||
c.logger.Error("error restoring alloc", "error", err, "alloc_id", allocID)
|
c.logger.Error("error restoring alloc", "error", err, "alloc_id", allocID)
|
||||||
//TODO Cleanup
|
|
||||||
// Try to clean up alloc dir
|
|
||||||
// Remove boltdb entries?
|
|
||||||
// Send to server with clientstatus=failed
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Load each alloc back
|
// Load each alloc back
|
||||||
for _, alloc := range allocs {
|
for _, alloc := range allocs {
|
||||||
|
|
||||||
// COMPAT(0.12): remove once upgrading from 0.9.5 is no longer supported
|
// If the alloc has no task state, we most likely stopped the client
|
||||||
// See hasLocalState for details. Skipping suspicious allocs
|
// after the allocrunner was created but before tasks could
|
||||||
// now. If allocs should be run, they will be started when the client
|
// start. Remove the client state so that we can start over with this
|
||||||
// gets allocs from servers.
|
// alloc if the server still wants to place it here.
|
||||||
if !c.hasLocalState(alloc) {
|
if !c.hasLocalState(alloc) {
|
||||||
c.logger.Warn("found an alloc without any local state, skipping restore", "alloc_id", alloc.ID)
|
c.logger.Warn(
|
||||||
|
"found an alloc without any local state, deleting from client state db",
|
||||||
|
"alloc_id", alloc.ID)
|
||||||
|
c.stateDB.DeleteAllocationBucket(alloc.ID, state.WithBatchMode())
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
//XXX On Restore we give up on watching previous allocs because
|
// On Restore we give up on watching previous allocs because we need the
|
||||||
// we need the local AllocRunners initialized first. We could
|
// local AllocRunners initialized first.
|
||||||
// add a second loop to initialize just the alloc watcher.
|
|
||||||
prevAllocWatcher := allocwatcher.NoopPrevAlloc{}
|
prevAllocWatcher := allocwatcher.NoopPrevAlloc{}
|
||||||
prevAllocMigrator := allocwatcher.NoopPrevAlloc{}
|
prevAllocMigrator := allocwatcher.NoopPrevAlloc{}
|
||||||
|
|
||||||
|
@ -1287,11 +1279,15 @@ func (c *Client) restoreState() error {
|
||||||
}
|
}
|
||||||
|
|
||||||
// hasLocalState returns true if we have any other associated state
|
// hasLocalState returns true if we have any other associated state
|
||||||
// with alloc beyond the task itself
|
// with alloc beyond the task itself. We want to detect two possible scenarios:
|
||||||
//
|
//
|
||||||
// Useful for detecting if a potentially completed alloc got resurrected
|
// 1. A potentially completed alloc got resurrected after AR was destroyed. In
|
||||||
// after AR was destroyed. In such cases, re-running the alloc lead to
|
// such cases, re-running the alloc leads to unexpected reruns and may lead
|
||||||
// unexpected reruns and may lead to process and task exhaustion on node.
|
// to process and task exhaustion on node.
|
||||||
|
//
|
||||||
|
// 2. The client stopped after the allocrunner was persisted to disk but before
|
||||||
|
// tasks started. In this case, we don't restart the AR and instead wait until
|
||||||
|
// we get the desired state from the server.
|
||||||
//
|
//
|
||||||
// The heuristic used here is an alloc is suspect if we see no other information
|
// The heuristic used here is an alloc is suspect if we see no other information
|
||||||
// and no other task/status info is found.
|
// and no other task/status info is found.
|
||||||
|
@ -1303,8 +1299,6 @@ func (c *Client) restoreState() error {
|
||||||
// See:
|
// See:
|
||||||
// - https://github.com/hashicorp/nomad/pull/6207
|
// - https://github.com/hashicorp/nomad/pull/6207
|
||||||
// - https://github.com/hashicorp/nomad/issues/5984
|
// - https://github.com/hashicorp/nomad/issues/5984
|
||||||
//
|
|
||||||
// COMPAT(0.12): remove once upgrading from 0.9.5 is no longer supported
|
|
||||||
func (c *Client) hasLocalState(alloc *structs.Allocation) bool {
|
func (c *Client) hasLocalState(alloc *structs.Allocation) bool {
|
||||||
tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
|
tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
|
||||||
if tg == nil {
|
if tg == nil {
|
||||||
|
|
|
@ -750,6 +750,10 @@ func TestClient_SaveRestoreState(t *testing.T) {
|
||||||
wait.Gap(time.Millisecond*30),
|
wait.Gap(time.Millisecond*30),
|
||||||
))
|
))
|
||||||
|
|
||||||
|
// Create a corrupted allocation that will be removed during restore
|
||||||
|
corruptAlloc := mock.Alloc()
|
||||||
|
c1.stateDB.PutAllocation(corruptAlloc)
|
||||||
|
|
||||||
t.Log("shutting down client")
|
t.Log("shutting down client")
|
||||||
must.NoError(t, c1.Shutdown()) // note: this saves the client state DB
|
must.NoError(t, c1.Shutdown()) // note: this saves the client state DB
|
||||||
|
|
||||||
|
@ -865,6 +869,9 @@ func TestClient_SaveRestoreState(t *testing.T) {
|
||||||
"alloc %s stopped during shutdown should have updated", a3.ID[:8])
|
"alloc %s stopped during shutdown should have updated", a3.ID[:8])
|
||||||
}
|
}
|
||||||
|
|
||||||
|
case corruptAlloc.ID:
|
||||||
|
return fmt.Errorf("corrupted allocation should not have been restored")
|
||||||
|
|
||||||
default:
|
default:
|
||||||
if ar.AllocState().ClientStatus != structs.AllocClientStatusComplete {
|
if ar.AllocState().ClientStatus != structs.AllocClientStatusComplete {
|
||||||
return fmt.Errorf("expected complete client status, got %v",
|
return fmt.Errorf("expected complete client status, got %v",
|
||||||
|
|
Loading…
Reference in New Issue