client: fix waiting on preempted alloc (#12779)

Fixes #10200

**The bug**

A user reported receiving the following error when an alloc was placed
that needed to preempt existing allocs:

```
[ERROR] client.alloc_watcher: error querying previous alloc:
alloc_id=28... previous_alloc=8e... error="rpc error: alloc lookup
failed: index error: UUID must be 36 characters"
```

The previous alloc (8e) was already complete on the client. This is
possible if an alloc stops *after* the scheduling decision was made to
preempt it, but *before* the node running both allocations was able to
pull and start the preemptor. While that is hopefully a narrow window of
time, you can expect it to occur in systems doing heavy, high-throughput
batch scheduling.

However, the RPC error made no sense: `previous_alloc` in the logs was a
valid 36-character UUID!
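
For intuition, this error is what you get when an alloc lookup is handed an ID that is not a full 36-character UUID, such as the empty string. A minimal runnable sketch of that shape of validation (illustrative only, not Nomad's actual server code):

```go
package main

import (
	"errors"
	"fmt"
)

// lookupAlloc sketches the server-side index lookup: IDs that are not
// full 36-character UUIDs are rejected before the state store is queried.
func lookupAlloc(allocID string) error {
	if len(allocID) != 36 {
		return errors.New("index error: UUID must be 36 characters")
	}
	// ...fetch the alloc from the state store...
	return nil
}

func main() {
	// An empty alloc ID fails exactly like the error in the logs above.
	fmt.Println(lookupAlloc("")) // index error: UUID must be 36 characters
}
```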

**The fix**

The fix is:

```
-		prevAllocID:  c.Alloc.PreviousAllocation,
+		prevAllocID:  watchedAllocID,
```

The alloc watcher constructor used for preemption improperly referenced
`Alloc.PreviousAllocation` instead of the passed-in `watchedAllocID`. When
multiple allocs are preempted, a watcher is created for each with
`watchedAllocID` set properly by the caller. In that case
`Alloc.PreviousAllocation` is `""` -- which is where the `UUID must be 36
characters` error was coming from! Sadly we *were* properly referencing
`watchedAllocID` in the log line, which is why the logged ID looked valid
and the error made no sense.
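
To make the mix-up concrete, here is a minimal runnable sketch (stand-in types and truncated placeholder IDs, not the real `allocwatcher` code) of why the per-watcher parameter is the only correct source of the watched ID:

```go
package main

import "fmt"

// alloc stands in for the preempting allocation. For preempted allocs
// there is no previous allocation, so that field is empty.
type alloc struct {
	ID                 string
	PreviousAllocation string
}

// watchedID mirrors the fixed behavior: the watched ID must come from the
// caller's parameter, never from PreviousAllocation.
func watchedID(watchedAllocID string) string {
	return watchedAllocID
}

func main() {
	preemptor := alloc{ID: "28...", PreviousAllocation: ""}
	preempted := []string{"8e...", "1c..."}

	for _, id := range preempted {
		// Before the fix the watcher stored preemptor.PreviousAllocation
		// ("") while the log printed id -- hence the baffling error.
		fmt.Printf("alloc %s watching %s (not %q)\n",
			preemptor.ID, watchedID(id), preemptor.PreviousAllocation)
	}
}
```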

**The repro**

I was able to reproduce this with a dev agent with [preemption enabled](https://gist.github.com/schmichael/53f79cbd898afdfab76865ad8c7fc6a0#file-preempt-hcl)
and [lowered limits](https://gist.github.com/schmichael/53f79cbd898afdfab76865ad8c7fc6a0#file-limits-hcl)
for ease of repro.

First I started a [low priority count 3 job](https://gist.github.com/schmichael/53f79cbd898afdfab76865ad8c7fc6a0#file-preempt-lo-nomad),
then a [high priority job](https://gist.github.com/schmichael/53f79cbd898afdfab76865ad8c7fc6a0#file-preempt-hi-nomad)
that preempts 2 of the low priority allocs. Everything worked as expected.

However, if I force it to use the [remotePrevAlloc implementation](https://github.com/hashicorp/nomad/blob/v1.3.0-beta.1/client/allocwatcher/alloc_watcher.go#L147),
it reproduces the bug because the watcher references PreviousAllocation
instead of watchedAllocID.
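
Forcing that path matters because `newWatcherForAlloc` picks its backend based on whether the watched alloc still has a runner on the node, and only the remote branch carried the bad `prevAllocID` (see the diff below). A condensed, runnable sketch of the dispatch, with stand-in types:

```go
package main

import "fmt"

// runnerMeta stands in for AllocRunnerMeta: non-nil means an alloc runner
// for the watched alloc still exists on this node.
type runnerMeta struct{}

// pickWatcher condenses newWatcherForAlloc's dispatch: local runner
// present -> wait on it directly; otherwise poll the servers via RPC.
func pickWatcher(m *runnerMeta) string {
	if m != nil {
		return "localPrevAlloc"
	}
	// The remote branch is where prevAllocID was wrongly taken from
	// c.Alloc.PreviousAllocation before the fix.
	return "remotePrevAlloc"
}

func main() {
	fmt.Println(pickWatcher(&runnerMeta{})) // localPrevAlloc
	fmt.Println(pickWatcher(nil))           // remotePrevAlloc: the buggy path
}
```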
Michael Schurter, 2022-04-26 (commit `e2544dd089`, parent `cfd353207f`)
2 changed files with 19 additions and 7 deletions:

`.changelog/12779.txt` (new file):

```release-note:bug
client: Fixed a bug that could prevent a preempting alloc from ever starting.
```

`client/allocwatcher/alloc_watcher.go`:

```diff
@@ -102,7 +102,7 @@ func newMigratorForAlloc(c Config, tg *structs.TaskGroup, watchedAllocID string,
 	migrate := tg.EphemeralDisk != nil && tg.EphemeralDisk.Migrate
 
 	if m != nil {
-		// Local Allocation because there's no meta
+		// Local Allocation because there's an alloc runner
 		return &localPrevAlloc{
 			allocID:     c.Alloc.ID,
 			prevAllocID: watchedAllocID,
@@ -117,7 +117,7 @@ func newMigratorForAlloc(c Config, tg *structs.TaskGroup, watchedAllocID string,
 	return &remotePrevAlloc{
 		allocID:      c.Alloc.ID,
-		prevAllocID:  c.Alloc.PreviousAllocation,
+		prevAllocID:  watchedAllocID,
 		tasks:        tasks,
 		config:       c.Config,
 		migrate:      migrate,
@@ -127,11 +127,17 @@ func newMigratorForAlloc(c Config, tg *structs.TaskGroup, watchedAllocID string,
 	}
 }
 
+// newWatcherForAlloc uses a local or rpc-based watcher depending on whether
+// AllocRunnerMeta is nil or not.
+//
+// Note that c.Alloc.PreviousAllocation must NOT be used in this func as it
+// is used for preemption, which has a distinct field. The caller is
+// responsible for passing the allocation to be watched as watchedAllocID.
 func newWatcherForAlloc(c Config, watchedAllocID string, m AllocRunnerMeta) PrevAllocWatcher {
 	logger := c.Logger.Named("alloc_watcher").With("alloc_id", c.Alloc.ID).With("previous_alloc", watchedAllocID)
 
 	if m != nil {
-		// Local Allocation because there's no meta
+		// Local Allocation because there's an alloc runner
 		return &localPrevAlloc{
 			allocID:     c.Alloc.ID,
 			prevAllocID: watchedAllocID,
@@ -144,7 +150,7 @@ func newWatcherForAlloc(c Config, watchedAllocID string, m AllocRunnerMeta) PrevAllocWatcher {
 	return &remotePrevAlloc{
 		allocID:      c.Alloc.ID,
-		prevAllocID:  c.Alloc.PreviousAllocation,
+		prevAllocID:  watchedAllocID,
 		config:       c.Config,
 		rpc:          c.RPC,
 		migrateToken: c.MigrateToken,
@@ -152,9 +158,12 @@ func newWatcherForAlloc(c Config, watchedAllocID string, m AllocRunnerMeta) PrevAllocWatcher {
 	}
 }
 
-// NewAllocWatcher creates a PrevAllocWatcher appropriate for whether this
-// alloc's previous allocation was local or remote. If this alloc has no
-// previous alloc then a noop implementation is returned.
+// NewAllocWatcher creates a PrevAllocWatcher if either PreviousAllocation or
+// PreemptedRunners are set. If any of the allocs to watch have local runners,
+// wait for them to terminate directly.
+// For allocs which are either running on another node or have already
+// terminated their alloc runners, use a remote backend which watches the alloc
+// status via rpc.
 func NewAllocWatcher(c Config) (PrevAllocWatcher, PrevAllocMigrator) {
 	if c.Alloc.PreviousAllocation == "" && c.PreemptedRunners == nil {
 		return NoopPrevAlloc{}, NoopPrevAlloc{}
```
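
For context on how the result is consumed: the client blocks on the returned watcher before starting the new alloc. A hypothetical caller sketch, assuming the `PrevAllocWatcher` interface exposes a `Wait(ctx)` method that unblocks once the watched allocs are terminal:

```go
package example

import (
	"context"

	"github.com/hashicorp/nomad/client/allocwatcher"
)

// waitForPrevAllocs is a hypothetical caller, not Nomad's actual alloc
// runner code. With the fix, a preempted alloc that already stopped
// resolves immediately instead of failing the lookup with a bogus
// empty-string UUID.
func waitForPrevAllocs(ctx context.Context, c allocwatcher.Config) error {
	watcher, _ := allocwatcher.NewAllocWatcher(c)
	return watcher.Wait(ctx)
}
```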