client: always run alloc cleanup hooks on final update (#15855)

* client: run alloc pre-kill hooks on last pass despite no live tasks This PR fixes a bug where alloc pre-kill hooks were not run in the edge case where there are no live tasks remaining, but it is also the final update to process for the (terminal) allocation. We need to run cleanup hooks here, otherwise they will not run until the allocation gets garbage collected (i.e. via Destroy()), possibly at a distant time in the future. Fixes #15477 * client: do not run ar cleanup hooks if client is shutting down
2023-01-27 09:59:31 -06:00 · 2023-01-27 09:59:31 -06:00 · 0fac4e19b3
parent de87cdc816
commit 0fac4e19b3
5 changed files with 58 additions and 2 deletions
--- a/.changelog/15477.txt
+++ b/.changelog/15477.txt
@ -0,0 +1,3 @@
+```release-note:bug
+client: Fixed a bug where allocation cleanup hooks would not run
+```
--- a/client/allocrunner/alloc_runner.go
+++ b/client/allocrunner/alloc_runner.go
@ -125,7 +125,7 @@ type allocRunner struct {
 	allocDir *allocdir.AllocDir

 	// runnerHooks are alloc runner lifecycle hooks that should be run on state
-	// transistions.
+	// transitions.
 	runnerHooks []interfaces.RunnerHook

 	// hookState is the output of allocrunner hooks
@ -546,7 +546,9 @@ func (ar *allocRunner) handleTaskStateUpdates() {
 			}
 		}

+		// kill remaining live tasks
 		if len(liveRunners) > 0 {
+
 			// if all live runners are sidecars - kill alloc
 			onlySidecarsRemaining := hasSidecars && !hasNonSidecarTasks(liveRunners)
 			if killEvent == nil && onlySidecarsRemaining {
@ -586,6 +588,14 @@ func (ar *allocRunner) handleTaskStateUpdates() {
 				}
 			}
 		} else {
+			// there are no live runners left
+
+			// run AR pre-kill hooks if this alloc is done, but not if it's because
+			// the agent is shutting down.
+			if !ar.isShuttingDown() && done {
+				ar.preKillHooks()
+			}
+
 			// If there are no live runners left kill all non-poststop task
 			// runners to unblock them from the alloc restart loop.
 			for _, tr := range ar.tasks {
--- a/client/allocrunner/alloc_runner_hooks.go
+++ b/client/allocrunner/alloc_runner_hooks.go
@ -329,6 +329,7 @@ func (ar *allocRunner) destroy() error {
 func (ar *allocRunner) preKillHooks() {
 	for _, hook := range ar.runnerHooks {
 		pre, ok := hook.(interfaces.RunnerPreKillHook)
+
 		if !ok {
 			continue
 		}
--- a/client/allocrunner/alloc_runner_test.go
+++ b/client/allocrunner/alloc_runner_test.go
@ -6,6 +6,7 @@ import (
 	"io/ioutil"
 	"os"
 	"path/filepath"
+	"sync/atomic"
 	"testing"
 	"time"

@ -23,6 +24,8 @@ import (
 	"github.com/hashicorp/nomad/nomad/mock"
 	"github.com/hashicorp/nomad/nomad/structs"
 	"github.com/hashicorp/nomad/testutil"
+	"github.com/shoenig/test/must"
+	"github.com/shoenig/test/wait"
 	"github.com/stretchr/testify/require"
 )

@ -2398,3 +2401,43 @@ func TestHasSidecarTasks(t *testing.T) {
 		})
 	}
 }
+
+type allocPreKillHook struct {
+	ran atomic.Bool
+}
+
+func (*allocPreKillHook) Name() string { return "test_prekill" }
+
+func (h *allocPreKillHook) PreKill() {
+	h.ran.Store(true)
+}
+
+func TestAllocRunner_PreKill_RunOnDone(t *testing.T) {
+	ci.Parallel(t)
+
+	alloc := mock.Alloc()
+	task := alloc.Job.TaskGroups[0].Tasks[0]
+	task.Driver = "mock_driver"
+	task.Config = map[string]interface{}{"run_for": "2ms"}
+	alloc.DesiredStatus = "stop"
+
+	conf, cleanup := testAllocRunnerConfig(t, alloc.Copy())
+	t.Cleanup(cleanup)
+
+	ar, err := NewAllocRunner(conf)
+	must.NoError(t, err)
+
+	// set our custom prekill hook
+	hook := new(allocPreKillHook)
+	ar.runnerHooks = append(ar.runnerHooks, hook)
+
+	go ar.Run()
+	defer destroy(ar)
+
+	// wait for completion or timeout
+	must.Wait(t, wait.InitialSuccess(
+		wait.BoolFunc(hook.ran.Load),
+		wait.Timeout(5*time.Second),
+		wait.Gap(500*time.Millisecond),
+	))
+}
--- a/client/allocrunner/alloc_runner_unix_test.go
+++ b/client/allocrunner/alloc_runner_unix_test.go
@ -1,5 +1,4 @@
 //go:build !windows
-// +build !windows

 package allocrunner