diff --git a/.changelog/17175.txt b/.changelog/17175.txt new file mode 100644 index 000000000..fd2c2f345 --- /dev/null +++ b/.changelog/17175.txt @@ -0,0 +1,3 @@ +```release-note:bug +client: Fixed a bug where restarting a terminal allocation turns it into a zombie where allocation and task hooks will run unexpectedly +``` diff --git a/client/allocrunner/alloc_runner.go b/client/allocrunner/alloc_runner.go index 58026c650..d82b238ef 100644 --- a/client/allocrunner/alloc_runner.go +++ b/client/allocrunner/alloc_runner.go @@ -1273,6 +1273,12 @@ func (ar *allocRunner) RestartAll(event *structs.TaskEvent) error { // restartTasks restarts all task runners concurrently. func (ar *allocRunner) restartTasks(ctx context.Context, event *structs.TaskEvent, failure bool, force bool) error { + + // ensure we are not trying to restart an alloc that is terminal + if !ar.shouldRun() { + return fmt.Errorf("restart of an alloc that should not run") + } + waitCh := make(chan struct{}) var err *multierror.Error var errMutex sync.Mutex diff --git a/e2e/clientstate/allocs_test.go b/e2e/clientstate/allocs_test.go new file mode 100644 index 000000000..b4f963ea6 --- /dev/null +++ b/e2e/clientstate/allocs_test.go @@ -0,0 +1,61 @@ +// Copyright (c) HashiCorp, Inc. +// SPDX-License-Identifier: MPL-2.0 + +package clientstate + +import ( + "testing" + "time" + + "github.com/hashicorp/nomad/e2e/e2eutil" + "github.com/hashicorp/nomad/helper/uuid" + "github.com/shoenig/test/must" + "github.com/shoenig/test/wait" +) + +func TestClientAllocs(t *testing.T) { + nomad := e2eutil.NomadClient(t) + + e2eutil.WaitForLeader(t, nomad) + e2eutil.WaitForNodesReady(t, nomad, 1) + + t.Run("testAllocZombie", testAllocZombie) +} + +// testAllocZombie ensures that a restart of a dead allocation does not cause +// it to come back to life in a not-quite alive state. +// +// https://github.com/hashicorp/nomad/issues/17079 +func testAllocZombie(t *testing.T) { + nomad := e2eutil.NomadClient(t) + + jobID := "alloc-zombie-" + uuid.Short() + jobIDs := []string{jobID} + t.Cleanup(e2eutil.CleanupJobsAndGC(t, &jobIDs)) + + // start the job and wait for alloc to become failed + err := e2eutil.Register(jobID, "./input/alloc_zombie.nomad") + must.NoError(t, err) + + allocID := e2eutil.SingleAllocID(t, jobID, "", 0) + + // wait for alloc to be marked as failed + e2eutil.WaitForAllocStatus(t, nomad, allocID, "failed") + + // wait for additional failures to know we got rescheduled + must.Wait(t, wait.InitialSuccess( + wait.BoolFunc(func() bool { + statuses, err := e2eutil.AllocStatusesRescheduled(jobID, "") + must.NoError(t, err) + return len(statuses) > 2 + }), + wait.Timeout(1*time.Minute), + wait.Gap(1*time.Second), + )) + + // now attempt to restart our initial allocation + // which should do nothing but give us an error + output, err := e2eutil.Command("nomad", "alloc", "restart", allocID) + must.ErrorContains(t, err, "restart of an alloc that should not run") + must.StrContains(t, output, "Failed to restart allocation") +} diff --git a/e2e/clientstate/input/alloc_zombie.nomad b/e2e/clientstate/input/alloc_zombie.nomad new file mode 100644 index 000000000..ac8ecdad1 --- /dev/null +++ b/e2e/clientstate/input/alloc_zombie.nomad @@ -0,0 +1,59 @@ +# Copyright (c) HashiCorp, Inc. +# SPDX-License-Identifier: MPL-2.0 + +job "alloc_zombie" { + + group "group" { + network { + mode = "host" + port "http" {} + } + + service { + name = "alloczombie" + port = "http" + provider = "nomad" + + check { + name = "alloczombiecheck" + type = "http" + port = "http" + path = "/does/not/exist.txt" + interval = "2s" + timeout = "1s" + check_restart { + limit = 1 + grace = "3s" + } + } + } + + reschedule { + attempts = 3 + interval = "1m" + delay = "5s" + delay_function = "constant" + unlimited = false + } + + restart { + attempts = 0 + delay = "5s" + mode = "fail" + } + + task "python" { + driver = "raw_exec" + + config { + command = "python3" + args = ["-m", "http.server", "${NOMAD_PORT_http}", "--directory", "/tmp"] + } + + resources { + cpu = 10 + memory = 64 + } + } + } +}