client: ignore restart issued to terminal allocations (#17175)

* client: ignore restart issued to terminal allocations

This PR fixes a bug where issuing a restart to a terminal allocation
would cause the allocation to run its hooks anyway. This was particularly
apparent with group_service_hook who would then register services but
then never deregister them - as the allocation would be effectively in
a "zombie" state where it is prepped to run tasks but never will.

* e2e: add e2e test for alloc restart zombies

* cl: tweak text

Co-authored-by: Tim Gross <tgross@hashicorp.com>

---------

Co-authored-by: Tim Gross <tgross@hashicorp.com>
This commit is contained in:
Seth Hoenig 2023-05-16 10:19:41 -05:00 committed by GitHub
parent 6814e8e6d9
commit e04ff0d935
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 129 additions and 0 deletions

3
.changelog/17175.txt Normal file
View File

@ -0,0 +1,3 @@
```release-note:bug
client: Fixed a bug where restarting a terminal allocation turns it into a zombie where allocation and task hooks will run unexpectedly
```

View File

@ -1273,6 +1273,12 @@ func (ar *allocRunner) RestartAll(event *structs.TaskEvent) error {
// restartTasks restarts all task runners concurrently.
func (ar *allocRunner) restartTasks(ctx context.Context, event *structs.TaskEvent, failure bool, force bool) error {
// ensure we are not trying to restart an alloc that is terminal
if !ar.shouldRun() {
return fmt.Errorf("restart of an alloc that should not run")
}
waitCh := make(chan struct{})
var err *multierror.Error
var errMutex sync.Mutex

View File

@ -0,0 +1,61 @@
// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0
package clientstate
import (
"testing"
"time"
"github.com/hashicorp/nomad/e2e/e2eutil"
"github.com/hashicorp/nomad/helper/uuid"
"github.com/shoenig/test/must"
"github.com/shoenig/test/wait"
)
func TestClientAllocs(t *testing.T) {
nomad := e2eutil.NomadClient(t)
e2eutil.WaitForLeader(t, nomad)
e2eutil.WaitForNodesReady(t, nomad, 1)
t.Run("testAllocZombie", testAllocZombie)
}
// testAllocZombie ensures that a restart of a dead allocation does not cause
// it to come back to life in a not-quite alive state.
//
// https://github.com/hashicorp/nomad/issues/17079
func testAllocZombie(t *testing.T) {
nomad := e2eutil.NomadClient(t)
jobID := "alloc-zombie-" + uuid.Short()
jobIDs := []string{jobID}
t.Cleanup(e2eutil.CleanupJobsAndGC(t, &jobIDs))
// start the job and wait for alloc to become failed
err := e2eutil.Register(jobID, "./input/alloc_zombie.nomad")
must.NoError(t, err)
allocID := e2eutil.SingleAllocID(t, jobID, "", 0)
// wait for alloc to be marked as failed
e2eutil.WaitForAllocStatus(t, nomad, allocID, "failed")
// wait for additional failures to know we got rescheduled
must.Wait(t, wait.InitialSuccess(
wait.BoolFunc(func() bool {
statuses, err := e2eutil.AllocStatusesRescheduled(jobID, "")
must.NoError(t, err)
return len(statuses) > 2
}),
wait.Timeout(1*time.Minute),
wait.Gap(1*time.Second),
))
// now attempt to restart our initial allocation
// which should do nothing but give us an error
output, err := e2eutil.Command("nomad", "alloc", "restart", allocID)
must.ErrorContains(t, err, "restart of an alloc that should not run")
must.StrContains(t, output, "Failed to restart allocation")
}

View File

@ -0,0 +1,59 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: MPL-2.0
job "alloc_zombie" {
group "group" {
network {
mode = "host"
port "http" {}
}
service {
name = "alloczombie"
port = "http"
provider = "nomad"
check {
name = "alloczombiecheck"
type = "http"
port = "http"
path = "/does/not/exist.txt"
interval = "2s"
timeout = "1s"
check_restart {
limit = 1
grace = "3s"
}
}
}
reschedule {
attempts = 3
interval = "1m"
delay = "5s"
delay_function = "constant"
unlimited = false
}
restart {
attempts = 0
delay = "5s"
mode = "fail"
}
task "python" {
driver = "raw_exec"
config {
command = "python3"
args = ["-m", "http.server", "${NOMAD_PORT_http}", "--directory", "/tmp"]
}
resources {
cpu = 10
memory = 64
}
}
}
}