From 753c17c9de39d9b2c46bf5b7ca957bf6c49f90ce Mon Sep 17 00:00:00 2001 From: Seth Hoenig Date: Mon, 24 Apr 2023 14:24:51 -0500 Subject: [PATCH] services: un-mark group services as deregistered if restart hook runs (#16905) * services: un-mark group services as deregistered if restart hook runs This PR may fix a bug where group services will never be deregistered if the group undergoes a task restart. * e2e: add test case for restart and deregister group service * cl: add cl * e2e: add wait for service list call --- .changelog/16905.txt | 3 + client/allocrunner/group_service_hook.go | 8 ++- e2e/consul/alloc_restart_test.go | 74 ++++++++++++++++++++++++ e2e/consul/consul_test.go | 19 ++++++ e2e/consul/input/alloc_restart.hcl | 33 +++++++++++ e2e/consul/service_revert_test.go | 11 ---- 6 files changed, 135 insertions(+), 13 deletions(-) create mode 100644 .changelog/16905.txt create mode 100644 e2e/consul/alloc_restart_test.go create mode 100644 e2e/consul/consul_test.go create mode 100644 e2e/consul/input/alloc_restart.hcl diff --git a/.changelog/16905.txt b/.changelog/16905.txt new file mode 100644 index 000000000..ab540c271 --- /dev/null +++ b/.changelog/16905.txt @@ -0,0 +1,3 @@ +```release-note:bug +services: Fixed a bug preventing group service deregistrations after alloc restarts +``` diff --git a/client/allocrunner/group_service_hook.go b/client/allocrunner/group_service_hook.go index 30a698ecd..a63f6f607 100644 --- a/client/allocrunner/group_service_hook.go +++ b/client/allocrunner/group_service_hook.go @@ -119,12 +119,14 @@ func (h *groupServiceHook) Prerun() error { defer func() { // Mark prerun as true to unblock Updates h.prerun = true + // Mark deregistered as false to allow de-registration + h.deregistered = false h.mu.Unlock() }() return h.preRunLocked() } -// caller must hold h.lock +// caller must hold h.mu func (h *groupServiceHook) preRunLocked() error { if len(h.services) == 0 { return nil @@ -188,6 +190,8 @@ func (h *groupServiceHook) PreTaskRestart() error { defer func() { // Mark prerun as true to unblock Updates h.prerun = true + // Mark deregistered as false to allow de-registration + h.deregistered = false h.mu.Unlock() }() @@ -201,7 +205,7 @@ func (h *groupServiceHook) PreKill() { // implements the PreKill hook // -// caller must hold h.lock +// caller must hold h.mu func (h *groupServiceHook) preKillLocked() { // If we have a shutdown delay deregister group services and then wait // before continuing to kill tasks. diff --git a/e2e/consul/alloc_restart_test.go b/e2e/consul/alloc_restart_test.go new file mode 100644 index 000000000..270bd5d07 --- /dev/null +++ b/e2e/consul/alloc_restart_test.go @@ -0,0 +1,74 @@ +// Copyright (c) HashiCorp, Inc. +// SPDX-License-Identifier: MPL-2.0 + +package consul + +import ( + "context" + "errors" + "testing" + "time" + + "github.com/hashicorp/nomad/e2e/e2eutil" + "github.com/hashicorp/nomad/helper/uuid" + "github.com/shoenig/test/must" + "github.com/shoenig/test/wait" +) + +func testAllocRestart(t *testing.T) { + nc := e2eutil.NomadClient(t) + cc := e2eutil.ConsulClient(t).Catalog() + + const jobFile = "./input/alloc_restart.hcl" + jobID := "alloc-restart-" + uuid.Short() + jobIDs := []string{jobID} + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + defer e2eutil.CleanupJobsAndGCWithContext(t, ctx, &jobIDs) + + // register our job + err := e2eutil.Register(jobID, jobFile) + must.NoError(t, err) + + // wait for our allocation to be running + allocID := e2eutil.SingleAllocID(t, jobID, "", 0) + e2eutil.WaitForAllocRunning(t, nc, allocID) + + // make sure our service is registered + services, _, err := cc.Service("alloc-restart-http", "", nil) + must.NoError(t, err) + must.Len(t, 1, services) + + // restart the alloc + stderr, err := e2eutil.Command("nomad", "alloc", "restart", allocID) + must.NoError(t, err, must.Sprintf("stderr: %s", stderr)) + + // wait for alloc running again + e2eutil.WaitForAllocRunning(t, nc, allocID) + + // make sure our service is still registered + services, _, err = cc.Service("alloc-restart-http", "", nil) + must.NoError(t, err) + must.Len(t, 1, services) + + err = e2eutil.StopJob(jobID) + must.NoError(t, err) + + // make sure our service is no longer registered + f := func() error { + services, _, err = cc.Service("alloc-restart-http", "", nil) + if err != nil { + return err + } + if len(services) != 0 { + return errors.New("expected empty services") + } + return nil + } + must.Wait(t, wait.InitialSuccess( + wait.ErrorFunc(f), + wait.Timeout(10*time.Second), + wait.Gap(1*time.Second), + )) +} diff --git a/e2e/consul/consul_test.go b/e2e/consul/consul_test.go new file mode 100644 index 000000000..6be7dc286 --- /dev/null +++ b/e2e/consul/consul_test.go @@ -0,0 +1,19 @@ +package consul + +import ( + "testing" + + "github.com/hashicorp/nomad/e2e/e2eutil" +) + +func TestConsul(t *testing.T) { + // todo: migrate the remaining consul tests + + nomad := e2eutil.NomadClient(t) + + e2eutil.WaitForLeader(t, nomad) + e2eutil.WaitForNodesReady(t, nomad, 1) + + t.Run("testServiceReversion", testServiceReversion) + t.Run("testAllocRestart", testAllocRestart) +} diff --git a/e2e/consul/input/alloc_restart.hcl b/e2e/consul/input/alloc_restart.hcl new file mode 100644 index 000000000..d2937ab0e --- /dev/null +++ b/e2e/consul/input/alloc_restart.hcl @@ -0,0 +1,33 @@ +job "alloc-restart" { + constraint { + attribute = "${attr.kernel.name}" + value = "linux" + } + + group "group" { + network { + port "http" { + to = 8080 + } + } + + service { + name = "alloc-restart-http" + port = "http" + } + + task "python" { + driver = "raw_exec" + + config { + command = "python3" + args = ["-m", "http.server", "8080", "--directory", "/tmp"] + } + + resources { + cpu = 16 + memory = 32 + } + } + } +} diff --git a/e2e/consul/service_revert_test.go b/e2e/consul/service_revert_test.go index b6ea7f9a3..11c5b6af4 100644 --- a/e2e/consul/service_revert_test.go +++ b/e2e/consul/service_revert_test.go @@ -13,17 +13,6 @@ import ( "github.com/shoenig/test/must" ) -func TestConsul(t *testing.T) { - // todo: migrate the remaining consul tests - - nomad := e2eutil.NomadClient(t) - - e2eutil.WaitForLeader(t, nomad) - e2eutil.WaitForNodesReady(t, nomad, 1) - - t.Run("testServiceReversion", testServiceReversion) -} - // testServiceReversion asserts we can // - submit a job with a service // - update that job and modify service