services: un-mark group services as deregistered if restart hook runs (#16905)

* services: un-mark group services as deregistered if restart hook runs

This PR may fix a bug where group services will never be deregistered if the
group undergoes a task restart.

* e2e: add test case for restart and deregister group service

* cl: add cl

* e2e: add wait for service list call
This commit is contained in:
Seth Hoenig 2023-04-24 14:24:51 -05:00 committed by GitHub
parent 1633cab363
commit 753c17c9de
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 135 additions and 13 deletions

3
.changelog/16905.txt Normal file
View File

@ -0,0 +1,3 @@
```release-note:bug
services: Fixed a bug preventing group service deregistrations after alloc restarts
```

View File

@ -119,12 +119,14 @@ func (h *groupServiceHook) Prerun() error {
defer func() {
// Mark prerun as true to unblock Updates
h.prerun = true
// Mark deregistered as false to allow de-registration
h.deregistered = false
h.mu.Unlock()
}()
return h.preRunLocked()
}
// caller must hold h.lock
// caller must hold h.mu
func (h *groupServiceHook) preRunLocked() error {
if len(h.services) == 0 {
return nil
@ -188,6 +190,8 @@ func (h *groupServiceHook) PreTaskRestart() error {
defer func() {
// Mark prerun as true to unblock Updates
h.prerun = true
// Mark deregistered as false to allow de-registration
h.deregistered = false
h.mu.Unlock()
}()
@ -201,7 +205,7 @@ func (h *groupServiceHook) PreKill() {
// implements the PreKill hook
//
// caller must hold h.lock
// caller must hold h.mu
func (h *groupServiceHook) preKillLocked() {
// If we have a shutdown delay deregister group services and then wait
// before continuing to kill tasks.

View File

@ -0,0 +1,74 @@
// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0
package consul
import (
"context"
"errors"
"testing"
"time"
"github.com/hashicorp/nomad/e2e/e2eutil"
"github.com/hashicorp/nomad/helper/uuid"
"github.com/shoenig/test/must"
"github.com/shoenig/test/wait"
)
func testAllocRestart(t *testing.T) {
nc := e2eutil.NomadClient(t)
cc := e2eutil.ConsulClient(t).Catalog()
const jobFile = "./input/alloc_restart.hcl"
jobID := "alloc-restart-" + uuid.Short()
jobIDs := []string{jobID}
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
defer e2eutil.CleanupJobsAndGCWithContext(t, ctx, &jobIDs)
// register our job
err := e2eutil.Register(jobID, jobFile)
must.NoError(t, err)
// wait for our allocation to be running
allocID := e2eutil.SingleAllocID(t, jobID, "", 0)
e2eutil.WaitForAllocRunning(t, nc, allocID)
// make sure our service is registered
services, _, err := cc.Service("alloc-restart-http", "", nil)
must.NoError(t, err)
must.Len(t, 1, services)
// restart the alloc
stderr, err := e2eutil.Command("nomad", "alloc", "restart", allocID)
must.NoError(t, err, must.Sprintf("stderr: %s", stderr))
// wait for alloc running again
e2eutil.WaitForAllocRunning(t, nc, allocID)
// make sure our service is still registered
services, _, err = cc.Service("alloc-restart-http", "", nil)
must.NoError(t, err)
must.Len(t, 1, services)
err = e2eutil.StopJob(jobID)
must.NoError(t, err)
// make sure our service is no longer registered
f := func() error {
services, _, err = cc.Service("alloc-restart-http", "", nil)
if err != nil {
return err
}
if len(services) != 0 {
return errors.New("expected empty services")
}
return nil
}
must.Wait(t, wait.InitialSuccess(
wait.ErrorFunc(f),
wait.Timeout(10*time.Second),
wait.Gap(1*time.Second),
))
}

19
e2e/consul/consul_test.go Normal file
View File

@ -0,0 +1,19 @@
package consul
import (
"testing"
"github.com/hashicorp/nomad/e2e/e2eutil"
)
func TestConsul(t *testing.T) {
// todo: migrate the remaining consul tests
nomad := e2eutil.NomadClient(t)
e2eutil.WaitForLeader(t, nomad)
e2eutil.WaitForNodesReady(t, nomad, 1)
t.Run("testServiceReversion", testServiceReversion)
t.Run("testAllocRestart", testAllocRestart)
}

View File

@ -0,0 +1,33 @@
job "alloc-restart" {
constraint {
attribute = "${attr.kernel.name}"
value = "linux"
}
group "group" {
network {
port "http" {
to = 8080
}
}
service {
name = "alloc-restart-http"
port = "http"
}
task "python" {
driver = "raw_exec"
config {
command = "python3"
args = ["-m", "http.server", "8080", "--directory", "/tmp"]
}
resources {
cpu = 16
memory = 32
}
}
}
}

View File

@ -13,17 +13,6 @@ import (
"github.com/shoenig/test/must"
)
func TestConsul(t *testing.T) {
// todo: migrate the remaining consul tests
nomad := e2eutil.NomadClient(t)
e2eutil.WaitForLeader(t, nomad)
e2eutil.WaitForNodesReady(t, nomad, 1)
t.Run("testServiceReversion", testServiceReversion)
}
// testServiceReversion asserts we can
// - submit a job with a service
// - update that job and modify service