From c8260c3940941e74ab802f3aae4ff3732840f2ef Mon Sep 17 00:00:00 2001 From: Seth Hoenig Date: Tue, 6 Jul 2021 09:37:53 -0500 Subject: [PATCH] consul: avoid triggering unnecessary sync when removing workload There are bits of logic in callers of RemoveWorkload on group/task cleanup hooks which call RemoveWorkload with the "Canary" version of the workload, in case the alloc is marked as a Canary. This logic triggers an extra sync with Consul, and also doesn't do the intended behavior - for which no special casing is necessary anyway. When the workload is marked for removal, all associated services and checks will be removed regardless of the Canary status, because the service and check IDs do not incorporate the canary-ness in the first place. The only place where canary-ness matters is when updating a workload, where we need to compute the hash of the services and checks to determine whether they have been modified, the Canary flag of which is a part of that. Fixes #10842 --- .changelog/10842.txt | 3 ++ client/allocrunner/alloc_runner_unix_test.go | 10 +++---- client/allocrunner/groupservice_hook.go | 5 ---- client/allocrunner/groupservice_hook_test.go | 30 ++++++++----------- client/allocrunner/taskrunner/service_hook.go | 11 +++---- .../taskrunner/service_hook_test.go | 13 ++++---- .../taskrunner/task_runner_test.go | 28 ++++++++--------- 7 files changed, 42 insertions(+), 58 deletions(-) create mode 100644 .changelog/10842.txt diff --git a/.changelog/10842.txt b/.changelog/10842.txt new file mode 100644 index 000000000..98a4a0a7d --- /dev/null +++ b/.changelog/10842.txt @@ -0,0 +1,3 @@ +```release-note:bug +consul: remove ineffective edge case handling on service deregistration +``` diff --git a/client/allocrunner/alloc_runner_unix_test.go b/client/allocrunner/alloc_runner_unix_test.go index 7ec4dea0b..213016254 100644 --- a/client/allocrunner/alloc_runner_unix_test.go +++ b/client/allocrunner/alloc_runner_unix_test.go @@ -120,12 +120,12 @@ func TestAllocRunner_Restore_RunningTerminal(t *testing.T) { require.Error(t, logmonProc.Signal(syscall.Signal(0))) // Assert consul was cleaned up: - // 2 removals (canary+noncanary) during prekill - // 2 removals (canary+noncanary) during exited - // 2 removals (canary+noncanary) during stop - // 2 removals (canary+noncanary) group during stop + // 1 removal during prekill + // 1 removal during exited + // 1 removal during stop + // 1 removal group during stop consulOps := conf2.Consul.(*consul.MockConsulServiceClient).GetOps() - require.Len(t, consulOps, 8) + require.Len(t, consulOps, 4) for _, op := range consulOps { require.Equal(t, "remove", op.Op) } diff --git a/client/allocrunner/groupservice_hook.go b/client/allocrunner/groupservice_hook.go index 065c94347..72f0e9fd9 100644 --- a/client/allocrunner/groupservice_hook.go +++ b/client/allocrunner/groupservice_hook.go @@ -208,11 +208,6 @@ func (h *groupServiceHook) deregister() { if len(h.services) > 0 { workloadServices := h.getWorkloadServices() h.consulClient.RemoveWorkload(workloadServices) - - // Canary flag may be getting flipped when the alloc is being - // destroyed, so remove both variations of the service - workloadServices.Canary = !workloadServices.Canary - h.consulClient.RemoveWorkload(workloadServices) } } diff --git a/client/allocrunner/groupservice_hook_test.go b/client/allocrunner/groupservice_hook_test.go index 5789079d8..61d9a38b4 100644 --- a/client/allocrunner/groupservice_hook_test.go +++ b/client/allocrunner/groupservice_hook_test.go @@ -54,14 +54,12 @@ func TestGroupServiceHook_NoGroupServices(t *testing.T) { require.NoError(t, h.PreTaskRestart()) ops := consulClient.GetOps() - require.Len(t, ops, 7) + require.Len(t, ops, 5) require.Equal(t, "add", ops[0].Op) // Prerun require.Equal(t, "update", ops[1].Op) // Update - require.Equal(t, "remove", ops[2].Op) // Postrun (1st) - require.Equal(t, "remove", ops[3].Op) // Postrun (2nd) - require.Equal(t, "remove", ops[4].Op) // Restart -> preKill (1st) - require.Equal(t, "remove", ops[5].Op) // Restart -> preKill (2nd) - require.Equal(t, "add", ops[6].Op) // Restart -> preRun + require.Equal(t, "remove", ops[2].Op) // Postrun + require.Equal(t, "remove", ops[3].Op) // Restart -> preKill + require.Equal(t, "add", ops[4].Op) // Restart -> preRun } // TestGroupServiceHook_ShutdownDelayUpdate asserts calling group service hooks @@ -127,14 +125,12 @@ func TestGroupServiceHook_GroupServices(t *testing.T) { require.NoError(t, h.PreTaskRestart()) ops := consulClient.GetOps() - require.Len(t, ops, 7) + require.Len(t, ops, 5) require.Equal(t, "add", ops[0].Op) // Prerun require.Equal(t, "update", ops[1].Op) // Update - require.Equal(t, "remove", ops[2].Op) // Postrun (1st) - require.Equal(t, "remove", ops[3].Op) // Postrun (2nd) - require.Equal(t, "remove", ops[4].Op) // Restart -> preKill (1st) - require.Equal(t, "remove", ops[5].Op) // Restart -> preKill (2nd) - require.Equal(t, "add", ops[6].Op) // Restart -> preRun + require.Equal(t, "remove", ops[2].Op) // Postrun + require.Equal(t, "remove", ops[3].Op) // Restart -> preKill + require.Equal(t, "add", ops[4].Op) // Restart -> preRun } // TestGroupServiceHook_Error asserts group service hooks with group @@ -175,14 +171,12 @@ func TestGroupServiceHook_NoNetwork(t *testing.T) { require.NoError(t, h.PreTaskRestart()) ops := consulClient.GetOps() - require.Len(t, ops, 7) + require.Len(t, ops, 5) require.Equal(t, "add", ops[0].Op) // Prerun require.Equal(t, "update", ops[1].Op) // Update - require.Equal(t, "remove", ops[2].Op) // Postrun (1st) - require.Equal(t, "remove", ops[3].Op) // Postrun (2nd) - require.Equal(t, "remove", ops[4].Op) // Restart -> preKill (1st) - require.Equal(t, "remove", ops[5].Op) // Restart -> preKill (2nd) - require.Equal(t, "add", ops[6].Op) // Restart -> preRun + require.Equal(t, "remove", ops[2].Op) // Postrun + require.Equal(t, "remove", ops[3].Op) // Restart -> preKill + require.Equal(t, "add", ops[4].Op) // Restart -> preRun } func TestGroupServiceHook_getWorkloadServices(t *testing.T) { diff --git a/client/allocrunner/taskrunner/service_hook.go b/client/allocrunner/taskrunner/service_hook.go index 1cf2b5f65..95bf1d214 100644 --- a/client/allocrunner/taskrunner/service_hook.go +++ b/client/allocrunner/taskrunner/service_hook.go @@ -171,13 +171,10 @@ func (h *serviceHook) Exited(context.Context, *interfaces.TaskExitedRequest, *in // deregister services from Consul. func (h *serviceHook) deregister() { - workloadServices := h.getWorkloadServices() - h.consulServices.RemoveWorkload(workloadServices) - - // Canary flag may be getting flipped when the alloc is being - // destroyed, so remove both variations of the service - workloadServices.Canary = !workloadServices.Canary - h.consulServices.RemoveWorkload(workloadServices) + if len(h.services) > 0 { + workloadServices := h.getWorkloadServices() + h.consulServices.RemoveWorkload(workloadServices) + } h.initialRegistration = false } diff --git a/client/allocrunner/taskrunner/service_hook_test.go b/client/allocrunner/taskrunner/service_hook_test.go index 1b577bd54..ad88fabd3 100644 --- a/client/allocrunner/taskrunner/service_hook_test.go +++ b/client/allocrunner/taskrunner/service_hook_test.go @@ -39,16 +39,15 @@ func TestUpdate_beforePoststart(t *testing.T) { // so Update should again wait on Poststart. require.NoError(t, hook.Exited(context.Background(), &interfaces.TaskExitedRequest{}, &interfaces.TaskExitedResponse{})) + require.Len(t, c.GetOps(), 3) + require.NoError(t, hook.Update(context.Background(), &interfaces.TaskUpdateRequest{Alloc: alloc}, &interfaces.TaskUpdateResponse{})) + require.Len(t, c.GetOps(), 3) + require.NoError(t, hook.Poststart(context.Background(), &interfaces.TaskPoststartRequest{}, &interfaces.TaskPoststartResponse{})) require.Len(t, c.GetOps(), 4) require.NoError(t, hook.Update(context.Background(), &interfaces.TaskUpdateRequest{Alloc: alloc}, &interfaces.TaskUpdateResponse{})) - require.Len(t, c.GetOps(), 4) - require.NoError(t, hook.Poststart(context.Background(), &interfaces.TaskPoststartRequest{}, &interfaces.TaskPoststartResponse{})) require.Len(t, c.GetOps(), 5) + require.NoError(t, hook.PreKilling(context.Background(), &interfaces.TaskPreKillRequest{}, &interfaces.TaskPreKillResponse{})) + require.Len(t, c.GetOps(), 6) require.NoError(t, hook.Update(context.Background(), &interfaces.TaskUpdateRequest{Alloc: alloc}, &interfaces.TaskUpdateResponse{})) require.Len(t, c.GetOps(), 6) - require.NoError(t, hook.PreKilling(context.Background(), &interfaces.TaskPreKillRequest{}, &interfaces.TaskPreKillResponse{})) - require.Len(t, c.GetOps(), 8) - require.NoError(t, hook.Update(context.Background(), &interfaces.TaskUpdateRequest{Alloc: alloc}, &interfaces.TaskUpdateResponse{})) - require.Len(t, c.GetOps(), 8) - } diff --git a/client/allocrunner/taskrunner/task_runner_test.go b/client/allocrunner/taskrunner/task_runner_test.go index 8ef75d3fd..9e81abd31 100644 --- a/client/allocrunner/taskrunner/task_runner_test.go +++ b/client/allocrunner/taskrunner/task_runner_test.go @@ -957,17 +957,16 @@ func TestTaskRunner_ShutdownDelay(t *testing.T) { assert.NoError(t, tr.Kill(context.Background(), structs.NewTaskEvent("test"))) }() - // Wait for *2* deregistration calls (due to needing to remove both - // canary tag variants) + // Wait for *1* de-registration calls (all [non-]canary variants removed). + WAIT: for { ops := mockConsul.GetOps() switch n := len(ops); n { - case 1, 2: - // Waiting for both deregistration calls - case 3: + case 1: + // Waiting for single de-registration call. + case 2: require.Equalf(t, "remove", ops[1].Op, "expected deregistration but found: %#v", ops[1]) - require.Equalf(t, "remove", ops[2].Op, "expected deregistration but found: %#v", ops[2]) break WAIT default: // ?! @@ -2401,25 +2400,22 @@ func TestTaskRunner_UnregisterConsul_Retries(t *testing.T) { consul := conf.Consul.(*consulapi.MockConsulServiceClient) consulOps := consul.GetOps() - require.Len(t, consulOps, 8) + require.Len(t, consulOps, 5) // Initial add require.Equal(t, "add", consulOps[0].Op) - // Removing canary and non-canary entries on first exit + // Removing entries on first exit require.Equal(t, "remove", consulOps[1].Op) - require.Equal(t, "remove", consulOps[2].Op) // Second add on retry - require.Equal(t, "add", consulOps[3].Op) + require.Equal(t, "add", consulOps[2].Op) - // Removing canary and non-canary entries on retry + // Removing entries on retry + require.Equal(t, "remove", consulOps[3].Op) + + // Removing entries on stop require.Equal(t, "remove", consulOps[4].Op) - require.Equal(t, "remove", consulOps[5].Op) - - // Removing canary and non-canary entries on stop - require.Equal(t, "remove", consulOps[6].Op) - require.Equal(t, "remove", consulOps[7].Op) } // testWaitForTaskToStart waits for the task to be running or fails the test