From 41e3fcf205e5a0d25c41c45d6a876d45b3ef6bdd Mon Sep 17 00:00:00 2001 From: James Phillips Date: Mon, 5 Feb 2018 17:56:00 -0800 Subject: [PATCH] Makes server manager shift away from failed servers from Serf events. Because this code was doing pointer equality checks, it would work for the case of a failed attempted RPC because the objects are from the manager itself: https://github.com/hashicorp/consul/blob/v1.0.3/agent/consul/rpc.go#L283-L302 But the pointer check would always fail for events coming in from the Serf path because the server object is newly-created: https://github.com/hashicorp/consul/blob/v1.0.3/agent/router/serf_adapter.go#L14-L40 This means that we didn't proactively shift RPC traffic away from a failed server, we'd have to wait for an RPC to fail, which exposes the error to the calling client. By switching over to a name check vs. a pointer check we get the correct behavior. We added a DEBUG log as well to help observe this behavior during integrated testing. Related to #3863 since the fix here needed the same logic duplicated, owing to the complicated atomic stuff. /cc @dadgar for a heads up in case this also affects Nomad. --- agent/router/manager.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/agent/router/manager.go b/agent/router/manager.go index f31f0bb77..081893c5e 100644 --- a/agent/router/manager.go +++ b/agent/router/manager.go @@ -256,7 +256,7 @@ func (m *Manager) NotifyFailedServer(s *metadata.Server) { // the server to the end of the list. // Only rotate the server list when there is more than one server - if len(l.servers) > 1 && l.servers[0] == s && + if len(l.servers) > 1 && l.servers[0].Name == s.Name && // Use atomic.CAS to emulate a TryLock(). atomic.CompareAndSwapInt32(&m.notifyFailedBarrier, 0, 1) { defer atomic.StoreInt32(&m.notifyFailedBarrier, 0) @@ -267,9 +267,10 @@ func (m *Manager) NotifyFailedServer(s *metadata.Server) { defer m.listLock.Unlock() l = m.getServerList() - if len(l.servers) > 1 && l.servers[0] == s { + if len(l.servers) > 1 && l.servers[0].Name == s.Name { l.servers = l.cycleServer() m.saveServerList(l) + m.logger.Printf(`[DEBUG] manager: cycled away from server "%s"`, s.Name) } } }