From 1b87d292a3638606b418f71cd9fc3c0a72e47a37 Mon Sep 17 00:00:00 2001
From: Luiz Aoqui
Date: Fri, 4 Nov 2022 14:09:39 -0400
Subject: [PATCH] client: retry RPC call when no server is available (#15140)

When a Nomad client starts, it tries to establish a connection with the
servers, but it also runs alloc runners to manage whatever allocations
it needs to run.

The alloc runner invokes several hooks to perform actions, some of
which require access to the Nomad servers, such as Native Service
Discovery registration.

If the alloc runner starts before a connection is established, these
hooks fail, causing the allocation to be shut down. This is
particularly problematic for disconnected allocations that are
reconnecting, as they may fail as soon as the client reconnects.

This commit changes the client RPC request logic to retry the request,
using the existing retry mechanism, when no servers are available.
---
 .changelog/15140.txt |  3 +++
 client/rpc.go        | 53 +++++++++++++++++++++++---------------------
 2 files changed, 31 insertions(+), 25 deletions(-)
 create mode 100644 .changelog/15140.txt

diff --git a/.changelog/15140.txt b/.changelog/15140.txt
new file mode 100644
index 000000000..d6ffc3211
--- /dev/null
+++ b/.changelog/15140.txt
@@ -0,0 +1,3 @@
+```release-note:bug
+client: prevent allocations from failing on client reconnect by retrying RPC requests when no servers are available yet
+```
diff --git a/client/rpc.go b/client/rpc.go
index 9bc439ad0..54e2edec5 100644
--- a/client/rpc.go
+++ b/client/rpc.go
@@ -70,34 +70,37 @@ func (c *Client) RPC(method string, args interface{}, reply interface{}) error {
 	}
 
 TRY:
+	var rpcErr error
+
 	server := c.servers.FindServer()
 	if server == nil {
-		return noServersErr
+		rpcErr = noServersErr
+	} else {
+		// Make the request.
+		rpcErr = c.connPool.RPC(c.Region(), server.Addr, method, args, reply)
+
+		if rpcErr == nil {
+			c.fireRpcRetryWatcher()
+			return nil
+		}
+
+		// If shutting down, exit without logging the error
+		select {
+		case <-c.shutdownCh:
+			return nil
+		default:
+		}
+
+		// Move off to another server, and see if we can retry.
+		c.rpcLogger.Error("error performing RPC to server", "error", rpcErr, "rpc", method, "server", server.Addr)
+		c.servers.NotifyFailedServer(server)
+
+		if !canRetry(args, rpcErr) {
+			c.rpcLogger.Error("error performing RPC to server which is not safe to automatically retry", "error", rpcErr, "rpc", method, "server", server.Addr)
+			return rpcErr
+		}
 	}
 
-	// Make the request.
-	rpcErr := c.connPool.RPC(c.Region(), server.Addr, method, args, reply)
-
-	if rpcErr == nil {
-		c.fireRpcRetryWatcher()
-		return nil
-	}
-
-	// If shutting down, exit without logging the error
-	select {
-	case <-c.shutdownCh:
-		return nil
-	default:
-	}
-
-	// Move off to another server, and see if we can retry.
-	c.rpcLogger.Error("error performing RPC to server", "error", rpcErr, "rpc", method, "server", server.Addr)
-	c.servers.NotifyFailedServer(server)
-
-	if !canRetry(args, rpcErr) {
-		c.rpcLogger.Error("error performing RPC to server which is not safe to automatically retry", "error", rpcErr, "rpc", method, "server", server.Addr)
-		return rpcErr
-	}
 	if time.Now().After(deadline) {
 		// Blocking queries are tricky. jitters and rpcholdtimes in multiple places can result in our server call taking longer than we wanted it to. For example:
 		// a block time of 5s may easily turn into the server blocking for 10s since it applies its own RPCHoldTime. If the server dies at t=7s we still want to retry
@@ -106,7 +109,7 @@ TRY:
 			info.SetTimeToBlock(0)
 			return c.RPC(method, args, reply)
 		}
-		c.rpcLogger.Error("error performing RPC to server, deadline exceeded, cannot retry", "error", rpcErr, "rpc", method, "server", server.Addr)
+		c.rpcLogger.Error("error performing RPC to server, deadline exceeded, cannot retry", "error", rpcErr, "rpc", method)
 		return rpcErr
 	}
 
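
Note for readers: the hunks above only show how rpcErr is now produced; the actual retry lives further down in (*Client).RPC (not shown here), which, per the commit message's "existing retry mechanism", checks canRetry and the deadline and loops back to the TRY: label after a wait. The sketch below is a minimal, standalone illustration of that retry-until-deadline pattern, not Nomad's implementation; retryRPC, errNoServers, and holdTimeout are names invented for the example.

```go
package main

import (
	"errors"
	"fmt"
	"math/rand"
	"time"
)

// errNoServers stands in for the "no servers available" condition that the
// patch now treats as retryable instead of returning immediately.
var errNoServers = errors.New("no servers available")

// retryRPC calls rpcFunc until it succeeds, returns a non-retryable error,
// or the deadline derived from holdTimeout passes. A small random jitter is
// slept between attempts so callers do not retry in lockstep.
func retryRPC(rpcFunc func() error, holdTimeout time.Duration) error {
	deadline := time.Now().Add(holdTimeout)
	for {
		err := rpcFunc()
		if err == nil {
			return nil
		}
		// In this sketch only the "no servers yet" case is considered safe
		// to retry; anything else is returned to the caller.
		if !errors.Is(err, errNoServers) {
			return err
		}
		if time.Now().After(deadline) {
			return fmt.Errorf("deadline exceeded: %w", err)
		}
		time.Sleep(time.Duration(rand.Int63n(int64(250 * time.Millisecond))))
	}
}

func main() {
	attempts := 0
	err := retryRPC(func() error {
		attempts++
		if attempts < 3 {
			// Simulate a client that has not discovered any servers yet.
			return errNoServers
		}
		return nil
	}, 5*time.Second)
	fmt.Println("attempts:", attempts, "err:", err)
}
```

With the patch applied, the no-servers case flows through the same retry path as other retryable RPC failures instead of short-circuiting with `return noServersErr`, which is what lets allocation hooks survive the window between client start (or reconnect) and server discovery.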