client: retry RPC call when no server is available (#15140)

When a Nomad service starts it tries to establish a connection with servers, but it also runs alloc runners to manage whatever allocations it needs to run. The alloc runner will invoke several hooks to perform actions, with some of them requiring access to the Nomad servers, such as Native Service Discovery Registration. If the alloc runner starts before a connection is established the alloc runner will fail, causing the allocation to be shutdown. This is particularly problematic for disconnected allocations that are reconnecting, as they may fail as soon as the client reconnects. This commit changes the RPC request logic to retry it, using the existing retry mechanism, if there are no servers available.
2022-11-04 14:09:39 -04:00 · 2022-11-04 14:09:39 -04:00 · 1b87d292a3
parent 79c4478f5b
commit 1b87d292a3
2 changed files with 31 additions and 25 deletions
--- a/.changelog/15140.txt
+++ b/.changelog/15140.txt
@ -0,0 +1,3 @@
+```release-note:bug
+client: prevent allocations from failing on client reconnect by retrying RPC requests when no servers are available yet
+```
--- a/client/rpc.go
+++ b/client/rpc.go
@ -70,34 +70,37 @@ func (c *Client) RPC(method string, args interface{}, reply interface{}) error {
 	}

 TRY:
+	var rpcErr error
+
 	server := c.servers.FindServer()
 	if server == nil {
-		return noServersErr
+		rpcErr = noServersErr
+	} else {
+		// Make the request.
+		rpcErr = c.connPool.RPC(c.Region(), server.Addr, method, args, reply)
+
+		if rpcErr == nil {
+			c.fireRpcRetryWatcher()
+			return nil
+		}
+
+		// If shutting down, exit without logging the error
+		select {
+		case <-c.shutdownCh:
+			return nil
+		default:
+		}
+
+		// Move off to another server, and see if we can retry.
+		c.rpcLogger.Error("error performing RPC to server", "error", rpcErr, "rpc", method, "server", server.Addr)
+		c.servers.NotifyFailedServer(server)
+
+		if !canRetry(args, rpcErr) {
+			c.rpcLogger.Error("error performing RPC to server which is not safe to automatically retry", "error", rpcErr, "rpc", method, "server", server.Addr)
+			return rpcErr
+		}
 	}

-	// Make the request.
-	rpcErr := c.connPool.RPC(c.Region(), server.Addr, method, args, reply)
-
-	if rpcErr == nil {
-		c.fireRpcRetryWatcher()
-		return nil
-	}
-
-	// If shutting down, exit without logging the error
-	select {
-	case <-c.shutdownCh:
-		return nil
-	default:
-	}
-
-	// Move off to another server, and see if we can retry.
-	c.rpcLogger.Error("error performing RPC to server", "error", rpcErr, "rpc", method, "server", server.Addr)
-	c.servers.NotifyFailedServer(server)
-
-	if !canRetry(args, rpcErr) {
-		c.rpcLogger.Error("error performing RPC to server which is not safe to automatically retry", "error", rpcErr, "rpc", method, "server", server.Addr)
-		return rpcErr
-	}
 	if time.Now().After(deadline) {
 		// Blocking queries are tricky.  jitters and rpcholdtimes in multiple places can result in our server call taking longer than we wanted it to. For example:
 		// a block time of 5s may easily turn into the server blocking for 10s since it applies its own RPCHoldTime. If the server dies at t=7s we still want to retry
@ -106,7 +109,7 @@ TRY:
 			info.SetTimeToBlock(0)
 			return c.RPC(method, args, reply)
 		}
-		c.rpcLogger.Error("error performing RPC to server, deadline exceeded, cannot retry", "error", rpcErr, "rpc", method, "server", server.Addr)
+		c.rpcLogger.Error("error performing RPC to server, deadline exceeded, cannot retry", "error", rpcErr, "rpc", method)
 		return rpcErr
 	}