client: retry RPC call when no server is available (#15140)

When a Nomad service starts it tries to establish a connection with
servers, but it also runs alloc runners to manage whatever allocations
it needs to run.

The alloc runner will invoke several hooks to perform actions, and some
of them, such as Native Service Discovery registration, require access
to the Nomad servers.

If the alloc runner starts before a connection is established, the alloc
runner will fail, causing the allocation to be shut down. This is
particularly problematic for disconnected allocations that are
reconnecting, as they may fail as soon as the client reconnects.

This commit changes the RPC request logic to retry the request, using
the existing retry mechanism, when no servers are available.
This commit is contained in:
Luiz Aoqui 2022-11-04 14:09:39 -04:00 committed by GitHub
parent 79c4478f5b
commit 1b87d292a3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 31 additions and 25 deletions

3
.changelog/15140.txt Normal file
View File

@ -0,0 +1,3 @@
```release-note:bug
client: prevent allocations from failing on client reconnect by retrying RPC requests when no servers are available yet
```

View File

@ -70,34 +70,37 @@ func (c *Client) RPC(method string, args interface{}, reply interface{}) error {
}
TRY:
var rpcErr error
server := c.servers.FindServer()
if server == nil {
return noServersErr
rpcErr = noServersErr
} else {
// Make the request.
rpcErr = c.connPool.RPC(c.Region(), server.Addr, method, args, reply)
if rpcErr == nil {
c.fireRpcRetryWatcher()
return nil
}
// If shutting down, exit without logging the error
select {
case <-c.shutdownCh:
return nil
default:
}
// Move off to another server, and see if we can retry.
c.rpcLogger.Error("error performing RPC to server", "error", rpcErr, "rpc", method, "server", server.Addr)
c.servers.NotifyFailedServer(server)
if !canRetry(args, rpcErr) {
c.rpcLogger.Error("error performing RPC to server which is not safe to automatically retry", "error", rpcErr, "rpc", method, "server", server.Addr)
return rpcErr
}
}
// Make the request.
rpcErr := c.connPool.RPC(c.Region(), server.Addr, method, args, reply)
if rpcErr == nil {
c.fireRpcRetryWatcher()
return nil
}
// If shutting down, exit without logging the error
select {
case <-c.shutdownCh:
return nil
default:
}
// Move off to another server, and see if we can retry.
c.rpcLogger.Error("error performing RPC to server", "error", rpcErr, "rpc", method, "server", server.Addr)
c.servers.NotifyFailedServer(server)
if !canRetry(args, rpcErr) {
c.rpcLogger.Error("error performing RPC to server which is not safe to automatically retry", "error", rpcErr, "rpc", method, "server", server.Addr)
return rpcErr
}
if time.Now().After(deadline) {
// Blocking queries are tricky. jitters and rpcholdtimes in multiple places can result in our server call taking longer than we wanted it to. For example:
// a block time of 5s may easily turn into the server blocking for 10s since it applies its own RPCHoldTime. If the server dies at t=7s we still want to retry
@ -106,7 +109,7 @@ TRY:
info.SetTimeToBlock(0)
return c.RPC(method, args, reply)
}
c.rpcLogger.Error("error performing RPC to server, deadline exceeded, cannot retry", "error", rpcErr, "rpc", method, "server", server.Addr)
c.rpcLogger.Error("error performing RPC to server, deadline exceeded, cannot retry", "error", rpcErr, "rpc", method)
return rpcErr
}