agent: fix 'consul leave' shutdown race (#2880)

When the agent is triggered to shutdown via an external 'consul leave'
command delivered via the HTTP API then the client expects to receive a
response when the agent is down. This creates a race on when to shutdown
the agent itself like the RPC server, the checks and the state and the
external endpoints like DNS and HTTP. Ideally, the external endpoints
should be shutdown before the internal state but if the goal is to
respond reliably that the agent is down then this is not possible.

This patch splits the agent shutdown into two parts implemented in a
single method to keep it simple and unambiguos for the caller. The first
stage shuts down the internal state, checks, RPC server, ...
synchronously and then triggers the shutdown of the external endpoints
asychronously. This way the caller is guaranteed that the internal state
services are down when Shutdown returns and there remains enough time to
send a response.

Fixes #2880
This commit is contained in:
Frank Schroeder 2017-06-19 20:02:01 +02:00 committed by Frank Schröder
parent d52a0b2909
commit 226a5d3db4
1 changed files with 38 additions and 24 deletions

View File

@ -1095,6 +1095,18 @@ func (a *Agent) Leave() error {
// Shutdown is used to hard stop the agent. Should be // Shutdown is used to hard stop the agent. Should be
// preceded by a call to Leave to do it gracefully. // preceded by a call to Leave to do it gracefully.
func (a *Agent) Shutdown() error { func (a *Agent) Shutdown() error {
// Execute the shutdown in two stages since it may have been
// triggered through the HTTP server via 'consul leave'. In this
// case the HTTP server needs to be up after the agent is down in
// order to deliver the response. The 'consul leave' handler waits
// for the Shutdown() method to complete before delivering the
// response.
//
// This could also be implemented through multiple methods to
// decouple this in a different way but then this logic would have
// to be implemented everywhere. This way the logic remains in a
// single place.
a.shutdownLock.Lock() a.shutdownLock.Lock()
defer a.shutdownLock.Unlock() defer a.shutdownLock.Unlock()
@ -1103,28 +1115,7 @@ func (a *Agent) Shutdown() error {
} }
a.logger.Println("[INFO] agent: Requesting shutdown") a.logger.Println("[INFO] agent: Requesting shutdown")
// Stop all API endpoints // stage 1: shutdown agent w/o HTTP and DNS endpoints
for _, srv := range a.dnsServers {
a.logger.Printf("[INFO] agent: Stopping DNS server %s (%s)", srv.Server.Addr, srv.Server.Net)
srv.Shutdown()
}
for _, srv := range a.httpServers {
// http server is HTTPS if TLSConfig is not nil and NextProtos does not only contain "h2"
// the latter seems to be a side effect of HTTP/2 support in go 1.8. TLSConfig != nil is
// no longer sufficient to check for an HTTPS server.
a.logger.Printf("[INFO] agent: Stopping %s server %s", strings.ToUpper(srv.proto), srv.Addr)
// graceful shutdown
ctx, cancel := context.WithTimeout(context.Background(), time.Second)
defer cancel()
srv.Shutdown(ctx)
if ctx.Err() == context.DeadlineExceeded {
a.logger.Printf("[WARN] agent: Timeout stopping %s server %s", strings.ToUpper(srv.proto), srv.Addr)
}
}
a.logger.Println("[INFO] agent: Waiting for endpoints to shut down")
a.wgServers.Wait()
a.logger.Print("[INFO] agent: Endpoints down")
// Stop all the checks // Stop all the checks
a.checkLock.Lock() a.checkLock.Lock()
@ -1159,9 +1150,32 @@ func (a *Agent) Shutdown() error {
a.logger.Println("[WARN] agent: could not delete pid file ", pidErr) a.logger.Println("[WARN] agent: could not delete pid file ", pidErr)
} }
// stage 2: shutdown HTTP and DNS endpoints and trigger final shutdown notification
go func() {
// Stop all API endpoints
for _, srv := range a.dnsServers {
a.logger.Printf("[INFO] agent: Stopping DNS server %s (%s)", srv.Server.Addr, srv.Server.Net)
srv.Shutdown()
}
for _, srv := range a.httpServers {
a.logger.Printf("[INFO] agent: Stopping %s server %s", strings.ToUpper(srv.proto), srv.Addr)
ctx, cancel := context.WithTimeout(context.Background(), time.Second)
defer cancel()
srv.Shutdown(ctx)
if ctx.Err() == context.DeadlineExceeded {
a.logger.Printf("[WARN] agent: Timeout stopping %s server %s", strings.ToUpper(srv.proto), srv.Addr)
}
}
a.logger.Println("[INFO] agent: Waiting for endpoints to shut down")
a.wgServers.Wait()
a.logger.Print("[INFO] agent: Endpoints down")
a.logger.Println("[INFO] agent: shutdown complete") a.logger.Println("[INFO] agent: shutdown complete")
a.shutdown = true
close(a.shutdownCh) close(a.shutdownCh)
}()
a.shutdown = true
return err return err
} }