diff --git a/.changelog/9689.txt b/.changelog/9689.txt new file mode 100644 index 000000000..85f78ac90 --- /dev/null +++ b/.changelog/9689.txt @@ -0,0 +1,3 @@ +```release-note:bug +proxycfg: avoid potential deadlock in delivering proxy snapshot to watchers. +``` \ No newline at end of file diff --git a/agent/proxycfg/state.go b/agent/proxycfg/state.go index 703cb66af..59d11a2f1 100644 --- a/agent/proxycfg/state.go +++ b/agent/proxycfg/state.go @@ -603,10 +603,11 @@ func (s *state) run() { case <-s.ctx.Done(): return case u := <-s.ch: + s.logger.Trace("A blocking query returned; handling snapshot update") + if err := s.handleUpdate(u, &snap); err != nil { - s.logger.Error("watch error", - "id", u.CorrelationID, - "error", err, + s.logger.Error("Failed to handle update from watch", + "id", u.CorrelationID, "error", err, ) continue } @@ -617,23 +618,47 @@ func (s *state) run() { snapCopy, err := snap.Clone() if err != nil { s.logger.Error("Failed to copy config snapshot for proxy", - "proxy", s.proxyID, "error", err, ) continue } - s.snapCh <- *snapCopy - // Allow the next change to trigger a send - coalesceTimer = nil - // Skip rest of loop - there is nothing to send since nothing changed on - // this iteration - continue + select { + // Try to send + case s.snapCh <- *snapCopy: + s.logger.Trace("Delivered new snapshot to proxy config watchers") + + // Allow the next change to trigger a send + coalesceTimer = nil + + // Skip rest of loop - there is nothing to send since nothing changed on + // this iteration + continue + + // Avoid blocking if a snapshot is already buffered in snapCh as this can result in a deadlock. + // See PR #9689 for more details. + default: + s.logger.Trace("Failed to deliver new snapshot to proxy config watchers") + + // Reset the timer to retry later. This is to ensure we attempt to redeliver the updated snapshot shortly. + if coalesceTimer == nil { + coalesceTimer = time.AfterFunc(coalesceTimeout, func() { + sendCh <- struct{}{} + }) + } + + // Do not reset coalesceTimer since we just queued a timer-based refresh + continue + } case replyCh := <-s.reqCh: + s.logger.Trace("A proxy config snapshot was requested") + if !snap.Valid() { // Not valid yet just respond with nil and move on to next task. replyCh <- nil + + s.logger.Trace("The proxy's config snapshot is not valid yet") continue } // Make a deep copy of snap so we don't mutate any of the embedded structs @@ -641,7 +666,6 @@ func (s *state) run() { snapCopy, err := snap.Clone() if err != nil { s.logger.Error("Failed to copy config snapshot for proxy", - "proxy", s.proxyID, "error", err, ) continue diff --git a/agent/xds/server.go b/agent/xds/server.go index d8a7ecbe5..7986b8015 100644 --- a/agent/xds/server.go +++ b/agent/xds/server.go @@ -4,6 +4,7 @@ import ( "context" "errors" "fmt" + "github.com/hashicorp/consul/logging" "sync/atomic" "time" @@ -164,6 +165,8 @@ const ( ) func (s *Server) process(stream ADSStream, reqCh <-chan *envoy.DiscoveryRequest) error { + logger := s.Logger.Named(logging.XDS) + // xDS requires a unique nonce to correlate response/request pairs var nonce uint64 @@ -324,6 +327,9 @@ func (s *Server) process(stream ADSStream, reqCh <-chan *envoy.DiscoveryRequest) // state machine. defer watchCancel() + logger.Trace("watching proxy, pending initial proxycfg snapshot", + "service_id", proxyID.String()) + // Now wait for the config so we can check ACL state = statePendingInitialConfig case statePendingInitialConfig: @@ -335,6 +341,9 @@ func (s *Server) process(stream ADSStream, reqCh <-chan *envoy.DiscoveryRequest) // Got config, try to authenticate next. state = stateRunning + logger.Trace("Got initial config snapshot", + "service_id", cfgSnap.ProxyID.String()) + // Lets actually process the config we just got or we'll mis responding fallthrough case stateRunning: @@ -346,6 +355,9 @@ func (s *Server) process(stream ADSStream, reqCh <-chan *envoy.DiscoveryRequest) // timer is first started. extendAuthTimer() + logger.Trace("Invoking all xDS resource handlers and sending new data if there is any", + "service_id", cfgSnap.ProxyID.String()) + // See if any handlers need to have the current (possibly new) config // sent. Note the order here is actually significant so we can't just // range the map which has no determined order. It's important because: diff --git a/logging/names.go b/logging/names.go index 544990061..b9616b167 100644 --- a/logging/names.go +++ b/logging/names.go @@ -56,5 +56,6 @@ const ( UIMetricsProxy string = "ui_metrics_proxy" WAN string = "wan" Watch string = "watch" + XDS string = "xds" Vault string = "vault" )