open-consul/agent/consul/gateway_locator.go

package consul

import (
	"context"
	"errors"
	"math/rand"
	"sort"
	"sync"
	"time"

	"github.com/hashicorp/consul/agent/consul/state"
	"github.com/hashicorp/consul/agent/structs"
	"github.com/hashicorp/consul/api"
	"github.com/hashicorp/consul/ipaddr"
	"github.com/hashicorp/consul/lib/stringslice"
	"github.com/hashicorp/consul/logging"
	"github.com/hashicorp/go-hclog"
	memdb "github.com/hashicorp/go-memdb"
)

// GatewayLocator assists in selecting an appropriate mesh gateway when wan
// federation via mesh gateways is enabled.
//
// This is exclusively used by the consul server itself when it needs to tunnel
// RPC or gossip through a mesh gateway to reach its ultimate destination.
//
// During secondary datacenter bootstrapping there is a phase where it is
// impossible for mesh gateways in the secondary datacenter to register
// themselves into the catalog to be discovered by the servers, so the servers
// maintain references for the mesh gateways in the primary in addition to its
// own local mesh gateways.
//
// After initial datacenter federation the primary mesh gateways are only used
// in extreme fallback situations (basically re-bootstrapping).
//
// For all other operations a consul server will ALWAYS contact a local mesh
// gateway to ultimately forward the request through a remote mesh gateway to
// reach its destination.
type GatewayLocator struct {
	logger            hclog.Logger
	srv               serverDelegate
	datacenter        string // THIS dc
	primaryDatacenter string

	// these ONLY contain ones that have the wanfed:1 meta
	gatewaysLock    sync.Mutex
	primaryGateways []string // WAN addrs
	localGateways   []string // LAN addrs

	// primaryMeshGatewayDiscoveredAddresses is the current fallback addresses
	// for the mesh gateways in the primary datacenter.
	primaryMeshGatewayDiscoveredAddresses     []string
	primaryMeshGatewayDiscoveredAddressesLock sync.Mutex

	// This will be closed the FIRST time we get some gateways populated
	primaryGatewaysReadyCh   chan struct{}
	primaryGatewaysReadyOnce sync.Once

	// these are a collection of measurements that factor into deciding if we
	// should directly dial the primary's mesh gateways or if we should try to
	// route through our local gateways (if they are up).
	lastReplLock         sync.Mutex
	lastReplSuccess      time.Time
	lastReplFailure      time.Time
	lastReplSuccesses    uint64
	lastReplFailures     uint64
	useReplicationSignal bool // this should be set to true on the leader
}

// SetLastFederationStateReplicationError is used to indicate if the federation
// state replication loop has succeeded (nil) or failed during the last
// execution.
//
// Rather than introduce a completely new mechanism to periodically probe that
// our chosen mesh-gateway configuration can reach the primary's servers (like
// a ping or status RPC) we cheat and use the federation state replicator
// goroutine's success or failure as a proxy.
func (g *GatewayLocator) SetLastFederationStateReplicationError(err error, fromReplication bool) {
	if g == nil {
		return
	}

	g.lastReplLock.Lock()
	defer g.lastReplLock.Unlock()

	oldChoice := g.dialPrimaryThroughLocalGateway()
	if err == nil {
		g.lastReplSuccess = time.Now().UTC()
		g.lastReplSuccesses++
		g.lastReplFailures = 0
		if fromReplication {
			// If we get info from replication, assume replication is operating.
			g.useReplicationSignal = true
		}
	} else {
		g.lastReplFailure = time.Now().UTC()
		g.lastReplFailures++
		g.lastReplSuccesses = 0
	}
	newChoice := g.dialPrimaryThroughLocalGateway()
	if oldChoice != newChoice {
		g.logPrimaryDialingMessage(newChoice)
	}
}

func (g *GatewayLocator) SetUseReplicationSignal(newValue bool) {
	if g == nil {
		return
	}

	g.lastReplLock.Lock()
	g.useReplicationSignal = newValue
	g.lastReplLock.Unlock()
}

func (g *GatewayLocator) logPrimaryDialingMessage(useLocal bool) {
	if g.datacenter == g.primaryDatacenter {
		// These messages are useless when the server is in the primary
		// datacenter.
		return
	}
	if useLocal {
		g.logger.Info("will dial the primary datacenter using our local mesh gateways if possible")
	} else {
		g.logger.Info("will dial the primary datacenter through its mesh gateways")
	}
}

// DialPrimaryThroughLocalGateway determines if we should dial the primary's
// mesh gateways directly or use our local mesh gateways (if they are up).
//
// Generally the system has three states:
//
// 1. Servers dial primary MGWs using fallback addresses from the agent config.
// 2. Servers dial primary MGWs using replicated federation state data.
// 3. Servers dial primary MGWs indirectly through local MGWs.
//
// After initial bootstrapping most communication should go through (3). If the
// local mesh gateways are not coming up for chicken/egg problems (mostly the
// kind that arise from secondary datacenter bootstrapping) then (2) is useful
// to solve the chicken/egg problem and get back to (3). In the worst case
// where we completely lost communication with the primary AND all of their old
// mesh gateway addresses are changed then we need to go all the way back to
// square one and re-bootstrap via (1).
//
// Since both (1) and (2) are meant to be temporary we simplify things and make
// the system only consider two overall configurations: (1+2, with the
// addresses being unioned) or (3).
//
// This method returns true if in state (3) and false if in state (1+2).
func (g *GatewayLocator) DialPrimaryThroughLocalGateway() bool {
	if g.datacenter == g.primaryDatacenter {
		return false // not important
	}
	g.lastReplLock.Lock()
	defer g.lastReplLock.Unlock()
	return g.dialPrimaryThroughLocalGateway()
}

const localFederationStateReplicatorFailuresBeforeDialingDirectly = 3

func (g *GatewayLocator) dialPrimaryThroughLocalGateway() bool {
	if !g.useReplicationSignal {
		// Followers should blindly assume these gateways work. The leader will
		// try to bypass them and correct the replicated federation state info
		// that the followers will eventually pick up on.
		return true
	}
	if g.lastReplSuccess.IsZero() && g.lastReplFailure.IsZero() {
		return false // no data yet
	}

	if g.lastReplSuccess.After(g.lastReplFailure) {
		return true // we have viable data
	}

	if g.lastReplFailures < localFederationStateReplicatorFailuresBeforeDialingDirectly {
		return true // maybe it's just a little broken
	}

	return false
}

// PrimaryMeshGatewayAddressesReadyCh returns a channel that will be closed
// when federation state replication ships back at least one primary mesh
// gateway (not via fallback config).
func (g *GatewayLocator) PrimaryMeshGatewayAddressesReadyCh() <-chan struct{} {
	return g.primaryGatewaysReadyCh
}

// PickGateway returns the address for a gateway suitable for reaching the
// provided datacenter.
func (g *GatewayLocator) PickGateway(dc string) string {
	item := g.pickGateway(dc == g.primaryDatacenter)
	g.logger.Trace("picking gateway for transit", "gateway", item, "source_datacenter", g.datacenter, "dest_datacenter", dc)
	return item
}

func (g *GatewayLocator) pickGateway(primary bool) string {
	addrs := g.listGateways(primary)
	return getRandomItem(addrs)
}

func (g *GatewayLocator) listGateways(primary bool) []string {
	g.gatewaysLock.Lock()
	defer g.gatewaysLock.Unlock()

	var addrs []string
	if primary {
		if g.datacenter == g.primaryDatacenter {
			addrs = g.primaryGateways
		} else if g.DialPrimaryThroughLocalGateway() && len(g.localGateways) > 0 {
			addrs = g.localGateways
		} else {
			// Note calling StringSliceMergeSorted only works because both
			// inputs are pre-sorted. If for some reason one of the lists has
			// *duplicates* (which shouldn't happen) it's not great but it
			// won't break anything other than biasing our eventual random
			// choice a little bit.
			addrs = stringslice.MergeSorted(g.primaryGateways, g.PrimaryGatewayFallbackAddresses())
		}
	} else {
		addrs = g.localGateways
	}

	return addrs
}

// RefreshPrimaryGatewayFallbackAddresses is used to update the list of current
// fallback addresses for locating mesh gateways in the primary datacenter.
func (g *GatewayLocator) RefreshPrimaryGatewayFallbackAddresses(addrs []string) {
	sort.Strings(addrs)

	g.primaryMeshGatewayDiscoveredAddressesLock.Lock()
	defer g.primaryMeshGatewayDiscoveredAddressesLock.Unlock()

	if !stringslice.Equal(addrs, g.primaryMeshGatewayDiscoveredAddresses) {
		g.primaryMeshGatewayDiscoveredAddresses = addrs
		g.logger.Info("updated fallback list of primary mesh gateways", "mesh_gateways", addrs)
	}
}

// PrimaryGatewayFallbackAddresses returns the current set of discovered
// fallback addresses for the mesh gateways in the primary datacenter.
func (g *GatewayLocator) PrimaryGatewayFallbackAddresses() []string {
	g.primaryMeshGatewayDiscoveredAddressesLock.Lock()
	defer g.primaryMeshGatewayDiscoveredAddressesLock.Unlock()

	out := make([]string, len(g.primaryMeshGatewayDiscoveredAddresses))
	copy(out, g.primaryMeshGatewayDiscoveredAddresses)
	return out
}

func getRandomItem(items []string) string {
	switch len(items) {
	case 0:
		return ""
	case 1:
		return items[0]
	default:
		idx := int(rand.Int31n(int32(len(items))))
		return items[idx]
	}
}

type serverDelegate interface {
	blockingQuery(queryOpts structs.QueryOptionsCompat, queryMeta structs.QueryMetaCompat, fn queryFn) error
	IsLeader() bool
	LeaderLastContact() time.Time
}

func NewGatewayLocator(
	logger hclog.Logger,
	srv serverDelegate,
	datacenter string,
	primaryDatacenter string,
) *GatewayLocator {
	g := &GatewayLocator{
		logger:                 logger.Named(logging.GatewayLocator),
		srv:                    srv,
		datacenter:             datacenter,
		primaryDatacenter:      primaryDatacenter,
		primaryGatewaysReadyCh: make(chan struct{}),
	}
	g.logPrimaryDialingMessage(g.DialPrimaryThroughLocalGateway())
	return g
}

var errGatewayLocalStateNotInitialized = errors.New("local state not initialized")

func (g *GatewayLocator) Run(ctx context.Context) {
	var lastFetchIndex uint64
	retryLoopBackoff(ctx, func() error {
		idx, err := g.runOnce(lastFetchIndex)
		if err != nil {
			return err
		}

		lastFetchIndex = idx

		return nil
	}, func(err error) {
		if !errors.Is(err, errGatewayLocalStateNotInitialized) {
			g.logger.Error("error tracking primary and local mesh gateways", "error", err)
		}
	})
}

func (g *GatewayLocator) runOnce(lastFetchIndex uint64) (uint64, error) {
	if err := g.checkLocalStateIsReady(); err != nil {
		return 0, err
	}

	// NOTE: we can't do RPC here because we won't have a token so we'll just
	// mostly assume that our FSM is caught up enough to answer locally.  If
	// this has drifted it's no different than a cache that drifts or an
	// inconsistent read.
	queryOpts := &structs.QueryOptions{
		MinQueryIndex:     lastFetchIndex,
		RequireConsistent: false,
	}

	var (
		results   []*structs.FederationState
		queryMeta structs.QueryMeta
	)
	err := g.srv.blockingQuery(
		queryOpts,
		&queryMeta,
		func(ws memdb.WatchSet, state *state.Store) error {
			// Get the existing stored version of this config that has replicated down.
			// We could phone home to get this but that would incur extra WAN traffic
			// when we already have enough information locally to figure it out
			// (assuming that our replicator is still functioning).
			idx, all, err := state.FederationStateList(ws)
			if err != nil {
				return err
			}

			queryMeta.Index = idx
			results = all

			return nil
		})
	if err != nil {
		return 0, err
	}

	g.updateFromState(results)

	return queryMeta.Index, nil
}

// checkLocalStateIsReady is inlined a bit from (*Server).ForwardRPC(). We need to
// wait until our own state machine is safe to read from.
func (g *GatewayLocator) checkLocalStateIsReady() error {
	// Check if we can allow a stale read, ensure our local DB is initialized
	if !g.srv.LeaderLastContact().IsZero() {
		return nil // the raft leader talked to us
	}

	if g.srv.IsLeader() {
		return nil // we are the leader
	}

	return errGatewayLocalStateNotInitialized
}

func (g *GatewayLocator) updateFromState(results []*structs.FederationState) {
	var (
		local   structs.CheckServiceNodes
		primary structs.CheckServiceNodes
	)
	for _, config := range results {
		retained := retainGateways(config.MeshGateways)
		if config.Datacenter == g.datacenter {
			local = retained
		}
		// NOT else-if because conditionals are not mutually exclusive
		if config.Datacenter == g.primaryDatacenter {
			primary = retained
		}
	}

	primaryAddrs := renderGatewayAddrs(primary, true)
	localAddrs := renderGatewayAddrs(local, false)

	g.gatewaysLock.Lock()
	defer g.gatewaysLock.Unlock()

	changed := false
	primaryReady := false
	if !stringslice.Equal(g.primaryGateways, primaryAddrs) {
		g.primaryGateways = primaryAddrs
		primaryReady = len(g.primaryGateways) > 0
		changed = true
	}
	if !stringslice.Equal(g.localGateways, localAddrs) {
		g.localGateways = localAddrs
		changed = true
	}

	if changed {
		g.logger.Info(
			"new cached locations of mesh gateways",
			"primary", primaryAddrs,
			"local", localAddrs,
		)
	}

	if primaryReady {
		g.primaryGatewaysReadyOnce.Do(func() {
			close(g.primaryGatewaysReadyCh)
		})
	}
}

func retainGateways(full structs.CheckServiceNodes) structs.CheckServiceNodes {
	out := make([]structs.CheckServiceNode, 0, len(full))
	for _, csn := range full {
		if csn.Service.Meta[structs.MetaWANFederationKey] != "1" {
			continue
		}

		// only keep healthy ones
		ok := true
		for _, chk := range csn.Checks {
			if chk.Status == api.HealthCritical {
				ok = false
			}
		}

		if ok {
			out = append(out, csn)
		}
	}
	return out
}

func renderGatewayAddrs(gateways structs.CheckServiceNodes, wan bool) []string {
	out := make([]string, 0, len(gateways))
	for _, csn := range gateways {
		addr, port := csn.BestAddress(wan)
		completeAddr := ipaddr.FormatAddressPort(addr, port)
		out = append(out, completeAddr)
	}
	sort.Strings(out)
	return out
}