open-consul/agent/proxycfg/mesh_gateway.go

534 lines
17 KiB
Go

package proxycfg
import (
"context"
"fmt"
"sort"
"strings"
"time"
cachetype "github.com/hashicorp/consul/agent/cache-types"
"github.com/hashicorp/consul/agent/structs"
"github.com/hashicorp/consul/lib/maps"
"github.com/hashicorp/consul/logging"
"github.com/hashicorp/consul/proto/pbpeering"
)
type handlerMeshGateway struct {
handlerState
}
// initialize sets up the watches needed based on the current mesh gateway registration
func (s *handlerMeshGateway) initialize(ctx context.Context) (ConfigSnapshot, error) {
snap := newConfigSnapshotFromServiceInstance(s.serviceInstance, s.stateConfig)
// Watch for root changes
err := s.dataSources.CARoots.Notify(ctx, &structs.DCSpecificRequest{
Datacenter: s.source.Datacenter,
QueryOptions: structs.QueryOptions{Token: s.token},
Source: *s.source,
}, rootsWatchID, s.ch)
if err != nil {
return snap, err
}
// Watch for all peer trust bundles we may need.
err = s.dataSources.TrustBundleList.Notify(ctx, &pbpeering.TrustBundleListByServiceRequest{
// TODO(peering): Pass ACL token
Kind: string(structs.ServiceKindMeshGateway),
Namespace: s.proxyID.NamespaceOrDefault(),
Partition: s.proxyID.PartitionOrDefault(),
}, peeringTrustBundlesWatchID, s.ch)
if err != nil {
return snap, err
}
wildcardEntMeta := s.proxyID.WithWildcardNamespace()
// Watch for all services.
// Eventually we will have to watch connect enabled instances for each service as well as the
// destination services themselves but those notifications will be setup later.
// We cannot setup those watches until we know what the services are.
err = s.dataSources.ServiceList.Notify(ctx, &structs.DCSpecificRequest{
Datacenter: s.source.Datacenter,
QueryOptions: structs.QueryOptions{Token: s.token},
Source: *s.source,
EnterpriseMeta: *wildcardEntMeta,
}, serviceListWatchID, s.ch)
if err != nil {
return snap, err
}
// Watch service-resolvers so we can setup service subset clusters
err = s.dataSources.ConfigEntryList.Notify(ctx, &structs.ConfigEntryQuery{
Datacenter: s.source.Datacenter,
QueryOptions: structs.QueryOptions{Token: s.token},
Kind: structs.ServiceResolver,
EnterpriseMeta: *wildcardEntMeta,
}, serviceResolversWatchID, s.ch)
if err != nil {
s.logger.Named(logging.MeshGateway).
Error("failed to register watch for service-resolver config entries", "error", err)
return snap, err
}
if s.proxyID.InDefaultPartition() {
if err := s.initializeCrossDCWatches(ctx); err != nil {
return snap, err
}
}
if err := s.initializeEntWatches(ctx); err != nil {
return snap, err
}
// Get information about the entire service mesh.
err = s.dataSources.ConfigEntry.Notify(ctx, &structs.ConfigEntryQuery{
Kind: structs.MeshConfig,
Name: structs.MeshConfigMesh,
Datacenter: s.source.Datacenter,
QueryOptions: structs.QueryOptions{Token: s.token},
EnterpriseMeta: *structs.DefaultEnterpriseMetaInPartition(s.proxyID.PartitionOrDefault()),
}, meshConfigEntryID, s.ch)
if err != nil {
return snap, err
}
// Watch for all exported services from this mesh gateway's partition in any peering.
err = s.dataSources.ExportedPeeredServices.Notify(ctx, &structs.DCSpecificRequest{
Datacenter: s.source.Datacenter,
QueryOptions: structs.QueryOptions{Token: s.token},
Source: *s.source,
EnterpriseMeta: s.proxyID.EnterpriseMeta,
}, exportedServiceListWatchID, s.ch)
if err != nil {
return snap, err
}
snap.MeshGateway.WatchedServices = make(map[structs.ServiceName]context.CancelFunc)
snap.MeshGateway.WatchedGateways = make(map[string]context.CancelFunc)
snap.MeshGateway.ServiceGroups = make(map[structs.ServiceName]structs.CheckServiceNodes)
snap.MeshGateway.GatewayGroups = make(map[string]structs.CheckServiceNodes)
snap.MeshGateway.ServiceResolvers = make(map[structs.ServiceName]*structs.ServiceResolverConfigEntry)
snap.MeshGateway.HostnameDatacenters = make(map[string]structs.CheckServiceNodes)
snap.MeshGateway.ExportedServicesWithPeers = make(map[structs.ServiceName][]string)
snap.MeshGateway.DiscoveryChain = make(map[structs.ServiceName]*structs.CompiledDiscoveryChain)
snap.MeshGateway.WatchedDiscoveryChains = make(map[structs.ServiceName]context.CancelFunc)
// there is no need to initialize the map of service resolvers as we
// fully rebuild it every time we get updates
return snap, err
}
func (s *handlerMeshGateway) initializeCrossDCWatches(ctx context.Context) error {
if s.meta[structs.MetaWANFederationKey] == "1" {
// Conveniently we can just use this service meta attribute in one
// place here to set the machinery in motion and leave the conditional
// behavior out of the rest of the package.
err := s.dataSources.FederationStateListMeshGateways.Notify(ctx, &structs.DCSpecificRequest{
Datacenter: s.source.Datacenter,
QueryOptions: structs.QueryOptions{Token: s.token},
Source: *s.source,
}, federationStateListGatewaysWatchID, s.ch)
if err != nil {
return err
}
err = s.dataSources.Health.Notify(ctx, &structs.ServiceSpecificRequest{
Datacenter: s.source.Datacenter,
QueryOptions: structs.QueryOptions{Token: s.token},
ServiceName: structs.ConsulServiceName,
}, consulServerListWatchID, s.ch)
if err != nil {
return err
}
}
err := s.dataSources.Datacenters.Notify(ctx, &structs.DatacentersRequest{
QueryOptions: structs.QueryOptions{Token: s.token, MaxAge: 30 * time.Second},
}, datacentersWatchID, s.ch)
if err != nil {
return err
}
// Once we start getting notified about the datacenters we will setup watches on the
// gateways within those other datacenters. We cannot do that here because we don't
// know what they are yet.
return nil
}
func (s *handlerMeshGateway) handleUpdate(ctx context.Context, u UpdateEvent, snap *ConfigSnapshot) error {
if u.Err != nil {
return fmt.Errorf("error filling agent cache: %v", u.Err)
}
meshLogger := s.logger.Named(logging.MeshGateway)
switch u.CorrelationID {
case rootsWatchID:
roots, ok := u.Result.(*structs.IndexedCARoots)
if !ok {
return fmt.Errorf("invalid type for response: %T", u.Result)
}
snap.Roots = roots
case federationStateListGatewaysWatchID:
dcIndexedNodes, ok := u.Result.(*structs.DatacenterIndexedCheckServiceNodes)
if !ok {
return fmt.Errorf("invalid type for response: %T", u.Result)
}
snap.MeshGateway.FedStateGateways = dcIndexedNodes.DatacenterNodes
for dc, nodes := range dcIndexedNodes.DatacenterNodes {
snap.MeshGateway.HostnameDatacenters[dc] = hostnameEndpoints(
s.logger.Named(logging.MeshGateway),
snap.Locality,
nodes,
)
}
for dc := range snap.MeshGateway.HostnameDatacenters {
if _, ok := dcIndexedNodes.DatacenterNodes[dc]; !ok {
delete(snap.MeshGateway.HostnameDatacenters, dc)
}
}
case serviceListWatchID:
services, ok := u.Result.(*structs.IndexedServiceList)
if !ok {
return fmt.Errorf("invalid type for response: %T", u.Result)
}
svcMap := make(map[structs.ServiceName]struct{})
for _, svc := range services.Services {
// Make sure to add every service to this map, we use it to cancel
// watches below.
svcMap[svc] = struct{}{}
if _, ok := snap.MeshGateway.WatchedServices[svc]; !ok {
ctx, cancel := context.WithCancel(ctx)
err := s.dataSources.Health.Notify(ctx, &structs.ServiceSpecificRequest{
Datacenter: s.source.Datacenter,
QueryOptions: structs.QueryOptions{Token: s.token},
ServiceName: svc.Name,
Connect: true,
EnterpriseMeta: svc.EnterpriseMeta,
}, fmt.Sprintf("connect-service:%s", svc.String()), s.ch)
if err != nil {
meshLogger.Error("failed to register watch for connect-service",
"service", svc.String(),
"error", err,
)
cancel()
return err
}
snap.MeshGateway.WatchedServices[svc] = cancel
}
}
for sid, cancelFn := range snap.MeshGateway.WatchedServices {
if _, ok := svcMap[sid]; !ok {
meshLogger.Debug("canceling watch for service", "service", sid.String())
// TODO (gateways) Should the sid also be deleted from snap.MeshGateway.ServiceGroups?
// Do those endpoints get cleaned up some other way?
delete(snap.MeshGateway.WatchedServices, sid)
cancelFn()
}
}
snap.MeshGateway.WatchedServicesSet = true
case datacentersWatchID:
datacentersRaw, ok := u.Result.(*[]string)
if !ok {
return fmt.Errorf("invalid type for response: %T", u.Result)
}
if datacentersRaw == nil {
return fmt.Errorf("invalid response with a nil datacenter list")
}
datacenters := *datacentersRaw
for _, dc := range datacenters {
if dc == s.source.Datacenter {
continue
}
entMeta := structs.DefaultEnterpriseMetaInDefaultPartition()
gk := GatewayKey{Datacenter: dc, Partition: entMeta.PartitionOrDefault()}
if _, ok := snap.MeshGateway.WatchedGateways[gk.String()]; !ok {
ctx, cancel := context.WithCancel(ctx)
err := s.dataSources.InternalServiceDump.Notify(ctx, &structs.ServiceDumpRequest{
Datacenter: dc,
QueryOptions: structs.QueryOptions{Token: s.token},
ServiceKind: structs.ServiceKindMeshGateway,
UseServiceKind: true,
Source: *s.source,
EnterpriseMeta: *entMeta,
}, fmt.Sprintf("mesh-gateway:%s", gk.String()), s.ch)
if err != nil {
meshLogger.Error("failed to register watch for mesh-gateway",
"datacenter", dc,
"partition", entMeta.PartitionOrDefault(),
"error", err,
)
cancel()
return err
}
snap.MeshGateway.WatchedGateways[gk.String()] = cancel
}
}
for key, cancelFn := range snap.MeshGateway.WatchedGateways {
gk := gatewayKeyFromString(key)
if gk.Datacenter == s.source.Datacenter {
// Only cross-DC watches are managed by the datacenters watch.
continue
}
found := false
for _, dcCurrent := range datacenters {
if dcCurrent == gk.Datacenter {
found = true
break
}
}
if !found {
delete(snap.MeshGateway.WatchedGateways, key)
cancelFn()
}
}
case serviceResolversWatchID:
configEntries, ok := u.Result.(*structs.IndexedConfigEntries)
if !ok {
return fmt.Errorf("invalid type for response: %T", u.Result)
}
resolvers := make(map[structs.ServiceName]*structs.ServiceResolverConfigEntry)
for _, entry := range configEntries.Entries {
if resolver, ok := entry.(*structs.ServiceResolverConfigEntry); ok {
resolvers[structs.NewServiceName(resolver.Name, &resolver.EnterpriseMeta)] = resolver
}
}
snap.MeshGateway.ServiceResolvers = resolvers
case consulServerListWatchID:
resp, ok := u.Result.(*structs.IndexedCheckServiceNodes)
if !ok {
return fmt.Errorf("invalid type for response: %T", u.Result)
}
// Do some initial sanity checks to avoid doing something dumb.
for _, csn := range resp.Nodes {
if csn.Service.Service != structs.ConsulServiceName {
return fmt.Errorf("expected service name %q but got %q",
structs.ConsulServiceName, csn.Service.Service)
}
if csn.Node.Datacenter != snap.Datacenter {
return fmt.Errorf("expected datacenter %q but got %q",
snap.Datacenter, csn.Node.Datacenter)
}
}
snap.MeshGateway.ConsulServers = resp.Nodes
case exportedServiceListWatchID:
exportedServices, ok := u.Result.(*structs.IndexedExportedServiceList)
if !ok {
return fmt.Errorf("invalid type for response: %T", u.Result)
}
seenServices := make(map[structs.ServiceName][]string) // svc -> peername slice
for peerName, services := range exportedServices.Services {
for _, svc := range services {
seenServices[svc] = append(seenServices[svc], peerName)
}
}
// Sort the peer names so ultimately xDS has a stable output.
for svc := range seenServices {
sort.Strings(seenServices[svc])
}
peeredServiceList := maps.SliceOfKeys(seenServices)
structs.ServiceList(peeredServiceList).Sort()
snap.MeshGateway.ExportedServicesSlice = peeredServiceList
snap.MeshGateway.ExportedServicesWithPeers = seenServices
snap.MeshGateway.ExportedServicesSet = true
// Decide if we do or do not need our leaf.
hasExports := len(snap.MeshGateway.ExportedServicesSlice) > 0
if hasExports && snap.MeshGateway.LeafCertWatchCancel == nil {
// no watch and we need one
ctx, cancel := context.WithCancel(ctx)
err := s.dataSources.LeafCertificate.Notify(ctx, &cachetype.ConnectCALeafRequest{
Datacenter: s.source.Datacenter,
Token: s.token,
Kind: structs.ServiceKindMeshGateway,
EnterpriseMeta: s.proxyID.EnterpriseMeta,
}, leafWatchID, s.ch)
if err != nil {
cancel()
return err
}
snap.MeshGateway.LeafCertWatchCancel = cancel
} else if !hasExports && snap.MeshGateway.LeafCertWatchCancel != nil {
// has watch and shouldn't
snap.MeshGateway.LeafCertWatchCancel()
snap.MeshGateway.LeafCertWatchCancel = nil
snap.MeshGateway.Leaf = nil
}
// For each service that we should be exposing, also watch disco chains
// in the same manner as an ingress gateway would.
for _, svc := range snap.MeshGateway.ExportedServicesSlice {
if _, ok := snap.MeshGateway.WatchedDiscoveryChains[svc]; ok {
continue
}
ctx, cancel := context.WithCancel(ctx)
err := s.dataSources.CompiledDiscoveryChain.Notify(ctx, &structs.DiscoveryChainRequest{
Datacenter: s.source.Datacenter,
QueryOptions: structs.QueryOptions{Token: s.token},
Name: svc.Name,
EvaluateInDatacenter: s.source.Datacenter,
EvaluateInNamespace: svc.NamespaceOrDefault(),
EvaluateInPartition: svc.PartitionOrDefault(),
}, "discovery-chain:"+svc.String(), s.ch)
if err != nil {
meshLogger.Error("failed to register watch for discovery chain",
"service", svc.String(),
"error", err,
)
cancel()
return err
}
snap.MeshGateway.WatchedDiscoveryChains[svc] = cancel
}
// Clean up data from services that were not in the update
for svc, cancelFn := range snap.MeshGateway.WatchedDiscoveryChains {
if _, ok := seenServices[svc]; !ok {
cancelFn()
delete(snap.MeshGateway.WatchedDiscoveryChains, svc)
}
}
// These entries are intentionally handled separately from the
// WatchedDiscoveryChains above. There have been situations where a
// discovery watch was cancelled, then fired. That update event then
// re-populated the DiscoveryChain map entry, which wouldn't get
// cleaned up since there was no known watch for it.
for svc := range snap.MeshGateway.DiscoveryChain {
if _, ok := seenServices[svc]; !ok {
delete(snap.MeshGateway.DiscoveryChain, svc)
}
}
case leafWatchID:
leaf, ok := u.Result.(*structs.IssuedCert)
if !ok {
return fmt.Errorf("invalid type for response: %T", u.Result)
}
if hasExports := len(snap.MeshGateway.ExportedServicesSlice) > 0; !hasExports {
return nil // ignore this update, it's stale
}
snap.MeshGateway.Leaf = leaf
case peeringTrustBundlesWatchID:
resp, ok := u.Result.(*pbpeering.TrustBundleListByServiceResponse)
if !ok {
return fmt.Errorf("invalid type for response: %T", u.Result)
}
if len(resp.Bundles) > 0 {
snap.MeshGateway.PeeringTrustBundles = resp.Bundles
}
snap.MeshGateway.PeeringTrustBundlesSet = true
case meshConfigEntryID:
resp, ok := u.Result.(*structs.ConfigEntryResponse)
if !ok {
return fmt.Errorf("invalid type for response: %T", u.Result)
}
if resp.Entry != nil {
meshConf, ok := resp.Entry.(*structs.MeshConfigEntry)
if !ok {
return fmt.Errorf("invalid type for config entry: %T", resp.Entry)
}
snap.MeshGateway.MeshConfig = meshConf
} else {
snap.MeshGateway.MeshConfig = nil
}
snap.MeshGateway.MeshConfigSet = true
default:
switch {
case strings.HasPrefix(u.CorrelationID, "connect-service:"):
resp, ok := u.Result.(*structs.IndexedCheckServiceNodes)
if !ok {
return fmt.Errorf("invalid type for response: %T", u.Result)
}
sn := structs.ServiceNameFromString(strings.TrimPrefix(u.CorrelationID, "connect-service:"))
if len(resp.Nodes) > 0 {
snap.MeshGateway.ServiceGroups[sn] = resp.Nodes
} else if _, ok := snap.MeshGateway.ServiceGroups[sn]; ok {
delete(snap.MeshGateway.ServiceGroups, sn)
}
case strings.HasPrefix(u.CorrelationID, "mesh-gateway:"):
resp, ok := u.Result.(*structs.IndexedNodesWithGateways)
if !ok {
return fmt.Errorf("invalid type for response: %T", u.Result)
}
key := strings.TrimPrefix(u.CorrelationID, "mesh-gateway:")
delete(snap.MeshGateway.GatewayGroups, key)
delete(snap.MeshGateway.HostnameDatacenters, key)
if len(resp.Nodes) > 0 {
snap.MeshGateway.GatewayGroups[key] = resp.Nodes
snap.MeshGateway.HostnameDatacenters[key] = hostnameEndpoints(
s.logger.Named(logging.MeshGateway),
snap.Locality,
resp.Nodes,
)
}
case strings.HasPrefix(u.CorrelationID, "discovery-chain:"):
resp, ok := u.Result.(*structs.DiscoveryChainResponse)
if !ok {
return fmt.Errorf("invalid type for response: %T", u.Result)
}
svcString := strings.TrimPrefix(u.CorrelationID, "discovery-chain:")
svc := structs.ServiceNameFromString(svcString)
if !snap.MeshGateway.IsServiceExported(svc) {
delete(snap.MeshGateway.DiscoveryChain, svc)
s.logger.Trace("discovery-chain watch fired for unknown service", "service", svc)
return nil
}
snap.MeshGateway.DiscoveryChain[svc] = resp.Chain
default:
if err := s.handleEntUpdate(meshLogger, ctx, u, snap); err != nil {
return err
}
}
}
return nil
}