Change partition for peers in discovery chain targets (#16769)

This commit swaps the partition field to the local partition for
discovery chains targeting peers. Prior to this change, peer upstreams
would always use a value of default regardless of which partition they
exist in. This caused several issues in xds / proxycfg because of id
mismatches.

Some prior fixes were made to deal with one-off id mismatches that this
PR also cleans up, since they are no longer needed.
This commit is contained in:
Derek Menteer 2023-03-24 15:40:19 -05:00 committed by GitHub
parent 77ff9265a6
commit 5be6469506
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 78 additions and 60 deletions

3
.changelog/_4832.txt Normal file
View File

@ -0,0 +1,3 @@
```release-note:bug
peering: **(Consul Enterprise only)** Fix issue where resolvers, routers, and splitters referencing peer targets may not work correctly for non-default partitions and namespaces. Enterprise customers leveraging peering are encouraged to upgrade both servers and agents to avoid this problem.
```

View File

@ -720,10 +720,21 @@ func (c *compiler) newTarget(opts structs.DiscoveryTargetOpts) *structs.Discover
} else {
// Don't allow Peer and Datacenter.
opts.Datacenter = ""
// Peer and Partition cannot both be set.
opts.Partition = acl.PartitionOrDefault("")
// Since discovery targets (for peering) are ONLY used to query the catalog, and
// not to generate the SNI it is more correct to switch this to the calling-side
// of the peering's partition as that matches where the replicated data is stored
// in the catalog. This is done to simplify the usage of peer targets in both
// the xds and proxycfg packages.
//
// The peer info data attached to service instances will have the embedded opaque
// SNI/SAN information generated by the remote side and that will have the
// OTHER partition properly specified.
opts.Partition = acl.PartitionOrDefault(c.evaluateInPartition)
// Default to "default" rather than c.evaluateInNamespace.
opts.Namespace = acl.PartitionOrDefault(opts.Namespace)
// Note that the namespace is not swapped out, because it should
// always match the value in the remote cluster (and shouldn't
// have been changed anywhere).
opts.Namespace = acl.NamespaceOrDefault(opts.Namespace)
}
t := structs.NewDiscoveryTarget(opts)

View File

@ -108,7 +108,7 @@ func TestConfigSnapshotAPIGateway(
upstreams := structs.TestUpstreams(t, false)
baseEvents = testSpliceEvents(baseEvents, setupTestVariationConfigEntriesAndSnapshot(
t, variation, upstreams, additionalEntries...,
t, variation, false, upstreams, additionalEntries...,
))
return testConfigSnapshotFixture(t, &structs.NodeService{

View File

@ -145,7 +145,7 @@ func TestConfigSnapshotDiscoveryChain(
},
},
}, setupTestVariationConfigEntriesAndSnapshot(
t, variation, upstreams, additionalEntries...,
t, variation, enterprise, upstreams, additionalEntries...,
))
return testConfigSnapshotFixture(t, &structs.NodeService{

View File

@ -88,7 +88,7 @@ func TestConfigSnapshotIngressGateway(
upstreams = structs.Upstreams{upstreams[0]} // just keep 'db'
baseEvents = testSpliceEvents(baseEvents, setupTestVariationConfigEntriesAndSnapshot(
t, variation, upstreams, additionalEntries...,
t, variation, false, upstreams, additionalEntries...,
))
}

View File

@ -8,6 +8,7 @@ import (
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/hashicorp/consul/acl"
"github.com/hashicorp/consul/agent/connect"
"github.com/hashicorp/consul/agent/consul/discoverychain"
"github.com/hashicorp/consul/agent/structs"
@ -474,6 +475,8 @@ func TestConfigSnapshotPeeredMeshGateway(t testing.T, variant string, nsFn func(
discoChains = make(map[structs.ServiceName]*structs.CompiledDiscoveryChain)
endpoints = make(map[structs.ServiceName]structs.CheckServiceNodes)
entries []structs.ConfigEntry
// This portion of the test is not currently enterprise-aware, but we need this to satisfy a function call.
entMeta = *acl.DefaultEnterpriseMeta()
)
switch variant {
@ -660,8 +663,8 @@ func TestConfigSnapshotPeeredMeshGateway(t testing.T, variant string, nsFn func(
CorrelationID: "peering-connect-service:peer-a:db",
Result: &structs.IndexedCheckServiceNodes{
Nodes: structs.CheckServiceNodes{
structs.TestCheckNodeServiceWithNameInPeer(t, "db", "dc1", "peer-a", "10.40.1.1", false),
structs.TestCheckNodeServiceWithNameInPeer(t, "db", "dc1", "peer-a", "10.40.1.2", false),
structs.TestCheckNodeServiceWithNameInPeer(t, "db", "dc1", "peer-a", "10.40.1.1", false, entMeta),
structs.TestCheckNodeServiceWithNameInPeer(t, "db", "dc1", "peer-a", "10.40.1.2", false, entMeta),
},
},
},
@ -669,8 +672,8 @@ func TestConfigSnapshotPeeredMeshGateway(t testing.T, variant string, nsFn func(
CorrelationID: "peering-connect-service:peer-b:alt",
Result: &structs.IndexedCheckServiceNodes{
Nodes: structs.CheckServiceNodes{
structs.TestCheckNodeServiceWithNameInPeer(t, "alt", "remote-dc", "peer-b", "10.40.2.1", false),
structs.TestCheckNodeServiceWithNameInPeer(t, "alt", "remote-dc", "peer-b", "10.40.2.2", true),
structs.TestCheckNodeServiceWithNameInPeer(t, "alt", "remote-dc", "peer-b", "10.40.2.1", false, entMeta),
structs.TestCheckNodeServiceWithNameInPeer(t, "alt", "remote-dc", "peer-b", "10.40.2.2", true, entMeta),
},
},
},

View File

@ -15,6 +15,7 @@ import (
func setupTestVariationConfigEntriesAndSnapshot(
t testing.T,
variation string,
enterprise bool,
upstreams structs.Upstreams,
additionalEntries ...structs.ConfigEntry,
) []UpdateEvent {
@ -24,7 +25,7 @@ func setupTestVariationConfigEntriesAndSnapshot(
dbUID = NewUpstreamID(&dbUpstream)
)
dbChain := setupTestVariationDiscoveryChain(t, variation, dbUID.EnterpriseMeta, additionalEntries...)
dbChain := setupTestVariationDiscoveryChain(t, variation, enterprise, dbUID.EnterpriseMeta, additionalEntries...)
nodes := TestUpstreamNodes(t, "db")
if variation == "register-to-terminating-gateway" {
@ -108,12 +109,14 @@ func setupTestVariationConfigEntriesAndSnapshot(
uid := UpstreamID{
Name: "db",
Peer: "cluster-01",
EnterpriseMeta: acl.NewEnterpriseMetaWithPartition(dbUID.PartitionOrDefault(), ""),
}
if enterprise {
uid.EnterpriseMeta = acl.NewEnterpriseMetaWithPartition(dbUID.PartitionOrDefault(), "ns9")
}
events = append(events, UpdateEvent{
CorrelationID: "upstream-peer:" + uid.String(),
Result: &structs.IndexedCheckServiceNodes{
Nodes: structs.CheckServiceNodes{structs.TestCheckNodeServiceWithNameInPeer(t, "db", "dc1", "cluster-01", "10.40.1.1", false)},
Nodes: structs.CheckServiceNodes{structs.TestCheckNodeServiceWithNameInPeer(t, "db", "dc2", "cluster-01", "10.40.1.1", false, uid.EnterpriseMeta)},
},
})
case "redirect-to-cluster-peer":
@ -131,12 +134,14 @@ func setupTestVariationConfigEntriesAndSnapshot(
uid := UpstreamID{
Name: "db",
Peer: "cluster-01",
EnterpriseMeta: acl.NewEnterpriseMetaWithPartition(dbUID.PartitionOrDefault(), ""),
}
if enterprise {
uid.EnterpriseMeta = acl.NewEnterpriseMetaWithPartition(dbUID.PartitionOrDefault(), "ns9")
}
events = append(events, UpdateEvent{
CorrelationID: "upstream-peer:" + uid.String(),
Result: &structs.IndexedCheckServiceNodes{
Nodes: structs.CheckServiceNodes{structs.TestCheckNodeServiceWithNameInPeer(t, "db", "dc2", "cluster-01", "10.40.1.1", false)},
Nodes: structs.CheckServiceNodes{structs.TestCheckNodeServiceWithNameInPeer(t, "db", "dc2", "cluster-01", "10.40.1.1", false, uid.EnterpriseMeta)},
},
})
case "failover-through-double-remote-gateway-triggered":
@ -256,6 +261,7 @@ func setupTestVariationConfigEntriesAndSnapshot(
func setupTestVariationDiscoveryChain(
t testing.T,
variation string,
enterprise bool,
entMeta acl.EnterpriseMeta,
additionalEntries ...structs.ConfigEntry,
) *structs.CompiledDiscoveryChain {
@ -343,6 +349,14 @@ func setupTestVariationDiscoveryChain(
},
)
case "failover-to-cluster-peer":
target := structs.ServiceResolverFailoverTarget{
Peer: "cluster-01",
}
if enterprise {
target.Namespace = "ns9"
}
entries = append(entries,
&structs.ServiceResolverConfigEntry{
Kind: structs.ServiceResolver,
@ -352,14 +366,19 @@ func setupTestVariationDiscoveryChain(
RequestTimeout: 33 * time.Second,
Failover: map[string]structs.ServiceResolverFailover{
"*": {
Targets: []structs.ServiceResolverFailoverTarget{
{Peer: "cluster-01"},
},
Targets: []structs.ServiceResolverFailoverTarget{target},
},
},
},
)
case "redirect-to-cluster-peer":
redirect := &structs.ServiceResolverRedirect{
Peer: "cluster-01",
}
if enterprise {
redirect.Namespace = "ns9"
}
entries = append(entries,
&structs.ServiceResolverConfigEntry{
Kind: structs.ServiceResolver,
@ -367,9 +386,7 @@ func setupTestVariationDiscoveryChain(
EnterpriseMeta: entMeta,
ConnectTimeout: 33 * time.Second,
RequestTimeout: 33 * time.Second,
Redirect: &structs.ServiceResolverRedirect{
Peer: "cluster-01",
},
Redirect: redirect,
},
)
case "failover-through-double-remote-gateway-triggered":

View File

@ -314,7 +314,7 @@ func (s *handlerUpstreams) resetWatchesFromChain(
}
opts := targetWatchOpts{upstreamID: uid}
opts.fromChainTarget(chain, target)
opts.fromChainTarget(target)
err := s.watchUpstreamTarget(ctx, snap, opts)
if err != nil {
@ -432,30 +432,13 @@ type targetWatchOpts struct {
entMeta *acl.EnterpriseMeta
}
func (o *targetWatchOpts) fromChainTarget(c *structs.CompiledDiscoveryChain, t *structs.DiscoveryTarget) {
func (o *targetWatchOpts) fromChainTarget(t *structs.DiscoveryTarget) {
o.chainID = t.ID
o.service = t.Service
o.filter = t.Subset.Filter
o.datacenter = t.Datacenter
o.peer = t.Peer
o.entMeta = t.GetEnterpriseMetadata()
// The peer-targets in a discovery chain intentionally clear out
// the partition field, since we don't know the remote service's partition.
// Therefore, we must query with the chain's local partition / DC, or else
// the services will not be found.
//
// Note that the namespace is not swapped out, because it should
// always match the value in the remote datacenter (and shouldn't
// have been changed anywhere).
if o.peer != "" {
o.datacenter = ""
// Clone the enterprise meta so it's not modified when we swap the partition.
var em acl.EnterpriseMeta
em.Merge(o.entMeta)
em.OverridePartition(c.Partition)
o.entMeta = &em
}
}
func (s *handlerUpstreams) watchUpstreamTarget(ctx context.Context, snap *ConfigSnapshotUpstreams, opts targetWatchOpts) error {
@ -470,9 +453,6 @@ func (s *handlerUpstreams) watchUpstreamTarget(ctx context.Context, snap *Config
if opts.peer != "" {
uid = NewUpstreamIDFromTargetID(opts.chainID)
// chainID has the partition stripped. However, when a target is in a cluster peer, the partition should be set
// to the local partition (i.e chain.Partition), since the peered target is imported into the local partition.
uid.OverridePartition(opts.entMeta.PartitionOrDefault())
correlationID = upstreamPeerWatchIDPrefix + uid.String()
}

View File

@ -317,7 +317,7 @@ func (t *DiscoveryTarget) ToDiscoveryTargetOpts() DiscoveryTargetOpts {
func ChainID(opts DiscoveryTargetOpts) string {
// NOTE: this format is similar to the SNI syntax for simplicity
if opts.Peer != "" {
return fmt.Sprintf("%s.%s.default.external.%s", opts.Service, opts.Namespace, opts.Peer)
return fmt.Sprintf("%s.%s.%s.external.%s", opts.Service, opts.Namespace, opts.Partition, opts.Peer)
}
if opts.ServiceSubset == "" {
return fmt.Sprintf("%s.%s.%s.%s", opts.Service, opts.Namespace, opts.Partition, opts.Datacenter)

View File

@ -1,6 +1,9 @@
package structs
import (
"fmt"
"github.com/hashicorp/consul/acl"
"github.com/mitchellh/go-testing-interface"
)
@ -55,7 +58,14 @@ func TestNodeServiceWithName(t testing.T, name string) *NodeService {
const peerTrustDomain = "1c053652-8512-4373-90cf-5a7f6263a994.consul"
func TestCheckNodeServiceWithNameInPeer(t testing.T, name, dc, peer, ip string, useHostname bool) CheckServiceNode {
func TestCheckNodeServiceWithNameInPeer(t testing.T, name, dc, peer, ip string, useHostname bool, remoteEntMeta acl.EnterpriseMeta) CheckServiceNode {
// Non-default partitions have a different spiffe format.
spiffe := fmt.Sprintf("spiffe://%s/ns/default/dc/%s/svc/%s", peerTrustDomain, dc, name)
if !remoteEntMeta.InDefaultPartition() {
spiffe = fmt.Sprintf("spiffe://%s/ap/%s/ns/%s/dc/%s/svc/%s",
peerTrustDomain, remoteEntMeta.PartitionOrDefault(), remoteEntMeta.NamespaceOrDefault(), dc, name)
}
service := &NodeService{
Kind: ServiceKindTypical,
Service: name,
@ -66,11 +76,10 @@ func TestCheckNodeServiceWithNameInPeer(t testing.T, name, dc, peer, ip string,
Connect: ServiceConnect{
PeerMeta: &PeeringServiceMeta{
SNI: []string{
name + ".default.default." + peer + ".external." + peerTrustDomain,
},
SpiffeID: []string{
"spiffe://" + peerTrustDomain + "/ns/default/dc/" + dc + "/svc/" + name,
fmt.Sprintf("%s.%s.%s.%s.external.%s",
name, remoteEntMeta.NamespaceOrDefault(), remoteEntMeta.PartitionOrDefault(), peer, peerTrustDomain),
},
SpiffeID: []string{spiffe},
Protocol: "tcp",
},
},

View File

@ -1957,11 +1957,6 @@ func (s *ResourceGenerator) mapDiscoChainTargets(cfgSnap *proxycfg.ConfigSnapsho
}
if targetUID.Peer != "" {
// targetID has the partition stripped, so targetUID will not have a partition either. However,
// when a failover target is in a cluster peer, the partition should be set to the local partition (i.e
// chain.Partition), since the peered failover target is imported into the local partition.
targetUID.OverridePartition(chain.Partition)
tbs, _ := upstreamsSnapshot.UpstreamPeerTrustBundles.Get(targetUID.Peer)
rootPEMs = tbs.ConcatenatedRootPEMs()

View File

@ -105,7 +105,7 @@
},
"matchSubjectAltNames": [
{
"exact": "spiffe://1c053652-8512-4373-90cf-5a7f6263a994.consul/ns/default/dc/dc1/svc/db"
"exact": "spiffe://1c053652-8512-4373-90cf-5a7f6263a994.consul/ns/default/dc/dc2/svc/db"
}
]
}

View File

@ -106,7 +106,7 @@
},
"matchSubjectAltNames": [
{
"exact": "spiffe://1c053652-8512-4373-90cf-5a7f6263a994.consul/ns/default/dc/dc1/svc/db"
"exact": "spiffe://1c053652-8512-4373-90cf-5a7f6263a994.consul/ns/default/dc/dc2/svc/db"
}
]
}