xds: prefer fed state gateway definitions if they're fresher (#11522)
Fixes an issue described in #10132, where if two DCs are WAN federated over mesh gateways, and the gateway in the non-primary DC is terminated and receives a new IP address (as is commonly the case when running them on ephemeral compute instances) the primary DC is unable to re-establish its connection until the agent running on its own gateway is restarted. This was happening because we always preferred gateways discovered by the `Internal.ServiceDump` RPC (which would fail because there's no way to dial the remote DC) over those discovered in the federation state, which is replicated as long as the primary DC's gateway is reachable.
This commit is contained in:
parent
0ad360fadf
commit
caa5b5a5a6
|
@ -0,0 +1,3 @@
|
|||
```release-note:bug
|
||||
xds: fixes a bug where replacing a mesh gateway node used for WAN federation (with another that has a different IP) could leave gateways in the other DC unable to re-establish the connection
|
||||
```
|
|
@ -4,6 +4,7 @@ import (
|
|||
"context"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"math"
|
||||
"path"
|
||||
"path/filepath"
|
||||
"sync"
|
||||
|
@ -1506,6 +1507,56 @@ func TestConfigSnapshotMeshGatewayUsingFederationStates(t testing.T) *ConfigSnap
|
|||
return testConfigSnapshotMeshGateway(t, true, true)
|
||||
}
|
||||
|
||||
func TestConfigSnapshotMeshGatewayNewerInformationInFederationStates(t testing.T) *ConfigSnapshot {
|
||||
snap := TestConfigSnapshotMeshGateway(t)
|
||||
|
||||
// Create a duplicate entry in FedStateGateways, with a high ModifyIndex, to
|
||||
// verify that fresh data in the federation state is preferred over stale data
|
||||
// in GatewayGroups.
|
||||
svc := structs.TestNodeServiceMeshGatewayWithAddrs(t,
|
||||
"10.0.1.3", 8443,
|
||||
structs.ServiceAddress{Address: "10.0.1.3", Port: 8443},
|
||||
structs.ServiceAddress{Address: "198.18.1.3", Port: 443},
|
||||
)
|
||||
svc.RaftIndex.ModifyIndex = math.MaxUint64
|
||||
|
||||
snap.MeshGateway.FedStateGateways = map[string]structs.CheckServiceNodes{
|
||||
"dc2": {
|
||||
{
|
||||
Node: snap.MeshGateway.GatewayGroups["dc2"][0].Node,
|
||||
Service: svc,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
return snap
|
||||
}
|
||||
|
||||
func TestConfigSnapshotMeshGatewayOlderInformationInFederationStates(t testing.T) *ConfigSnapshot {
|
||||
snap := TestConfigSnapshotMeshGateway(t)
|
||||
|
||||
// Create a duplicate entry in FedStateGateways, with a low ModifyIndex, to
|
||||
// verify that stale data in the federation state is ignored in favor of the
|
||||
// fresher data in GatewayGroups.
|
||||
svc := structs.TestNodeServiceMeshGatewayWithAddrs(t,
|
||||
"10.0.1.3", 8443,
|
||||
structs.ServiceAddress{Address: "10.0.1.3", Port: 8443},
|
||||
structs.ServiceAddress{Address: "198.18.1.3", Port: 443},
|
||||
)
|
||||
svc.RaftIndex.ModifyIndex = 0
|
||||
|
||||
snap.MeshGateway.FedStateGateways = map[string]structs.CheckServiceNodes{
|
||||
"dc2": {
|
||||
{
|
||||
Node: snap.MeshGateway.GatewayGroups["dc2"][0].Node,
|
||||
Service: svc,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
return snap
|
||||
}
|
||||
|
||||
func TestConfigSnapshotMeshGatewayNoServices(t testing.T) *ConfigSnapshot {
|
||||
return testConfigSnapshotMeshGateway(t, false, false)
|
||||
}
|
||||
|
|
|
@ -130,6 +130,9 @@ func TestNodeServiceMeshGatewayWithAddrs(t testing.T, address string, port int,
|
|||
TaggedAddressLAN: lanAddr,
|
||||
TaggedAddressWAN: wanAddr,
|
||||
},
|
||||
RaftIndex: RaftIndex{
|
||||
ModifyIndex: 1,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -123,13 +123,56 @@ func (s *ResourceGenerator) endpointsFromSnapshotMeshGateway(cfgSnap *proxycfg.C
|
|||
continue
|
||||
}
|
||||
|
||||
endpoints, ok := cfgSnap.MeshGateway.GatewayGroups[key.String()]
|
||||
if !ok {
|
||||
endpoints, ok = cfgSnap.MeshGateway.FedStateGateways[key.String()]
|
||||
if !ok { // not possible
|
||||
s.Logger.Error("skipping mesh gateway endpoints because no definition found", "datacenter", key)
|
||||
continue
|
||||
// Mesh gateways in remote DCs are discovered in two ways:
|
||||
//
|
||||
// 1. Via an Internal.ServiceDump RPC in the remote DC (GatewayGroups).
|
||||
// 2. In the federation state that is replicated from the primary DC (FedStateGateways).
|
||||
//
|
||||
// We determine which set to use based on whichever contains the highest
|
||||
// raft ModifyIndex (and is therefore most up-to-date).
|
||||
//
|
||||
// Previously, GatewayGroups was always given presedence over FedStateGateways
|
||||
// but this was problematic when using mesh gateways for WAN federation.
|
||||
//
|
||||
// Consider the following example:
|
||||
//
|
||||
// - Primary and Secondary DCs are WAN Federated via local mesh gateways.
|
||||
//
|
||||
// - Secondary DC's mesh gateway is running on an ephemeral compute instance
|
||||
// and is abruptly terminated and rescheduled with a *new IP address*.
|
||||
//
|
||||
// - Primary DC's mesh gateway is no longer able to connect to the Secondary
|
||||
// DC as its proxy is configured with the old IP address. Therefore any RPC
|
||||
// from the Primary to the Secondary DC will fail (including the one to
|
||||
// discover the gateway's new IP address).
|
||||
//
|
||||
// - Secondary DC performs its regular anti-entropy of federation state data
|
||||
// to the Primary DC (this succeeds as there is still connectivity in this
|
||||
// direction).
|
||||
//
|
||||
// - At this point the Primary DC's mesh gateway should observe the new IP
|
||||
// address and reconfigure its proxy, however as we always prioritised
|
||||
// GatewayGroups this didn't happen and the connection remained severed.
|
||||
maxModifyIndex := func(vals structs.CheckServiceNodes) uint64 {
|
||||
var max uint64
|
||||
for _, v := range vals {
|
||||
if i := v.Service.RaftIndex.ModifyIndex; i > max {
|
||||
max = i
|
||||
}
|
||||
}
|
||||
return max
|
||||
}
|
||||
|
||||
endpoints := cfgSnap.MeshGateway.GatewayGroups[key.String()]
|
||||
fedStateEndpoints := cfgSnap.MeshGateway.FedStateGateways[key.String()]
|
||||
|
||||
if maxModifyIndex(fedStateEndpoints) > maxModifyIndex(endpoints) {
|
||||
endpoints = fedStateEndpoints
|
||||
}
|
||||
|
||||
if len(endpoints) == 0 {
|
||||
s.Logger.Error("skipping mesh gateway endpoints because no definition found", "datacenter", key)
|
||||
continue
|
||||
}
|
||||
|
||||
{ // standard connect
|
||||
|
|
|
@ -245,6 +245,14 @@ func TestEndpointsFromSnapshot(t *testing.T) {
|
|||
create: proxycfg.TestConfigSnapshotMeshGatewayUsingFederationStates,
|
||||
setup: nil,
|
||||
},
|
||||
{
|
||||
name: "mesh-gateway-newer-information-in-federation-states",
|
||||
create: proxycfg.TestConfigSnapshotMeshGatewayNewerInformationInFederationStates,
|
||||
},
|
||||
{
|
||||
name: "mesh-gateway-older-information-in-federation-states",
|
||||
create: proxycfg.TestConfigSnapshotMeshGatewayOlderInformationInFederationStates,
|
||||
},
|
||||
{
|
||||
name: "mesh-gateway-no-services",
|
||||
create: proxycfg.TestConfigSnapshotMeshGatewayNoServices,
|
||||
|
|
133
agent/xds/testdata/endpoints/mesh-gateway-newer-information-in-federation-states.envoy-1-20-x.golden
vendored
Normal file
133
agent/xds/testdata/endpoints/mesh-gateway-newer-information-in-federation-states.envoy-1-20-x.golden
vendored
Normal file
|
@ -0,0 +1,133 @@
|
|||
{
|
||||
"versionInfo": "00000001",
|
||||
"resources": [
|
||||
{
|
||||
"@type": "type.googleapis.com/envoy.config.endpoint.v3.ClusterLoadAssignment",
|
||||
"clusterName": "bar.default.dc1.internal.11111111-2222-3333-4444-555555555555.consul",
|
||||
"endpoints": [
|
||||
{
|
||||
"lbEndpoints": [
|
||||
{
|
||||
"endpoint": {
|
||||
"address": {
|
||||
"socketAddress": {
|
||||
"address": "172.16.1.6",
|
||||
"portValue": 2222
|
||||
}
|
||||
}
|
||||
},
|
||||
"healthStatus": "HEALTHY",
|
||||
"loadBalancingWeight": 1
|
||||
},
|
||||
{
|
||||
"endpoint": {
|
||||
"address": {
|
||||
"socketAddress": {
|
||||
"address": "172.16.1.7",
|
||||
"portValue": 2222
|
||||
}
|
||||
}
|
||||
},
|
||||
"healthStatus": "HEALTHY",
|
||||
"loadBalancingWeight": 1
|
||||
},
|
||||
{
|
||||
"endpoint": {
|
||||
"address": {
|
||||
"socketAddress": {
|
||||
"address": "172.16.1.8",
|
||||
"portValue": 2222
|
||||
}
|
||||
}
|
||||
},
|
||||
"healthStatus": "HEALTHY",
|
||||
"loadBalancingWeight": 1
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"@type": "type.googleapis.com/envoy.config.endpoint.v3.ClusterLoadAssignment",
|
||||
"clusterName": "dc2.internal.11111111-2222-3333-4444-555555555555.consul",
|
||||
"endpoints": [
|
||||
{
|
||||
"lbEndpoints": [
|
||||
{
|
||||
"endpoint": {
|
||||
"address": {
|
||||
"socketAddress": {
|
||||
"address": "198.18.1.3",
|
||||
"portValue": 443
|
||||
}
|
||||
}
|
||||
},
|
||||
"healthStatus": "HEALTHY",
|
||||
"loadBalancingWeight": 1
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"@type": "type.googleapis.com/envoy.config.endpoint.v3.ClusterLoadAssignment",
|
||||
"clusterName": "foo.default.dc1.internal.11111111-2222-3333-4444-555555555555.consul",
|
||||
"endpoints": [
|
||||
{
|
||||
"lbEndpoints": [
|
||||
{
|
||||
"endpoint": {
|
||||
"address": {
|
||||
"socketAddress": {
|
||||
"address": "172.16.1.3",
|
||||
"portValue": 2222
|
||||
}
|
||||
}
|
||||
},
|
||||
"healthStatus": "HEALTHY",
|
||||
"loadBalancingWeight": 1
|
||||
},
|
||||
{
|
||||
"endpoint": {
|
||||
"address": {
|
||||
"socketAddress": {
|
||||
"address": "172.16.1.4",
|
||||
"portValue": 2222
|
||||
}
|
||||
}
|
||||
},
|
||||
"healthStatus": "HEALTHY",
|
||||
"loadBalancingWeight": 1
|
||||
},
|
||||
{
|
||||
"endpoint": {
|
||||
"address": {
|
||||
"socketAddress": {
|
||||
"address": "172.16.1.5",
|
||||
"portValue": 2222
|
||||
}
|
||||
}
|
||||
},
|
||||
"healthStatus": "HEALTHY",
|
||||
"loadBalancingWeight": 1
|
||||
},
|
||||
{
|
||||
"endpoint": {
|
||||
"address": {
|
||||
"socketAddress": {
|
||||
"address": "172.16.1.9",
|
||||
"portValue": 2222
|
||||
}
|
||||
}
|
||||
},
|
||||
"healthStatus": "HEALTHY",
|
||||
"loadBalancingWeight": 1
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"typeUrl": "type.googleapis.com/envoy.config.endpoint.v3.ClusterLoadAssignment",
|
||||
"nonce": "00000001"
|
||||
}
|
145
agent/xds/testdata/endpoints/mesh-gateway-older-information-in-federation-states.envoy-1-20-x.golden
vendored
Normal file
145
agent/xds/testdata/endpoints/mesh-gateway-older-information-in-federation-states.envoy-1-20-x.golden
vendored
Normal file
|
@ -0,0 +1,145 @@
|
|||
{
|
||||
"versionInfo": "00000001",
|
||||
"resources": [
|
||||
{
|
||||
"@type": "type.googleapis.com/envoy.config.endpoint.v3.ClusterLoadAssignment",
|
||||
"clusterName": "bar.default.dc1.internal.11111111-2222-3333-4444-555555555555.consul",
|
||||
"endpoints": [
|
||||
{
|
||||
"lbEndpoints": [
|
||||
{
|
||||
"endpoint": {
|
||||
"address": {
|
||||
"socketAddress": {
|
||||
"address": "172.16.1.6",
|
||||
"portValue": 2222
|
||||
}
|
||||
}
|
||||
},
|
||||
"healthStatus": "HEALTHY",
|
||||
"loadBalancingWeight": 1
|
||||
},
|
||||
{
|
||||
"endpoint": {
|
||||
"address": {
|
||||
"socketAddress": {
|
||||
"address": "172.16.1.7",
|
||||
"portValue": 2222
|
||||
}
|
||||
}
|
||||
},
|
||||
"healthStatus": "HEALTHY",
|
||||
"loadBalancingWeight": 1
|
||||
},
|
||||
{
|
||||
"endpoint": {
|
||||
"address": {
|
||||
"socketAddress": {
|
||||
"address": "172.16.1.8",
|
||||
"portValue": 2222
|
||||
}
|
||||
}
|
||||
},
|
||||
"healthStatus": "HEALTHY",
|
||||
"loadBalancingWeight": 1
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"@type": "type.googleapis.com/envoy.config.endpoint.v3.ClusterLoadAssignment",
|
||||
"clusterName": "dc2.internal.11111111-2222-3333-4444-555555555555.consul",
|
||||
"endpoints": [
|
||||
{
|
||||
"lbEndpoints": [
|
||||
{
|
||||
"endpoint": {
|
||||
"address": {
|
||||
"socketAddress": {
|
||||
"address": "198.18.1.1",
|
||||
"portValue": 443
|
||||
}
|
||||
}
|
||||
},
|
||||
"healthStatus": "HEALTHY",
|
||||
"loadBalancingWeight": 1
|
||||
},
|
||||
{
|
||||
"endpoint": {
|
||||
"address": {
|
||||
"socketAddress": {
|
||||
"address": "198.18.1.2",
|
||||
"portValue": 443
|
||||
}
|
||||
}
|
||||
},
|
||||
"healthStatus": "HEALTHY",
|
||||
"loadBalancingWeight": 1
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"@type": "type.googleapis.com/envoy.config.endpoint.v3.ClusterLoadAssignment",
|
||||
"clusterName": "foo.default.dc1.internal.11111111-2222-3333-4444-555555555555.consul",
|
||||
"endpoints": [
|
||||
{
|
||||
"lbEndpoints": [
|
||||
{
|
||||
"endpoint": {
|
||||
"address": {
|
||||
"socketAddress": {
|
||||
"address": "172.16.1.3",
|
||||
"portValue": 2222
|
||||
}
|
||||
}
|
||||
},
|
||||
"healthStatus": "HEALTHY",
|
||||
"loadBalancingWeight": 1
|
||||
},
|
||||
{
|
||||
"endpoint": {
|
||||
"address": {
|
||||
"socketAddress": {
|
||||
"address": "172.16.1.4",
|
||||
"portValue": 2222
|
||||
}
|
||||
}
|
||||
},
|
||||
"healthStatus": "HEALTHY",
|
||||
"loadBalancingWeight": 1
|
||||
},
|
||||
{
|
||||
"endpoint": {
|
||||
"address": {
|
||||
"socketAddress": {
|
||||
"address": "172.16.1.5",
|
||||
"portValue": 2222
|
||||
}
|
||||
}
|
||||
},
|
||||
"healthStatus": "HEALTHY",
|
||||
"loadBalancingWeight": 1
|
||||
},
|
||||
{
|
||||
"endpoint": {
|
||||
"address": {
|
||||
"socketAddress": {
|
||||
"address": "172.16.1.9",
|
||||
"portValue": 2222
|
||||
}
|
||||
}
|
||||
},
|
||||
"healthStatus": "HEALTHY",
|
||||
"loadBalancingWeight": 1
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"typeUrl": "type.googleapis.com/envoy.config.endpoint.v3.ClusterLoadAssignment",
|
||||
"nonce": "00000001"
|
||||
}
|
Loading…
Reference in New Issue