diff --git a/.changelog/11522.txt b/.changelog/11522.txt new file mode 100644 index 000000000..dc03a5e40 --- /dev/null +++ b/.changelog/11522.txt @@ -0,0 +1,3 @@ +```release-note:bug +xds: fixes a bug where replacing a mesh gateway node used for WAN federation (with another that has a different IP) could leave gateways in the other DC unable to re-establish the connection +``` diff --git a/agent/proxycfg/testing.go b/agent/proxycfg/testing.go index 33444598b..110762376 100644 --- a/agent/proxycfg/testing.go +++ b/agent/proxycfg/testing.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "io/ioutil" + "math" "path" "path/filepath" "sync" @@ -1506,6 +1507,56 @@ func TestConfigSnapshotMeshGatewayUsingFederationStates(t testing.T) *ConfigSnap return testConfigSnapshotMeshGateway(t, true, true) } +func TestConfigSnapshotMeshGatewayNewerInformationInFederationStates(t testing.T) *ConfigSnapshot { + snap := TestConfigSnapshotMeshGateway(t) + + // Create a duplicate entry in FedStateGateways, with a high ModifyIndex, to + // verify that fresh data in the federation state is preferred over stale data + // in GatewayGroups. + svc := structs.TestNodeServiceMeshGatewayWithAddrs(t, + "10.0.1.3", 8443, + structs.ServiceAddress{Address: "10.0.1.3", Port: 8443}, + structs.ServiceAddress{Address: "198.18.1.3", Port: 443}, + ) + svc.RaftIndex.ModifyIndex = math.MaxUint64 + + snap.MeshGateway.FedStateGateways = map[string]structs.CheckServiceNodes{ + "dc2": { + { + Node: snap.MeshGateway.GatewayGroups["dc2"][0].Node, + Service: svc, + }, + }, + } + + return snap +} + +func TestConfigSnapshotMeshGatewayOlderInformationInFederationStates(t testing.T) *ConfigSnapshot { + snap := TestConfigSnapshotMeshGateway(t) + + // Create a duplicate entry in FedStateGateways, with a low ModifyIndex, to + // verify that stale data in the federation state is ignored in favor of the + // fresher data in GatewayGroups. + svc := structs.TestNodeServiceMeshGatewayWithAddrs(t, + "10.0.1.3", 8443, + structs.ServiceAddress{Address: "10.0.1.3", Port: 8443}, + structs.ServiceAddress{Address: "198.18.1.3", Port: 443}, + ) + svc.RaftIndex.ModifyIndex = 0 + + snap.MeshGateway.FedStateGateways = map[string]structs.CheckServiceNodes{ + "dc2": { + { + Node: snap.MeshGateway.GatewayGroups["dc2"][0].Node, + Service: svc, + }, + }, + } + + return snap +} + func TestConfigSnapshotMeshGatewayNoServices(t testing.T) *ConfigSnapshot { return testConfigSnapshotMeshGateway(t, false, false) } diff --git a/agent/structs/testing_catalog.go b/agent/structs/testing_catalog.go index a701cbfc6..c9fcf017d 100644 --- a/agent/structs/testing_catalog.go +++ b/agent/structs/testing_catalog.go @@ -130,6 +130,9 @@ func TestNodeServiceMeshGatewayWithAddrs(t testing.T, address string, port int, TaggedAddressLAN: lanAddr, TaggedAddressWAN: wanAddr, }, + RaftIndex: RaftIndex{ + ModifyIndex: 1, + }, } } diff --git a/agent/xds/endpoints.go b/agent/xds/endpoints.go index 57c014995..e8521670c 100644 --- a/agent/xds/endpoints.go +++ b/agent/xds/endpoints.go @@ -123,13 +123,56 @@ func (s *ResourceGenerator) endpointsFromSnapshotMeshGateway(cfgSnap *proxycfg.C continue } - endpoints, ok := cfgSnap.MeshGateway.GatewayGroups[key.String()] - if !ok { - endpoints, ok = cfgSnap.MeshGateway.FedStateGateways[key.String()] - if !ok { // not possible - s.Logger.Error("skipping mesh gateway endpoints because no definition found", "datacenter", key) - continue + // Mesh gateways in remote DCs are discovered in two ways: + // + // 1. Via an Internal.ServiceDump RPC in the remote DC (GatewayGroups). + // 2. In the federation state that is replicated from the primary DC (FedStateGateways). + // + // We determine which set to use based on whichever contains the highest + // raft ModifyIndex (and is therefore most up-to-date). + // + // Previously, GatewayGroups was always given presedence over FedStateGateways + // but this was problematic when using mesh gateways for WAN federation. + // + // Consider the following example: + // + // - Primary and Secondary DCs are WAN Federated via local mesh gateways. + // + // - Secondary DC's mesh gateway is running on an ephemeral compute instance + // and is abruptly terminated and rescheduled with a *new IP address*. + // + // - Primary DC's mesh gateway is no longer able to connect to the Secondary + // DC as its proxy is configured with the old IP address. Therefore any RPC + // from the Primary to the Secondary DC will fail (including the one to + // discover the gateway's new IP address). + // + // - Secondary DC performs its regular anti-entropy of federation state data + // to the Primary DC (this succeeds as there is still connectivity in this + // direction). + // + // - At this point the Primary DC's mesh gateway should observe the new IP + // address and reconfigure its proxy, however as we always prioritised + // GatewayGroups this didn't happen and the connection remained severed. + maxModifyIndex := func(vals structs.CheckServiceNodes) uint64 { + var max uint64 + for _, v := range vals { + if i := v.Service.RaftIndex.ModifyIndex; i > max { + max = i + } } + return max + } + + endpoints := cfgSnap.MeshGateway.GatewayGroups[key.String()] + fedStateEndpoints := cfgSnap.MeshGateway.FedStateGateways[key.String()] + + if maxModifyIndex(fedStateEndpoints) > maxModifyIndex(endpoints) { + endpoints = fedStateEndpoints + } + + if len(endpoints) == 0 { + s.Logger.Error("skipping mesh gateway endpoints because no definition found", "datacenter", key) + continue } { // standard connect diff --git a/agent/xds/endpoints_test.go b/agent/xds/endpoints_test.go index e9a982be0..a9f9f0da9 100644 --- a/agent/xds/endpoints_test.go +++ b/agent/xds/endpoints_test.go @@ -245,6 +245,14 @@ func TestEndpointsFromSnapshot(t *testing.T) { create: proxycfg.TestConfigSnapshotMeshGatewayUsingFederationStates, setup: nil, }, + { + name: "mesh-gateway-newer-information-in-federation-states", + create: proxycfg.TestConfigSnapshotMeshGatewayNewerInformationInFederationStates, + }, + { + name: "mesh-gateway-older-information-in-federation-states", + create: proxycfg.TestConfigSnapshotMeshGatewayOlderInformationInFederationStates, + }, { name: "mesh-gateway-no-services", create: proxycfg.TestConfigSnapshotMeshGatewayNoServices, diff --git a/agent/xds/testdata/endpoints/mesh-gateway-newer-information-in-federation-states.envoy-1-20-x.golden b/agent/xds/testdata/endpoints/mesh-gateway-newer-information-in-federation-states.envoy-1-20-x.golden new file mode 100644 index 000000000..8c3d619ca --- /dev/null +++ b/agent/xds/testdata/endpoints/mesh-gateway-newer-information-in-federation-states.envoy-1-20-x.golden @@ -0,0 +1,133 @@ +{ + "versionInfo": "00000001", + "resources": [ + { + "@type": "type.googleapis.com/envoy.config.endpoint.v3.ClusterLoadAssignment", + "clusterName": "bar.default.dc1.internal.11111111-2222-3333-4444-555555555555.consul", + "endpoints": [ + { + "lbEndpoints": [ + { + "endpoint": { + "address": { + "socketAddress": { + "address": "172.16.1.6", + "portValue": 2222 + } + } + }, + "healthStatus": "HEALTHY", + "loadBalancingWeight": 1 + }, + { + "endpoint": { + "address": { + "socketAddress": { + "address": "172.16.1.7", + "portValue": 2222 + } + } + }, + "healthStatus": "HEALTHY", + "loadBalancingWeight": 1 + }, + { + "endpoint": { + "address": { + "socketAddress": { + "address": "172.16.1.8", + "portValue": 2222 + } + } + }, + "healthStatus": "HEALTHY", + "loadBalancingWeight": 1 + } + ] + } + ] + }, + { + "@type": "type.googleapis.com/envoy.config.endpoint.v3.ClusterLoadAssignment", + "clusterName": "dc2.internal.11111111-2222-3333-4444-555555555555.consul", + "endpoints": [ + { + "lbEndpoints": [ + { + "endpoint": { + "address": { + "socketAddress": { + "address": "198.18.1.3", + "portValue": 443 + } + } + }, + "healthStatus": "HEALTHY", + "loadBalancingWeight": 1 + } + ] + } + ] + }, + { + "@type": "type.googleapis.com/envoy.config.endpoint.v3.ClusterLoadAssignment", + "clusterName": "foo.default.dc1.internal.11111111-2222-3333-4444-555555555555.consul", + "endpoints": [ + { + "lbEndpoints": [ + { + "endpoint": { + "address": { + "socketAddress": { + "address": "172.16.1.3", + "portValue": 2222 + } + } + }, + "healthStatus": "HEALTHY", + "loadBalancingWeight": 1 + }, + { + "endpoint": { + "address": { + "socketAddress": { + "address": "172.16.1.4", + "portValue": 2222 + } + } + }, + "healthStatus": "HEALTHY", + "loadBalancingWeight": 1 + }, + { + "endpoint": { + "address": { + "socketAddress": { + "address": "172.16.1.5", + "portValue": 2222 + } + } + }, + "healthStatus": "HEALTHY", + "loadBalancingWeight": 1 + }, + { + "endpoint": { + "address": { + "socketAddress": { + "address": "172.16.1.9", + "portValue": 2222 + } + } + }, + "healthStatus": "HEALTHY", + "loadBalancingWeight": 1 + } + ] + } + ] + } + ], + "typeUrl": "type.googleapis.com/envoy.config.endpoint.v3.ClusterLoadAssignment", + "nonce": "00000001" +} \ No newline at end of file diff --git a/agent/xds/testdata/endpoints/mesh-gateway-older-information-in-federation-states.envoy-1-20-x.golden b/agent/xds/testdata/endpoints/mesh-gateway-older-information-in-federation-states.envoy-1-20-x.golden new file mode 100644 index 000000000..81f2a3eec --- /dev/null +++ b/agent/xds/testdata/endpoints/mesh-gateway-older-information-in-federation-states.envoy-1-20-x.golden @@ -0,0 +1,145 @@ +{ + "versionInfo": "00000001", + "resources": [ + { + "@type": "type.googleapis.com/envoy.config.endpoint.v3.ClusterLoadAssignment", + "clusterName": "bar.default.dc1.internal.11111111-2222-3333-4444-555555555555.consul", + "endpoints": [ + { + "lbEndpoints": [ + { + "endpoint": { + "address": { + "socketAddress": { + "address": "172.16.1.6", + "portValue": 2222 + } + } + }, + "healthStatus": "HEALTHY", + "loadBalancingWeight": 1 + }, + { + "endpoint": { + "address": { + "socketAddress": { + "address": "172.16.1.7", + "portValue": 2222 + } + } + }, + "healthStatus": "HEALTHY", + "loadBalancingWeight": 1 + }, + { + "endpoint": { + "address": { + "socketAddress": { + "address": "172.16.1.8", + "portValue": 2222 + } + } + }, + "healthStatus": "HEALTHY", + "loadBalancingWeight": 1 + } + ] + } + ] + }, + { + "@type": "type.googleapis.com/envoy.config.endpoint.v3.ClusterLoadAssignment", + "clusterName": "dc2.internal.11111111-2222-3333-4444-555555555555.consul", + "endpoints": [ + { + "lbEndpoints": [ + { + "endpoint": { + "address": { + "socketAddress": { + "address": "198.18.1.1", + "portValue": 443 + } + } + }, + "healthStatus": "HEALTHY", + "loadBalancingWeight": 1 + }, + { + "endpoint": { + "address": { + "socketAddress": { + "address": "198.18.1.2", + "portValue": 443 + } + } + }, + "healthStatus": "HEALTHY", + "loadBalancingWeight": 1 + } + ] + } + ] + }, + { + "@type": "type.googleapis.com/envoy.config.endpoint.v3.ClusterLoadAssignment", + "clusterName": "foo.default.dc1.internal.11111111-2222-3333-4444-555555555555.consul", + "endpoints": [ + { + "lbEndpoints": [ + { + "endpoint": { + "address": { + "socketAddress": { + "address": "172.16.1.3", + "portValue": 2222 + } + } + }, + "healthStatus": "HEALTHY", + "loadBalancingWeight": 1 + }, + { + "endpoint": { + "address": { + "socketAddress": { + "address": "172.16.1.4", + "portValue": 2222 + } + } + }, + "healthStatus": "HEALTHY", + "loadBalancingWeight": 1 + }, + { + "endpoint": { + "address": { + "socketAddress": { + "address": "172.16.1.5", + "portValue": 2222 + } + } + }, + "healthStatus": "HEALTHY", + "loadBalancingWeight": 1 + }, + { + "endpoint": { + "address": { + "socketAddress": { + "address": "172.16.1.9", + "portValue": 2222 + } + } + }, + "healthStatus": "HEALTHY", + "loadBalancingWeight": 1 + } + ] + } + ] + } + ], + "typeUrl": "type.googleapis.com/envoy.config.endpoint.v3.ClusterLoadAssignment", + "nonce": "00000001" +} \ No newline at end of file