From caa5b5a5a6a772688fc7751df4e76004f2219936 Mon Sep 17 00:00:00 2001 From: Daniel Upton Date: Tue, 9 Nov 2021 16:45:36 +0000 Subject: [PATCH] xds: prefer fed state gateway definitions if they're fresher (#11522) Fixes an issue described in #10132, where if two DCs are WAN federated over mesh gateways, and the gateway in the non-primary DC is terminated and receives a new IP address (as is commonly the case when running them on ephemeral compute instances) the primary DC is unable to re-establish its connection until the agent running on its own gateway is restarted. This was happening because we always preferred gateways discovered by the `Internal.ServiceDump` RPC (which would fail because there's no way to dial the remote DC) over those discovered in the federation state, which is replicated as long as the primary DC's gateway is reachable. --- .changelog/11522.txt | 3 + agent/proxycfg/testing.go | 51 ++++++ agent/structs/testing_catalog.go | 3 + agent/xds/endpoints.go | 55 ++++++- agent/xds/endpoints_test.go | 8 + ...n-in-federation-states.envoy-1-20-x.golden | 133 ++++++++++++++++ ...n-in-federation-states.envoy-1-20-x.golden | 145 ++++++++++++++++++ 7 files changed, 392 insertions(+), 6 deletions(-) create mode 100644 .changelog/11522.txt create mode 100644 agent/xds/testdata/endpoints/mesh-gateway-newer-information-in-federation-states.envoy-1-20-x.golden create mode 100644 agent/xds/testdata/endpoints/mesh-gateway-older-information-in-federation-states.envoy-1-20-x.golden diff --git a/.changelog/11522.txt b/.changelog/11522.txt new file mode 100644 index 000000000..dc03a5e40 --- /dev/null +++ b/.changelog/11522.txt @@ -0,0 +1,3 @@ +```release-note:bug +xds: fixes a bug where replacing a mesh gateway node used for WAN federation (with another that has a different IP) could leave gateways in the other DC unable to re-establish the connection +``` diff --git a/agent/proxycfg/testing.go b/agent/proxycfg/testing.go index 33444598b..110762376 100644 --- a/agent/proxycfg/testing.go +++ b/agent/proxycfg/testing.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "io/ioutil" + "math" "path" "path/filepath" "sync" @@ -1506,6 +1507,56 @@ func TestConfigSnapshotMeshGatewayUsingFederationStates(t testing.T) *ConfigSnap return testConfigSnapshotMeshGateway(t, true, true) } +func TestConfigSnapshotMeshGatewayNewerInformationInFederationStates(t testing.T) *ConfigSnapshot { + snap := TestConfigSnapshotMeshGateway(t) + + // Create a duplicate entry in FedStateGateways, with a high ModifyIndex, to + // verify that fresh data in the federation state is preferred over stale data + // in GatewayGroups. + svc := structs.TestNodeServiceMeshGatewayWithAddrs(t, + "10.0.1.3", 8443, + structs.ServiceAddress{Address: "10.0.1.3", Port: 8443}, + structs.ServiceAddress{Address: "198.18.1.3", Port: 443}, + ) + svc.RaftIndex.ModifyIndex = math.MaxUint64 + + snap.MeshGateway.FedStateGateways = map[string]structs.CheckServiceNodes{ + "dc2": { + { + Node: snap.MeshGateway.GatewayGroups["dc2"][0].Node, + Service: svc, + }, + }, + } + + return snap +} + +func TestConfigSnapshotMeshGatewayOlderInformationInFederationStates(t testing.T) *ConfigSnapshot { + snap := TestConfigSnapshotMeshGateway(t) + + // Create a duplicate entry in FedStateGateways, with a low ModifyIndex, to + // verify that stale data in the federation state is ignored in favor of the + // fresher data in GatewayGroups. + svc := structs.TestNodeServiceMeshGatewayWithAddrs(t, + "10.0.1.3", 8443, + structs.ServiceAddress{Address: "10.0.1.3", Port: 8443}, + structs.ServiceAddress{Address: "198.18.1.3", Port: 443}, + ) + svc.RaftIndex.ModifyIndex = 0 + + snap.MeshGateway.FedStateGateways = map[string]structs.CheckServiceNodes{ + "dc2": { + { + Node: snap.MeshGateway.GatewayGroups["dc2"][0].Node, + Service: svc, + }, + }, + } + + return snap +} + func TestConfigSnapshotMeshGatewayNoServices(t testing.T) *ConfigSnapshot { return testConfigSnapshotMeshGateway(t, false, false) } diff --git a/agent/structs/testing_catalog.go b/agent/structs/testing_catalog.go index a701cbfc6..c9fcf017d 100644 --- a/agent/structs/testing_catalog.go +++ b/agent/structs/testing_catalog.go @@ -130,6 +130,9 @@ func TestNodeServiceMeshGatewayWithAddrs(t testing.T, address string, port int, TaggedAddressLAN: lanAddr, TaggedAddressWAN: wanAddr, }, + RaftIndex: RaftIndex{ + ModifyIndex: 1, + }, } } diff --git a/agent/xds/endpoints.go b/agent/xds/endpoints.go index 57c014995..e8521670c 100644 --- a/agent/xds/endpoints.go +++ b/agent/xds/endpoints.go @@ -123,13 +123,56 @@ func (s *ResourceGenerator) endpointsFromSnapshotMeshGateway(cfgSnap *proxycfg.C continue } - endpoints, ok := cfgSnap.MeshGateway.GatewayGroups[key.String()] - if !ok { - endpoints, ok = cfgSnap.MeshGateway.FedStateGateways[key.String()] - if !ok { // not possible - s.Logger.Error("skipping mesh gateway endpoints because no definition found", "datacenter", key) - continue + // Mesh gateways in remote DCs are discovered in two ways: + // + // 1. Via an Internal.ServiceDump RPC in the remote DC (GatewayGroups). + // 2. In the federation state that is replicated from the primary DC (FedStateGateways). + // + // We determine which set to use based on whichever contains the highest + // raft ModifyIndex (and is therefore most up-to-date). + // + // Previously, GatewayGroups was always given presedence over FedStateGateways + // but this was problematic when using mesh gateways for WAN federation. + // + // Consider the following example: + // + // - Primary and Secondary DCs are WAN Federated via local mesh gateways. + // + // - Secondary DC's mesh gateway is running on an ephemeral compute instance + // and is abruptly terminated and rescheduled with a *new IP address*. + // + // - Primary DC's mesh gateway is no longer able to connect to the Secondary + // DC as its proxy is configured with the old IP address. Therefore any RPC + // from the Primary to the Secondary DC will fail (including the one to + // discover the gateway's new IP address). + // + // - Secondary DC performs its regular anti-entropy of federation state data + // to the Primary DC (this succeeds as there is still connectivity in this + // direction). + // + // - At this point the Primary DC's mesh gateway should observe the new IP + // address and reconfigure its proxy, however as we always prioritised + // GatewayGroups this didn't happen and the connection remained severed. + maxModifyIndex := func(vals structs.CheckServiceNodes) uint64 { + var max uint64 + for _, v := range vals { + if i := v.Service.RaftIndex.ModifyIndex; i > max { + max = i + } } + return max + } + + endpoints := cfgSnap.MeshGateway.GatewayGroups[key.String()] + fedStateEndpoints := cfgSnap.MeshGateway.FedStateGateways[key.String()] + + if maxModifyIndex(fedStateEndpoints) > maxModifyIndex(endpoints) { + endpoints = fedStateEndpoints + } + + if len(endpoints) == 0 { + s.Logger.Error("skipping mesh gateway endpoints because no definition found", "datacenter", key) + continue } { // standard connect diff --git a/agent/xds/endpoints_test.go b/agent/xds/endpoints_test.go index e9a982be0..a9f9f0da9 100644 --- a/agent/xds/endpoints_test.go +++ b/agent/xds/endpoints_test.go @@ -245,6 +245,14 @@ func TestEndpointsFromSnapshot(t *testing.T) { create: proxycfg.TestConfigSnapshotMeshGatewayUsingFederationStates, setup: nil, }, + { + name: "mesh-gateway-newer-information-in-federation-states", + create: proxycfg.TestConfigSnapshotMeshGatewayNewerInformationInFederationStates, + }, + { + name: "mesh-gateway-older-information-in-federation-states", + create: proxycfg.TestConfigSnapshotMeshGatewayOlderInformationInFederationStates, + }, { name: "mesh-gateway-no-services", create: proxycfg.TestConfigSnapshotMeshGatewayNoServices, diff --git a/agent/xds/testdata/endpoints/mesh-gateway-newer-information-in-federation-states.envoy-1-20-x.golden b/agent/xds/testdata/endpoints/mesh-gateway-newer-information-in-federation-states.envoy-1-20-x.golden new file mode 100644 index 000000000..8c3d619ca --- /dev/null +++ b/agent/xds/testdata/endpoints/mesh-gateway-newer-information-in-federation-states.envoy-1-20-x.golden @@ -0,0 +1,133 @@ +{ + "versionInfo": "00000001", + "resources": [ + { + "@type": "type.googleapis.com/envoy.config.endpoint.v3.ClusterLoadAssignment", + "clusterName": "bar.default.dc1.internal.11111111-2222-3333-4444-555555555555.consul", + "endpoints": [ + { + "lbEndpoints": [ + { + "endpoint": { + "address": { + "socketAddress": { + "address": "172.16.1.6", + "portValue": 2222 + } + } + }, + "healthStatus": "HEALTHY", + "loadBalancingWeight": 1 + }, + { + "endpoint": { + "address": { + "socketAddress": { + "address": "172.16.1.7", + "portValue": 2222 + } + } + }, + "healthStatus": "HEALTHY", + "loadBalancingWeight": 1 + }, + { + "endpoint": { + "address": { + "socketAddress": { + "address": "172.16.1.8", + "portValue": 2222 + } + } + }, + "healthStatus": "HEALTHY", + "loadBalancingWeight": 1 + } + ] + } + ] + }, + { + "@type": "type.googleapis.com/envoy.config.endpoint.v3.ClusterLoadAssignment", + "clusterName": "dc2.internal.11111111-2222-3333-4444-555555555555.consul", + "endpoints": [ + { + "lbEndpoints": [ + { + "endpoint": { + "address": { + "socketAddress": { + "address": "198.18.1.3", + "portValue": 443 + } + } + }, + "healthStatus": "HEALTHY", + "loadBalancingWeight": 1 + } + ] + } + ] + }, + { + "@type": "type.googleapis.com/envoy.config.endpoint.v3.ClusterLoadAssignment", + "clusterName": "foo.default.dc1.internal.11111111-2222-3333-4444-555555555555.consul", + "endpoints": [ + { + "lbEndpoints": [ + { + "endpoint": { + "address": { + "socketAddress": { + "address": "172.16.1.3", + "portValue": 2222 + } + } + }, + "healthStatus": "HEALTHY", + "loadBalancingWeight": 1 + }, + { + "endpoint": { + "address": { + "socketAddress": { + "address": "172.16.1.4", + "portValue": 2222 + } + } + }, + "healthStatus": "HEALTHY", + "loadBalancingWeight": 1 + }, + { + "endpoint": { + "address": { + "socketAddress": { + "address": "172.16.1.5", + "portValue": 2222 + } + } + }, + "healthStatus": "HEALTHY", + "loadBalancingWeight": 1 + }, + { + "endpoint": { + "address": { + "socketAddress": { + "address": "172.16.1.9", + "portValue": 2222 + } + } + }, + "healthStatus": "HEALTHY", + "loadBalancingWeight": 1 + } + ] + } + ] + } + ], + "typeUrl": "type.googleapis.com/envoy.config.endpoint.v3.ClusterLoadAssignment", + "nonce": "00000001" +} \ No newline at end of file diff --git a/agent/xds/testdata/endpoints/mesh-gateway-older-information-in-federation-states.envoy-1-20-x.golden b/agent/xds/testdata/endpoints/mesh-gateway-older-information-in-federation-states.envoy-1-20-x.golden new file mode 100644 index 000000000..81f2a3eec --- /dev/null +++ b/agent/xds/testdata/endpoints/mesh-gateway-older-information-in-federation-states.envoy-1-20-x.golden @@ -0,0 +1,145 @@ +{ + "versionInfo": "00000001", + "resources": [ + { + "@type": "type.googleapis.com/envoy.config.endpoint.v3.ClusterLoadAssignment", + "clusterName": "bar.default.dc1.internal.11111111-2222-3333-4444-555555555555.consul", + "endpoints": [ + { + "lbEndpoints": [ + { + "endpoint": { + "address": { + "socketAddress": { + "address": "172.16.1.6", + "portValue": 2222 + } + } + }, + "healthStatus": "HEALTHY", + "loadBalancingWeight": 1 + }, + { + "endpoint": { + "address": { + "socketAddress": { + "address": "172.16.1.7", + "portValue": 2222 + } + } + }, + "healthStatus": "HEALTHY", + "loadBalancingWeight": 1 + }, + { + "endpoint": { + "address": { + "socketAddress": { + "address": "172.16.1.8", + "portValue": 2222 + } + } + }, + "healthStatus": "HEALTHY", + "loadBalancingWeight": 1 + } + ] + } + ] + }, + { + "@type": "type.googleapis.com/envoy.config.endpoint.v3.ClusterLoadAssignment", + "clusterName": "dc2.internal.11111111-2222-3333-4444-555555555555.consul", + "endpoints": [ + { + "lbEndpoints": [ + { + "endpoint": { + "address": { + "socketAddress": { + "address": "198.18.1.1", + "portValue": 443 + } + } + }, + "healthStatus": "HEALTHY", + "loadBalancingWeight": 1 + }, + { + "endpoint": { + "address": { + "socketAddress": { + "address": "198.18.1.2", + "portValue": 443 + } + } + }, + "healthStatus": "HEALTHY", + "loadBalancingWeight": 1 + } + ] + } + ] + }, + { + "@type": "type.googleapis.com/envoy.config.endpoint.v3.ClusterLoadAssignment", + "clusterName": "foo.default.dc1.internal.11111111-2222-3333-4444-555555555555.consul", + "endpoints": [ + { + "lbEndpoints": [ + { + "endpoint": { + "address": { + "socketAddress": { + "address": "172.16.1.3", + "portValue": 2222 + } + } + }, + "healthStatus": "HEALTHY", + "loadBalancingWeight": 1 + }, + { + "endpoint": { + "address": { + "socketAddress": { + "address": "172.16.1.4", + "portValue": 2222 + } + } + }, + "healthStatus": "HEALTHY", + "loadBalancingWeight": 1 + }, + { + "endpoint": { + "address": { + "socketAddress": { + "address": "172.16.1.5", + "portValue": 2222 + } + } + }, + "healthStatus": "HEALTHY", + "loadBalancingWeight": 1 + }, + { + "endpoint": { + "address": { + "socketAddress": { + "address": "172.16.1.9", + "portValue": 2222 + } + } + }, + "healthStatus": "HEALTHY", + "loadBalancingWeight": 1 + } + ] + } + ] + } + ], + "typeUrl": "type.googleapis.com/envoy.config.endpoint.v3.ClusterLoadAssignment", + "nonce": "00000001" +} \ No newline at end of file