From ae28875d5546d8e4930f8a4160cc261649963d77 Mon Sep 17 00:00:00 2001 From: Derek Menteer <105233703+hashi-derek@users.noreply.github.com> Date: Wed, 31 May 2023 15:40:06 -0500 Subject: [PATCH] Fix tproxy failover issue with sameness groups (#17533) Sameness groups with default-for-failover enabled did not function properly with tproxy whenever all instances of the service disappeared from the local cluster. This occured, because there were no corresponding resolvers (due to the implicit failover policy) which caused VIPs to be deallocated. This ticket expands upon the VIP allocations so that both service-defaults and service-intentions (without destination wildcards) will ensure that the virtual IP exists. --- agent/consul/fsm/snapshot_test.go | 46 ++++++++++++++++++++-------- agent/consul/state/catalog.go | 15 ++++++--- agent/consul/state/config_entry.go | 41 ++++++++++++++++++++----- agent/consul/state/intention.go | 2 +- agent/consul/state/intention_test.go | 8 +++-- 5 files changed, 83 insertions(+), 29 deletions(-) diff --git a/agent/consul/fsm/snapshot_test.go b/agent/consul/fsm/snapshot_test.go index a846812be..a34e1f3e3 100644 --- a/agent/consul/fsm/snapshot_test.go +++ b/agent/consul/fsm/snapshot_test.go @@ -44,6 +44,8 @@ func TestFSM_SnapshotRestore_OSS(t *testing.T) { StorageBackend: storageBackend, }) + fsm.state.SystemMetadataSet(10, &structs.SystemMetadataEntry{Key: structs.SystemMetadataVirtualIPsEnabled, Value: "true"}) + // Add some state node1 := &structs.Node{ ID: "610918a6-464f-fa9b-1a95-03bd6e88ed92", @@ -79,8 +81,14 @@ func TestFSM_SnapshotRestore_OSS(t *testing.T) { Connect: connectConf, }) + psn := structs.PeeredServiceName{ServiceName: structs.NewServiceName("web", nil)} + vip, err := fsm.state.VirtualIPForService(psn) + require.NoError(t, err) + require.Equal(t, vip, "240.0.0.1") + fsm.state.EnsureService(4, "foo", &structs.NodeService{ID: "db", Service: "db", Tags: []string{"primary"}, Address: "127.0.0.1", Port: 5000}) fsm.state.EnsureService(5, "baz", &structs.NodeService{ID: "web", Service: "web", Tags: nil, Address: "127.0.0.2", Port: 80}) + fsm.state.EnsureService(6, "baz", &structs.NodeService{ID: "db", Service: "db", Tags: []string{"secondary"}, Address: "127.0.0.2", Port: 5000}) fsm.state.EnsureCheck(7, &structs.HealthCheck{ Node: "foo", @@ -442,6 +450,10 @@ func TestFSM_SnapshotRestore_OSS(t *testing.T) { }, } require.NoError(t, fsm.state.EnsureConfigEntry(26, serviceIxn)) + psn = structs.PeeredServiceName{ServiceName: structs.NewServiceName("foo", nil)} + vip, err = fsm.state.VirtualIPForService(psn) + require.NoError(t, err) + require.Equal(t, vip, "240.0.0.2") // mesh config entry meshConfig := &structs.MeshConfigEntry{ @@ -465,10 +477,10 @@ func TestFSM_SnapshotRestore_OSS(t *testing.T) { Port: 8000, Connect: connectConf, }) - psn := structs.PeeredServiceName{ServiceName: structs.NewServiceName("frontend", nil)} - vip, err := fsm.state.VirtualIPForService(psn) + psn = structs.PeeredServiceName{ServiceName: structs.NewServiceName("frontend", nil)} + vip, err = fsm.state.VirtualIPForService(psn) require.NoError(t, err) - require.Equal(t, vip, "240.0.0.1") + require.Equal(t, vip, "240.0.0.3") fsm.state.EnsureService(30, "foo", &structs.NodeService{ ID: "backend", @@ -480,7 +492,7 @@ func TestFSM_SnapshotRestore_OSS(t *testing.T) { psn = structs.PeeredServiceName{ServiceName: structs.NewServiceName("backend", nil)} vip, err = fsm.state.VirtualIPForService(psn) require.NoError(t, err) - require.Equal(t, vip, "240.0.0.2") + require.Equal(t, vip, "240.0.0.4") _, serviceNames, err := fsm.state.ServiceNamesOfKind(nil, structs.ServiceKindTypical) require.NoError(t, err) @@ -534,15 +546,15 @@ func TestFSM_SnapshotRestore_OSS(t *testing.T) { }, })) - // Add a service-resolver entry to get a virtual IP for service foo + // Add a service-resolver entry to get a virtual IP for service goo resolverEntry := &structs.ServiceResolverConfigEntry{ Kind: structs.ServiceResolver, - Name: "foo", + Name: "goo", } require.NoError(t, fsm.state.EnsureConfigEntry(34, resolverEntry)) - vip, err = fsm.state.VirtualIPForService(structs.PeeredServiceName{ServiceName: structs.NewServiceName("foo", nil)}) + vip, err = fsm.state.VirtualIPForService(structs.PeeredServiceName{ServiceName: structs.NewServiceName("goo", nil)}) require.NoError(t, err) - require.Equal(t, vip, "240.0.0.3") + require.Equal(t, vip, "240.0.0.5") // Resources resource, err := storageBackend.WriteCAS(context.Background(), &pbresource.Resource{ @@ -665,18 +677,26 @@ func TestFSM_SnapshotRestore_OSS(t *testing.T) { require.Equal(t, uint64(25), checks[0].ModifyIndex) // Verify virtual IPs are consistent. - psn = structs.PeeredServiceName{ServiceName: structs.NewServiceName("frontend", nil)} + psn = structs.PeeredServiceName{ServiceName: structs.NewServiceName("web", nil)} vip, err = fsm2.state.VirtualIPForService(psn) require.NoError(t, err) require.Equal(t, vip, "240.0.0.1") - psn = structs.PeeredServiceName{ServiceName: structs.NewServiceName("backend", nil)} - vip, err = fsm2.state.VirtualIPForService(psn) - require.NoError(t, err) - require.Equal(t, vip, "240.0.0.2") psn = structs.PeeredServiceName{ServiceName: structs.NewServiceName("foo", nil)} vip, err = fsm2.state.VirtualIPForService(psn) require.NoError(t, err) + require.Equal(t, vip, "240.0.0.2") + psn = structs.PeeredServiceName{ServiceName: structs.NewServiceName("frontend", nil)} + vip, err = fsm2.state.VirtualIPForService(psn) + require.NoError(t, err) require.Equal(t, vip, "240.0.0.3") + psn = structs.PeeredServiceName{ServiceName: structs.NewServiceName("backend", nil)} + vip, err = fsm2.state.VirtualIPForService(psn) + require.NoError(t, err) + require.Equal(t, vip, "240.0.0.4") + psn = structs.PeeredServiceName{ServiceName: structs.NewServiceName("goo", nil)} + vip, err = fsm2.state.VirtualIPForService(psn) + require.NoError(t, err) + require.Equal(t, vip, "240.0.0.5") // Verify key is set _, d, err := fsm2.state.KVSGet(nil, "/test", nil) diff --git a/agent/consul/state/catalog.go b/agent/consul/state/catalog.go index 083c3af80..74efc3229 100644 --- a/agent/consul/state/catalog.go +++ b/agent/consul/state/catalog.go @@ -915,7 +915,7 @@ func ensureServiceTxn(tx WriteTxn, idx uint64, node string, preserveIndexes bool if err != nil { return err } - if supported { + if supported && sn.Name != "" { psn := structs.PeeredServiceName{Peer: svc.PeerName, ServiceName: sn} vip, err := assignServiceVirtualIP(tx, idx, psn) if err != nil { @@ -2110,7 +2110,13 @@ func freeServiceVirtualIP( // Don't deregister the virtual IP if at least one resolver/router/splitter config entry still // references this service. - configEntryVIPKinds := []string{structs.ServiceResolver, structs.ServiceRouter, structs.ServiceSplitter} + configEntryVIPKinds := []string{ + structs.ServiceResolver, + structs.ServiceRouter, + structs.ServiceSplitter, + structs.ServiceDefaults, + structs.ServiceIntentions, + } for _, kind := range configEntryVIPKinds { _, entry, err := configEntryTxn(tx, nil, kind, psn.ServiceName.Name, &psn.ServiceName.EnterpriseMeta) if err != nil { @@ -3051,14 +3057,15 @@ func (s *Store) ServiceVirtualIPs() (uint64, []ServiceVirtualIP, error) { tx := s.db.Txn(false) defer tx.Abort() - return servicesVirtualIPsTxn(tx) + return servicesVirtualIPsTxn(tx, nil) } -func servicesVirtualIPsTxn(tx ReadTxn) (uint64, []ServiceVirtualIP, error) { +func servicesVirtualIPsTxn(tx ReadTxn, ws memdb.WatchSet) (uint64, []ServiceVirtualIP, error) { iter, err := tx.Get(tableServiceVirtualIPs, indexID) if err != nil { return 0, nil, err } + ws.Add(iter.WatchCh()) var vips []ServiceVirtualIP for raw := iter.Next(); raw != nil; raw = iter.Next() { diff --git a/agent/consul/state/config_entry.go b/agent/consul/state/config_entry.go index fd1d9cbcc..340a53f11 100644 --- a/agent/consul/state/config_entry.go +++ b/agent/consul/state/config_entry.go @@ -6,6 +6,7 @@ package state import ( "errors" "fmt" + "strings" memdb "github.com/hashicorp/go-memdb" "github.com/hashicorp/go-multierror" @@ -465,9 +466,8 @@ func deleteConfigEntryTxn(tx WriteTxn, idx uint64, kind, name string, entMeta *a return fmt.Errorf("failed updating index: %s", err) } - // If this is a resolver/router/splitter, attempt to delete the virtual IP associated - // with this service. - if kind == structs.ServiceResolver || kind == structs.ServiceRouter || kind == structs.ServiceSplitter { + // Attempt to delete the virtual IP associated with this service, if applicable. + if configEntryHasVirtualIP(c) { psn := structs.PeeredServiceName{ServiceName: sn} if err := freeServiceVirtualIP(tx, idx, psn, nil); err != nil { return fmt.Errorf("failed to clean up virtual IP for %q: %v", psn.String(), err) @@ -519,11 +519,14 @@ func insertConfigEntryWithTxn(tx WriteTxn, idx uint64, conf structs.ConfigEntry) if err != nil { return err } - case structs.ServiceResolver: - fallthrough - case structs.ServiceRouter: - fallthrough - case structs.ServiceSplitter: + } + + // Assign virtual-ips, if needed + supported, err := virtualIPsSupported(tx, nil) + if err != nil { + return err + } + if supported && configEntryHasVirtualIP(conf) { psn := structs.PeeredServiceName{ServiceName: structs.NewServiceName(conf.GetName(), conf.GetEnterpriseMeta())} if _, err := assignServiceVirtualIP(tx, idx, psn); err != nil { return err @@ -541,6 +544,28 @@ func insertConfigEntryWithTxn(tx WriteTxn, idx uint64, conf structs.ConfigEntry) return nil } +func configEntryHasVirtualIP(c structs.ConfigEntry) bool { + if c == nil || c.GetName() == "" { + return false + } + switch c.GetKind() { + case structs.ServiceRouter: + return true + case structs.ServiceResolver: + return true + case structs.ServiceSplitter: + return true + case structs.ServiceDefaults: + return true + case structs.ServiceIntentions: + entMeta := c.GetEnterpriseMeta() + return !strings.Contains(c.GetName(), "*") && + !strings.Contains(entMeta.NamespaceOrDefault(), "*") && + !strings.Contains(entMeta.PartitionOrDefault(), "*") + } + return false +} + // validateProposedConfigEntryInGraph can be used to verify graph integrity for // a proposed graph create/update/delete. // diff --git a/agent/consul/state/intention.go b/agent/consul/state/intention.go index 212b7ba03..4341590e4 100644 --- a/agent/consul/state/intention.go +++ b/agent/consul/state/intention.go @@ -1106,7 +1106,7 @@ func (s *Store) intentionTopologyTxn( // We only need to do this for upstreams currently, so that tproxy can find which discovery chains should be // contacted for failover scenarios. Virtual services technically don't need to be considered as downstreams, // because they will take on the identity of the calling service, rather than the chain itself. - vipIndex, vipServices, err := servicesVirtualIPsTxn(tx) + vipIndex, vipServices, err := servicesVirtualIPsTxn(tx, ws) if err != nil { return index, nil, fmt.Errorf("failed to list service virtual IPs: %v", err) } diff --git a/agent/consul/state/intention_test.go b/agent/consul/state/intention_test.go index 34343b145..3545527b7 100644 --- a/agent/consul/state/intention_test.go +++ b/agent/consul/state/intention_test.go @@ -2097,6 +2097,7 @@ func disableLegacyIntentions(s *Store) error { func testConfigStateStore(t *testing.T) *Store { s := testStateStore(t) + s.SystemMetadataSet(5, &structs.SystemMetadataEntry{Key: structs.SystemMetadataVirtualIPsEnabled, Value: "true"}) disableLegacyIntentions(s) return s } @@ -2651,6 +2652,7 @@ func TestStore_IntentionTopology_Destination(t *testing.T) { func TestStore_IntentionTopology_Watches(t *testing.T) { s := testConfigStateStore(t) + s.SystemMetadataSet(10, &structs.SystemMetadataEntry{Key: structs.SystemMetadataVirtualIPsEnabled, Value: "true"}) var i uint64 = 1 require.NoError(t, s.EnsureNode(i, &structs.Node{ @@ -2687,7 +2689,8 @@ func TestStore_IntentionTopology_Watches(t *testing.T) { index, got, err = s.IntentionTopology(ws, target, false, acl.Deny, structs.IntentionTargetService) require.NoError(t, err) require.Equal(t, uint64(2), index) - require.Empty(t, got) + // Because API is a virtual service, it is included in this output. + require.Equal(t, structs.ServiceList{structs.NewServiceName("api", nil)}, got) // Watch should not fire after unrelated intention changes require.NoError(t, s.EnsureConfigEntry(i, &structs.ServiceIntentionsConfigEntry{ @@ -2701,7 +2704,6 @@ func TestStore_IntentionTopology_Watches(t *testing.T) { }, })) i++ - // TODO(freddy) Why is this firing? // require.False(t, watchFired(ws)) @@ -2709,7 +2711,7 @@ func TestStore_IntentionTopology_Watches(t *testing.T) { index, got, err = s.IntentionTopology(ws, target, false, acl.Deny, structs.IntentionTargetService) require.NoError(t, err) require.Equal(t, uint64(3), index) - require.Empty(t, got) + require.Equal(t, structs.ServiceList{structs.NewServiceName("api", nil)}, got) // Watch should fire after service list changes require.NoError(t, s.EnsureService(i, "foo", &structs.NodeService{