From cd1157a905032b8cf10cb52897ad587e9aea9d07 Mon Sep 17 00:00:00 2001 From: Mike Palmiotto Date: Wed, 3 Aug 2022 21:44:57 -0400 Subject: [PATCH] Vault 7338/fix retry join (#16550) * storage/raft: Fix cluster init with retry_join Commit 8db66f4853abce3f432adcf1724b1f237b275415 introduced an error wherein a join() would return nil (no error) with no information on its channel if a joining node had been initialized. This was not handled properly by the caller and resulted in a canceled `retry_join`. Fix this by handling the `nil` channel respone by treating it as an error and allowing the existing mechanics to work as intended. * storage/raft: Improve retry_join go test * storage/raft: Make VerifyRaftPeers pollable * storage/raft: Add changelog entry for retry_join fix * storage/raft: Add description to VerifyRaftPeers --- changelog/16550.txt | 3 + helper/testhelpers/testhelpers.go | 11 ++- vault/external_tests/raft/raft_test.go | 75 ++++++++++++--------- vault/external_tests/raftha/raft_ha_test.go | 17 +++-- vault/raft.go | 3 + 5 files changed, 71 insertions(+), 38 deletions(-) create mode 100644 changelog/16550.txt diff --git a/changelog/16550.txt b/changelog/16550.txt new file mode 100644 index 000000000..a1df8bd00 --- /dev/null +++ b/changelog/16550.txt @@ -0,0 +1,3 @@ +```release-note:bug +storage/raft: Fix retry_join initialization failure +``` diff --git a/helper/testhelpers/testhelpers.go b/helper/testhelpers/testhelpers.go index c1b1397c3..08e3017e3 100644 --- a/helper/testhelpers/testhelpers.go +++ b/helper/testhelpers/testhelpers.go @@ -623,7 +623,12 @@ func GenerateDebugLogs(t testing.T, client *api.Client) chan struct{} { return stopCh } -func VerifyRaftPeers(t testing.T, client *api.Client, expected map[string]bool) { +// VerifyRaftPeers verifies that the raft configuration contains a given set of peers. +// The `expected` contains a map of expected peers. Existing entries are deleted +// from the map by removing entries whose keys are in the raft configuration. +// Remaining entries result in an error return so that the caller can poll for +// an expected configuration. +func VerifyRaftPeers(t testing.T, client *api.Client, expected map[string]bool) error { t.Helper() resp, err := client.Logical().Read("sys/storage/raft/configuration") @@ -655,8 +660,10 @@ func VerifyRaftPeers(t testing.T, client *api.Client, expected map[string]bool) // If the collection is non-empty, it means that the peer was not found in // the response. if len(expected) != 0 { - t.Fatalf("failed to read configuration successfully, expected peers not found in configuration list: %v", expected) + return fmt.Errorf("failed to read configuration successfully, expected peers not found in configuration list: %v", expected) } + + return nil } func TestMetricSinkProvider(gaugeInterval time.Duration) func(string) (*metricsutil.ClusterMetricSink, *metricsutil.MetricsHelper) { diff --git a/vault/external_tests/raft/raft_test.go b/vault/external_tests/raft/raft_test.go index 345f53f20..8f90ec208 100644 --- a/vault/external_tests/raft/raft_test.go +++ b/vault/external_tests/raft/raft_test.go @@ -184,9 +184,12 @@ func TestRaft_RetryAutoJoin(t *testing.T) { require.NoError(t, err) } - testhelpers.VerifyRaftPeers(t, cluster.Cores[0].Client, map[string]bool{ + err := testhelpers.VerifyRaftPeers(t, cluster.Cores[0].Client, map[string]bool{ "core-0": true, }) + if err != nil { + t.Fatal(err) + } } func TestRaft_Retry_Join(t *testing.T) { @@ -208,8 +211,6 @@ func TestRaft_Retry_Join(t *testing.T) { { testhelpers.EnsureCoreSealed(t, leaderCore) leaderCore.UnderlyingRawStorage.(*raft.RaftBackend).SetServerAddressProvider(addressProvider) - cluster.UnsealCore(t, leaderCore) - vault.TestWaitActive(t, leaderCore.Core) } leaderInfos := []*raft.LeaderJoinInfo{ @@ -220,36 +221,37 @@ func TestRaft_Retry_Join(t *testing.T) { }, } - { - core := cluster.Cores[1] - core.UnderlyingRawStorage.(*raft.RaftBackend).SetServerAddressProvider(addressProvider) - _, err := core.JoinRaftCluster(namespace.RootContext(context.Background()), leaderInfos, false) - if err != nil { - t.Fatal(err) - } + var wg sync.WaitGroup + for _, clusterCore := range cluster.Cores[1:] { + wg.Add(1) + go func(t *testing.T, core *vault.TestClusterCore) { + t.Helper() + defer wg.Done() + core.UnderlyingRawStorage.(*raft.RaftBackend).SetServerAddressProvider(addressProvider) + _, err := core.JoinRaftCluster(namespace.RootContext(context.Background()), leaderInfos, false) + if err != nil { + t.Error(err) + } - time.Sleep(2 * time.Second) - - cluster.UnsealCore(t, core) + // Handle potential racy behavior with unseals. Retry the unseal until it succeeds. + vault.RetryUntil(t, 10*time.Second, func() error { + return cluster.AttemptUnsealCore(core) + }) + }(t, clusterCore) } - { - core := cluster.Cores[2] - core.UnderlyingRawStorage.(*raft.RaftBackend).SetServerAddressProvider(addressProvider) - _, err := core.JoinRaftCluster(namespace.RootContext(context.Background()), leaderInfos, false) - if err != nil { - t.Fatal(err) - } + // Unseal the leader and wait for the other cores to unseal + cluster.UnsealCore(t, leaderCore) + wg.Wait() - time.Sleep(2 * time.Second) + vault.TestWaitActive(t, leaderCore.Core) - cluster.UnsealCore(t, core) - } - - testhelpers.VerifyRaftPeers(t, cluster.Cores[0].Client, map[string]bool{ - "core-0": true, - "core-1": true, - "core-2": true, + vault.RetryUntil(t, 10*time.Second, func() error { + return testhelpers.VerifyRaftPeers(t, cluster.Cores[0].Client, map[string]bool{ + "core-0": true, + "core-1": true, + "core-2": true, + }) }) } @@ -329,23 +331,29 @@ func TestRaft_RemovePeer(t *testing.T) { client := cluster.Cores[0].Client - testhelpers.VerifyRaftPeers(t, client, map[string]bool{ + err := testhelpers.VerifyRaftPeers(t, client, map[string]bool{ "core-0": true, "core-1": true, "core-2": true, }) + if err != nil { + t.Fatal(err) + } - _, err := client.Logical().Write("sys/storage/raft/remove-peer", map[string]interface{}{ + _, err = client.Logical().Write("sys/storage/raft/remove-peer", map[string]interface{}{ "server_id": "core-2", }) if err != nil { t.Fatal(err) } - testhelpers.VerifyRaftPeers(t, client, map[string]bool{ + err = testhelpers.VerifyRaftPeers(t, client, map[string]bool{ "core-0": true, "core-1": true, }) + if err != nil { + t.Fatal(err) + } _, err = client.Logical().Write("sys/storage/raft/remove-peer", map[string]interface{}{ "server_id": "core-1", @@ -354,9 +362,12 @@ func TestRaft_RemovePeer(t *testing.T) { t.Fatal(err) } - testhelpers.VerifyRaftPeers(t, client, map[string]bool{ + err = testhelpers.VerifyRaftPeers(t, client, map[string]bool{ "core-0": true, }) + if err != nil { + t.Fatal(err) + } } func TestRaft_NodeIDHeader(t *testing.T) { diff --git a/vault/external_tests/raftha/raft_ha_test.go b/vault/external_tests/raftha/raft_ha_test.go index e650b6bbb..0b33b0f89 100644 --- a/vault/external_tests/raftha/raft_ha_test.go +++ b/vault/external_tests/raftha/raft_ha_test.go @@ -100,14 +100,17 @@ func testRaftHANewCluster(t *testing.T, bundler teststorage.PhysicalBackendBundl // Ensure peers are added leaderClient := cluster.Cores[0].Client - testhelpers.VerifyRaftPeers(t, leaderClient, map[string]bool{ + err := testhelpers.VerifyRaftPeers(t, leaderClient, map[string]bool{ "core-0": true, "core-1": true, "core-2": true, }) + if err != nil { + t.Fatal(err) + } // Test remove peers - _, err := leaderClient.Logical().Write("sys/storage/raft/remove-peer", map[string]interface{}{ + _, err = leaderClient.Logical().Write("sys/storage/raft/remove-peer", map[string]interface{}{ "server_id": "core-1", }) if err != nil { @@ -122,9 +125,12 @@ func testRaftHANewCluster(t *testing.T, bundler teststorage.PhysicalBackendBundl } // Ensure peers are removed - testhelpers.VerifyRaftPeers(t, leaderClient, map[string]bool{ + err = testhelpers.VerifyRaftPeers(t, leaderClient, map[string]bool{ "core-0": true, }) + if err != nil { + t.Fatal(err) + } } func TestRaft_HA_ExistingCluster(t *testing.T) { @@ -233,11 +239,14 @@ func TestRaft_HA_ExistingCluster(t *testing.T) { joinFunc(cluster.Cores[2].Client) // Ensure peers are added - testhelpers.VerifyRaftPeers(t, leaderClient, map[string]bool{ + err := testhelpers.VerifyRaftPeers(t, leaderClient, map[string]bool{ "core-0": true, "core-1": true, "core-2": true, }) + if err != nil { + t.Fatal(err) + } } updateCluster(t) diff --git a/vault/raft.go b/vault/raft.go index 39af0d8e6..46cf87398 100644 --- a/vault/raft.go +++ b/vault/raft.go @@ -1015,6 +1015,9 @@ func (c *Core) JoinRaftCluster(ctx context.Context, leaderInfos []*raft.LeaderJo if err == nil { return nil } + } else { + // Return an error so we can retry_join + err = fmt.Errorf("failed to get raft challenge") } } return err