diff --git a/changelog/16550.txt b/changelog/16550.txt new file mode 100644 index 000000000..a1df8bd00 --- /dev/null +++ b/changelog/16550.txt @@ -0,0 +1,3 @@ +```release-note:bug +storage/raft: Fix retry_join initialization failure +``` diff --git a/helper/testhelpers/testhelpers.go b/helper/testhelpers/testhelpers.go index c1b1397c3..08e3017e3 100644 --- a/helper/testhelpers/testhelpers.go +++ b/helper/testhelpers/testhelpers.go @@ -623,7 +623,12 @@ func GenerateDebugLogs(t testing.T, client *api.Client) chan struct{} { return stopCh } -func VerifyRaftPeers(t testing.T, client *api.Client, expected map[string]bool) { +// VerifyRaftPeers verifies that the raft configuration contains a given set of peers. +// The `expected` contains a map of expected peers. Existing entries are deleted +// from the map by removing entries whose keys are in the raft configuration. +// Remaining entries result in an error return so that the caller can poll for +// an expected configuration. +func VerifyRaftPeers(t testing.T, client *api.Client, expected map[string]bool) error { t.Helper() resp, err := client.Logical().Read("sys/storage/raft/configuration") @@ -655,8 +660,10 @@ func VerifyRaftPeers(t testing.T, client *api.Client, expected map[string]bool) // If the collection is non-empty, it means that the peer was not found in // the response. if len(expected) != 0 { - t.Fatalf("failed to read configuration successfully, expected peers not found in configuration list: %v", expected) + return fmt.Errorf("failed to read configuration successfully, expected peers not found in configuration list: %v", expected) } + + return nil } func TestMetricSinkProvider(gaugeInterval time.Duration) func(string) (*metricsutil.ClusterMetricSink, *metricsutil.MetricsHelper) { diff --git a/vault/external_tests/raft/raft_test.go b/vault/external_tests/raft/raft_test.go index 345f53f20..8f90ec208 100644 --- a/vault/external_tests/raft/raft_test.go +++ b/vault/external_tests/raft/raft_test.go @@ -184,9 +184,12 @@ func TestRaft_RetryAutoJoin(t *testing.T) { require.NoError(t, err) } - testhelpers.VerifyRaftPeers(t, cluster.Cores[0].Client, map[string]bool{ + err := testhelpers.VerifyRaftPeers(t, cluster.Cores[0].Client, map[string]bool{ "core-0": true, }) + if err != nil { + t.Fatal(err) + } } func TestRaft_Retry_Join(t *testing.T) { @@ -208,8 +211,6 @@ func TestRaft_Retry_Join(t *testing.T) { { testhelpers.EnsureCoreSealed(t, leaderCore) leaderCore.UnderlyingRawStorage.(*raft.RaftBackend).SetServerAddressProvider(addressProvider) - cluster.UnsealCore(t, leaderCore) - vault.TestWaitActive(t, leaderCore.Core) } leaderInfos := []*raft.LeaderJoinInfo{ @@ -220,36 +221,37 @@ func TestRaft_Retry_Join(t *testing.T) { }, } - { - core := cluster.Cores[1] - core.UnderlyingRawStorage.(*raft.RaftBackend).SetServerAddressProvider(addressProvider) - _, err := core.JoinRaftCluster(namespace.RootContext(context.Background()), leaderInfos, false) - if err != nil { - t.Fatal(err) - } + var wg sync.WaitGroup + for _, clusterCore := range cluster.Cores[1:] { + wg.Add(1) + go func(t *testing.T, core *vault.TestClusterCore) { + t.Helper() + defer wg.Done() + core.UnderlyingRawStorage.(*raft.RaftBackend).SetServerAddressProvider(addressProvider) + _, err := core.JoinRaftCluster(namespace.RootContext(context.Background()), leaderInfos, false) + if err != nil { + t.Error(err) + } - time.Sleep(2 * time.Second) - - cluster.UnsealCore(t, core) + // Handle potential racy behavior with unseals. Retry the unseal until it succeeds. + vault.RetryUntil(t, 10*time.Second, func() error { + return cluster.AttemptUnsealCore(core) + }) + }(t, clusterCore) } - { - core := cluster.Cores[2] - core.UnderlyingRawStorage.(*raft.RaftBackend).SetServerAddressProvider(addressProvider) - _, err := core.JoinRaftCluster(namespace.RootContext(context.Background()), leaderInfos, false) - if err != nil { - t.Fatal(err) - } + // Unseal the leader and wait for the other cores to unseal + cluster.UnsealCore(t, leaderCore) + wg.Wait() - time.Sleep(2 * time.Second) + vault.TestWaitActive(t, leaderCore.Core) - cluster.UnsealCore(t, core) - } - - testhelpers.VerifyRaftPeers(t, cluster.Cores[0].Client, map[string]bool{ - "core-0": true, - "core-1": true, - "core-2": true, + vault.RetryUntil(t, 10*time.Second, func() error { + return testhelpers.VerifyRaftPeers(t, cluster.Cores[0].Client, map[string]bool{ + "core-0": true, + "core-1": true, + "core-2": true, + }) }) } @@ -329,23 +331,29 @@ func TestRaft_RemovePeer(t *testing.T) { client := cluster.Cores[0].Client - testhelpers.VerifyRaftPeers(t, client, map[string]bool{ + err := testhelpers.VerifyRaftPeers(t, client, map[string]bool{ "core-0": true, "core-1": true, "core-2": true, }) + if err != nil { + t.Fatal(err) + } - _, err := client.Logical().Write("sys/storage/raft/remove-peer", map[string]interface{}{ + _, err = client.Logical().Write("sys/storage/raft/remove-peer", map[string]interface{}{ "server_id": "core-2", }) if err != nil { t.Fatal(err) } - testhelpers.VerifyRaftPeers(t, client, map[string]bool{ + err = testhelpers.VerifyRaftPeers(t, client, map[string]bool{ "core-0": true, "core-1": true, }) + if err != nil { + t.Fatal(err) + } _, err = client.Logical().Write("sys/storage/raft/remove-peer", map[string]interface{}{ "server_id": "core-1", @@ -354,9 +362,12 @@ func TestRaft_RemovePeer(t *testing.T) { t.Fatal(err) } - testhelpers.VerifyRaftPeers(t, client, map[string]bool{ + err = testhelpers.VerifyRaftPeers(t, client, map[string]bool{ "core-0": true, }) + if err != nil { + t.Fatal(err) + } } func TestRaft_NodeIDHeader(t *testing.T) { diff --git a/vault/external_tests/raftha/raft_ha_test.go b/vault/external_tests/raftha/raft_ha_test.go index e650b6bbb..0b33b0f89 100644 --- a/vault/external_tests/raftha/raft_ha_test.go +++ b/vault/external_tests/raftha/raft_ha_test.go @@ -100,14 +100,17 @@ func testRaftHANewCluster(t *testing.T, bundler teststorage.PhysicalBackendBundl // Ensure peers are added leaderClient := cluster.Cores[0].Client - testhelpers.VerifyRaftPeers(t, leaderClient, map[string]bool{ + err := testhelpers.VerifyRaftPeers(t, leaderClient, map[string]bool{ "core-0": true, "core-1": true, "core-2": true, }) + if err != nil { + t.Fatal(err) + } // Test remove peers - _, err := leaderClient.Logical().Write("sys/storage/raft/remove-peer", map[string]interface{}{ + _, err = leaderClient.Logical().Write("sys/storage/raft/remove-peer", map[string]interface{}{ "server_id": "core-1", }) if err != nil { @@ -122,9 +125,12 @@ func testRaftHANewCluster(t *testing.T, bundler teststorage.PhysicalBackendBundl } // Ensure peers are removed - testhelpers.VerifyRaftPeers(t, leaderClient, map[string]bool{ + err = testhelpers.VerifyRaftPeers(t, leaderClient, map[string]bool{ "core-0": true, }) + if err != nil { + t.Fatal(err) + } } func TestRaft_HA_ExistingCluster(t *testing.T) { @@ -233,11 +239,14 @@ func TestRaft_HA_ExistingCluster(t *testing.T) { joinFunc(cluster.Cores[2].Client) // Ensure peers are added - testhelpers.VerifyRaftPeers(t, leaderClient, map[string]bool{ + err := testhelpers.VerifyRaftPeers(t, leaderClient, map[string]bool{ "core-0": true, "core-1": true, "core-2": true, }) + if err != nil { + t.Fatal(err) + } } updateCluster(t) diff --git a/vault/raft.go b/vault/raft.go index 39af0d8e6..46cf87398 100644 --- a/vault/raft.go +++ b/vault/raft.go @@ -1015,6 +1015,9 @@ func (c *Core) JoinRaftCluster(ctx context.Context, leaderInfos []*raft.LeaderJo if err == nil { return nil } + } else { + // Return an error so we can retry_join + err = fmt.Errorf("failed to get raft challenge") } } return err