Vault 7338/fix retry join (#16550)

* storage/raft: Fix cluster init with retry_join

Commit 8db66f4853abce3f432adcf1724b1f237b275415 introduced an error
wherein a join() would return nil (no error) with no information on its
channel if a joining node had been initialized. This was not handled
properly by the caller and resulted in a canceled `retry_join`.

Fix this by handling the `nil` channel respone by treating it as an
error and allowing the existing mechanics to work as intended.

* storage/raft: Improve retry_join go test

* storage/raft: Make VerifyRaftPeers pollable

* storage/raft: Add changelog entry for retry_join fix

* storage/raft: Add description to VerifyRaftPeers
This commit is contained in:
Mike Palmiotto 2022-08-03 21:44:57 -04:00 committed by GitHub
parent 42900b554b
commit cd1157a905
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 71 additions and 38 deletions

3
changelog/16550.txt Normal file
View File

@ -0,0 +1,3 @@
```release-note:bug
storage/raft: Fix retry_join initialization failure
```

View File

@ -623,7 +623,12 @@ func GenerateDebugLogs(t testing.T, client *api.Client) chan struct{} {
return stopCh return stopCh
} }
func VerifyRaftPeers(t testing.T, client *api.Client, expected map[string]bool) { // VerifyRaftPeers verifies that the raft configuration contains a given set of peers.
// The `expected` contains a map of expected peers. Existing entries are deleted
// from the map by removing entries whose keys are in the raft configuration.
// Remaining entries result in an error return so that the caller can poll for
// an expected configuration.
func VerifyRaftPeers(t testing.T, client *api.Client, expected map[string]bool) error {
t.Helper() t.Helper()
resp, err := client.Logical().Read("sys/storage/raft/configuration") resp, err := client.Logical().Read("sys/storage/raft/configuration")
@ -655,8 +660,10 @@ func VerifyRaftPeers(t testing.T, client *api.Client, expected map[string]bool)
// If the collection is non-empty, it means that the peer was not found in // If the collection is non-empty, it means that the peer was not found in
// the response. // the response.
if len(expected) != 0 { if len(expected) != 0 {
t.Fatalf("failed to read configuration successfully, expected peers not found in configuration list: %v", expected) return fmt.Errorf("failed to read configuration successfully, expected peers not found in configuration list: %v", expected)
} }
return nil
} }
func TestMetricSinkProvider(gaugeInterval time.Duration) func(string) (*metricsutil.ClusterMetricSink, *metricsutil.MetricsHelper) { func TestMetricSinkProvider(gaugeInterval time.Duration) func(string) (*metricsutil.ClusterMetricSink, *metricsutil.MetricsHelper) {

View File

@ -184,9 +184,12 @@ func TestRaft_RetryAutoJoin(t *testing.T) {
require.NoError(t, err) require.NoError(t, err)
} }
testhelpers.VerifyRaftPeers(t, cluster.Cores[0].Client, map[string]bool{ err := testhelpers.VerifyRaftPeers(t, cluster.Cores[0].Client, map[string]bool{
"core-0": true, "core-0": true,
}) })
if err != nil {
t.Fatal(err)
}
} }
func TestRaft_Retry_Join(t *testing.T) { func TestRaft_Retry_Join(t *testing.T) {
@ -208,8 +211,6 @@ func TestRaft_Retry_Join(t *testing.T) {
{ {
testhelpers.EnsureCoreSealed(t, leaderCore) testhelpers.EnsureCoreSealed(t, leaderCore)
leaderCore.UnderlyingRawStorage.(*raft.RaftBackend).SetServerAddressProvider(addressProvider) leaderCore.UnderlyingRawStorage.(*raft.RaftBackend).SetServerAddressProvider(addressProvider)
cluster.UnsealCore(t, leaderCore)
vault.TestWaitActive(t, leaderCore.Core)
} }
leaderInfos := []*raft.LeaderJoinInfo{ leaderInfos := []*raft.LeaderJoinInfo{
@ -220,36 +221,37 @@ func TestRaft_Retry_Join(t *testing.T) {
}, },
} }
{ var wg sync.WaitGroup
core := cluster.Cores[1] for _, clusterCore := range cluster.Cores[1:] {
core.UnderlyingRawStorage.(*raft.RaftBackend).SetServerAddressProvider(addressProvider) wg.Add(1)
_, err := core.JoinRaftCluster(namespace.RootContext(context.Background()), leaderInfos, false) go func(t *testing.T, core *vault.TestClusterCore) {
if err != nil { t.Helper()
t.Fatal(err) defer wg.Done()
} core.UnderlyingRawStorage.(*raft.RaftBackend).SetServerAddressProvider(addressProvider)
_, err := core.JoinRaftCluster(namespace.RootContext(context.Background()), leaderInfos, false)
if err != nil {
t.Error(err)
}
time.Sleep(2 * time.Second) // Handle potential racy behavior with unseals. Retry the unseal until it succeeds.
vault.RetryUntil(t, 10*time.Second, func() error {
cluster.UnsealCore(t, core) return cluster.AttemptUnsealCore(core)
})
}(t, clusterCore)
} }
{ // Unseal the leader and wait for the other cores to unseal
core := cluster.Cores[2] cluster.UnsealCore(t, leaderCore)
core.UnderlyingRawStorage.(*raft.RaftBackend).SetServerAddressProvider(addressProvider) wg.Wait()
_, err := core.JoinRaftCluster(namespace.RootContext(context.Background()), leaderInfos, false)
if err != nil {
t.Fatal(err)
}
time.Sleep(2 * time.Second) vault.TestWaitActive(t, leaderCore.Core)
cluster.UnsealCore(t, core) vault.RetryUntil(t, 10*time.Second, func() error {
} return testhelpers.VerifyRaftPeers(t, cluster.Cores[0].Client, map[string]bool{
"core-0": true,
testhelpers.VerifyRaftPeers(t, cluster.Cores[0].Client, map[string]bool{ "core-1": true,
"core-0": true, "core-2": true,
"core-1": true, })
"core-2": true,
}) })
} }
@ -329,23 +331,29 @@ func TestRaft_RemovePeer(t *testing.T) {
client := cluster.Cores[0].Client client := cluster.Cores[0].Client
testhelpers.VerifyRaftPeers(t, client, map[string]bool{ err := testhelpers.VerifyRaftPeers(t, client, map[string]bool{
"core-0": true, "core-0": true,
"core-1": true, "core-1": true,
"core-2": true, "core-2": true,
}) })
if err != nil {
t.Fatal(err)
}
_, err := client.Logical().Write("sys/storage/raft/remove-peer", map[string]interface{}{ _, err = client.Logical().Write("sys/storage/raft/remove-peer", map[string]interface{}{
"server_id": "core-2", "server_id": "core-2",
}) })
if err != nil { if err != nil {
t.Fatal(err) t.Fatal(err)
} }
testhelpers.VerifyRaftPeers(t, client, map[string]bool{ err = testhelpers.VerifyRaftPeers(t, client, map[string]bool{
"core-0": true, "core-0": true,
"core-1": true, "core-1": true,
}) })
if err != nil {
t.Fatal(err)
}
_, err = client.Logical().Write("sys/storage/raft/remove-peer", map[string]interface{}{ _, err = client.Logical().Write("sys/storage/raft/remove-peer", map[string]interface{}{
"server_id": "core-1", "server_id": "core-1",
@ -354,9 +362,12 @@ func TestRaft_RemovePeer(t *testing.T) {
t.Fatal(err) t.Fatal(err)
} }
testhelpers.VerifyRaftPeers(t, client, map[string]bool{ err = testhelpers.VerifyRaftPeers(t, client, map[string]bool{
"core-0": true, "core-0": true,
}) })
if err != nil {
t.Fatal(err)
}
} }
func TestRaft_NodeIDHeader(t *testing.T) { func TestRaft_NodeIDHeader(t *testing.T) {

View File

@ -100,14 +100,17 @@ func testRaftHANewCluster(t *testing.T, bundler teststorage.PhysicalBackendBundl
// Ensure peers are added // Ensure peers are added
leaderClient := cluster.Cores[0].Client leaderClient := cluster.Cores[0].Client
testhelpers.VerifyRaftPeers(t, leaderClient, map[string]bool{ err := testhelpers.VerifyRaftPeers(t, leaderClient, map[string]bool{
"core-0": true, "core-0": true,
"core-1": true, "core-1": true,
"core-2": true, "core-2": true,
}) })
if err != nil {
t.Fatal(err)
}
// Test remove peers // Test remove peers
_, err := leaderClient.Logical().Write("sys/storage/raft/remove-peer", map[string]interface{}{ _, err = leaderClient.Logical().Write("sys/storage/raft/remove-peer", map[string]interface{}{
"server_id": "core-1", "server_id": "core-1",
}) })
if err != nil { if err != nil {
@ -122,9 +125,12 @@ func testRaftHANewCluster(t *testing.T, bundler teststorage.PhysicalBackendBundl
} }
// Ensure peers are removed // Ensure peers are removed
testhelpers.VerifyRaftPeers(t, leaderClient, map[string]bool{ err = testhelpers.VerifyRaftPeers(t, leaderClient, map[string]bool{
"core-0": true, "core-0": true,
}) })
if err != nil {
t.Fatal(err)
}
} }
func TestRaft_HA_ExistingCluster(t *testing.T) { func TestRaft_HA_ExistingCluster(t *testing.T) {
@ -233,11 +239,14 @@ func TestRaft_HA_ExistingCluster(t *testing.T) {
joinFunc(cluster.Cores[2].Client) joinFunc(cluster.Cores[2].Client)
// Ensure peers are added // Ensure peers are added
testhelpers.VerifyRaftPeers(t, leaderClient, map[string]bool{ err := testhelpers.VerifyRaftPeers(t, leaderClient, map[string]bool{
"core-0": true, "core-0": true,
"core-1": true, "core-1": true,
"core-2": true, "core-2": true,
}) })
if err != nil {
t.Fatal(err)
}
} }
updateCluster(t) updateCluster(t)

View File

@ -1015,6 +1015,9 @@ func (c *Core) JoinRaftCluster(ctx context.Context, leaderInfos []*raft.LeaderJo
if err == nil { if err == nil {
return nil return nil
} }
} else {
// Return an error so we can retry_join
err = fmt.Errorf("failed to get raft challenge")
} }
} }
return err return err