Vault 7338/fix retry join (#16550)
* storage/raft: Fix cluster init with retry_join Commit 8db66f4853abce3f432adcf1724b1f237b275415 introduced an error wherein a join() would return nil (no error) with no information on its channel if a joining node had been initialized. This was not handled properly by the caller and resulted in a canceled `retry_join`. Fix this by handling the `nil` channel respone by treating it as an error and allowing the existing mechanics to work as intended. * storage/raft: Improve retry_join go test * storage/raft: Make VerifyRaftPeers pollable * storage/raft: Add changelog entry for retry_join fix * storage/raft: Add description to VerifyRaftPeers
This commit is contained in:
parent
42900b554b
commit
cd1157a905
|
@ -0,0 +1,3 @@
|
|||
```release-note:bug
|
||||
storage/raft: Fix retry_join initialization failure
|
||||
```
|
|
@ -623,7 +623,12 @@ func GenerateDebugLogs(t testing.T, client *api.Client) chan struct{} {
|
|||
return stopCh
|
||||
}
|
||||
|
||||
func VerifyRaftPeers(t testing.T, client *api.Client, expected map[string]bool) {
|
||||
// VerifyRaftPeers verifies that the raft configuration contains a given set of peers.
|
||||
// The `expected` contains a map of expected peers. Existing entries are deleted
|
||||
// from the map by removing entries whose keys are in the raft configuration.
|
||||
// Remaining entries result in an error return so that the caller can poll for
|
||||
// an expected configuration.
|
||||
func VerifyRaftPeers(t testing.T, client *api.Client, expected map[string]bool) error {
|
||||
t.Helper()
|
||||
|
||||
resp, err := client.Logical().Read("sys/storage/raft/configuration")
|
||||
|
@ -655,8 +660,10 @@ func VerifyRaftPeers(t testing.T, client *api.Client, expected map[string]bool)
|
|||
// If the collection is non-empty, it means that the peer was not found in
|
||||
// the response.
|
||||
if len(expected) != 0 {
|
||||
t.Fatalf("failed to read configuration successfully, expected peers not found in configuration list: %v", expected)
|
||||
return fmt.Errorf("failed to read configuration successfully, expected peers not found in configuration list: %v", expected)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func TestMetricSinkProvider(gaugeInterval time.Duration) func(string) (*metricsutil.ClusterMetricSink, *metricsutil.MetricsHelper) {
|
||||
|
|
|
@ -184,9 +184,12 @@ func TestRaft_RetryAutoJoin(t *testing.T) {
|
|||
require.NoError(t, err)
|
||||
}
|
||||
|
||||
testhelpers.VerifyRaftPeers(t, cluster.Cores[0].Client, map[string]bool{
|
||||
err := testhelpers.VerifyRaftPeers(t, cluster.Cores[0].Client, map[string]bool{
|
||||
"core-0": true,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRaft_Retry_Join(t *testing.T) {
|
||||
|
@ -208,8 +211,6 @@ func TestRaft_Retry_Join(t *testing.T) {
|
|||
{
|
||||
testhelpers.EnsureCoreSealed(t, leaderCore)
|
||||
leaderCore.UnderlyingRawStorage.(*raft.RaftBackend).SetServerAddressProvider(addressProvider)
|
||||
cluster.UnsealCore(t, leaderCore)
|
||||
vault.TestWaitActive(t, leaderCore.Core)
|
||||
}
|
||||
|
||||
leaderInfos := []*raft.LeaderJoinInfo{
|
||||
|
@ -220,37 +221,38 @@ func TestRaft_Retry_Join(t *testing.T) {
|
|||
},
|
||||
}
|
||||
|
||||
{
|
||||
core := cluster.Cores[1]
|
||||
var wg sync.WaitGroup
|
||||
for _, clusterCore := range cluster.Cores[1:] {
|
||||
wg.Add(1)
|
||||
go func(t *testing.T, core *vault.TestClusterCore) {
|
||||
t.Helper()
|
||||
defer wg.Done()
|
||||
core.UnderlyingRawStorage.(*raft.RaftBackend).SetServerAddressProvider(addressProvider)
|
||||
_, err := core.JoinRaftCluster(namespace.RootContext(context.Background()), leaderInfos, false)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
t.Error(err)
|
||||
}
|
||||
|
||||
time.Sleep(2 * time.Second)
|
||||
|
||||
cluster.UnsealCore(t, core)
|
||||
// Handle potential racy behavior with unseals. Retry the unseal until it succeeds.
|
||||
vault.RetryUntil(t, 10*time.Second, func() error {
|
||||
return cluster.AttemptUnsealCore(core)
|
||||
})
|
||||
}(t, clusterCore)
|
||||
}
|
||||
|
||||
{
|
||||
core := cluster.Cores[2]
|
||||
core.UnderlyingRawStorage.(*raft.RaftBackend).SetServerAddressProvider(addressProvider)
|
||||
_, err := core.JoinRaftCluster(namespace.RootContext(context.Background()), leaderInfos, false)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
// Unseal the leader and wait for the other cores to unseal
|
||||
cluster.UnsealCore(t, leaderCore)
|
||||
wg.Wait()
|
||||
|
||||
time.Sleep(2 * time.Second)
|
||||
vault.TestWaitActive(t, leaderCore.Core)
|
||||
|
||||
cluster.UnsealCore(t, core)
|
||||
}
|
||||
|
||||
testhelpers.VerifyRaftPeers(t, cluster.Cores[0].Client, map[string]bool{
|
||||
vault.RetryUntil(t, 10*time.Second, func() error {
|
||||
return testhelpers.VerifyRaftPeers(t, cluster.Cores[0].Client, map[string]bool{
|
||||
"core-0": true,
|
||||
"core-1": true,
|
||||
"core-2": true,
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
func TestRaft_Join(t *testing.T) {
|
||||
|
@ -329,23 +331,29 @@ func TestRaft_RemovePeer(t *testing.T) {
|
|||
|
||||
client := cluster.Cores[0].Client
|
||||
|
||||
testhelpers.VerifyRaftPeers(t, client, map[string]bool{
|
||||
err := testhelpers.VerifyRaftPeers(t, client, map[string]bool{
|
||||
"core-0": true,
|
||||
"core-1": true,
|
||||
"core-2": true,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
_, err := client.Logical().Write("sys/storage/raft/remove-peer", map[string]interface{}{
|
||||
_, err = client.Logical().Write("sys/storage/raft/remove-peer", map[string]interface{}{
|
||||
"server_id": "core-2",
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
testhelpers.VerifyRaftPeers(t, client, map[string]bool{
|
||||
err = testhelpers.VerifyRaftPeers(t, client, map[string]bool{
|
||||
"core-0": true,
|
||||
"core-1": true,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
_, err = client.Logical().Write("sys/storage/raft/remove-peer", map[string]interface{}{
|
||||
"server_id": "core-1",
|
||||
|
@ -354,9 +362,12 @@ func TestRaft_RemovePeer(t *testing.T) {
|
|||
t.Fatal(err)
|
||||
}
|
||||
|
||||
testhelpers.VerifyRaftPeers(t, client, map[string]bool{
|
||||
err = testhelpers.VerifyRaftPeers(t, client, map[string]bool{
|
||||
"core-0": true,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRaft_NodeIDHeader(t *testing.T) {
|
||||
|
|
|
@ -100,14 +100,17 @@ func testRaftHANewCluster(t *testing.T, bundler teststorage.PhysicalBackendBundl
|
|||
|
||||
// Ensure peers are added
|
||||
leaderClient := cluster.Cores[0].Client
|
||||
testhelpers.VerifyRaftPeers(t, leaderClient, map[string]bool{
|
||||
err := testhelpers.VerifyRaftPeers(t, leaderClient, map[string]bool{
|
||||
"core-0": true,
|
||||
"core-1": true,
|
||||
"core-2": true,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// Test remove peers
|
||||
_, err := leaderClient.Logical().Write("sys/storage/raft/remove-peer", map[string]interface{}{
|
||||
_, err = leaderClient.Logical().Write("sys/storage/raft/remove-peer", map[string]interface{}{
|
||||
"server_id": "core-1",
|
||||
})
|
||||
if err != nil {
|
||||
|
@ -122,9 +125,12 @@ func testRaftHANewCluster(t *testing.T, bundler teststorage.PhysicalBackendBundl
|
|||
}
|
||||
|
||||
// Ensure peers are removed
|
||||
testhelpers.VerifyRaftPeers(t, leaderClient, map[string]bool{
|
||||
err = testhelpers.VerifyRaftPeers(t, leaderClient, map[string]bool{
|
||||
"core-0": true,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRaft_HA_ExistingCluster(t *testing.T) {
|
||||
|
@ -233,11 +239,14 @@ func TestRaft_HA_ExistingCluster(t *testing.T) {
|
|||
joinFunc(cluster.Cores[2].Client)
|
||||
|
||||
// Ensure peers are added
|
||||
testhelpers.VerifyRaftPeers(t, leaderClient, map[string]bool{
|
||||
err := testhelpers.VerifyRaftPeers(t, leaderClient, map[string]bool{
|
||||
"core-0": true,
|
||||
"core-1": true,
|
||||
"core-2": true,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
updateCluster(t)
|
||||
|
|
|
@ -1015,6 +1015,9 @@ func (c *Core) JoinRaftCluster(ctx context.Context, leaderInfos []*raft.LeaderJo
|
|||
if err == nil {
|
||||
return nil
|
||||
}
|
||||
} else {
|
||||
// Return an error so we can retry_join
|
||||
err = fmt.Errorf("failed to get raft challenge")
|
||||
}
|
||||
}
|
||||
return err
|
||||
|
|
Loading…
Reference in New Issue