Vault 7338/fix retry join (#16550)
* storage/raft: Fix cluster init with retry_join Commit 8db66f4853abce3f432adcf1724b1f237b275415 introduced an error wherein a join() would return nil (no error) with no information on its channel if a joining node had been initialized. This was not handled properly by the caller and resulted in a canceled `retry_join`. Fix this by handling the `nil` channel respone by treating it as an error and allowing the existing mechanics to work as intended. * storage/raft: Improve retry_join go test * storage/raft: Make VerifyRaftPeers pollable * storage/raft: Add changelog entry for retry_join fix * storage/raft: Add description to VerifyRaftPeers
This commit is contained in:
parent
42900b554b
commit
cd1157a905
|
@ -0,0 +1,3 @@
|
||||||
|
```release-note:bug
|
||||||
|
storage/raft: Fix retry_join initialization failure
|
||||||
|
```
|
|
@ -623,7 +623,12 @@ func GenerateDebugLogs(t testing.T, client *api.Client) chan struct{} {
|
||||||
return stopCh
|
return stopCh
|
||||||
}
|
}
|
||||||
|
|
||||||
func VerifyRaftPeers(t testing.T, client *api.Client, expected map[string]bool) {
|
// VerifyRaftPeers verifies that the raft configuration contains a given set of peers.
|
||||||
|
// The `expected` contains a map of expected peers. Existing entries are deleted
|
||||||
|
// from the map by removing entries whose keys are in the raft configuration.
|
||||||
|
// Remaining entries result in an error return so that the caller can poll for
|
||||||
|
// an expected configuration.
|
||||||
|
func VerifyRaftPeers(t testing.T, client *api.Client, expected map[string]bool) error {
|
||||||
t.Helper()
|
t.Helper()
|
||||||
|
|
||||||
resp, err := client.Logical().Read("sys/storage/raft/configuration")
|
resp, err := client.Logical().Read("sys/storage/raft/configuration")
|
||||||
|
@ -655,8 +660,10 @@ func VerifyRaftPeers(t testing.T, client *api.Client, expected map[string]bool)
|
||||||
// If the collection is non-empty, it means that the peer was not found in
|
// If the collection is non-empty, it means that the peer was not found in
|
||||||
// the response.
|
// the response.
|
||||||
if len(expected) != 0 {
|
if len(expected) != 0 {
|
||||||
t.Fatalf("failed to read configuration successfully, expected peers not found in configuration list: %v", expected)
|
return fmt.Errorf("failed to read configuration successfully, expected peers not found in configuration list: %v", expected)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestMetricSinkProvider(gaugeInterval time.Duration) func(string) (*metricsutil.ClusterMetricSink, *metricsutil.MetricsHelper) {
|
func TestMetricSinkProvider(gaugeInterval time.Duration) func(string) (*metricsutil.ClusterMetricSink, *metricsutil.MetricsHelper) {
|
||||||
|
|
|
@ -184,9 +184,12 @@ func TestRaft_RetryAutoJoin(t *testing.T) {
|
||||||
require.NoError(t, err)
|
require.NoError(t, err)
|
||||||
}
|
}
|
||||||
|
|
||||||
testhelpers.VerifyRaftPeers(t, cluster.Cores[0].Client, map[string]bool{
|
err := testhelpers.VerifyRaftPeers(t, cluster.Cores[0].Client, map[string]bool{
|
||||||
"core-0": true,
|
"core-0": true,
|
||||||
})
|
})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestRaft_Retry_Join(t *testing.T) {
|
func TestRaft_Retry_Join(t *testing.T) {
|
||||||
|
@ -208,8 +211,6 @@ func TestRaft_Retry_Join(t *testing.T) {
|
||||||
{
|
{
|
||||||
testhelpers.EnsureCoreSealed(t, leaderCore)
|
testhelpers.EnsureCoreSealed(t, leaderCore)
|
||||||
leaderCore.UnderlyingRawStorage.(*raft.RaftBackend).SetServerAddressProvider(addressProvider)
|
leaderCore.UnderlyingRawStorage.(*raft.RaftBackend).SetServerAddressProvider(addressProvider)
|
||||||
cluster.UnsealCore(t, leaderCore)
|
|
||||||
vault.TestWaitActive(t, leaderCore.Core)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
leaderInfos := []*raft.LeaderJoinInfo{
|
leaderInfos := []*raft.LeaderJoinInfo{
|
||||||
|
@ -220,36 +221,37 @@ func TestRaft_Retry_Join(t *testing.T) {
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
var wg sync.WaitGroup
|
||||||
core := cluster.Cores[1]
|
for _, clusterCore := range cluster.Cores[1:] {
|
||||||
core.UnderlyingRawStorage.(*raft.RaftBackend).SetServerAddressProvider(addressProvider)
|
wg.Add(1)
|
||||||
_, err := core.JoinRaftCluster(namespace.RootContext(context.Background()), leaderInfos, false)
|
go func(t *testing.T, core *vault.TestClusterCore) {
|
||||||
if err != nil {
|
t.Helper()
|
||||||
t.Fatal(err)
|
defer wg.Done()
|
||||||
}
|
core.UnderlyingRawStorage.(*raft.RaftBackend).SetServerAddressProvider(addressProvider)
|
||||||
|
_, err := core.JoinRaftCluster(namespace.RootContext(context.Background()), leaderInfos, false)
|
||||||
|
if err != nil {
|
||||||
|
t.Error(err)
|
||||||
|
}
|
||||||
|
|
||||||
time.Sleep(2 * time.Second)
|
// Handle potential racy behavior with unseals. Retry the unseal until it succeeds.
|
||||||
|
vault.RetryUntil(t, 10*time.Second, func() error {
|
||||||
cluster.UnsealCore(t, core)
|
return cluster.AttemptUnsealCore(core)
|
||||||
|
})
|
||||||
|
}(t, clusterCore)
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
// Unseal the leader and wait for the other cores to unseal
|
||||||
core := cluster.Cores[2]
|
cluster.UnsealCore(t, leaderCore)
|
||||||
core.UnderlyingRawStorage.(*raft.RaftBackend).SetServerAddressProvider(addressProvider)
|
wg.Wait()
|
||||||
_, err := core.JoinRaftCluster(namespace.RootContext(context.Background()), leaderInfos, false)
|
|
||||||
if err != nil {
|
|
||||||
t.Fatal(err)
|
|
||||||
}
|
|
||||||
|
|
||||||
time.Sleep(2 * time.Second)
|
vault.TestWaitActive(t, leaderCore.Core)
|
||||||
|
|
||||||
cluster.UnsealCore(t, core)
|
vault.RetryUntil(t, 10*time.Second, func() error {
|
||||||
}
|
return testhelpers.VerifyRaftPeers(t, cluster.Cores[0].Client, map[string]bool{
|
||||||
|
"core-0": true,
|
||||||
testhelpers.VerifyRaftPeers(t, cluster.Cores[0].Client, map[string]bool{
|
"core-1": true,
|
||||||
"core-0": true,
|
"core-2": true,
|
||||||
"core-1": true,
|
})
|
||||||
"core-2": true,
|
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -329,23 +331,29 @@ func TestRaft_RemovePeer(t *testing.T) {
|
||||||
|
|
||||||
client := cluster.Cores[0].Client
|
client := cluster.Cores[0].Client
|
||||||
|
|
||||||
testhelpers.VerifyRaftPeers(t, client, map[string]bool{
|
err := testhelpers.VerifyRaftPeers(t, client, map[string]bool{
|
||||||
"core-0": true,
|
"core-0": true,
|
||||||
"core-1": true,
|
"core-1": true,
|
||||||
"core-2": true,
|
"core-2": true,
|
||||||
})
|
})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
_, err := client.Logical().Write("sys/storage/raft/remove-peer", map[string]interface{}{
|
_, err = client.Logical().Write("sys/storage/raft/remove-peer", map[string]interface{}{
|
||||||
"server_id": "core-2",
|
"server_id": "core-2",
|
||||||
})
|
})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
testhelpers.VerifyRaftPeers(t, client, map[string]bool{
|
err = testhelpers.VerifyRaftPeers(t, client, map[string]bool{
|
||||||
"core-0": true,
|
"core-0": true,
|
||||||
"core-1": true,
|
"core-1": true,
|
||||||
})
|
})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
_, err = client.Logical().Write("sys/storage/raft/remove-peer", map[string]interface{}{
|
_, err = client.Logical().Write("sys/storage/raft/remove-peer", map[string]interface{}{
|
||||||
"server_id": "core-1",
|
"server_id": "core-1",
|
||||||
|
@ -354,9 +362,12 @@ func TestRaft_RemovePeer(t *testing.T) {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
testhelpers.VerifyRaftPeers(t, client, map[string]bool{
|
err = testhelpers.VerifyRaftPeers(t, client, map[string]bool{
|
||||||
"core-0": true,
|
"core-0": true,
|
||||||
})
|
})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestRaft_NodeIDHeader(t *testing.T) {
|
func TestRaft_NodeIDHeader(t *testing.T) {
|
||||||
|
|
|
@ -100,14 +100,17 @@ func testRaftHANewCluster(t *testing.T, bundler teststorage.PhysicalBackendBundl
|
||||||
|
|
||||||
// Ensure peers are added
|
// Ensure peers are added
|
||||||
leaderClient := cluster.Cores[0].Client
|
leaderClient := cluster.Cores[0].Client
|
||||||
testhelpers.VerifyRaftPeers(t, leaderClient, map[string]bool{
|
err := testhelpers.VerifyRaftPeers(t, leaderClient, map[string]bool{
|
||||||
"core-0": true,
|
"core-0": true,
|
||||||
"core-1": true,
|
"core-1": true,
|
||||||
"core-2": true,
|
"core-2": true,
|
||||||
})
|
})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
// Test remove peers
|
// Test remove peers
|
||||||
_, err := leaderClient.Logical().Write("sys/storage/raft/remove-peer", map[string]interface{}{
|
_, err = leaderClient.Logical().Write("sys/storage/raft/remove-peer", map[string]interface{}{
|
||||||
"server_id": "core-1",
|
"server_id": "core-1",
|
||||||
})
|
})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -122,9 +125,12 @@ func testRaftHANewCluster(t *testing.T, bundler teststorage.PhysicalBackendBundl
|
||||||
}
|
}
|
||||||
|
|
||||||
// Ensure peers are removed
|
// Ensure peers are removed
|
||||||
testhelpers.VerifyRaftPeers(t, leaderClient, map[string]bool{
|
err = testhelpers.VerifyRaftPeers(t, leaderClient, map[string]bool{
|
||||||
"core-0": true,
|
"core-0": true,
|
||||||
})
|
})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestRaft_HA_ExistingCluster(t *testing.T) {
|
func TestRaft_HA_ExistingCluster(t *testing.T) {
|
||||||
|
@ -233,11 +239,14 @@ func TestRaft_HA_ExistingCluster(t *testing.T) {
|
||||||
joinFunc(cluster.Cores[2].Client)
|
joinFunc(cluster.Cores[2].Client)
|
||||||
|
|
||||||
// Ensure peers are added
|
// Ensure peers are added
|
||||||
testhelpers.VerifyRaftPeers(t, leaderClient, map[string]bool{
|
err := testhelpers.VerifyRaftPeers(t, leaderClient, map[string]bool{
|
||||||
"core-0": true,
|
"core-0": true,
|
||||||
"core-1": true,
|
"core-1": true,
|
||||||
"core-2": true,
|
"core-2": true,
|
||||||
})
|
})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
updateCluster(t)
|
updateCluster(t)
|
||||||
|
|
|
@ -1015,6 +1015,9 @@ func (c *Core) JoinRaftCluster(ctx context.Context, leaderInfos []*raft.LeaderJo
|
||||||
if err == nil {
|
if err == nil {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
// Return an error so we can retry_join
|
||||||
|
err = fmt.Errorf("failed to get raft challenge")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return err
|
return err
|
||||||
|
|
Loading…
Reference in New Issue