Vault 7338/fix retry join (#16550)

* storage/raft: Fix cluster init with retry_join

Commit 8db66f4853abce3f432adcf1724b1f237b275415 introduced an error
wherein a join() would return nil (no error) with no information on its
channel if a joining node had been initialized. This was not handled
properly by the caller and resulted in a canceled `retry_join`.

Fix this by treating the `nil` channel response as an error, allowing the
existing retry mechanics to work as intended.
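
In other words, a join attempt that produces neither an error nor a result
must be surfaced as a failure so the retry loop keeps going. A minimal sketch
of that pattern (illustrative only, not the actual Vault code; `attempt`,
`joinResult`, and the 2-second backoff are placeholders):

```go
package raftjoin

import (
	"context"
	"fmt"
	"time"
)

// joinResult stands in for whatever a successful join attempt yields
// (for Vault, the answered raft challenge).
type joinResult struct{}

// retryJoin keeps attempting to join until an attempt succeeds or ctx is
// cancelled. The important detail is that an attempt returning neither an
// error nor a result is converted into an error, so the loop keeps retrying
// instead of treating it as success and giving up.
func retryJoin(ctx context.Context, attempt func(context.Context) (*joinResult, error)) error {
	for {
		res, err := attempt(ctx)
		if err == nil && res == nil {
			// Without this, "nil, nil" would fall through as success below.
			err = fmt.Errorf("failed to get raft challenge")
		}
		if err == nil {
			return nil
		}

		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-time.After(2 * time.Second):
			// back off briefly, then retry the join
		}
	}
}
```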

* storage/raft: Improve retry_join go test

* storage/raft: Make VerifyRaftPeers pollable

* storage/raft: Add changelog entry for retry_join fix

* storage/raft: Add description to VerifyRaftPeers
Mike Palmiotto 2022-08-03 21:44:57 -04:00 committed by GitHub
parent 42900b554b
commit cd1157a905
5 changed files with 71 additions and 38 deletions

changelog/16550.txt (new file)

@@ -0,0 +1,3 @@
```release-note:bug
storage/raft: Fix retry_join initialization failure
```


@@ -623,7 +623,12 @@ func GenerateDebugLogs(t testing.T, client *api.Client) chan struct{} {
return stopCh
}
func VerifyRaftPeers(t testing.T, client *api.Client, expected map[string]bool) {
// VerifyRaftPeers verifies that the raft configuration contains a given set of peers.
// `expected` maps the names of the peers that must be present; entries whose keys
// appear in the raft configuration are deleted from the map, and any remaining
// entries result in an error return so that the caller can poll for an expected
// configuration.
func VerifyRaftPeers(t testing.T, client *api.Client, expected map[string]bool) error {
t.Helper()
resp, err := client.Logical().Read("sys/storage/raft/configuration")
@@ -655,8 +660,10 @@ func VerifyRaftPeers(t testing.T, client *api.Client, expected map[string]bool)
// If the collection is non-empty, it means that the peer was not found in
// the response.
if len(expected) != 0 {
t.Fatalf("failed to read configuration successfully, expected peers not found in configuration list: %v", expected)
return fmt.Errorf("failed to read configuration successfully, expected peers not found in configuration list: %v", expected)
}
return nil
}
func TestMetricSinkProvider(gaugeInterval time.Duration) func(string) (*metricsutil.ClusterMetricSink, *metricsutil.MetricsHelper) {
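
Because VerifyRaftPeers now returns an error instead of failing the test
itself, callers can poll it until the expected membership appears. A small
sketch of that usage, meant to sit alongside the tests below (the
`waitForRaftPeers` helper is hypothetical, and the import paths are assumed
from the Vault repo layout):

```go
import (
	"time"

	"github.com/hashicorp/vault/api"
	"github.com/hashicorp/vault/helper/testhelpers"
	"github.com/hashicorp/vault/vault"
	testing "github.com/mitchellh/go-testing-interface"
)

// waitForRaftPeers polls the raft configuration until the expected peers are
// present, letting RetryUntil handle the failure if they never appear within
// the timeout.
func waitForRaftPeers(t testing.T, client *api.Client, timeout time.Duration, peers []string) {
	t.Helper()
	vault.RetryUntil(t, timeout, func() error {
		// VerifyRaftPeers deletes matched entries from the map it is given,
		// so build a fresh map for every attempt.
		expected := make(map[string]bool, len(peers))
		for _, p := range peers {
			expected[p] = true
		}
		return testhelpers.VerifyRaftPeers(t, client, expected)
	})
}

// Example call, e.g. from a retry_join test:
//
//	waitForRaftPeers(t, cluster.Cores[0].Client, 10*time.Second,
//		[]string{"core-0", "core-1", "core-2"})
```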


@@ -184,9 +184,12 @@ func TestRaft_RetryAutoJoin(t *testing.T) {
require.NoError(t, err)
}
testhelpers.VerifyRaftPeers(t, cluster.Cores[0].Client, map[string]bool{
err := testhelpers.VerifyRaftPeers(t, cluster.Cores[0].Client, map[string]bool{
"core-0": true,
})
if err != nil {
t.Fatal(err)
}
}
func TestRaft_Retry_Join(t *testing.T) {
@@ -208,8 +211,6 @@ func TestRaft_Retry_Join(t *testing.T) {
{
testhelpers.EnsureCoreSealed(t, leaderCore)
leaderCore.UnderlyingRawStorage.(*raft.RaftBackend).SetServerAddressProvider(addressProvider)
cluster.UnsealCore(t, leaderCore)
vault.TestWaitActive(t, leaderCore.Core)
}
leaderInfos := []*raft.LeaderJoinInfo{
@@ -220,36 +221,37 @@ func TestRaft_Retry_Join(t *testing.T) {
},
}
{
core := cluster.Cores[1]
core.UnderlyingRawStorage.(*raft.RaftBackend).SetServerAddressProvider(addressProvider)
_, err := core.JoinRaftCluster(namespace.RootContext(context.Background()), leaderInfos, false)
if err != nil {
t.Fatal(err)
}
var wg sync.WaitGroup
for _, clusterCore := range cluster.Cores[1:] {
wg.Add(1)
go func(t *testing.T, core *vault.TestClusterCore) {
t.Helper()
defer wg.Done()
core.UnderlyingRawStorage.(*raft.RaftBackend).SetServerAddressProvider(addressProvider)
_, err := core.JoinRaftCluster(namespace.RootContext(context.Background()), leaderInfos, false)
if err != nil {
t.Error(err)
}
time.Sleep(2 * time.Second)
cluster.UnsealCore(t, core)
// Handle potential racy behavior with unseals. Retry the unseal until it succeeds.
vault.RetryUntil(t, 10*time.Second, func() error {
return cluster.AttemptUnsealCore(core)
})
}(t, clusterCore)
}
{
core := cluster.Cores[2]
core.UnderlyingRawStorage.(*raft.RaftBackend).SetServerAddressProvider(addressProvider)
_, err := core.JoinRaftCluster(namespace.RootContext(context.Background()), leaderInfos, false)
if err != nil {
t.Fatal(err)
}
// Unseal the leader and wait for the other cores to unseal
cluster.UnsealCore(t, leaderCore)
wg.Wait()
time.Sleep(2 * time.Second)
vault.TestWaitActive(t, leaderCore.Core)
cluster.UnsealCore(t, core)
}
testhelpers.VerifyRaftPeers(t, cluster.Cores[0].Client, map[string]bool{
"core-0": true,
"core-1": true,
"core-2": true,
vault.RetryUntil(t, 10*time.Second, func() error {
return testhelpers.VerifyRaftPeers(t, cluster.Cores[0].Client, map[string]bool{
"core-0": true,
"core-1": true,
"core-2": true,
})
})
}
@@ -329,23 +331,29 @@ func TestRaft_RemovePeer(t *testing.T) {
client := cluster.Cores[0].Client
testhelpers.VerifyRaftPeers(t, client, map[string]bool{
err := testhelpers.VerifyRaftPeers(t, client, map[string]bool{
"core-0": true,
"core-1": true,
"core-2": true,
})
if err != nil {
t.Fatal(err)
}
_, err := client.Logical().Write("sys/storage/raft/remove-peer", map[string]interface{}{
_, err = client.Logical().Write("sys/storage/raft/remove-peer", map[string]interface{}{
"server_id": "core-2",
})
if err != nil {
t.Fatal(err)
}
testhelpers.VerifyRaftPeers(t, client, map[string]bool{
err = testhelpers.VerifyRaftPeers(t, client, map[string]bool{
"core-0": true,
"core-1": true,
})
if err != nil {
t.Fatal(err)
}
_, err = client.Logical().Write("sys/storage/raft/remove-peer", map[string]interface{}{
"server_id": "core-1",
@@ -354,9 +362,12 @@ func TestRaft_RemovePeer(t *testing.T) {
t.Fatal(err)
}
testhelpers.VerifyRaftPeers(t, client, map[string]bool{
err = testhelpers.VerifyRaftPeers(t, client, map[string]bool{
"core-0": true,
})
if err != nil {
t.Fatal(err)
}
}
func TestRaft_NodeIDHeader(t *testing.T) {


@@ -100,14 +100,17 @@ func testRaftHANewCluster(t *testing.T, bundler teststorage.PhysicalBackendBundl
// Ensure peers are added
leaderClient := cluster.Cores[0].Client
testhelpers.VerifyRaftPeers(t, leaderClient, map[string]bool{
err := testhelpers.VerifyRaftPeers(t, leaderClient, map[string]bool{
"core-0": true,
"core-1": true,
"core-2": true,
})
if err != nil {
t.Fatal(err)
}
// Test remove peers
_, err := leaderClient.Logical().Write("sys/storage/raft/remove-peer", map[string]interface{}{
_, err = leaderClient.Logical().Write("sys/storage/raft/remove-peer", map[string]interface{}{
"server_id": "core-1",
})
if err != nil {
@@ -122,9 +125,12 @@ func testRaftHANewCluster(t *testing.T, bundler teststorage.PhysicalBackendBundl
}
// Ensure peers are removed
testhelpers.VerifyRaftPeers(t, leaderClient, map[string]bool{
err = testhelpers.VerifyRaftPeers(t, leaderClient, map[string]bool{
"core-0": true,
})
if err != nil {
t.Fatal(err)
}
}
func TestRaft_HA_ExistingCluster(t *testing.T) {
@@ -233,11 +239,14 @@ func TestRaft_HA_ExistingCluster(t *testing.T) {
joinFunc(cluster.Cores[2].Client)
// Ensure peers are added
testhelpers.VerifyRaftPeers(t, leaderClient, map[string]bool{
err := testhelpers.VerifyRaftPeers(t, leaderClient, map[string]bool{
"core-0": true,
"core-1": true,
"core-2": true,
})
if err != nil {
t.Fatal(err)
}
}
updateCluster(t)


@@ -1015,6 +1015,9 @@ func (c *Core) JoinRaftCluster(ctx context.Context, leaderInfos []*raft.LeaderJo
if err == nil {
return nil
}
} else {
// Return an error so we can retry_join
err = fmt.Errorf("failed to get raft challenge")
}
}
return err