Wait for standby to have a working gRPC connection before we try to use it (#16905)

Also teach WaitForStandbyNode to do a better job of waiting for standbys to be healthy.
2022-08-26 12:50:10 -04:00 · 2022-08-26 12:50:10 -04:00 · df61151034
parent 2ab4a58ba9
commit df61151034
3 changed files with 16 additions and 0 deletions
--- a/helper/testhelpers/testhelpers.go
+++ b/helper/testhelpers/testhelpers.go
@ -349,6 +349,9 @@ func WaitForStandbyNode(t testing.T, core *vault.TestClusterCore) {
 		if isLeader, _, clusterAddr, _ := core.Core.Leader(); isLeader != true && clusterAddr != "" {
 			return
 		}
+		if core.Core.ActiveNodeReplicationState() == 0 {
+			return
+		}

 		time.Sleep(time.Second)
 	}
--- a/helper/testhelpers/testhelpers_oss.go
+++ b/helper/testhelpers/testhelpers_oss.go
@ -11,4 +11,9 @@ import (
 // on OSS. On enterprise it waits for perf standbys to be healthy too.
 func WaitForActiveNodeAndStandbys(t testing.T, cluster *vault.TestCluster) {
 	WaitForActiveNode(t, cluster)
+	for _, core := range cluster.Cores {
+		if standby, _ := core.Core.Standby(); standby {
+			WaitForStandbyNode(t, core)
+		}
+	}
 }
--- a/vault/cluster_test.go
+++ b/vault/cluster_test.go
@ -4,6 +4,7 @@ import (
 	"bytes"
 	"context"
 	"crypto/tls"
+	"fmt"
 	"net/http"
 	"sync"
 	"testing"
@ -319,6 +320,13 @@ func testCluster_ForwardRequests(t *testing.T, c *TestClusterCore, rootToken, re
 	if isLeader {
 		t.Fatal("core should not be leader")
 	}
+	RetryUntil(t, 5*time.Second, func() error {
+		state := c.ActiveNodeReplicationState()
+		if state == 0 {
+			return fmt.Errorf("heartbeats have not yet returned a valid active node replication state: %d", state)
+		}
+		return nil
+	})

 	bodBuf := bytes.NewReader([]byte(`{ "foo": "bar", "zip": "zap" }`))
 	req, err := http.NewRequest("PUT", "https://pushit.real.good:9281/"+remoteCoreID, bodBuf)