From ce7ab8abc1fb39d8068daa5dc72bafdfe9849a6f Mon Sep 17 00:00:00 2001
From: "R.B. Boyer" <rb@hashicorp.com>
Date: Tue, 21 Jan 2020 14:58:50 -0600
Subject: [PATCH] make TestCatalogNodes_Blocking less flaky (#7074)

- Explicitly wait to start the test until the initial AE sync of the node.

- Run the blocking query in the main goroutine to cut down on possible
poor goroutine scheduling issues being to blame for delays.

- If the blocking query is woken up with no index change, rerun the
query. This may happen if the CI server is loaded and time dilation is
happening.
---
 agent/catalog_endpoint_test.go | 100 +++++++++++++++++----------------
 1 file changed, 52 insertions(+), 48 deletions(-)

diff --git a/agent/catalog_endpoint_test.go b/agent/catalog_endpoint_test.go
index 5706ac068..fa64456b0 100644
--- a/agent/catalog_endpoint_test.go
+++ b/agent/catalog_endpoint_test.go
@@ -292,70 +292,74 @@ func TestCatalogNodes_Blocking(t *testing.T) {
 	t.Parallel()
 	a := NewTestAgent(t, t.Name(), "")
 	defer a.Shutdown()
-	testrpc.WaitForTestAgent(t, a.RPC, "dc1")
+	testrpc.WaitForTestAgent(t, a.RPC, "dc1", testrpc.WaitForAntiEntropySync())
 
-	// Register node
+	// Run the query
 	args := &structs.DCSpecificRequest{
 		Datacenter: "dc1",
 	}
-
 	var out structs.IndexedNodes
 	if err := a.RPC("Catalog.ListNodes", *args, &out); err != nil {
 		t.Fatalf("err: %v", err)
 	}
 
-	// t.Fatal must be called from the main go routine
-	// of the test. Because of this we cannot call
-	// t.Fatal from within the go routines and use
-	// an error channel instead.
-	errch := make(chan error, 2)
+	// Async cause a change
+	waitIndex := out.Index
+	start := time.Now()
 	go func() {
-		testrpc.WaitForTestAgent(t, a.RPC, "dc1")
-		start := time.Now()
-
-		// register a service after the blocking call
-		// in order to unblock it.
-		time.AfterFunc(100*time.Millisecond, func() {
-			args := &structs.RegisterRequest{
-				Datacenter: "dc1",
-				Node:       "foo",
-				Address:    "127.0.0.1",
-			}
-			var out struct{}
-			errch <- a.RPC("Catalog.Register", args, &out)
-		})
-
-		// now block
-		req, _ := http.NewRequest("GET", fmt.Sprintf("/v1/catalog/nodes?wait=3s&index=%d", out.Index+1), nil)
-		resp := httptest.NewRecorder()
-		obj, err := a.srv.CatalogNodes(resp, req)
-		if err != nil {
-			errch <- err
+		time.Sleep(100 * time.Millisecond)
+		args := &structs.RegisterRequest{
+			Datacenter: "dc1",
+			Node:       "foo",
+			Address:    "127.0.0.1",
 		}
-
-		// Should block for a while
-		if d := time.Since(start); d < 50*time.Millisecond {
-			errch <- fmt.Errorf("too fast: %v", d)
+		var out struct{}
+		if err := a.RPC("Catalog.Register", args, &out); err != nil {
+			t.Fatalf("err: %v", err)
 		}
-
-		if idx := getIndex(t, resp); idx <= out.Index {
-			errch <- fmt.Errorf("bad: %v", idx)
-		}
-
-		nodes := obj.(structs.Nodes)
-		if len(nodes) != 2 {
-			errch <- fmt.Errorf("bad: %v", obj)
-		}
-		errch <- nil
 	}()
 
-	// wait for both go routines to return
-	if err := <-errch; err != nil {
-		t.Fatal(err)
+	const waitDuration = 3 * time.Second
+
+	// Re-run the query, if errantly woken up with no change, resume blocking.
+	var elapsed time.Duration
+RUN_BLOCKING_QUERY:
+	req, err := http.NewRequest("GET", fmt.Sprintf("/v1/catalog/nodes?wait=%s&index=%d",
+		waitDuration.String(),
+		waitIndex), nil)
+	if err != nil {
+		t.Fatalf("err: %v", err)
 	}
-	if err := <-errch; err != nil {
-		t.Fatal(err)
+	resp := httptest.NewRecorder()
+	obj, err := a.srv.CatalogNodes(resp, req)
+	if err != nil {
+		t.Fatalf("err: %v", err)
 	}
+
+	elapsed = time.Since(start)
+
+	idx := getIndex(t, resp)
+	if idx < waitIndex {
+		t.Fatalf("bad: %v", idx)
+	} else if idx == waitIndex {
+		if elapsed > waitDuration {
+			// This should prevent the loop from running longer than the
+			// waitDuration
+			t.Fatalf("too slow: %v", elapsed)
+		}
+		goto RUN_BLOCKING_QUERY
+	}
+
+	// Should block at least 100ms before getting the changed results
+	if elapsed < 100*time.Millisecond {
+		t.Fatalf("too fast: %v", elapsed)
+	}
+
+	nodes := obj.(structs.Nodes)
+	if len(nodes) != 2 {
+		t.Fatalf("bad: %v", obj)
+	}
+
 }
 
 func TestCatalogNodes_DistanceSort(t *testing.T) {