Store primaries root in secondary after intermediate signature (#6333)

* Store primaries root in secondary after intermediate signature This ensures that the intermediate exists within the CA root stored in raft and not just in the CA provider state. This has the very nice benefit of actually outputting the intermediate cert within the ca roots HTTP/RPC endpoints. This change means that if signing the intermediate fails it will not set the root within raft. So far I have not come up with a reason why that is bad. The secondary CA roots watch will pull the root again and go through all the motions. So as soon as getting an intermediate CA works the root will get set. * Make TestAgentAntiEntropy_Check_DeferSync less flaky I am not sure this is the full fix but it seems to help for me.
2019-08-30 11:38:46 -04:00 · 2019-08-30 11:38:46 -04:00 · 31d9d2e557
parent c5e1faaddb
commit 31d9d2e557
3 changed files with 61 additions and 40 deletions
--- a/agent/consul/leader_connect.go
+++ b/agent/consul/leader_connect.go
@ -314,13 +314,40 @@ func (s *Server) initializeSecondaryCA(provider ca.Provider, roots structs.Index
 		return fmt.Errorf("primary datacenter does not have an active root CA for Connect")
 	}

+	newIntermediate := false
+	// Get a signed intermediate from the primary DC if the provider
+	// hasn't been initialized yet or if the primary's root has changed.
+	if activeIntermediate == "" || storedRootID != roots.ActiveRootID {
+		csr, err := provider.GenerateIntermediateCSR()
+		if err != nil {
+			return err
+		}
+
+		var intermediatePEM string
+		if err := s.forwardDC("ConnectCA.SignIntermediate", s.config.PrimaryDatacenter, s.generateCASignRequest(csr), &intermediatePEM); err != nil {
+			// this is a failure in the primary and shouldn't be capable of erroring out our establishing leadership
+			s.logger.Printf("[WARN] connect: Primary datacenter refused to sign our intermediate CA certificate: %v", err)
+			return nil
+		}
+
+		if err := provider.SetIntermediate(intermediatePEM, newActiveRoot.RootCert); err != nil {
+			return fmt.Errorf("Failed to set the intermediate certificate with the CA provider: %v", err)
+		}
+
+		// Append the new intermediate to our local active root entry.
+		newActiveRoot.IntermediateCerts = append(newActiveRoot.IntermediateCerts, intermediatePEM)
+		newIntermediate = true
+
+		s.logger.Printf("[INFO] connect: received new intermediate certificate from primary datacenter")
+	}
+
 	// Update the roots list in the state store if there's a new active root.
 	state := s.fsm.State()
 	_, activeRoot, err := state.CARootActive(nil)
 	if err != nil {
 		return err
 	}
-	if activeRoot == nil || activeRoot.ID != newActiveRoot.ID {
+	if activeRoot == nil || activeRoot.ID != newActiveRoot.ID || newIntermediate {
 		idx, oldRoots, err := state.CARoots(nil)
 		if err != nil {
 			return err
@ -372,31 +399,6 @@ func (s *Server) initializeSecondaryCA(provider ca.Provider, roots structs.Index
 		s.logger.Printf("[INFO] connect: updated root certificates from primary datacenter")
 	}

-	// Get a signed intermediate from the primary DC if the provider
-	// hasn't been initialized yet or if the primary's root has changed.
-	if activeIntermediate == "" || storedRootID != roots.ActiveRootID {
-		csr, err := provider.GenerateIntermediateCSR()
-		if err != nil {
-			return err
-		}
-
-		var intermediatePEM string
-		if err := s.forwardDC("ConnectCA.SignIntermediate", s.config.PrimaryDatacenter, s.generateCASignRequest(csr), &intermediatePEM); err != nil {
-			// this is a failure in the primary and shouldn't be capable of erroring out our establishing leadership
-			s.logger.Printf("[WARN] connect: Primary datacenter refused to sign our intermediate CA certificate: %v", err)
-			return nil
-		}
-
-		if err := provider.SetIntermediate(intermediatePEM, newActiveRoot.RootCert); err != nil {
-			return fmt.Errorf("Failed to set the intermediate certificate with the CA provider: %v", err)
-		}
-
-		// Append the new intermediate to our local active root entry.
-		newActiveRoot.IntermediateCerts = append(newActiveRoot.IntermediateCerts, intermediatePEM)
-
-		s.logger.Printf("[INFO] connect: received new intermediate certificate from primary datacenter")
-	}
-
 	s.setCAProvider(provider, newActiveRoot)
 	return nil
 }
--- a/agent/consul/leader_connect_test.go
+++ b/agent/consul/leader_connect_test.go
@ -78,6 +78,8 @@ func TestLeader_SecondaryCA_Initialize(t *testing.T) {
 	require.Equal(roots1[0].RootCert, roots2[0].RootCert)
 	require.Equal(1, len(roots1))
 	require.Equal(len(roots1), len(roots2))
+	require.Empty(roots1[0].IntermediateCerts)
+	require.NotEmpty(roots2[0].IntermediateCerts)

 	// Have secondary sign a leaf cert and make sure the chain is correct.
 	spiffeService := &connect.SpiffeIDService{
--- a/agent/local/state_test.go
+++ b/agent/local/state_test.go
@ -1417,21 +1417,34 @@ func TestAgentAntiEntropy_Check_DeferSync(t *testing.T) {
 	// Update the check output! Should be deferred
 	a.State.UpdateCheck("web", api.HealthPassing, "output")

-	// Should not update for 500 milliseconds
-	time.Sleep(250 * time.Millisecond)
-	if err := a.RPC("Health.NodeChecks", &req, &checks); err != nil {
-		t.Fatalf("err: %v", err)
+	// We are going to wait up to 850ms for the deferred check update to run. The update
+	// can happen any time within: check_update_interval / 2 + random(min: 0, max: check_update_interval)
+	// For this test that means it will get deferred for 250ms - 750ms. We add up to 100ms on top of that to
+	// account for potentially slow tests on a overloaded system.
+	timer := &retry.Timer{Timeout: 850 * time.Millisecond, Wait: 50 * time.Millisecond}
+	start := time.Now()
+	retry.RunWith(timer, t, func(r *retry.R) {
+		cs := a.State.CheckState("web")
+		if cs == nil {
+			r.Fatalf("check is not registered")
 		}

-	// Verify not updated
-	for _, chk := range checks.HealthChecks {
-		switch chk.CheckID {
-		case "web":
-			if chk.Output != "" {
-				t.Fatalf("early update: %v", chk)
+		if cs.DeferCheck != nil {
+			r.Fatalf("Deferred Check timeout not removed yet")
 		}
+	})
+	elapsed := time.Since(start)
+
+	// ensure the check deferral didn't update too fast
+	if elapsed < 240*time.Millisecond {
+		t.Fatalf("early update: elapsed %v\n\n%+v", elapsed, checks)
 	}
+
+	// ensure the check deferral didn't update too late
+	if elapsed > 850*time.Millisecond {
+		t.Fatalf("late update: elapsed: %v\n\n%+v", elapsed, checks)
 	}
+
 	// Wait for a deferred update. TODO (slackpad) This isn't a great test
 	// because we might be stuck in the random stagger from the full sync
 	// after the leader election (~3 seconds) so it's easy to exceed the
@ -1441,10 +1454,14 @@ func TestAgentAntiEntropy_Check_DeferSync(t *testing.T) {
 	// good news is that the later update below should be well past the full
 	// sync so we are getting some coverage. We should rethink this a bit and
 	// rework the deferred update stuff to be more testable.
-	timer := &retry.Timer{Timeout: 6 * time.Second, Wait: 100 * time.Millisecond}
+	//
+	// TODO - figure out why after the deferred check calls TriggerSyncChanges that this
+	// takes so long to happen. I have seen it take upwards of 1.5s before the check gets
+	// synced.
+	timer = &retry.Timer{Timeout: 6 * time.Second, Wait: 100 * time.Millisecond}
 	retry.RunWith(timer, t, func(r *retry.R) {
 		if err := a.RPC("Health.NodeChecks", &req, &checks); err != nil {
-			r.Fatal(err)
+			r.Fatalf("err: %v", err)
 		}

 		// Verify updated