connect: connect CA Roots in the primary datacenter should use a SigningKeyID derived from their local intermediate (#9428)

This fixes an issue where leaf certificates issued in primary
datacenters using Vault as a Connect CA would be reissued very
frequently (every ~20 seconds) because the logic meant to detect root
rotation was errantly triggering.

The hash of the rootCA was being compared against a hash of the
intermediateCA and always failing. This doesn't apply to the Consul
built-in CA provider because there is no intermediate in use in the
primary DC.

This is reminiscent of #6513
This commit is contained in:
R.B. Boyer 2021-02-08 13:18:51 -06:00 committed by GitHub
parent 8b91cae80f
commit 91d9544803
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 254 additions and 2 deletions

3
.changelog/9428.txt Normal file
View File

@ -0,0 +1,3 @@
```release-note:bug
connect: connect CA Roots in the primary datacenter should use a SigningKeyID derived from their local intermediate
```

View File

@ -359,10 +359,14 @@ ifeq ("$(CIRCLECI)","true")
gotestsum --format=short-verbose --junitfile "$(TEST_RESULTS_DIR)/gotestsum-report.xml" -- -cover -coverprofile=coverage.txt ./agent/connect/ca
# Run leader tests that require Vault
gotestsum --format=short-verbose --junitfile "$(TEST_RESULTS_DIR)/gotestsum-report-leader.xml" -- -cover -coverprofile=coverage-leader.txt -run TestLeader_Vault_ ./agent/consul
# Run agent tests that require Vault
gotestsum --format=short-verbose --junitfile "$(TEST_RESULTS_DIR)/gotestsum-report-agent.xml" -- -cover -coverprofile=coverage-agent.txt -run '.*_Vault_' ./agent
else
# Run locally
@echo "Running /agent/connect/ca tests in verbose mode"
@go test -v ./agent/connect/ca
@go test -v ./agent/consul -run 'TestLeader_Vault_'
@go test -v ./agent -run '.*_Vault_'
endif
proto: $(PROTOGOFILES) $(PROTOGOBINFILES)

View File

@ -27,6 +27,7 @@ import (
"github.com/hashicorp/consul/acl"
"github.com/hashicorp/consul/agent/config"
"github.com/hashicorp/consul/agent/connect"
"github.com/hashicorp/consul/agent/connect/ca"
"github.com/hashicorp/consul/agent/consul"
"github.com/hashicorp/consul/agent/debug"
"github.com/hashicorp/consul/agent/local"
@ -5788,6 +5789,131 @@ func TestAgentConnectCALeafCert_goodNotLocal(t *testing.T) {
}
}
func TestAgentConnectCALeafCert_Vault_doesNotChurnLeafCertsAtIdle(t *testing.T) {
ca.SkipIfVaultNotPresent(t)
if testing.Short() {
t.Skip("too slow for testing.Short")
}
t.Parallel()
testVault := ca.NewTestVaultServer(t)
defer testVault.Stop()
assert := assert.New(t)
require := require.New(t)
a := StartTestAgent(t, TestAgent{Overrides: fmt.Sprintf(`
connect {
test_ca_leaf_root_change_spread = "1ns"
ca_provider = "vault"
ca_config {
address = %[1]q
token = %[2]q
root_pki_path = "pki-root/"
intermediate_pki_path = "pki-intermediate/"
}
}
`, testVault.Addr, testVault.RootToken)})
defer a.Shutdown()
testrpc.WaitForTestAgent(t, a.RPC, "dc1")
testrpc.WaitForActiveCARoot(t, a.RPC, "dc1", nil)
var ca1 *structs.CARoot
{
args := &structs.DCSpecificRequest{Datacenter: "dc1"}
var reply structs.IndexedCARoots
require.NoError(a.RPC("ConnectCA.Roots", args, &reply))
for _, r := range reply.Roots {
if r.ID == reply.ActiveRootID {
ca1 = r
break
}
}
require.NotNil(ca1)
}
{
// Register a local service
args := &structs.ServiceDefinition{
ID: "foo",
Name: "test",
Address: "127.0.0.1",
Port: 8000,
Check: structs.CheckType{
TTL: 15 * time.Second,
},
}
req, _ := http.NewRequest("PUT", "/v1/agent/service/register", jsonReader(args))
resp := httptest.NewRecorder()
_, err := a.srv.AgentRegisterService(resp, req)
require.NoError(err)
if !assert.Equal(200, resp.Code) {
t.Log("Body: ", resp.Body.String())
}
}
// List
req, _ := http.NewRequest("GET", "/v1/agent/connect/ca/leaf/test", nil)
resp := httptest.NewRecorder()
obj, err := a.srv.AgentConnectCALeafCert(resp, req)
require.NoError(err)
require.Equal("MISS", resp.Header().Get("X-Cache"))
// Get the issued cert
issued, ok := obj.(*structs.IssuedCert)
assert.True(ok)
// Verify that the cert is signed by the CA
requireLeafValidUnderCA(t, issued, ca1)
// Verify blocking index
assert.True(issued.ModifyIndex > 0)
assert.Equal(fmt.Sprintf("%d", issued.ModifyIndex),
resp.Header().Get("X-Consul-Index"))
// Test caching
{
// Fetch it again
resp := httptest.NewRecorder()
obj2, err := a.srv.AgentConnectCALeafCert(resp, req)
require.NoError(err)
require.Equal(obj, obj2)
// Should cache hit this time and not make request
require.Equal("HIT", resp.Header().Get("X-Cache"))
}
// Test that we aren't churning leaves for no reason at idle.
{
ch := make(chan error, 1)
go func() {
req, _ := http.NewRequest("GET", "/v1/agent/connect/ca/leaf/test?index="+strconv.Itoa(int(issued.ModifyIndex)), nil)
resp := httptest.NewRecorder()
obj, err := a.srv.AgentConnectCALeafCert(resp, req)
if err != nil {
ch <- err
} else {
issued2 := obj.(*structs.IssuedCert)
if issued.CertPEM == issued2.CertPEM {
ch <- fmt.Errorf("leaf woke up unexpectedly with same cert")
} else {
ch <- fmt.Errorf("leaf woke up unexpectedly with new cert")
}
}
}()
start := time.Now()
select {
case <-time.After(5 * time.Second):
case err := <-ch:
dur := time.Since(start)
t.Fatalf("unexpected return from blocking query; leaf churned during idle period, took %s: %v", dur, err)
}
}
}
func TestAgentConnectCALeafCert_secondaryDC_good(t *testing.T) {
if testing.Short() {
t.Skip("too slow for testing.Short")

View File

@ -416,7 +416,7 @@ func (c *CAManager) initializeRootCA(provider ca.Provider, conf *structs.CAConfi
if err != nil {
return fmt.Errorf("error generating intermediate cert: %v", err)
}
_, err = connect.ParseCert(interPEM)
intermediateCert, err := connect.ParseCert(interPEM)
if err != nil {
return fmt.Errorf("error getting intermediate cert: %v", err)
}
@ -439,6 +439,13 @@ func (c *CAManager) initializeRootCA(provider ca.Provider, conf *structs.CAConfi
}
}
// Versions prior to 1.9.3, 1.8.8, and 1.7.12 incorrectly used the primary
// rootCA's subjectKeyID here instead of the intermediate. For
// provider=consul this didn't matter since there are no intermediates in
// the primaryDC, but for vault it does matter.
expectedSigningKeyID := connect.EncodeSigningKeyID(intermediateCert.SubjectKeyId)
needsSigningKeyUpdate := (rootCA.SigningKeyID != expectedSigningKeyID)
// Check if the CA root is already initialized and exit if it is,
// adding on any existing intermediate certs since they aren't directly
// tied to the provider.
@ -449,7 +456,10 @@ func (c *CAManager) initializeRootCA(provider ca.Provider, conf *structs.CAConfi
if err != nil {
return err
}
if activeRoot != nil {
if activeRoot != nil && needsSigningKeyUpdate {
c.logger.Info("Correcting stored SigningKeyID value", "previous", rootCA.SigningKeyID, "updated", expectedSigningKeyID)
} else if activeRoot != nil && !needsSigningKeyUpdate {
// This state shouldn't be possible to get into because we update the root and
// CA config in the same FSM operation.
if activeRoot.ID != rootCA.ID {
@ -462,6 +472,10 @@ func (c *CAManager) initializeRootCA(provider ca.Provider, conf *structs.CAConfi
return nil
}
if needsSigningKeyUpdate {
rootCA.SigningKeyID = expectedSigningKeyID
}
// Get the highest index
idx, _, err := state.CARoots(nil)
if err != nil {

View File

@ -579,6 +579,111 @@ func TestLeader_SecondaryCA_IntermediateRefresh(t *testing.T) {
require.NoError(err)
}
func TestLeader_Vault_PrimaryCA_FixSigningKeyID_OnRestart(t *testing.T) {
ca.SkipIfVaultNotPresent(t)
if testing.Short() {
t.Skip("too slow for testing.Short")
}
t.Parallel()
testVault := ca.NewTestVaultServer(t)
defer testVault.Stop()
dir1pre, s1pre := testServerWithConfig(t, func(c *Config) {
c.Build = "1.6.0"
c.PrimaryDatacenter = "dc1"
c.CAConfig = &structs.CAConfiguration{
Provider: "vault",
Config: map[string]interface{}{
"Address": testVault.Addr,
"Token": testVault.RootToken,
"RootPKIPath": "pki-root/",
"IntermediatePKIPath": "pki-intermediate/",
},
}
})
defer os.RemoveAll(dir1pre)
defer s1pre.Shutdown()
testrpc.WaitForLeader(t, s1pre.RPC, "dc1")
// Restore the pre-1.9.3/1.8.8/1.7.12 behavior of the SigningKeyID not being derived
// from the intermediates in the primary (which only matters for provider=vault).
var primaryRootSigningKeyID string
{
state := s1pre.fsm.State()
// Get the highest index
idx, activePrimaryRoot, err := state.CARootActive(nil)
require.NoError(t, err)
require.NotNil(t, activePrimaryRoot)
rootCert, err := connect.ParseCert(activePrimaryRoot.RootCert)
require.NoError(t, err)
// Force this to be derived just from the root, not the intermediate.
primaryRootSigningKeyID = connect.EncodeSigningKeyID(rootCert.SubjectKeyId)
activePrimaryRoot.SigningKeyID = primaryRootSigningKeyID
// Store the root cert in raft
resp, err := s1pre.raftApply(structs.ConnectCARequestType, &structs.CARequest{
Op: structs.CAOpSetRoots,
Index: idx,
Roots: []*structs.CARoot{activePrimaryRoot},
})
require.NoError(t, err)
if respErr, ok := resp.(error); ok {
t.Fatalf("respErr: %v", respErr)
}
}
// Shutdown s1pre and restart it to trigger the secondary CA init to correct
// the SigningKeyID.
s1pre.Shutdown()
dir1, s1 := testServerWithConfig(t, func(c *Config) {
c.DataDir = s1pre.config.DataDir
c.Datacenter = "dc1"
c.PrimaryDatacenter = "dc1"
c.NodeName = s1pre.config.NodeName
c.NodeID = s1pre.config.NodeID
})
defer os.RemoveAll(dir1)
defer s1.Shutdown()
testrpc.WaitForLeader(t, s1.RPC, "dc1")
// Retry since it will take some time to init the primary CA fully and there
// isn't a super clean way to watch specifically until it's done than polling
// the CA provider anyway.
retry.Run(t, func(r *retry.R) {
// verify that the root is now corrected
provider, activeRoot := getCAProviderWithLock(s1)
require.NotNil(r, provider)
require.NotNil(r, activeRoot)
activeIntermediate, err := provider.ActiveIntermediate()
require.NoError(r, err)
intermediateCert, err := connect.ParseCert(activeIntermediate)
require.NoError(r, err)
// Force this to be derived just from the root, not the intermediate.
expect := connect.EncodeSigningKeyID(intermediateCert.SubjectKeyId)
// The in-memory representation was saw the correction via a setCAProvider call.
require.Equal(r, expect, activeRoot.SigningKeyID)
// The state store saw the correction, too.
_, activePrimaryRoot, err := s1.fsm.State().CARootActive(nil)
require.NoError(r, err)
require.NotNil(r, activePrimaryRoot)
require.Equal(r, expect, activePrimaryRoot.SigningKeyID)
})
}
func TestLeader_SecondaryCA_FixSigningKeyID_via_IntermediateRefresh(t *testing.T) {
if testing.Short() {
t.Skip("too slow for testing.Short")