[1.16.x] Vault CA bugfixes (#19285) (#19336)

Vault CA bugfixes (#19285)

* Re-add retry logic to Vault token renewal

* Fix goroutine leak

* Add test for detecting goroutine leak

* Add changelog

* Rename tests

* Add comment
This commit is contained in:
Chris S. Kim 2023-10-23 10:14:05 -04:00 committed by GitHub
parent 11ee74d054
commit d547958f2c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 98 additions and 2 deletions

7
.changelog/19285.txt Normal file
View File

@ -0,0 +1,7 @@
```release-note:bug
ca: Fix bug with Vault CA provider where token renewal goroutines could leak if the CA failed to initialize.
```
```release-note:bug
ca: Fix bug with Vault CA provider where renewing an invalid (e.g. revoked) token would cause retries in a tight loop, degrading performance.
```

View File

@ -22,6 +22,7 @@ import (
"github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/agent/structs"
"github.com/hashicorp/consul/lib" "github.com/hashicorp/consul/lib"
"github.com/hashicorp/consul/lib/decode" "github.com/hashicorp/consul/lib/decode"
"github.com/hashicorp/consul/lib/retry"
) )
const ( const (
@ -177,11 +178,17 @@ func (v *VaultProvider) Configure(cfg ProviderConfig) error {
v.stopWatcher() v.stopWatcher()
} }
v.stopWatcher = cancel v.stopWatcher = cancel
// NOTE: Any codepaths after v.renewToken(...) which return an error
// _must_ call v.stopWatcher() to prevent the renewal goroutine from
// leaking when the CA initialization fails and gets retried later.
go v.renewToken(ctx, lifetimeWatcher) go v.renewToken(ctx, lifetimeWatcher)
} }
// Update the intermediate (managed) PKI mount and role // Update the intermediate (managed) PKI mount and role
if err := v.setupIntermediatePKIPath(); err != nil { if err := v.setupIntermediatePKIPath(); err != nil {
if v.stopWatcher != nil {
v.stopWatcher()
}
return err return err
} }
@ -223,6 +230,16 @@ func (v *VaultProvider) renewToken(ctx context.Context, watcher *vaultapi.Lifeti
go watcher.Start() go watcher.Start()
defer watcher.Stop() defer watcher.Stop()
// These values are chosen to start the exponential backoff
// immediately. Since the Vault client implements its own
// retries, this retry is mostly to avoid resource contention
// and log spam.
retrier := retry.Waiter{
MinFailures: 1,
MinWait: 1 * time.Second,
Jitter: retry.NewJitter(20),
}
for { for {
select { select {
case <-ctx.Done(): case <-ctx.Done():
@ -231,7 +248,16 @@ func (v *VaultProvider) renewToken(ctx context.Context, watcher *vaultapi.Lifeti
case err := <-watcher.DoneCh(): case err := <-watcher.DoneCh():
// Watcher has stopped // Watcher has stopped
if err != nil { if err != nil {
v.logger.Error("Error renewing token for Vault provider", "error", err) v.logger.Error("Error renewing token for Vault provider", "error", err, "retries", retrier.Failures())
}
// Although the vault watcher has its own retry logic, we have encountered
// issues when passing an invalid Vault token which would send an error to
// watcher.DoneCh() immediately, causing us to start the watcher over and
// over again in a very tight loop.
if err := retrier.Wait(ctx); err != nil {
// only possible error is when context is cancelled
return
} }
// If the watcher has exited and auth method is enabled, // If the watcher has exited and auth method is enabled,
@ -265,6 +291,7 @@ func (v *VaultProvider) renewToken(ctx context.Context, watcher *vaultapi.Lifeti
go watcher.Start() go watcher.Start()
case <-watcher.RenewCh(): case <-watcher.RenewCh():
retrier.Reset()
v.logger.Info("Successfully renewed token for Vault provider") v.logger.Info("Successfully renewed token for Vault provider")
} }
} }

View File

@ -8,6 +8,7 @@ import (
"encoding/json" "encoding/json"
"fmt" "fmt"
"io" "io"
"runtime/pprof"
"strconv" "strconv"
"strings" "strings"
"sync/atomic" "sync/atomic"
@ -225,8 +226,69 @@ func TestVaultCAProvider_Configure(t *testing.T) {
testcase.expectedValue(t, provider) testcase.expectedValue(t, provider)
}) })
} }
}
return // This test must not run in parallel
// TestVaultCAProvider_ConfigureFailureGoroutineLeakCheck checks the lifetime
// of the goroutine that (*VaultProvider).Configure starts for token renewal:
// it must be gone after Configure returns an error, and it must be present
// after Configure succeeds.
//
// The check works by dumping the "goroutine" pprof profile and searching the
// text for a "created by ...(*VaultProvider).Configure" line. That profile
// lists every goroutine in the process, which is presumably why this test
// must not run in parallel with other tests that call Configure.
func TestVaultCAProvider_ConfigureFailureGoroutineLeakCheck(t *testing.T) {
if testing.Short() {
t.Skip("too slow for testing.Short")
}
// NOTE(review): presumably skips the test when no Vault binary is
// available; the helper is defined elsewhere.
SkipIfVaultNotPresent(t)
testVault := NewTestVaultServer(t)
attr := &VaultTokenAttributes{
RootPath: "pki-root",
IntermediatePath: "pki-intermediate",
ConsulManaged: true,
}
token := CreateVaultTokenWithAttrs(t, testVault.client, attr)
// A single provider is shared by both subtests below, so the second
// subtest reconfigures the provider that failed in the first one.
provider := NewVaultProvider(hclog.New(&hclog.LoggerOptions{Name: "ca.vault"}))
t.Run("error on Configure does not leak renewal routine", func(t *testing.T) {
// The intermediate path differs from the "pki-intermediate" path the
// token attributes were created with; Configure is expected to fail.
config := map[string]any{
"RootPKIPath": "pki-root/",
"IntermediatePKIPath": "badbadbad/",
}
cfg := vaultProviderConfig(t, testVault.Addr, token, config)
err := provider.Configure(cfg)
require.Error(t, err)
// Retried because the renewal goroutine may need a moment to observe
// cancellation and exit after Configure has returned.
retry.RunWith(retry.TwoSeconds(), t, func(r *retry.R) {
profile := pprof.Lookup("goroutine")
sb := strings.Builder{}
// debug=2 prints full goroutine stacks, including "created by" lines.
require.NoError(r, profile.WriteTo(&sb, 2))
require.NotContains(r, sb.String(),
"created by github.com/hashicorp/consul/agent/connect/ca.(*VaultProvider).Configure",
"found renewal goroutine leak")
// If this test is failing because you added a new goroutine to
// (*VaultProvider).Configure AND that goroutine should persist
// even if Configure errored, then you should change the checked
// string to (*VaultProvider).renewToken.
})
})
t.Run("successful Configure starts renewal routine", func(t *testing.T) {
// Same root path as above, but with the intermediate path the token
// attributes were created with; Configure must succeed.
config := map[string]any{
"RootPKIPath": "pki-root/",
"IntermediatePKIPath": "pki-intermediate/",
}
cfg := vaultProviderConfig(t, testVault.Addr, token, config)
require.NoError(t, provider.Configure(cfg))
retry.RunWith(retry.TwoSeconds(), t, func(r *retry.R) {
profile := pprof.Lookup("goroutine")
sb := strings.Builder{}
require.NoError(r, profile.WriteTo(&sb, 2))
// Logs the full goroutine dump on every retry attempt.
t.Log(sb.String())
require.Contains(r, sb.String(),
"created by github.com/hashicorp/consul/agent/connect/ca.(*VaultProvider).Configure",
"expected renewal goroutine, got none")
})
})
} }
func TestVaultCAProvider_SecondaryActiveIntermediate(t *testing.T) { func TestVaultCAProvider_SecondaryActiveIntermediate(t *testing.T) {