Vault CA bugfixes (#19285) * Re-add retry logic to Vault token renewal * Fix goroutine leak * Add test for detecting goroutine leak * Add changelog * Rename tests * Add comment
This commit is contained in:
parent
11ee74d054
commit
d547958f2c
|
@ -0,0 +1,7 @@
|
||||||
|
```release-note:bug
|
||||||
|
ca: Fix bug with Vault CA provider where token renewal goroutines could leak if the CA failed to initialize.
|
||||||
|
```
|
||||||
|
|
||||||
|
```release-note:bug
|
||||||
|
ca: Fix bug with Vault CA provider where renewing a revoked or otherwise invalid token would cause retries in a tight loop, degrading performance.
|
||||||
|
```
|
|
@ -22,6 +22,7 @@ import (
|
||||||
"github.com/hashicorp/consul/agent/structs"
|
"github.com/hashicorp/consul/agent/structs"
|
||||||
"github.com/hashicorp/consul/lib"
|
"github.com/hashicorp/consul/lib"
|
||||||
"github.com/hashicorp/consul/lib/decode"
|
"github.com/hashicorp/consul/lib/decode"
|
||||||
|
"github.com/hashicorp/consul/lib/retry"
|
||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
|
@ -177,11 +178,17 @@ func (v *VaultProvider) Configure(cfg ProviderConfig) error {
|
||||||
v.stopWatcher()
|
v.stopWatcher()
|
||||||
}
|
}
|
||||||
v.stopWatcher = cancel
|
v.stopWatcher = cancel
|
||||||
|
// NOTE: Any codepaths after v.renewToken(...) which return an error
|
||||||
|
// _must_ call v.stopWatcher() to prevent the renewal goroutine from
|
||||||
|
// leaking when the CA initialization fails and gets retried later.
|
||||||
go v.renewToken(ctx, lifetimeWatcher)
|
go v.renewToken(ctx, lifetimeWatcher)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Update the intermediate (managed) PKI mount and role
|
// Update the intermediate (managed) PKI mount and role
|
||||||
if err := v.setupIntermediatePKIPath(); err != nil {
|
if err := v.setupIntermediatePKIPath(); err != nil {
|
||||||
|
if v.stopWatcher != nil {
|
||||||
|
v.stopWatcher()
|
||||||
|
}
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -223,6 +230,16 @@ func (v *VaultProvider) renewToken(ctx context.Context, watcher *vaultapi.Lifeti
|
||||||
go watcher.Start()
|
go watcher.Start()
|
||||||
defer watcher.Stop()
|
defer watcher.Stop()
|
||||||
|
|
||||||
|
// These values are chosen to start the exponential backoff
|
||||||
|
// immediately. Since the Vault client implements its own
|
||||||
|
// retries, this retry is mostly to avoid resource contention
|
||||||
|
// and log spam.
|
||||||
|
retrier := retry.Waiter{
|
||||||
|
MinFailures: 1,
|
||||||
|
MinWait: 1 * time.Second,
|
||||||
|
Jitter: retry.NewJitter(20),
|
||||||
|
}
|
||||||
|
|
||||||
for {
|
for {
|
||||||
select {
|
select {
|
||||||
case <-ctx.Done():
|
case <-ctx.Done():
|
||||||
|
@ -231,7 +248,16 @@ func (v *VaultProvider) renewToken(ctx context.Context, watcher *vaultapi.Lifeti
|
||||||
case err := <-watcher.DoneCh():
|
case err := <-watcher.DoneCh():
|
||||||
// Watcher has stopped
|
// Watcher has stopped
|
||||||
if err != nil {
|
if err != nil {
|
||||||
v.logger.Error("Error renewing token for Vault provider", "error", err)
|
v.logger.Error("Error renewing token for Vault provider", "error", err, "retries", retrier.Failures())
|
||||||
|
}
|
||||||
|
|
||||||
|
// Although the vault watcher has its own retry logic, we have encountered
|
||||||
|
// issues when passing an invalid Vault token which would send an error to
|
||||||
|
// watcher.DoneCh() immediately, causing us to start the watcher over and
|
||||||
|
// over again in a very tight loop.
|
||||||
|
if err := retrier.Wait(ctx); err != nil {
|
||||||
|
// only possible error is when context is cancelled
|
||||||
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// If the watcher has exited and auth method is enabled,
|
// If the watcher has exited and auth method is enabled,
|
||||||
|
@ -265,6 +291,7 @@ func (v *VaultProvider) renewToken(ctx context.Context, watcher *vaultapi.Lifeti
|
||||||
go watcher.Start()
|
go watcher.Start()
|
||||||
|
|
||||||
case <-watcher.RenewCh():
|
case <-watcher.RenewCh():
|
||||||
|
retrier.Reset()
|
||||||
v.logger.Info("Successfully renewed token for Vault provider")
|
v.logger.Info("Successfully renewed token for Vault provider")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -8,6 +8,7 @@ import (
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
|
"runtime/pprof"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"sync/atomic"
|
"sync/atomic"
|
||||||
|
@ -225,8 +226,69 @@ func TestVaultCAProvider_Configure(t *testing.T) {
|
||||||
testcase.expectedValue(t, provider)
|
testcase.expectedValue(t, provider)
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return
|
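// This test must not run in parallel: it inspects the process-wide goroutine profile, so goroutines started by concurrently running tests could skew the result.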
|
||||||
|
func TestVaultCAProvider_ConfigureFailureGoroutineLeakCheck(t *testing.T) {
|
||||||
|
if testing.Short() {
|
||||||
|
t.Skip("too slow for testing.Short")
|
||||||
|
}
|
||||||
|
SkipIfVaultNotPresent(t)
|
||||||
|
|
||||||
|
testVault := NewTestVaultServer(t)
|
||||||
|
|
||||||
|
attr := &VaultTokenAttributes{
|
||||||
|
RootPath: "pki-root",
|
||||||
|
IntermediatePath: "pki-intermediate",
|
||||||
|
ConsulManaged: true,
|
||||||
|
}
|
||||||
|
token := CreateVaultTokenWithAttrs(t, testVault.client, attr)
|
||||||
|
|
||||||
|
provider := NewVaultProvider(hclog.New(&hclog.LoggerOptions{Name: "ca.vault"}))
|
||||||
|
|
||||||
|
t.Run("error on Configure does not leak renewal routine", func(t *testing.T) {
|
||||||
|
config := map[string]any{
|
||||||
|
"RootPKIPath": "pki-root/",
|
||||||
|
"IntermediatePKIPath": "badbadbad/",
|
||||||
|
}
|
||||||
|
cfg := vaultProviderConfig(t, testVault.Addr, token, config)
|
||||||
|
|
||||||
|
err := provider.Configure(cfg)
|
||||||
|
require.Error(t, err)
|
||||||
|
|
||||||
|
retry.RunWith(retry.TwoSeconds(), t, func(r *retry.R) {
|
||||||
|
profile := pprof.Lookup("goroutine")
|
||||||
|
sb := strings.Builder{}
|
||||||
|
require.NoError(r, profile.WriteTo(&sb, 2))
|
||||||
|
require.NotContains(r, sb.String(),
|
||||||
|
"created by github.com/hashicorp/consul/agent/connect/ca.(*VaultProvider).Configure",
|
||||||
|
"found renewal goroutine leak")
|
||||||
|
// If this test is failing because you added a new goroutine to
|
||||||
|
// (*VaultProvider).Configure AND that goroutine should persist
|
||||||
|
// even if Configure errored, then you should change the checked
|
||||||
|
// string to (*VaultProvider).renewToken.
|
||||||
|
})
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("successful Configure starts renewal routine", func(t *testing.T) {
|
||||||
|
config := map[string]any{
|
||||||
|
"RootPKIPath": "pki-root/",
|
||||||
|
"IntermediatePKIPath": "pki-intermediate/",
|
||||||
|
}
|
||||||
|
cfg := vaultProviderConfig(t, testVault.Addr, token, config)
|
||||||
|
|
||||||
|
require.NoError(t, provider.Configure(cfg))
|
||||||
|
|
||||||
|
retry.RunWith(retry.TwoSeconds(), t, func(r *retry.R) {
|
||||||
|
profile := pprof.Lookup("goroutine")
|
||||||
|
sb := strings.Builder{}
|
||||||
|
require.NoError(r, profile.WriteTo(&sb, 2))
|
||||||
|
t.Log(sb.String())
|
||||||
|
require.Contains(r, sb.String(),
|
||||||
|
"created by github.com/hashicorp/consul/agent/connect/ca.(*VaultProvider).Configure",
|
||||||
|
"expected renewal goroutine, got none")
|
||||||
|
})
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestVaultCAProvider_SecondaryActiveIntermediate(t *testing.T) {
|
func TestVaultCAProvider_SecondaryActiveIntermediate(t *testing.T) {
|
||||||
|
|
Loading…
Reference in New Issue