Add telemetry to Vault agent (#13675)
This patch adds a new /agent/v1/metrics that will return metrics on the running Vault agent. Configuration is done using the same telemetry stanza as the Vault server. For now default runtime metrics are returned with a few additional ones specific to the agent: - `vault.agent.auth.failure` and `vault.agent.auth.success` to monitor the correct behavior of the auto auth mechanism - `vault.agent.proxy.success`, `vault.agent.proxy.client_error` and `vault.agent.proxy.error` to check the connection with the Vault server - `vault.agent.cache.hit` and `vault.agent.cache.miss` to monitor the cache Closes https://github.com/hashicorp/vault/issues/8649 Co-authored-by: Theron Voran <tvoran@users.noreply.github.com>
This commit is contained in:
parent
880a8143e0
commit
98b18ee08e
|
@ -225,7 +225,7 @@ func (r *LifetimeWatcher) Start() {
|
|||
r.doneCh <- r.doRenew()
|
||||
}
|
||||
|
||||
// Renew is for comnpatibility with the legacy api.Renewer. Calling Renew
|
||||
// Renew is for compatibility with the legacy api.Renewer. Calling Renew
|
||||
// simply chains to Start.
|
||||
func (r *LifetimeWatcher) Renew() {
|
||||
r.Start()
|
||||
|
|
|
@ -0,0 +1,4 @@
|
|||
```release-note:feature
|
||||
agent: The Vault agent now returns telemetry information at the `/agent/v1/metrics`
|
||||
path.
|
||||
```
|
|
@ -40,10 +40,12 @@ import (
|
|||
"github.com/hashicorp/vault/command/agent/sink/inmem"
|
||||
"github.com/hashicorp/vault/command/agent/template"
|
||||
"github.com/hashicorp/vault/command/agent/winsvc"
|
||||
"github.com/hashicorp/vault/helper/metricsutil"
|
||||
"github.com/hashicorp/vault/internalshared/configutil"
|
||||
"github.com/hashicorp/vault/internalshared/listenerutil"
|
||||
"github.com/hashicorp/vault/sdk/helper/consts"
|
||||
"github.com/hashicorp/vault/sdk/helper/logging"
|
||||
"github.com/hashicorp/vault/sdk/helper/useragent"
|
||||
"github.com/hashicorp/vault/sdk/logical"
|
||||
"github.com/hashicorp/vault/sdk/version"
|
||||
"github.com/kr/pretty"
|
||||
|
@ -68,6 +70,9 @@ type AgentCommand struct {
|
|||
logGate *gatedwriter.Writer
|
||||
logger log.Logger
|
||||
|
||||
// Telemetry object
|
||||
metricsHelper *metricsutil.MetricsHelper
|
||||
|
||||
cleanupGuard sync.Once
|
||||
|
||||
startedCh chan (struct{}) // for tests
|
||||
|
@ -340,6 +345,21 @@ func (c *AgentCommand) Run(args []string) int {
|
|||
ctx, cancelFunc := context.WithCancel(context.Background())
|
||||
defer cancelFunc()
|
||||
|
||||
// telemetry configuration
|
||||
inmemMetrics, _, prometheusEnabled, err := configutil.SetupTelemetry(&configutil.SetupTelemetryOpts{
|
||||
Config: config.Telemetry,
|
||||
Ui: c.UI,
|
||||
ServiceName: "vault",
|
||||
DisplayName: "Vault",
|
||||
UserAgent: useragent.String(),
|
||||
ClusterName: config.ClusterName,
|
||||
})
|
||||
if err != nil {
|
||||
c.UI.Error(fmt.Sprintf("Error initializing telemetry: %s", err))
|
||||
return 1
|
||||
}
|
||||
c.metricsHelper = metricsutil.NewMetricsHelper(inmemMetrics, prometheusEnabled)
|
||||
|
||||
var method auth.AuthMethod
|
||||
var sinks []*sink.SinkConfig
|
||||
var templateNamespace string
|
||||
|
@ -696,6 +716,7 @@ func (c *AgentCommand) Run(args []string) int {
|
|||
// Create a muxer and add paths relevant for the lease cache layer
|
||||
mux := http.NewServeMux()
|
||||
mux.Handle(consts.AgentPathCacheClear, leaseCache.HandleCacheClear(ctx))
|
||||
mux.Handle(consts.AgentPathMetrics, c.handleMetrics())
|
||||
mux.Handle("/", muxHandler)
|
||||
|
||||
scheme := "https://"
|
||||
|
@ -990,3 +1011,39 @@ func getServiceAccountJWT(tokenFile string) (string, error) {
|
|||
}
|
||||
return strings.TrimSpace(string(token)), nil
|
||||
}
|
||||
|
||||
func (c *AgentCommand) handleMetrics() http.Handler {
|
||||
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
if r.Method != http.MethodGet {
|
||||
logical.RespondError(w, http.StatusMethodNotAllowed, nil)
|
||||
return
|
||||
}
|
||||
|
||||
if err := r.ParseForm(); err != nil {
|
||||
logical.RespondError(w, http.StatusBadRequest, err)
|
||||
return
|
||||
}
|
||||
|
||||
format := r.Form.Get("format")
|
||||
if format == "" {
|
||||
format = metricsutil.FormatFromRequest(&logical.Request{
|
||||
Headers: r.Header,
|
||||
})
|
||||
}
|
||||
|
||||
resp := c.metricsHelper.ResponseForFormat(format)
|
||||
|
||||
status := resp.Data[logical.HTTPStatusCode].(int)
|
||||
w.Header().Set("Content-Type", resp.Data[logical.HTTPContentType].(string))
|
||||
switch v := resp.Data[logical.HTTPRawBody].(type) {
|
||||
case string:
|
||||
w.WriteHeader((status))
|
||||
w.Write([]byte(v))
|
||||
case []byte:
|
||||
w.WriteHeader(status)
|
||||
w.Write(v)
|
||||
default:
|
||||
logical.RespondError(w, http.StatusInternalServerError, fmt.Errorf("wrong response returned"))
|
||||
}
|
||||
})
|
||||
}
|
||||
|
|
|
@ -8,6 +8,7 @@ import (
|
|||
"net/http"
|
||||
"time"
|
||||
|
||||
"github.com/armon/go-metrics"
|
||||
"github.com/hashicorp/go-hclog"
|
||||
"github.com/hashicorp/vault/api"
|
||||
"github.com/hashicorp/vault/sdk/helper/jsonutil"
|
||||
|
@ -156,6 +157,7 @@ func (ah *AuthHandler) Run(ctx context.Context, am AuthMethod) error {
|
|||
if err != nil {
|
||||
ah.logger.Error("error creating client for authentication call", "error", err, "backoff", backoff)
|
||||
backoffOrQuit(ctx, backoff)
|
||||
metrics.IncrCounter([]string{"agent", "auth", "failure"}, 1)
|
||||
continue
|
||||
}
|
||||
default:
|
||||
|
@ -174,6 +176,7 @@ func (ah *AuthHandler) Run(ctx context.Context, am AuthMethod) error {
|
|||
if err != nil {
|
||||
ah.logger.Error("could not look up token", "err", err, "backoff", backoff)
|
||||
backoffOrQuit(ctx, backoff)
|
||||
metrics.IncrCounter([]string{"agent", "auth", "failure"}, 1)
|
||||
continue
|
||||
}
|
||||
|
||||
|
@ -190,6 +193,7 @@ func (ah *AuthHandler) Run(ctx context.Context, am AuthMethod) error {
|
|||
if err != nil {
|
||||
ah.logger.Error("error getting path or data from method", "error", err, "backoff", backoff)
|
||||
backoffOrQuit(ctx, backoff)
|
||||
metrics.IncrCounter([]string{"agent", "auth", "failure"}, 1)
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
@ -199,6 +203,7 @@ func (ah *AuthHandler) Run(ctx context.Context, am AuthMethod) error {
|
|||
if err != nil {
|
||||
ah.logger.Error("error creating client for wrapped call", "error", err, "backoff", backoff)
|
||||
backoffOrQuit(ctx, backoff)
|
||||
metrics.IncrCounter([]string{"agent", "auth", "failure"}, 1)
|
||||
continue
|
||||
}
|
||||
wrapClient.SetWrappingLookupFunc(func(string, string) string {
|
||||
|
@ -220,6 +225,7 @@ func (ah *AuthHandler) Run(ctx context.Context, am AuthMethod) error {
|
|||
if err != nil {
|
||||
ah.logger.Error("error authenticating", "error", err, "backoff", backoff)
|
||||
backoffOrQuit(ctx, backoff)
|
||||
metrics.IncrCounter([]string{"agent", "auth", "failure"}, 1)
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
@ -229,17 +235,20 @@ func (ah *AuthHandler) Run(ctx context.Context, am AuthMethod) error {
|
|||
if secret.WrapInfo == nil {
|
||||
ah.logger.Error("authentication returned nil wrap info", "backoff", backoff)
|
||||
backoffOrQuit(ctx, backoff)
|
||||
metrics.IncrCounter([]string{"agent", "auth", "failure"}, 1)
|
||||
continue
|
||||
}
|
||||
if secret.WrapInfo.Token == "" {
|
||||
ah.logger.Error("authentication returned empty wrapped client token", "backoff", backoff)
|
||||
backoffOrQuit(ctx, backoff)
|
||||
metrics.IncrCounter([]string{"agent", "auth", "failure"}, 1)
|
||||
continue
|
||||
}
|
||||
wrappedResp, err := jsonutil.EncodeJSON(secret.WrapInfo)
|
||||
if err != nil {
|
||||
ah.logger.Error("failed to encode wrapinfo", "error", err, "backoff", backoff)
|
||||
backoffOrQuit(ctx, backoff)
|
||||
metrics.IncrCounter([]string{"agent", "auth", "failure"}, 1)
|
||||
continue
|
||||
}
|
||||
ah.logger.Info("authentication successful, sending wrapped token to sinks and pausing")
|
||||
|
@ -265,11 +274,13 @@ func (ah *AuthHandler) Run(ctx context.Context, am AuthMethod) error {
|
|||
if secret == nil || secret.Auth == nil {
|
||||
ah.logger.Error("authentication returned nil auth info", "backoff", backoff)
|
||||
backoffOrQuit(ctx, backoff)
|
||||
metrics.IncrCounter([]string{"agent", "auth", "failure"}, 1)
|
||||
continue
|
||||
}
|
||||
if secret.Auth.ClientToken == "" {
|
||||
ah.logger.Error("authentication returned empty client token", "backoff", backoff)
|
||||
backoffOrQuit(ctx, backoff)
|
||||
metrics.IncrCounter([]string{"agent", "auth", "failure"}, 1)
|
||||
continue
|
||||
}
|
||||
ah.logger.Info("authentication successful, sending token to sinks")
|
||||
|
@ -292,11 +303,13 @@ func (ah *AuthHandler) Run(ctx context.Context, am AuthMethod) error {
|
|||
if err != nil {
|
||||
ah.logger.Error("error creating lifetime watcher, backing off and retrying", "error", err, "backoff", backoff)
|
||||
backoffOrQuit(ctx, backoff)
|
||||
metrics.IncrCounter([]string{"agent", "auth", "failure"}, 1)
|
||||
continue
|
||||
}
|
||||
|
||||
// Start the renewal process
|
||||
ah.logger.Info("starting renewal process")
|
||||
metrics.IncrCounter([]string{"agent", "auth", "success"}, 1)
|
||||
go watcher.Renew()
|
||||
|
||||
LifetimeWatcherLoop:
|
||||
|
@ -310,11 +323,13 @@ func (ah *AuthHandler) Run(ctx context.Context, am AuthMethod) error {
|
|||
case err := <-watcher.DoneCh():
|
||||
ah.logger.Info("lifetime watcher done channel triggered")
|
||||
if err != nil {
|
||||
metrics.IncrCounter([]string{"agent", "auth", "failure"}, 1)
|
||||
ah.logger.Error("error renewing token", "error", err)
|
||||
}
|
||||
break LifetimeWatcherLoop
|
||||
|
||||
case <-watcher.RenewCh():
|
||||
metrics.IncrCounter([]string{"agent", "auth", "success"}, 1)
|
||||
ah.logger.Info("renewed auth token")
|
||||
|
||||
case <-credCh:
|
||||
|
|
|
@ -12,6 +12,7 @@ import (
|
|||
"net/http"
|
||||
"time"
|
||||
|
||||
"github.com/armon/go-metrics"
|
||||
hclog "github.com/hashicorp/go-hclog"
|
||||
"github.com/hashicorp/vault/api"
|
||||
"github.com/hashicorp/vault/command/agent/sink"
|
||||
|
@ -58,7 +59,9 @@ func Handler(ctx context.Context, logger hclog.Logger, proxier Proxier, inmemSin
|
|||
copyHeader(w.Header(), resp.Response.Header)
|
||||
w.WriteHeader(resp.Response.StatusCode)
|
||||
io.Copy(w, resp.Response.Body)
|
||||
metrics.IncrCounter([]string{"agent", "proxy", "client_error"}, 1)
|
||||
} else {
|
||||
metrics.IncrCounter([]string{"agent", "proxy", "error"}, 1)
|
||||
logical.RespondError(w, http.StatusInternalServerError, fmt.Errorf("failed to get the response: %w", err))
|
||||
}
|
||||
return
|
||||
|
@ -72,6 +75,15 @@ func Handler(ctx context.Context, logger hclog.Logger, proxier Proxier, inmemSin
|
|||
|
||||
defer resp.Response.Body.Close()
|
||||
|
||||
metrics.IncrCounter([]string{"agent", "proxy", "success"}, 1)
|
||||
if resp.CacheMeta != nil {
|
||||
if resp.CacheMeta.Hit {
|
||||
metrics.IncrCounter([]string{"agent", "cache", "hit"}, 1)
|
||||
} else {
|
||||
metrics.IncrCounter([]string{"agent", "cache", "miss"}, 1)
|
||||
}
|
||||
}
|
||||
|
||||
// Set headers
|
||||
setHeaders(w, resp)
|
||||
|
||||
|
|
|
@ -26,6 +26,7 @@ import (
|
|||
"github.com/hashicorp/vault/sdk/logical"
|
||||
"github.com/hashicorp/vault/vault"
|
||||
"github.com/mitchellh/cli"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func testAgentCommand(tb testing.TB, logger hclog.Logger) (*cli.MockUi, *AgentCommand) {
|
||||
|
@ -1999,3 +2000,85 @@ vault {
|
|||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestAgent_Metrics(t *testing.T) {
|
||||
//----------------------------------------------------
|
||||
// Start the server and agent
|
||||
//----------------------------------------------------
|
||||
|
||||
// Start a vault server
|
||||
logger := logging.NewVaultLogger(hclog.Trace)
|
||||
cluster := vault.NewTestCluster(t,
|
||||
&vault.CoreConfig{
|
||||
Logger: logger,
|
||||
},
|
||||
&vault.TestClusterOptions{
|
||||
HandlerFunc: vaulthttp.Handler,
|
||||
})
|
||||
cluster.Start()
|
||||
defer cluster.Cleanup()
|
||||
vault.TestWaitActive(t, cluster.Cores[0].Core)
|
||||
serverClient := cluster.Cores[0].Client
|
||||
|
||||
// Create a config file
|
||||
config := `
|
||||
cache {}
|
||||
|
||||
listener "tcp" {
|
||||
address = "127.0.0.1:8101"
|
||||
tls_disable = true
|
||||
}
|
||||
`
|
||||
configPath := makeTempFile(t, "config.hcl", config)
|
||||
defer os.Remove(configPath)
|
||||
|
||||
// Start the agent
|
||||
ui, cmd := testAgentCommand(t, logger)
|
||||
cmd.client = serverClient
|
||||
cmd.startedCh = make(chan struct{})
|
||||
|
||||
wg := &sync.WaitGroup{}
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
code := cmd.Run([]string{"-config", configPath})
|
||||
if code != 0 {
|
||||
t.Errorf("non-zero return code when running agent: %d", code)
|
||||
t.Logf("STDOUT from agent:\n%s", ui.OutputWriter.String())
|
||||
t.Logf("STDERR from agent:\n%s", ui.ErrorWriter.String())
|
||||
}
|
||||
wg.Done()
|
||||
}()
|
||||
|
||||
select {
|
||||
case <-cmd.startedCh:
|
||||
case <-time.After(5 * time.Second):
|
||||
t.Errorf("timeout")
|
||||
}
|
||||
|
||||
// defer agent shutdown
|
||||
defer func() {
|
||||
cmd.ShutdownCh <- struct{}{}
|
||||
wg.Wait()
|
||||
}()
|
||||
|
||||
conf := api.DefaultConfig()
|
||||
conf.Address = "http://127.0.0.1:8101"
|
||||
agentClient, err := api.NewClient(conf)
|
||||
if err != nil {
|
||||
t.Fatalf("err: %s", err)
|
||||
}
|
||||
|
||||
req := agentClient.NewRequest("GET", "/agent/v1/metrics")
|
||||
body := request(t, agentClient, req, 200)
|
||||
keys := []string{}
|
||||
for k := range body {
|
||||
keys = append(keys, k)
|
||||
}
|
||||
require.ElementsMatch(t, keys, []string{
|
||||
"Counters",
|
||||
"Samples",
|
||||
"Timestamp",
|
||||
"Gauges",
|
||||
"Points",
|
||||
})
|
||||
}
|
||||
|
|
|
@ -3,3 +3,7 @@ package consts
|
|||
// AgentPathCacheClear is the path that the agent will use as its cache-clear
|
||||
// endpoint.
|
||||
const AgentPathCacheClear = "/agent/v1/cache-clear"
|
||||
|
||||
// AgentPathMetrics is the path the the agent will use to expose its internal
|
||||
// metrics.
|
||||
const AgentPathMetrics = "/agent/v1/metrics"
|
||||
|
|
|
@ -132,6 +132,10 @@ These are the currently-available general configuration option:
|
|||
|
||||
- `template_config` <code>([template_config][template-config]: <optional\>)</code> - Specifies templating engine behavior.
|
||||
|
||||
- `telemetry` <code>([telemetry][telemetry]: <optional\>)</code> – Specifies the telemetry
|
||||
reporting system. See the [telemetry Stanza](/docs/agent#telemetry-stanza) section below
|
||||
for a list of metrics specific to Agent.
|
||||
|
||||
### vault Stanza
|
||||
|
||||
There can at most be one top level `vault` block and it has the following
|
||||
|
@ -218,6 +222,22 @@ supports an additional optional entry:
|
|||
Request Forgery attacks. Requests on the listener that do not have the proper
|
||||
`X-Vault-Request` header will fail, with a HTTP response status code of `412: Precondition Failed`.
|
||||
|
||||
### telemetry Stanza
|
||||
|
||||
Vault Agent supports the [telemetry][telemetry] stanza and collects various
|
||||
runtime metrics about its performance, the auto-auth and the cache status:
|
||||
|
||||
| Metric | Description | Type |
|
||||
| -------------------------------- | ---------------------------------------------------- | ------- |
|
||||
| `vault.agent.auth.failure` | Number of authentication failures | counter |
|
||||
| `vault.agent.auth.success` | Number of authentication successes | counter |
|
||||
| `vault.agent.proxy.success` | Number of requests successfully proxied | counter |
|
||||
| `vault.agent.proxy.client_error` | Number of requests for which Vault returned an error | counter |
|
||||
| `vault.agent.proxy.error` | Number of requests the agent failed to proxy | counter |
|
||||
| `vault.agent.cache.hit` | Number of cache hits | counter |
|
||||
| `vault.agent.cache.miss` | Number of cache misses | counter |
|
||||
|
||||
|
||||
|
||||
## Start Vault Agent
|
||||
|
||||
|
@ -317,3 +337,4 @@ template {
|
|||
[listener]: /docs/agent#listener-stanza
|
||||
[listener_main]: /docs/configuration/listener/tcp
|
||||
[winsvc]: /docs/agent/winsvc
|
||||
[telemetry]: /docs/configuration/telemetry
|
||||
|
|
Loading…
Reference in New Issue