Add telemetry to Vault agent (#13675)

This patch adds a new /agent/v1/metrics that will return metrics on the
running Vault agent. Configuration is done using the same telemetry
stanza as the Vault server. For now default runtime metrics are
returned with a few additional ones specific to the agent:
  - `vault.agent.auth.failure` and `vault.agent.auth.success` to monitor
  the correct behavior of the auto auth mechanism
  - `vault.agent.proxy.success`, `vault.agent.proxy.client_error` and
  `vault.agent.proxy.error` to check the connection with the Vault server
  - `vault.agent.cache.hit` and `vault.agent.cache.miss` to monitor the
  cache

Closes https://github.com/hashicorp/vault/issues/8649

Co-authored-by: Theron Voran <tvoran@users.noreply.github.com>
This commit is contained in:
Rémi Lapeyre 2022-02-18 02:10:26 +01:00 committed by GitHub
parent 880a8143e0
commit 98b18ee08e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 211 additions and 15 deletions

View File

@ -225,7 +225,7 @@ func (r *LifetimeWatcher) Start() {
r.doneCh <- r.doRenew()
}
// Renew is for comnpatibility with the legacy api.Renewer. Calling Renew
// Renew is for compatibility with the legacy api.Renewer. Calling Renew
// simply chains to Start.
func (r *LifetimeWatcher) Renew() {
r.Start()

4
changelog/13675.txt Normal file
View File

@ -0,0 +1,4 @@
```release-note:feature
agent: The Vault agent now returns telemetry information at the `/agent/v1/metrics`
path.
```

View File

@ -40,10 +40,12 @@ import (
"github.com/hashicorp/vault/command/agent/sink/inmem"
"github.com/hashicorp/vault/command/agent/template"
"github.com/hashicorp/vault/command/agent/winsvc"
"github.com/hashicorp/vault/helper/metricsutil"
"github.com/hashicorp/vault/internalshared/configutil"
"github.com/hashicorp/vault/internalshared/listenerutil"
"github.com/hashicorp/vault/sdk/helper/consts"
"github.com/hashicorp/vault/sdk/helper/logging"
"github.com/hashicorp/vault/sdk/helper/useragent"
"github.com/hashicorp/vault/sdk/logical"
"github.com/hashicorp/vault/sdk/version"
"github.com/kr/pretty"
@ -68,6 +70,9 @@ type AgentCommand struct {
logGate *gatedwriter.Writer
logger log.Logger
// Telemetry object
metricsHelper *metricsutil.MetricsHelper
cleanupGuard sync.Once
startedCh chan (struct{}) // for tests
@ -340,6 +345,21 @@ func (c *AgentCommand) Run(args []string) int {
ctx, cancelFunc := context.WithCancel(context.Background())
defer cancelFunc()
// telemetry configuration
inmemMetrics, _, prometheusEnabled, err := configutil.SetupTelemetry(&configutil.SetupTelemetryOpts{
Config: config.Telemetry,
Ui: c.UI,
ServiceName: "vault",
DisplayName: "Vault",
UserAgent: useragent.String(),
ClusterName: config.ClusterName,
})
if err != nil {
c.UI.Error(fmt.Sprintf("Error initializing telemetry: %s", err))
return 1
}
c.metricsHelper = metricsutil.NewMetricsHelper(inmemMetrics, prometheusEnabled)
var method auth.AuthMethod
var sinks []*sink.SinkConfig
var templateNamespace string
@ -696,6 +716,7 @@ func (c *AgentCommand) Run(args []string) int {
// Create a muxer and add paths relevant for the lease cache layer
mux := http.NewServeMux()
mux.Handle(consts.AgentPathCacheClear, leaseCache.HandleCacheClear(ctx))
mux.Handle(consts.AgentPathMetrics, c.handleMetrics())
mux.Handle("/", muxHandler)
scheme := "https://"
@ -990,3 +1011,39 @@ func getServiceAccountJWT(tokenFile string) (string, error) {
}
return strings.TrimSpace(string(token)), nil
}
func (c *AgentCommand) handleMetrics() http.Handler {
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if r.Method != http.MethodGet {
logical.RespondError(w, http.StatusMethodNotAllowed, nil)
return
}
if err := r.ParseForm(); err != nil {
logical.RespondError(w, http.StatusBadRequest, err)
return
}
format := r.Form.Get("format")
if format == "" {
format = metricsutil.FormatFromRequest(&logical.Request{
Headers: r.Header,
})
}
resp := c.metricsHelper.ResponseForFormat(format)
status := resp.Data[logical.HTTPStatusCode].(int)
w.Header().Set("Content-Type", resp.Data[logical.HTTPContentType].(string))
switch v := resp.Data[logical.HTTPRawBody].(type) {
case string:
w.WriteHeader((status))
w.Write([]byte(v))
case []byte:
w.WriteHeader(status)
w.Write(v)
default:
logical.RespondError(w, http.StatusInternalServerError, fmt.Errorf("wrong response returned"))
}
})
}

View File

@ -8,6 +8,7 @@ import (
"net/http"
"time"
"github.com/armon/go-metrics"
"github.com/hashicorp/go-hclog"
"github.com/hashicorp/vault/api"
"github.com/hashicorp/vault/sdk/helper/jsonutil"
@ -156,6 +157,7 @@ func (ah *AuthHandler) Run(ctx context.Context, am AuthMethod) error {
if err != nil {
ah.logger.Error("error creating client for authentication call", "error", err, "backoff", backoff)
backoffOrQuit(ctx, backoff)
metrics.IncrCounter([]string{"agent", "auth", "failure"}, 1)
continue
}
default:
@ -174,6 +176,7 @@ func (ah *AuthHandler) Run(ctx context.Context, am AuthMethod) error {
if err != nil {
ah.logger.Error("could not look up token", "err", err, "backoff", backoff)
backoffOrQuit(ctx, backoff)
metrics.IncrCounter([]string{"agent", "auth", "failure"}, 1)
continue
}
@ -190,6 +193,7 @@ func (ah *AuthHandler) Run(ctx context.Context, am AuthMethod) error {
if err != nil {
ah.logger.Error("error getting path or data from method", "error", err, "backoff", backoff)
backoffOrQuit(ctx, backoff)
metrics.IncrCounter([]string{"agent", "auth", "failure"}, 1)
continue
}
}
@ -199,6 +203,7 @@ func (ah *AuthHandler) Run(ctx context.Context, am AuthMethod) error {
if err != nil {
ah.logger.Error("error creating client for wrapped call", "error", err, "backoff", backoff)
backoffOrQuit(ctx, backoff)
metrics.IncrCounter([]string{"agent", "auth", "failure"}, 1)
continue
}
wrapClient.SetWrappingLookupFunc(func(string, string) string {
@ -220,6 +225,7 @@ func (ah *AuthHandler) Run(ctx context.Context, am AuthMethod) error {
if err != nil {
ah.logger.Error("error authenticating", "error", err, "backoff", backoff)
backoffOrQuit(ctx, backoff)
metrics.IncrCounter([]string{"agent", "auth", "failure"}, 1)
continue
}
}
@ -229,17 +235,20 @@ func (ah *AuthHandler) Run(ctx context.Context, am AuthMethod) error {
if secret.WrapInfo == nil {
ah.logger.Error("authentication returned nil wrap info", "backoff", backoff)
backoffOrQuit(ctx, backoff)
metrics.IncrCounter([]string{"agent", "auth", "failure"}, 1)
continue
}
if secret.WrapInfo.Token == "" {
ah.logger.Error("authentication returned empty wrapped client token", "backoff", backoff)
backoffOrQuit(ctx, backoff)
metrics.IncrCounter([]string{"agent", "auth", "failure"}, 1)
continue
}
wrappedResp, err := jsonutil.EncodeJSON(secret.WrapInfo)
if err != nil {
ah.logger.Error("failed to encode wrapinfo", "error", err, "backoff", backoff)
backoffOrQuit(ctx, backoff)
metrics.IncrCounter([]string{"agent", "auth", "failure"}, 1)
continue
}
ah.logger.Info("authentication successful, sending wrapped token to sinks and pausing")
@ -265,11 +274,13 @@ func (ah *AuthHandler) Run(ctx context.Context, am AuthMethod) error {
if secret == nil || secret.Auth == nil {
ah.logger.Error("authentication returned nil auth info", "backoff", backoff)
backoffOrQuit(ctx, backoff)
metrics.IncrCounter([]string{"agent", "auth", "failure"}, 1)
continue
}
if secret.Auth.ClientToken == "" {
ah.logger.Error("authentication returned empty client token", "backoff", backoff)
backoffOrQuit(ctx, backoff)
metrics.IncrCounter([]string{"agent", "auth", "failure"}, 1)
continue
}
ah.logger.Info("authentication successful, sending token to sinks")
@ -292,11 +303,13 @@ func (ah *AuthHandler) Run(ctx context.Context, am AuthMethod) error {
if err != nil {
ah.logger.Error("error creating lifetime watcher, backing off and retrying", "error", err, "backoff", backoff)
backoffOrQuit(ctx, backoff)
metrics.IncrCounter([]string{"agent", "auth", "failure"}, 1)
continue
}
// Start the renewal process
ah.logger.Info("starting renewal process")
metrics.IncrCounter([]string{"agent", "auth", "success"}, 1)
go watcher.Renew()
LifetimeWatcherLoop:
@ -310,11 +323,13 @@ func (ah *AuthHandler) Run(ctx context.Context, am AuthMethod) error {
case err := <-watcher.DoneCh():
ah.logger.Info("lifetime watcher done channel triggered")
if err != nil {
metrics.IncrCounter([]string{"agent", "auth", "failure"}, 1)
ah.logger.Error("error renewing token", "error", err)
}
break LifetimeWatcherLoop
case <-watcher.RenewCh():
metrics.IncrCounter([]string{"agent", "auth", "success"}, 1)
ah.logger.Info("renewed auth token")
case <-credCh:

View File

@ -12,6 +12,7 @@ import (
"net/http"
"time"
"github.com/armon/go-metrics"
hclog "github.com/hashicorp/go-hclog"
"github.com/hashicorp/vault/api"
"github.com/hashicorp/vault/command/agent/sink"
@ -58,7 +59,9 @@ func Handler(ctx context.Context, logger hclog.Logger, proxier Proxier, inmemSin
copyHeader(w.Header(), resp.Response.Header)
w.WriteHeader(resp.Response.StatusCode)
io.Copy(w, resp.Response.Body)
metrics.IncrCounter([]string{"agent", "proxy", "client_error"}, 1)
} else {
metrics.IncrCounter([]string{"agent", "proxy", "error"}, 1)
logical.RespondError(w, http.StatusInternalServerError, fmt.Errorf("failed to get the response: %w", err))
}
return
@ -72,6 +75,15 @@ func Handler(ctx context.Context, logger hclog.Logger, proxier Proxier, inmemSin
defer resp.Response.Body.Close()
metrics.IncrCounter([]string{"agent", "proxy", "success"}, 1)
if resp.CacheMeta != nil {
if resp.CacheMeta.Hit {
metrics.IncrCounter([]string{"agent", "cache", "hit"}, 1)
} else {
metrics.IncrCounter([]string{"agent", "cache", "miss"}, 1)
}
}
// Set headers
setHeaders(w, resp)

View File

@ -26,6 +26,7 @@ import (
"github.com/hashicorp/vault/sdk/logical"
"github.com/hashicorp/vault/vault"
"github.com/mitchellh/cli"
"github.com/stretchr/testify/require"
)
func testAgentCommand(tb testing.TB, logger hclog.Logger) (*cli.MockUi, *AgentCommand) {
@ -1999,3 +2000,85 @@ vault {
})
}
}
func TestAgent_Metrics(t *testing.T) {
//----------------------------------------------------
// Start the server and agent
//----------------------------------------------------
// Start a vault server
logger := logging.NewVaultLogger(hclog.Trace)
cluster := vault.NewTestCluster(t,
&vault.CoreConfig{
Logger: logger,
},
&vault.TestClusterOptions{
HandlerFunc: vaulthttp.Handler,
})
cluster.Start()
defer cluster.Cleanup()
vault.TestWaitActive(t, cluster.Cores[0].Core)
serverClient := cluster.Cores[0].Client
// Create a config file
config := `
cache {}
listener "tcp" {
address = "127.0.0.1:8101"
tls_disable = true
}
`
configPath := makeTempFile(t, "config.hcl", config)
defer os.Remove(configPath)
// Start the agent
ui, cmd := testAgentCommand(t, logger)
cmd.client = serverClient
cmd.startedCh = make(chan struct{})
wg := &sync.WaitGroup{}
wg.Add(1)
go func() {
code := cmd.Run([]string{"-config", configPath})
if code != 0 {
t.Errorf("non-zero return code when running agent: %d", code)
t.Logf("STDOUT from agent:\n%s", ui.OutputWriter.String())
t.Logf("STDERR from agent:\n%s", ui.ErrorWriter.String())
}
wg.Done()
}()
select {
case <-cmd.startedCh:
case <-time.After(5 * time.Second):
t.Errorf("timeout")
}
// defer agent shutdown
defer func() {
cmd.ShutdownCh <- struct{}{}
wg.Wait()
}()
conf := api.DefaultConfig()
conf.Address = "http://127.0.0.1:8101"
agentClient, err := api.NewClient(conf)
if err != nil {
t.Fatalf("err: %s", err)
}
req := agentClient.NewRequest("GET", "/agent/v1/metrics")
body := request(t, agentClient, req, 200)
keys := []string{}
for k := range body {
keys = append(keys, k)
}
require.ElementsMatch(t, keys, []string{
"Counters",
"Samples",
"Timestamp",
"Gauges",
"Points",
})
}

View File

@ -3,3 +3,7 @@ package consts
// AgentPathCacheClear is the path that the agent will use as its cache-clear
// endpoint.
const AgentPathCacheClear = "/agent/v1/cache-clear"
// AgentPathMetrics is the path the the agent will use to expose its internal
// metrics.
const AgentPathMetrics = "/agent/v1/metrics"

View File

@ -132,6 +132,10 @@ These are the currently-available general configuration option:
- `template_config` <code>([template_config][template-config]: <optional\>)</code> - Specifies templating engine behavior.
- `telemetry` <code>([telemetry][telemetry]: <optional\>)</code> Specifies the telemetry
reporting system. See the [telemetry Stanza](/docs/agent#telemetry-stanza) section below
for a list of metrics specific to Agent.
### vault Stanza
There can at most be one top level `vault` block and it has the following
@ -218,6 +222,22 @@ supports an additional optional entry:
Request Forgery attacks. Requests on the listener that do not have the proper
`X-Vault-Request` header will fail, with a HTTP response status code of `412: Precondition Failed`.
### telemetry Stanza
Vault Agent supports the [telemetry][telemetry] stanza and collects various
runtime metrics about its performance, the auto-auth and the cache status:
| Metric | Description | Type |
| -------------------------------- | ---------------------------------------------------- | ------- |
| `vault.agent.auth.failure` | Number of authentication failures | counter |
| `vault.agent.auth.success` | Number of authentication successes | counter |
| `vault.agent.proxy.success` | Number of requests successfully proxied | counter |
| `vault.agent.proxy.client_error` | Number of requests for which Vault returned an error | counter |
| `vault.agent.proxy.error` | Number of requests the agent failed to proxy | counter |
| `vault.agent.cache.hit` | Number of cache hits | counter |
| `vault.agent.cache.miss` | Number of cache misses | counter |
## Start Vault Agent
@ -317,3 +337,4 @@ template {
[listener]: /docs/agent#listener-stanza
[listener_main]: /docs/configuration/listener/tcp
[winsvc]: /docs/agent/winsvc
[telemetry]: /docs/configuration/telemetry