From 98b18ee08e334bebdabe8490fd77fa43bd9d7a68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Lapeyre?= Date: Fri, 18 Feb 2022 02:10:26 +0100 Subject: [PATCH] Add telemetry to Vault agent (#13675) This patch adds a new /agent/v1/metrics that will return metrics on the running Vault agent. Configuration is done using the same telemetry stanza as the Vault server. For now default runtime metrics are returned with a few additional ones specific to the agent: - `vault.agent.auth.failure` and `vault.agent.auth.success` to monitor the correct behavior of the auto auth mechanism - `vault.agent.proxy.success`, `vault.agent.proxy.client_error` and `vault.agent.proxy.error` to check the connection with the Vault server - `vault.agent.cache.hit` and `vault.agent.cache.miss` to monitor the cache Closes https://github.com/hashicorp/vault/issues/8649 Co-authored-by: Theron Voran --- api/lifetime_watcher.go | 2 +- changelog/13675.txt | 4 ++ command/agent.go | 57 +++++++++++++++++++ command/agent/auth/auth.go | 15 +++++ command/agent/cache/handler.go | 12 ++++ command/agent_test.go | 85 +++++++++++++++++++++++++++- sdk/helper/consts/agent.go | 4 ++ website/content/docs/agent/index.mdx | 47 ++++++++++----- 8 files changed, 211 insertions(+), 15 deletions(-) create mode 100644 changelog/13675.txt diff --git a/api/lifetime_watcher.go b/api/lifetime_watcher.go index b1d81332f..f775dfb15 100644 --- a/api/lifetime_watcher.go +++ b/api/lifetime_watcher.go @@ -225,7 +225,7 @@ func (r *LifetimeWatcher) Start() { r.doneCh <- r.doRenew() } -// Renew is for comnpatibility with the legacy api.Renewer. Calling Renew +// Renew is for compatibility with the legacy api.Renewer. Calling Renew // simply chains to Start. 
func (r *LifetimeWatcher) Renew() { r.Start() diff --git a/changelog/13675.txt b/changelog/13675.txt new file mode 100644 index 000000000..3441db3d8 --- /dev/null +++ b/changelog/13675.txt @@ -0,0 +1,4 @@ +```release-note:feature +agent: The Vault agent now returns telemetry information at the `/agent/v1/metrics` +path. +``` diff --git a/command/agent.go b/command/agent.go index 27100192e..c966c2ae4 100644 --- a/command/agent.go +++ b/command/agent.go @@ -40,10 +40,12 @@ import ( "github.com/hashicorp/vault/command/agent/sink/inmem" "github.com/hashicorp/vault/command/agent/template" "github.com/hashicorp/vault/command/agent/winsvc" + "github.com/hashicorp/vault/helper/metricsutil" "github.com/hashicorp/vault/internalshared/configutil" "github.com/hashicorp/vault/internalshared/listenerutil" "github.com/hashicorp/vault/sdk/helper/consts" "github.com/hashicorp/vault/sdk/helper/logging" + "github.com/hashicorp/vault/sdk/helper/useragent" "github.com/hashicorp/vault/sdk/logical" "github.com/hashicorp/vault/sdk/version" "github.com/kr/pretty" @@ -68,6 +70,9 @@ type AgentCommand struct { logGate *gatedwriter.Writer logger log.Logger + // Telemetry object + metricsHelper *metricsutil.MetricsHelper + cleanupGuard sync.Once startedCh chan (struct{}) // for tests @@ -340,6 +345,21 @@ func (c *AgentCommand) Run(args []string) int { ctx, cancelFunc := context.WithCancel(context.Background()) defer cancelFunc() + // telemetry configuration + inmemMetrics, _, prometheusEnabled, err := configutil.SetupTelemetry(&configutil.SetupTelemetryOpts{ + Config: config.Telemetry, + Ui: c.UI, + ServiceName: "vault", + DisplayName: "Vault", + UserAgent: useragent.String(), + ClusterName: config.ClusterName, + }) + if err != nil { + c.UI.Error(fmt.Sprintf("Error initializing telemetry: %s", err)) + return 1 + } + c.metricsHelper = metricsutil.NewMetricsHelper(inmemMetrics, prometheusEnabled) + var method auth.AuthMethod var sinks []*sink.SinkConfig var templateNamespace string @@ -696,6 
+716,7 @@ func (c *AgentCommand) Run(args []string) int { // Create a muxer and add paths relevant for the lease cache layer mux := http.NewServeMux() mux.Handle(consts.AgentPathCacheClear, leaseCache.HandleCacheClear(ctx)) + mux.Handle(consts.AgentPathMetrics, c.handleMetrics()) mux.Handle("/", muxHandler) scheme := "https://" @@ -990,3 +1011,39 @@ func getServiceAccountJWT(tokenFile string) (string, error) { } return strings.TrimSpace(string(token)), nil } + +func (c *AgentCommand) handleMetrics() http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + logical.RespondError(w, http.StatusMethodNotAllowed, nil) + return + } + + if err := r.ParseForm(); err != nil { + logical.RespondError(w, http.StatusBadRequest, err) + return + } + + format := r.Form.Get("format") + if format == "" { + format = metricsutil.FormatFromRequest(&logical.Request{ + Headers: r.Header, + }) + } + + resp := c.metricsHelper.ResponseForFormat(format) + + status := resp.Data[logical.HTTPStatusCode].(int) + w.Header().Set("Content-Type", resp.Data[logical.HTTPContentType].(string)) + switch v := resp.Data[logical.HTTPRawBody].(type) { + case string: + w.WriteHeader((status)) + w.Write([]byte(v)) + case []byte: + w.WriteHeader(status) + w.Write(v) + default: + logical.RespondError(w, http.StatusInternalServerError, fmt.Errorf("wrong response returned")) + } + }) +} diff --git a/command/agent/auth/auth.go b/command/agent/auth/auth.go index 636e73353..889eedd85 100644 --- a/command/agent/auth/auth.go +++ b/command/agent/auth/auth.go @@ -8,6 +8,7 @@ import ( "net/http" "time" + "github.com/armon/go-metrics" "github.com/hashicorp/go-hclog" "github.com/hashicorp/vault/api" "github.com/hashicorp/vault/sdk/helper/jsonutil" @@ -156,6 +157,7 @@ func (ah *AuthHandler) Run(ctx context.Context, am AuthMethod) error { if err != nil { ah.logger.Error("error creating client for authentication call", "error", err, "backoff", backoff) 
backoffOrQuit(ctx, backoff) + metrics.IncrCounter([]string{"agent", "auth", "failure"}, 1) continue } default: @@ -174,6 +176,7 @@ func (ah *AuthHandler) Run(ctx context.Context, am AuthMethod) error { if err != nil { ah.logger.Error("could not look up token", "err", err, "backoff", backoff) backoffOrQuit(ctx, backoff) + metrics.IncrCounter([]string{"agent", "auth", "failure"}, 1) continue } @@ -190,6 +193,7 @@ func (ah *AuthHandler) Run(ctx context.Context, am AuthMethod) error { if err != nil { ah.logger.Error("error getting path or data from method", "error", err, "backoff", backoff) backoffOrQuit(ctx, backoff) + metrics.IncrCounter([]string{"agent", "auth", "failure"}, 1) continue } } @@ -199,6 +203,7 @@ func (ah *AuthHandler) Run(ctx context.Context, am AuthMethod) error { if err != nil { ah.logger.Error("error creating client for wrapped call", "error", err, "backoff", backoff) backoffOrQuit(ctx, backoff) + metrics.IncrCounter([]string{"agent", "auth", "failure"}, 1) continue } wrapClient.SetWrappingLookupFunc(func(string, string) string { @@ -220,6 +225,7 @@ func (ah *AuthHandler) Run(ctx context.Context, am AuthMethod) error { if err != nil { ah.logger.Error("error authenticating", "error", err, "backoff", backoff) backoffOrQuit(ctx, backoff) + metrics.IncrCounter([]string{"agent", "auth", "failure"}, 1) continue } } @@ -229,17 +235,20 @@ func (ah *AuthHandler) Run(ctx context.Context, am AuthMethod) error { if secret.WrapInfo == nil { ah.logger.Error("authentication returned nil wrap info", "backoff", backoff) backoffOrQuit(ctx, backoff) + metrics.IncrCounter([]string{"agent", "auth", "failure"}, 1) continue } if secret.WrapInfo.Token == "" { ah.logger.Error("authentication returned empty wrapped client token", "backoff", backoff) backoffOrQuit(ctx, backoff) + metrics.IncrCounter([]string{"agent", "auth", "failure"}, 1) continue } wrappedResp, err := jsonutil.EncodeJSON(secret.WrapInfo) if err != nil { ah.logger.Error("failed to encode wrapinfo", "error", 
err, "backoff", backoff) backoffOrQuit(ctx, backoff) + metrics.IncrCounter([]string{"agent", "auth", "failure"}, 1) continue } ah.logger.Info("authentication successful, sending wrapped token to sinks and pausing") @@ -265,11 +274,13 @@ func (ah *AuthHandler) Run(ctx context.Context, am AuthMethod) error { if secret == nil || secret.Auth == nil { ah.logger.Error("authentication returned nil auth info", "backoff", backoff) backoffOrQuit(ctx, backoff) + metrics.IncrCounter([]string{"agent", "auth", "failure"}, 1) continue } if secret.Auth.ClientToken == "" { ah.logger.Error("authentication returned empty client token", "backoff", backoff) backoffOrQuit(ctx, backoff) + metrics.IncrCounter([]string{"agent", "auth", "failure"}, 1) continue } ah.logger.Info("authentication successful, sending token to sinks") @@ -292,11 +303,13 @@ func (ah *AuthHandler) Run(ctx context.Context, am AuthMethod) error { if err != nil { ah.logger.Error("error creating lifetime watcher, backing off and retrying", "error", err, "backoff", backoff) backoffOrQuit(ctx, backoff) + metrics.IncrCounter([]string{"agent", "auth", "failure"}, 1) continue } // Start the renewal process ah.logger.Info("starting renewal process") + metrics.IncrCounter([]string{"agent", "auth", "success"}, 1) go watcher.Renew() LifetimeWatcherLoop: @@ -310,11 +323,13 @@ func (ah *AuthHandler) Run(ctx context.Context, am AuthMethod) error { case err := <-watcher.DoneCh(): ah.logger.Info("lifetime watcher done channel triggered") if err != nil { + metrics.IncrCounter([]string{"agent", "auth", "failure"}, 1) ah.logger.Error("error renewing token", "error", err) } break LifetimeWatcherLoop case <-watcher.RenewCh(): + metrics.IncrCounter([]string{"agent", "auth", "success"}, 1) ah.logger.Info("renewed auth token") case <-credCh: diff --git a/command/agent/cache/handler.go b/command/agent/cache/handler.go index 73062df41..c8c60ddbf 100644 --- a/command/agent/cache/handler.go +++ b/command/agent/cache/handler.go @@ -12,6 +12,7 @@ 
import ( "net/http" "time" + "github.com/armon/go-metrics" hclog "github.com/hashicorp/go-hclog" "github.com/hashicorp/vault/api" "github.com/hashicorp/vault/command/agent/sink" @@ -58,7 +59,9 @@ func Handler(ctx context.Context, logger hclog.Logger, proxier Proxier, inmemSin copyHeader(w.Header(), resp.Response.Header) w.WriteHeader(resp.Response.StatusCode) io.Copy(w, resp.Response.Body) + metrics.IncrCounter([]string{"agent", "proxy", "client_error"}, 1) } else { + metrics.IncrCounter([]string{"agent", "proxy", "error"}, 1) logical.RespondError(w, http.StatusInternalServerError, fmt.Errorf("failed to get the response: %w", err)) } return @@ -72,6 +75,15 @@ func Handler(ctx context.Context, logger hclog.Logger, proxier Proxier, inmemSin defer resp.Response.Body.Close() + metrics.IncrCounter([]string{"agent", "proxy", "success"}, 1) + if resp.CacheMeta != nil { + if resp.CacheMeta.Hit { + metrics.IncrCounter([]string{"agent", "cache", "hit"}, 1) + } else { + metrics.IncrCounter([]string{"agent", "cache", "miss"}, 1) + } + } + // Set headers setHeaders(w, resp) diff --git a/command/agent_test.go b/command/agent_test.go index 1c3e1dcc6..62980514d 100644 --- a/command/agent_test.go +++ b/command/agent_test.go @@ -26,6 +26,7 @@ import ( "github.com/hashicorp/vault/sdk/logical" "github.com/hashicorp/vault/vault" "github.com/mitchellh/cli" + "github.com/stretchr/testify/require" ) func testAgentCommand(tb testing.TB, logger hclog.Logger) (*cli.MockUi, *AgentCommand) { @@ -614,7 +615,7 @@ func TestAgent_RequireAutoAuthWithForce(t *testing.T) { // Create a config file config := ` cache { - use_auto_auth_token = "force" + use_auto_auth_token = "force" } listener "tcp" { @@ -1999,3 +2000,85 @@ vault { }) } } + +func TestAgent_Metrics(t *testing.T) { + //---------------------------------------------------- + // Start the server and agent + //---------------------------------------------------- + + // Start a vault server + logger := logging.NewVaultLogger(hclog.Trace) + 
cluster := vault.NewTestCluster(t, + &vault.CoreConfig{ + Logger: logger, + }, + &vault.TestClusterOptions{ + HandlerFunc: vaulthttp.Handler, + }) + cluster.Start() + defer cluster.Cleanup() + vault.TestWaitActive(t, cluster.Cores[0].Core) + serverClient := cluster.Cores[0].Client + + // Create a config file + config := ` +cache {} + +listener "tcp" { + address = "127.0.0.1:8101" + tls_disable = true +} +` + configPath := makeTempFile(t, "config.hcl", config) + defer os.Remove(configPath) + + // Start the agent + ui, cmd := testAgentCommand(t, logger) + cmd.client = serverClient + cmd.startedCh = make(chan struct{}) + + wg := &sync.WaitGroup{} + wg.Add(1) + go func() { + code := cmd.Run([]string{"-config", configPath}) + if code != 0 { + t.Errorf("non-zero return code when running agent: %d", code) + t.Logf("STDOUT from agent:\n%s", ui.OutputWriter.String()) + t.Logf("STDERR from agent:\n%s", ui.ErrorWriter.String()) + } + wg.Done() + }() + + select { + case <-cmd.startedCh: + case <-time.After(5 * time.Second): + t.Errorf("timeout") + } + + // defer agent shutdown + defer func() { + cmd.ShutdownCh <- struct{}{} + wg.Wait() + }() + + conf := api.DefaultConfig() + conf.Address = "http://127.0.0.1:8101" + agentClient, err := api.NewClient(conf) + if err != nil { + t.Fatalf("err: %s", err) + } + + req := agentClient.NewRequest("GET", "/agent/v1/metrics") + body := request(t, agentClient, req, 200) + keys := []string{} + for k := range body { + keys = append(keys, k) + } + require.ElementsMatch(t, keys, []string{ + "Counters", + "Samples", + "Timestamp", + "Gauges", + "Points", + }) +} diff --git a/sdk/helper/consts/agent.go b/sdk/helper/consts/agent.go index b62962e37..58e0653c7 100644 --- a/sdk/helper/consts/agent.go +++ b/sdk/helper/consts/agent.go @@ -3,3 +3,7 @@ package consts // AgentPathCacheClear is the path that the agent will use as its cache-clear // endpoint. 
const AgentPathCacheClear = "/agent/v1/cache-clear" + +// AgentPathMetrics is the path that the agent will use to expose its internal +// metrics. +const AgentPathMetrics = "/agent/v1/metrics" diff --git a/website/content/docs/agent/index.mdx b/website/content/docs/agent/index.mdx index a373ed1dd..3afa782b2 100644 --- a/website/content/docs/agent/index.mdx +++ b/website/content/docs/agent/index.mdx @@ -39,12 +39,12 @@ import ( // Fetches a key-value secret (kv-v2) after authenticating via AppRole func getSecretWithAppRole() (string, error) { - config := vault.DefaultConfig() + config := vault.DefaultConfig() client := vault.NewClient(config) - wrappingToken := ioutil.ReadFile("path/to/wrapping-token") + wrappingToken := ioutil.ReadFile("path/to/wrapping-token") unwrappedToken := client.Logical().Unwrap(strings.TrimSuffix(string(wrappingToken), "\n")) - + secretID := unwrappedToken.Data["secret_id"] roleID := os.Getenv("APPROLE_ROLE_ID") @@ -61,7 +61,7 @@ func getSecretWithAppRole() (string, error) { } data := secret.Data["data"].(map[string]interface{}) - + ...snip... } ``` @@ -74,7 +74,7 @@ Vault. However, in other situations where you have a large number of applications, as in large enterprises, you may not have the resources or expertise to update and maintain the Vault integration code for every application. When third party applications are being deployed by the application, it is prohibited -to add the Vault integration code. +to add the Vault integration code. Vault Agent aims to remove this initial hurdle to adopt Vault by providing a more scalable and simpler way for applications to integrate with Vault. @@ -84,14 +84,14 @@ more scalable and simpler way for applications to integrate with Vault. 
Vault Agent is a client daemon that provides the following features: -- [Auto-Auth][autoauth] - Automatically authenticate to Vault and manage the +- [Auto-Auth][autoauth] - Automatically authenticate to Vault and manage the token renewal process for locally-retrieved dynamic secrets. -- [Caching][caching] - Allows client-side caching of responses containing newly +- [Caching][caching] - Allows client-side caching of responses containing newly created tokens and responses containing leased secrets generated off of these newly created tokens. The agent also manages the renewals of the cached tokens and leases. -- [Windows Service][winsvc] - Allows running the Vault Agent as a Windows +- [Windows Service][winsvc] - Allows running the Vault Agent as a Windows service. -- [Templating][template] - Allows rendering of user-supplied templates by Vault +- [Templating][template] - Allows rendering of user-supplied templates by Vault Agent, using the token generated by the Auto-Auth step. @@ -132,6 +132,10 @@ These are the currently-available general configuration option: - `template_config` ([template_config][template-config]: ) - Specifies templating engine behavior. +- `telemetry` ([telemetry][telemetry]: ) – Specifies the telemetry + reporting system. See the [telemetry Stanza](/docs/agent#telemetry-stanza) section below + for a list of metrics specific to Agent. + ### vault Stanza There can at most be one top level `vault` block and it has the following @@ -218,16 +222,32 @@ supports an additional optional entry: Request Forgery attacks. Requests on the listener that do not have the proper `X-Vault-Request` header will fail, with a HTTP response status code of `412: Precondition Failed`. 
+### telemetry Stanza -## Start Vault Agent +Vault Agent supports the [telemetry][telemetry] stanza and collects various +runtime metrics about its performance, the auto-auth and the cache status: -To run Vault Agent: +| Metric | Description | Type | +| -------------------------------- | ---------------------------------------------------- | ------- | +| `vault.agent.auth.failure` | Number of authentication failures | counter | +| `vault.agent.auth.success` | Number of authentication successes | counter | +| `vault.agent.proxy.success` | Number of requests successfully proxied | counter | +| `vault.agent.proxy.client_error` | Number of requests for which Vault returned an error | counter | +| `vault.agent.proxy.error` | Number of requests the agent failed to proxy | counter | +| `vault.agent.cache.hit` | Number of cache hits | counter | +| `vault.agent.cache.miss` | Number of cache misses | counter | + + + +## Start Vault Agent + +To run Vault Agent: 1. [Download](/downloads) the Vault binary where the client application runs -(virtual machine, Kubernetes pod, etc.) +(virtual machine, Kubernetes pod, etc.) 1. Create a Vault Agent configuration file. (See the [Example -Configuration](#example-configuration) section for an example configuration.) +Configuration](#example-configuration) section for an example configuration.) 1. Start a Vault Agent with the configuration file. @@ -317,3 +337,4 @@ template { [listener]: /docs/agent#listener-stanza [listener_main]: /docs/configuration/listener/tcp [winsvc]: /docs/agent/winsvc +[telemetry]: /docs/configuration/telemetry